In [1]:
# Show the installed scikit-learn version (shell escape, filtered with grep).
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH, which may
# not belong to the kernel's environment — confirm, or consider `%pip`.
!pip freeze | grep scikit-learn

scikit-learn==1.5.0


In [2]:
# Show the Python version reported by the `python` found on the shell PATH.
# NOTE(review): this is not necessarily the interpreter running the kernel — confirm.
!python -V

Python 3.10.13


In [3]:
import pickle
import pandas as pd
import os

In [4]:
# Unpickle the two objects stored in model.bin and bind them to `dv` and `model`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only use a model.bin that comes from a trusted source.
with open('model.bin', mode='rb') as f_in:
    loaded = pickle.load(f_in)

dv, model = loaded

In [5]:
# Columns treated as categorical features.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Read a trip parquet file and prepare it for scoring.

    Steps:
      1. Load ``filename`` with ``pd.read_parquet``.
      2. Add a ``duration`` column: dropoff minus pickup time, in minutes.
      3. Keep only rows whose duration lies in the closed range [1, 60].
      4. Fill missing values in the ``categorical`` columns with -1, then
         cast them to int and finally to str.

    Parameters
    ----------
    filename : path or URL accepted by ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of the loaded data; the original row index is kept.
    """
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta, converted to fractional minutes.
    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # `between` is inclusive on both ends, i.e. 1 <= duration <= 60.
    in_range = trips.duration.between(1, 60)
    trips = trips[in_range].copy()

    # Missing location IDs become the string '-1'.
    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

In [6]:
# Load and prepare the yellow_tripdata_2023-03 parquet file over HTTPS.
input_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet'
df = read_data(input_url)

In [7]:
# Turn the categorical columns into one dict per row, transform them with
# `dv` (presumably a DictVectorizer — confirm) and predict with `model`.
dicts = df.loc[:, categorical].to_dict('records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

y_pred

array([16.24590642, 26.1347962 , 11.88426424, ..., 11.59533603,
       13.11317847, 12.89999218])

In [8]:
# Spread of the predictions, using the array's own .std() method.
std_dev = y_pred.std()

print(
    "Standard Deviation of y_pred:",
    std_dev,
)

Standard Deviation of y_pred: 6.247488852238703


In [9]:
year = 2023
month = 3

# ride_id has the form "<YYYY>/<MM>_<row index>", e.g. "2023/03_0".
ride_prefix = f'{year:04d}/{month:02d}_'
df['ride_id'] = ride_prefix + df.index.astype('str')

In [10]:
# Store the predictions on the source frame as well.
df['predicted_duration'] = y_pred

# Two-column output frame: one predicted duration per ride_id.
result_columns = {
    'ride_id': df['ride_id'],
    'predicted_duration': y_pred,
}
df_result = pd.DataFrame(result_columns)

df_result.head()

Unnamed: 0,ride_id,predicted_duration
0,2023/03_0,16.245906
1,2023/03_1,26.134796
2,2023/03_2,11.884264
3,2023/03_3,11.99772
4,2023/03_4,10.234486


In [11]:
output_file = 'results.parquet'  # destination path for the predictions

# Write df_result with the pyarrow engine, uncompressed and without the index.
# (compression accepts e.g. 'gzip' or 'snappy' if a smaller file is wanted.)
df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)


In [12]:
# Size on disk of the written parquet file, in bytes.
file_size = os.stat(output_file).st_size

print(f"Size of the output file '{output_file}': {file_size} bytes")

Size of the output file 'results.parquet': 68641880 bytes
