In [1]:
!pip freeze | grep scikit-learn

scikit-learn==1.0.2


In [2]:
import pickle
import pandas as pd
from datetime import datetime

In [3]:
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [4]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

In [5]:
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-03.parquet')

In [6]:
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [7]:
y_pred.std()

5.556602554785001

In [8]:
y_pred.mean()

12.758556818790902

In [9]:
df['ride_id'] = f'{datetime.now().year:04d}/{datetime.now().month:02d}_' + df.index.astype('str')

In [10]:
df.describe()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
count,3539827.0,3539827,3539827,3429669.0,3539827.0,3429669.0,3539827.0,3539827.0,3539827.0,3539827.0,3539827.0,3539827.0,3539827.0,3539827.0,3429669.0,3429669.0,3539827.0
mean,1.707246,2022-03-16 16:37:35.938137856,2022-03-16 16:51:59.895788800,1.387912,5.665108,1.342189,1.175141,13.33845,1.031789,0.4936627,2.588406,0.4475753,0.2973891,19.90878,2.321207,0.08129392,14.39929
min,1.0,2008-12-31 23:19:56,2008-12-31 23:44:06,0.0,0.0,1.0,0.0,-300.0,-4.5,-0.5,-84.32,-83.0,-0.3,-302.8,-2.5,-1.25,1.0
25%,1.0,2022-03-08 20:13:02,2022-03-08 20:25:49.500000,1.0,1.1,1.0,1.0,7.0,0.0,0.5,1.0,0.0,0.3,11.85,2.5,0.0,7.283333
50%,2.0,2022-03-16 18:26:29,2022-03-16 18:42:01,1.0,1.83,1.0,1.0,9.5,0.5,0.5,2.16,0.0,0.3,15.36,2.5,0.0,11.71667
75%,2.0,2022-03-24 16:28:01,2022-03-24 16:43:48.500000,1.0,3.31,1.0,1.0,15.0,2.5,0.5,3.25,0.0,0.3,21.36,2.5,0.0,18.45
max,6.0,2022-05-15 20:33:18,2022-05-15 20:38:57,8.0,286259.8,99.0,4.0,700.0,9.05,17.1,466.0,235.7,0.3,843.36,2.75,1.25,60.0
std,0.4681243,,,0.9728462,574.8943,5.418762,0.4846125,11.43029,1.235186,0.0732884,2.825181,1.798,0.03935751,14.6504,0.6818401,0.3096991,10.11784


In [11]:
df['ride_id']

df_result = pd.DataFrame()

In [12]:
df_result['ride_id'] = df['ride_id']

In [13]:
df_result['y_pred'] = y_pred

In [14]:
output_file="yellow_tripdata_ride_2022-02"
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [15]:
df_result.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3539827 entries, 0 to 3627881
Data columns (total 2 columns):
 #   Column   Dtype  
---  ------   -----  
 0   ride_id  object 
 1   y_pred   float64
dtypes: float64(1), object(1)
memory usage: 81.0+ MB


In [16]:
df_result['ride_id']

0                2023/06_0
1                2023/06_1
2                2023/06_2
3                2023/06_3
4                2023/06_4
                ...       
3627877    2023/06_3627877
3627878    2023/06_3627878
3627879    2023/06_3627879
3627880    2023/06_3627880
3627881    2023/06_3627881
Name: ride_id, Length: 3539827, dtype: object

In [17]:
y_pred.mean()

12.758556818790902