In [1]:
# Record the installed scikit-learn version next to the results.
# NOTE(review): `!pip` runs whichever pip the shell resolves, which may not be the
# kernel's environment; `%pip` targets the kernel's environment. Confirm they match.
!pip freeze | grep scikit-learn

scikit-learn==1.0.2


In [3]:
import os
import pickle
from pathlib import Path

import pandas as pd

In [4]:
# Restore the two objects stored together in model.bin: `dv` (used below to
# transform feature dicts) and `lr` (used below to predict).
# pickle.load can execute arbitrary code, so only load files from a trusted source.
with Path('model.bin').open('rb') as model_file:
    dv, lr = pickle.load(model_file)

In [5]:
# Columns treated as categorical features throughout the notebook.
categorical = ['PUlocationID', 'DOlocationID']


def read_data(filename, categorical_columns=None):
    """Load a trip parquet file and prepare it for scoring.

    Steps:
      1. Read ``filename`` with ``pd.read_parquet``.
      2. Add a ``duration`` column: ``dropOff_datetime - pickup_datetime``
         expressed in minutes.
      3. Keep only rows whose duration is between 1 and 60 minutes, inclusive.
      4. Replace missing values in the categorical columns with -1 and convert
         them to strings (via int, so ``108.0`` becomes ``'108'``).

    Parameters
    ----------
    filename : str or path-like
        Parquet file to read. It must contain ``pickup_datetime``,
        ``dropOff_datetime`` and the categorical columns.
    categorical_columns : list of str, optional
        Columns to encode as strings. Defaults to the module-level
        ``categorical`` list, which preserves the original behaviour.

    Returns
    -------
    pandas.DataFrame
        A filtered copy that keeps the original index labels of the surviving rows.
    """
    if categorical_columns is None:
        categorical_columns = categorical
    categorical_columns = list(categorical_columns)

    df = pd.read_parquet(filename)

    # Trip length in minutes.
    elapsed = df['dropOff_datetime'] - df['pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Series.between is inclusive on both ends by default; rows with a missing
    # duration compare False and are dropped, exactly as with >= / <=.
    df = df[df['duration'].between(1, 60)].copy()

    # Missing location ids become the sentinel '-1'.
    df[categorical_columns] = (
        df[categorical_columns].fillna(-1).astype('int').astype('str')
    )

    return df

In [7]:
# Directory holding the raw FHV trip files. It defaults to the original local
# location; set the NY_TLC_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = Path(os.environ.get('NY_TLC_DATA_DIR', '/Users/rgareev/data/ny-tlc/src'))

# Trips from the 2021-02 file, cleaned by read_data.
df = read_data(DATA_DIR / 'fhv_tripdata_2021-02.parquet')

In [8]:
# One {column: value} dict per row, built from the categorical columns only.
dicts = df[categorical].to_dict(orient='records')
# Encode the dicts with the unpickled `dv`, then score them with the unpickled `lr`.
# NOTE(review): presumably a fitted scikit-learn DictVectorizer and a regression
# model; confirm against the code that produced model.bin.
X_val = dv.transform(dicts)
y_pred = lr.predict(X_val)

In [9]:
# Mean of all predictions.
# NOTE(review): presumably minutes, matching how read_data computes `duration`; confirm.
y_pred.mean()

16.191691679979066

# Q2: Write the predictions to a parquet file

In [10]:
# Attach the predictions to df (this mutates df in place). y_pred is in df's row
# order because it was computed from df[categorical].
df['predicted_duration'] = y_pred

In [11]:
# Build a ride identifier of the form 'YYYY/MM_<index label>' from the pickup
# timestamp and the frame's index. The prefix is formatted with the vectorised
# .dt.strftime accessor rather than a per-row Python lambda; for valid
# timestamps it yields the same zero-padded strings, without a Python-level loop.
df['ride_id'] = df['pickup_datetime'].dt.strftime('%Y/%m_') + df.index.astype('str')

In [12]:
# Preview ten random rows. The fixed seed keeps the preview identical across
# Restart Kernel -> Run All instead of changing on every execution.
df.sample(10, random_state=42)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration,predicted_duration,ride_id
649751,B00112,2021-02-18 18:06:57,2021-02-18 18:33:06,-1,14,,B00112,26.15,13.910644,2021/02_649751
1010652,B02536,2021-02-28 00:06:38,2021-02-28 00:16:02,108,-1,,B02536,9.4,20.235869,2021/02_1010652
290967,B00160,2021-02-09 15:24:00,2021-02-09 15:31:00,-1,-1,,B00160,7.0,23.052085,2021/02_290967
259543,B01509,2021-02-08 21:53:28,2021-02-08 22:04:00,-1,29,,B01509,10.533333,19.562814,2021/02_259543
16963,B01231,2021-02-02 10:29:27,2021-02-02 10:39:21,-1,17,,B02682,9.9,11.536334,2021/02_16963
192921,B01051,2021-02-06 22:57:42,2021-02-06 23:09:03,-1,47,,B01051,11.35,13.300669,2021/02_192921
396035,B01233,2021-02-12 06:39:23,2021-02-12 07:00:41,-1,174,,B01233,21.3,14.787019,2021/02_396035
778875,B01231,2021-02-22 12:03:01,2021-02-22 12:04:02,-1,217,,B02918,1.016667,7.924538,2021/02_778875
49463,B01231,2021-02-03 10:36:43,2021-02-03 10:47:41,-1,17,,B01231,10.966667,11.536334,2021/02_49463
143330,B01239,2021-02-05 13:45:50,2021-02-05 13:50:01,-1,51,,B02872,4.183333,12.448414,2021/02_143330


In [13]:
# Folder that receives the prediction file. Missing parent folders are created
# as needed, and re-running the cell is harmless when the folder already exists.
output_dir = Path('./wrk')
output_dir.mkdir(parents=True, exist_ok=True)

In [14]:
# Full path of the parquet file written below.
output_path = output_dir.joinpath('predictions.parquet')

In [15]:
# Confirm the dtypes of the two columns about to be written.
df.dtypes.loc[['ride_id', 'predicted_duration']]

ride_id                object
predicted_duration    float64
dtype: object

In [16]:
# Persist only the identifier and the prediction.
df[['ride_id', 'predicted_duration']].to_parquet(
    output_path,
    engine='pyarrow',
    compression=None,  # write the file uncompressed
    index=False  # leave the DataFrame index out of the file
)

In [17]:
# List the output directory to inspect the size of the written parquet file.
!ls -al wrk

total 38920
drwxr-xr-x  3 rgareev  staff        96 Jun 26 22:51 .
drwxr-xr-x  8 rgareev  staff       256 Jun 27 11:16 ..
-rw-r--r--  1 rgareev  staff  19711507 Jun 27 11:17 predictions.parquet
