In [5]:
# Show the scikit-learn version reported by `pip freeze` on this machine.
!pip freeze | grep scikit-learn

scikit-learn==1.5.0


In [4]:
# Show the version of the `python` executable found on the shell PATH.
!python -V

Python 3.10.13


In [3]:
import pickle
import pandas as pd

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression

In [6]:
# Load the two objects stored in model.bin and unpack them as (dv, model).
# Later cells call dv.transform(...) and model.predict(...), so these are
# presumably a fitted vectorizer and a fitted regressor -- confirm against the
# code that produced model.bin.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load a model.bin that comes from a trusted source.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [7]:
# Columns that are cast to string categories by read_data.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Read a trips Parquet file and prepare it for scoring.

    Steps, in order:
      1. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      2. Keep only rows whose duration lies in [1, 60] minutes (inclusive).
      3. For the ``categorical`` columns, replace missing values with -1 and
         cast to int, then to str.

    Args:
        filename: Path (or anything ``pd.read_parquet`` accepts) of a file
            with ``tpep_pickup_datetime``, ``tpep_dropoff_datetime`` and the
            ``categorical`` columns.

    Returns:
        A new DataFrame holding the filtered rows; the original row index
        labels are preserved (the index is not reset).
    """
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta first, then converted to minutes.
    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends by default.
    within_range = trips.duration.between(1, 60)
    trips = trips[within_range].copy()

    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

In [8]:
# Load the trips file through read_data, which adds `duration`, filters rows
# by duration and string-encodes the categorical columns.
df = read_data('yellow_tripdata_2023-03.parquet')

In [15]:
# One {column: value} dict per row, built from the categorical columns only.
dicts = df[categorical].to_dict(orient='records')
# Encode the records with the unpickled dv (transform only, no fitting here).
X_val = dv.transform(dicts)
# Predictions from the unpickled model, one per row of df.
y_pred = model.predict(X_val)
# Observed durations (minutes, as computed in read_data) for the same rows.
y_val = df['duration'].values

In [21]:
# Standard deviation of the predicted values.
y_pred.std()

6.247488852238703

In [39]:
# Period used in the ride id prefix (and in the output file name later on).
year, month = 2023, 3

# Both columns are added to df itself, in this order: predictions first,
# then an id of the form "YYYY/MM_<index label>".
df["predicted_duration"] = y_pred
df["ride_id"] = f"{year:04d}/{month:02d}_" + df.index.astype("str")

# Keep only the id and the prediction for export.
df_result = df.loc[:, ["ride_id", "predicted_duration"]]
df_result.head()

Unnamed: 0,ride_id,predicted_duration
0,2023/03_0,16.245906
1,2023/03_1,26.134796
2,2023/03_2,11.884264
3,2023/03_3,11.99772
4,2023/03_4,10.234486


In [40]:
# Output name encodes the year and month, e.g. yellow_predict_2023-03.parquet.
output_file = f'yellow_predict_{year:04d}-{month:02d}.parquet'

# Written with pyarrow, uncompressed, and without the DataFrame index.
df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)

# Cell output: (rows, columns) of the full frame df, not of df_result.
df.shape

(3316216, 22)

In [44]:
# List the working directory (including hidden entries) with sizes in MiB.
%ls -la --block-size=M

total 120M
drwxrwxrwx+ 2 codespace codespace  1M Jun 17 22:24 [0m[34;42m.[0m/
drwxrwxrwx+ 7 codespace root       1M Jun 17 21:19 [34;42m..[0m/
-rw-rw-rw-  1 codespace codespace  1M Jun 17 21:20 model.bin
-rw-rw-rw-  1 codespace codespace  1M Jun 17 22:25 starter.ipynb
-rw-rw-rw-  1 codespace codespace 66M Jun 17 22:24 yellow_predict_2023-03.parquet
-rw-rw-rw-  1 codespace codespace 54M Jun 17 21:21 yellow_tripdata_2023-03.parquet


In [43]:
# Show the size, in MiB, of the predictions file written by this notebook.
%ls "yellow_predict_2023-03.parquet" -l --block-size=M

-rw-rw-rw- 1 codespace codespace 66M Jun 17 22:24 yellow_predict_2023-03.parquet


In [45]:
# Export this notebook as a Python script (written to starter.py).
!jupyter nbconvert --to script starter.ipynb

[NbConvertApp] Converting notebook starter.ipynb to script
[NbConvertApp] Writing 1708 bytes to starter.py


In [46]:
# Look for scikit-learn in `conda list`; the saved run of this cell printed
# nothing.
!conda list | grep scikit-learn

In [48]:
# Create the project's pipenv environment on Python 3.10.
# scikit-learn is pinned to 1.5.0, the version reported by `pip freeze` above.
# pyarrow is included because this notebook reads Parquet and writes with
# engine='pyarrow'; pandas does not install a Parquet engine by itself, so the
# exported script would fail with an ImportError in this environment without it.
# NOTE(review): pandas and pyarrow are left unpinned -- pin them once the
# working versions are known.
!pipenv install scikit-learn==1.5.0 pandas pyarrow --python=3.10


[1mCreating a virtualenv for this project...[0m
Pipfile: [33m[1m/workspaces/mlops/4_deployment/Pipfile[0m
[1mUsing[0m [33m[1m/usr/local/python/3.10.13/bin/python[0m [32m(3.10.13)[0m [1mto create virtualenv...[0m
[2K[32m⠇[0m Creating virtual environment...[36mcreated virtual environment CPython3.10.13.final.0-64 in 1035ms
  creator CPython3Posix(dest=/home/codespace/.local/share/virtualenvs/4_deployment-zoR2OdsI, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/home/codespace/.local/share/virtualenv)
    added seed packages: pip==24.0, setuptools==69.5.1, wheel==0.43.0
  activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator
[0m
✔ Successfully created virtual environment!
[2K[32m⠇[0m Creating virtual environment...
[1A[2K[32mVirtualenv location: /home/codespace/.local/share/virtualenvs/4_deployment-zoR2OdsI