In [1]:
!pip freeze | grep scikit-learn

scikit-learn==1.1.1


In [2]:
import pickle
import pandas as pd

In [3]:
with open('model.bin', 'rb') as f_in:
    dv, lr = pickle.load(f_in)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
categorical = ['PUlocationID', 'DOlocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

In [6]:
df = read_data('https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet')

In [7]:
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = lr.predict(X_val)

In [8]:
y_pred.mean()

16.191691679979066

In [9]:
from datetime import datetime
year= datetime.today().year
month=datetime.today().month

In [10]:
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [11]:
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration,ride_id
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173,82,,B00021,10.666667,2022/06_1
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173,56,,B00021,14.566667,2022/06_2
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82,129,,B00021,7.95,2022/06_3
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,-1,225,,B00037,13.8,2022/06_4
5,B00037,2021-02-01 00:00:37,2021-02-01 00:09:35,-1,61,,B00037,8.966667,2022/06_5


In [12]:
df["prediction"]=y_pred

In [13]:
df_result=df[["ride_id","prediction"]]

In [14]:
df_result.to_parquet(
    "df_result",
    engine='pyarrow',
    compression=None,
    index=False
)

In [15]:
!ls -ltr

total 19280
-rw-r--r-- 1 saiteja saiteja     2360 Jun 28 22:22 starter.ipynb
-rw-r--r-- 1 saiteja saiteja    17760 Jun 28 22:22 model.bin
-rw-r--r-- 1 saiteja saiteja       72 Jun 28 22:22 Dockerfile
-rw-r--r-- 1 saiteja saiteja 19711435 Jun 28 22:45 df_result


In [16]:
def data_with_date(month,year):
    datetime_object1 = datetime.strptime(month,'%B')
    datetime_object2 = datetime.strptime(year,'%Y')    
    filename= "https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_"+ str(datetime_object2.year) + "-" + str(datetime_object1.month).zfill(2)+".parquet"
    df = read_data(filename)
    return df

In [17]:
def mean_predicted_duration(df):
    dicts = df[categorical].to_dict(orient='records')
    X_val = dv.transform(dicts)
    y_pred = lr.predict(X_val)
    return(y_pred.mean())

In [18]:
!jupyter nbconvert starter.ipynb --to script

[NbConvertApp] Converting notebook starter.ipynb to script
[NbConvertApp] Writing 860 bytes to starter.py


"scikit-learn": {
            "hashes": [
                "sha256:08ef968f6b72033c16c479c966bf37ccd49b06ea91b765e1cc27afefe723920b",
                "sha256:158faf30684c92a78e12da19c73feff9641a928a8024b4fa5ec11d583f3d8a87",

In [23]:
"sha256:08ef968f6b72033c16c479c966bf37ccd49b06ea91b765e1cc27afefe723920b"

'sha256:08ef968f6b72033c16c479c966bf37ccd49b06ea91b765e1cc27afefe723920b'

In [26]:
!python starter.py March 2021

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
March 2021
16.298821614015107


In [27]:
month="March"
year="2021"
df_new=data_with_date(month,year)
mean_predicted_duration(df_new)

16.298821614015107

In [None]:
docker build -t mlops-zoomcamp-sai-homework-week4:v1 .

In [29]:
!docker run -it mlops-zoomcamp-sai-homework-week4:v1

April 2021
9.967573179784523


In [31]:
!docker tag mlops-zoomcamp-sai-homework-week4:v1 agrigorev/mlops-zoomcamp-sai-homework-week4:v1

In [32]:
!docker push agrigorev/mlops-zoomcamp-sai-homework-week4:v1

The push refers to repository [docker.io/agrigorev/mlops-zoomcamp-sai-homework-week4]

[1Bff5b9576: Preparing 
[1B86e66a42: Preparing 
[1Ba6b530f3: Preparing 
[1B3c5d65df: Preparing 
[1B6733ae32: Preparing 
[1Be81be153: Preparing 
[1B4c1b2331: Preparing 
[1B57ad7c90: Preparing 
[1Bffa0f231: Preparing 
[1Bf2f3031a: Preparing 
[1B9e19fc12: Preparing 
[1B579f0c8c: Preparing 
[1B81af66dd: Preparing 
[1Beb12ee3f: Preparing 
[1B205045a6: Preparing 
[1B6c64b82c: Preparing 
[1Bedeb736e: Preparing 
[1Bce0964b8: Preparing 
[11Bdenied: requested access to the resource is denied
