# Homework 4

In [None]:
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Location-ID columns that are used as categorical model features.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip parquet file and prepare it for the duration model.

    Adds a ``duration`` column (minutes between pickup and drop-off), keeps
    only trips lasting between 1 and 60 minutes inclusive, and converts the
    ``categorical`` columns to strings, with missing IDs encoded as ``'-1'``.

    Returns the filtered dataframe; the original row labels are preserved.
    """
    trips = pd.read_parquet(filename)

    # Trip length in minutes, computed from the two timestamp columns.
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Keep trips whose duration lies in [1, 60] minutes (both ends inclusive).
    plausible = trips['duration'].between(1, 60)
    trips = trips.loc[plausible].copy()

    # Missing IDs become -1 first so the integer cast cannot fail on NaN.
    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

In [None]:
# Train a linear duration model on one month of yellow-taxi trips, validate it
# on another month, and pickle the fitted (vectorizer, model) pair.
# NOTE(review): the training file is 2024-01 while the validation file is
# 2023-02 -- confirm that the year mismatch is intentional.
df_train = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet')
df_val = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

dv = DictVectorizer()

# Fit the vectorizer on the training records only; the validation records are
# transformed with the vocabulary learned here.
train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# RMSE is computed as sqrt(MSE) because the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
# It is printed explicitly: a bare expression in the middle of a cell is not
# displayed by the notebook.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(rmse)

# Persist both objects together so scoring applies the same feature encoding.
with open('model.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

## Q1. Notebook

We'll start with the same notebook we ended up with in homework 1. We cleaned it a little bit and kept only the scoring part. You can find the initial notebook here. Run this notebook for the March 2023 data.

What's the standard deviation of the predicted duration for this dataset?
* **6.24**

In [None]:
# Parameters of the scoring run: which month of which taxi dataset to score.
year = 2023
month = 3
taxi_type = "yellow"

# Both paths are derived from the same three parameters so that the input URL
# and the output name cannot drift apart when taxi_type changes.
# NOTE(review): read_data reads the tpep_* timestamp columns, so other taxi
# types may need a different loader -- confirm before changing taxi_type.
input_file = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{taxi_type}_tripdata_{year:04d}-{month:02d}.parquet"
output_file = f"{taxi_type}_{year:04d}-{month:02d}.parquet"

In [None]:
# Load the pickled pair from model.bin: the first element is bound to `dv`
# (used below to transform feature dicts), the second to `model` (used to predict).
# NOTE: unpickling can execute arbitrary code -- only load a model.bin that
# comes from a trusted source.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [None]:
def apply_model(input_file):
    """Score the trips stored in ``input_file`` with the loaded model.

    The file is read through ``read_data``, the ``categorical`` columns are
    turned into feature dicts and transformed with the module-level ``dv``,
    and the module-level ``model`` produces the predictions. The standard
    deviation of the predictions is printed as a side effect.

    Returns a tuple ``(prepared dataframe, predictions)``.
    """
    trips = read_data(input_file)
    features = dv.transform(trips[categorical].to_dict(orient='records'))
    predictions = model.predict(features)
    print(np.std(predictions))
    return trips, predictions

In [None]:
# Score the configured input file; keep the prepared trips and their
# predictions for the cells below.
df, y_pred = apply_model(input_file)

## Q2. Preparing the output

Like in the course videos, we want to prepare the dataframe with the output.

First, let's create an artificial ride_id column.

Next, write the ride id and the predictions to a dataframe with results. Save it as parquet.
    
What's the size of the output file?
* **66M**

In [None]:
# Show the first rows of the prepared trips dataframe.
df.head()

In [None]:
def save_results(df, y_pred, year, month, output_path=None):
    """Write ride ids and predictions to a parquet file.

    Args:
        df: Prepared trips dataframe. A ``ride_id`` column is added to it
            in place.
        y_pred: Predictions, one per row of ``df`` in the same order.
        year: Year used in the ``ride_id`` prefix.
        month: Month used in the ``ride_id`` prefix.
        output_path: Destination file. When omitted, the module-level
            ``output_file`` is used, which matches the previous behaviour.
    """
    if output_path is None:
        output_path = output_file

    # ride_id is "<year>/<month>_<row label>"; the row label comes from
    # df.index, so it is whatever labels the dataframe carries.
    df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

    df_result = pd.DataFrame({
        'ride_id': df['ride_id'],
        'predicted': y_pred,
    })

    # Written uncompressed and without the index column.
    df_result.to_parquet(
        output_path,
        engine='pyarrow',
        compression=None,
        index=False
    )

In [None]:
# Save the ride ids and predictions for the configured year and month.
save_results(df, y_pred, year, month)

## Q3. Creating the scoring script

Now let's turn the notebook into a script.

Which command do you need to execute for that?

`jupyter nbconvert --to script homework.ipynb --output score`

In [None]:
# Convert homework.ipynb to a script, using `score` as the output name.
!jupyter nbconvert --to script homework.ipynb --output score

## Q4. Virtual environment

Now let's put everything into a virtual environment. We'll use pipenv for that.

Install all the required libraries. Pay attention to the Scikit-Learn version: it should be the same as in the starter notebook.

After installing the libraries, pipenv creates two files: Pipfile and Pipfile.lock. The Pipfile.lock file keeps the hashes of the dependencies we use for the virtual env.

What's the first hash for the Scikit-Learn dependency?

`"sha256": "8f34726ce5eb22cfa6dce50b2608b1ce222e1bddc222513191a091e019f7517f"`

## Q5. Parametrize the script

Let's now make the script configurable via CLI. We'll create two parameters: year and month. Run the script for April 2023.

What's the mean predicted duration?
* **14.29**

## Q6. Docker container

Finally, we'll package the script in the docker container. For that, you'll need to use a base image that we prepared.

This image already has a pickle file with a dictionary vectorizer and a model. You will need to use them.

Important: don't copy the model to the docker image. You will need to use the pickle file already in the image.

Now run the script with docker. What's the mean predicted duration for May 2023?
* 0.19
* 7.24
* 14.24
* 21.19