In [1]:
import pickle
from pathlib import Path

import mlflow
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error

In [2]:
def prepare_the_data(url, max_rows=100_000):
    """Load a trip-record parquet file and prepare it for modelling.

    Steps: compute each trip's duration in minutes, keep trips lasting
    between 1 and 60 minutes (inclusive), keep at most ``max_rows`` of the
    remaining rows (in file order), and cast the pickup/drop-off location
    IDs to strings so they can be used as categorical features.

    Parameters
    ----------
    url : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.
        The file must contain ``tpep_pickup_datetime``,
        ``tpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.
    max_rows : int or None, default 100_000
        Maximum number of rows to return after filtering. ``None`` keeps
        every filtered row.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the original columns plus ``duration``
        (drop-off minus pickup) and ``duration_minutes`` (float).
    """
    df = pd.read_parquet(url)

    # Trip length as a time difference, then converted into minutes
    df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration_minutes'] = df['duration'].dt.total_seconds() / 60

    # Filter df by trip time, then truncate. The explicit copy detaches the
    # result from the raw frame, so the column assignment below (and any
    # column the caller adds later) does not trigger SettingWithCopyWarning.
    in_range = (df['duration_minutes'] >= 1) & (df['duration_minutes'] <= 60)
    df = df[in_range].iloc[:max_rows].copy()

    # Features defining
    categorical_features = ['PULocationID', 'DOLocationID']

    # Data Type preparation
    df[categorical_features] = df[categorical_features].astype(str)

    return df

In [3]:
def lr_training(df, model_path='models/simple_lin_reg.bin'):
    """Fit a linear-regression baseline for trip duration and save it to disk.

    The ``PULocationID`` / ``DOLocationID`` columns are turned into a feature
    matrix with a ``DictVectorizer`` and used to fit a ``LinearRegression``
    against ``duration_minutes``. The fitted
    ``(DictVectorizer, LinearRegression)`` tuple is pickled to ``model_path``.

    Side effect: a ``y_pred`` column with the in-sample predictions is added
    to (or overwritten in) ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PULocationID``, ``DOLocationID`` and
        ``duration_minutes``.
        NOTE(review): the location IDs are presumably already strings (as
        produced by ``prepare_the_data``) so that they are encoded as
        categories rather than numbers - confirm for other callers.
    model_path : str or path-like, default 'models/simple_lin_reg.bin'
        Where the pickled ``(dv, lr)`` tuple is written. Missing parent
        directories are created.

    Returns
    -------
    float
        Mean absolute error of the model on the data it was fitted on.
    """
    features = df[['PULocationID', 'DOLocationID']]
    y_true = df['duration_minutes']
    features_dict = features.to_dict(orient="records")

    # Fit a dictionary vectorizer
    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform(features_dict)

    lr = LinearRegression()
    lr.fit(X, y_true)

    df['y_pred'] = lr.predict(X)

    # Create the target directory first so a fresh checkout (no `models/`
    # folder yet) does not fail with FileNotFoundError.
    model_file = Path(model_path)
    model_file.parent.mkdir(parents=True, exist_ok=True)
    with model_file.open('wb') as f_out:
        pickle.dump((dv, lr), f_out)

    return mean_absolute_error(y_true, df['y_pred'])

In [4]:
# Training set: the June 2024 yellow-taxi trip file, filtered and truncated by prepare_the_data
df_train = prepare_the_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-06.parquet')

In [5]:
# Sanity check: (rows, columns); the row count is limited by the cap applied in prepare_the_data
df_train.shape

(100000, 21)

In [6]:
# Fit and save the baseline model; the displayed value is its MAE (in minutes) on the training data itself
lr_training(df_train)

6.0781997096443

In [7]:
# Validation set: the following month (July 2024), prepared the same way, to evaluate the saved model on unseen trips
df_valid = prepare_the_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-07.parquet')

In [8]:
# Sanity check: (rows, columns); should have the same column count as the training frame
df_valid.shape

(100000, 21)

In [9]:
def prediction(df, model_path='models/simple_lin_reg.bin'):
    """Score a prepared trip frame with the saved linear-regression model.

    Loads the pickled ``(DictVectorizer, LinearRegression)`` tuple from
    ``model_path``, transforms the ``PULocationID`` / ``DOLocationID``
    columns with the already fitted vectorizer and predicts the duration.

    Side effect: a ``y_pred`` column with the predictions is added to (or
    overwritten in) ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PULocationID``, ``DOLocationID`` and
        ``duration_minutes``.
    model_path : str or path-like, default 'models/simple_lin_reg.bin'
        Pickle file holding the ``(dv, lr)`` tuple to load.

    Returns
    -------
    float
        Mean absolute error between ``duration_minutes`` and the predictions.
    """
    # Load the model and DictVectorizer. The file is only needed for the
    # load itself, so the handle is closed before any prediction work.
    # NOTE(review): pickle.load can execute arbitrary code - only point
    # model_path at files produced by trusted code.
    with open(model_path, 'rb') as f_in:
        dv, lr = pickle.load(f_in)

    features = df[['PULocationID', 'DOLocationID']]
    y_true = df['duration_minutes']
    features_dict = features.to_dict(orient="records")

    # Use the already fitted DictVectorizer
    X = dv.transform(features_dict)

    df['y_pred'] = lr.predict(X)

    return mean_absolute_error(y_true, df['y_pred'])

In [10]:
# MAE (in minutes) of the saved June-trained model on the July validation trips
prediction(df_valid)

6.375235023435177