In [None]:
import pandas as pd

# Directory holding the trip-data parquet files; the single place to edit
# when the data lives somewhere else.
DATA_DIR = "/home/pastor/projects/mlops-zoomcamp/data"

# Yellow trip-data files for 2023-01 and 2023-02.
data_jan = f"{DATA_DIR}/yellow_tripdata_2023-01.parquet"
data_feb = f"{DATA_DIR}/yellow_tripdata_2023-02.parquet"

In [None]:
# Load the January 2023 parquet file into a DataFrame.
df = pd.read_parquet(data_jan)

## Q1: How many columns does the January 2023 data have?

In [None]:
# Number of columns in the January frame.
df.shape[1]

## Q2: Compute the trip duration (in minutes) and its standard deviation

In [None]:
# Trip duration in minutes: drop-off time minus pick-up time.
df['duration'] = (
    (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

In [None]:
# Standard deviation of the trip duration (minutes), before any filtering.
df['duration'].std()

## Q3 Dropping outliers

In [None]:
# Summary statistics of the duration column, used to spot outliers.
df['duration'].describe()

In [None]:
# Fraction of trips whose duration lies in [1, 60] minutes (both ends inclusive).
df['duration'].between(1, 60).mean()

In [None]:
# Keep only trips lasting between 1 and 60 minutes (inclusive).
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so later column assignments on it do not raise SettingWithCopyWarning.
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

## Q4 One-Hot encoding

In [None]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Pickup / drop-off location-ID columns, used as categorical features.
categorical = ['PULocationID', 'DOLocationID']
# Numerical feature column; not referenced by the cells visible here.
numerical = ['trip_distance']

In [None]:
# Cast the location IDs to strings so that DictVectorizer one-hot encodes
# them as categories instead of treating them as numeric values.
df[categorical] = df[categorical].astype('str')

In [None]:
# One-hot encode the location IDs: each row becomes a {column: value} dict,
# which the vectorizer then turns into a feature matrix.
dv = DictVectorizer()
train_dicts = df[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

In [None]:
# Dimensionality of the encoded training matrix (its number of columns).
# Only the last expression of a cell is displayed, so the former bare
# `X_train` line above it had no effect and has been dropped.
X_train.shape[1]

## Q5 Training a model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Regression target: the trip duration in minutes, as a NumPy array.
target = 'duration'
y_train = df[target].values

In [None]:
# Fit a linear regression on the one-hot encoded training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
# Predict on the same matrix the model was fitted on (training-set predictions).
y_pred = lr.predict(X_train)

In [None]:
from sklearn.metrics import root_mean_squared_error

In [None]:
# Root-mean-squared error between the training targets and the predictions.
root_mean_squared_error(y_train, y_pred)

## Q6 Evaluating the model

In [None]:
def read_dataframe(filename):
    """Load a trip-data parquet file and prepare it for modelling.

    Steps:
      1. read the parquet file at ``filename``;
      2. add a ``duration`` column (drop-off minus pick-up, in minutes);
      3. keep only rows with a duration between 1 and 60 minutes inclusive;
      4. cast ``PULocationID`` and ``DOLocationID`` to strings.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the extra ``duration`` column and
        string-typed location-ID columns.
    """
    df = pd.read_parquet(filename)

    # Trip duration, converted from a timedelta to minutes.
    df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = df['duration'].dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the unfiltered frame; without
    # it the column assignment below operates on a slice and pandas emits
    # SettingWithCopyWarning.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']

    df[categorical] = df[categorical].astype('str')
    return df

In [None]:
# The January file becomes the training frame, the February file the validation frame.
df_train, df_val = (read_dataframe(path) for path in (data_jan, data_feb))

In [None]:
# Target arrays (trip duration) for the training and validation frames.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [None]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Turn every row into a {column: value} dict for the vectorizer.
train_dicts = df_train[categorical].to_dict(orient='records')
val_dicts = df_val[categorical].to_dict(orient='records')

# Fit the encoder on the training rows only; the validation rows are
# transformed with the already-fitted encoder.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [None]:
# Fit on the training matrix, then predict durations for the validation matrix.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_val)

In [None]:
# Root-mean-squared error between the validation targets and the predictions.
root_mean_squared_error(y_val, y_pred)