In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
pd.__version__

'2.1.4'

In [3]:
df_jan_2023 = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

In [4]:
df_feb_2023 = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [5]:
print('Total columns in Jan 2023 yellow cab data is' , len(df_jan_2023.columns))

Total columns in Jan 2023 yellow cab data is 19


In [6]:
#for timedelta type, use dt accessor
#https://stackoverflow.com/questions/70554811/the-difference-between-pandas-timedelta-and-timedelta64ns
df_jan_2023['duration'] = (df_jan_2023.tpep_dropoff_datetime - df_jan_2023.tpep_pickup_datetime).dt.total_seconds()/60
print('Standard deviation of the trips duration in January' , np.std(df_jan_2023['duration']))

Standard deviation of the trips duration in January 42.5943442974141


In [7]:
#By default, includes boundaries 
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.between.html
df_jan_2023_no_outliers = df_jan_2023[df_jan_2023['duration'].between(1, 60)]
print('Fraction of the records left after dropping the outliers:', len(df_jan_2023_no_outliers)/len(df_jan_2023))

Fraction of the records left after dropping the outliers: 0.9812202822125979


In [8]:
categorical = ['PULocationID', 'DOLocationID']
df_jan_2023_no_outliers[categorical] = df_jan_2023_no_outliers[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_jan_2023_no_outliers[categorical] = df_jan_2023_no_outliers[categorical].astype(str)


In [9]:
train_dicts = df_jan_2023_no_outliers[categorical].to_dict(orient='records')

In [10]:
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [11]:
print('Dimensionality(no of columns) of the matrix: ', X_train.shape[1])

Dimensionality(no of columns) of the matrix:  515


In [12]:
target = 'duration'
y_train = df_jan_2023_no_outliers[target].values

In [13]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [14]:
y_pred = lr.predict(X_train)

In [15]:
print('rmse train:',mean_squared_error(y_train, y_pred, squared=False))

rmse train: 7.649261930819891


In [16]:
del df_jan_2023
del X_train

In [17]:
df_feb_2023['duration'] = (df_feb_2023.tpep_dropoff_datetime - df_feb_2023.tpep_pickup_datetime).dt.total_seconds()/60
df_feb_2023_no_outliers = df_feb_2023[df_feb_2023['duration'].between(1, 60)]
categorical = ['PULocationID', 'DOLocationID']
df_feb_2023_no_outliers[categorical] = df_feb_2023_no_outliers[categorical].astype(str)
val_dicts = df_feb_2023_no_outliers[categorical].to_dict(orient='records')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feb_2023_no_outliers[categorical] = df_feb_2023_no_outliers[categorical].astype(str)


In [18]:
del df_feb_2023

In [19]:
X_val = dv.transform(val_dicts)
target = 'duration'
y_val = df_feb_2023_no_outliers[target].values

In [20]:
y_pred_val = lr.predict(X_val)
print('rmse val:', mean_squared_error(y_val, y_pred_val, squared=False))

rmse val: 7.811817675774269


In [22]:
#capture model for later use
import pickle
with open('model.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)