In [211]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import mean_squared_error


In [230]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for duration modelling.

    Steps:
      1. Parse the pickup / drop-off timestamp columns.
      2. Add a ``duration`` column: drop-off minus pickup, in minutes.
      3. Keep only trips whose duration is between 1 and 60 minutes (inclusive).
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings.

    Parameters
    ----------
    filename
        Anything accepted by ``pandas.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows (original index labels preserved) with the extra
        ``duration`` column and string-typed location IDs.
    """
    df = pd.read_parquet(filename)
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorised timedelta -> minutes; avoids a per-row Python lambda.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so the column assignment
    # below writes to an independent frame instead of a slice of the original
    # (which would raise SettingWithCopyWarning).
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    df = df[in_range].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df

In [231]:
# Training frame comes from the 2021-01 file, validation frame from 2021-02.
df_train, df_val = map(
    read_dataframe,
    ('green_tripdata_2021-01.parquet', 'green_tripdata_2021-02.parquet'),
)

In [232]:
# Row counts of the (train, validation) frames after filtering.
tuple(map(len, (df_train, df_val)))

(73908, 61921)

In [233]:
# Show the prepared training frame (the rendered output is truncated to the
# first and last rows/columns), including the derived `duration` column.
df_train

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.50,...,0.5,0.00,0.00,,0.3,6.80,2.0,1.0,0.00,3.933333
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.00,...,0.5,2.81,0.00,,0.3,16.86,1.0,1.0,2.75,8.750000
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.00,...,0.5,1.00,0.00,,0.3,8.30,1.0,1.0,0.00,5.966667
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.00,...,0.5,0.00,0.00,,0.3,9.30,2.0,1.0,0.00,7.083333
7,2,2021-01-01 00:26:31,2021-01-01 00:28:50,N,1.0,75,75,6.0,0.45,3.50,...,0.5,0.96,0.00,,0.3,5.76,1.0,1.0,0.00,2.316667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,2,2021-01-31 21:38:00,2021-01-31 22:16:00,,,81,90,,17.63,56.23,...,0.0,0.00,6.12,,0.3,65.40,,,,38.000000
76514,2,2021-01-31 22:43:00,2021-01-31 23:21:00,,,35,213,,18.36,46.66,...,0.0,12.20,6.12,,0.3,65.28,,,,38.000000
76515,2,2021-01-31 22:16:00,2021-01-31 22:27:00,,,74,69,,2.50,18.95,...,0.0,0.00,0.00,,0.3,22.00,,,,11.000000
76516,2,2021-01-31 23:10:00,2021-01-31 23:37:00,,,168,215,,14.48,48.87,...,0.0,0.00,6.12,,0.3,58.04,,,,27.000000


In [235]:
# Turn the two location-ID columns into a feature matrix: the vectorizer is
# fitted on the training records only and then reused for validation.
dv=DictVectorizer()
categorical=['PULocationID','DOLocationID']
train_dict=df_train[categorical].to_dict(orient='records')
X_train=dv.fit_transform(train_dict)

val_dicts=df_val[categorical].to_dict(orient='records')
x_val=dv.transform(val_dicts)

# Regression target: trip duration.
target='duration'
y_train=df_train[target].values
y_val=df_val[target].values

# Plain linear regression baseline.
lr=LinearRegression()
lr.fit(X_train,y_train)
y_pred=lr.predict(x_val)

# Validation RMSE. Taking the square root of the MSE explicitly works on every
# scikit-learn version; the `squared=False` keyword was deprecated in 1.4 and
# removed in 1.6.
mean_squared_error(y_val,y_pred)**0.5
 

10.473736328214747

In [236]:
# Lasso regression (alpha=0.2) on the same training features.
lr=Lasso(alpha=0.2)
lr.fit(X_train,y_train)
y_pred=lr.predict(x_val)
# Validation RMSE. Taking the square root of the MSE explicitly works on every
# scikit-learn version; the `squared=False` keyword was deprecated in 1.4 and
# removed in 1.6.
mean_squared_error(y_val,y_pred)**0.5
 

11.913623603162394

In [237]:
# Ridge regression (alpha=10) on the same training features.
lr=Ridge(alpha=10)
lr.fit(X_train,y_train)
y_pred=lr.predict(x_val)
# Validation RMSE. Taking the square root of the MSE explicitly works on every
# scikit-learn version; the `squared=False` keyword was deprecated in 1.4 and
# removed in 1.6.
mean_squared_error(y_val,y_pred)**0.5
 

10.472802205331451

In [239]:
# Persist the fitted DictVectorizer and the current `lr` model together as a
# single pickled tuple, so both can be restored with one `pickle.load`.
# NOTE(review): in the cells visible here `lr` was last assigned a Ridge model,
# yet the file is named 'lasso.bin' -- confirm which model is meant to be saved
# and rename the file or refit accordingly.
import pickle
with open('lasso.bin','wb') as f_out:
    pickle.dump((dv,lr),f_out)