In [2]:
# Record the version of the `python` executable found on the shell PATH
# (IPython `!` shell escape), for reproducibility.
!python -V

Python 3.9.18


In [2]:
import pandas as pd
import pickle
import seaborn as sns
from sklearn import feature_extraction
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
import pyarrow.parquet as pq
import requests

In [10]:
# Download the file
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-11.parquet"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail right here on an HTTP error (e.g. 403/404) instead of saving the error
# page under a ".parquet" name and getting a confusing failure from
# pd.read_parquet further down.
response.raise_for_status()
file_path = "green_tripdata_2023-11.parquet"

# Persist the raw bytes next to the notebook; the context manager closes the
# file before pandas reads it back.
with open(file_path, "wb") as file:
    file.write(response.content)

# Read the Parquet file
df = pd.read_parquet(file_path)

# Display the dataframe
df.head()


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2023-11-01 00:56:15,2023-11-01 00:59:34,N,1.0,24,238,1.0,0.73,6.5,1.0,0.5,1.8,0.0,,1.0,10.8,1.0,1.0,0.0
1,2,2023-11-01 00:25:06,2023-11-01 00:30:59,N,1.0,74,75,1.0,1.53,9.3,1.0,0.5,2.36,0.0,,1.0,14.16,1.0,1.0,0.0
2,2,2023-11-01 00:40:11,2023-11-01 00:44:08,N,1.0,74,74,1.0,0.7,6.5,1.0,0.5,0.0,0.0,,1.0,9.0,1.0,1.0,0.0
3,2,2023-11-01 00:22:16,2023-11-01 00:33:06,N,1.0,129,82,1.0,1.12,10.7,1.0,0.5,0.0,0.0,,1.0,13.2,2.0,1.0,0.0
4,2,2023-11-01 00:30:48,2023-11-01 00:34:37,N,1.0,82,82,1.0,0.85,6.5,1.0,0.5,0.0,0.0,,1.0,9.0,2.0,1.0,0.0


In [14]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64025 entries, 0 to 64024
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               64025 non-null  int32         
 1   lpep_pickup_datetime   64025 non-null  datetime64[ns]
 2   lpep_dropoff_datetime  64025 non-null  datetime64[ns]
 3   store_and_fwd_flag     59315 non-null  object        
 4   RatecodeID             59315 non-null  float64       
 5   PULocationID           64025 non-null  int32         
 6   DOLocationID           64025 non-null  int32         
 7   passenger_count        59315 non-null  float64       
 8   trip_distance          64025 non-null  float64       
 9   fare_amount            64025 non-null  float64       
 10  extra                  64025 non-null  float64       
 11  mta_tax                64025 non-null  float64       
 12  tip_amount             64025 non-null  float64       
 13  t

In [15]:
# List the column labels of the frame.
# NOTE(review): the recorded output already contains 'duration', which is only
# created in the next cell -- the saved output appears to come from an
# out-of-order run; confirm with Restart Kernel -> Run All.
df.columns

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge',
       'duration'],
      dtype='object')

In [16]:
# Trip duration as a timedelta: drop-off timestamp minus pick-up timestamp.
df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
# Preview the first few durations.
df['duration'].head()

0   0 days 00:03:19
1   0 days 00:05:53
2   0 days 00:03:57
3   0 days 00:10:50
4   0 days 00:03:49
Name: duration, dtype: timedelta64[ns]

In [17]:
# Convert the duration to minutes.
# The value is recomputed from the two timestamp columns (rather than read
# back from the existing 'duration' column) so this cell can be re-run safely:
# once 'duration' holds floats, calling .total_seconds() on it again would
# raise AttributeError.
# The vectorized .dt accessor replaces the per-row Python lambda.
trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
df['duration'] = trip_time.dt.total_seconds() / 60
df.duration.head()

0     3.316667
1     5.883333
2     3.950000
3    10.833333
4     3.816667
Name: duration, dtype: float64

In [18]:
# Keep only trips lasting at least 1 minute and less than 60 minutes
# (1 <= duration < 60); everything outside that window is dropped.
# .copy() makes the result an independent frame, so assigning columns to it
# later does not raise pandas' SettingWithCopyWarning.
df = df[(df.duration >= 1) & (df.duration < 60)].copy()
df.shape

(61094, 21)

In [19]:
# Feature columns, grouped by how they are treated downstream.
# Both must stay plain lists: they are concatenated with `+` when the
# feature frame is selected.
categorical = [
    'PULocationID',
    'DOLocationID',
]
numerical = [
    'trip_distance',
]

In [21]:
# Store the columns listed in `categorical` as strings.
# DataFrame.astype with a {column: dtype} mapping returns a new frame, so this
# avoids assigning into what may be a filtered slice of the original data
# (which triggers SettingWithCopyWarning) and is safe to re-run.
df = df.astype({col: 'str' for col in categorical})

In [22]:
# One dict per row, holding only the categorical and numerical feature columns.
train_dicts = df.loc[:, [*categorical, *numerical]].to_dict(orient='records')

In [23]:
# Inspect the first record to check the feature dict layout.
train_dicts[0]

{'PULocationID': '24', 'DOLocationID': '238', 'trip_distance': 0.73}

In [24]:
# Name of the column the model will predict.
target = 'duration'

# Learn the feature vocabulary from the training dicts and encode them
# in a single pass.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dicts)

# Target values as a NumPy array, row-aligned with x_train.
y_train = df[target].values

In [25]:
# Display the target array (NumPy abbreviates long arrays in the repr).
y_train

array([ 3.31666667,  5.88333333,  3.95      , ..., 24.        ,
        5.        ,  5.        ])