## Import libraries and packages

In [1]:
# Import libraries and packages
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

## 1. Download and read yellow taxi trips data for January 2023

In [2]:
# Load the January 2023 yellow taxi trip records from the remote parquet file
df = pd.read_parquet(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
)

# Preview the first five rows
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [3]:
# Overall dimensions as a (rows, columns) tuple
print(df.shape)
# Per-column summary (column names and dtypes)
df.info()
# Restate the column count (second element of the shape tuple) in a sentence
print(f"The number of columns for yellow taxi trip records data are {df.shape[1]}")

(3066766, 19)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_

### The number of columns in the yellow taxi trip records data for Jan 2023 is 19

## 2. Computing trips duration
* Calculate the trip duration by subtracting pickup time from drop-off time, and then convert them to minutes.

In [4]:
# Trip duration in minutes: drop-off time minus pickup time.
# The vectorised `.dt.total_seconds()` accessor replaces a per-row Python lambda,
# which is needlessly slow on millions of rows and yields the same values.
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60

# Print the shape of the DataFrame (one more column than before: 'duration')
print(df.shape)

# Remember the row count before any filtering; the outlier step compares against it
total_observations = df.shape[0]

# Print the total number of observations
print(f"The total number of observations is {total_observations}")

# Display the first few rows of the DataFrame
df.head()

(3066766, 20)
The total number of observations is 3066766


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,8.433333
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,6.316667
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0,12.75
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25,9.616667
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0,10.833333


In [5]:
# Standard deviation of the trip durations (in minutes) for January 2023.
# `Series.std()` returns the same sample standard deviation that `describe()` reports,
# without also computing the count, mean, min, max and quantiles.
std = df['duration'].std()
print(f"The standard deviation of the trip durations in January 2023 is {round(std, 2)}")

The standard deviation of the trip durations in January 2023 is 42.59


### The standard deviation of the trips duration in January 2023 is 42.59

## 3. Remove the outliers
* Drop the outliers by keeping only the trip records with durations between 1 and 60 minutes (inclusive).

In [6]:
# Keep only the trips whose duration is between 1 and 60 minutes (inclusive).
# `.copy()` makes the result an independent frame, so assigning columns to it
# later does not raise pandas' SettingWithCopyWarning.
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

# Number of observations left after dropping the outliers
working_observations = df.shape[0]

# Print the number of working observations
print(f"The number of observations without outliers is {working_observations}")

# Share of the original records that survived the filter, expressed as a percentage
percentage = (working_observations / total_observations) * 100

# Print the share of the records left after dropping the outliers
print(f"The fraction of the records left after dropping the outliers is {round(percentage, 2)}%")

The number of observations without outliers is 3009173
The fraction of the records left after dropping the outliers is 98.12%


### The fraction of the records left after removing the outliers is 98%

## 4. One-hot encoding
* Select the two features; pickup and dropoff location IDs
* Convert the dataframe into a list of dictionaries
* Fit a dictionary vectorizer
* Get a feature matrix

In [7]:
# The pickup and drop-off location IDs are the only model inputs at this stage
categorical = ['PULocationID', 'DOLocationID']

# Cast the IDs to strings so that DictVectorizer treats them as categories
# (one indicator column per distinct value) rather than as numbers
df[categorical] = df[categorical].astype(str)

# One dictionary per trip, keyed by column name
train_dicts = df[categorical].to_dict('records')

# Instantiate the vectorizer, then fit it and encode the trips in a single call
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Report the size of the resulting feature matrix
print(f"The shape of the feature matrix is {X_train.shape}")
print(f"The dimensionality of the feature matrix is {X_train.shape[1]}")

The shape of the feature matrix is (3009173, 515)
The dimensionality of the feature matrix is 515


### The dimensionality of the feature matrix is 515

## 5. Training a model
* Train a plain linear regression model with default parameters (feature matrix)
* Calculate the RMSE of the model on the training data

In [8]:
# Define the name of the target feature
target = 'duration'

# Extract the values of the target feature and assign them to y_train
y_train = df[target].values

# Instantiate Linear Regression with default parameters
lr = LinearRegression()

# Train the model on the training data
lr.fit(X_train, y_train)

# Predict the target variable on the same data the model was trained on
y_pred = lr.predict(X_train)

# Root Mean Squared Error (RMSE) on the training data.
# Taking the square root of the MSE explicitly avoids the `squared=False` argument,
# which scikit-learn deprecated in 1.4 and removed in 1.6.
rmse = mean_squared_error(y_train, y_pred) ** 0.5

# Print the RMSE of the model on the training data
print(f"The Root Mean Squared Error (RMSE) of the model on the training data {round(rmse, 2)}")

The Root Mean Squared Error (RMSE) of the model on the training data 7.65


### The Root Mean Squared Error (RMSE) of the model on the training data is 7.65

## 6. Evaluating the model

In [9]:
# Read one yellow taxi trips file and prepare it for modelling
def read_dataframe(filename):
    """Read a yellow taxi trips Parquet file and prepare it for modelling.

    Parameters
    ----------
    filename : str
        Path or URL of the file to read; must end with ``.parquet``.

    Returns
    -------
    pandas.DataFrame
        The trips whose duration is between 1 and 60 minutes (inclusive), with
        an added ``duration`` column in minutes and the ``PULocationID`` and
        ``DOLocationID`` columns cast to ``str``.

    Raises
    ------
    ValueError
        If ``filename`` does not end with ``.parquet``.
    """
    if not filename.endswith('.parquet'):
        raise ValueError("Unsupported file format. Only Parquet files are supported.")

    df = pd.read_parquet(filename)

    # Convert pickup and dropoff datetimes to pandas datetime objects
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])

    # Calculate trip duration in minutes
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

    # Keep trips lasting between 1 and 60 minutes (inclusive). The explicit copy
    # makes the filtered frame independent, so the column assignment below
    # does not raise pandas' SettingWithCopyWarning.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    # Convert categorical columns to strings
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [10]:
# January 2023 is used for training; February 2023 is used for validation
df_train = read_dataframe(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
)
df_val = read_dataframe(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'
)

# Row counts of the two frames, as (train, validation)
df_train.shape[0], df_val.shape[0]

(3009173, 2855951)

### Subsample the training and validation datasets to 70% of their rows to reduce memory consumption

In [11]:
# Keep a random 70% of each frame; the fixed seed makes the draw repeatable
sample_opts = {'frac': 0.7, 'random_state': 42}

# Training rows first, then validation rows, each drawn with the same settings
df_train = df_train.sample(**sample_opts)
df_val = df_val.sample(**sample_opts)

# Confirm the new sizes
print(f"Reduced df_train shape: {df_train.shape}, Reduced df_val shape: {df_val.shape}")

Reduced df_train shape: (2106421, 20), Reduced df_val shape: (1999166, 20)


### a. Consider only 'PULocationID' and 'DOLocationID' 

In [12]:
# Features for this experiment: pickup and drop-off location IDs as separate categories.
# Set here explicitly so the cell does not depend on whichever cell last assigned `categorical`.
categorical = ['PULocationID', 'DOLocationID']

# Instantiate a DictVectorizer object
dv = DictVectorizer()

# Convert the categorical columns in the training DataFrame to a list of dictionaries
train_dicts = df_train[categorical].to_dict(orient='records')

# Fit the DictVectorizer on the training data and encode it as a sparse matrix
X_train = dv.fit_transform(train_dicts)

# Convert the categorical columns in the validation DataFrame to a list of dictionaries
val_dicts = df_val[categorical].to_dict(orient='records')

# Encode the validation data with the already-fitted DictVectorizer (no re-fitting)
X_val = dv.transform(val_dicts)

# Define the target variable
target = 'duration'

# Extract the target variable from the training DataFrame and convert it to a NumPy array
y_train = df_train[target].values

# Extract the target variable from the validation DataFrame and convert it to a NumPy array
y_val = df_val[target].values

# Instantiate a Linear Regression model
lr = LinearRegression()

# Fit the Linear Regression model to the training data
lr.fit(X_train, y_train)

# Use the trained model to predict target values for the validation data
y_pred = lr.predict(X_val)

# Root Mean Squared Error (RMSE) on the validation data.
# Taking the square root of the MSE explicitly avoids the `squared=False` argument,
# which scikit-learn deprecated in 1.4 and removed in 1.6.
rmse = mean_squared_error(y_val, y_pred) ** 0.5

# Print the Root Mean Squared Error (RMSE) between the predicted and actual target values
print(f"The Root Mean Squared Error (RMSE) between the predicted and actual target values is {round(rmse, 2)}")

The Root Mean Squared Error (RMSE) between the predicted and actual target values is 7.82


### b. Concatenate 'PULocationID' and 'DOLocationID'

In [13]:
# Build a single pickup/drop-off pair feature, e.g. '161_141', in both frames.
# Both ID columns are already strings, so `+` concatenates them.
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

# The combined pair is the only categorical feature in this experiment
categorical = ['PU_DO']

# Instantiate a DictVectorizer object
dv = DictVectorizer()

# Convert the categorical columns in the training DataFrame to a list of dictionaries
train_dicts = df_train[categorical].to_dict(orient='records')

# Fit the DictVectorizer on the training data and encode it as a sparse matrix
X_train = dv.fit_transform(train_dicts)

# Convert the categorical columns in the validation DataFrame to a list of dictionaries
val_dicts = df_val[categorical].to_dict(orient='records')

# Encode the validation data with the already-fitted DictVectorizer (no re-fitting)
X_val = dv.transform(val_dicts)

# Define the target variable
target = 'duration'

# Extract the target variable from the training DataFrame and convert it to a NumPy array
y_train = df_train[target].values

# Extract the target variable from the validation DataFrame and convert it to a NumPy array
y_val = df_val[target].values

# Instantiate a Linear Regression model
lr = LinearRegression()

# Fit the Linear Regression model to the training data
lr.fit(X_train, y_train)

# Use the trained model to predict target values for the validation data
y_pred = lr.predict(X_val)

# Root Mean Squared Error (RMSE) on the validation data.
# Taking the square root of the MSE explicitly avoids the `squared=False` argument,
# which scikit-learn deprecated in 1.4 and removed in 1.6.
rmse = mean_squared_error(y_val, y_pred) ** 0.5

# Print the Root Mean Squared Error (RMSE) between the predicted and actual target values
print(f"The Root Mean Squared Error (RMSE) between the predicted and actual target values is {round(rmse, 2)}")

The Root Mean Squared Error (RMSE) between the predicted and actual target values is 5.22


### c. Add a numerical feature, 'trip_distance'

In [14]:
# Features for this experiment: the combined pickup/drop-off pair plus the trip distance.
# `categorical` is set explicitly so the cell does not depend on whichever cell last assigned it;
# the 'PU_DO' column itself must already exist in df_train and df_val.
categorical = ['PU_DO']
numerical = ['trip_distance']

# Instantiate a DictVectorizer object
dv = DictVectorizer()

# Convert the columns in the training DataFrame to a list of dictionaries
train_dicts = df_train[categorical + numerical].to_dict(orient='records')

# Fit the DictVectorizer on the training data and encode it as a sparse matrix
X_train = dv.fit_transform(train_dicts)

# Convert the columns in the validation DataFrame to a list of dictionaries
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# Encode the validation data with the already-fitted DictVectorizer (no re-fitting)
X_val = dv.transform(val_dicts)

# Define the target variable
target = 'duration'

# Extract the target variable from the training DataFrame and convert it to a NumPy array
y_train = df_train[target].values

# Extract the target variable from the validation DataFrame and convert it to a NumPy array
y_val = df_val[target].values

# Instantiate a Linear Regression model
lr = LinearRegression()

# Fit the Linear Regression model to the training data
lr.fit(X_train, y_train)

# Use the trained model to predict target values for the validation data
y_pred = lr.predict(X_val)

# Root Mean Squared Error (RMSE) on the validation data.
# Taking the square root of the MSE explicitly avoids the `squared=False` argument,
# which scikit-learn deprecated in 1.4 and removed in 1.6.
rmse = mean_squared_error(y_val, y_pred) ** 0.5

# Print the Root Mean Squared Error (RMSE) between the predicted and actual target values
print(f"The Root Mean Squared Error (RMSE) between the predicted and actual target values is {round(rmse, 2)}")

The Root Mean Squared Error (RMSE) between the predicted and actual target values is 5.28


In [15]:
# Fit an L1-regularised linear model (regularisation strength alpha = 0.01)
# on the current training matrix
lr = Lasso(alpha=0.01)
lr.fit(X_train, y_train)

# Predict trip durations for the validation data
y_pred = lr.predict(X_val)

# Validation RMSE, shown as the cell output.
# Taking the square root of the MSE explicitly avoids the `squared=False` argument,
# which scikit-learn deprecated in 1.4 and removed in 1.6.
mean_squared_error(y_val, y_pred) ** 0.5

9.400827678471593