In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [2]:
# read data
train = pd.read_parquet("data/yellow_tripdata_2023-01.parquet")
valid = pd.read_parquet("data/yellow_tripdata_2023-02.parquet")

# number of columns in January 2023
print(
    f"Number of columns in January 2023 Yellow Taxi trip records = {len(train.columns)}"
)

Number of columns in January 2023 Yellow Taxi trip records = 19


In [3]:
# compute trip duration
train["duration"] = train["tpep_dropoff_datetime"] - train["tpep_pickup_datetime"]
train["duration"] = train["duration"].apply(lambda td: td.total_seconds() / 60)

# standard deviation of the trips duration in January 2023
duration_std = round(train["duration"].std(), 2)
print(
    f"The standard deviation of the trip duration (minutes) in January 2023 = {duration_std}"
)

The standard deviation of the trip duration (minutes) in January 2023 = 42.59


In [4]:
# remove outliers and keep only the records where the duration was between 1 and 60 minutes (inclusive)
train_wo_outliers = train[(train["duration"] >= 1) & (train["duration"] <= 60)]

# fraction of the records left after dropping the outliers
fraction = round(len(train_wo_outliers) * 100 / len(train))
print(f"The fraction of the records left after dropping the outliers = {fraction}%")

The fraction of the records left after dropping the outliers = 98%


In [5]:
# apply one-hot encoding to the pickup and dropoff location IDs
categorical = ["PULocationID", "DOLocationID"]
train_wo_outliers[categorical] = train_wo_outliers[categorical].astype(str)
train_dicts = train_wo_outliers[categorical].to_dict(orient="records")
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# number of columns in the feature matrix
print(f"Number of columns in the feature matrix = {X_train.shape[1]}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_wo_outliers[categorical] = train_wo_outliers[categorical].astype(str)


Number of columns in the feature matrix = 515


In [6]:
# train a linear regression model with default parameters
target = "duration"
y_train = train_wo_outliers[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

# calculate the RMSE of the model on the training data
train_rmse = round(root_mean_squared_error(y_train, lr.predict(X_train)), 2)
print(f"RMSE on train data = {train_rmse}")

RMSE on train data = 7.65


In [7]:
# apply model to validation data
valid["duration"] = valid["tpep_dropoff_datetime"] - valid["tpep_pickup_datetime"]
valid["duration"] = valid["duration"].apply(lambda td: td.total_seconds() / 60)
valid = valid[(valid["duration"] >= 1) & (valid["duration"] <= 60)]
valid[categorical] = valid[categorical].astype(str)
valid_dicts = valid[categorical].to_dict(orient="records")
X_valid = dv.transform(valid_dicts)
y_valid = valid[target].values
print(f"Number of columns in the feature matrix = {X_valid.shape[1]}")

# calculate the RMSE of the model on the validation data
valid_rmse = round(root_mean_squared_error(y_valid, lr.predict(X_valid)), 2)
print(f"RMSE on validation data = {valid_rmse}")

Number of columns in the feature matrix = 515
RMSE on validation data = 7.81
