In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Download the data for January and February 2023
url_january = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
url_february = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

In [3]:
df_january = pd.read_parquet(url_january)
df_february = pd.read_parquet(url_february)

In [4]:
# Question 1: How many columns does the January dataset have?
num_columns_january = df_january.shape[1]
print("The number of columns in the January DataFrame is:", num_columns_january)

The number of columns in the January DataFrame is: 19


In [5]:
# Question 2: Compute the duration variable in minutes
df_january['tpep_pickup_datetime'] = pd.to_datetime(df_january['tpep_pickup_datetime'])
df_january['tpep_dropoff_datetime'] = pd.to_datetime(df_january['tpep_dropoff_datetime'])
df_january['duration'] = (df_january['tpep_dropoff_datetime'] - df_january['tpep_pickup_datetime']).dt.total_seconds() / 60
std_typical_deviation = df_january['duration'].std()
print("The standard deviation of January travel times is:", std_typical_deviation, "minutes.")

The standard deviation of January travel times is: 42.594351241920904 minutes.


In [6]:
# Question 3: Drop outliers
df_filtered = df_january[(df_january['duration'] >= 1) & (df_january['duration'] <= 60)]
fraction_records = len(df_filtered) / len(df_january)
print("The fraction of records after removing outliers is:", fraction_records)

The fraction of records after removing outliers is: 0.9812202822125979


In [7]:
# Question 4: Apply one-hot encoding
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
df_filtered[categorical] = df_filtered[categorical].astype(str)
features = categorical + numerical
train_dicts = df_filtered[features].to_dict(orient='records')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[categorical] = df_filtered[categorical].astype(str)


In [8]:
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
num_columns = len(dv.feature_names_)
print("The number of columns in the matrix is:", num_columns)

The number of columns in the matrix is: 516


In [9]:
# Calculate the standard deviation of trip lengths in January
std_typical_deviation = df['duration'].std()
print("The standard deviation of January travel times is:", std_typical_deviation, "minutes.")

The standard deviation of January travel times is: 42.594351241920904 minutes.


In [9]:
# Question 5: Train the model
y_train = df_filtered['duration'].values

In [10]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred, squared=False)
print("The RMSE on train is:", rmse_train)

The RMSE on train is: 7.64913436187382


In [12]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

In [11]:
# Evaluate the model on the validation dataset (February 2023)
df_february['tpep_pickup_datetime'] = pd.to_datetime(df_february['tpep_pickup_datetime'])
df_february['tpep_dropoff_datetime'] = pd.to_datetime(df_february['tpep_dropoff_datetime'])
df_february['duration'] = (df_february['tpep_dropoff_datetime'] - df_february['tpep_pickup_datetime']).dt.total_seconds() / 60
df_february_filtered = df_february[(df_february['duration'] >= 1) & (df_february['duration'] <= 60)]
df_february_filtered[categorical] = df_february_filtered[categorical].astype(str)
val_dicts = df_february_filtered[features].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = df_february_filtered['duration'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_february_filtered[categorical] = df_february_filtered[categorical].astype(str)


In [12]:
y_val_pred = lr.predict(X_val)
rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
print("The RMSE on validation is:", rmse_val)

The RMSE on validation is: 7.81142176097977
