In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [2]:
# Q1 Load the 2023-01 yellow taxi trip data from the local parquet file
dfh = pd.read_parquet('../data/yellow_tripdata_2023-01.parquet')
# Fixed seed so the preview shows the same rows on every Restart & Run All.
dfh.sample(5, random_state=42)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
305574,2,2023-01-04 21:51:56,2023-01-04 22:03:44,1.0,2.59,1.0,N,50,137,1,14.9,1.0,0.5,1.0,0.0,1.0,20.9,2.5,0.0
2639236,1,2023-01-28 08:16:56,2023-01-28 08:20:52,2.0,1.3,1.0,N,229,141,1,7.2,2.5,0.5,2.2,0.0,1.0,13.4,2.5,0.0
1571732,2,2023-01-17 20:04:32,2023-01-17 20:15:25,2.0,1.31,1.0,N,162,48,1,11.4,2.5,0.5,1.0,0.0,1.0,18.9,2.5,0.0
1800656,2,2023-01-20 00:29:59,2023-01-20 00:32:27,1.0,0.47,1.0,N,164,161,4,-5.1,-1.0,-0.5,0.0,0.0,-1.0,-10.1,-2.5,0.0
1711928,2,2023-01-19 10:38:47,2023-01-19 10:54:25,1.0,1.45,1.0,N,249,137,2,14.2,0.0,0.5,0.0,0.0,1.0,18.2,2.5,0.0


In [3]:
# The second entry of `shape` is the number of columns in the loaded frame.
n_columns = dfh.shape[1]
print(f">> Answer 1: Our data file contains {n_columns} columns")

>> Answer 1: Our data file contains 19 columns


In [4]:
# Q2 Computing Duration, std with outliers
# Trip length as a timedelta, converted to minutes.
trip_length = dfh.tpep_dropoff_datetime - dfh.tpep_pickup_datetime
dfh['duration'] = trip_length.dt.total_seconds() / 60

# One describe() call serves both the record count (kept for the outlier
# comparison later on) and the summary table shown as the cell output.
duration_stats = dfh.duration.describe([.9, .92, .95, .98])
count_with_outliers = duration_stats["count"]
duration_stats.round(2)

count    3066766.00
mean          15.67
std           42.59
min          -29.20
50%           11.52
90%           27.94
92%           30.55
95%           36.47
98%           48.73
max        10029.18
Name: duration, dtype: float64

In [5]:
# Compute the value outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
# Series.std() is the same sample standard deviation that describe() reports.
duration_std = dfh.duration.std()
print(f">> Answer 2: The duration's standard deviation is {round(duration_std, 2)} minutes")

>> Answer 2: The duration's standard deviation is 42.59 minutes


In [6]:
# Q3 Dropping Outliers
def duration_feature(df: pd.DataFrame, min_minutes: float = 1, max_minutes: float = 60) -> pd.DataFrame:
    """Add a trip ``duration`` column (in minutes) and drop outlier trips.

    The duration is ``tpep_dropoff_datetime - tpep_pickup_datetime`` expressed
    in minutes. Only rows whose duration lies in ``[min_minutes, max_minutes]``
    (both bounds inclusive) are kept.

    Unlike a plain column assignment, the input frame is left untouched: the
    result is a new DataFrame, so re-running a cell that calls this function
    does not depend on (or alter) the state of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``
        datetime columns.
    min_minutes, max_minutes : float
        Inclusive bounds of the durations to keep (defaults: 1 and 60).

    Returns
    -------
    pd.DataFrame
        New frame containing the kept rows (original index preserved) with a
        float ``duration`` column.
    """
    duration = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
    # `between` is inclusive on both ends by default, matching `>= min` & `<= max`.
    in_range = duration.between(min_minutes, max_minutes)
    # Filter first, then attach the column to the new frame, so the caller's
    # DataFrame is never mutated and no SettingWithCopyWarning can arise.
    return df.loc[in_range].assign(duration=duration[in_range])

# Keep only the trips that pass the duration filter, then summarise them.
dfh = duration_feature(dfh)
dfh.duration.describe(percentiles=[.9, .92, .95, .98]).round(2)

count    3009173.00
mean          14.20
std            9.94
min            1.00
50%           11.55
90%           27.27
92%           29.65
95%           34.77
98%           44.50
max           60.00
Name: duration, dtype: float64

In [7]:
# Share of records kept, as a whole percentage. Formatting with one decimal
# avoids float artifacts (e.g. 56.99999999999999) that `round(x, 2) * 100`
# can otherwise print.
share_left = round(len(dfh) / count_with_outliers, 2) * 100
print(f">> After removing the outliers, {share_left:.1f}% of the inital records are left")

>> After removing the outliers, 98.0% of the inital records are left


In [8]:
# Q4 OneHot Encoding, straightforwardly used pd.get_dummies instead of vectorizer
def one_hot(df: pd.DataFrame, columns=("PULocationID", "DOLocationID")) -> pd.DataFrame:
    """One-hot encode the given categorical columns of ``df``.

    Each column is encoded separately with ``pd.get_dummies`` and the results
    are concatenated side by side, in the order of ``columns``. Every dummy
    column is prefixed with the name of its source column (for example
    ``PULocationID_50``): without the prefix, a location ID that occurs both as
    pickup and as dropoff would yield two columns with the same label.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns to encode.
    columns : sequence of str
        Names of the columns to encode (defaults to the pickup and dropoff
        location IDs).

    Returns
    -------
    pd.DataFrame
        Frame with one indicator column per distinct value of each encoded
        column; the number of columns is the sum of the distinct value counts.
    """
    encoded = [pd.get_dummies(df[col], prefix=col) for col in columns]
    return pd.concat(encoded, axis=1)

# Build the one-hot feature matrix from the pickup/dropoff location IDs in dfh.
features = one_hot(dfh)

In [9]:
# Number of one-hot columns = width of the feature matrix.
n_features = features.shape[1]
print(f">> The dimensonality of the feature matrix is {n_features}")

>> The dimensonality of the feature matrix is 515


In [10]:
# Q5 Model Training, (plus again OneHot Encoding, but with sklearn)
# Location IDs are cast to strings and one-hot encoded; categories unseen at
# fit time are ignored (encoded as all zeros) when transforming later data.
categorical = ['PULocationID', 'DOLocationID']
enc = OneHotEncoder(handle_unknown='ignore')
X_train = enc.fit_transform(dfh[categorical].astype('str'))
y_train = dfh['duration'].values

model = LinearRegression()
model.fit(X_train, y_train)

# Score the model on the same data it was fitted on.
y_pred = model.predict(X_train)
train_rmse = root_mean_squared_error(y_train, y_pred)
print(f">> RMSE on the training data is {round(train_rmse, 2)} minutes")

>> RMSE on the training data is 7.65 minutes


In [11]:
# Q6 Validation Set Evaluation
# The February data goes through the same duration computation and outlier
# filter as the training data.
dfval = duration_feature(pd.read_parquet('../data/yellow_tripdata_2023-02.parquet'))

# Reuse the encoder fitted on the training data (transform only, no refit).
X_val = enc.transform(dfval[['PULocationID', 'DOLocationID']].astype('str'))
y_val = model.predict(X_val)  # predictions for the validation set, not the ground truth

# Compute the score outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
val_rmse = root_mean_squared_error(dfval['duration'].values, y_val)
print(f">> RMSE for validation data is {round(val_rmse, 2)} minutes")

>> RMSE for validation data is 7.81 minutes
