In [1]:
# Attach Google Drive to the Colab filesystem so the project files under
# /content/drive can be read by the cells below.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from geopy.distance import geodesic

In [3]:
# Project root on the mounted Drive; the training CSV is read from its data/ folder.
base = "/content/drive/MyDrive/nyc-taxi-trip-duration"
train_csv = f"{base}/data/train_clean.csv"
df = pd.read_csv(train_csv)

In [4]:
# feature engineering
# Parse the pickup timestamp and derive calendar features from it in a single
# assign() chain. Each lambda receives the frame with the columns assigned
# before it, so the derived features read the already-parsed timestamp.
df = df.assign(
    pickup_datetime=lambda d: pd.to_datetime(d['pickup_datetime']),
    pickup_hour=lambda d: d['pickup_datetime'].dt.hour,
    pickup_dayofweek=lambda d: d['pickup_datetime'].dt.dayofweek,
    pickup_month=lambda d: d['pickup_datetime'].dt.month,
    # dayofweek counts Monday=0 ... Sunday=6, so 5 and 6 flag Saturday/Sunday.
    pickup_is_weekend=lambda d: d['pickup_dayofweek'].isin([5, 6]).astype(int),
)

In [5]:
# Distance (in km)
# Geodesic pickup -> dropoff distance for every trip, via geopy.
# The four coordinate columns are walked in lockstep with zip() instead of
# DataFrame.apply(axis=1): the row-wise apply materialises a full row Series
# per trip just to read four scalars, which is pure overhead on a frame of
# this size. The points passed to geodesic() -- and therefore the resulting
# distances -- are exactly the same as before.
df['distance_km'] = [
    geodesic((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon)).km
    for pickup_lat, pickup_lon, dropoff_lat, dropoff_lon in zip(
        df['pickup_latitude'],
        df['pickup_longitude'],
        df['dropoff_latitude'],
        df['dropoff_longitude'],
    )
]

✅ Feature engineering complete.


In [6]:
# Base Model
# Baseline OLS: trip_duration regressed on the vendor id and the raw
# pickup/dropoff coordinates only.
features = [
    'vendor_id',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]
y = df['trip_duration']
# add_constant supplies the intercept column for the design matrix.
X1 = sm.add_constant(df.loc[:, features])
model1 = sm.OLS(endog=y, exog=X1).fit()
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:          trip_duration   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     621.9
Date:                Fri, 17 Oct 2025   Prob (F-statistic):               0.00
Time:                        20:40:06   Log-Likelihood:            -1.4559e+07
No. Observations:             1458644   AIC:                         2.912e+07
Df Residuals:                 1458638   BIC:                         2.912e+07
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              3.647e+05   8131.37

In [7]:
# Improved Model
# Extend the baseline predictors with the trip distance and the pickup-time
# features, then refit against the same target `y`.
features2 = [
    *features,
    'distance_km',
    'pickup_hour',
    'pickup_month',
    'pickup_is_weekend',
]
X2 = sm.add_constant(df.loc[:, features2])
model2 = sm.OLS(endog=y, exog=X2).fit()
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:          trip_duration   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     1564.
Date:                Fri, 17 Oct 2025   Prob (F-statistic):               0.00
Time:                        20:40:33   Log-Likelihood:            -1.4554e+07
No. Observations:             1458644   AIC:                         2.911e+07
Df Residuals:                 1458634   BIC:                         2.911e+07
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              1.016e+05   8493.18

In [8]:
# Side-by-side R² of the baseline and the extended model, to four decimals.
r2_report = f"Model 1 R²: {model1.rsquared:.4f} | Model 2 R²: {model2.rsquared:.4f}"
print(r2_report)

Model 1 R²: 0.0021 | Model 2 R²: 0.0096
