In [1]:
from pathlib import Path

import pandas as pd
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Location of the feature-engineered dataset, relative to the notebook.
path = "./data/feature_engineered_dataset.csv"

# Read the whole CSV; no column is used as the index (index_col defaults to None).
df = pd.read_csv(path)

In [3]:
# Report the dataset dimensions with thousands separators.
n_rows, n_cols = df.shape
print(f'The dataset has {n_rows:,} rows and {n_cols:,} cols')

The dataset has 5,644,012 rows and 15 cols


Train Test Split

In [4]:
# Time-based split: pickups strictly before 16 Mar 2016 form the train set,
# pickups on or after that date form the test set.
cutoff_date = pd.Timestamp("2016-03-16")

# Parse the pickup timestamp so it can be compared against the cutoff.
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
pickup_time = df['tpep_pickup_datetime']

train = df.loc[pickup_time < cutoff_date]
test = df.loc[pickup_time >= cutoff_date]

In [8]:
# Row counts per split -- this works out to roughly an 80/20 train/test ratio.
n_train_rows = len(train)
n_test_rows = len(test)
print(f'The train set has {n_train_rows:,} rows')
print(f'The test set has {n_test_rows:,} rows')

The train set has 4,643,397 rows
The test set has 1,000,615 rows


In [9]:
# Summarize the train split: index range, column dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4643397 entries, 0 to 5643320
Data columns (total 15 columns):
 #   Column                Dtype         
---  ------                -----         
 0   pickup_longitude      float64       
 1   pickup_latitude       float64       
 2   tpep_pickup_datetime  datetime64[ns]
 3   current_pickup_count  int64         
 4   is_weekend            int64         
 5   day_of_week           int64         
 6   hour_of_day           int64         
 7   temp                  float64       
 8   prcp                  float64       
 9   pickup_geohash        object        
 10  target_pickup_count   int64         
 11  pickup_count_1h_ago   int64         
 12  pickup_count_2h_ago   int64         
 13  month                 int64         
 14  day_of_month          int64         
dtypes: datetime64[ns](1), float64(4), int64(9), object(1)
memory usage: 566.8+ MB


In [10]:
# Columns removed from the feature matrix: the raw pickup timestamp, the
# target itself, and the geohash string.
target_col = "target_pickup_count"
excluded_cols = ["tpep_pickup_datetime", target_col, "pickup_geohash"]

# Features and target for the train split
x_train = train.drop(columns=excluded_cols)
y_train = train[target_col]

# Features and target for the test split
x_test = test.drop(columns=excluded_cols)
y_test = test[target_col]

In [11]:
# Sanity-check the shapes of the feature matrices and target vectors.
for label, data in (
    ("x_train:", x_train),
    ("y_train", y_train),
    ("x_test:", x_test),
    ("y_test", y_test),
):
    print(label, data.shape)

x_train: (4643397, 12)
y_train (4643397,)
x_test: (1000615, 12)
y_test (1000615,)


Preprocess Data

In [12]:
# Calendar features are treated as categories and one-hot encoded.
# handle_unknown='ignore' keeps transform from raising on categories that
# were not seen during fit.
categorical_features = ['day_of_week', 'hour_of_day', 'month', 'day_of_month']
categorical_preprocessor = Pipeline(
    steps=[('ohe', OneHotEncoder(handle_unknown='ignore'))]
)

# Encode the categorical columns; all remaining columns pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_preprocessor, categorical_features),
    ],
    remainder='passthrough',
)

Define Preprocessing and Training Pipeline

In [13]:
# Chain the column preprocessing and the linear regressor into one estimator,
# so a single fit/predict call runs both stages.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('linear_model', LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

Train model

In [14]:
# Fit the preprocessor and the linear model on the train split.
pipeline.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



Make Prediction

In [22]:
# Predict the target for the test split with the fitted pipeline.
y_pred_test = pipeline.predict(x_test)

Compute performance metrics

In [23]:
# Error metrics on the test split
test_rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

print("Test RMSE:", test_rmse)
print("Test MAE:", test_mae)

Test RMSE: 4.992162472990026
Test MAE: 2.669505646554662


In [24]:
# Error metrics on the train split, computed from fresh predictions on x_train
y_pred_train = pipeline.predict(x_train)

train_rmse = root_mean_squared_error(y_true=y_train, y_pred=y_pred_train)
train_mae = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)

print("Train RMSE:", train_rmse)
print("Train MAE:", train_mae)

Train RMSE: 5.002948693209742
Train MAE: 2.660311429083304


In [17]:
# Persist the fitted pipeline (preprocessing + linear model) with joblib.
filepath = './models/linear_reg_model.joblib'

# Create the target directory first: dump() opens the file for writing and
# fails with FileNotFoundError if ./models does not exist yet.
Path(filepath).parent.mkdir(parents=True, exist_ok=True)

dump(pipeline, filepath)

['./models/linear_reg_model.joblib']

In [18]:
# Side-by-side table of actual vs. predicted pickup counts on the test split.
# y_pred_test holds the predictions from pipeline.predict(x_test); y_test.values
# drops the original index so both columns align positionally.
compare_results = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred_test})
compare_results

Unnamed: 0,Actual,Predicted
0,2,16.249292
1,2,4.031761
2,1,1.889174
3,2,0.586309
4,5,1.605217
...,...,...
1000610,2,3.349718
1000611,4,4.193982
1000612,2,4.637264
1000613,2,3.539697
