In [67]:
from pathlib import Path

import pandas as pd
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [34]:
# Location of the feature-engineered dataset, relative to the notebook.
path = "./data/feature_engineered_dataset.csv"
# No index column is designated, so pandas assigns a default integer index.
df = pd.read_csv(filepath_or_buffer=path)

In [35]:
# Report the row and column counts with thousands separators.
print('The dataset has {:,} rows and {:,} cols'.format(*df.shape))

The dataset has 5,707,647 rows and 13 cols


Train/Test Split

In [44]:
# Temporal split: every pickup before 16 Mar 2016 goes to train, the rest to test.
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
cutoff_date = pd.Timestamp("2016-03-16")
is_train = df['tpep_pickup_datetime'] < cutoff_date
is_test = df['tpep_pickup_datetime'] >= cutoff_date
train = df[is_train]
test = df[is_test]

# A missing timestamp (NaT) compares False on both sides of the cutoff, so such
# rows would silently vanish from both sets. Fail loudly if that ever happens.
assert len(train) + len(test) == len(df), (
    "train/test split lost rows: check for missing tpep_pickup_datetime values"
)

In [45]:
# The date cutoff yields roughly an 80/20 train/test split.
for split_name, split_frame in (("train", train), ("test", test)):
    print(f'The {split_name} set has {len(split_frame):,} rows')

The train set has 4,693,641 rows
The test set has 1,014,006 rows


In [46]:
# Summarise the training frame: index range, column dtypes and memory footprint.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4693641 entries, 0 to 5705548
Data columns (total 13 columns):
 #   Column                Dtype         
---  ------                -----         
 0   pickup_longitude      float64       
 1   pickup_latitude       float64       
 2   tpep_pickup_datetime  datetime64[ns]
 3   current_pickup_count  int64         
 4   is_weekend            int64         
 5   day_of_week           int64         
 6   hour_of_day           int64         
 7   temp                  float64       
 8   prcp                  float64       
 9   pickup_geohash        object        
 10  target_pickup_count   int64         
 11  month                 int64         
 12  day_of_month          int64         
dtypes: datetime64[ns](1), float64(4), int64(7), object(1)
memory usage: 501.3+ MB


In [49]:
# Columns excluded from the feature matrix: the raw timestamp, the label
# itself and the geohash string.
non_feature_cols = ["tpep_pickup_datetime", "target_pickup_count", "pickup_geohash"]
target_col = "target_pickup_count"

# Features and label for the training period
x_train, y_train = train.drop(columns=non_feature_cols), train[target_col]

# Features and label for the test period
x_test, y_test = test.drop(columns=non_feature_cols), test[target_col]

In [50]:
# Sanity-check that features and labels line up row-for-row in each split.
shape_report = (
    ("x_train:", x_train),
    ("y_train", y_train),
    ("x_test:", x_test),
    ("y_test", y_test),
)
for label, data in shape_report:
    print(label, data.shape)

x_train: (4693641, 10)
y_train (4693641,)
x_test: (1014006, 10)
y_test (1014006,)


Preprocess Data

In [55]:
# The calendar features are integer-coded categories, so they are one-hot
# encoded; with handle_unknown='ignore' a category not seen during fit is
# encoded as all zeros instead of raising an error.
categorical_features = ['day_of_week', 'hour_of_day', 'month', 'day_of_month']
one_hot = OneHotEncoder(handle_unknown='ignore')
categorical_preprocessor = Pipeline(steps=[('ohe', one_hot)])

# Every column not listed above is forwarded to the model unchanged.
preprocessor = ColumnTransformer(
    transformers=[('categorical', categorical_preprocessor, categorical_features)],
    remainder='passthrough',
)

Define Preprocessing and Training Pipeline

In [56]:
# Chain the column preprocessing with a linear regression so both are fitted,
# applied and saved as one estimator.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('linear_model', LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

Train Model

In [57]:
# Fit the encoder and the regression on the training period only.
pipeline.fit(X=x_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



Make Predictions

In [68]:
# Predict the target pickup count for every row of the held-out test period.
y_pred = pipeline.predict(X=x_test)

Compute Performance Metrics

In [69]:
# Score the test-set predictions with both error metrics, then report them.
rmse, mae = (
    metric(y_test, y_pred)
    for metric in (root_mean_squared_error, mean_absolute_error)
)
for label, score in (("RMSE:", rmse), ("MAE:", mae)):
    print(label, score)

RMSE: 5.024340156180973
MAE: 2.689181185804495


In [64]:
# Persist the fitted pipeline (preprocessing + model) to disk.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
filepath = './models/linear_reg_model.joblib'
# dump() does not create missing folders, so make sure the target directory
# exists; otherwise a fresh checkout fails here with FileNotFoundError.
Path(filepath).parent.mkdir(parents=True, exist_ok=True)
dump(pipeline, filepath)

['./models/linear_reg_model.joblib']

In [66]:
# Side-by-side view of the true counts and the model's predictions for the
# test rows (positional index, not the original frame's index).
compare_results = pd.DataFrame(
    data={
        "Actual": y_test.to_numpy(),
        "Predicted": y_pred,
    }
)
compare_results

Unnamed: 0,Actual,Predicted
0,2,15.698125
1,2,1.712766
2,1,1.666363
3,2,0.747887
4,5,1.966087
...,...,...
1014001,1,1.712219
1014002,1,1.917003
1014003,1,2.574279
1014004,2,2.506307
