In [1]:
# data manipulation and storage
import pandas as pd

# plotting and graphs
import seaborn as sns
import matplotlib.pyplot as plt

# data preprocessing
# from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# models
# from sklearn.linear_model import LinearRegression
# from sklearn.linear_model import Lasso
# from sklearn.linear_model import Ridge
# import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor

# model performance metrics
from sklearn.metrics import mean_squared_error

# saving model to file
import pickle

# mlflow for experiment tracking
import mlflow

# hyper-parameter optimization
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# sklearn pipeline creation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# misc utilities
import copy

# intel sklearn optimization library
from sklearnex import patch_sklearn
patch_sklearn()

def cleaned_train_and_target(df,clean=True):
    
    # create concatenated categorical feature
    df['PU_DO_pair'] = df['PULocationID'].astype(str) + '_' + df['DOLocationID'].astype(str)                
    # create target feature
    df['duration'] = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds()/60)

    if clean == True:
        # filter out rows based on various conditions
        df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]
        df = df[(df['trip_distance'] > 1)&(df['trip_distance'] < 25)]
        df = df[(df['total_amount'] > 1)&(df['total_amount'] < 150)]
        df = df[df['passenger_count'] > 0]  
 
    y = df['duration']
    X = df[['PU_DO_pair','trip_distance','fare_amount']]
    return X,y

numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])
categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

categorical = ['PU_DO_pair']
numerical = ['trip_distance','fare_amount']
target = ['duration']

preprocessor_pipeline = ColumnTransformer(transformers=[
    ('numeric', numeric_pipeline, numerical),
    ('categoric', categorical_pipeline, categorical)
])

gradientboost_regressor = GradientBoostingRegressor()
# linear_regressor = 


regression_model = Pipeline(steps=[
    ('preprocess', preprocessor_pipeline),
    ('model', gradientboost_regressor)
])


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
train_path = 'data/green_tripdata_2021-01.parquet'
val_path = 'data/green_tripdata_2021-01.parquet'
df_train = pd.read_parquet(train_path)
df_val = pd.read_parquet(val_path)
X_train,y_train = cleaned_train_and_target(df_train,clean=True)
X_val, y_val = cleaned_train_and_target(df_val,clean=True)

In [3]:
regression_model.fit(X_train,y_train)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scale',
                                                                   StandardScaler())]),
                                                  ['trip_distance',
                                                   'fare_amount']),
                                                 ('categoric',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('one-hot',
                                                                   OneHotEncoder(handle_unknown='ignore',
             

In [4]:
y_pred_linear = regression_model.predict(X_val)
mean_squared_error(y_val,y_pred_linear,squared=False)


58.841718290811556