# imports

In [1]:
import numpy as np
import pandas as pd
import warnings



from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

warnings.simplefilter('ignore')

# Config

In [2]:
# Root folder of the project data; every path below is derived from it.
BASE_PATH = 'E:/Mohsen/Rahnama College/'

# Input tables.
LABELED_DATA_PATH = f'{BASE_PATH}data/Features/label.parquet'
FEATURE_DATAFRAME_PATH = f'{BASE_PATH}data/Features/features.parquet'

# Date boundaries (ISO strings, compared lexicographically against the `date` column).
START_DATE = '2023-01-09'
TEST_DATE = '2023-04-01'
LAST_DATE = '2023-05-01'

# Columns fed to the models; 'week_of_month' is currently switched off.
FEATURE_LIST = [
    # 'week_of_month',
    'PU_day_of_week',
    'last_day_demand',
    'last_week_demand',
    'lag1-8',
    'lag2-9',
    'lag3-10',
    'lag4-11',
    'arima',
]
TARGET = 'label'
VALIDATION_SPLIT_RATIO = 0.2

# Per-tier result files, one per model and demand group.
LR_OUTPUT_PATH_HIGH = f'{BASE_PATH}data/output/lr_model_High_daily_result.parquet'
LR_OUTPUT_PATH_MID = f'{BASE_PATH}data/output/lr_model_Mid_daily_result.parquet'
LR_OUTPUT_PATH_LOW = f'{BASE_PATH}data/output/lr_model_low_daily_result.parquet'
XGB_OUTPUT_PATH_HIGH = f'{BASE_PATH}data/output/xgboost_model_High_daily_result.parquet'
XGB_OUTPUT_PATH_MID = f'{BASE_PATH}data/output/xgboost_model_Mid_daily_result.parquet'
XGB_OUTPUT_PATH_LOW = f'{BASE_PATH}data/output/xgboost_model_low_daily_result.parquet'

# Merged result files (all tiers concatenated).
LR_OUTPUT_PATH = f'{BASE_PATH}data/output/lr_model_Split_daily_result.parquet'
XGB_OUTPUT_PATH = f'{BASE_PATH}data/output/xgboost_Split_model_daily_results.parquet'

# Load Data

In [3]:
def load_labeled_data(path):
    """Read the labelled demand table stored as a parquet file at `path`.

    Returns the file contents as a pandas DataFrame.
    """
    labeled = pd.read_parquet(path)
    return labeled


# Load the labels, report the table size and preview the first rows.
label_df = load_labeled_data(path=LABELED_DATA_PATH)
print(label_df.shape)
label_df.head()

(31964, 4)


Unnamed: 0,date,PULocationID,count,label
96,2023-01-02,1,32.0,
358,2023-01-03,1,28.0,
620,2023-01-04,1,8.0,
882,2023-01-05,1,16.0,
1144,2023-01-06,1,12.0,


## Load features (including calendar features)

In [4]:
def load_features(path):
    """Read the feature table stored as a parquet file at `path`.

    Returns the file contents as a pandas DataFrame.
    """
    features = pd.read_parquet(path)
    return features


# Load the features, report the table size and preview the first rows.
feature_df = load_features(path=FEATURE_DATAFRAME_PATH)
print(feature_df.shape)
feature_df.head()

(28296, 12)


Unnamed: 0,date,PULocationID,arima,PU_day_of_month,week_of_month,PU_day_of_week,last_day_demand,last_week_demand,lag1-8,lag2-9,lag3-10,lag4-11
2978,2023-01-13,1,0.916667,13,2,4,9.0,12.0,0.5625,1.25,0.178571,0.46875
3240,2023-01-14,1,2.142857,14,2,5,22.0,7.0,1.833333,0.5625,1.25,0.178571
3502,2023-01-15,1,0.846154,15,3,6,8.0,13.0,1.142857,1.833333,0.5625,1.25
3764,2023-01-16,1,1.0,16,3,0,20.0,15.0,1.538462,1.142857,1.833333,0.5625
4026,2023-01-17,1,3.0,17,3,1,22.0,5.0,1.466667,1.538462,1.142857,1.833333


### merge features and label

In [5]:
# Cast the `date` key to str in both frames before joining on it.
for _frame in (label_df, feature_df):
    _frame['date'] = _frame['date'].astype(str)

# Default (inner) join on date and pickup location.
rides_df = label_df.merge(feature_df, on=['date', 'PULocationID'])
rides_df

Unnamed: 0,date,PULocationID,count,label,arima,PU_day_of_month,week_of_month,PU_day_of_week,last_day_demand,last_week_demand,lag1-8,lag2-9,lag3-10,lag4-11
0,2023-01-13,1,22.0,1.833333,0.916667,13,2,4,9.0,12.0,0.562500,1.250000,0.178571,0.468750
1,2023-01-14,1,8.0,1.142857,2.142857,14,2,5,22.0,7.0,1.833333,0.562500,1.250000,0.178571
2,2023-01-15,1,20.0,1.538462,0.846154,15,3,6,8.0,13.0,1.142857,1.833333,0.562500,1.250000
3,2023-01-16,1,22.0,1.466667,1.000000,16,3,0,20.0,15.0,1.538462,1.142857,1.833333,0.562500
4,2023-01-17,1,19.0,3.800000,3.000000,17,3,1,22.0,5.0,1.466667,1.538462,1.142857,1.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28291,2023-04-26,265,79.0,1.837209,1.302326,26,4,2,57.0,43.0,1.140000,0.965517,0.857143,0.661972
28292,2023-04-27,265,45.0,0.703125,1.218750,27,4,3,79.0,64.0,1.837209,1.140000,0.965517,0.857143
28293,2023-04-28,265,56.0,1.037037,0.814815,28,4,4,45.0,54.0,0.703125,1.837209,1.140000,0.965517
28294,2023-04-29,265,46.0,0.978723,1.170213,29,5,5,56.0,47.0,1.037037,0.703125,1.837209,1.140000


## checking the first eight days of one location as a sample

In [6]:
# First eight rows of pickup location 79.
rides_df.loc[rides_df['PULocationID'].eq(79)].head(8)

Unnamed: 0,date,PULocationID,count,label,arima,PU_day_of_month,week_of_month,PU_day_of_week,last_day_demand,last_week_demand,lag1-8,lag2-9,lag3-10,lag4-11
8424,2023-01-13,79,2771.0,1.0625,1.117331,13,2,4,2024.0,2608.0,1.174014,1.158098,1.168135,1.067249
8425,2023-01-14,79,5177.0,1.157907,1.003131,14,2,5,2771.0,4471.0,1.0625,1.174014,1.158098,1.168135
8426,2023-01-15,79,4366.0,1.275862,1.114261,15,3,6,5177.0,3422.0,1.157907,1.0625,1.174014,1.158098
8427,2023-01-16,79,1595.0,1.305237,1.226678,16,3,0,4366.0,1222.0,1.275862,1.157907,1.0625,1.174014
8428,2023-01-17,79,1408.0,0.965045,0.963674,17,3,1,1595.0,1459.0,1.305237,1.275862,1.157907,1.0625
8429,2023-01-18,79,1742.0,0.966704,0.971698,18,3,2,1408.0,1802.0,0.965045,1.305237,1.275862,1.157907
8430,2023-01-19,79,2069.0,1.022233,0.944664,19,3,3,1742.0,2024.0,0.966704,0.965045,1.305237,1.275862
8431,2023-01-20,79,2665.0,0.961747,1.021653,20,3,4,2069.0,2771.0,1.022233,0.966704,0.965045,1.305237


## Dropping some samples

In [7]:
# Keep only complete rows dated strictly before LAST_DATE.
rides_df = (
    rides_df
    .dropna()
    .loc[lambda frame: frame['date'] < LAST_DATE]
)

print(rides_df.shape)
rides_df.head()

(28296, 14)


Unnamed: 0,date,PULocationID,count,label,arima,PU_day_of_month,week_of_month,PU_day_of_week,last_day_demand,last_week_demand,lag1-8,lag2-9,lag3-10,lag4-11
0,2023-01-13,1,22.0,1.833333,0.916667,13,2,4,9.0,12.0,0.5625,1.25,0.178571,0.46875
1,2023-01-14,1,8.0,1.142857,2.142857,14,2,5,22.0,7.0,1.833333,0.5625,1.25,0.178571
2,2023-01-15,1,20.0,1.538462,0.846154,15,3,6,8.0,13.0,1.142857,1.833333,0.5625,1.25
3,2023-01-16,1,22.0,1.466667,1.0,16,3,0,20.0,15.0,1.538462,1.142857,1.833333,0.5625
4,2023-01-17,1,19.0,3.8,3.0,17,3,1,22.0,5.0,1.466667,1.538462,1.142857,1.833333


### Split locations into high-, mid- and low-demand groups

In [8]:

# Average `count` per pickup location, busiest locations first.
mean_demand = (
    rides_df
    .groupby('PULocationID')['count']
    .mean()
    .reset_index()
    .sort_values(by=['count'], ascending=False)
)

# Cut the ranking into three tiers: first 50, next 100, and everything after.
high_demand_locations, mid_demand_locations, low_demand_locations = np.split(
    mean_demand['PULocationID'].values, [50, 150]
)

# One ride table per tier, each with a fresh 0..n-1 index.
high_demand_rides_df = rides_df.loc[rides_df['PULocationID'].isin(high_demand_locations)].reset_index(drop=True)
mid_demand_rides_df = rides_df.loc[rides_df['PULocationID'].isin(mid_demand_locations)].reset_index(drop=True)
low_demand_rides_df = rides_df.loc[rides_df['PULocationID'].isin(low_demand_locations)].reset_index(drop=True)


## Train and Test split

In [9]:
def train_and_test_split(df: pd.DataFrame, split_date):
  """Split `df` chronologically on its `date` column.

  Rows with `date < split_date` go to the train frame, rows with
  `date >= split_date` go to the test frame. Both returned frames are
  indexed by `date`.

  The input frame is never modified: each part is built with a
  non-inplace `set_index`, so no in-place mutation of a slice happens.

  Args:
    df: frame containing a `date` column comparable with `split_date`.
    split_date: first date that belongs to the test part.

  Returns:
    Tuple `(train, test)` of new DataFrames indexed by `date`.
  """
  train = df[df['date'] < split_date].set_index('date')
  test = df[df['date'] >= split_date].set_index('date')
  return train, test

# Split the high-demand rides at TEST_DATE and inspect the result.
train_df, test_df = train_and_test_split(df=high_demand_rides_df, split_date=TEST_DATE)

print('train_df shape:', train_df.shape)
print('test_df shape:', test_df.shape)
train_df.head()

train_df shape: (3900, 13)
test_df shape: (1500, 13)


Unnamed: 0_level_0,PULocationID,count,label,arima,PU_day_of_month,week_of_month,PU_day_of_week,last_day_demand,last_week_demand,lag1-8,lag2-9,lag3-10,lag4-11
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-01-13,13,573.0,1.127953,1.051181,13,2,4,646.0,508.0,1.050407,1.384615,1.418345,1.196335
2023-01-14,13,421.0,1.007177,0.988038,14,2,5,573.0,418.0,1.127953,1.050407,1.384615,1.418345
2023-01-15,13,340.0,1.218638,1.261649,15,3,6,421.0,279.0,1.007177,1.127953,1.050407,1.384615
2023-01-16,13,319.0,0.698031,0.87965,16,3,0,340.0,457.0,1.218638,1.007177,1.127953,1.050407
2023-01-17,13,547.0,0.862776,0.798107,17,3,1,319.0,634.0,0.698031,1.218638,1.007177,1.127953


# ML Model 

In [15]:
def model_training(ml_model, x_train, y_train, **params):
  """Instantiate `ml_model` with `params`, fit it and return the fitted object.

  Args:
    ml_model: estimator class (called as `ml_model(**params)`).
    x_train: training features passed to `fit`.
    y_train: training target passed to `fit`.
    **params: keyword arguments forwarded to the estimator constructor.

  Returns:
    The estimator instance after `fit(x_train, y_train)` has been called.
  """
  estimator = ml_model(**params)
  estimator.fit(x_train, y_train)
  return estimator

def replace_negatives(values):
    """Clamp every value below 1 (negatives included) up to 1.

    Despite the name, the floor is 1, not 0: any element `< 1` becomes 1,
    all other elements (including NaN) are returned unchanged.

    Implemented with a vectorised `np.where` on a float array instead of
    `np.vectorize(lambda ...)`. `np.vectorize` without `otypes` takes the
    output dtype from the first element, so a clamped first value (int 1)
    truncated every later prediction to an integer, and empty input raised.

    Args:
        values: array-like of numbers (ndarray, pandas Series, list, scalar).

    Returns:
        A float `np.ndarray` with the same shape as `values`.
    """
    arr = np.asarray(values, dtype=float)
    return np.where(arr < 1, 1.0, arr)

## Calculate Error

In [16]:
def symmetric_mean_absolute_percentage_error(actual, predicted):
    """Return the SMAPE of `predicted` against `actual`, rounded to 4 decimals.

    Each term is `|predicted - actual| / ((|predicted| + |actual|) / 2)` and
    the result is the mean of the terms (a fraction, not a percentage).

    Terms where both values are 0 would be 0/0; they are counted as 0 error
    instead of turning the whole metric into NaN. Inputs are converted to
    NumPy arrays, so they are compared position by position regardless of
    any pandas index they carry.

    Args:
        actual: array-like of true values.
        predicted: array-like of predicted values, same shape as `actual`.

    Returns:
        The mean symmetric percentage error rounded to 4 decimal places.
    """
    actual_arr = np.asarray(actual, dtype=float)
    predicted_arr = np.asarray(predicted, dtype=float)

    abs_error = np.abs(predicted_arr - actual_arr)
    scale = (np.abs(predicted_arr) + np.abs(actual_arr)) / 2

    # Divide only where the denominator is non-zero; 0/0 terms stay at 0.
    ratios = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return round(np.mean(ratios), 4)


def error_calculator(real_demand, predicted_demand):
  """Print SMAPE, MAPE, MSE and MAE of the predictions, one per line.

  SMAPE and MAPE are shown as percentages with two decimals, MSE and MAE
  as plain numbers with two decimals. Nothing is returned.
  """
  # (label, format spec, metric) in the order they are reported.
  report = (
      ('SMAPE: ', '.2%', symmetric_mean_absolute_percentage_error),
      ('MAPE: ', '.2%', mean_absolute_percentage_error),
      ('MSE: ', '.2f', mean_squared_error),
      ('MAE: ', '.2f', mean_absolute_error),
  )
  for label, spec, metric in report:
    print(label, format(metric(real_demand, predicted_demand), spec))


## Hyperparameter tuning

In [17]:
def hyper_parameter_tuning(x_train, y_train, n_estimators, learning_rate, max_depth, scoring_method):
  """Grid-search XGBRegressor hyperparameters and return the best combination.

  Args:
    x_train: training features.
    y_train: training target.
    n_estimators: candidate values for `n_estimators`.
    learning_rate: candidate values for `learning_rate`.
    max_depth: candidate values for `max_depth`.
    scoring_method: scoring name handed to `GridSearchCV(scoring=...)`.

  Returns:
    `best_params_` of the fitted `GridSearchCV`.
  """
  search_space = dict(
      n_estimators=n_estimators,
      learning_rate=learning_rate,
      max_depth=max_depth,
  )

  grid_search = GridSearchCV(XGBRegressor(), search_space, scoring=scoring_method)
  grid_search.fit(x_train, y_train)
  return grid_search.best_params_


# Candidate values and scoring used when `fit_model` tunes XGBRegressor.
n_estimators = [100, 700, 1000]
learning_rate = [0.15, 0.1, 0.01]
max_depth = [3, 5]
scoring_method = 'neg_root_mean_squared_error'



### Model fitting (Linear Regression and XGBoost)

In [18]:
def fit_model(df, model_name, output, random_state=42):
    """Train one model on `df`, print its test errors and save the predictions.

    Steps:
      1. Split `df` at TEST_DATE into train and test parts.
      2. Keep `1 - VALIDATION_SPLIT_RATIO` of the (shuffled) training rows
         for fitting.
      3. For XGBRegressor, grid-search the hyperparameters first; any other
         estimator class is fitted with its defaults.
      4. Predict the test part. Target and prediction are multiplied by
         `last_week_demand` before scoring, i.e. the code treats TARGET as
         a ratio to last week's demand. Predictions are floored at 1 via
         `replace_negatives`.
      5. Print the error metrics and write the test rows (without `count`,
         plus `real demand` and `predicted demand`) to `output`.

    Args:
        df: ride table containing `date`, FEATURE_LIST, TARGET and `count`.
        model_name: estimator class (not an instance), e.g. LinearRegression.
        output: parquet path the result table is written to.
        random_state: seed for the shuffled train/validation split so that
            repeated runs fit on the same rows. Pass None for an unseeded
            split.

    Returns:
        None.
    """
    train_df, test_df = train_and_test_split(df, TEST_DATE)

    x_test = test_df[FEATURE_LIST]
    y_test = test_df[TARGET]

    # NOTE(review): the held-out validation part is split off but not used
    # anywhere yet; only the remaining training rows reach the model.
    x_train, _, y_train, _ = train_test_split(
        train_df[FEATURE_LIST],
        train_df[TARGET],
        test_size=VALIDATION_SPLIT_RATIO,
        shuffle=True,
        random_state=random_state,
    )

    if model_name is XGBRegressor:
        params = hyper_parameter_tuning(
            x_train,
            y_train,
            n_estimators,
            learning_rate,
            max_depth,
            scoring_method,
        )
    else:
        params = {}
    model = model_training(model_name, x_train, y_train, **params)

    # Scale ratios back to demand once and reuse them for metrics and output.
    last_week_demand = test_df['last_week_demand']
    real_demand = y_test * last_week_demand
    predicted_demand = replace_negatives(model.predict(x_test) * last_week_demand)

    error_calculator(real_demand, predicted_demand)

    result_df = test_df.drop(columns='count')
    result_df['real demand'] = real_demand
    result_df['predicted demand'] = predicted_demand
    result_df.to_parquet(output)

In [19]:
# Fit and report Linear Regression for each demand tier in turn.
for _title, _tier_df, _tier_output in (
    ('Linear Regression \nhigh demand', high_demand_rides_df, LR_OUTPUT_PATH_HIGH),
    ('\nMid Demand', mid_demand_rides_df, LR_OUTPUT_PATH_MID),
    ('\nLow Demand', low_demand_rides_df, LR_OUTPUT_PATH_LOW),
):
    print(_title)
    fit_model(_tier_df, LinearRegression, _tier_output)


Linear Regression 
high demand
SMAPE:  7.86%
MAPE:  7.96%
MSE:  50473.07
MAE:  152.83

 Mid Demand
SMAPE:  29.64%
MAPE:  36.82%
MSE:  224.14
MAE:  7.81
Low Demand 

SMAPE:  39.82%
MAPE:  50.95%
MSE:  4.29
MAE:  1.28


In [20]:
# Fit and report XGBoost for each demand tier in turn.
for _title, _tier_df, _tier_output in (
    ('XGboost Regression \nhigh demand', high_demand_rides_df, XGB_OUTPUT_PATH_HIGH),
    ('\nMid Demand', mid_demand_rides_df, XGB_OUTPUT_PATH_MID),
    ('\nLow Demand', low_demand_rides_df, XGB_OUTPUT_PATH_LOW),
):
    print(_title)
    fit_model(_tier_df, XGBRegressor, _tier_output)


XGboost Regression 
high demand
SMAPE:  7.65%
MAPE:  7.69%
MSE:  49402.70
MAE:  149.45

Mid Demand
SMAPE:  27.68%
MAPE:  33.55%
MSE:  182.92
MAE:  7.13

Low Demand
SMAPE:  37.06%
MAPE:  47.04%
MSE:  3.60
MAE:  1.17


### Result Data

### Merge 

In [21]:
def merge_after_pred(low_path, mid_path, high_path, output_path):
    """Concatenate the three per-tier result files into one parquet file.

    The files are read in the order low, mid, high, stacked with
    `pd.concat` and written to `output_path`.

    Returns:
        Whatever `DataFrame.to_parquet(output_path)` returns.
    """
    tier_frames = [pd.read_parquet(tier_path) for tier_path in (low_path, mid_path, high_path)]
    combined = pd.concat(tier_frames)
    return combined.to_parquet(output_path)
# Combine the XGBoost results of all three tiers into a single file.
merge_after_pred(
    low_path=XGB_OUTPUT_PATH_LOW,
    mid_path=XGB_OUTPUT_PATH_MID,
    high_path=XGB_OUTPUT_PATH_HIGH,
    output_path=XGB_OUTPUT_PATH,
)


In [22]:
# Combine the Linear Regression results of all three tiers into a single file.
merge_after_pred(
    low_path=LR_OUTPUT_PATH_LOW,
    mid_path=LR_OUTPUT_PATH_MID,
    high_path=LR_OUTPUT_PATH_HIGH,
    output_path=LR_OUTPUT_PATH,
)