# imports

In [None]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import warnings

from datetime import date
from itertools import product
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

warnings.simplefilter('ignore')

# Config

In [None]:
# Location handed to pd.read_parquet when the ride records are loaded.
DATA_FILE_PATHS = '/content/drive/MyDrive/RC/data/'
# Pickup-date window kept by load_data: START_DATE is inclusive, END_DATE is
# exclusive. Both are compared as 'YYYY-MM-DD' strings.
START_DATE = '2023-01-01'
END_DATE = '2023-05-01'
# 'year,month,day' strings (comma separated, parsed with split(',')).
# Rows dated on/after TEST_DATE form the test set; rows dated on/after
# LAST_DATE are dropped before splitting.
TEST_DATE = '2023,4,1'
LAST_DATE = '2023,5,1'
# Columns fed to the models as input features.
FEATURE_LIST = [
    'time_interval_number',
    'PU_day_of_week',
    'last_day_demand',
    'last_week_demand'
]
# Column the models are trained to predict.
TARGET = 'count'
# Fraction of the training rows held out for validation.
VALIDATION_SPLIT_RATIO = 0.2
# Number of equal-length intervals each day is divided into (8 -> 3 hours each).
NUMBER_INTERVAL_PER_DAY = 8
# Destinations of the per-model test-set result tables.
TIME_INTERVAL_LR_OUTPUT_PATH = '/content/drive/MyDrive/RC/output/time_interval_lr_result.parquet'
TIME_INTERVAL_XGB_OUTPUT_PATH = '/content/drive/MyDrive/RC/output/time_interval_XGB_result.parquet'

# Load Data

In [None]:
def load_data(file_paths, interval : int, start_date = None, end_date = None):
    df = pd.read_parquet(file_paths)
    df['date'] = df['tpep_pickup_datetime'].dt.date.astype(str)

    if start_date:
        if end_date:
            df = df[
                (df['date'] >= start_date) & (df['date'] < end_date)
            ]
        else:
            df = df[df['date'] > start_date].reset_index(drop = True)
    df = df.sort_values(by = 'date')
    df = df.reset_index(drop = True)
    interval_per_day = int(24 / interval)

    df['interval_start'] = df['tpep_pickup_datetime'].dt.floor(f"{interval_per_day}H")
    df['interval_end'] = df['interval_start'] + \
        pd.Timedelta(hours=interval_per_day)
    df['time_interval'] = df['interval_start'].dt.strftime(
        '%H:%M:%S') + ' - ' + df['interval_end'].dt.strftime('%H:%M:%S')
    df.drop(
        columns = ['interval_start', 'interval_end'],
        inplace = True
    )
    df['time_interval_number'] = pd.cut(
        df['tpep_pickup_datetime'].dt.hour,
        bins = interval,
        labels = range(1, interval + 1),
        right = False
    )

    return df

# Load the rides inside the configured date window and attach the
# time-of-day interval columns.
rides_df = load_data(
    DATA_FILE_PATHS,
    NUMBER_INTERVAL_PER_DAY,
    START_DATE,
    END_DATE
)
print(rides_df.shape)
rides_df.head()

(12672629, 22)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,date,time_interval,time_interval_number
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,...,0.5,0.0,0.0,1.0,14.3,2.5,0.0,2023-01-01,00:00:00 - 03:00:00,1
1,1,2023-01-01 16:18:55,2023-01-01 16:26:09,3.0,0.0,1.0,N,107,90,1,...,0.5,2.1,0.0,1.0,12.6,2.5,0.0,2023-01-01,15:00:00 - 18:00:00,6
2,2,2023-01-01 16:59:08,2023-01-01 17:41:59,1.0,19.82,2.0,N,132,238,1,...,0.5,16.36,6.55,1.0,98.16,2.5,1.25,2023-01-01,15:00:00 - 18:00:00,6
3,2,2023-01-01 16:29:59,2023-01-01 16:59:01,1.0,9.36,1.0,N,138,68,1,...,0.5,11.8,6.55,1.0,70.8,2.5,1.25,2023-01-01,15:00:00 - 18:00:00,6
4,2,2023-01-01 16:35:44,2023-01-01 16:53:56,2.0,3.18,1.0,N,114,162,1,...,0.5,4.76,0.0,1.0,28.56,2.5,0.0,2023-01-01,15:00:00 - 18:00:00,6


# aggregate data and labeling

In [None]:
def labeling_by_interval(rides_df : pd.DataFrame):
    """Count rides per (date, interval, pickup location), including empty slots.

    Args:
        rides_df: One row per ride with ``date``, ``time_interval_number``
            and ``PULocationID`` columns.

    Returns:
        A frame with one row for every combination of the dates, intervals
        and pickup locations that occur in ``rides_df``. ``count`` holds the
        number of rides, with 0 for combinations that had none. Rows are
        ordered by date, then interval number.
    """
    key_columns = ['date', 'time_interval_number', 'PULocationID']

    # Rides actually observed for each key combination.
    observed_counts = (
        rides_df
        .groupby(key_columns)
        .size()
        .reset_index(name = 'count')
    )

    # Every combination of the distinct key values seen in the data.
    full_grid = pd.DataFrame(
        list(
            product(
                rides_df['date'].unique(),
                rides_df['time_interval_number'].unique(),
                rides_df['PULocationID'].unique()
            )
        ),
        columns = key_columns
    )

    # The right join keeps every grid row; missing counts become 0.
    labeled = observed_counts.merge(
        full_grid,
        how = 'right',
        on = key_columns
    ).fillna(0)

    return labeled.sort_values(
        by = ['date', 'time_interval_number'],
        ascending = [True, True]
    )

# Replace the per-ride table with per-(date, interval, location) ride counts.
rides_df = labeling_by_interval(rides_df)

print(rides_df.shape)
rides_df.head()

(251520, 4)


Unnamed: 0,date,time_interval_number,PULocationID,count
0,2023-01-01,1,161,504
1,2023-01-01,1,107,604
2,2023-01-01,1,132,389
3,2023-01-01,1,138,42
4,2023-01-01,1,114,205


# Feature Extraction

## adding calendar and lag features

In [None]:
def adding_feature(rides_df : pd.DataFrame, interval : int):
    """Add day-of-week and lagged-demand features to the demand table.

    The caller's frame is not modified; a new frame is returned.

    Args:
        rides_df: Demand table with ``date``, ``time_interval_number``,
            ``PULocationID`` and ``count`` columns.
        interval: Number of rows that make up one day for a single pickup
            location (the number of intervals per day).

    Returns:
        A frame sorted by location, date and interval with ``date``
        converted to datetime and three added columns: ``PU_day_of_week``
        (0 = Monday), ``last_day_demand`` and ``last_week_demand``. The lag
        columns are NaN where a location has no row that far back.
    """
    pickup_dates = rides_df['date'].astype('datetime64[ns]')

    # assign() builds a new frame, so the input DataFrame stays untouched.
    featured_df = rides_df.assign(
        date = pickup_dates,
        PU_day_of_week = pickup_dates.dt.weekday.astype(np.uint8)
    ).sort_values(
        [
            'PULocationID',
            'date',
            'time_interval_number'
        ]
    )

    # The lags are row shifts within each location, so they match "one day"
    # and "one week" only when every location has a row for every interval.
    last_day_demand = featured_df.groupby(['PULocationID'])['count'].shift(interval)
    last_week_demand = featured_df.groupby(['PULocationID'])['count'].shift(interval * 7)
    featured_df['last_day_demand'] = last_day_demand
    featured_df['last_week_demand'] = last_week_demand
    return featured_df

# Shift every count up by one before the lag features are derived from it.
# NOTE(review): presumably this keeps zero-demand rows from producing a zero
# denominator in the percentage-based error metrics - confirm.
rides_df['count'] = rides_df['count'] + 1
rides_df = adding_feature(rides_df, NUMBER_INTERVAL_PER_DAY)

print(rides_df.shape)
rides_df.head()

(251520, 7)


Unnamed: 0,date,time_interval_number,PULocationID,count,PU_day_of_week,last_day_demand,last_week_demand
58,2023-01-01,1,1,1,6,,
1368,2023-01-01,2,1,2,6,,
1630,2023-01-01,3,1,2,6,,
1892,2023-01-01,4,1,2,6,,
844,2023-01-01,5,1,14,6,,


## checking the most recent rows of one pickup location as a sample

In [None]:
# Show the last 16 rows for pickup location 79 to eyeball the lag columns.
rides_df[(rides_df['PULocationID'] == 79)].tail(16)

Unnamed: 0,date,time_interval_number,PULocationID,count,PU_day_of_week,last_day_demand,last_week_demand
247342,2023-04-29,1,79,1510,5,433.0,1289.0
248652,2023-04-29,2,79,335,5,104.0,432.0
248914,2023-04-29,3,79,79,5,188.0,75.0
249176,2023-04-29,4,79,295,5,187.0,231.0
248128,2023-04-29,5,79,469,5,260.0,353.0
247604,2023-04-29,6,79,444,5,299.0,473.0
247866,2023-04-29,7,79,533,5,599.0,654.0
248390,2023-04-29,8,79,924,5,1083.0,1142.0
249438,2023-04-30,1,79,1354,6,1510.0,1244.0
250748,2023-04-30,2,79,353,6,335.0,324.0


## Dropping some samples

In [None]:
# Drop rows with missing values; the lag columns are NaN for the earliest
# rows of each location, where no row exists one day / one week back.
rides_df = rides_df.dropna()
# LAST_DATE is a 'year,month,day' string.
# NOTE(review): this rebinds the module-level name `date`, shadowing the
# `date` class imported from datetime at the top of the notebook.
date = LAST_DATE.split(',')
end_date_time = datetime.datetime(
    int(date[0]),
    int(date[1]),
    int(date[2])
)
# Keep only rows dated strictly before LAST_DATE.
rides_df = rides_df[rides_df['date'] < end_date_time]

print(rides_df.shape)
rides_df.head()

(236848, 7)


Unnamed: 0,date,time_interval_number,PULocationID,count,PU_day_of_week,last_day_demand,last_week_demand
14730,2023-01-08,1,1,1,6,1.0,1.0
16040,2023-01-08,2,1,1,6,1.0,2.0
16302,2023-01-08,3,1,4,6,2.0,2.0
16564,2023-01-08,4,1,2,6,2.0,2.0
15516,2023-01-08,5,1,3,6,2.0,14.0


## Train and Test split

In [None]:
def train_and_test_split(data, split_date):
  """Split a demand table into train and test parts at a calendar date.

  Args:
      data: Frame with a datetime ``date`` column and a ``PULocationID``
          column. It is not modified.
      split_date: 'year,month,day' string. Rows dated before it go to the
          train set; rows dated on or after it go to the test set.

  Returns:
      ``(train_data, test_data, pu_location_id)``. Both frames are indexed
      by ``date`` and have ``PULocationID`` removed. ``pu_location_id`` is
      the ``PULocationID`` column of the test rows, also indexed by date.
  """
  date_parts = split_date.split(',')
  split_date_time = datetime.datetime(
      int(date_parts[0]),
      int(date_parts[1]),
      int(date_parts[2])
  )

  # Build the masks from the frame that was passed in, not from a global,
  # so the function works for any input and cannot misalign with `data`.
  train_data = data[data['date'] < split_date_time]
  test_data = data[data['date'] >= split_date_time]

  # set_index returns new frames, so the slices are never mutated in place.
  train_data = train_data.set_index('date')
  test_data = test_data.set_index('date')

  pu_location_id = test_data['PULocationID']
  train_data = train_data.drop('PULocationID', axis = 1)
  test_data = test_data.drop('PULocationID', axis = 1)

  return train_data, test_data, pu_location_id

# Rows dated before TEST_DATE become the training set, the rest the test
# set; the test rows' pickup locations are kept separately for the result
# tables.
train_df, test_df, pu_location_id = train_and_test_split(
    rides_df,
    TEST_DATE
)

print(train_df.shape)
print(test_df.shape)
train_df.head()

(173968, 5)
(62880, 5)


Unnamed: 0_level_0,time_interval_number,count,PU_day_of_week,last_day_demand,last_week_demand
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-08,1,1,6,1.0,1.0
2023-01-08,2,1,6,1.0,2.0
2023-01-08,3,4,6,2.0,2.0
2023-01-08,4,2,6,2.0,2.0
2023-01-08,5,3,6,2.0,14.0


## Target and Feature split

In [None]:
# Separate the target column from the model inputs for both sets. The label
# must be taken first because the frame is then narrowed to FEATURE_LIST.
train_label_df = train_df[TARGET]
train_df = train_df[FEATURE_LIST]

test_label_df = test_df[TARGET]
test_df = test_df[FEATURE_LIST]

## Train and Validation split

In [None]:
# Hold out the final VALIDATION_SPLIT_RATIO share of the training rows;
# shuffle = False keeps the rows in their existing order.
# NOTE(review): the rows were sorted by PULocationID first in adding_feature,
# so the tail of that ordering is held out rather than the latest dates -
# confirm this is intended.
train_df, validation_df, train_label_df, validation_label_df = train_test_split(
    train_df,
    train_label_df,
    test_size = VALIDATION_SPLIT_RATIO,
    shuffle = False
)

# ML Models

In [None]:
def model_training(ml_model, train_df, train_label_df, **params):
  """Instantiate an estimator class and fit it on the training data.

  Args:
      ml_model: Estimator class (or factory) called with ``**params``.
      train_df: Feature matrix passed to ``fit``.
      train_label_df: Target values passed to ``fit``.
      **params: Keyword arguments forwarded to ``ml_model``.

  Returns:
      The fitted estimator instance.
  """
  estimator = ml_model(**params)
  estimator.fit(train_df, train_label_df)
  return estimator

def replace_negatives(values):
  """Return ``values`` as an array with every negative entry replaced by 0.

  Implemented with ``np.maximum`` rather than ``np.vectorize``, which picks
  its output dtype from the first element (a leading negative value would
  turn the whole result into integers) and fails on empty input.

  Args:
      values: Array-like of numbers.

  Returns:
      ``numpy.ndarray`` of the same shape with negatives clipped to 0.
  """
  return np.maximum(np.asarray(values), 0)

## Calculate Error

In [None]:
def symmetric_mean_absolute_percentage_error(actual, predicted) -> float:
  """Return the symmetric MAPE of two equally shaped value sequences.

  Each term is ``|predicted - actual| / ((|predicted| + |actual|) / 2)``.
  A term whose actual and predicted values are both zero counts as 0
  (a perfect prediction) instead of producing 0/0 = NaN.

  Args:
      actual: Array-like of observed values.
      predicted: Array-like of predicted values, compared position by
          position with ``actual``.

  Returns:
      The mean of the terms as a fraction (not a percentage), rounded to
      4 decimal places.
  """
  actual = np.asarray(actual, dtype = float)
  predicted = np.asarray(predicted, dtype = float)

  absolute_error = np.abs(predicted - actual)
  scale = (np.abs(predicted) + np.abs(actual)) / 2

  # Divide only where the denominator is non-zero; other terms stay at 0.
  terms = np.divide(
      absolute_error,
      scale,
      out = np.zeros_like(absolute_error),
      where = scale != 0
  )
  return round(float(np.mean(terms)), 4)

def error_calculator(real_demand, predicted_demand):
  """Print SMAPE, MAPE, MSE and MAE for a set of predictions.

  SMAPE and MAPE are printed as percentages; every figure is rounded to
  two decimal places.

  Args:
      real_demand: Observed target values.
      predicted_demand: Predicted values for the same rows.
  """
  smape_percent = round(
      symmetric_mean_absolute_percentage_error(real_demand, predicted_demand) * 100,
      2
  )
  print('SMAPE: ', smape_percent, '%')

  mape = mean_absolute_percentage_error(real_demand, predicted_demand)
  mape_percent = round(float(mape) * 100, 2)
  print('MAPE:  ', mape_percent, '%')

  mse = mean_squared_error(real_demand, predicted_demand)
  print('MSE:   ', round(float(mse), 2))

  mae = mean_absolute_error(real_demand, predicted_demand)
  print('MAE:   ', round(float(mae), 2))

## Linear Regression Model

In [None]:
# Fit a linear regression with default settings on the training rows.
lr_model = model_training(
    LinearRegression,
    train_df,
    train_label_df
)

### Validation prediction

In [None]:
# Predict on the validation rows, round to whole rides and clip negatives to
# zero before scoring. np.round replaces np.round_, which was removed in
# NumPy 2.0.
lr_validation_pred = replace_negatives(
    np.round(
        lr_model.predict(
            validation_df
        )
    )
)
error_calculator(
    validation_label_df,
    lr_validation_pred
)

SMAPE:  36.2 %
MAPE:   48.61 %
MSE:    1232.81
MAE:    12.08


### Test prediction

In [None]:
# Predict on the test rows, round to whole rides and clip negatives to zero
# before scoring. np.round replaces np.round_, which was removed in
# NumPy 2.0.
lr_test_pred = replace_negatives(
    np.round(
        lr_model.predict(
            test_df
        )
    )
)
error_calculator(
    test_label_df,
    lr_test_pred
)

SMAPE:  38.17 %
MAPE:   50.2 %
MSE:    612.97
MAE:    7.61


### Result Data

In [None]:
# Assemble the linear-regression result table: test features plus pickup
# location, observed demand and predicted demand.
lr_result_df = test_df.copy()
lr_result_df['PULocationID'] = pu_location_id
lr_result_df['real demand'] = test_label_df
lr_result_df['predicted demand'] = lr_test_pred

print(lr_result_df.shape)
lr_result_df.head()

(62880, 7)


Unnamed: 0_level_0,time_interval_number,PU_day_of_week,last_day_demand,last_week_demand,PULocationID,real demand,predicted demand
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-04-01,1,5,1.0,1.0,1,1,1.0
2023-04-01,2,5,1.0,1.0,1,2,1.0
2023-04-01,3,5,1.0,4.0,1,2,3.0
2023-04-01,4,5,3.0,3.0,1,1,3.0
2023-04-01,5,5,6.0,4.0,1,2,4.0


In [None]:
# Persist the linear-regression results as parquet.
lr_result_df.to_parquet(TIME_INTERVAL_LR_OUTPUT_PATH)

## XGBoost Model

### Hyperparameter tuning

In [None]:
def hyper_parameter_tuning(n_estimators, learning_rate, max_depth, scoring_method):
  """Grid-search XGBRegressor settings on the notebook's training data.

  The search is fitted on the module-level ``train_df`` and
  ``train_label_df``.

  Args:
      n_estimators: Candidate values for ``n_estimators``.
      learning_rate: Candidate values for ``learning_rate``.
      max_depth: Candidate values for ``max_depth``.
      scoring_method: Scoring name passed to ``GridSearchCV``.

  Returns:
      The best parameter combination found (``best_params_``).
  """
  search_space = {
      'n_estimators' : n_estimators,
      'learning_rate' : learning_rate,
      'max_depth' : max_depth
  }

  grid_search = GridSearchCV(XGBRegressor(), search_space, scoring = scoring_method)
  grid_search.fit(train_df, train_label_df)

  return grid_search.best_params_

# Candidate values explored by the grid search.
n_estimators = [100, 500, 700]
learning_rate = [0.15, 0.1, 0.01]
max_depth = [2, 3, 5]
# Scoring name handed to GridSearchCV.
scoring_method = 'neg_root_mean_squared_error'

param = hyper_parameter_tuning(
    n_estimators,
    learning_rate,
    max_depth,
    scoring_method
)

print(param)

{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 700}


### XGBoost Model

In [None]:
# Fit XGBoost on the training rows with the parameters chosen by the grid
# search.
XGB_model = model_training(
    XGBRegressor,
    train_df,
    train_label_df,
    n_estimators = param['n_estimators'],
    learning_rate = param['learning_rate'],
    max_depth = param['max_depth']
)

### Validation prediction

In [None]:
# Predict on the validation rows, round to whole rides and clip negatives to
# zero before scoring. np.round replaces np.round_, which was removed in
# NumPy 2.0.
XGB_validation_pred = replace_negatives(
    np.round(
        XGB_model.predict(
            validation_df
        )
    )
)
error_calculator(
    validation_label_df,
    XGB_validation_pred
)

SMAPE:  27.73 %
MAPE:   34.46 %
MSE:    1168.5
MAE:    11.53


### Test prediction

In [None]:
# Predict on the test rows, round to whole rides and clip negatives to zero
# before scoring. np.round replaces np.round_, which was removed in
# NumPy 2.0.
XGB_test_pred = replace_negatives(
    np.round(
        XGB_model.predict(
            test_df
        )
    )
)
error_calculator(
    test_label_df,
    XGB_test_pred
)

SMAPE:  27.47 %
MAPE:   34.4 %
MSE:    562.2
MAE:    7.11


### Result Data

In [None]:
# Assemble the XGBoost result table: test features plus pickup location,
# observed demand and predicted demand.
XGB_result_df = test_df.copy()
XGB_result_df['PULocationID'] = pu_location_id
XGB_result_df['real demand'] = test_label_df
XGB_result_df['predicted demand'] = XGB_test_pred

print(XGB_result_df.shape)
XGB_result_df.head()

(62880, 7)


Unnamed: 0_level_0,time_interval_number,PU_day_of_week,last_day_demand,last_week_demand,PULocationID,real demand,predicted demand
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-04-01,1,5,1.0,1.0,1,1,1.0
2023-04-01,2,5,1.0,1.0,1,2,1.0
2023-04-01,3,5,1.0,4.0,1,2,3.0
2023-04-01,4,5,3.0,3.0,1,1,3.0
2023-04-01,5,5,6.0,4.0,1,2,4.0


In [None]:
# Persist the XGBoost results as parquet.
XGB_result_df.to_parquet(TIME_INTERVAL_XGB_OUTPUT_PATH)