### import modules

In [1]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import numpy as np

### ignore warnings

In [2]:
import warnings
warnings.filterwarnings('ignore')

### config

In [14]:
LABEL_DATA_PATH = 'D:/projects/rahnamcollege-ml/demand-prediction/data/label/label.parquet'
ARIMA_MODELS_PATH = 'D:/projects/rahnamcollege-ml/demand-prediction/model/arima_models/'
OUTPUT_PATH = 'D:/projects/rahnamcollege-ml/demand-prediction/data/arima_output.xlsx'
TRAIN_TEST_SPLIT_DATE = '2023-04-01'
DATA_LAST_DATE = '2023-04-31'
ADDED_PASSENGER_COUNT = 100
HIGH_DEMAND_THRESHOLD = 1000

TESTING_ARIMA_ORDERS = [
    (4, 2, 1),
    (5, 1, 2),
    (4, 2, 1),
    (1, 2, 2),
    (6, 1, 4)
]

### load data

In [15]:
rides_df = pd.read_parquet(LABEL_DATA_PATH)
rides_df = rides_df[rides_df['date'] < DATA_LAST_DATE].reset_index(drop=True)
print(rides_df.shape)
rides_df.head()

(31178, 3)


Unnamed: 0,date,PULocationID,count
0,2023-01-02,70,503.0
1,2023-01-02,132,6419.0
2,2023-01-02,142,2028.0
3,2023-01-02,164,1462.0
4,2023-01-02,144,567.0


### prepare data

In [17]:
def train_test_split(df, split_date):
    train_df = df[df['date'] < split_date]
    test_df = df[df['date'] >= split_date]
    return train_df, test_df

train_df, test_df = train_test_split(rides_df, TRAIN_TEST_SPLIT_DATE)

print(train_df.shape)
train_df.head()

(23318, 3)


Unnamed: 0,date,PULocationID,count
0,2023-01-02,70,503.0
1,2023-01-02,132,6419.0
2,2023-01-02,142,2028.0
3,2023-01-02,164,1462.0
4,2023-01-02,144,567.0


### seprate locations data

In [18]:
train_location_dfs = {}
test_location_dfs = {}


location_ids = train_df['PULocationID'].unique()

for location_id in location_ids:
    train_location_dfs[location_id] = train_df[
        train_df['PULocationID'] == location_id].sort_values(by=['date']).reset_index(drop=True)
    
    test_location_dfs[location_id] = test_df[
        test_df['PULocationID'] == location_id].sort_values(by=['date']).reset_index(drop=True)

### train model

In [19]:
def train_arima_model(count_series, order):
    model = ARIMA(count_series, order=order)
    fitted_model = model.fit()
    return fitted_model


models = {}

for location_id, location_df in train_location_dfs.items():
    ## smooth label.
    count_series = location_df['count'] + ADDED_PASSENGER_COUNT
    models[location_id] = train_arima_model(count_series, order=(3, 1, 3))

### predict test data

In [20]:
def predict_passenger_count(location_id, model=None):
    train_df = train_location_dfs[location_id]
    test_df = test_location_dfs[location_id]
    model = model or models[location_id]
    start_index = len(train_df)
    end_index = len(train_df) + len(test_df)
    
    return model.predict(start=start_index + 1, end=end_index, typ='levels')
    

In [21]:
for location_id in location_ids:
    test_df = test_location_dfs[location_id]
    pred = predict_passenger_count(location_id) - ADDED_PASSENGER_COUNT
    pred = pred.apply(lambda x: max(0, int(x)))
    test_df['pred'] = pred.values

### concat test dataframes

In [22]:
pred_df = pd.concat(test_location_dfs.values())
print(pred_df.shape)
pred_df.head()

(7860, 4)


Unnamed: 0,date,PULocationID,count,pred
0,2023-04-01,70,373.0,560
1,2023-04-02,70,528.0,571
2,2023-04-03,70,568.0,478
3,2023-04-04,70,467.0,515
4,2023-04-05,70,421.0,580


### evaluate loss

In [24]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [40]:
def evaluate_model(actual, pred):
    mae = mean_absolute_error(actual, pred)
    mape = mean_absolute_percentage_error(actual, pred)
    mse = mean_squared_error(actual, pred)

    print(f'mape = {mape}')
    print(f'mae = {mae}')
    print(f'rmse = {np.sqrt(mse)}')

evaluate_model(pred_df['count'], pred_df['pred'])

mape = 365559359066460.44
mae = 58.74325699745547
rmse = 193.57363564623876


### testing arima model with some order parameters

In [26]:
def evaluate_custom_order_arima(arima_order_parameter):
    for location_id in location_ids:
        train_df = train_location_dfs[location_id]
        test_df = test_location_dfs[location_id]
        
        series = train_df['count'] + ADDED_PASSENGER_COUNT
        model = train_arima_model(series, arima_order_parameter)
        
        pred = predict_passenger_count(location_id, model) - ADDED_PASSENGER_COUNT
        pred = pred.apply(lambda x: max(0, int(x)))
        test_df['pred'] = pred.values
    pred_df = pd.concat(test_location_dfs.values())
    
    mae = mean_absolute_error(pred_df['count'], pred_df['pred'])
    mape = mean_absolute_percentage_error(pred_df['count'], pred_df['pred'])
    rmse = np.sqrt(mean_squared_error(pred_df['count'], pred_df['pred']))
    
    return {'mae': mae, 'mape': mape, 'rmse': rmse}  

In [27]:
for order in TESTING_ARIMA_ORDERS:
    results = evaluate_custom_order_arima(order)
    
    print('order:', order)
    print('\n'.join([f'{key} = {value}' for key, value in results.items()]))
    print('\n ----- \n')

order: (4, 2, 1)
mae = 70.7320610687023
mape = 582717661709389.0
rmse = 236.2431398360445

 ----- 

order: (5, 1, 2)
mae = 59.35343511450382
mape = 362121496755490.5
rmse = 198.3498656697199

 ----- 

order: (4, 2, 1)
mae = 70.7320610687023
mape = 582717661709389.0
rmse = 236.2431398360445

 ----- 

order: (1, 2, 2)
mae = 3420.431679389313
mape = 560371556688086.06
rmse = 55063.781401922926

 ----- 

order: (6, 1, 4)
mae = 63.27595419847328
mape = 355818749185379.2
rmse = 212.80997777646934

 ----- 



### recreate best model in expriments

In [36]:
models = {}

for location_id, location_df in train_location_dfs.items():
    ## smooth label.
    count_series = location_df['count'] + ADDED_PASSENGER_COUNT
    models[location_id] = train_arima_model(count_series, order=(3, 1, 3))

### add fitted values

In [37]:
dfs = []

for location_id in location_ids:
    train_df = train_location_dfs[location_id]
    model = models[location_id]
    train_df['pred'] = model.fittedvalues.values - ADDED_PASSENGER_COUNT
    test_df = test_location_dfs[location_id]
    pred = predict_passenger_count(location_id) - ADDED_PASSENGER_COUNT
    test_df['pred'] = pred.values
    dfs.append(train_df)
    dfs.append(test_df)

result_df = pd.concat(dfs)
result_df['pred'] = result_df['pred'].apply(lambda x: max(0, int(x)))

print(result_df.shape)
result_df.head()
    

(31178, 4)


Unnamed: 0,date,PULocationID,count,pred
0,2023-01-02,70,503.0,0
1,2023-01-03,70,530.0,500
2,2023-01-04,70,485.0,518
3,2023-01-05,70,417.0,488
4,2023-01-06,70,405.0,489


### evaluate on high demand locations

In [38]:
mean_counts = rides_df.groupby('PULocationID')['count'].mean()
true_items_location_ids = mean_counts[mean_counts > HIGH_DEMAND_THRESHOLD].index.tolist()
high_demand_df = result_df[result_df['PULocationID'].isin(true_items_location_ids)]
print(high_demand_df.shape)
high_demand_df.head()

(4403, 4)


Unnamed: 0,date,PULocationID,count,pred
0,2023-01-02,132,6419.0,0
1,2023-01-03,132,6784.0,6425
2,2023-01-04,132,6006.0,5862
3,2023-01-05,132,5617.0,5605
4,2023-01-06,132,4877.0,5495


In [41]:
high_demand_df = high_demand_df[high_demand_df['pred'] > 0]

evaluate_model(high_demand_df['count'], high_demand_df['pred'])

mape = 0.11701232984044696
mae = 272.27326914259515
rmse = 397.89935688793145


### save results

In [42]:
result_df.to_excel(OUTPUT_PATH)