In [2]:
!pip install prophet

[0m

In [1]:
import numpy as np 
import pandas as pd 
from prophet import Prophet 
import os 
import math 
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid 
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder, StandardScaler

Importing plotly failed. Interactive plots will not work.


In [2]:
# Read the four competition tables from the working directory, in a fixed order:
# training rows, training calendar, test rows, test calendar.
train, train_calendar, test, test_calendar = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "train_calendar.csv", "test.csv", "test_calendar.csv")
)

In [3]:
# Columns removed from both frames before modelling. `errors="ignore"` lets the same
# list be applied to a frame that lacks some of these columns.
ignore_columns = [
    'id',
    'shutdown',
    'mini_shutdown',
    'blackout',
    'mov_change',
    'frankfurt_shutdown',
    'precipitation',
    'snow',
    'user_activity_1',
    'user_activity_2',
]
train_dataset_full = train.drop(columns=ignore_columns, errors="ignore")
test_dataset = test.drop(columns=ignore_columns, errors="ignore")

In [4]:
# Learn the holiday-name -> integer mapping from the training rows only, then apply that
# same mapping to both frames. `transform` raises if the test rows contain a name that
# was not present during fitting.
label_holiday_names = LabelEncoder().fit(train_dataset_full["holiday_name"])
train_dataset_full["holiday_name"] = label_holiday_names.transform(train_dataset_full["holiday_name"])
test_dataset["holiday_name"] = label_holiday_names.transform(test_dataset["holiday_name"])

In [5]:
from prophet import Prophet 
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_absolute_percentage_error

# Assume train and test DataFrames are available and preprocessed

regions = ["Prague_1", "Brno_1", "Prague_2", "Prague_3", "Munich_1", "Frankfurt_1", "Budapest_1"]

# Columns registered with Prophet as extra regressors; they are selected from both the
# training and the test frame.
regressor_columns = ["holiday", "shops_closed", "winter_school_holidays", "school_holidays", "holiday_name"]

# Share of each region's most recent rows held out to score the parameter grid.
validation_fraction = 0.2

# Grid of Prophet prior scales to test. It is the same for every region, so it is built once.
param_grid = {
    'changepoint_prior_scale': [0.01, 0.1, 0.5, 1.0, 2.0],
    'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0],
    'holidays_prior_scale': [0.01, 0.1, 1.0, 10.0]
}


def select_region_frame(dataset, region, columns):
    """Return one warehouse's rows as a Prophet-style frame in chronological order.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``warehouse`` column plus every name listed in ``columns``.
    region : str
        Value of ``warehouse`` to keep.
    columns : list of str
        Columns to keep, in output order. ``date`` is renamed to ``ds`` and ``orders``
        (when selected) to ``y``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``ds`` parsed to datetimes, sorted by ``ds`` and re-indexed
        from zero. The input frame is not modified.
    """
    return (
        dataset.loc[dataset["warehouse"] == region, columns]
        .rename(columns={"date": "ds", "orders": "y"})
        .assign(ds=lambda frame: pd.to_datetime(frame["ds"]))
        # Stable sort: rows sharing a date keep their original relative order.
        .sort_values("ds", kind="mergesort")
        .reset_index(drop=True)
    )


def build_prophet_model(params):
    """Create an unfitted Prophet model with the given prior scales and all extra regressors.

    ``params`` is a dict of Prophet constructor keyword arguments (one point of
    ``param_grid``); an empty dict yields Prophet's defaults.
    """
    model = Prophet(**params)
    for column in regressor_columns:
        model.add_regressor(column)
    return model


for region in regions:
    print(f"================== Training for region: {region} ==================")

    # Training and test rows of the current region, both ordered by date.
    # The explicit ordering matters: Prophet returns its forecast sorted by `ds`, so the
    # hold-out targets must be in the same order for the positional MAPE comparison below,
    # and "the last 20% of rows" is only the most recent period if the rows are chronological.
    cur_df = select_region_frame(train_dataset_full, region, ["date", "orders"] + regressor_columns)
    cur_df_test = select_region_frame(test_dataset, region, ["date"] + regressor_columns)

    # Hold out the most recent rows for validation.
    validation_size = int(len(cur_df) * validation_fraction)
    if validation_size == 0:
        # With a size of zero, `[:-0]` would be empty and `[-0:]` would be the whole frame.
        raise ValueError(
            f"Region {region!r} has only {len(cur_df)} training rows; "
            "too few to hold out a validation split."
        )
    train_cv = cur_df.iloc[:-validation_size]
    valid_cv = cur_df.iloc[-validation_size:]

    best_mape = float('inf')
    best_params = {}

    for params in ParameterGrid(param_grid):
        model = build_prophet_model(params)
        model.fit(train_cv)

        # Predict and validate on the validation part
        forecast = model.predict(valid_cv[["ds"] + regressor_columns])
        mape = mean_absolute_percentage_error(valid_cv["y"].to_numpy(), forecast["yhat"].to_numpy())
        if mape < best_mape:
            best_mape = mape
            best_params = params

    print(f"Best MAPE: {best_mape}")
    print(f"Best Params: {best_params}")

    # Train final model with the best parameters on all data
    final_model = build_prophet_model(best_params)
    final_model.fit(cur_df)

    # Predict on the test data
    forecast = final_model.predict(cur_df_test)
    predictions = forecast[["ds", "yhat"]]

    # Save predictions
    predictions.to_csv(f"{region}_fbprophet_additional_regressors_v4.csv", index=False)

17:14:14 - cmdstanpy - INFO - Chain [1] start processing
17:14:14 - cmdstanpy - INFO - Chain [1] done processing




17:14:14 - cmdstanpy - INFO - Chain [1] start processing
17:14:14 - cmdstanpy - INFO - Chain [1] done processing
17:14:15 - cmdstanpy - INFO - Chain [1] start processing
17:14:15 - cmdstanpy - INFO - Chain [1] done processing
17:14:15 - cmdstanpy - INFO - Chain [1] start processing
17:14:15 - cmdstanpy - INFO - Chain [1] done processing
17:14:15 - cmdstanpy - INFO - Chain [1] start processing
17:14:15 - cmdstanpy - INFO - Chain [1] done processing
17:14:15 - cmdstanpy - INFO - Chain [1] start processing
17:14:15 - cmdstanpy - INFO - Chain [1] done processing
17:14:15 - cmdstanpy - INFO - Chain [1] start processing
17:14:15 - cmdstanpy - INFO - Chain [1] done processing
17:14:15 - cmdstanpy - INFO - Chain [1] start processing
17:14:15 - cmdstanpy - INFO - Chain [1] done processing
17:14:16 - cmdstanpy - INFO - Chain [1] start processing
17:14:16 - cmdstanpy - INFO - Chain [1] done processing
17:14:16 - cmdstanpy - INFO - Chain [1] start processing
17:14:16 - cmdstanpy - INFO - Chain [1]

Best MAPE: 0.05614002437578786
Best Params: {'changepoint_prior_scale': 0.1, 'holidays_prior_scale': 0.1, 'seasonality_prior_scale': 0.01}


17:14:39 - cmdstanpy - INFO - Chain [1] start processing
17:14:40 - cmdstanpy - INFO - Chain [1] done processing




17:14:40 - cmdstanpy - INFO - Chain [1] start processing
17:14:40 - cmdstanpy - INFO - Chain [1] done processing
17:14:40 - cmdstanpy - INFO - Chain [1] start processing
17:14:40 - cmdstanpy - INFO - Chain [1] done processing
17:14:40 - cmdstanpy - INFO - Chain [1] start processing
17:14:40 - cmdstanpy - INFO - Chain [1] done processing
17:14:40 - cmdstanpy - INFO - Chain [1] start processing
17:14:40 - cmdstanpy - INFO - Chain [1] done processing
17:14:40 - cmdstanpy - INFO - Chain [1] start processing
17:14:40 - cmdstanpy - INFO - Chain [1] done processing
17:14:40 - cmdstanpy - INFO - Chain [1] start processing
17:14:40 - cmdstanpy - INFO - Chain [1] done processing
17:14:41 - cmdstanpy - INFO - Chain [1] start processing
17:14:41 - cmdstanpy - INFO - Chain [1] done processing
17:14:41 - cmdstanpy - INFO - Chain [1] start processing
17:14:41 - cmdstanpy - INFO - Chain [1] done processing
17:14:41 - cmdstanpy - INFO - Chain [1] start processing
17:14:41 - cmdstanpy - INFO - Chain [1]

Best MAPE: 0.05253507208474001
Best Params: {'changepoint_prior_scale': 0.1, 'holidays_prior_scale': 1.0, 'seasonality_prior_scale': 0.1}


17:15:07 - cmdstanpy - INFO - Chain [1] done processing
17:15:07 - cmdstanpy - INFO - Chain [1] start processing
17:15:07 - cmdstanpy - INFO - Chain [1] done processing




17:15:07 - cmdstanpy - INFO - Chain [1] start processing
17:15:07 - cmdstanpy - INFO - Chain [1] done processing
17:15:07 - cmdstanpy - INFO - Chain [1] start processing
17:15:08 - cmdstanpy - INFO - Chain [1] done processing
17:15:08 - cmdstanpy - INFO - Chain [1] start processing
17:15:08 - cmdstanpy - INFO - Chain [1] done processing
17:15:08 - cmdstanpy - INFO - Chain [1] start processing
17:15:08 - cmdstanpy - INFO - Chain [1] done processing
17:15:08 - cmdstanpy - INFO - Chain [1] start processing
17:15:08 - cmdstanpy - INFO - Chain [1] done processing
17:15:08 - cmdstanpy - INFO - Chain [1] start processing
17:15:08 - cmdstanpy - INFO - Chain [1] done processing
17:15:08 - cmdstanpy - INFO - Chain [1] start processing
17:15:08 - cmdstanpy - INFO - Chain [1] done processing
17:15:09 - cmdstanpy - INFO - Chain [1] start processing
17:15:09 - cmdstanpy - INFO - Chain [1] done processing
17:15:09 - cmdstanpy - INFO - Chain [1] start processing
17:15:09 - cmdstanpy - INFO - Chain [1]

Best MAPE: 0.05441853548519866
Best Params: {'changepoint_prior_scale': 0.1, 'holidays_prior_scale': 10.0, 'seasonality_prior_scale': 0.01}


17:15:34 - cmdstanpy - INFO - Chain [1] done processing
17:15:34 - cmdstanpy - INFO - Chain [1] start processing
17:15:34 - cmdstanpy - INFO - Chain [1] done processing




17:15:34 - cmdstanpy - INFO - Chain [1] start processing
17:15:34 - cmdstanpy - INFO - Chain [1] done processing
17:15:34 - cmdstanpy - INFO - Chain [1] start processing
17:15:34 - cmdstanpy - INFO - Chain [1] done processing
17:15:34 - cmdstanpy - INFO - Chain [1] start processing
17:15:35 - cmdstanpy - INFO - Chain [1] done processing
17:15:35 - cmdstanpy - INFO - Chain [1] start processing
17:15:35 - cmdstanpy - INFO - Chain [1] done processing
17:15:35 - cmdstanpy - INFO - Chain [1] start processing
17:15:35 - cmdstanpy - INFO - Chain [1] done processing
17:15:35 - cmdstanpy - INFO - Chain [1] start processing
17:15:35 - cmdstanpy - INFO - Chain [1] done processing
17:15:35 - cmdstanpy - INFO - Chain [1] start processing
17:15:35 - cmdstanpy - INFO - Chain [1] done processing
17:15:35 - cmdstanpy - INFO - Chain [1] start processing
17:15:35 - cmdstanpy - INFO - Chain [1] done processing
17:15:36 - cmdstanpy - INFO - Chain [1] start processing
17:15:36 - cmdstanpy - INFO - Chain [1]

Best MAPE: 0.05767974687744649
Best Params: {'changepoint_prior_scale': 0.1, 'holidays_prior_scale': 10.0, 'seasonality_prior_scale': 0.01}


17:16:01 - cmdstanpy - INFO - Chain [1] done processing
17:16:01 - cmdstanpy - INFO - Chain [1] start processing
17:16:01 - cmdstanpy - INFO - Chain [1] done processing
17:16:01 - cmdstanpy - INFO - Chain [1] start processing




17:16:01 - cmdstanpy - INFO - Chain [1] done processing
17:16:01 - cmdstanpy - INFO - Chain [1] start processing
17:16:01 - cmdstanpy - INFO - Chain [1] done processing
17:16:01 - cmdstanpy - INFO - Chain [1] start processing
17:16:01 - cmdstanpy - INFO - Chain [1] done processing
17:16:01 - cmdstanpy - INFO - Chain [1] start processing
17:16:01 - cmdstanpy - INFO - Chain [1] done processing
17:16:01 - cmdstanpy - INFO - Chain [1] start processing
17:16:01 - cmdstanpy - INFO - Chain [1] done processing
17:16:01 - cmdstanpy - INFO - Chain [1] start processing
17:16:02 - cmdstanpy - INFO - Chain [1] done processing
17:16:02 - cmdstanpy - INFO - Chain [1] start processing
17:16:02 - cmdstanpy - INFO - Chain [1] done processing
17:16:02 - cmdstanpy - INFO - Chain [1] start processing
17:16:02 - cmdstanpy - INFO - Chain [1] done processing
17:16:02 - cmdstanpy - INFO - Chain [1] start processing
17:16:02 - cmdstanpy - INFO - Chain [1] done processing
17:16:02 - cmdstanpy - INFO - Chain [1] 

Best MAPE: 0.1620771979623859
Best Params: {'changepoint_prior_scale': 2.0, 'holidays_prior_scale': 0.01, 'seasonality_prior_scale': 1.0}


17:16:18 - cmdstanpy - INFO - Chain [1] done processing
17:16:18 - cmdstanpy - INFO - Chain [1] start processing
17:16:18 - cmdstanpy - INFO - Chain [1] done processing
17:16:18 - cmdstanpy - INFO - Chain [1] start processing
17:16:18 - cmdstanpy - INFO - Chain [1] done processing




17:16:18 - cmdstanpy - INFO - Chain [1] start processing
17:16:18 - cmdstanpy - INFO - Chain [1] done processing
17:16:18 - cmdstanpy - INFO - Chain [1] start processing
17:16:18 - cmdstanpy - INFO - Chain [1] done processing
17:16:18 - cmdstanpy - INFO - Chain [1] start processing
17:16:18 - cmdstanpy - INFO - Chain [1] done processing
17:16:18 - cmdstanpy - INFO - Chain [1] start processing
17:16:18 - cmdstanpy - INFO - Chain [1] done processing
17:16:18 - cmdstanpy - INFO - Chain [1] start processing
17:16:18 - cmdstanpy - INFO - Chain [1] done processing
17:16:18 - cmdstanpy - INFO - Chain [1] start processing
17:16:18 - cmdstanpy - INFO - Chain [1] done processing
17:16:18 - cmdstanpy - INFO - Chain [1] start processing
17:16:18 - cmdstanpy - INFO - Chain [1] done processing
17:16:19 - cmdstanpy - INFO - Chain [1] start processing
17:16:19 - cmdstanpy - INFO - Chain [1] done processing
17:16:19 - cmdstanpy - INFO - Chain [1] start processing
17:16:19 - cmdstanpy - INFO - Chain [1]

Best MAPE: 0.05644225986820088
Best Params: {'changepoint_prior_scale': 2.0, 'holidays_prior_scale': 0.01, 'seasonality_prior_scale': 10.0}


17:16:32 - cmdstanpy - INFO - Chain [1] done processing
17:16:32 - cmdstanpy - INFO - Chain [1] start processing
17:16:32 - cmdstanpy - INFO - Chain [1] done processing




17:16:32 - cmdstanpy - INFO - Chain [1] start processing
17:16:32 - cmdstanpy - INFO - Chain [1] done processing
17:16:33 - cmdstanpy - INFO - Chain [1] start processing
17:16:33 - cmdstanpy - INFO - Chain [1] done processing
17:16:33 - cmdstanpy - INFO - Chain [1] start processing
17:16:33 - cmdstanpy - INFO - Chain [1] done processing
17:16:33 - cmdstanpy - INFO - Chain [1] start processing
17:16:33 - cmdstanpy - INFO - Chain [1] done processing
17:16:33 - cmdstanpy - INFO - Chain [1] start processing
17:16:33 - cmdstanpy - INFO - Chain [1] done processing
17:16:33 - cmdstanpy - INFO - Chain [1] start processing
17:16:33 - cmdstanpy - INFO - Chain [1] done processing
17:16:33 - cmdstanpy - INFO - Chain [1] start processing
17:16:33 - cmdstanpy - INFO - Chain [1] done processing
17:16:34 - cmdstanpy - INFO - Chain [1] start processing
17:16:34 - cmdstanpy - INFO - Chain [1] done processing
17:16:34 - cmdstanpy - INFO - Chain [1] start processing
17:16:34 - cmdstanpy - INFO - Chain [1]

Best MAPE: 0.033656976113287586
Best Params: {'changepoint_prior_scale': 0.1, 'holidays_prior_scale': 10.0, 'seasonality_prior_scale': 1.0}


17:16:59 - cmdstanpy - INFO - Chain [1] done processing


In [6]:
# inference
def _read_region_yhat(region_name):
    """Return the `yhat` column of a region's saved forecast CSV as a NumPy array."""
    forecast_csv = f"{region_name}_fbprophet_additional_regressors_v4.csv"
    return pd.read_csv(forecast_csv)["yhat"].values


prague_1 = _read_region_yhat("Prague_1")
brno_1 = _read_region_yhat("Brno_1")
prague_2 = _read_region_yhat("Prague_2")
prague_3 = _read_region_yhat("Prague_3")
munich_1 = _read_region_yhat("Munich_1")
frankfurt_1 = _read_region_yhat("Frankfurt_1")
budapest_1 = _read_region_yhat("Budapest_1")

In [7]:
# Stack the per-region forecasts end to end into a single 1-D vector.
region_forecasts = (prague_1, brno_1, prague_2, prague_3, munich_1, frankfurt_1, budapest_1)
concatenated_predictions = np.concatenate(region_forecasts)

# Show how many predictions were stacked in total.
concatenated_predictions.shape


(397,)

In [8]:
# Load the example solution file; it is used as the frame that receives the predictions.
submission = pd.read_csv(filepath_or_buffer="solution_example.csv")

In [12]:
# Fill the `orders` column with the stacked forecasts. The assignment is positional, so
# the rows of `submission` must already be in the same order as `concatenated_predictions`.
submission["orders"] = concatenated_predictions

# Write the submission file without the DataFrame index.
submission.to_csv(path_or_buf="fbprophet_with_regressors_0802.csv", index=False)

In [11]:
# Display the submission frame for a visual check of the ids and the predicted orders.
submission

Unnamed: 0,id,orders
0,Prague_1_2024-03-16,9981.187761
1,Prague_1_2024-03-17,9959.766656
2,Prague_1_2024-03-18,10202.018170
3,Prague_1_2024-03-19,9922.910670
4,Prague_1_2024-03-20,9954.050722
...,...,...
392,Budapest_1_2024-05-11,6664.905375
393,Budapest_1_2024-05-12,6333.546935
394,Budapest_1_2024-05-13,6584.952812
395,Budapest_1_2024-05-14,6614.586091
