# Forecasting

Get fast, easy model settings (SARIMA) from notebook 3 and predict revenue 

In [52]:
# Import
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import pmdarima as pm

In [53]:
# Settings
DATA_CLEANED_VARIABLE_PATH = "../data/cleaned_data_variable.csv"
DATA_CLEANED_PATH = "../data/cleaned_data.csv"
THAI_COVID_RAW_DATA_PATH = "../raw_data/data_thai_covid_case.xlsx"
DATA_VARIABLE_OUTPUT_PATH = "../data/forecast_data_variable.csv"
DATA_OUTPUT_PATH = "../data/forecast_data.csv"
FORECASTING_MONTH = 12

## Forecast all by province

In [54]:
data_tourism = pd.read_csv(DATA_CLEANED_PATH)
data_tourism["date"] = pd.to_datetime(data_tourism["date"])
# Our problem is about sales, so in this notebook, we focus on revenue_all
data_tourism_revenue_all  = data_tourism[["date", "province", "region", "revenue_all"]]
## divide by 1e6 
data_tourism_revenue_all["revenue_all"]  = data_tourism_revenue_all["revenue_all"].astype(np.float64) / 1e6
thai_revenue_all = data_tourism_revenue_all.groupby(["date", "province", "region"]).sum("revenue_all").reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_tourism_revenue_all["revenue_all"]  = data_tourism_revenue_all["revenue_all"].astype(np.float64) / 1e6


In [4]:
thai_revenue_all

Unnamed: 0,date,province,region,revenue_all
0,2019-01-01,Amnat Charoen,east_northeast,34.510000
1,2019-01-01,Ang Thong,central,132.070000
2,2019-01-01,Bangkok Metropolis,central,81926.490000
3,2019-01-01,Bueng Kan,east_northeast,137.810000
4,2019-01-01,Buri Ram,east_northeast,364.160000
...,...,...,...,...
4461,2023-10-01,Udon Thani,east_northeast,873.069999
4462,2023-10-01,Uthai Thani,north,144.190000
4463,2023-10-01,Uttaradit,north,147.760000
4464,2023-10-01,Yala,south,338.720000


In [5]:
map_province_to_region = {}
for index, row in thai_revenue_all.iterrows():
    map_province_to_region[row["province"]] = row["region"]

In [6]:
result = []

for province, region in map_province_to_region.items():
    print(f"Forecasting {province} ....")
    thai_revenue_all_province = thai_revenue_all[thai_revenue_all["province"] == province].sort_values(by="date").reset_index(drop=True)
    train_data = thai_revenue_all_province.set_index("date")

    SARIMAModel = pm.auto_arima(
                            train_data["revenue_all"],
                            start_p=0,
                            start_q=0, 
                            test= "adf",  # use adftest to find optimal 'd'
                            max_p=3,
                            max_q=3,
                            m=12,
                            start_P=0,
                            seasonal=True,
                            D=1,
                            alpha=0.05,
                            trace=False,
                            suppress_warnings=True,
                            stepwise=True)

    forecast, forecast_int = SARIMAModel.predict(n_periods=FORECASTING_MONTH, 
                                      return_conf_int=True,
                                      alpha=0.05)
    
    for data_sample in forecast.iteritems():
        result.append([data_sample[0], province, region, "revenue_all_forecast", data_sample[1] * 1e6])
    

Forecasting Amnat Charoen ....
Forecasting Ang Thong ....
Forecasting Bangkok Metropolis ....
Forecasting Bueng Kan ....
Forecasting Buri Ram ....
Forecasting Chachoengsao ....
Forecasting Chai Nat ....
Forecasting Chaiyaphum ....
Forecasting Chanthaburi ....
Forecasting Chiang Mai ....
Forecasting Chiang Rai ....
Forecasting Chon Buri ....
Forecasting Chumphon ....
Forecasting Kalasin ....
Forecasting Kamphaeng Phet ....
Forecasting Kanchanaburi ....
Forecasting Khon Kaen ....
Forecasting Krabi ....
Forecasting Lampang ....
Forecasting Lamphun ....
Forecasting Loei ....
Forecasting Lop Buri ....
Forecasting Mae Hong Son ....
Forecasting Maha Sarakham ....
Forecasting Mukdahan ....
Forecasting Nakhon Nayok ....
Forecasting Nakhon Pathom ....
Forecasting Nakhon Phanom ....
Forecasting Nakhon Ratchasima ....
Forecasting Nakhon Sawan ....
Forecasting Nakhon Si Thammarat ....
Forecasting Nan ....
Forecasting Narathiwat ....
Forecasting Nong Bua Lam Phu ....
Forecasting Nong Khai ....
Forec

In [7]:
df_result = pd.DataFrame(result, columns=["date", "province", "region", "variable", "value"])
df_result

Unnamed: 0,date,province,region,variable,value
0,2023-11-01,Amnat Charoen,east_northeast,revenue_forecast,2.428000e+07
1,2023-12-01,Amnat Charoen,east_northeast,revenue_forecast,2.694000e+07
2,2024-01-01,Amnat Charoen,east_northeast,revenue_forecast,2.824000e+07
3,2024-02-01,Amnat Charoen,east_northeast,revenue_forecast,2.598000e+07
4,2024-03-01,Amnat Charoen,east_northeast,revenue_forecast,2.342000e+07
...,...,...,...,...,...
919,2024-06-01,Yasothon,east_northeast,revenue_forecast,6.336960e+07
920,2024-07-01,Yasothon,east_northeast,revenue_forecast,6.180383e+07
921,2024-08-01,Yasothon,east_northeast,revenue_forecast,6.119584e+07
922,2024-09-01,Yasothon,east_northeast,revenue_forecast,5.961885e+07


In [9]:
df_result["value"] = df_result["value"].astype(np.int64)

In [10]:
df_result

Unnamed: 0,date,province,region,variable,value
0,2023-11-01,Amnat Charoen,east_northeast,revenue_forecast,24280000
1,2023-12-01,Amnat Charoen,east_northeast,revenue_forecast,26940000
2,2024-01-01,Amnat Charoen,east_northeast,revenue_forecast,28240000
3,2024-02-01,Amnat Charoen,east_northeast,revenue_forecast,25980000
4,2024-03-01,Amnat Charoen,east_northeast,revenue_forecast,23420000
...,...,...,...,...,...
919,2024-06-01,Yasothon,east_northeast,revenue_forecast,63369601
920,2024-07-01,Yasothon,east_northeast,revenue_forecast,61803832
921,2024-08-01,Yasothon,east_northeast,revenue_forecast,61195841
922,2024-09-01,Yasothon,east_northeast,revenue_forecast,59618851


# Output results

In [55]:
data_cleaned_variable_df = pd.read_csv(DATA_CLEANED_VARIABLE_PATH)

In [56]:
data_cleaned_variable_df

Unnamed: 0,date,province,region,variable,value
0,2019-01-01,Bangkok Metropolis,central,ratio_tourist_stay,9.337000e-01
1,2019-01-01,Lop Buri,central,ratio_tourist_stay,6.132000e-01
2,2019-01-01,Phra Nakhon Si Ayutthaya,central,ratio_tourist_stay,7.337000e-01
3,2019-01-01,Saraburi,central,ratio_tourist_stay,6.733000e-01
4,2019-01-01,Chai Nat,central,ratio_tourist_stay,7.931000e-01
...,...,...,...,...,...
35723,2023-10-01,Roi Et,east_northeast,revenue_foreign,2.410000e+06
35724,2023-10-01,Si Sa Ket,south,revenue_foreign,8.600000e+05
35725,2023-10-01,Surin,east_northeast,revenue_foreign,4.440000e+06
35726,2023-10-01,Amnat Charoen,east_northeast,revenue_foreign,7.400000e+05


In [57]:
df_variables = pd.concat([data_cleaned_variable_df, df_result])

In [58]:
df_variables["date"] = pd.to_datetime(df_variables["date"])

In [59]:
DATA_VARIABLE_OUTPUT_PATH

'../data/forecast_data_variable.csv'

In [60]:
df_variables.to_csv(DATA_VARIABLE_OUTPUT_PATH, index=False)

In [61]:
data_tourism["revenue_all_forecast"] = None

In [62]:
data_tourism

Unnamed: 0,date,province,region,ratio_tourist_stay,no_tourist_stay,no_tourist_all,no_tourist_thai,no_tourist_foreign,revenue_all,revenue_thai,revenue_foreign,revenue_all_forecast
0,2019-01-01,Bangkok Metropolis,central,0.9337,3334971,5959075,3534061,2425014,81926490000,29742580000,52183910000,
1,2019-01-01,Lop Buri,central,0.6132,51858,268664,266301,2363,457240000,451830000,5410000,
2,2019-01-01,Phra Nakhon Si Ayutthaya,central,0.7337,117052,730329,561553,168776,1438730000,1054250000,384480000,
3,2019-01-01,Saraburi,central,0.6733,89850,207236,201400,5836,347790000,336190000,11600000,
4,2019-01-01,Chai Nat,central,0.7931,27141,79073,78514,559,101790000,100900000,890000,
...,...,...,...,...,...,...,...,...,...,...,...,...
4461,2023-10-01,Roi Et,east_northeast,0.4092,25466,62454,61530,924,82280000,79870000,2410000,
4462,2023-10-01,Si Sa Ket,south,0.4684,26470,83815,83471,344,103050000,102190000,860000,
4463,2023-10-01,Surin,east_northeast,0.5071,65693,93421,91175,2246,164579999,160140000,4440000,
4464,2023-10-01,Amnat Charoen,east_northeast,0.3414,5832,20033,19610,423,23000000,22260000,740000,


In [63]:
data_temp = [{"date": data_sample[0], "province": data_sample[1],
              "region": data_sample[2], "revenue_all_forecast": data_sample[-1]} for data_sample in result]

In [64]:
data_temp

[{'date': Timestamp('2023-11-01 00:00:00', freq='MS'),
  'province': 'Amnat Charoen',
  'region': 'east_northeast',
  'revenue_all_forecast': 24280000.0000018},
 {'date': Timestamp('2023-12-01 00:00:00', freq='MS'),
  'province': 'Amnat Charoen',
  'region': 'east_northeast',
  'revenue_all_forecast': 26940000.000001792},
 {'date': Timestamp('2024-01-01 00:00:00', freq='MS'),
  'province': 'Amnat Charoen',
  'region': 'east_northeast',
  'revenue_all_forecast': 28240000.000001796},
 {'date': Timestamp('2024-02-01 00:00:00', freq='MS'),
  'province': 'Amnat Charoen',
  'region': 'east_northeast',
  'revenue_all_forecast': 25980000.000001796},
 {'date': Timestamp('2024-03-01 00:00:00', freq='MS'),
  'province': 'Amnat Charoen',
  'region': 'east_northeast',
  'revenue_all_forecast': 23420000.000001796},
 {'date': Timestamp('2024-04-01 00:00:00', freq='MS'),
  'province': 'Amnat Charoen',
  'region': 'east_northeast',
  'revenue_all_forecast': 25480000.000001796},
 {'date': Timestamp('202

In [65]:
df_dictionary = pd.DataFrame(data_temp)
df = pd.concat([data_tourism,  df_dictionary]) 

In [70]:
df = df.where(pd.notnull(df), None)
df

Unnamed: 0,date,province,region,ratio_tourist_stay,no_tourist_stay,no_tourist_all,no_tourist_thai,no_tourist_foreign,revenue_all,revenue_thai,revenue_foreign,revenue_all_forecast
0,2019-01-01,Bangkok Metropolis,central,0.9337,3334971.0,5959075.0,3534061.0,2425014.0,8.192649e+10,2.974258e+10,5.218391e+10,
1,2019-01-01,Lop Buri,central,0.6132,51858.0,268664.0,266301.0,2363.0,4.572400e+08,4.518300e+08,5.410000e+06,
2,2019-01-01,Phra Nakhon Si Ayutthaya,central,0.7337,117052.0,730329.0,561553.0,168776.0,1.438730e+09,1.054250e+09,3.844800e+08,
3,2019-01-01,Saraburi,central,0.6733,89850.0,207236.0,201400.0,5836.0,3.477900e+08,3.361900e+08,1.160000e+07,
4,2019-01-01,Chai Nat,central,0.7931,27141.0,79073.0,78514.0,559.0,1.017900e+08,1.009000e+08,8.900000e+05,
...,...,...,...,...,...,...,...,...,...,...,...,...
919,2024-06-01,Yasothon,east_northeast,,,,,,,,,6.336960e+07
920,2024-07-01,Yasothon,east_northeast,,,,,,,,,6.180383e+07
921,2024-08-01,Yasothon,east_northeast,,,,,,,,,6.119584e+07
922,2024-09-01,Yasothon,east_northeast,,,,,,,,,5.961885e+07


In [72]:
df.to_csv(DATA_OUTPUT_PATH, index=False)