# Packages

In [None]:
!pip install utilsforecast coreforecast nixtla statsforecast mlforecast neuralforecast

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from nixtla import NixtlaClient

from statsforecast import StatsForecast
from statsforecast.models import Naive, SeasonalNaive, AutoETS, AutoARIMA

from mlforecast import MLForecast
import xgboost as xgb

from neuralforecast import NeuralForecast
from neuralforecast.auto import AutoNBEATS, AutoNHITS

from utilsforecast.evaluation import evaluate
from utilsforecast.losses import rmse, mae, bias
from utilsforecast.plotting import plot_series


# Build the TimeGPT client from a key supplied through the environment so the
# secret never lives in the notebook. A key was previously committed in this
# cell; it must be treated as compromised and revoked.
import os

_nixtla_api_key = os.environ.get("NIXTLA_API_KEY")
if not _nixtla_api_key:
    raise RuntimeError(
        "Set the NIXTLA_API_KEY environment variable before running this notebook."
    )

nixtla_client = NixtlaClient(api_key=_nixtla_api_key)

# Read in the Data

In [None]:

# Load the sample hotel data set from the project's GitHub repository.
df = pd.read_parquet(
    'https://raw.githubusercontent.com/ryaltic/Hotel-Occupancy-Forecast/main/sample_hotels.parquet'
)

# Every column from position 36 onwards is set aside and re-attached below.
# NOTE(review): presumably these are the on-the-books ("otb") features —
# confirm against the parquet schema, since the cut-off is positional.
otb_df = df.iloc[:, 36:]

# Keep the identifiers, calendar/hotel descriptors and the target, then put
# the positional block back next to them.
core_columns = [
    'unique_id', 'ds', 'holiday_flag', 'target_day', 'target_month',
    'target_year', 'location_type', 'hotel_type', 'y',
]
df = pd.concat([df[core_columns], otb_df], axis=1)

# One-hot encode the categorical descriptors, dropping the first level of each.
df = pd.get_dummies(
    df,
    columns=['holiday_flag', 'target_day', 'target_month', 'location_type', 'hotel_type'],
    drop_first=True,
)

# Quick look at both ends of the frame plus its dtypes / non-null counts.
display(df.head())
display(df.tail())
df.info()

In [None]:
# Small multiples: one line chart of y over ds per unique_id, five facets per
# row, each facet with its own x and y axis.
fig = sns.relplot(
    data=df,
    x='ds',
    y='y',
    hue='unique_id',
    col='unique_id',
    col_wrap=5,
    kind='line',
    palette='Paired',
    legend=False,
    height=4,
    aspect=2,
    facet_kws={'sharey': False, 'sharex': False},
)

Hotel 77 and hotel 28 are removed because their data is incomplete.

In [None]:
# Keep every series except hotel_77 and hotel_28.
df = df.query(
    "unique_id != 'hotel_77' and unique_id != 'hotel_28'"
)


Updated plot without hotel 77 and hotel 28.

In [None]:
# Redraw the per-hotel line charts from the current contents of df.
relplot_options = dict(
    x='ds',
    y='y',
    kind='line',
    hue='unique_id',
    col='unique_id',
    col_wrap=5,
    palette='Paired',
    legend=False,
    height=4,
    aspect=2,
    facet_kws={'sharey': False, 'sharex': False},  # independent axes per facet
)
fig = sns.relplot(data=df, **relplot_options)

In [None]:
# Rows before 2023-06-01 form the training set; the hold-out keeps rows from
# 2023-06-01 up to and including 2023-06-28.
train_df = df.query("ds < '2023-06-01'")
test_df = df.query("ds >= '2023-06-01'").query("ds <= '2023-06-28'")

# Models

## Baseline Models

In [None]:
# Reference forecasters: naive, seasonal naive with a 7-step season, and AutoETS.
baseline_models = [
    Naive(),
    SeasonalNaive(season_length=7),
    AutoETS(),
]

sf_base = StatsForecast(models=baseline_models, freq='D', n_jobs=-1)

# Cross-validate on the pre-June-2023 rows using only the id, timestamp and
# target columns: 5 windows, horizon 28, window origins 28 steps apart.
cross_validation_base = sf_base.cross_validation(
    df=df[['unique_id','ds','y']].query("ds < '2023-06-01'"),
    h=28,
    n_windows=5,
    step_size=28,
)

# Score each baseline with RMSE, MAE and bias, and persist the table.
eval_base = evaluate(
    cross_validation_base,
    metrics=[rmse, mae, bias],
    models=['Naive', 'SeasonalNaive', 'AutoETS'],
)
eval_base.to_csv('eval_base.csv', index=False)

## AutoARIMA with Predictors

In [None]:
# AutoARIMA is given the full frame, i.e. the target plus every other column
# of df, unlike the baselines which only see unique_id / ds / y.
autoarima_pred = [AutoARIMA()]

sf_arima_pred = StatsForecast(models=autoarima_pred, freq="D", n_jobs=-1)

# Same cross-validation layout as the other models: 5 windows of 28 steps,
# stepping 28 at a time, on the rows before 2023-06-01.
cross_validation_arima_pred = sf_arima_pred.cross_validation(
    df=df.query("ds < '2023-06-01'"),
    h=28,
    n_windows=5,
    step_size=28,
)

eval_autoarima_pred = evaluate(
    cross_validation_arima_pred,
    metrics=[bias, rmse, mae],
    models=['AutoARIMA'],
)
eval_autoarima_pred.to_csv('eval_autoarima_pred.csv', index=False)

## XGB model

In [None]:
# Gradient-boosted regressor registered under the name 'xgb'; that key becomes
# the prediction column evaluated below.
xgb_model = {
    'xgb': xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        learning_rate=0.1,
    )
}

mlf = MLForecast(models=xgb_model, freq='D')

# Re-apply the hotel exclusion so this cell also works on an unfiltered df.
df = df.query("unique_id != 'hotel_77' and unique_id != 'hotel_28'")

# 5 windows of 28 steps on the pre-June-2023 rows; static_features=[] declares
# that none of the extra columns is static.
cross_validation_mlf = mlf.cross_validation(
    df=df.query("ds < '2023-06-01'"),
    h=28,
    n_windows=5,
    step_size=28,
    static_features=[],
)
display(cross_validation_mlf)

eval_ml = evaluate(
    df=cross_validation_mlf,
    metrics=[bias, rmse, mae],
    models=['xgb'],
)
display(eval_ml)
eval_ml.to_csv('eval_ml.csv', index=False)

## TimeGPT

In [None]:
# Cross-validate TimeGPT through the Nixtla client with the same layout as the
# local models: daily frequency, 5 windows, horizon 28, step 28.
timegpt_cv_df = nixtla_client.cross_validation(
    df=df.query("ds < '2023-06-01'"),
    freq='D',
    h=28,
    n_windows=5,
    step_size=28,
)
display(timegpt_cv_df)

# Score the 'TimeGPT' prediction column and save the result.
timegpt_cv_eval = evaluate(
    df=timegpt_cv_df,
    metrics=[bias, mae, rmse],
    models=['TimeGPT'],
)
display(timegpt_cv_eval)
timegpt_cv_eval.to_csv('timegpt_cv_eval.csv', index=False)

## Neural Models

In [None]:
# Two auto-tuned neural forecasters, both configured for a 28-step horizon.
Neural_models = [
    AutoNBEATS(h=28),
    AutoNHITS(h=28),
]

nf = NeuralForecast(models=Neural_models, freq='D')

# 5 cross-validation windows spaced 28 steps apart on the pre-June-2023 rows;
# the horizon comes from the models themselves.
cross_validation_nf = nf.cross_validation(
    df=df.query("ds < '2023-06-01'"),
    n_windows=5,
    step_size=28,
)
display(cross_validation_nf)

eval_nf = evaluate(
    df=cross_validation_nf,
    metrics=[bias, rmse, mae],
    models=['AutoNBEATS', 'AutoNHITS'],
)
display(eval_nf)
eval_nf.to_csv('eval_nf.csv', index=False)

In [None]:
# Combine every evaluation table into a single frame with one row per
# (unique_id, metric). The tables are stacked row-wise, so each row initially
# carries values only for its own models; groupby(...).first() then takes the
# first non-null value per column within each (unique_id, metric) group.
evaluation_tables = [eval_base, eval_autoarima_pred, eval_ml, timegpt_cv_eval, eval_nf]
final_eval = pd.concat(evaluation_tables, axis=0)

grouped_eval = final_eval.groupby(['unique_id', 'metric'], as_index=False)
final_eval_cleaned = grouped_eval.first().reset_index(drop=True)

final_eval_cleaned.to_csv('final_eval.csv', index=False)
display(final_eval_cleaned)


In [None]:
# Note: final_eval_cleaned was also written to 'final_eval.csv' and can be
# reloaded from that file instead of recomputing the evaluations.
display(final_eval_cleaned.query("metric == 'mae'"))

# Long format: one row per (unique_id, metric, model) with its score.
melted = pd.melt(
    final_eval_cleaned,
    id_vars=['unique_id', 'metric'],
    var_name='model',
    value_name='score',
)
melted_mae = melted[melted['metric'] == 'mae']

# For each series, pick the row holding the smallest MAE.
lowest_mae_rows = melted_mae.groupby('unique_id')['score'].idxmin()
best_models = melted_mae.loc[lowest_mae_rows].rename(columns={'model': 'best_model'})
display(best_models)

# Count how many series each model wins, most frequent winner first.
wins_per_model = best_models.groupby('best_model').agg(count=('best_model', 'count'))
best_overall_model = wins_per_model.reset_index().sort_values(by='count', ascending=False)
display(best_overall_model)

# Forecast

In [None]:
# Forecast the 28 hold-out steps with TimeGPT: history is every row before
# 2023-06-01, and the hold-out rows (minus the target) are supplied as the
# future values of the exogenous columns.
# NOTE(review): finetune_loss is set but no fine-tuning steps are requested
# here — confirm whether the loss has any effect in that case.
future_exog = test_df.drop(columns=['y'])

forecast_timegpt = nixtla_client.forecast(
    df=df.query("ds < '2023-06-01'"),
    X_df=future_exog,
    h=28,
    freq='D',
    finetune_loss='mae',
)
display(forecast_timegpt)

In [None]:
# Line the hold-out actuals up with the TimeGPT forecast and score every row.
#
# The ratio |y - forecast| / |y| is an absolute *percentage* error, so it is
# stored as `ape` and summarised as MAPE rather than being labelled "mae".
# The plain absolute error is kept in `abs_error` so a true MAE (in the units
# of y) can be reported next to it.
#
# Rows whose actual is 0 have no defined percentage error: their `ape` is NaN,
# which the mean skips, instead of an inf that would swamp the average.
holdout_with_actual = (
    test_df[['unique_id', 'ds', 'y']]
    .merge(forecast_timegpt, on=['unique_id', 'ds'], how='left')
    .assign(
        abs_error=lambda x: (x['y'] - x['TimeGPT']).abs(),
        ape=lambda x: x['abs_error'] / x['y'].abs().where(x['y'] != 0),
    )
)

display(holdout_with_actual)

print(f"MAE: {holdout_with_actual['abs_error'].mean():.2f}")
print(f"MAPE: {holdout_with_actual['ape'].mean():.2%}")

# Plot of Actuals vs Forecast

In [None]:
# Plot the actuals from 2023-01-01 onwards together with the TimeGPT forecast,
# for at most 17 series.
actuals_since_2023 = df.query("ds >= '2023-01-01'")
nixtla_client.plot(
    df=actuals_since_2023,
    forecasts_df=forecast_timegpt,
    max_ids=17,
)

## Forecasting Best Model

In [None]:
# Keep the TimeGPT forecast only for the hotels assigned to TimeGPT and tag
# those rows with the model name.
# NOTE(review): the hotel list is hard-coded; presumably it mirrors the
# `best_models` table — re-check it whenever the evaluation is re-run.
timegpt_selection = forecast_timegpt.query("unique_id == 'hotel_112' or unique_id == 'hotel_21' or unique_id == 'hotel_35' or unique_id == 'hotel_7' or unique_id == 'hotel_98' or unique_id == 'hotel_84'")
best_models_timegpt = timegpt_selection.assign(best_model='TimeGPT')

best_models_timegpt

In [None]:
# Refit the XGBoost forecaster on the rows before 2023-06-01 (no static
# features) and predict 28 steps, feeding the hold-out rows without the target
# as the future exogenous values.
mlf.fit(
    df=df.query("ds < '2023-06-01'"),
    static_features=[],
)
forecast_ml = mlf.predict(
    h=28,
    X_df=test_df.drop(columns=['y']),
)
display(forecast_ml)

# Keep only the hotels assigned to XGBoost and tag the rows.
# NOTE(review): hard-coded hotel list — presumably taken from `best_models`.
xgb_selection = forecast_ml.query("unique_id == 'hotel_0' or unique_id == 'hotel_133' or unique_id == 'hotel_42' or unique_id == 'hotel_91' or unique_id == 'hotel_63'")
best_models_ml = xgb_selection.assign(best_model='XGBoost')

best_models_ml


In [None]:
# Refit AutoARIMA on the rows before 2023-06-01 and forecast 28 steps, passing
# the hold-out rows (target removed) as future exogenous values.
sf_arima_pred.fit(df=df.query("ds < '2023-06-01'"))
forecast_arima = sf_arima_pred.predict(
    h=28,
    X_df=test_df.drop(columns=['y']),
)
display(forecast_arima)
forecast_arima.to_csv('forecast_arima.csv', index=False)

# Keep only the hotels assigned to AutoARIMA and tag the rows.
# NOTE(review): hard-coded hotel list — presumably taken from `best_models`.
arima_selection = forecast_arima.query("unique_id == 'hotel_105' or unique_id == 'hotel_14' or unique_id == 'hotel_70'")
best_models_arima = arima_selection.assign(best_model='AutoARIMA')

best_models_arima

In [None]:
# Refit the baseline models on the rows before 2023-06-01 and forecast 28 steps.
# The fit uses only unique_id / ds / y, the same three-column frame these
# models were cross-validated on, so the forecast comes from the configuration
# that was actually evaluated rather than from the full feature frame.
sf_base.fit(df=df[['unique_id', 'ds', 'y']].query("ds < '2023-06-01'"))
forecast_baseline = sf_base.predict(h=28)

# AutoETS forecast, kept for the hotel assigned to AutoETS.
# NOTE(review): hard-coded hotel id — presumably taken from `best_models`.
forecast_ETS = forecast_baseline[['unique_id', 'ds', 'AutoETS']]
best_models_ETS = (
    forecast_ETS
    .query("unique_id == 'hotel_126'")
    .assign(best_model='AutoETS')
)

# SeasonalNaive forecast, kept for the hotel assigned to SeasonalNaive.
# NOTE(review): hard-coded hotel id — presumably taken from `best_models`.
forecast_seasonalNaive = forecast_baseline[['unique_id', 'ds', 'SeasonalNaive']]
best_models_seasonalNaive = (
    forecast_seasonalNaive
    .query("unique_id == 'hotel_56'")
    .assign(best_model='SeasonalNaive')
)

best_models_seasonalNaive

In [None]:
# Refit the neural models on hotel_49's rows before 2023-06-01 only, then
# forecast with the models' own horizon.
hotel_49_history = df.query("ds < '2023-06-01' and unique_id == 'hotel_49'")
nf.fit(df=hotel_49_history)
forecast_nf = nf.predict()

display(forecast_nf)
forecast_nf.to_csv('forecast_nf.csv', index=False)

# Only the AutoNBEATS column is carried forward, tagged with the model name.
nbeats_forecast = forecast_nf[['unique_id', 'ds', 'AutoNBEATS']]
best_models_nf = nbeats_forecast.assign(best_model='AutoNBEATS')

best_models_nf

In [None]:
# Stack the per-model selections into one frame; each row only has a value in
# the prediction column of the model that produced it.
selected_forecasts = [
    best_models_timegpt,
    best_models_ml,
    best_models_arima,
    best_models_ETS,
    best_models_seasonalNaive,
    best_models_nf,
]
concat_df = pd.concat(selected_forecasts, axis=0)

display(concat_df)
concat_df.to_csv('final_best_models_rating.csv', index=False)

In [None]:
# Plot the actuals after 2023-01-01 against the selected forecasts, drawing the
# prediction columns listed below for at most 17 series.
forecast_columns = ['TimeGPT', 'xgb', 'AutoARIMA', 'AutoETS', 'SeasonalNaive', 'AutoNBEATS']
plot_series(
    df=df.query("ds > '2023-01-01'"),
    forecasts_df=concat_df,
    models=forecast_columns,
    max_ids=17,
    palette='Paired',
)