Final Code for Monthly forecast

In [None]:
import itertools
import numpy as np
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import pickle
warnings.filterwarnings("ignore")

# --- Data preparation for the overall monthly model ---
# Read the raw extract; dates in the CSV are parsed day-first.
data = pd.read_csv('Data_final.csv', parse_dates=['Date'], dayfirst=True)
# These dimension columns are not used by the overall model.
cols = ['Account Region', 'Country', 'Market']
data = data.drop(columns=cols)

# Number of distinct calendar days present in each (year, month).
days_per_month = data.groupby([data['Date'].dt.year, data['Date'].dt.month])['Date'].nunique()
# A trailing month with fewer than 28 distinct days is treated as incomplete
# and excluded so it does not show up as an artificially low monthly total.
if days_per_month.iloc[-1] < 28:
    days_per_month = days_per_month.iloc[:-1]
kept_months = [f'{year}-{month:02d}' for year, month in days_per_month.index]
mask = data['Date'].dt.strftime('%Y-%m').isin(kept_months)
data = data.loc[mask].sort_values('Date')
data = data.dropna().rename(columns={'Sum of Count Of SR': 'Count Of SR','Account Region' : 'Region'})

# Drop rows whose count lies more than 3 standard deviations from the mean.
counts = data['Count Of SR']
data['z_score'] = ((counts - counts.mean()) / counts.std()).abs()
data = data[data['z_score'] <= 3]

# Aggregate to one total per month, labelled by the month start ('MS').
data = data.set_index('Date')
y = data['Count Of SR'].resample('MS').sum()
print(y.describe())
# Candidate non-seasonal orders: every (p, d, q) with each term in {0, 1}
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
# Seasonal candidates reuse the same grid with a seasonal period of 12 steps
# (y is resampled monthly, so this is a yearly season)
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in pdq]

# Best (lowest) AIC seen so far and the orders that produced it
best_aic = np.inf
best_param = None
best_param_seasonal = None

# Exhaustive search over every order / seasonal-order pair
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(y,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit()
            candidate_aic = results.aic
        except Exception:
            # Some combinations cannot be fitted to this series; skip them.
            # Catching Exception (not a bare `except:`) keeps Ctrl-C and
            # SystemExit able to stop the search.
            continue

        if candidate_aic < best_aic:
            best_aic = candidate_aic
            best_param = param
            best_param_seasonal = param_seasonal

# Fail with a clear message instead of passing order=None to SARIMAX below
if best_param is None:
    raise RuntimeError('SARIMAX grid search failed: no candidate model could be fitted')

# Print the best parameters and corresponding AIC value
print("Best param: ", best_param)
print("Best seasonal param: ", best_param_seasonal)
print("Best AIC: ", best_aic)

# Refit the winning specification; `results` is what the later cells use
mod = sm.tsa.statespace.SARIMAX(y,
                                order=best_param,
                                seasonal_order=best_param_seasonal,
                                enforce_stationarity=False,
                                enforce_invertibility=False)

results = mod.fit()

# Persist the fitted results object to disk
with open('model_overall.pkl', 'wb') as model_file:
    pickle.dump(results, model_file)

# Read it straight back so everything below runs on the reloaded object
with open('model_overall.pkl', 'rb') as model_file:
    results = pickle.load(model_file)

# Show only the second table of the model summary
summary_tables = results.summary().tables
print(summary_tables[1])

# Diagnostic plots produced by the fitted results object
results.plot_diagnostics(figsize=(16, 8))
plt.show()

# One-step-ahead predictions starting 80% of the way through the series.
# NOTE(review): the model above was fitted on all of y, so this looks like an
# in-sample fit check rather than a true hold-out evaluation - confirm intent.
eval_start = pd.to_datetime(y.index[len(y.index)*80//100])
pred = results.get_prediction(start=eval_start, dynamic=False)
print(pred)
pred_ci = pred.conf_int()
lower_bound = pred_ci.iloc[:, 0]
upper_bound = pred_ci.iloc[:, 1]

# Observed series (from the year of the second observation onward) with the
# predictions drawn on the same axes
ax = y[str(y.index[1].year):].plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))

# Predicted mean together with its confidence bounds, as a table
pred_df = pd.DataFrame({'predicted_mean': pred.predicted_mean,
                        'lower_bound': lower_bound,
                        'upper_bound': upper_bound})
print(pred_df)

# Shade the confidence interval
ax.fill_between(pred_ci.index, lower_bound, upper_bound, color='k', alpha=.2)

ax.set_xlabel('Date')
ax.set_ylabel('Count Of SR')
plt.legend()
plt.show()

# Root mean squared error of the predictions against the observed values
y_forecasted = pred.predicted_mean
y_truth = y[eval_start:]
squared_errors = (y_forecasted - y_truth) ** 2
rmse = np.sqrt(squared_errors.mean())
print(f'The Root Mean Squared Error of our forecasts is {round(rmse, 2)}')

In [None]:
# Out-of-sample forecasts for 3, 6, 9 and 12 months, one table and plot each
months_list = [3, 6, 9, 12]
for months in months_list:
    pred_uc = results.get_forecast(steps=months)
    pred_ci = pred_uc.conf_int()

    # Forecast mean with its confidence bounds
    pred_df = pd.DataFrame({'predicted_mean': pred_uc.predicted_mean,
                        'lower_bound': pred_ci.iloc[:, 0],
                        'upper_bound': pred_ci.iloc[:, 1]})
    print(pred_df)

    ax = y.plot(label='observed', figsize=(14, 7))
    pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
    ax.fill_between(pred_ci.index,
                    pred_ci.iloc[:, 0],
                    pred_ci.iloc[:, 1], color='k', alpha=.25)
    # Dashed guide at each forecast month, taken from the forecast's own index
    # so the guides line up with the forecast points. Only the first guide
    # carries the legend label. The previous label-only axvline() call had no
    # x value, so it drew at x=0 and stretched the time axis.
    for step, date in enumerate(pred_ci.index):
        ax.axvline(date, linestyle='--', color='k', alpha=0.2,
                   label=f'{months}-Month Forecast' if step == 0 else '_nolegend_')
    ax.set_xlabel('Date')
    ax.set_ylabel('Count Of SR')
    plt.legend()
    plt.show()

# Final Code for Forecast model building, both Country-wise & Market-wise

In [None]:
import itertools
import numpy as np
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Dimensions the user can choose to forecast by
Target_cols = ['Country', 'Market']

# Load the raw extract; dates in the CSV are parsed day-first
data = pd.read_csv('Data_final.csv', parse_dates=['Date'], dayfirst=True)
cols = ['Account Region']
data = data.drop(columns=cols)

# Count distinct calendar days per (year, month) and drop a trailing month
# with fewer than 28 of them, so an incomplete month is not modelled
days_per_month = data.groupby([data['Date'].dt.year, data['Date'].dt.month])['Date'].nunique()
if days_per_month.iloc[-1] < 28:
    days_per_month = days_per_month.iloc[:-1]
mask = data['Date'].dt.strftime('%Y-%m').isin(days_per_month.index.map(lambda x: f'{x[0]}-{x[1]:02d}'))
data = data.loc[mask].sort_values('Date')
data = data.dropna().rename(columns={'Sum of Count Of SR': 'Count Of SR'})

# Remove rows whose count is more than 3 standard deviations from the mean.
# .copy() gives an independent frame so the column assignments below are
# unambiguous.
data['z_score'] = np.abs((data['Count Of SR'] - data['Count Of SR'].mean()) / data['Count Of SR'].std())
data = data[data['z_score'] <= 3].copy()

# Merge countries into two groups: Mexico is counted as Canada, and the listed
# countries/territories are counted as United States. The result is assigned
# back to the column: calling replace(inplace=True) on data['Country'] is a
# chained in-place update, which pandas does not guarantee to write through to
# `data` (and never does with Copy-on-Write enabled).
data['Country'] = data['Country'].replace(['Canada', 'Mexico'], 'Canada')
data['Country'] = data['Country'].replace(
    ['United States', 'Russia', 'Aruba', 'Bermuda', 'British Virgin Islands', 'Guyana',
     'Cayman Islands', 'Haiti', 'Puerto Rico', 'The Bahamas', 'Turks and Caicos Islands'],
    'United States')

# Index by date so the series can be resampled by month
data = data.set_index('Date')

# Step 1: the user picks which dimension to forecast on, by its printed index
print('Choose a target column number:')
for idx, name in enumerate(Target_cols):
    print(f'{idx}: {name}')
target_col_num = int(input())
target_col = Target_cols[target_col_num]

# Step 2: the user picks one of the distinct values found in that column
target_values = data[target_col].unique()
print('Choose a target value number:')
for idx, candidate in enumerate(target_values):
    print(f'{idx}: {candidate}')
target_val_num = int(input())
target_value = target_values[target_val_num]
print('~'*50)

# Fit and report a SARIMAX model for the selected target value. Modelling is
# only attempted when more than 100 rows are available for it.
target_data = data[data[target_col] == target_value]
if target_data.shape[0] > 100:
    print(f'For {target_col}:', target_value, ', Below are model details & forcast')
    y = target_data['Count Of SR'].resample('MS').sum()
    print('Count Of SR details:\n', y.describe())

    # Candidate non-seasonal orders: every (p, d, q) with each term in {0, 1}
    p = d = q = range(0, 2)
    pdq = list(itertools.product(p, d, q))
    # Seasonal candidates reuse the same grid with a 12-step (yearly) season
    seasonal_pdq = [(x[0], x[1], x[2], 12) for x in pdq]

    # Best (lowest) AIC seen so far and the orders that produced it
    best_aic = np.inf
    best_param = None
    best_param_seasonal = None

    # Exhaustive search over every order / seasonal-order pair
    for param in pdq:
        for param_seasonal in seasonal_pdq:
            try:
                mod = sm.tsa.statespace.SARIMAX(y,
                                                order=param,
                                                seasonal_order=param_seasonal,
                                                enforce_stationarity=False,
                                                enforce_invertibility=False)
                results = mod.fit()
                candidate_aic = results.aic
            except Exception:
                # Skip combinations that cannot be fitted. Catching Exception
                # (not a bare `except:`) keeps Ctrl-C able to stop the search.
                continue

            if candidate_aic < best_aic:
                best_aic = candidate_aic
                best_param = param
                best_param_seasonal = param_seasonal

    # Fail with a clear message instead of passing order=None to SARIMAX below
    if best_param is None:
        raise RuntimeError('SARIMAX grid search failed: no candidate model could be fitted')

    # Print the best parameters and corresponding AIC value
    print("Best param: ", best_param)
    print("Best seasonal param: ", best_param_seasonal)
    print("Best AIC: ", best_aic)

    # Refit the winning specification
    mod = sm.tsa.statespace.SARIMAX(y,
                                    order=best_param,
                                    seasonal_order=best_param_seasonal,
                                    enforce_stationarity=False,
                                    enforce_invertibility=False)

    results = mod.fit()

    print(results.summary().tables[1])

    results.plot_diagnostics(figsize=(16, 8))
    plt.show()

    # One-step-ahead predictions starting 80% of the way through the series
    pred = results.get_prediction(start=pd.to_datetime(y.index[len(y.index)*80//100]), dynamic=False)
    print(pred)
    pred_ci = pred.conf_int()

    ax = y[str(y.index[1].year):].plot(label='observed')
    pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))

    ax.fill_between(pred_ci.index,
                    pred_ci.iloc[:, 0],
                    pred_ci.iloc[:, 1], color='k', alpha=.5)

    ax.set_xlabel('Date')
    ax.set_ylabel('Count Of SR')
    plt.legend()
    plt.show()

    y_forecasted = pred.predicted_mean
    y_truth = y[pd.to_datetime(y.index[len(y.index)*80//100]):]

    # Compute the mean square error and its root
    mse = ((y_forecasted - y_truth) ** 2).mean()
    print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))

    print('The Root Mean Squared Error of our forecasts is {}'.format(round(np.sqrt(mse), 2)))
    print('-'*50)

    # Out-of-sample forecasts for 3, 6, 9 and 12 months
    months_list = [3, 6, 9, 12]
    for months in months_list:
        print('*'*50)
        print(f'Prediction for {months} months')
        pred_uc = results.get_forecast(steps=months)
        pred_ci = pred_uc.conf_int()

        pred_df = pd.DataFrame({'predicted_mean': pred_uc.predicted_mean,
                            'lower_bound': pred_ci.iloc[:, 0],
                            'upper_bound': pred_ci.iloc[:, 1]})
        print(pred_df)

        ax = y.plot(label='observed', figsize=(14, 7))
        pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
        ax.fill_between(pred_ci.index,
                        pred_ci.iloc[:, 0],
                        pred_ci.iloc[:, 1], color='k', alpha=.5)
        # Dashed guide at each forecast month, taken from the forecast's own
        # index. Only the first guide carries the legend label. The previous
        # label-only axvline() call had no x value, so it drew at x=0 and
        # stretched the time axis.
        for step, date in enumerate(pred_ci.index):
            ax.axvline(date, linestyle='--', color='k', alpha=0.2,
                       label=f'{months}-Month Forecast' if step == 0 else '_nolegend_')
        ax.set_xlabel('Date')
        ax.set_ylabel('Count Of SR')
        plt.legend()
        plt.show()

else:
    print(f'Not enough data for {target_col}:', target_value)

In [None]:
import itertools
import numpy as np
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Dimensions available for forecasting
Target_cols = ['Country', 'Market']

# Load the raw extract; dates in the CSV are parsed day-first
data = pd.read_csv('Data_final.csv', parse_dates=['Date'], dayfirst=True)
cols = ['Account Region']
data = data.drop(columns=cols)

# Count distinct calendar days per (year, month) and drop a trailing month
# with fewer than 28 of them, so an incomplete month is not modelled
days_per_month = data.groupby([data['Date'].dt.year, data['Date'].dt.month])['Date'].nunique()
if days_per_month.iloc[-1] < 28:
    days_per_month = days_per_month.iloc[:-1]
mask = data['Date'].dt.strftime('%Y-%m').isin(days_per_month.index.map(lambda x: f'{x[0]}-{x[1]:02d}'))
data = data.loc[mask].sort_values('Date')
data = data.dropna().rename(columns={'Sum of Count Of SR': 'Count Of SR'})

# Remove rows whose count is more than 3 standard deviations from the mean.
# .copy() gives an independent frame so the column assignments below are
# unambiguous.
data['z_score'] = np.abs((data['Count Of SR'] - data['Count Of SR'].mean()) / data['Count Of SR'].std())
data = data[data['z_score'] <= 3].copy()

# Merge countries into two groups: Mexico is counted as Canada, and the listed
# countries/territories are counted as United States. The result is assigned
# back to the column: calling replace(inplace=True) on data['Country'] is a
# chained in-place update, which pandas does not guarantee to write through to
# `data` (and never does with Copy-on-Write enabled).
data['Country'] = data['Country'].replace(['Canada', 'Mexico'], 'Canada')
data['Country'] = data['Country'].replace(
    ['United States', 'Russia', 'Aruba', 'Bermuda', 'British Virgin Islands', 'Guyana',
     'Cayman Islands', 'Haiti', 'Puerto Rico', 'The Bahamas', 'Turks and Caicos Islands'],
    'United States')

# Index by date so the series can be resampled by month
data = data.set_index('Date')

# Ask user to choose a country (menu numbers are 1-based)
countries = data['Country'].unique()
print("Choose a country from the following options:")
for number, country_name in enumerate(countries, start=1):
    print(f"{number}. {country_name}")
country_choice = int(input("Enter a number: "))
country = countries[country_choice-1]

# Filter data for the chosen country
data = data[data['Country'] == country]

# Build the market menu from the country-filtered data so that only markets
# that actually occur for the chosen country are offered. Listing the markets
# of the whole data set let the user pick a combination with no rows.
markets = data['Market'].unique()
print(f"\nChoose a market for {country} from the following options:")
for number, market_name in enumerate(markets, start=1):
    print(f"{number}. {market_name}")
market_choice = int(input("Enter a number: "))
market = markets[market_choice-1]

# Filter data for the chosen market
target_data = data[data['Market'] == market]

# Fit and report a SARIMAX model for the selected country/market rows.
# Modelling is only attempted when more than 100 rows are available.
if target_data.shape[0] > 100:
    y = target_data['Count Of SR'].resample('MS').sum()
    print('Count Of SR details:\n', y.describe())

    # Candidate non-seasonal orders: every (p, d, q) with each term in {0, 1}
    p = d = q = range(0, 2)
    pdq = list(itertools.product(p, d, q))
    # Seasonal candidates reuse the same grid with a 12-step (yearly) season
    seasonal_pdq = [(x[0], x[1], x[2], 12) for x in pdq]

    # Best (lowest) AIC seen so far and the orders that produced it
    best_aic = np.inf
    best_param = None
    best_param_seasonal = None

    # Exhaustive search over every order / seasonal-order pair
    for param in pdq:
        for param_seasonal in seasonal_pdq:
            try:
                mod = sm.tsa.statespace.SARIMAX(y,
                                                order=param,
                                                seasonal_order=param_seasonal,
                                                enforce_stationarity=False,
                                                enforce_invertibility=False)
                results = mod.fit()
                candidate_aic = results.aic
            except Exception:
                # Skip combinations that cannot be fitted. Catching Exception
                # (not a bare `except:`) keeps Ctrl-C able to stop the search.
                continue

            if candidate_aic < best_aic:
                best_aic = candidate_aic
                best_param = param
                best_param_seasonal = param_seasonal

    # Fail with a clear message instead of passing order=None to SARIMAX below
    if best_param is None:
        raise RuntimeError('SARIMAX grid search failed: no candidate model could be fitted')

    # Print the best parameters and corresponding AIC value
    print("Best param: ", best_param)
    print("Best seasonal param: ", best_param_seasonal)
    print("Best AIC: ", best_aic)

    # Refit the winning specification
    mod = sm.tsa.statespace.SARIMAX(y,
                                    order=best_param,
                                    seasonal_order=best_param_seasonal,
                                    enforce_stationarity=False,
                                    enforce_invertibility=False)

    results = mod.fit()

    print(results.summary().tables[1])

    results.plot_diagnostics(figsize=(16, 8))
    plt.show()

    # One-step-ahead predictions starting 80% of the way through the series
    pred = results.get_prediction(start=pd.to_datetime(y.index[len(y.index)*80//100]), dynamic=False)
    print(pred)
    pred_ci = pred.conf_int()

    ax = y[str(y.index[1].year):].plot(label='observed')
    pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))

    ax.fill_between(pred_ci.index,
                    pred_ci.iloc[:, 0],
                    pred_ci.iloc[:, 1], color='k', alpha=.5)

    ax.set_xlabel('Date')
    ax.set_ylabel('Count Of SR')
    plt.legend()
    plt.show()

    y_forecasted = pred.predicted_mean
    y_truth = y[pd.to_datetime(y.index[len(y.index)*80//100]):]

    # Compute the mean square error and its root
    mse = ((y_forecasted - y_truth) ** 2).mean()
    print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))

    print('The Root Mean Squared Error of our forecasts is {}'.format(round(np.sqrt(mse), 2)))
    print('-'*50)

    # Out-of-sample forecasts for 3, 6, 9 and 12 months
    months_list = [3, 6, 9, 12]
    for months in months_list:
        print('*'*50)
        print(f'Prediction for {months} months')
        pred_uc = results.get_forecast(steps=months)
        pred_ci = pred_uc.conf_int()

        pred_df = pd.DataFrame({'predicted_mean': pred_uc.predicted_mean,
                            'lower_bound': pred_ci.iloc[:, 0],
                            'upper_bound': pred_ci.iloc[:, 1]})
        print(pred_df)

        ax = y.plot(label='observed', figsize=(14, 7))
        pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
        ax.fill_between(pred_ci.index,
                        pred_ci.iloc[:, 0],
                        pred_ci.iloc[:, 1], color='k', alpha=.5)
        # Dashed guide at each forecast month, taken from the forecast's own
        # index. Only the first guide carries the legend label. The previous
        # label-only axvline() call had no x value, so it drew at x=0 and
        # stretched the time axis.
        for step, date in enumerate(pred_ci.index):
            ax.axvline(date, linestyle='--', color='k', alpha=0.2,
                       label=f'{months}-Month Forecast' if step == 0 else '_nolegend_')
        ax.set_xlabel('Date')
        ax.set_ylabel('Count Of SR')
        plt.legend()
        plt.show()

else:
    print(f'Not enough data for given inputs')

In [None]:
import itertools
import numpy as np
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import os
import pickle
warnings.filterwarnings("ignore")

# Directory (under the current working directory) where fitted models are saved
model_dir = os.path.join(os.getcwd(),'models')
os.makedirs(model_dir, exist_ok=True)

# Dimensions available for forecasting
Target_cols = ['Country', 'Market']

# Load the raw extract; dates in the CSV are parsed day-first
data = pd.read_csv('Data_final.csv', parse_dates=['Date'], dayfirst=True)
cols = ['Account Region']
data = data.drop(columns=cols)

# Count distinct calendar days per (year, month) and drop a trailing month
# with fewer than 28 of them, so an incomplete month is not modelled
days_per_month = data.groupby([data['Date'].dt.year, data['Date'].dt.month])['Date'].nunique()
if days_per_month.iloc[-1] < 28:
    days_per_month = days_per_month.iloc[:-1]
mask = data['Date'].dt.strftime('%Y-%m').isin(days_per_month.index.map(lambda x: f'{x[0]}-{x[1]:02d}'))
data = data.loc[mask].sort_values('Date')
data = data.dropna().rename(columns={'Sum of Count Of SR': 'Count Of SR'})

# Remove rows whose count is more than 3 standard deviations from the mean.
# .copy() gives an independent frame so the column assignments below are
# unambiguous.
data['z_score'] = np.abs((data['Count Of SR'] - data['Count Of SR'].mean()) / data['Count Of SR'].std())
data = data[data['z_score'] <= 3].copy()

# Merge countries into two groups: Mexico is counted as Canada, and the listed
# countries/territories are counted as United States. The result is assigned
# back to the column: calling replace(inplace=True) on data['Country'] is a
# chained in-place update, which pandas does not guarantee to write through to
# `data` (and never does with Copy-on-Write enabled).
data['Country'] = data['Country'].replace(['Canada', 'Mexico'], 'Canada')
data['Country'] = data['Country'].replace(
    ['United States', 'Russia', 'Aruba', 'Bermuda', 'British Virgin Islands', 'Guyana',
     'Cayman Islands', 'Haiti', 'Puerto Rico', 'The Bahamas', 'Turks and Caicos Islands'],
    'United States')

# Index by date so the series can be resampled by month
data = data.set_index('Date')

# Train, save and report one SARIMAX model per (country, market) pair that has
# more than 100 rows of data.
countries = data['Country'].unique()
markets = data['Market'].unique()

for country in countries:
    for market in markets:
        # Filter data for the current country and market
        filtered_data = data[(data['Country'] == country) & (data['Market'] == market)]

        if filtered_data.shape[0] <= 100:
            print('Not enough data for given inputs')
            continue

        try:
            print(f'\nFor Country: {country}, Market: {market}, Below are model details & forecast')
            y = filtered_data['Count Of SR'].resample('MS').sum()
            print('Count Of SR details:\n', y.describe())

            # Candidate non-seasonal orders: every (p, d, q) with each term in {0, 1}
            p = d = q = range(0, 2)
            pdq = list(itertools.product(p, d, q))
            # Seasonal candidates reuse the same grid with a 12-step (yearly) season
            seasonal_pdq = [(x[0], x[1], x[2], 12) for x in pdq]

            # Best (lowest) AIC seen so far and the orders that produced it
            best_aic = np.inf
            best_param = None
            best_param_seasonal = None

            # Exhaustive search over every order / seasonal-order pair
            for param in pdq:
                for param_seasonal in seasonal_pdq:
                    try:
                        mod = sm.tsa.statespace.SARIMAX(y,
                                                        order=param,
                                                        seasonal_order=param_seasonal,
                                                        enforce_stationarity=False,
                                                        enforce_invertibility=False)
                        results = mod.fit()
                        candidate_aic = results.aic
                    except Exception:
                        # Skip combinations that cannot be fitted. Catching
                        # Exception (not a bare `except:`) keeps Ctrl-C able
                        # to stop the whole run.
                        continue

                    if candidate_aic < best_aic:
                        best_aic = candidate_aic
                        best_param = param
                        best_param_seasonal = param_seasonal

            # Reported by the outer handler; avoids passing order=None below
            if best_param is None:
                raise RuntimeError('SARIMAX grid search failed: no candidate model could be fitted')

            # Print the best parameters and corresponding AIC value
            print("Best param: ", best_param)
            print("Best seasonal param: ", best_param_seasonal)
            print("Best AIC: ", best_aic)

            # Refit the winning specification
            mod = sm.tsa.statespace.SARIMAX(y,
                                            order=best_param,
                                            seasonal_order=best_param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)

            results = mod.fit()

            # Models are stored as "<country>_<market>.pickle" in model_dir
            file_path = os.path.join(model_dir, country + "_" + market + ".pickle")
            # Save the model
            with open(file_path, 'wb') as f:
                pickle.dump(results, f)

            # Load the model back so the report below uses the saved object
            with open(file_path, 'rb') as f:
                results = pickle.load(f)

            print(results.summary().tables[1])

            results.plot_diagnostics(figsize=(16, 8))
            plt.show()

            # One-step-ahead predictions starting 80% of the way through the series
            pred = results.get_prediction(start=pd.to_datetime(y.index[len(y.index)*80//100]), dynamic=False)
            print(pred)
            pred_ci = pred.conf_int()

            ax = y[str(y.index[1].year):].plot(label='observed')
            pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))

            ax.fill_between(pred_ci.index,
                            pred_ci.iloc[:, 0],
                            pred_ci.iloc[:, 1], color='k', alpha=.5)

            ax.set_xlabel('Date')
            ax.set_ylabel('Count Of SR')
            plt.legend()
            plt.show()

            y_forecasted = pred.predicted_mean
            y_truth = y[pd.to_datetime(y.index[len(y.index)*80//100]):]

            # Compute the mean square error and its root
            mse = ((y_forecasted - y_truth) ** 2).mean()
            print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))

            print('The Root Mean Squared Error of our forecasts is {}'.format(round(np.sqrt(mse), 2)))
            print('-'*50)

            # Out-of-sample forecasts for 3, 6, 9 and 12 months
            months_list = [3, 6, 9, 12]
            for months in months_list:
                print('*'*50)
                print(f'Prediction for {months} months')
                pred_uc = results.get_forecast(steps=months)
                pred_ci = pred_uc.conf_int()

                pred_df = pd.DataFrame({'predicted_mean': pred_uc.predicted_mean,
                                    'lower_bound': pred_ci.iloc[:, 0],
                                    'upper_bound': pred_ci.iloc[:, 1]})
                print(pred_df)

                ax = y.plot(label='observed', figsize=(14, 7))
                pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
                ax.fill_between(pred_ci.index,
                                pred_ci.iloc[:, 0],
                                pred_ci.iloc[:, 1], color='k', alpha=.5)
                # Dashed guide at each forecast month, taken from the
                # forecast's own index. Only the first guide carries the
                # legend label. The previous label-only axvline() call had no
                # x value, so it drew at x=0 and stretched the time axis.
                for step, date in enumerate(pred_ci.index):
                    ax.axvline(date, linestyle='--', color='k', alpha=0.2,
                               label=f'{months}-Month Forecast' if step == 0 else '_nolegend_')
                ax.set_xlabel('Date')
                ax.set_ylabel('Count Of SR')
                plt.legend()
                plt.show()
        except Exception as e:
            # Keep going with the remaining pairs, but say which pair failed
            # and why instead of hiding the error.
            print(f'Exception in model building for Country: {country}, Market: {market}: {e!r}')

# Streamlit App

In [None]:
#!rm -r /content/models

In [None]:
# IPython shell escape (not plain Python): recursively archive /content/models
# into models.zip in the current working directory.
!zip -r models.zip /content/models

In [None]:
# Show the names of the entries in /content/models as the cell output.
# `os` is not imported in this cell; it must already be in the session.
os.listdir('/content/models')

In [None]:
from google.colab import files
# Download the models.zip archive produced by the `zip -r models.zip ...`
# cell; the previous 'filename.ext' was a placeholder that names no file.
files.download('models.zip')

In [None]:
# forecast.py
import streamlit as st
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

# NOTE(review): this option presumably only exists in Streamlit releases that
# still allow st.pyplot() without an explicit figure - confirm against the
# deployed Streamlit version.
st.set_option('deprecation.showPyplotGlobalUse', False)

# Create the Streamlit app: centred page title
st.markdown("<h1 style='text-align: center;'> Forecasting App </h1>", unsafe_allow_html=True)

# Load the raw extract; dates in the CSV are parsed day-first
data = pd.read_csv('Data_final.csv', parse_dates=['Date'], dayfirst=True)
cols = ['Account Region']
data = data.drop(columns=cols)

# Count distinct calendar days per (year, month) and drop a trailing month
# with fewer than 28 of them, so an incomplete month is not plotted
days_per_month = data.groupby([data['Date'].dt.year, data['Date'].dt.month])['Date'].nunique()
if days_per_month.iloc[-1] < 28:
    days_per_month = days_per_month.iloc[:-1]
mask = data['Date'].dt.strftime('%Y-%m').isin(days_per_month.index.map(lambda x: f'{x[0]}-{x[1]:02d}'))
data = data.loc[mask].sort_values('Date')
data = data.dropna().rename(columns={'Sum of Count Of SR': 'Count Of SR'})

# Remove rows whose count is more than 3 standard deviations from the mean.
# .copy() gives an independent frame so the column assignments below are
# unambiguous.
data['z_score'] = np.abs((data['Count Of SR'] - data['Count Of SR'].mean()) / data['Count Of SR'].std())
data = data[data['z_score'] <= 3].copy()

# Merge countries into two groups: Mexico is counted as Canada, and the listed
# countries/territories are counted as United States. The result is assigned
# back to the column: calling replace(inplace=True) on data['Country'] is a
# chained in-place update, which pandas does not guarantee to write through to
# `data` (and never does with Copy-on-Write enabled).
data['Country'] = data['Country'].replace(['Canada', 'Mexico'], 'Canada')
data['Country'] = data['Country'].replace(
    ['United States', 'Russia', 'Aruba', 'Bermuda', 'British Virgin Islands', 'Guyana',
     'Cayman Islands', 'Haiti', 'Puerto Rico', 'The Bahamas', 'Turks and Caicos Islands'],
    'United States')

# Index by date so the series can be resampled by month
data = data.set_index('Date')

# Saved models live in ./models as "<country>_<market>.pickle" files
model_dir = os.path.join(os.getcwd(), 'models')
# Keep only the model pickles; any other entry in the directory (archives,
# hidden files, ...) would otherwise be parsed as a model name and unpickled.
pickle_files = [name for name in os.listdir(model_dir) if name.endswith('.pickle')]

# Get unique countries and markets from the list of file names
def get_country_and_market(filename):
    """Split a model file name into its country and market parts.

    File names have the form ``<country>_<market>`` with an optional
    ``.pickle`` extension. The name is split at the first underscore only, so
    a market that itself contains underscores is returned whole.

    Args:
        filename: Model file name, with or without the ``.pickle`` extension.

    Returns:
        A ``(country, market)`` tuple of strings. ``market`` is an empty
        string when the name contains no underscore.
    """
    extension = '.pickle'
    # Strip only the known model extension so that dots inside a country or
    # market name are left alone.
    stem = filename[:-len(extension)] if filename.endswith(extension) else filename
    country, _, market = stem.partition('_')
    return country, market

# Countries that have at least one saved model. Sorted so the select box shows
# the options in the same order on every run (set iteration order is not
# stable between interpreter runs).
countries = sorted({get_country_and_market(filename)[0] for filename in pickle_files})
country_option = st.selectbox('Select Country', countries)

# Filter the pickle file names based on the selected country
filtered_files = [filename for filename in pickle_files if get_country_and_market(filename)[0] == country_option]

if filtered_files:
    # os.path.splitext removes only the final extension, so a market name that
    # contains a dot is not cut short the way split('.')[0] would cut it.
    markets = sorted({get_country_and_market(os.path.splitext(filename)[0])[1] for filename in filtered_files})
    market_option = st.selectbox('Select Market', markets)
    # Filter the pickle file names further based on the selected market
    filtered_files = [filename for filename in filtered_files
                      if get_country_and_market(os.path.splitext(filename)[0])[1] == market_option]
else:
    # Define the name even when nothing can be selected
    market_option = None
    st.write('No models found for the selected country')

if not filtered_files:
    st.write('No models found for the selected market')
else:
    # Map each remaining file name to its unpickled model object.
    # NOTE: unpickling runs code stored in the file, so the models directory
    # must only contain trusted files.
    models = {}
    for filename in filtered_files:
        model_path = os.path.join(model_dir, filename)
        with open(model_path, 'rb') as model_file:
            models[filename] = pickle.load(model_file)

if filtered_files:
    # Get the model for the selected country and market
    model_name = filtered_files[0] # Assumes only one file is selected
    model = models[model_name]

    # Observed monthly totals for the selected country and market
    y = data.loc[(data['Country'] == country_option) & (data['Market'] == market_option), 'Count Of SR'].resample('MS').sum()

    # Ask for the number of months to forecast
    months = st.selectbox("Select the number of months to forecast:", [3, 6, 9, 12])

    if st.button('Make Forecast'):
        pred_uc = model.get_forecast(steps=months)
        pred_ci = pred_uc.conf_int()
        pred_df = pd.DataFrame({'predicted_mean': pred_uc.predicted_mean,
                            'lower_bound': pred_ci.iloc[:, 0],
                            'upper_bound': pred_ci.iloc[:, 1]})
        # Centre the table cells, then show the forecast table
        st.write(f'<style>div.stTable td{{text-align: center!important}}</style>', unsafe_allow_html=True)
        st.table(pred_df)

        # Draw on an explicit figure and hand that figure to Streamlit,
        # instead of relying on pyplot's implicit global figure.
        fig, ax = plt.subplots(figsize=(14, 7))
        y.plot(ax=ax, label='observed')
        pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
        ax.fill_between(pred_ci.index,
                        pred_ci.iloc[:, 0],
                        pred_ci.iloc[:, 1], color='k', alpha=.5)
        # Dashed guide at each forecast month, taken from the forecast's own
        # index. Only the first guide carries the legend label. The previous
        # label-only axvline() call had no x value, so it drew at x=0 and
        # stretched the time axis.
        for step, date in enumerate(pred_ci.index):
            ax.axvline(date, linestyle='--', color='k', alpha=0.2,
                       label=f'{months}-Month Forecast' if step == 0 else '_nolegend_')
        ax.set_xlabel('Date')
        ax.set_ylabel('Count Of SR')
        ax.legend()

        graph_title = f'<h5><center> Forecast Graph of {months} months for Country : {country_option} & market : {market_option} </center> </h5>'
        st.markdown(graph_title, unsafe_allow_html=True)

        st.pyplot(fig) # Display the plot in the Streamlit app
        # Release the figure so repeated reruns do not accumulate open figures
        plt.close(fig)

# Run the app with the command below:
# streamlit run forecast.py