# ARIMA Model

## Content
* Elements
* Data Preprocessing
* Model Identification
* Model Estimation
* Model Verification
* Model Use

Import required tools

In [None]:
import time
import itertools
import joblib
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Get required config

In [None]:
# Fix the NumPy random seed so repeated runs give the same results
np.random.seed(42)
# Lift the pandas display limits so all columns and rows are shown
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Data Preprocessing

### Load data

In [None]:
# Load data
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere
loading_path = r'/Users/rlg/Desktop/Workspace/data/df_inflation.csv'
# Read the CSV into a DataFrame
df = pd.read_csv(loading_path)
# Preview the first rows
df.head()

In [None]:
# Keep only the date and inflation columns; copy so the assignment below
# modifies an independent frame instead of a slice of the loaded data
df = df[['DS', 'INFLACION']].copy()
# Make sure dates are parsed (format='mixed' infers the format per element)
df['DS'] = pd.to_datetime(df['DS'], format='mixed')
# Rename to the ds / y names used in the rest of the notebook
df = df.rename(columns={'DS': 'ds', 'INFLACION': 'y'})
# Observations from this date onwards are held out for forecast evaluation
forecast_cutoff = pd.Timestamp('2025-01-01')
df_future = df[df['ds'] >= forecast_cutoff]
df = df[df['ds'] < forecast_cutoff]
# Build date-indexed series for modelling and for the held-out period
endog = pd.Series(df['y'].values, index=df['ds'])
endog_future = pd.Series(df_future['y'].values, index=df_future['ds'])
# See data
endog.head()

In [None]:
# Plot the raw series on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=endog, marker='o', ax=ax)
ax.set_title("Time Series Plot")
ax.set_xlabel("Date")
ax.set_ylabel("Value")
ax.grid(True)
fig.tight_layout()
plt.show()

## Model Identification
This section finds the best $(p,d,q)$ parameters of an ARIMA model in order to get the best fit to the data, using the following approach:
* Grid search with in-sample metrics ($MAPE$, $R^2$, $MAE$, $AIC$) over the $p,d,q$ parameters and the Box-Cox transformation.

### Grid Search (In-Sample Metrics)

In [None]:
# Candidate values per hyperparameter.
# Insertion order is AR_p, MA_q, d, Box-Cox.
param_grid = dict([
    ('AR_p', list(range(3))),
    ('MA_q', list(range(3))),
    ('d', list(range(3))),
    ('Box-Cox', [True, False]),
])

In [None]:
# Import required tools
from itertools import product
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error


# Hyperparameter grid search function
def ARIMA_GRID_SEARCH_IN_SAMPLE(endog, param_grid, freq, verbose = 0):
    """
    ARIMA Hyperparameter Grid Search with In-Sample Evaluation

    Fits one SARIMAX(p, d, q) model with a constant trend per combination in
    param_grid and scores the predictions returned by results.predict().

    IN:
    -> endog: endogenous variable at present time (pd.Series)
    -> param_grid: hyperparameter grid to search over. Expected keys are
       'AR_p', 'MA_q', 'd' and 'Box-Cox'; when all four are present the values
       are read by key, otherwise they are read positionally in the order
       (p, q, d, Box-Cox) (dict)
    -> freq: frequency of the time series data (str)
    -> verbose: when truthy, prints progress for every combination (int)

    OUT:
    -> df_metrics: Table with hyperparameter combinations and their corresponding
       in-sample metrics, one row per fitted model (pandas.DataFrame)

    NOTES:
    - Combinations with p == 0 and q == 0 are skipped.
    - Metrics are always computed on the original scale of endog: when Box-Cox
      is applied, predictions are back-transformed and compared with the
      untransformed series.
    - scipy.stats.boxcox expects strictly positive data, so Box-Cox
      combinations fail on series with zero or negative values.
    """
    metric_columns = ['AR_p', 'd', 'MA_q',  'Box-Cox', 'IN_SAMPLE_MAPE', 'IN_SAMPLE_R2', 'IN_SAMPLE_MAE', 'AIC', 'IN_SAMPLE_TIME']

    # Resolve hyperparameters by key when possible so the result does not
    # depend on the insertion order of param_grid
    grid_keys = list(param_grid.keys())
    has_named_keys = {'AR_p', 'MA_q', 'd', 'Box-Cox'}.issubset(grid_keys)

    # Materialize the combinations once (used for the count and the loop)
    combinations = list(product(*param_grid.values()))
    if verbose:
        print('Total Iterations: ', len(combinations))

    # One dict per fitted model; the table is built once at the end
    rows = []

    for iteration, params in enumerate(combinations, start = 1):
        if verbose:
            print('------------------------------')
            print('Iteration: ', iteration)
            print('Parameters: ', params)

        if has_named_keys:
            named = dict(zip(grid_keys, params))
            p, q, d, bc = named['AR_p'], named['MA_q'], named['d'], named['Box-Cox']
        else:
            p, q, d, bc = params

        # Discard not suitable parameter combinations (no AR and no MA terms)
        if p == 0 and q == 0:
            if verbose:
                print('Not suitable parameter combination')
            continue

        ### Prepare data
        endog_train = endog.copy()
        if bc:
            transformed, l = boxcox(endog_train)
            endog_train = pd.Series(transformed, index = endog.index)

        ### Train model (timing covers fit, prediction and back-transformation)
        start_time = time.time()
        model = SARIMAX(
            endog = endog_train,
            order = (p, d, q),
            trend = 'c',
            freq = freq
            )
        results = model.fit()

        ### Get predicted values on the original scale
        predicted = results.predict()
        if bc:
            predicted = inv_boxcox(predicted, l)
        end_time = time.time()

        # Compare against the untransformed series so both sides share a scale
        actual = endog

        ### Calculate metrics
        row = {
            'AR_p': p,
            'd': d,
            'MA_q': q,
            'Box-Cox': bc,
            'IN_SAMPLE_MAPE': mean_absolute_percentage_error(actual, predicted),
            'IN_SAMPLE_R2': r2_score(actual, predicted),
            'IN_SAMPLE_MAE': mean_absolute_error(actual, predicted),
            'AIC': results.aic,
            'IN_SAMPLE_TIME': end_time - start_time
            }
        if verbose:
            print(pd.DataFrame([row], columns = metric_columns).head())
        rows.append(row)

    # Return hyperparameter-metrics data frame
    return pd.DataFrame(rows, columns = metric_columns)

In [None]:
# Run the grid search and rank the combinations by in-sample MAPE (ascending)
df_metrics = ARIMA_GRID_SEARCH_IN_SAMPLE(
    endog, param_grid, freq = 'MS', verbose = 0
).sort_values(by = 'IN_SAMPLE_MAPE')
df_metrics.head()

In [None]:
# Save grid search metrics
# df_metrics.to_csv(r'Observations_TS/arima_grid_search_metrics.csv', index = False)

## Model Estimation
This section trains the model with the best $(p,d,q)$ parameters found in the previous section. 

In [None]:
# Import required tools
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error


# Train function
def ARIMA_TRAIN(endog, best_pdq, bc, freq):
    """
    ARIMA training function

    Fits a SARIMAX(p, d, q) model with a constant trend on the whole series and
    prints its in-sample metrics and summary.

    IN:
    -> endog: endogenous variable at present time (pd.Series)
    -> best_pdq: p,d,q model parameters in that order (list or tuple)
    -> bc: whether to apply Box-Cox transformation (bool)
    -> freq: frequency of the time series data (str)

    PRINTS:
    MAPE, R2, MAE on train set (original scale of endog), AIC and the model summary

    OUT:
    -> results : fitted results object returned by model.fit()
    """
    ### Prepare data
    # Initialize training data
    endog_train = endog.copy()

    # Initialize parameters
    p, d, q = best_pdq[0], best_pdq[1], best_pdq[2]

    # Handle Box-Cox transformation
    if bc:
        transformed, l = boxcox(endog_train)
        endog_train = pd.Series(transformed, index = endog.index)

    ### Train model
    model = SARIMAX(
         endog = endog_train,
         order = (p, d, q),
         trend = 'c',
         freq = freq
         )
    results = model.fit()

    ### Get predicted values and actual values
    predicted = results.predict()

    # Back-transform predictions so they are on the scale of endog
    if bc:
        predicted = inv_boxcox(predicted, l)

    # Compare against the untransformed series so both sides share a scale
    actual = endog

    ### Calculate metrics
    print('-------------------------------')
    print('MAPE: ', mean_absolute_percentage_error(actual, predicted))
    print('R2: ', r2_score(actual, predicted))
    print('MAE: ', mean_absolute_error(actual, predicted))
    print('AIC: ', results.aic)
    print(results.summary())
    print('--------------------------------')

    return results

In [None]:
# Train the final model: (p, d, q) order, Box-Cox flag and series frequency
best_pdq, bc, freq = (2, 0, 2), False, 'MS'
model_trained = ARIMA_TRAIN(endog, best_pdq, bc, freq)

In [None]:
# Save model
# joblib.dump(model_trained, r'Models_TS/arima.joblib')

## Model Evaluation
This section evaluates the model using the following approaches:
* In-Sample Metrics
* Train-Test Split Metrics
* Assumption Validations

### In-Sample Metrics

In [None]:
# Import required tools
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error


# In-sample evaluation function
def ARIMA_EVAL_IN_SAMPLE(endog, best_pdq, bc, freq):
    """
    ARIMA in-sample evaluation function

    Fits a SARIMAX(p, d, q) model with a constant trend on the whole series and
    compares its predictions with the observed values.

    IN:
    -> endog: endogenous variable at present time (pd.Series)
    -> best_pdq: p,d,q model parameters in that order (list or tuple)
    -> bc: whether to apply Box-Cox transformation (bool)
    -> freq: frequency of the time series data (str)

    PRINTS:
    MAPE, R2, MAE on train set (original scale of endog)

    OUT:
    -> df_eval_prediction : table with the actual data ('y'), the predicted data
       ('y_hat') and the confidence bounds ('lower_y', 'upper_y'), all on the
       original scale of endog (pd.DataFrame)
    """
    ### Prepare data
    # Initialize training data
    endog_train = endog.copy()

    # Initialize parameters
    p, d, q = best_pdq[0], best_pdq[1], best_pdq[2]

    # Handle Box-Cox transformation
    if bc:
        transformed, l = boxcox(endog_train)
        endog_train = pd.Series(transformed, index = endog.index)

    ### Train model
    model = SARIMAX(
         endog = endog_train,
         order = (p, d, q),
         trend = 'c',
         freq = freq
         )
    results = model.fit()

    ### Get predicted values and confidence bounds
    prediction = results.get_prediction()
    predicted = prediction.predicted_mean
    conf_int = prediction.conf_int()
    # Select the bounds by position: the column labels embed the series name,
    # so label-based access would break for a series not named 'y'
    lower, upper = conf_int.iloc[:, 0], conf_int.iloc[:, 1]

    # Handle inverse Box-Cox transformation
    if bc:
        def _to_original_scale(series):
            # Back-transform the values while keeping the date index
            return pd.Series(inv_boxcox(series.to_numpy(), l), index = series.index)

        predicted = _to_original_scale(predicted)
        lower = _to_original_scale(lower)
        upper = _to_original_scale(upper)

    # Compare against the untransformed series so every column shares a scale
    actual = endog

    # Set table with results
    df_eval_prediction = pd.DataFrame(
        {
            'y' : actual,
            'y_hat' : predicted,
            'lower_y' : lower,
            'upper_y' : upper
        },
        index = endog.index
    )

    ### Calculate metrics
    print('-------------------------------')
    print('MAPE: ', mean_absolute_percentage_error(actual, predicted))
    print('R2: ', r2_score(actual, predicted))
    print('MAE: ', mean_absolute_error(actual, predicted))
    print('--------------------------------')
    return df_eval_prediction



In [None]:
# In-sample evaluation with the chosen order, Box-Cox flag and frequency
best_pdq, bc, freq = [2, 0, 2], False, 'MS'
df_eval_prediction = ARIMA_EVAL_IN_SAMPLE(endog, best_pdq, bc, freq)
df_eval_prediction.head()

In [None]:
### Plot actual vs. predicted
# Seaborn theme for the chart
sns.set_theme(style="whitegrid", palette="muted")

# Make sure the index is a DatetimeIndex and reuse it as the x axis
df_eval_prediction.index = pd.to_datetime(df_eval_prediction.index)
plot_dates = df_eval_prediction.index

plt.figure(figsize=(12, 6))

# Both lines share the width; the remaining styling is per series
series_styles = {
    'y': {'label': 'Actual', 'marker': 'o'},
    'y_hat': {'label': 'Predicted', 'linestyle': '--', 'marker': 'X'},
}
for column_name, line_style in series_styles.items():
    sns.lineplot(
        x=plot_dates,
        y=df_eval_prediction[column_name],
        linewidth=2,
        **line_style
    )

# Shade the band between the lower and upper bounds
plt.fill_between(
    plot_dates,
    df_eval_prediction['lower_y'],
    df_eval_prediction['upper_y'],
    color='skyblue',
    alpha=0.3,
    label='95% CI'
)

# Title and axis labels
plt.title('Actual vs. Predicted', fontsize=16, weight='bold')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Value', fontsize=12)

# Tick rotation, legend and final layout
plt.xticks(rotation=45)
plt.legend(frameon=True)
sns.despine()
plt.tight_layout()
plt.show()


### Train-Test Metrics

In [None]:
# Import required tools
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error


# Train-test evaluation function
def ARIMA_EVAL_TAIN_TEST(endog, best_pdq, bc, test_size, freq):
    """
    ARIMA train-test evaluation function

    Holds out the last test_size observations, fits a SARIMAX(p, d, q) model with
    a constant trend on the rest, and scores the forecast of the held-out period.

    IN:
    -> endog: endogenous variable at present time (pd.Series)
    -> best_pdq: p,d,q model parameters in that order (list or tuple)
    -> bc: whether to apply Box-Cox transformation (bool)
    -> test_size : number of trailing observations held out for testing (int)
    -> freq: frequency of the time series data (str)

    PRINTS:
    MAPE, R2, MAE on test set

    OUT:
    -> df_eval_forecast : table with the actual data ('y'), the predicted and
       forecasted data ('y_hat') and the confidence bounds ('lower_y',
       'upper_y') over the full index (pd.DataFrame)

    RAISES:
    ValueError if test_size is not between 1 and len(endog) - 1
    """
    ### Prepare data
    # A zero or negative test_size would make head()/tail() return a wrong split
    n_obs = endog.shape[0]
    if not 0 < test_size < n_obs:
        raise ValueError(
            f'test_size must be between 1 and {n_obs - 1} (got {test_size})'
        )

    # Split into training and test periods
    full_index = endog.index
    endog_train = endog.head(n_obs - test_size)
    train_index = endog_train.index
    endog_test = endog.tail(test_size)

    # Initialize parameters
    p, d, q = best_pdq[0], best_pdq[1], best_pdq[2]

    # Handle Box-Cox transformation
    if bc:
        transformed, l = boxcox(endog_train)
        endog_train = pd.Series(transformed, index = train_index)

    ### Train model
    model = SARIMAX(
         endog = endog_train,
         order = (p, d, q),
         trend = 'c',
         freq = freq
         )
    results = model.fit()

    ### Get predicted (training period) and forecasted (test period) values
    prediction = results.get_prediction()
    forecast = results.get_forecast(steps = test_size)
    predicted = prediction.predicted_mean
    forecasted = forecast.predicted_mean
    ci_full = pd.concat([prediction.conf_int(), forecast.conf_int()], axis = 0)
    # Select the bounds by position: the column labels embed the series name,
    # so label-based access would break for a series not named 'y'
    lower, upper = ci_full.iloc[:, 0], ci_full.iloc[:, 1]

    # Handle inverse Box-Cox transformation
    if bc:
        def _to_original_scale(series):
            # Back-transform the values while keeping the date index
            return pd.Series(inv_boxcox(series.to_numpy(), l), index = series.index)

        predicted = _to_original_scale(predicted)
        forecasted = _to_original_scale(forecasted)
        lower = _to_original_scale(lower)
        upper = _to_original_scale(upper)

    # Get actual values
    actual = endog_test

    # Set table with results
    full_hat = pd.concat([predicted, forecasted], axis = 0)
    df_eval_forecast = pd.DataFrame(
        {
            'y' : endog,
            'y_hat' : full_hat,
            'lower_y' : lower,
            'upper_y' : upper
        },
        index = full_index
    )

    ### Calculate metrics
    print('-------------------------------')
    print('MAPE: ', mean_absolute_percentage_error(actual, forecasted))
    print('R2: ', r2_score(actual, forecasted))
    print('MAE: ', mean_absolute_error(actual, forecasted))
    print('--------------------------------')

    return df_eval_forecast

In [None]:
# Train-test evaluation: hold out the last test_size observations
best_pdq, test_size, bc, freq = [2, 0, 2], 6, False, 'MS'
df_eval_forecast = ARIMA_EVAL_TAIN_TEST(endog, best_pdq, bc, test_size, freq)
df_eval_forecast.head()

In [None]:
### Plot actual vs. predicted and forecasted
# Set a nice theme
sns.set_theme(style="whitegrid", palette="muted")

# Ensure datetime index
df_eval_forecast.index = pd.to_datetime(df_eval_forecast.index)

# Create the plot
plt.figure(figsize=(12, 6))

# Plot actual
sns.lineplot(
    x=df_eval_forecast.index,
    y=df_eval_forecast['y'],
    label='Actual',
    marker='o',
    linewidth=2
)

# Plot predicted
sns.lineplot(
    x=df_eval_forecast.index,
    y=df_eval_forecast['y_hat'],
    label='Predicted',
    linestyle='--',
    marker='X',
    linewidth=2
)

# Plot confidence interval manually
plt.fill_between(
    df_eval_forecast.index,
    df_eval_forecast['lower_y'],
    df_eval_forecast['upper_y'],
    color='skyblue',
    alpha=0.3,
    label='95% CI'
)

# Highlight forecast region: the last test_size rows are the held-out period,
# so the shaded span follows the split size instead of a fixed number of rows
forecast_start = df_eval_forecast.index[-test_size]
forecast_end = df_eval_forecast.index[-1]
plt.axvspan(
    forecast_start, forecast_end,
    color='lightgrey',
    alpha=0.5,
    label='Forecast Period'
)

# Titles and labels
plt.title('Actual vs. Predicted', fontsize=16, weight='bold')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Value', fontsize=12)

# Ticks and legend
plt.xticks(rotation=45)
plt.legend(frameon=True)
sns.despine()
plt.tight_layout()
plt.show()


### Assumptions Verification

## Model Use
This section predicts future values together with their confidence intervals.

In [None]:
# Import required tools
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error

def ARIMA_FORECAST(endog, n_periods, best_pdq, bc, freq):
    """
    ARIMA forecasting function

    Fits a SARIMAX(p, d, q) model with a constant trend on the whole series and
    forecasts n_periods steps past its last observation.

    IN:
    -> endog: endogenous variable at present time (pd.Series)
    -> n_periods : periods to forecast (int)
    -> best_pdq: p,d,q model parameters in that order (list or tuple)
    -> bc: whether to apply Box-Cox transformation or not (bool)
    -> freq: frequency of the time series data (str)

    OUT:
    -> df_forecast : table with the actual data ('y'), the predicted and
       forecasted data ('y_hat') and the confidence bounds ('lower_y',
       'upper_y'). 'y' is NaN for the forecasting periods (pd.DataFrame)

    RAISES:
    ValueError if n_periods is smaller than 1
    """
    ### Prepare data
    if n_periods < 1:
        raise ValueError(f'n_periods must be at least 1 (got {n_periods})')

    # Initialize training data and the index of the periods to forecast
    endog_train = endog.copy()
    train_index = endog_train.index
    forecast_index = pd.date_range(
        start = endog.index[-1] + pd.tseries.frequencies.to_offset(freq),
        periods = n_periods,
        freq = freq
    )
    full_index = train_index.append(forecast_index)

    # Initialize parameters
    p, d, q = best_pdq[0], best_pdq[1], best_pdq[2]

    # Handle Box-Cox transformation
    if bc:
        transformed, l = boxcox(endog_train)
        endog_train = pd.Series(transformed, index = train_index)

    ### Train model
    model = SARIMAX(
         endog = endog_train,
         order = (p, d, q),
         trend = 'c',
         freq = freq
         )
    results = model.fit()

    ### Predict and forecast
    prediction = results.get_prediction()
    forecast = results.get_forecast(steps = n_periods)
    predicted = prediction.predicted_mean
    forecasted = forecast.predicted_mean
    ci_full = pd.concat([prediction.conf_int(), forecast.conf_int()], axis = 0)
    # Select the bounds by position: the column labels embed the series name,
    # so label-based access would break for a series not named 'y'
    lower, upper = ci_full.iloc[:, 0], ci_full.iloc[:, 1]

    # Handle inverse Box-Cox transformation
    if bc:
        def _to_original_scale(series):
            # Back-transform the values while keeping the date index
            return pd.Series(inv_boxcox(series.to_numpy(), l), index = series.index)

        predicted = _to_original_scale(predicted)
        forecasted = _to_original_scale(forecasted)
        lower = _to_original_scale(lower)
        upper = _to_original_scale(upper)

    # Set table with results (no actual data exists for the forecast periods)
    full = pd.concat([endog, pd.Series([np.nan]*n_periods, index = forecast_index)], axis = 0)
    full_hat = pd.concat([predicted, forecasted], axis = 0)
    df_forecast = pd.DataFrame(
        {
            'y' : full,
            'y_hat' : full_hat,
            'lower_y' : lower,
            'upper_y' : upper
        },
        index = full_index
    )

    return df_forecast

In [None]:
# Forecast n_periods steps ahead with the chosen order, Box-Cox flag and frequency
n_periods, best_pdq, bc, freq = 6, [2, 0, 2], False, 'MS'
df_forecast = ARIMA_FORECAST(endog, n_periods, best_pdq, bc, freq)
# Show the last observed periods followed by the forecast periods
df_forecast.tail(n_periods*2)

In [None]:
### Plot actual vs. predicted and forecasted
# Seaborn theme for the chart
sns.set_theme(style="whitegrid", palette="muted")

# Make sure the index is a DatetimeIndex and reuse it as the x axis
df_forecast.index = pd.to_datetime(df_forecast.index)
plot_dates = df_forecast.index

plt.figure(figsize=(12, 6))

# Both lines share the width; the remaining styling is per series
series_styles = {
    'y': {'label': 'Actual', 'marker': 'o'},
    'y_hat': {'label': 'Predicted', 'linestyle': '--', 'marker': 'X'},
}
for column_name, line_style in series_styles.items():
    sns.lineplot(
        x=plot_dates,
        y=df_forecast[column_name],
        linewidth=2,
        **line_style
    )

# Shade the band between the lower and upper bounds
plt.fill_between(
    plot_dates,
    df_forecast['lower_y'],
    df_forecast['upper_y'],
    color='skyblue',
    alpha=0.3,
    label='95% CI'
)

# Highlight forecast region (the last n_periods rows)
forecast_start = plot_dates[-n_periods]
forecast_end = plot_dates[-1]
plt.axvspan(
    forecast_start, forecast_end,
    color='lightgrey',
    alpha=0.5,
    label='Forecast Period'
)

# Title and axis labels
plt.title('Actual vs. Predicted', fontsize=16, weight='bold')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Value', fontsize=12)

# Tick rotation, legend and final layout
plt.xticks(rotation=45)
plt.legend(frameon=True)
sns.despine()
plt.tight_layout()
plt.show()


In [None]:
# Evaluate since we know the future data
# Take the forecast rows from the configured horizon and pair them with the
# held-out observations by date, so the comparison does not depend on a
# hard-coded number of periods
y_hat_future = df_forecast['y_hat'].tail(n_periods)
common_dates = endog_future.index.intersection(y_hat_future.index)
y_true_eval = endog_future.loc[common_dates]
y_hat_eval = y_hat_future.loc[common_dates]
print('-------------------------------')
print('MAPE: ', mean_absolute_percentage_error(y_true_eval, y_hat_eval))
print('R2: ', r2_score(y_true_eval, y_hat_eval))
print('MAE: ', mean_absolute_error(y_true_eval, y_hat_eval))
print('--------------------------------')