In [None]:
###### statsmodels/regression/TimeSeries ######
## Notes: This code includes as many models as possible,
# but project-specific adjustments will still be needed.

# Also, the steps below are not necessarily executed in the order listed.

# 0. Import necessary libraries.
# 1. Visualize the dataset according to its characteristics
# 2. Load and preprocess the dataset 
# 3. Define the predictors (X) and the target variable (y).
# 4. Split the dataset into training and testing sets.
# 5. Define a function to fit different regression models, 
    # evaluate them using metrics such as MSE, RMSE, and MAE, and return the best model.
###### statsmodels/regression/TimeSeries ######


In [None]:
# 0. Import Libraries
import warnings # for muting warning messages
# mute warning messages
warnings.filterwarnings('ignore')

import statsmodels.api as sm
# Single series models
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.statespace.varmax import VARMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.statespace.dynamic_factor import DynamicFactor
from statsmodels.tsa.statespace.structural import UnobservedComponents
from statsmodels.tsa.seasonal import STL

# Multivariate models
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.tsa.statespace.varmax import VARMAX

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split

import dask.dataframe as dd
from dask_ml.impute import SimpleImputer
from dask_ml.preprocessing import StandardScaler
from dask.distributed import Client
from dask_ml.preprocessing import Categorizer, DummyEncoder

from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import re
import time
import os, sys
import pandas as pd
import numpy as np
import itertools
import logging

from sklearn.datasets import fetch_california_housing
from ISLP import load_data

In [None]:
# 1. EDA
# NOTE: load_dataset() and data_preprocess() are defined in later cells of this
# file; those cells must have been executed before this one is run.

# 1.1 Time Series Plot
# Load the raw file, preprocess it and index the result by its 'datetime' column.
data, time_series = load_dataset('household_power_consumption.txt')
data_processed = data_preprocess(data)
data_processed.set_index('datetime', inplace=True)

# Optional (disabled): fill missing values by time-based interpolation.
# data = data.interpolate(method='time')
# Restrict the plot to the label slice '2009-12-01' .. '2010-01-01'.
data_subset=data_processed['2009-12-01':'2010-01-01']

# plt.plot() receives the whole subset, so every column is drawn on one axis.
plt.figure(figsize=(14, 8))
plt.plot(data_subset)
plt.title('Household Power Consumption')
plt.xlabel('Time')
plt.ylabel('Value')
plt.grid(True)
plt.show()

# 1.2 Time Series Plot with Trend, Seasonal and Residual components
# The dataset is reloaded and preprocessed from scratch for this section.
data, time_series = load_dataset('household_power_consumption.txt')
data_processed = data_preprocess(data)
data_processed.set_index('datetime', inplace=True)
data_subset=data_processed['2010-01-01':'2010-01-15']
# Decompose 'Global_active_power' with a seasonal period of `decompfreq` samples.
# NOTE(review): the earlier comments described a daily pattern (24 hours =
# 1440 minutes), but the period actually passed is 60 samples - presumably one
# hour at one-minute sampling. Confirm which period is intended.
decompfreq = 60  # seasonal period, in number of observations
decomposition = sm.tsa.seasonal_decompose(data_subset['Global_active_power'], period= decompfreq)

# Pull the three components out of the decomposition result.
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# Plotting the decomposition: four stacked panels (original + 3 components)
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(12, 10))

# Original time series
ax1.plot(data_subset.index, data_subset['Global_active_power'], label='Original')
ax1.set_title('Original Time Series')
ax1.legend()

# Trend component
ax2.plot(data_subset.index, trend, label='Trend', color='orange')
ax2.set_title('Trend Component')
ax2.legend()

# Seasonal component
ax3.plot(data_subset.index, seasonal, label='Seasonal', color='green')
ax3.set_title('Seasonal Component')
ax3.legend()

# Residual component
ax4.plot(data_subset.index, residual, label='Residual', color='red')
ax4.set_title('Residual Component')
ax4.legend()

plt.tight_layout()
plt.show()

# 1.3 Monthly aggregation of the data, followed by min-max normalization
# The dataset is reloaded and preprocessed from scratch for this section.
data, time_series = load_dataset('household_power_consumption.txt')
data_processed = data_preprocess(data)
data_processed.set_index('datetime', inplace=True)

# Keep only the int64/float64 columns.
numerical_columns = data_processed.select_dtypes(include=['int64', 'float64'])

# Group by (year, month) of the datetime index and take the mean of each group,
# i.e. one row per calendar month.
data_mean_monthly = numerical_columns.groupby([numerical_columns.index.year, numerical_columns.index.month]).mean()
data_mean_monthly.reset_index(drop=True, inplace=True)  # Replace the (year, month) index with 0..N-1
# Min-Max normalization, applied column by column: (x - min) / (max - min)
data_mean_monthly_normalized = (data_mean_monthly - data_mean_monthly.min()) / (data_mean_monthly.max() - data_mean_monthly.min())
print(data_mean_monthly_normalized)

# 1.4 Monthly aggregation, then Trend / Seasonal / Residual decomposition
# The dataset is reloaded and preprocessed from scratch for this section.
data, time_series = load_dataset('household_power_consumption.txt')
data_processed = data_preprocess(data)
data_processed.set_index('datetime', inplace=True)

# Keep only the int64/float64 columns.
numerical_columns = data_processed.select_dtypes(include=['int64', 'float64'])

# Group by (year, month) of the datetime index and take the mean of each group,
# i.e. one row per calendar month.
data_mean_monthly = numerical_columns.groupby([numerical_columns.index.year, numerical_columns.index.month]).mean()
data_mean_monthly.reset_index(drop=True, inplace=True)  # Replace the (year, month) index with 0..N-1
# Min-Max normalization, applied column by column: (x - min) / (max - min)
data_mean_monthly_normalized = (data_mean_monthly - data_mean_monthly.min()) / (data_mean_monthly.max() - data_mean_monthly.min())


# Seasonal decomposition of the normalized monthly 'Global_active_power' series
decompfreq = 12  # Assuming annual seasonality (12 monthly rows per cycle)
decomposition = sm.tsa.seasonal_decompose(data_mean_monthly_normalized['Global_active_power'], period= decompfreq)

# Pull the three components out of the decomposition result.
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# Plotting the decomposition: four stacked panels (original + 3 components).
# The x axis is the numeric row index produced by reset_index() above.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(12, 10))

# Original time series
ax1.plot(data_mean_monthly_normalized.index, data_mean_monthly_normalized['Global_active_power'], label='Original')
ax1.set_title('Original Time Series')
ax1.legend()

# Trend component
ax2.plot(data_mean_monthly_normalized.index, trend, label='Trend', color='orange')
ax2.set_title('Trend Component')
ax2.legend()

# Seasonal component
ax3.plot(data_mean_monthly_normalized.index, seasonal, label='Seasonal', color='green')
ax3.set_title('Seasonal Component')
ax3.legend()

# Residual component
ax4.plot(data_mean_monthly_normalized.index, residual, label='Residual', color='red')
ax4.set_title('Residual Component')
ax4.legend()

plt.tight_layout()
plt.show()

# 1.5 Monthly aggregation with min-max normalization, plotted for all variables
# The dataset is reloaded and preprocessed from scratch for this section.
data, time_series = load_dataset('household_power_consumption.txt')
data_processed = data_preprocess(data)
data_processed.set_index('datetime', inplace=True)

# Keep only the int64/float64 columns.
numerical_columns = data_processed.select_dtypes(include=['int64', 'float64'])

# Group by (year, month) of the datetime index and take the mean of each group,
# i.e. one row per calendar month.
data_mean_monthly = numerical_columns.groupby([numerical_columns.index.year, numerical_columns.index.month]).mean()
data_mean_monthly.reset_index(drop=True, inplace=True)  # Replace the (year, month) index with 0..N-1
# Min-Max normalization, applied column by column: (x - min) / (max - min)
data_mean_monthly_normalized = (data_mean_monthly - data_mean_monthly.min()) / (data_mean_monthly.max() - data_mean_monthly.min())

# Number of monthly rows produced by the aggregation.
print(len(data_mean_monthly))
plt.figure(figsize=(14, 8))
# Plot each normalized column as its own line on a shared axis
for col in data_mean_monthly_normalized.columns:
    # Optional (disabled) filter to plot only a subset of the columns:
    # if '_active' in col or 'intensity' in col or 'Voltage' in col:
    plt.plot(data_mean_monthly_normalized.index, data_mean_monthly_normalized[col], marker='o', label=col)

# NOTE(review): the title says 'Yearly Means' but the plotted values are
# monthly means, and the x axis is the numeric row index (0..N-1) rather than
# a year/month label - consider correcting the title and tick labels.
plt.title('Yearly Means of Electrical Power Measurements')
plt.xlabel('Year and Month')
plt.ylabel('Measurement Value')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
#2. Load the dataset
def find_filepath(name):
    """Search the parent directory tree for a ``.txt``/``.csv`` data file.

    The walk starts at ``'..'`` (the parent of the current working directory)
    and the first match in ``os.walk`` order is returned.

    Matching is case-insensitive in both modes:

    * ``name`` has an extension: the complete file name must equal ``name``.
    * ``name`` has no extension: only the base name is compared, so
      ``'data'`` matches ``data.csv`` as well as ``data.txt``.

    Args:
        name: File name to look for, with or without an extension.

    Returns:
        The path of the first matching file, or ``None`` if nothing matches.
    """
    wanted_base, wanted_ext = os.path.splitext(name)
    wanted_full = name.lower()
    wanted_base = wanted_base.lower()

    for root, _dirs, files in os.walk('..'):
        for file in files:
            base, extension = os.path.splitext(file)
            # Only text/CSV data files are candidates.
            if extension.lower() not in ('.txt', '.csv'):
                continue

            if wanted_ext:
                matched = file.lower() == wanted_full
            else:
                # The extension-less lookup used to be case-sensitive while the
                # lookup with extension was not; both now ignore case.
                matched = base.lower() == wanted_base

            if matched:
                file_path = os.path.join(root, file)
                if os.path.isfile(file_path):
                    return file_path
    return None

def get_dataframe(data):
    """Return ``data`` as a pandas DataFrame.

    A DataFrame is handed back unchanged (same object); anything else is
    passed to the ``pd.DataFrame`` constructor.
    """
    return data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)

def capitalize_first_letter(word):
    """Upper-case the first character of ``word`` and lower-case the rest."""
    head, tail = word[:1], word[1:]
    return head.upper() + tail.lower()

def calculate_dynamic_blocksize(file_path,partitions):
    """Return a block-size string that splits a file into ``partitions`` blocks.

    The size is expressed in mebibytes (bytes / 1024**2) and therefore carries
    the binary ``MiB`` suffix. The previous ``MB`` suffix denotes 10**6 bytes
    in dask's byte-string parser, which made every block about 5% smaller
    than intended.

    Args:
        file_path: Path of the file whose size is measured.
        partitions: Desired number of blocks; must be a positive integer.

    Returns:
        A string such as ``"12.5MiB"`` suitable for a ``blocksize`` argument.

    Raises:
        ValueError: If ``partitions`` is not positive.
        OSError: If the file size cannot be read.
    """
    if partitions <= 0:
        raise ValueError(f"partitions must be a positive integer, got {partitions!r}")

    # Get file size in bytes
    file_size = os.path.getsize(file_path)

    # Bytes per block; never below one byte so tiny/empty files stay readable
    block_size = max(file_size // partitions, 1)

    # Convert blocksize to MiB
    block_size_mb = block_size / (1024 * 1024)

    return f"{block_size_mb}MiB"

def load_dataset(name, partitions=10, sept =';', infer_datetime = True, low_mem=False):
    """Load a dataset by name, from ISLP first and from a local file otherwise.

    The name (first letter capitalised) is tried with ``load_data``. If that
    fails, ``find_filepath`` looks for a local ``.txt``/``.csv`` file, which is
    then read with ``dd.read_csv``. When the file has both a ``date`` and a
    ``time`` column (any letter case), they are merged into one ``datetime``
    column parsed as ``%d/%m/%Y %H:%M:%S`` and the two source columns are
    dropped.

    Args:
        name: Dataset name or file name.
        partitions: Number of blocks the file should be split into.
        sept: Field separator of the file.
        infer_datetime: Forwarded as ``infer_datetime_format``.
        low_mem: Forwarded as ``low_memory``.

    Returns:
        ``(data, time_series)``; ``time_series`` is True only when a
        ``datetime`` column was built from separate date and time columns.

    Raises:
        FileNotFoundError: If neither source provides the dataset. Previously
            a message string was returned instead, which broke every caller
            that unpacks the two-element result.
    """
    time_series = False
    capitalized_name = capitalize_first_letter(name)

    try:
        data = load_data(f"{capitalized_name}")
    except Exception:
        # Not a bundled dataset (or it could not be loaded): deliberately fall
        # through to the local-file lookup. KeyboardInterrupt/SystemExit are no
        # longer swallowed, unlike with the former bare `except:`.
        pass
    else:
        return get_dataframe(data), time_series

    file_path = find_filepath(name)
    if not file_path:
        raise FileNotFoundError('No dataset named ' + capitalized_name)

    blocksize = calculate_dynamic_blocksize(file_path, partitions)
    data = dd.read_csv(
        file_path,
        sep=sept,
        infer_datetime_format=infer_datetime,
        na_values='?',
        low_memory=low_mem,  # load all data into memory at once to help dtype inference
        blocksize=blocksize,
    )

    original_column_list = data.columns.tolist()
    lowercase_column_list = [element.lower() for element in original_column_list]
    if 'date' in lowercase_column_list and 'time' in lowercase_column_list:
        # Normalise the column names to 'Date' / 'Time' in one rename call.
        canonical = {'date': 'Date', 'time': 'Time'}
        rename_map = {
            col: canonical[col.lower()]
            for col in original_column_list
            if col.lower() in canonical
        }
        data = data.rename(columns=rename_map)

        # errors='coerce' already turns unparsable rows into NaT, so the former
        # `fillna(pd.NaT)` call (whose result was discarded) is not needed.
        data['datetime'] = dd.to_datetime(
            data['Date'] + ' ' + data['Time'],
            format='%d/%m/%Y %H:%M:%S',
            errors='coerce',
        )

        time_series = True
        data = data.drop(columns=['Date', 'Time'])

    # Persist the dataframe to speed up future operations
    data = data.persist()

    return data, time_series


In [None]:
# 3. Preprocess the dataset
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Define the frequency encoding function
def frequency_encoder(X, cat_feature):
    """Replace each category in ``X[cat_feature]`` by its relative frequency.

    Frequencies come from ``value_counts(normalize=True)``; entries that have
    no frequency (e.g. missing values) are encoded as 0.
    """
    column = X[cat_feature]
    relative_frequency = column.value_counts(normalize=True)
    encoded = column.map(relative_frequency)
    return encoded.fillna(0)

def get_num_cat_columns(dataframe):
    """Split the columns of ``dataframe`` into numeric, categorical and datetime.

    * int64/float64 columns with more than two distinct non-null values are
      numeric; those with two or fewer (binary) are categorical.
    * object and category columns are categorical.
    * datetime64[ns] columns are datetime features.

    Columns of any other dtype are left out. The three lists are printed and
    returned as ``(numeric, categorical, datetime)``.
    """
    numeric_features, categorical_features, datetime_features = [], [], []

    for col in dataframe.columns:
        series = dataframe[col]
        col_dtype = series.dtype

        if col_dtype in ['int64', 'float64']:
            # Binary columns (at most two distinct values) count as categorical.
            distinct_count = len(series.dropna().unique())
            bucket = numeric_features if distinct_count > 2 else categorical_features
            bucket.append(col)
        elif col_dtype == 'object' or col_dtype.name == 'category':
            categorical_features.append(col)
        elif col_dtype == 'datetime64[ns]':
            datetime_features.append(col)

    print("Numeric Features:", numeric_features)
    print("Categorical Features:", categorical_features)
    print("Datetime Features:", datetime_features)
    return numeric_features, categorical_features, datetime_features

def data_preprocess(data, scale =False):
    """Impute, optionally scale, and frequency-encode the columns of ``data``.

    Column roles are inferred by ``get_num_cat_columns`` from the first 100
    rows. Numeric columns are imputed with the constant 0 (and standardised
    when ``scale`` is true), datetime columns are linearly interpolated, and
    categorical columns are replaced by their relative frequencies. The three
    groups are concatenated column-wise and returned.

    Args:
        data: Input frame; ``.compute()`` is called on it, so a dask DataFrame
            is presumably expected - TODO confirm.
        scale: Whether to standardise the imputed numeric columns.

    Returns:
        The concatenation of the processed numeric, categorical and datetime
        columns.
    """
    # dataframe = get_dataframe(data)
    # `dataframe` is the same object as `data`, so the datetime interpolation
    # further down assigns into the caller's frame.
    dataframe = data
  
    # 0. Create empty DataFrames for transformed numeric and categorical data
    # NOTE(review): verify that dd.DataFrame() can be constructed without
    # arguments in the installed dask version.
    X_num_transformed = dd.DataFrame()
    X_cat_transformed = dd.DataFrame()

    # Define numeric, categorical and datetime features.
    # The whole frame is computed first and only then cut to 100 rows; the
    # dtypes and distinct counts therefore reflect just those first 100 rows.
    numeric_features, categorical_features, datetime_features = get_num_cat_columns(dataframe.compute().head(100))

    # 1. Process numeric features
    if len(numeric_features) > 0:
       
        # Impute missing numeric values with the constant 0.
        # NOTE(review): this file imports SimpleImputer (and StandardScaler)
        # from both dask_ml and sklearn; whichever import ran last is used here.
        imputer_num = SimpleImputer(strategy='constant', fill_value=0)
        X_num_imputed = imputer_num.fit_transform(dataframe[numeric_features])

    # NOTE(review): this block is outside the `numeric_features` guard above,
    # but X_num_imputed is only assigned inside it - with no numeric features
    # either branch raises NameError.
    if scale:
        scaler = StandardScaler()
        X_num_transformed = scaler.fit_transform(X_num_imputed)
    else:
        X_num_transformed = X_num_imputed

    # 2. Process datetime features
    if len(datetime_features) > 0:
        # Interpolate missing values in datetime columns
        for datetime_feature in datetime_features:
            dataframe[datetime_feature] = dataframe[datetime_feature].interpolate(method='linear')

    # 3. Define categorical transformer steps
    # NOTE(review): imputer_cat is created but never used below.
    imputer_cat = SimpleImputer(strategy='most_frequent')

    # Process categorical features
    if len(categorical_features) > 0:
        print(f'Categorical features found. {categorical_features}')
        X_cat = dataframe[categorical_features]
        # NOTE(review): confirm dd.utils.make_categorical exists in the
        # installed dask version.
        X_cat = dd.utils.make_categorical(X_cat, columns=categorical_features)

        # X_cat.compute() is evaluated once per categorical feature.
        for cat_feature in categorical_features:
            X_cat_transformed[cat_feature] = frequency_encoder(X_cat.compute(), cat_feature)

    # Combine processed numeric, datetime and categorical features
    X_transformed = dd.concat([X_num_transformed, X_cat_transformed, dataframe[datetime_features] ], axis=1)

    return X_transformed

In [None]:
# 4. Pipeline fitting
# 4.1 Subfunctions for calculating metrics
def compute_forecast_interval_coverage(y_actual, lower_bounds, upper_bounds):
    """Return the fraction of actual values inside their forecast interval.

    A value counts as covered when ``lower <= actual <= upper`` (both bounds
    inclusive). Inputs are compared by position after conversion to flat
    float arrays, so pandas objects with any kind of index are accepted.

    Args:
        y_actual: Observed values.
        lower_bounds: Lower interval bounds, one per observation.
        upper_bounds: Upper interval bounds, one per observation.

    Returns:
        Coverage as a float in ``[0, 1]``.

    Raises:
        ValueError: If the inputs differ in length or are empty. (Validation
            no longer relies on ``assert``, which is stripped under ``-O``.)
    """
    actual = np.asarray(y_actual, dtype=float).ravel()
    lower = np.asarray(lower_bounds, dtype=float).ravel()
    upper = np.asarray(upper_bounds, dtype=float).ravel()

    if not (actual.size == lower.size == upper.size):
        raise ValueError("Lengths of actual and intervals should match")
    if actual.size == 0:
        raise ValueError("Cannot compute interval coverage for empty input")

    covered = (lower <= actual) & (actual <= upper)
    return float(np.mean(covered))

def calculate_metrics(result, y_pred, y_test, model_name_with_degree, forecast_intervals=None):
    """Collect fit statistics and forecast-error metrics for one fitted model.

    Every metric is computed on a best-effort basis: if a computation fails,
    the metric is set to its worst value (``-inf`` for ``likelihood`` and
    ``forecast_interval_coverage``, ``+inf`` for everything else) so that the
    model can still be ranked.

    Args:
        result: Fitted results object; ``aic``, ``bic`` and ``llf`` are read
            from it when present.
        y_pred: Predicted values.
        y_test: Observed values, compared with ``y_pred`` by position.
        model_name_with_degree: Label stored under ``"name"``.
        forecast_intervals: Optional two-column array-like of
            (lower, upper) bounds, one row per forecast.

    Returns:
        dict with the keys ``name``, ``aic``, ``bic``, ``likelihood``,
        ``rmse``, ``mae``, ``mape``, ``forecast_interval_coverage``,
        ``forecast_bias`` and ``result``.
    """
    model_metrics = {"name": model_name_with_degree}
    # model_metrics["model"] = model_class
    # model_metrics["degree"] = degree

    # Metric name -> attribute on the results object. The log-likelihood lives
    # in `llf`; checking for an attribute called "likelihood" never matched.
    fit_statistics = {"aic": "aic", "bic": "bic", "likelihood": "llf"}
    # Metrics where larger is better get -inf as their failure value.
    higher_is_better = ("likelihood", "forecast_interval_coverage")

    def _paired_arrays():
        # Positional float arrays: avoids pandas index alignment, which yields
        # NaN when predictions and observations carry different indexes.
        return np.asarray(y_test, dtype=float), np.asarray(y_pred, dtype=float)

    def _mape():
        y_true, predicted = _paired_arrays()
        return np.mean(np.abs((y_true - predicted) / y_true)) * 100

    def _coverage():
        # Only calculate the coverage when intervals are provided. `is None`
        # is used because truth-testing an ndarray raises.
        if forecast_intervals is None:
            return -np.inf
        bounds = np.asarray(forecast_intervals, dtype=float)
        return compute_forecast_interval_coverage(y_test, bounds[:, 0], bounds[:, 1])

    def _bias():
        y_true, predicted = _paired_arrays()
        return np.mean(predicted - y_true)

    calculations = {
        "aic": lambda: getattr(result, fit_statistics["aic"]),
        "bic": lambda: getattr(result, fit_statistics["bic"]),
        "likelihood": lambda: getattr(result, fit_statistics["likelihood"]),
        "rmse": lambda: np.sqrt(mean_squared_error(y_test, y_pred)),
        "mae": lambda: mean_absolute_error(y_test, y_pred),
        "mape": _mape,
        "forecast_interval_coverage": _coverage,
        "forecast_bias": _bias,
    }

    # Perform each calculation; a failure degrades only that one metric.
    for metric_name, compute in calculations.items():
        worst_value = -np.inf if metric_name in higher_is_better else np.inf
        try:
            model_metrics[metric_name] = compute()
        except Exception:
            model_metrics[metric_name] = worst_value

    model_metrics["result"] = result
    return model_metrics

def aggregate_metrics_list(metrics_list):
    """Collapse per-fold metric records into one row per model name.

    Records are grouped by their ``'name'`` field. Numeric columns are
    averaged over the group; for every other column the first entry of the
    group is kept. The group names form the index of the returned frame.
    """
    def _collapse(column):
        # Average numbers, keep the first entry of everything else.
        if np.issubdtype(column.dtype, np.number):
            return np.mean(column)
        return column.iloc[0]

    per_fold_metrics = get_dataframe(metrics_list)
    # return aggregated_metrics.to_dict(orient='index')
    return per_fold_metrics.groupby('name').agg(_collapse)

def sort_metrics_list(df_metrics_aggregated):
    """Rank models by their metrics, best model first.

    Sort priority (highest first) and direction:
    ``aic`` (low), ``bic`` (low), ``likelihood`` (high), ``rmse`` (low),
    ``mae`` (low), ``mape`` (low), ``forecast_interval_coverage`` (high),
    ``forecast_bias`` (low).

    Values that are missing or cannot be read as numbers rank last for their
    metric regardless of direction. (Previously such values ranked *first*
    for the "lower is better" metrics.) Metric columns that are absent from
    the input are skipped. Ties keep their original relative order.

    Args:
        df_metrics_aggregated: DataFrame, or data accepted by
            ``pd.DataFrame``, with one row per model.

    Returns:
        A new DataFrame with the rows reordered; the input is not modified.
    """
    df_metrics = pd.DataFrame(df_metrics_aggregated)

    # (column, lower_is_better) in priority order
    priority = [
        ('aic', True),
        ('bic', True),
        ('likelihood', False),
        ('rmse', True),
        ('mae', True),
        ('mape', True),
        ('forecast_interval_coverage', False),
        ('forecast_bias', True),
    ]

    sort_keys = []
    for column, lower_is_better in priority:
        if column not in df_metrics.columns:
            continue
        numeric = pd.to_numeric(df_metrics[column], errors='coerce')
        if not lower_is_better:
            numeric = -numeric  # so that "smaller key" always means "better"
        # NaN (missing / non-numeric) becomes +inf, i.e. the worst key.
        sort_keys.append(numeric.fillna(np.inf).to_numpy(dtype=float))

    if not sort_keys:
        return df_metrics

    # np.lexsort treats its LAST key as the primary one and is stable.
    order = np.lexsort(tuple(reversed(sort_keys)))
    return df_metrics.iloc[order]


In [None]:
# 4.2 Subfunctions for poly nomial features
def poly_transformer(train_data, test_data, degree = 1, scale=False):
    """Expand train and test data with polynomial features.

    The expansion is fitted on the training data and then applied to the
    test data. When ``scale`` is true, both sets are standardised with one
    scaler fitted on the training set only, so that they share the same
    scale and no test information leaks into the fit. (The previous code
    passed raw arrays to ``data_preprocess`` and scaled each set on its own.)

    Args:
        train_data: Training DataFrame, or a Series (treated as one column).
        test_data: Test DataFrame or Series with the same columns.
        degree: Polynomial degree; 1 leaves the features as they are.
        scale: Whether to standardise the expanded features.

    Returns:
        ``(train_frame, test_frame, feature_names)``; both frames keep the
        index of their input.
    """
    def _as_frame(data):
        # A univariate series has no `.columns`; promote it to one column.
        if isinstance(data, pd.Series):
            return data.to_frame(name=data.name if data.name is not None else 'y')
        return data

    train_data = _as_frame(train_data)
    test_data = _as_frame(test_data)

    column_names = train_data.columns.tolist()
    poly = PolynomialFeatures(degree = degree, include_bias=False)
    train_poly = poly.fit_transform(train_data)
    test_poly = poly.transform(test_data)
    feature_names = poly.get_feature_names_out(column_names)

    if scale:
        scaler = StandardScaler()
        train_poly = np.asarray(scaler.fit_transform(train_poly))
        test_poly = np.asarray(scaler.transform(test_poly))

    # Convert to DataFrame to retain column names and the original index
    train_poly_df = pd.DataFrame(train_poly, columns=feature_names, index=train_data.index)
    test_poly_df = pd.DataFrame(test_poly, columns=feature_names, index=test_data.index)
    return train_poly_df, test_poly_df, feature_names

In [None]:
# 4.3 subfuctions for prediction intervals
def get_confidence_intervals(X_train_poly, X_test_poly, y_train, result, num_bootstrap_samples=50,confidence = 0.95):
    """Estimate prediction intervals by bootstrap-refitting a regression model.

    For each bootstrap round the training rows are resampled with
    replacement, a model of the same class as ``result.model`` (and the same
    ``family``, if it has one) is refitted with an added constant column, and
    predictions for the test rows are collected. The interval bounds are the
    matching percentiles of those predictions.

    Args:
        X_train_poly: Training predictors (DataFrame or array).
        X_test_poly: Test predictors (DataFrame or array).
        y_train: Training target (Series or array).
        result: Fitted results object exposing ``.model``.
        num_bootstrap_samples: Number of bootstrap refits; at least 1.
        confidence: Interval level, strictly between 0 and 1.

    Returns:
        Array of shape ``(n_test, 2)`` with the columns (lower, upper).

    Raises:
        ValueError: If ``confidence`` or ``num_bootstrap_samples`` is out of
            range.
    """
    if not 0 < confidence < 1:
        raise ValueError(f"confidence must be between 0 and 1, got {confidence!r}")
    if num_bootstrap_samples < 1:
        raise ValueError("num_bootstrap_samples must be at least 1")

    def _take_rows(data, positions):
        # `data[positions]` selects COLUMNS on a DataFrame; rows need .iloc.
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    n_train = len(X_train_poly)

    # Use the same model type (and family, if any) as the fitted model
    bootstrap_model = result.model.__class__
    family = getattr(result.model, 'family', None)
    model_kwargs = {'family': family} if family is not None else {}

    # has_constant='add' always adds the intercept column. With the default
    # ('skip'), a sample that happens to contain a constant column would get
    # no intercept and its design would not line up with the other one.
    X_test_design = sm.add_constant(X_test_poly, has_constant='add')

    bootstrap_predictions = []
    for _ in range(num_bootstrap_samples):
        # Row positions drawn with replacement from the training set
        indices = np.random.choice(n_train, n_train, replace=True)

        X_bootstrap = sm.add_constant(_take_rows(X_train_poly, indices), has_constant='add')
        y_bootstrap = _take_rows(y_train, indices)

        bootstrap_result = bootstrap_model(y_bootstrap, X_bootstrap, **model_kwargs).fit()

        # Predictions for the test rows from this bootstrap fit
        bootstrap_predictions.append(np.asarray(bootstrap_result.predict(X_test_design)))

    # Percentile-based interval across the bootstrap predictions
    lower_bound = np.percentile(bootstrap_predictions, (1 - confidence) * 100 / 2, axis=0)
    upper_bound = np.percentile(bootstrap_predictions, (1 + confidence) * 100 / 2, axis=0)
    return np.column_stack((lower_bound, upper_bound))

In [None]:
# 4.4 Pipeline fitting with grid search and hyperparams tuning
def fit_time_series_models(logger, univariate, multivariate, testsize_split,
                          degree_list=None, num_bootstrap_samples=50,
                          confidence=0.95):
    """Fit the enabled time series models with time-series CV and pick the best.

    Each enabled model is fitted on every ``TimeSeriesSplit`` fold, forecasts
    the fold's test window, and is scored with ``calculate_metrics``. Scores
    are averaged per model and ranked by ``sort_metrics_list``.

    Args:
        logger: Logger for fit failures; may be ``None``.
        univariate: Series used as endog by univariate models.
        multivariate: DataFrame used as endog by multivariate models.
        testsize_split: Number of ``TimeSeriesSplit`` folds.
        degree_list: Polynomial degrees to try; degree 1 is always included.
            The caller's list is not modified.
        num_bootstrap_samples: Kept for interface compatibility; unused,
            because row-wise bootstrap refitting does not apply to these
            time series models.
        confidence: Level of the forecast intervals.

    Returns:
        ``(best_model, degree, top_features, exog_feature_mapping,
        prediction_intervals)``. ``best_model`` is the results object of the
        first successful fold of the best-ranked model.
        ``prediction_intervals`` is an ``(h, 2*k)`` array from the model's own
        forecast intervals over one test window, or ``None`` if the model
        offers none. ``top_features`` and ``exog_feature_mapping`` are always
        ``None``: nothing in this pipeline computes them; they only keep the
        five-element shape of the return value.

    Raises:
        RuntimeError: If no model could be fitted on any fold.
    """
    # 5. Fit the models
    # 5.1 Define the models to fit
    # Full catalogue of candidates. Only the names in `enabled_models` are
    # run; the other entries have never been active and are untested - check
    # which arguments belong to the constructor and which to fit() before
    # enabling them.
    model_catalogue = {
        "ARIMA": {
            "class": SARIMAX,
            "poly_supported": False,
            "endog": "univariate",
            "params": [
                {"order": (2, 0, 3)}
            ]
        },
        "SARIMA": {
            "class": SARIMAX,
            "poly_supported": False,
            "endog": "univariate",
            "params": [
                {"order": (2, 0, 3), "seasonal_order": (0, 0, 0, 1440), "exog": None}
            ]
        },
        "VAR": {
            "class": VAR,
            "poly_supported": True,
            "endog": "multivariate",
            "params": [
                {"maxlags": 7, "freq": 'T', "trend": 'c'}
            ]
        },
        "VARMAX": {
            "class": VARMAX,
            "poly_supported": True,
            "endog": "multivariate",
            "params": [
                {"order": (2, 2), "exog": None}
            ]
        },
        "ExponentialSmoothing": {
            "class": ExponentialSmoothing,
            "poly_supported": False,
            "endog": "univariate",
            "params": [
                {"trend": "add", "seasonal": "add", "seasonal_periods": 1440, "exog": None}
            ]
        },
        "AutoReg": {
            "class": AutoReg,
            "poly_supported": True,
            "endog": "univariate",
            "params": [
                {"lags": 7, "trend": 'c', "method": 'cmle'}
            ]
        }
    }
    enabled_models = ("ARIMA",)
    models = {name: model_catalogue[name] for name in enabled_models}

    # Default to linear (degree 1) if degree_list is None; always work on a
    # copy so the caller's list is never modified.
    if degree_list is None:
        degree_list = [1]
    else:
        if isinstance(degree_list, (list, tuple)):
            degree_list = list(degree_list)
        else:
            print('Degree list is not a list.')
            degree_list = [degree_list]
        if 1 not in degree_list:
            degree_list.append(1)

    # 5.2 TimeSeries Split
    tscv = TimeSeriesSplit(n_splits=testsize_split)  # Define the number of splits

    metrics_list = []
    forecast_horizons = {}  # model label -> length of its first test window

    for model_name, model_info in models.items():
        model_class = model_info["class"]
        poly_supported = model_info["poly_supported"]
        endog = univariate if model_info["endog"] == 'univariate' else multivariate
        param_sets = model_info["params"]

        for degree in degree_list:
            # Skip degrees other than 1 for models without polynomial support
            if not poly_supported and degree != 1:
                continue

            for param_number, model_params in enumerate(param_sets, start=1):
                model_name_with_degree = f"{model_name} Degree {degree}"
                if len(param_sets) > 1:
                    # Keep several parameter sets apart in the aggregation.
                    model_name_with_degree += f" Params {param_number}"

                # Cross Validation
                for train_index, test_index in tscv.split(endog):
                    train_data, test_data = endog.iloc[train_index], endog.iloc[test_index]
                    print(f"Train data contains {len(train_data)} rows.")
                    print(f"Test data contains {len(test_data)} rows.")

                    try:
                        # 5.3 Transform the dataset
                        # Polynomial features only for models that support them
                        # and only when they change something (degree > 1).
                        if poly_supported and degree != 1 and hasattr(train_data, 'columns'):
                            train_poly, test_poly, _ = poly_transformer(train_data, test_data, degree)
                        else:
                            train_poly, test_poly = train_data, test_data

                        # 5.4 Fit the model; the parameter set is passed as
                        # keyword arguments to the model constructor.
                        start_time = time.time()
                        fitted_model = model_class(train_poly, **model_params).fit()
                        execution_time = time.time() - start_time
                        minutes = int(execution_time // 60)
                        seconds = int(execution_time % 60)
                        print("Training Complete. ", model_name_with_degree, " Execution time:", minutes, "minutes and", seconds, "seconds")

                        # Make out-of-sample predictions for the test window
                        predictions = fitted_model.predict(start=len(train_poly), end=len(train_poly) + len(test_poly) - 1)

                        # Calculate metrics on plain arrays so predictions and
                        # observations are compared by position.
                        model_metrics = calculate_metrics(
                            fitted_model,
                            np.asarray(predictions),
                            np.asarray(test_poly),
                            model_name_with_degree,
                        )
                        model_metrics["degree"] = degree

                        # Store metrics
                        metrics_list.append(model_metrics)
                        forecast_horizons.setdefault(model_name_with_degree, len(test_poly))
                    except Exception as e:
                        print(f"Error fitting {model_name_with_degree} model:", str(e))
                        if logger is not None:
                            logger.exception("Error fitting %s model", model_name_with_degree)
                        continue

    if not metrics_list:
        raise RuntimeError("No time series model could be fitted; see the errors reported above.")

    # 5.6 Choose the best model
    # 5.6.1 Aggregate the metrics for the same model_name_with_degree
    df_metrics_aggregated = aggregate_metrics_list(metrics_list)
    # 5.6.2 Sort the metrics
    df_metrics = sort_metrics_list(df_metrics_aggregated)
    print(df_metrics)
    # save_file(df_metrics, 'metrics_boston', 'Boston Metrics - sm')

    best_model_info = df_metrics.iloc[0]
    # After aggregation the model name is the row label, not a column.
    if 'name' in best_model_info.index:
        best_model_name = best_model_info['name']
    else:
        best_model_name = best_model_info.name
    best_model = best_model_info['result']  # fitted results object, not a model name
    degree = int(best_model_info['degree'])
    print(f'Best model found: {best_model_name}, Degree: {degree}')

    # 5.7 (Optional) Forecast intervals of the best model over one test window
    prediction_intervals = None
    horizon = forecast_horizons.get(best_model_name)
    if horizon and hasattr(best_model, 'get_forecast'):
        try:
            forecast = best_model.get_forecast(steps=horizon)
            prediction_intervals = np.asarray(forecast.conf_int(alpha=1 - confidence))
        except Exception as e:
            print(f"Could not compute forecast intervals for {best_model_name}:", str(e))
            if logger is not None:
                logger.exception("Could not compute forecast intervals for %s", best_model_name)

    # Placeholders: not computed anywhere in this pipeline (see docstring).
    top_features = None
    exog_feature_mapping = None

    return best_model, degree, top_features, exog_feature_mapping, prediction_intervals


In [None]:
def main():
    """Run the pipeline: load, preprocess, then fit the time series models.

    Logging goes to ``sm_time_series.log`` (overwritten on every run). The
    first 100 rows of the household power consumption data are used.
    """
    # 0. Configure the logger
    logging.basicConfig(
        level=logging.INFO,                 # Log INFO and higher levels
        format='%(asctime)s - %(levelname)s - %(message)s',  # Log format
        filename='sm_time_series.log',      # Log to this file
        filemode='w'                        # Write mode (overwrite existing log)
    )

    # Create a logger instance
    logger = logging.getLogger(__name__)
    logger.info('This is an INFO message')
    logger.warning('This is a WARNING message')
    logger.error('This is an ERROR message')

    # 2. Load dataset
    loaded = load_dataset('household_power_consumption.txt')
    if isinstance(loaded, str):
        # load_dataset may report a missing dataset as a message string.
        logger.error(loaded)
        print(loaded)
        return
    data, time_series = loaded

    # Specify the target variable(s)
    uni_target_variable = 'Global_active_power'
    multi_target_variable = ['Global_active_power', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']

    # Keep the 'datetime' column: it becomes the index below. Selecting only
    # the target columns dropped it and made set_index('datetime') fail.
    selected_columns = list(multi_target_variable)
    if 'datetime' in data.columns:
        selected_columns.append('datetime')

    data_subset = data[selected_columns]
    if hasattr(data_subset, 'compute'):
        # Stay lazy: data_preprocess calls .compute() on its input itself.
        data_subset = data_subset.head(100, compute=False)
    else:
        data_subset = data_subset.head(100)

    # 3. Preprocess dataset
    data_processed = data_preprocess(data_subset)
    # Setting the datetime column as the index
    # in a time series dataset can be beneficial and is often recommended.
    # Assignment form instead of inplace=True, so it works for lazy frames too.
    data_processed = data_processed.set_index('datetime')
    if hasattr(data_processed, 'compute'):
        # The modelling code slices rows by position, which needs pandas.
        data_processed = data_processed.compute()

    # 4. Define endog (optional exog)

    # Get the data for modeling
    univariate = data_processed[uni_target_variable]
    multivariate = data_processed[multi_target_variable]

    # 5. Fit the models and get the optimal model
    # 5.1 Specify the degree of the polynomial, test size, cross validation and others
    degree_list = [1]
    testsize_split = 5
    confidence=0.95
    num_bootstrap_samples=50

    # 5.2 Fit the time series model using Loop
    fit_time_series_models(logger, univariate, multivariate, testsize_split,
                          degree_list, num_bootstrap_samples,
                          confidence)

if __name__ == "__main__":
    main()

###### statsmodels/time series ######