# Models
In this notebook we propose different model architectures and generate predictions with each model. Here, we only use the first 80% of data, the rest of the series will be used for model evaluation in notebook 4.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [4]:
data_path = "data/stickers/transformed_features.csv" # your features_df data path

def load_train_val_splits(data_path="data/transformed_df", target="num_sold"):
    df = pd.read_csv(data_path,index_col=0)
    df_numeric = df.select_dtypes(include=[float, int]).drop(columns=["id","test"]).dropna()
    y = df_numeric[target]
    X = df_numeric.drop(columns=target)
    
    return train_test_split(X,y,shuffle=False) # No shuffle, avoid data leakage in temporal data

X_train, X_val, y_train, y_val = load_train_val_splits(data_path=data_path)
# In this notebook we train with the first 80% of the time series and validate models with the remaining 20%. Test data is providad by kaggle
X_train.columns

Index(['year', 'weekday', 'day_of_year', 'day_num', 'week_num', 'month',
       'sin_2pi', 'cos_2pi', 'sin_24pi', 'cos_24pi', 'sin_8pi', 'cos_8pi',
       'sin_6pi', 'cos_6pi', 'sin_4pi', 'cos_4pi', 'sin_pi', 'cos_pi',
       'gdp_factor', 'store_factor', 'product_factor', 'holiday',
       'weekday_factor', 'ratio', 'total', 'day_of_year_factor',
       'sincos_factor', 'country_factor', 'cya_factor', 'till_month_end'],
      dtype='object')

In [96]:
# helper function to get testing metrics
def validate_model(model, X_val, y_val):

    # predictions
    y_pred = model.predict(X_val)
    
    # metrics
    mse = mean_squared_error(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R^2 Score: {r2}")
    
    return {"mse": mse, "mae": mae, "r2": r2}

## Linear regression
Starting with the simplest regression model: linear regression, gives a general notion of the quality of explanatory variables.

In [97]:
model = LinearRegression()
model.fit(X_train,y_train)

In [98]:
validate_model(model,X_val,y_val)

Mean Squared Error: 2634.7391533146283
Mean Absolute Error: 31.628742026547926
R^2 Score: 0.9927534672156321


{'mse': 2634.7391533146283,
 'mae': 31.628742026547926,
 'r2': 0.9927534672156321}

**Nota importante:** Conviene comparar la performance entre linear + other model vs other model a secas para justificar que tiene sentido la combinación. Si un xgboost a secas funciona bien no tiene sentido complicarse la vida.

## Ensemble regression

Using a more sofisticated ensemble model

In [99]:
class EnsembleRegressionModel():
    '''Boost Ensemble Regression Model. 
    1. Fits a linear regression model with features X and target y, then calculates the resulting residuals.
    2. Fits another regression model with same feature set X and residuals as target variable (captures and corrects non linearities and errors from the first model).
    3. Computes the final prediction as the sum of the linear regression output and the residuals predicted by the second model.'''
    def __init__(self,top_model="xgboost",regressor="linear",regression_params={},top_model_params={},name=None):

        self.name=name # Add a verbous name for the model
        
        self.regression_params = regression_params
        self.top_model_params = top_model_params
        
        self.implemented_regressors = {
            "linear": LinearRegression,
            # You can add other regression models here if desired. 
            # However, we highly recommend using linear regression, as it aligns with our previous analysis 
            # and follows Occam's Razor (favoring simplicity).
        }
        self.regressor= self.implemented_regressors.get(regressor)
        
        self.implemented_top_models = {
            "xgboost":XGBRegressor,
            "random_forest":RandomForestRegressor
            # here you can add other models to try (ej: transformer, random forest)
        }
        self.top_model=self.implemented_top_models.get(top_model)

        # Placeholders for the fitted models
        self.base_model = None
        self.top_model_instance = None
        
    def _train_regression(self,X, y):
        '''Trains the base linear regression model and computes residuals.'''
        self.base_model = self.regressor(**self.regression_params)
        self.base_model.fit(X, y)
        y_pred = self.base_model.predict(X)
        self.residuals = y - y_pred

    def _train_top_model(self,X):
        '''Trains the top model on the residuals from the linear regression.'''
        self.top_model_instance = self.top_model(**self.top_model_params)
        self.top_model_instance.fit(X, self.residuals)

    def fit(self,X_train,y_train):
        '''Trains both the base and residual models.'''
        self._train_regression(X_train, y_train)
        self._train_top_model(X_train)
    
    def predict(self,X_test):
        '''Makes predictions by summing the base regressor predictions and the residual model predictions.'''
        base_pred = self.base_model.predict(X_test)
        residual_pred = self.top_model_instance.predict(X_test)
        return base_pred + residual_pred
    
    def evaluate(self,X_test,y_test):
        '''Shows and returns regression metrics.'''
        y_pred = self.predict(X_test)
        
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        print(f"Mean Squared Error: {mse}")
        print(f"Mean Absolute Error: {mae}")
        print(f"R^2 Score: {r2}")
        
        return {"mse": mse, "mae": mae, "r2": r2}


Start from here.

In [100]:
# try different model hyperparameters if desired

top_model_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "learning_rate": 0.1,
    "random_state": 42
}

regression_params = {
    "fit_intercept": True,
}

my_boosting_model = EnsembleRegressionModel(top_model="xgboost", # only xgboost and random_forest implemented from now
                                            regressor="linear",
                                            regression_params=regression_params,
                                            top_model_params=top_model_params,
                                            name="xgboost_regression_ensemble" # just a verbous name to keep track predictions later
                                            )

my_boosting_model.fit(X_train,y_train) # training logic is already implemented within the class :)
my_boosting_model.evaluate(X_val,y_val) # check model metrics

Mean Squared Error: 16.975260916329688
Mean Absolute Error: 2.0130046593214193
R^2 Score: 0.9999533115888916


{'mse': 16.975260916329688,
 'mae': 2.0130046593214193,
 'r2': 0.9999533115888916}

In [101]:
top_model_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "random_state": 42
}

my_rf_boosting_model = EnsembleRegressionModel(top_model="random_forest", # only xgboost and random_forest implemented from now
                                            regressor="linear",
                                            regression_params=regression_params,
                                            top_model_params=top_model_params,
                                            name="random_forest_regression_ensemble"
                                            )

In [102]:
# try different combinations and store them in the list. get_model_predictions_df generates a df that stores the predictions from each model

# tune hyperparameters as you prefer
xgboost_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "learning_rate": 0.1,
    "random_state": 42
}

xgboost_basic = XGBRegressor(**xgboost_params)
linear_regression = LinearRegression()

my_model_list = [
    my_boosting_model,
    my_rf_boosting_model,
    # more models
    # you can also add other sklearn instances
    xgboost_basic,
    linear_regression
]

In [103]:
def get_model_predictions_df(model_list):
    pred_df = pd.DataFrame()
    for model in model_list:
        model.fit(X_train, y_train)
        predictions = model.predict(X_val)
        model_name = model.name if hasattr(model, 'name') else type(model).__name__
        pred_df[model_name] = predictions
    return pred_df

pred_df = get_model_predictions_df(my_model_list)
pred_df


Unnamed: 0,xgboost_regression_ensemble,random_forest_regression_ensemble,XGBRegressor,LinearRegression
0,1582.360754,1571.530847,1582.212402,1510.732207
1,418.437848,417.187840,411.304932,466.834088
2,2977.894859,2976.892449,2916.434814,2804.230994
3,1986.342471,1999.542670,1971.394531,2038.794944
4,1045.800047,1045.836979,1048.770142,1033.818575
...,...,...,...,...
55310,465.158574,448.993603,466.840424,623.830083
55311,2896.586507,2870.517573,2796.190918,2427.049153
55312,2291.410791,2301.570962,2284.853027,1949.180749
55313,1237.059349,1229.490203,1226.017212,1178.487972


In [None]:
# store predictions
predictions_path = "data/predictions_df.csv"
pred_df.to_csv(predictions_path)