# Models
In this notebook we propose different model architectures and generate predictions with each model. Here, we only use the first 80% of the data; the rest of the series will be used for model evaluation in notebook 4.

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import numpy as np

In [None]:
# CSV with the engineered features; it is passed to load_train_val_splits when the splits are created.
data_path = "data/stickers/transformed_features.csv" # your features_df data path

def load_train_val_splits(data_path="data/transformed_df", target="num_sold"):
    """Load the feature table and split it by its precomputed ``test`` flag.

    Only float/int columns are kept, the ``id`` column is discarded when it is
    present, and every row that contains a missing value is dropped before the
    split is made.

    Parameters
    ----------
    data_path : str
        CSV file to read; its first column is used as the index.
    target : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``: the "train" parts are the rows
        with ``test == 0`` and the "test" parts are the rows with ``test == 1``.
        Feature frames contain neither ``target`` nor ``test``.

    Raises
    ------
    KeyError
        If ``target`` or ``test`` is not among the numeric columns.
    """
    df = pd.read_csv(data_path, index_col=0)
    # errors="ignore": a file exported without an "id" column (or with "id" as
    # its index) is still usable instead of failing with a KeyError.
    df_numeric = (
        df.select_dtypes(include=[float, int])
        .drop(columns=["id"], errors="ignore")
        .dropna()
    )

    missing = [col for col in (target, "test") if col not in df_numeric.columns]
    if missing:
        raise KeyError(f"Required numeric column(s) missing from {data_path}: {missing}")

    # The split relies only on the stored flag: rows are never shuffled, so the
    # original row order is preserved inside each part.
    train_df = df_numeric[df_numeric["test"] == 0]
    test_df = df_numeric[df_numeric["test"] == 1]

    X_train = train_df.drop(columns=[target, "test"])
    X_test = test_df.drop(columns=[target, "test"])

    y_train = train_df[target]
    y_test = test_df[target]

    return X_train, X_test, y_train, y_test

# The loader names its second part "test"; in this notebook that part is used as the validation set.
X_train, X_val, y_train, y_val = load_train_val_splits(data_path=data_path)
# In this notebook we train with the first 80% of the time series and validate models with the remaining 20%. Test data is provided by Kaggle.

In [9]:
# helper function to get testing metrics
def validate_model(model, X_val, y_val):
    """Score an already fitted model on validation data.

    Prints the MSE, MAE and R^2 of ``model.predict(X_val)`` against ``y_val``
    and returns the same three values in a dict keyed ``"mse"``, ``"mae"``
    and ``"r2"``.
    """
    predicted = model.predict(X_val)

    scores = {
        "mse": mean_squared_error(y_val, predicted),
        "mae": mean_absolute_error(y_val, predicted),
        "r2": r2_score(y_val, predicted),
    }

    print(f"Mean Squared Error: {scores['mse']}")
    print(f"Mean Absolute Error: {scores['mae']}")
    print(f"R^2 Score: {scores['r2']}")

    return scores

## Linear regression
We start with the simplest regression model, linear regression, which gives a general notion of the quality of the explanatory variables.

In [10]:
# Baseline: a LinearRegression with default settings, fitted on the training split.
model = LinearRegression()
model.fit(X_train,y_train)

In [11]:
# Validation metrics of the linear baseline (printed, and returned as a dict).
validate_model(model,X_val,y_val)

Mean Squared Error: 2470.357682312874
Mean Absolute Error: 30.012401634952734
R^2 Score: 0.9930417183400789


{'mse': 2470.357682312874, 'mae': 30.012401634952734, 'r2': 0.9930417183400789}

**Important note:** It is worth comparing the performance of "linear + other model" against the other model on its own, to justify that the combination makes sense. If a standalone XGBoost already works well, there is no point in overcomplicating things.

## Ensemble regression

Using a more sophisticated ensemble model

In [44]:
class BaggingEnsemble():
    '''Averaging ensemble: fits every member model on the same data and combines their predictions.

    Parameters
    ----------
    voting : str
        "soft" averages the member predictions element-wise. "hard" is accepted
        but not implemented, so predict raises NotImplementedError for it.
    model_list : list, optional
        Estimator instances exposing fit(X, y) and predict(X). When omitted,
        each ensemble gets its own empty list.
    '''
    def __init__(self, voting="hard", model_list=None):
        # A literal [] default would be a single list shared by every instance
        # created without model_list, so members added to one would leak into all.
        self.model_list = [] if model_list is None else model_list
        self.voting = voting

    def fit(self, X_train, y_train):
        '''Fit each member model in place on (X_train, y_train) and return self.'''
        for model in self.model_list:
            model.fit(X_train, y_train)
        return self

    def predict(self, X_test):
        '''Combine the member predictions for X_test according to the voting strategy.

        Raises
        ------
        ValueError
            If the voting strategy is unknown, or if soft voting is requested
            with an empty model_list.
        NotImplementedError
            If the voting strategy is "hard".
        '''
        if self.voting == "soft":
            # Averaging an empty list would silently produce NaN; fail loudly instead.
            if len(self.model_list) == 0:
                raise ValueError("Cannot predict with an empty model_list.")
            return np.mean([model.predict(X_test) for model in self.model_list], axis=0)
        elif self.voting == "hard":
            raise NotImplementedError("Hard voting not implemented yet")
        else:
            raise ValueError("Unrecognized voting strategy: use hard or soft.")



In [42]:
class EnsembleRegressionModel():
    '''Two-stage residual ("boosting"-style) regression model.

    1. Fits a base regressor with features X and target y, then calculates the resulting residuals.
    2. Fits a second ("top") model with the same feature set X and the residuals as target
       (captures and corrects non-linearities and errors left by the first model).
    3. Computes the final prediction as the sum of the base regressor output and the
       residuals predicted by the top model.

    Parameters
    ----------
    top_model : str
        Key of the residual model: "xgboost", "random_forest" or "bagging".
    regressor : str
        Key of the base regressor: "linear" or "svr".
    regression_params : dict, optional
        Keyword arguments used to build the base regressor.
    top_model_params : dict, optional
        Keyword arguments used to build the top model.
    name : str, optional
        Verbose label for the model.

    Raises
    ------
    ValueError
        If `regressor` or `top_model` is not one of the implemented keys.
    '''
    def __init__(self, top_model="xgboost", regressor="linear", regression_params=None, top_model_params=None, name=None):

        self.name = name  # Verbose name for the model

        # None (not {}) is the default so that no dict object is shared between instances.
        self.regression_params = {} if regression_params is None else regression_params
        self.top_model_params = {} if top_model_params is None else top_model_params

        self.implemented_regressors = {
            "linear": LinearRegression,
            "svr": SVR
            # You can add other regression models here if desired.
            # However, we highly recommend using linear regression, as it aligns with our previous analysis
            # and follows Occam's Razor (favoring simplicity).
        }
        # Reject unknown keys here; otherwise the lookup yields None and the failure
        # only shows up at fit time as "'NoneType' object is not callable".
        if regressor not in self.implemented_regressors:
            raise ValueError(
                f"Unknown regressor {regressor!r}; choose one of {sorted(self.implemented_regressors)}."
            )
        self.regressor = self.implemented_regressors[regressor]

        self.implemented_top_models = {
            "xgboost": XGBRegressor,
            "random_forest": RandomForestRegressor,
            "bagging": BaggingEnsemble  # An ensemble model that computes the average of several models
            # here you can add other models to try (e.g. a transformer)
        }
        if top_model not in self.implemented_top_models:
            raise ValueError(
                f"Unknown top_model {top_model!r}; choose one of {sorted(self.implemented_top_models)}."
            )
        self.top_model = self.implemented_top_models[top_model]

        # Placeholders filled in by fit()
        self.base_model = None
        self.top_model_instance = None
        self.residuals = None

    def _train_regression(self, X, y):
        '''Trains the base regressor and stores its training residuals (y - prediction).'''
        self.base_model = self.regressor(**self.regression_params)
        self.base_model.fit(X, y)
        y_pred = self.base_model.predict(X)
        self.residuals = y - y_pred

    def _train_top_model(self, X):
        '''Trains the top model on the residuals left by the base regressor.'''
        self.top_model_instance = self.top_model(**self.top_model_params)
        self.top_model_instance.fit(X, self.residuals)

    def fit(self, X_train, y_train):
        '''Trains the base model, then the residual model, and returns self.'''
        self._train_regression(X_train, y_train)
        self._train_top_model(X_train)
        return self

    def predict(self, X_test):
        '''Makes predictions by summing the base regressor predictions and the residual model predictions.'''
        base_pred = self.base_model.predict(X_test)
        residual_pred = self.top_model_instance.predict(X_test)
        return base_pred + residual_pred

    def evaluate(self, X_test, y_test):
        '''Prints MSE, MAE and R^2 of the predictions for X_test and returns them as a dict.'''
        y_pred = self.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print(f"Mean Squared Error: {mse}")
        print(f"Mean Absolute Error: {mae}")
        print(f"R^2 Score: {r2}")

        return {"mse": mse, "mae": mae, "r2": r2}


Start from here.

In [13]:
# Linear base model + XGBoost on its residuals. Try different hyperparameters if desired.

# Keyword arguments forwarded to the top (residual) model, here XGBRegressor.
top_model_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "learning_rate": 0.1,
    "random_state": 42
}

# Keyword arguments forwarded to the base regressor, here LinearRegression.
regression_params = {
    "fit_intercept": True,
}

my_boosting_model = EnsembleRegressionModel(top_model="xgboost", # keys accepted by the class: "xgboost", "random_forest", "bagging"
                                            regressor="linear",
                                            regression_params=regression_params,
                                            top_model_params=top_model_params,
                                            name="xgboost_regression_ensemble" # verbose name used to label this model's predictions later
                                            )

my_boosting_model.fit(X_train,y_train) # fits the base model, then the top model on its residuals
my_boosting_model.evaluate(X_val,y_val) # prints and returns validation MSE / MAE / R^2

Mean Squared Error: 42.72034164100333
Mean Absolute Error: 1.7745536855247432
R^2 Score: 0.999879669178324


{'mse': 42.72034164100333, 'mae': 1.7745536855247432, 'r2': 0.999879669178324}

In [14]:
# NOTE: this rebinds top_model_params (previously the XGBoost settings) to the
# random-forest settings; cells run afterwards see this version.
top_model_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "random_state": 42
}

# Only constructed here, not fitted; regression_params is reused from the previous cell.
my_rf_boosting_model = EnsembleRegressionModel(top_model="random_forest", # keys accepted by the class: "xgboost", "random_forest", "bagging"
                                            regressor="linear",
                                            regression_params=regression_params,
                                            top_model_params=top_model_params,
                                            name="random_forest_regression_ensemble"
                                            )

In [35]:
# Try different combinations and store them in the list. get_model_predictions_df generates a df that stores the predictions from each model.

# tune hyperparameters as you prefer
xgboost_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "learning_rate": 0.1,
    "random_state": 42
}

# Standalone models, used as references for the two-stage ensembles.
xgboost_basic = XGBRegressor(**xgboost_params)
linear_regression = LinearRegression()
decision_tree = DecisionTreeRegressor(max_depth=2)


# SVR base model + random forest on its residuals. It relies on top_model_params
# as last assigned in an earlier cell (the random-forest settings).
# NOTE(review): this model is built but not added to my_model_list below - confirm whether that is intended.
my_SVR_boosting_model = EnsembleRegressionModel(top_model="random_forest", # keys accepted by the class: "xgboost", "random_forest", "bagging"
                                            regressor="svr",
                                            top_model_params=top_model_params,
                                            name="svr_regression_ensemble"
                                            )

my_model_list = [
    my_boosting_model,
    my_rf_boosting_model,
    # more models
    # you can also add other sklearn instances
    xgboost_basic,
    linear_regression,
    decision_tree 
]

In [37]:
def get_model_predictions_df(model_list):
    """Fit every model, report its validation metrics and collect its predictions.

    Uses the notebook-level splits ``X_train``, ``y_train``, ``X_val`` and
    ``y_val``. Each model is (re)fitted in place on the training split.

    Parameters
    ----------
    model_list : list
        Objects exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    pandas.DataFrame
        One column of validation predictions per model. The column is labelled
        with the model's ``name`` attribute, or with its class name when the
        attribute is missing or empty. Models sharing a label overwrite each
        other's column.
    """
    pred_df = pd.DataFrame()
    for model in model_list:
        model.fit(X_train, y_train)
        predictions = model.predict(X_val)
        # hasattr() alone is not enough: a model created with name=None has the
        # attribute, which would produce a column labelled None.
        model_name = getattr(model, "name", None) or type(model).__name__
        pred_df[model_name] = predictions
        print("\n", model_name)
        validate_model(model=model, X_val=X_val, y_val=y_val)
    return pred_df

# Fit and evaluate every candidate in my_model_list, keeping one prediction column per model.
pred_df = get_model_predictions_df(my_model_list)
pred_df  # shown as the cell output



 xgboost_regression_ensemble
Mean Squared Error: 42.72034164100333
Mean Absolute Error: 1.7745536855247432
R^2 Score: 0.999879669178324

 random_forest_regression_ensemble
Mean Squared Error: 92.88143715040844
Mean Absolute Error: 4.409586568409559
R^2 Score: 0.9997383799093961

 XGBRegressor
Mean Squared Error: 87.98529895766191
Mean Absolute Error: 3.555082451747806
R^2 Score: 0.9997521709117416

 LinearRegression
Mean Squared Error: 2470.357682312874
Mean Absolute Error: 30.012401634952734
R^2 Score: 0.9930417183400789

 DecisionTreeRegressor
Mean Squared Error: 46363.23113328639
Mean Absolute Error: 170.96744836860847
R^2 Score: 0.8694082143653845


Unnamed: 0,xgboost_regression_ensemble,random_forest_regression_ensemble,XGBRegressor,LinearRegression,DecisionTreeRegressor
0,690.835202,686.568606,693.828369,693.121808,713.840612
1,557.935199,554.753139,560.135132,560.946406,713.840612
2,352.548561,356.107185,350.893921,392.960102,161.163799
3,317.477891,313.577434,313.226318,348.317773,161.163799
4,1376.681754,1372.527215,1377.694702,1380.716678,1453.921895
...,...,...,...,...,...
44454,470.615535,449.480234,448.917480,621.522380,161.163799
44455,2942.242358,2863.703907,3009.753906,2429.555224,2818.965219
44456,2262.571746,2295.369555,2223.542480,1949.968444,1453.921895
44457,1236.780983,1230.576901,1244.366699,1178.550785,713.840612


In [49]:
# Computing bagging predictions: a linear base model whose residuals are predicted
# by the average ("soft" voting) of an XGBoost and a random-forest regressor.

bagging_params = {
    "voting":"soft",
    # Fixed seeds, as in the other model cells, so that re-running the notebook
    # reproduces the reported metrics (the random forest is stochastic otherwise).
    "model_list": [XGBRegressor(random_state=42),RandomForestRegressor(random_state=42)]
}

bagging_model = EnsembleRegressionModel(top_model="bagging",
                                            regressor="linear",
                                            top_model_params=bagging_params,
                                            name="xgboost_rf_bagging_ensemble"
                                            )

# Fits on the training split, prints the validation metrics and keeps the predictions.
baggin_predictions = get_model_predictions_df(model_list=[bagging_model])


 xgboost_rf_bagging_ensemble
Mean Squared Error: 45.073230341054874
Mean Absolute Error: 2.2048828435666232
R^2 Score: 0.9998730417727436


In [None]:
# Store predictions.
# Only the bagging predictions are written. The commented-out line would write
# pred_df to the same path instead; running both would leave only the last write.
predictions_path = "data/predictions_df.csv"
# pred_df.to_csv(predictions_path)
baggin_predictions.to_csv(predictions_path)