# Stacking

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from icecream import ic
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
import copy
from sklearn.model_selection import KFold, StratifiedKFold

## For Classification

In [62]:
from MultiLayerPerceptron import MLP, CrossEntropyLoss
from MLR import MLR
from DecisionTrees import DecisionTree

In [181]:
class StackingForClassification:
    """Two-level ensemble (stacking or blending) for classification.

    Every estimator handed to this class must provide ``fit(X, y, epochs=...)``
    and ``predict(X)``. Targets, and the level-1 estimator's predictions, must
    be 2-D with one column per class, because accuracy is computed with
    ``np.argmax(..., axis=1)``. ``X`` and ``y`` must be NumPy arrays: rows are
    selected with integer index arrays.

    Parameters
    ----------
    level0_estimators : list
        Base estimators. ``fit_level0`` trains deep copies of them, whereas
        ``train_base_estimator`` fits these very objects in place.
    level1_estimators : estimator
        A single meta-estimator trained on the base estimators' predictions.
    method : str
        Stored on the instance as-is; none of the methods reads it.
    epochs : int, default 1000
        Value forwarded as ``epochs=`` to every ``fit`` call.
    """

    def __init__(self, level0_estimators, level1_estimators, method, epochs=1000) -> None:
        self.level0_estimators = level0_estimators
        self.level1_estimators = level1_estimators
        self.method = method
        self.epochs = epochs
        # Feature matrix produced by the most recent predict_level0 call.
        self.level0_predictions = None
        # Deep-copied base estimators fitted by the most recent fit_level0 call.
        self.level0_estimators_object = []
        # Never read or written by any method of this class.
        self.base_estimator = None

    @staticmethod
    def _stack_predictions(estimators, X):
        """Return the predictions of ``estimators`` on ``X`` side by side.

        1-D predictions become a single column each, 2-D predictions keep their
        columns. The result is floating point; with no estimators it has shape
        ``(X.shape[0], 0)``.
        """
        blocks = [estimator.predict(X) for estimator in estimators]
        if not blocks:
            return np.zeros((X.shape[0], 0))
        return np.column_stack(blocks).astype(float, copy=False)

    def fit_level0(self, X, y):
        """Fit deep copies of the base estimators on ``(X, y)``.

        The copies are stored in ``self.level0_estimators_object``; the
        estimators in ``self.level0_estimators`` are left untouched.
        """
        self.level0_estimators_object = []
        for estimator in copy.deepcopy(self.level0_estimators):
            estimator.fit(X, y, epochs=self.epochs)
            self.level0_estimators_object.append(estimator)

    def predict_level0(self, X):
        """Predict ``X`` with the copies fitted by ``fit_level0``.

        Returns the column-wise concatenation of their predictions and also
        keeps it in ``self.level0_predictions``.
        """
        self.level0_predictions = self._stack_predictions(self.level0_estimators_object, X)
        return self.level0_predictions

    def train_base_estimator(self, X, y):
        """Fit the estimators in ``self.level0_estimators`` in place on ``(X, y)``.

        NOTE(review): whether a second ``fit`` restarts or continues training
        depends on the estimators themselves - confirm before reusing one
        instance for several runs.
        """
        for estimator in self.level0_estimators:
            estimator.fit(X, y, epochs=self.epochs)

    def base_estimator_predict(self, X):
        """Predict ``X`` with ``self.level0_estimators``, concatenated column-wise."""
        return self._stack_predictions(self.level0_estimators, X)

    def stacking(self, X_, y_, no_of_folds=5, random_state=42):
        """Run K-fold stacking and print the accuracy on a held-out 20% test split.

        Parameters
        ----------
        X_, y_ : ndarray
            Full feature matrix and one-column-per-class target matrix.
        no_of_folds : int, default 5
            Number of folds used to build the out-of-fold level-1 features.
        random_state : int or None, default 42
            Seed for both the test split and the fold shuffling.

        Returns
        -------
        None
            The accuracy is printed, not returned.
        """
        X, X_test, y, y_test = train_test_split(
            X_, y_, test_size=0.2, random_state=random_state)
        kf = KFold(n_splits=no_of_folds, shuffle=True, random_state=random_state)

        # Collect one block per fold and concatenate once, so no assumption is
        # needed about how many columns each base estimator emits.
        fold_features, fold_targets = [], []
        for train_ind, val_ind in kf.split(X, y):
            self.fit_level0(X[train_ind], y[train_ind])
            fold_features.append(self.predict_level0(X[val_ind]))
            fold_targets.append(y[val_ind])

        X_level1 = np.concatenate(fold_features, axis=0)
        # Promote boolean/integer targets to float for the meta-estimator.
        y_level1 = np.concatenate(fold_targets, axis=0).astype(float, copy=False)
        ic(X_level1.shape, y_level1.shape)
        self.level1_estimators.fit(X_level1, y_level1, epochs=self.epochs)

        # Refit the base estimators on all non-test data before scoring.
        self.train_base_estimator(X, y)
        X_pred_test = self.base_estimator_predict(X_test)

        y_pred_test = self.level1_estimators.predict(X_pred_test)
        print("Accuracy on test data: ", accuracy_score(y_true=np.argmax(y_test, axis=1), y_pred = np.argmax(y_pred_test, axis=1)))

    def blending(self, X_, y_, no_of_folds=None, random_state=42):
        """Run hold-out blending and print the accuracy on the test split.

        The data is split 70% train / 15% test / 15% validation. Base
        estimators are fitted on the train part, the meta-estimator on their
        predictions for the validation part.

        Parameters
        ----------
        X_, y_ : ndarray
            Full feature matrix and one-column-per-class target matrix.
        no_of_folds : ignored
            Accepted only so the signature matches ``stacking``.
        random_state : int or None, default 42
            Seed for both splits; pass ``None`` for a different random
            partition on every call.

        Returns
        -------
        None
            The accuracy is printed, not returned.
        """
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_, y_, test_size=0.3, random_state=random_state)
        X_test, X_val, y_test, y_val = train_test_split(
            X_holdout, y_holdout, test_size=0.5, random_state=random_state)

        self.train_base_estimator(X_train, y_train)
        X_level1 = self.base_estimator_predict(X_val)

        self.level1_estimators.fit(X_level1, y_val, epochs=self.epochs)

        X_pred_test = self.base_estimator_predict(X_test)
        y_pred_test = self.level1_estimators.predict(X_pred_test)

        print("Accuracy on test data:",accuracy_score(y_true=np.argmax(y_test, axis=1), y_pred = np.argmax(y_pred_test, axis=1)))
        

In [170]:
# Location of the wine-quality CSV, relative to the notebook's working directory.
PATH_TO_WINE_DATASET = './Dataset/WineQT.csv'
df = pd.read_csv(PATH_TO_WINE_DATASET)

In [171]:
# Target is the 'quality' column; features are every column except 'Id' and
# 'quality'. `df` itself is not rebound, so this cell can be re-run safely.
y = df.quality
X = df.drop(['Id', 'quality'], axis=1)


In [172]:
# Standardize the features and one-hot encode the labels, both as NumPy arrays.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split done inside StackingForClassification - confirm that is acceptable.
X = StandardScaler().fit_transform(X.to_numpy())
y = pd.get_dummies(y).to_numpy()
# Layer sizes for the MLP: one input per feature, one output per class column.
input_size, output_size = X.shape[1], y.shape[1]

In [173]:
# Stacking run: DecisionTree + MLP + MLR as base learners, MLR as meta-learner.
stacking_for_classification = StackingForClassification(
    level0_estimators=[
        DecisionTree(),
        MLP(
            input_size=input_size,
            hidden_layer_sizes=[32, 32],
            output_size=output_size,
            activation_function=["relu", "relu"],
            output_activation_function="softmax",
            optimizer="bgd",
            loss=CrossEntropyLoss(),
            learning_rate=0.001,
        ),
        MLR(),
    ],
    level1_estimators=MLR(),
    method="stacking",
)
stacking_for_classification.stacking(X_=X, y_=y, no_of_folds=5)

ic| X_level1.shape: (914, 18), y_level1.shape: (914, 6)


Accuracy on test data:  0.6157205240174672


In [174]:
# Stacking run: DecisionTree + MLP + MLR as base learners, DecisionTree as meta-learner.
stacking_for_classification = StackingForClassification(
    level0_estimators=[
        DecisionTree(),
        MLP(
            input_size=input_size,
            hidden_layer_sizes=[32, 32],
            output_size=output_size,
            activation_function=["relu", "relu"],
            output_activation_function="softmax",
            optimizer="bgd",
            loss=CrossEntropyLoss(),
            learning_rate=0.001,
        ),
        MLR(),
    ],
    level1_estimators=DecisionTree(),
    method="stacking",
)
stacking_for_classification.stacking(X_=X, y_=y, no_of_folds=5)

ic| X_level1.shape: (914, 18), y_level1.shape: (914, 6)


Accuracy on test data:  0.5109170305676856


In [182]:
# Blending run: DecisionTree + MLP + MLR as base learners, DecisionTree as meta-learner.
# `method` names the procedure actually invoked below. No fold count is passed
# because blending() uses a single hold-out split and ignores `no_of_folds`.
stacking_for_classification = StackingForClassification(
    level0_estimators=[
        DecisionTree(),
        MLP(
            input_size=input_size,
            hidden_layer_sizes=[32, 32],
            output_size=output_size,
            activation_function=["relu", "relu"],
            output_activation_function="softmax",
            optimizer="bgd",
            loss=CrossEntropyLoss(),
            learning_rate=0.001,
        ),
        MLR(),
    ],
    level1_estimators=DecisionTree(),
    method="blending",
)
stacking_for_classification.blending(X_=X, y_=y)

Accuracy on test data: 0.5146198830409356


In [183]:
# Blending run: DecisionTree + MLP + MLR as base learners, MLR as meta-learner.
# `method` names the procedure actually invoked below. No fold count is passed
# because blending() uses a single hold-out split and ignores `no_of_folds`.
stacking_for_classification = StackingForClassification(
    level0_estimators=[
        DecisionTree(),
        MLP(
            input_size=input_size,
            hidden_layer_sizes=[32, 32],
            output_size=output_size,
            activation_function=["relu", "relu"],
            output_activation_function="softmax",
            optimizer="bgd",
            loss=CrossEntropyLoss(),
            learning_rate=0.001,
        ),
        MLR(),
    ],
    level1_estimators=MLR(),
    method="blending",
)
stacking_for_classification.blending(X_=X, y_=y)

Accuracy on test data: 0.6257309941520468


## For Regression

In [104]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from icecream import ic
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import copy
from sklearn.model_selection import KFold, StratifiedKFold

In [105]:
from DecisionTreeRegression import DecisionTreeRegressorWrapper
from MultiLayerRegression import MultiLayerRegression, MSELoss
from LinearRegression import LinearRegression

In [106]:
class StackingForRegression:
    """Two-level ensemble (stacking or blending) for regression.

    Every estimator handed to this class must provide ``fit(X, y, epochs=...)``
    and ``predict(X)``. ``X`` and ``y`` must be NumPy arrays: rows are selected
    with integer index arrays.

    Parameters
    ----------
    level0_estimators : list
        Base estimators. ``fit_level0`` trains deep copies of them, whereas
        ``train_base_estimator`` fits these very objects in place.
    level1_estimators : estimator
        A single meta-estimator trained on the base estimators' predictions.
    method : str
        Stored on the instance as-is; none of the methods reads it.
    epochs : int, default 1000
        Value forwarded as ``epochs=`` to every ``fit`` call.
    """

    def __init__(self, level0_estimators, level1_estimators, method, epochs=1000) -> None:
        self.level0_estimators = level0_estimators
        self.level1_estimators = level1_estimators
        self.method = method
        self.epochs = epochs
        # Feature matrix produced by the most recent predict_level0 call.
        self.level0_predictions = None
        # Deep-copied base estimators fitted by the most recent fit_level0 call.
        self.level0_estimators_object = []
        # Never read or written by any method of this class.
        self.base_estimator = None

    @staticmethod
    def _stack_predictions(estimators, X):
        """Return the predictions of ``estimators`` on ``X`` side by side.

        1-D predictions become a single column each, 2-D predictions keep their
        columns. The result is floating point; with no estimators it has shape
        ``(X.shape[0], 0)``.
        """
        blocks = [estimator.predict(X) for estimator in estimators]
        if not blocks:
            return np.zeros((X.shape[0], 0))
        return np.column_stack(blocks).astype(float, copy=False)

    def fit_level0(self, X, y):
        """Fit deep copies of the base estimators on ``(X, y)``.

        The copies are stored in ``self.level0_estimators_object``; the
        estimators in ``self.level0_estimators`` are left untouched.
        """
        self.level0_estimators_object = []
        for estimator in copy.deepcopy(self.level0_estimators):
            estimator.fit(X, y, epochs=self.epochs)
            self.level0_estimators_object.append(estimator)

    def predict_level0(self, X):
        """Predict ``X`` with the copies fitted by ``fit_level0``.

        Returns the column-wise concatenation of their predictions and also
        keeps it in ``self.level0_predictions``.
        """
        self.level0_predictions = self._stack_predictions(self.level0_estimators_object, X)
        return self.level0_predictions

    def train_base_estimator(self, X, y):
        """Fit the estimators in ``self.level0_estimators`` in place on ``(X, y)``.

        NOTE(review): whether a second ``fit`` restarts or continues training
        depends on the estimators themselves - confirm before reusing one
        instance for several runs.
        """
        for estimator in self.level0_estimators:
            estimator.fit(X, y, epochs=self.epochs)

    def base_estimator_predict(self, X):
        """Predict ``X`` with ``self.level0_estimators``, concatenated column-wise."""
        return self._stack_predictions(self.level0_estimators, X)

    def stacking(self, X_, y_, no_of_folds=5, random_state=42):
        """Run K-fold stacking and print the MSE on a held-out 20% test split.

        Parameters
        ----------
        X_, y_ : ndarray
            Full feature matrix and targets. Targets may be 1-D or 2-D; the
            per-fold slices are concatenated along the first axis.
        no_of_folds : int, default 5
            Number of folds used to build the out-of-fold level-1 features.
        random_state : int or None, default 42
            Seed for both the test split and the fold shuffling.

        Returns
        -------
        None
            The MSE is printed, not returned.
        """
        X, X_test, y, y_test = train_test_split(
            X_, y_, test_size=0.2, random_state=random_state)
        kf = KFold(n_splits=no_of_folds, shuffle=True, random_state=random_state)

        # Collect one block per fold and concatenate once, so no assumption is
        # needed about the targets' rank or each estimator's output width.
        fold_features, fold_targets = [], []
        for train_ind, val_ind in kf.split(X, y):
            self.fit_level0(X[train_ind], y[train_ind])
            fold_features.append(self.predict_level0(X[val_ind]))
            fold_targets.append(y[val_ind])

        X_level1 = np.concatenate(fold_features, axis=0)
        # Promote integer targets to float for the meta-estimator.
        y_level1 = np.concatenate(fold_targets, axis=0).astype(float, copy=False)

        self.level1_estimators.fit(X_level1, y_level1, epochs=self.epochs)

        # Refit the base estimators on all non-test data before scoring.
        self.train_base_estimator(X, y)
        X_pred_test = self.base_estimator_predict(X_test)

        y_pred_test = self.level1_estimators.predict(X_pred_test)
        print("MSE on test data: ", mean_squared_error(y_true=y_test, y_pred=y_pred_test))

    def blending(self, X_, y_, no_of_folds=None, random_state=42):
        """Run hold-out blending and print the MSE on the test split.

        The data is split 70% train / 15% test / 15% validation. Base
        estimators are fitted on the train part, the meta-estimator on their
        predictions for the validation part.

        Parameters
        ----------
        X_, y_ : ndarray
            Full feature matrix and targets.
        no_of_folds : ignored
            Accepted only so the signature matches ``stacking``.
        random_state : int or None, default 42
            Seed for both splits; pass ``None`` for a different random
            partition on every call.

        Returns
        -------
        None
            The MSE is printed, not returned.
        """
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_, y_, test_size=0.3, random_state=random_state)
        X_test, X_val, y_test, y_val = train_test_split(
            X_holdout, y_holdout, test_size=0.5, random_state=random_state)

        self.train_base_estimator(X_train, y_train)
        X_level1 = self.base_estimator_predict(X_val)

        self.level1_estimators.fit(X_level1, y_val, epochs=self.epochs)

        X_pred_test = self.base_estimator_predict(X_test)
        y_pred_test = self.level1_estimators.predict(X_pred_test)
        print("MSE on test data:", mean_squared_error(y_true=y_test, y_pred = y_pred_test))
        

In [107]:
# Location of the housing CSV, relative to the notebook's working directory.
PATH_TO_BOSTON_HOUSING_DATASET = "./Dataset/HousingData.csv"

In [108]:
# Load the housing data. This rebinds `df`, which held the wine data in the
# classification section above.
df = pd.read_csv(PATH_TO_BOSTON_HOUSING_DATASET)

In [109]:
# Missing values per column, before imputation.
df.isna().sum()

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64

In [110]:
# Impute missing values. CHAS is treated as categorical, so it receives its
# most frequent value; every other column is filled with its column mean.
df["CHAS"] = df["CHAS"].fillna(df["CHAS"].mode()[0])
df = df.fillna(value=df.mean())

In [111]:
# Missing values per column, after imputation.
df.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [112]:
# Standardize every column except the target 'MEDV' and the categorical 'CHAS',
# then append the unscaled 'CHAS' column as the last feature.
scaler = StandardScaler()
df_scaled = df.drop(columns=["MEDV", "CHAS"])
df_scaled = pd.DataFrame(scaler.fit_transform(df_scaled), columns=df_scaled.columns)
df_scaled["CHAS"] = df["CHAS"]

In [113]:
# Feature matrix and target vector as NumPy arrays.
X, y = df_scaled.to_numpy(), df["MEDV"].to_numpy()

# One input per feature column; a single output because this is regression.
input_size, output_size = X.shape[1], 1

In [114]:
# Split into train (80%), test (10%) and validation (10%) sets.
# The seeds are fixed so the partitions are identical on every run.
# The target is reshaped to a column vector first, so every split of it is 2-D.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.reshape(-1, 1), test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42)

In [115]:
# Layer sizes for the regression network: one input per feature, one output.
input_size, output_size = X_train.shape[1], 1

In [116]:
# Stacking run: tree + MLP + linear regression as base learners,
# linear regression as meta-learner.
stacking_for_regression = StackingForRegression(
    level0_estimators=[
        DecisionTreeRegressorWrapper(),
        MultiLayerRegression(
            input_size=input_size,
            hidden_layer_sizes=[32, 32],
            output_size=output_size,
            activation_function=["relu", "relu"],
            output_activation_function="identity",
            optimizer="bgd",
            loss=MSELoss(),
            learning_rate=0.001,
        ),
        LinearRegression(),
    ],
    level1_estimators=LinearRegression(),
    method="stacking",
)
stacking_for_regression.stacking(X_=X_train, y_=y_train, no_of_folds=5)

MSE on test data:  22.023895817943473


In [117]:
# Stacking run: tree + MLP + linear regression as base learners,
# decision tree as meta-learner.
stacking_for_regression = StackingForRegression(
    level0_estimators=[
        DecisionTreeRegressorWrapper(),
        MultiLayerRegression(
            input_size=input_size,
            hidden_layer_sizes=[32, 32],
            output_size=output_size,
            activation_function=["relu", "relu"],
            output_activation_function="identity",
            optimizer="bgd",
            loss=MSELoss(),
            learning_rate=0.001,
        ),
        LinearRegression(),
    ],
    level1_estimators=DecisionTreeRegressorWrapper(),
    method="stacking",
)
stacking_for_regression.stacking(X_=X_train, y_=y_train, no_of_folds=5)


MSE on test data:  17.227654320987654


In [118]:
# Blending run: tree + MLP + linear regression as base learners,
# linear regression as meta-learner.
# `method` names the procedure actually invoked below. No fold count is passed
# because blending() uses a single hold-out split and ignores `no_of_folds`.
stacking_for_regression = StackingForRegression(
    level0_estimators=[
        DecisionTreeRegressorWrapper(),
        MultiLayerRegression(
            input_size=input_size,
            hidden_layer_sizes=[32, 32],
            output_size=output_size,
            activation_function=["relu", "relu"],
            output_activation_function="identity",
            optimizer="bgd",
            loss=MSELoss(),
            learning_rate=0.001,
        ),
        LinearRegression(),
    ],
    level1_estimators=LinearRegression(),
    method="blending",
)
stacking_for_regression.blending(X_=X_train, y_=y_train)

MSE on test data: 15.352236594840074


In [119]:
# Blending run: tree + MLP + linear regression as base learners,
# decision tree as meta-learner.
# `method` names the procedure actually invoked below. No fold count is passed
# because blending() uses a single hold-out split and ignores `no_of_folds`.
stacking_for_regression = StackingForRegression(
    level0_estimators=[
        DecisionTreeRegressorWrapper(),
        MultiLayerRegression(
            input_size=input_size,
            hidden_layer_sizes=[32, 32],
            output_size=output_size,
            activation_function=["relu", "relu"],
            output_activation_function="identity",
            optimizer="bgd",
            loss=MSELoss(),
            learning_rate=0.001,
        ),
        LinearRegression(),
    ],
    level1_estimators=DecisionTreeRegressorWrapper(),
    method="blending",
)
stacking_for_regression.blending(X_=X_train, y_=y_train)


MSE on test data: 15.511311475409837
