In [1]:
import pickle
from math import sqrt
from typing import Dict, Union

import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost.sklearn import XGBRegressor

SEED = 1212


In [2]:
wine_quality_df = pd.read_csv('data/winequality-red.csv')
wine_quality_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
wine_quality_df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [4]:
wine_quality_df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [5]:
x_train, x_test, y_train, y_test = train_test_split(wine_quality_df.drop('quality', axis=1), wine_quality_df['quality'], train_size=0.8, random_state=SEED)

In [6]:
def evaluate_regression(model_name: str, y_true: Union[pd.Series, np.array], y_pred: Union[pd.Series, np.array]) -> Dict[str, int]:
    """
    Function to evaluate and calculate metrics score for any regression model object.

    Args:
        model_name (str): Model object custom name.
        y_true (Union[pd.Series, np.array]): Feature dataframe from test.
        y_pred (Union[pd.Series, np.array]): Label series from test.

    Returns:
        Dict[str, int]: Dictionary containing the metrics and calculated values.
    """

    metrics_dict = {}
    metrics_dict['model_name'] = model_name
    metrics_dict['r2_score'] = r2_score(y_true, y_pred)
    metrics_dict['explained_variance'] = explained_variance_score(y_true, y_pred)
    metrics_dict['mae'] = mean_absolute_error(y_true, y_pred)
    metrics_dict['mse'] = mean_squared_error(y_true, y_pred)

    return metrics_dict

In [7]:
lin_reg_pipe = Pipeline([('scaler', StandardScaler()), ('lin_reg', LinearRegression())])
lin_reg_pipe.fit(x_train, y_train)

lin_reg_y_pred = lin_reg_pipe.predict(x_test)

lin_reg_metrics = evaluate_regression('Linear Regression', y_test, lin_reg_y_pred)
print(lin_reg_metrics)

{'model_name': 'Linear Regression', 'r2_score': 0.3830963482610811, 'explained_variance': 0.3831045783595084, 'mae': 0.521571858395009, 'mse': 0.43723046316995884}


In [8]:
sgd_reg_pipe = Pipeline([('scaler', StandardScaler()), ('sgd_reg', SGDRegressor())])
sgd_reg_pipe.fit(x_train, y_train)

sgd_reg_y_pred = sgd_reg_pipe.predict(x_test)

sgd_reg_metrics = evaluate_regression('Stochastic Gradient Descent Regression', y_test, sgd_reg_y_pred)
print(sgd_reg_metrics)

{'model_name': 'Stochastic Gradient Descent Regression', 'r2_score': 0.3793523033196343, 'explained_variance': 0.379438224952816, 'mae': 0.5250816472985841, 'mse': 0.4398840550222092}


In [9]:
xgb_reg_pipe = Pipeline([('scaler', StandardScaler()), ('xgb_reg', XGBRegressor())])
xgb_reg_pipe.fit(x_train, y_train)

xgb_reg_y_pred = xgb_reg_pipe.predict(x_test)

xgb_reg_metrics = evaluate_regression('XGBoost Regressor', y_test, xgb_reg_y_pred)
print(xgb_reg_metrics)

{'model_name': 'XGBoost Regressor', 'r2_score': 0.4720407552376791, 'explained_variance': 0.47305291033443886, 'mae': 0.39728558957576754, 'mse': 0.3741911147252949}


In [10]:
mlp_reg_pipe = Pipeline([('scaler', StandardScaler()), ('mlp_reg', MLPRegressor(hidden_layer_sizes=(50,)*8))])
mlp_reg_pipe.fit(x_train, y_train)

mlp_reg_y_pred = mlp_reg_pipe.predict(x_test)

mlp_reg_metrics = evaluate_regression('Multilayer Perceptron Regressor', y_test, mlp_reg_y_pred)
print(mlp_reg_metrics)

{'model_name': 'Multilayer Perceptron Regressor', 'r2_score': 0.20794244719099564, 'explained_variance': 0.20797239468096207, 'mae': 0.5220364208377996, 'mse': 0.5613707905533818}


In [11]:
voting_reg = VotingRegressor([('lin_reg', lin_reg_pipe), ('sgd_reg', sgd_reg_pipe), ('xgb_reg', xgb_reg_pipe), ('mlp_reg', mlp_reg_pipe)], n_jobs=-1)
voting_reg.fit(x_train, y_train)

voting_reg_y_pred = voting_reg.predict(x_test)

voting_reg_metrics = evaluate_regression('Voting Regressor', y_test, voting_reg_y_pred)
print(voting_reg_metrics)

{'model_name': 'Voting Regressor', 'r2_score': 0.48460719759536997, 'explained_variance': 0.48475828671649923, 'mae': 0.4485055515325239, 'mse': 0.36528464870428157}


In [12]:
class DeepNeuralNetworkRegressor(torch.nn.Module):

    def __init__(self, input_size: int, hidden_size: int, hidden_layers: int):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers

        self.layers = torch.nn.ModuleList()

        # Input Layer
        self.layers.append(torch.nn.Linear(self.input_size, self.hidden_size))
        self.layers.append(torch.nn.ReLU())

        # Hidden Layers
        for n_layer in range(self.hidden_layers):
            self.layers.append(torch.nn.Linear(self.hidden_size, self.hidden_size))
            self.layers.append(torch.nn.ReLU())

        # Output Layer
        self.layers.append(torch.nn.Linear(self.hidden_size, 1))

    def forward(self, x):
        x = torch.tensor(x.values, dtype=torch.float32)

        for layer in self.layers:
            x = layer(x)

        return x

In [13]:
deep_reg = DeepNeuralNetworkRegressor(input_size=x_train.shape[1], hidden_size=50, hidden_layers=8)

loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(deep_reg.parameters(), lr=0.01)

epochs = 200

for epoch in range(epochs):
    deep_reg.train()
    optimizer.zero_grad()

    # Forward Pass
    y_pred = deep_reg(x_train)

    # Loss Computation
    loss = loss_fn(torch.tensor(y_train.values, dtype=torch.float32), y_pred)

    # Backward Propagation
    loss.backward()
    optimizer.step()

In [14]:
y_pred = deep_reg(x_test).detach().numpy()

deep_reg_metrics = evaluate_regression('Deep Neural Network Regression', y_test, y_pred)
print(deep_reg_metrics)

{'model_name': 'Deep Neural Network Regression', 'r2_score': 0.1392872467800419, 'explained_variance': 0.14906499723941535, 'mae': 0.639071986079216, 'mse': 0.6100301638446453}


In [15]:
metrics_list = [lin_reg_metrics, sgd_reg_metrics, xgb_reg_metrics, mlp_reg_metrics, voting_reg_metrics, deep_reg_metrics]
results = pd.DataFrame(metrics_list).sort_values(['mse']).reset_index(drop=True)
results.head(results.shape[0])

Unnamed: 0,model_name,r2_score,explained_variance,mae,mse
0,Voting Regressor,0.484607,0.484758,0.448506,0.365285
1,XGBoost Regressor,0.472041,0.473053,0.397286,0.374191
2,Linear Regression,0.383096,0.383105,0.521572,0.43723
3,Stochastic Gradient Descent Regression,0.379352,0.379438,0.525082,0.439884
4,Multilayer Perceptron Regressor,0.207942,0.207972,0.522036,0.561371
5,Deep Neural Network Regression,0.139287,0.149065,0.639072,0.61003


In that way, we can choose the XGBoost Regressor model as our default model that is the second highest model performance and it can be easily explained by using it's features importances and SHAP Tree Explainer.

In [16]:
with open('model/xgb_reg.pickle', 'wb') as pickle_file:
    pickle.dump(xgb_reg_pipe, pickle_file)