# Red Wine Quality - Model Comparison

## Dataset Load

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import RobustScaler
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import log_loss
from math import log



# Load the red-wine quality CSV from the Kaggle input directory and preview its first rows.
wine = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
wine.head()

## Data Analysis

In [None]:
# Count the missing values in each column.
wine.isnull().sum()

In [None]:
# Per-column missing-value count again, this time through `isna`
# (pandas documents `isnull` as an alias of `isna`, so this mirrors the previous cell).
wine.isna().sum()

In [None]:
# Summary statistics for every numeric column.
wine.describe()

In [None]:
# One box plot per column on a 4x3 grid, to inspect spread and outliers
# before any cleaning is applied.
fig = make_subplots(
    rows=4,
    cols=3,
    subplot_titles=list(wine.columns),
)

for position, col in enumerate(wine.columns):
    # Fill the grid left-to-right, top-to-bottom; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(position, 3)
    # `add_trace(..., row=, col=)` replaces the deprecated `append_trace`.
    fig.add_trace(go.Box(y=wine[col]), row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    autosize=False,
    width=1200,
    height=2000,
    margin=dict(l=50, r=50, b=100, t=100, pad=4),
    paper_bgcolor="LightSteelBlue",
)
fig.show()

In [None]:
# Pairwise correlation matrix of all columns, rendered as a colour-graded table.
corr = wine.corr()
# `Styler.set_precision` was removed in pandas 2.0; `format(precision=...)`
# is the supported way to round the displayed values.
corr.style.background_gradient(cmap='plasma').format(precision=2)

In [None]:
# Rank the columns by the absolute value of their correlation with `quality`, strongest first.
np.abs(corr["quality"]).sort_values(ascending=False)

### Scaling

In [None]:
# Columns that get rescaled; every other column keeps its raw values.
need_scaling = [
    "fixed acidity",
    "residual sugar",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "alcohol",
]

# Fit the scaler on the selected columns, then overwrite them in place
# with their scaled values.
scaler = RobustScaler()
scaler.fit(wine[need_scaling])
wine[need_scaling] = scaler.transform(wine[need_scaling])

### Outlier removal

In [None]:
# Drop every row that has at least one column more than 1.5 standard
# deviations away from that column's mean.
# NOTE(review): the z-scores are computed over *all* columns of `wine`,
# including the `quality` target, so rows with unusual quality scores are
# discarded as well — confirm this is intended.
z_scores = zscore(wine)
abs_z_scores = np.abs(z_scores)
row_is_inlier = (abs_z_scores <= 1.5).all(axis=1)
wine = wine[row_is_inlier]

# Summary statistics of the rows that survived the filter.
wine.describe()

In [None]:
# Same 4x3 box-plot grid as before, now drawn from the current contents
# of `wine` so the effect of the preceding cleaning steps is visible.
fig = make_subplots(
    rows=4,
    cols=3,
    subplot_titles=list(wine.columns),
)

for position, col in enumerate(wine.columns):
    # Fill the grid left-to-right, top-to-bottom; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(position, 3)
    # `add_trace(..., row=, col=)` replaces the deprecated `append_trace`.
    fig.add_trace(go.Box(y=wine[col]), row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    autosize=False,
    width=1200,
    height=2000,
    margin=dict(l=50, r=50, b=100, t=100, pad=4),
    paper_bgcolor="LightSteelBlue",
)
fig.show()

### Histograms

In [None]:
# One histogram per column, three per row.
fig = make_subplots(rows=(wine.shape[1] // 3) + 1, cols=3)

for position, col in enumerate(wine.columns):
    grid_row, grid_col = divmod(position, 3)
    # Only the `quality` histogram gets an explicit bin-count hint.
    bin_options = {'nbinsx': 3} if col == 'quality' else {}
    fig.add_trace(
        go.Histogram(x=wine[col], name=col, **bin_options),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(height=1500)

fig.show()

In [None]:
# Scatter each of four selected features against `quality` on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(20, 8))

# (column name, panel title) for each panel, in left-to-right, top-to-bottom order.
panels = [
    ("alcohol", "Alcohol vs Quality"),
    ("volatile acidity", "Volatile Acidity vs Quality"),
    ("sulphates", "Sulphates vs Quality"),
    ("citric acid", "Citric Acid vs Quality"),
]

for ax, (feature, title) in zip(axes.flat, panels):
    ax.scatter(wine[feature], wine["quality"], marker=".")
    ax.set_title(title)

plt.show()

## Alcohol vs Quality Linear Regression

### Train/Test Dataset Split

In [None]:
# Single-feature design matrix (alcohol) and the quality target.
X = wine[['alcohol']]
Y = wine['quality']

# Draw a reproducible sample of up to 500 rows, ordered by alcohol.
# The size is capped at the number of available rows because the outlier
# filter may leave fewer than 500, in which case `sample(500)` would raise.
# NOTE(review): the sample is taken from the same rows as X/Y, so it is not
# an independent hold-out set.
sample_size = min(500, len(wine))
wine_sample = wine.sample(sample_size, random_state=0).sort_values(by='alcohol')
X_val = wine_sample[['alcohol']]
Y_val = wine_sample['quality']

In [None]:
# Hold out 30% of the rows so the validation metrics come from data the
# models were not fitted on.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

# Plain, L1-regularised (Lasso) and L2-regularised (Ridge) linear models,
# all fitted on the training split only.
model = LinearRegression().fit(X_train, y_train)
model_l1 = Lasso(alpha=1).fit(X_train, y_train)
model_l2 = Ridge(alpha=1).fit(X_train, y_train)

# (label suffix, estimator) and (label, features, target) for the metrics table.
fitted_models = [("", model), (" (L1)", model_l1), (" (L2)", model_l2)]
splits = [("train", X_train, y_train), ("validation", X_test, y_test)]

# Collect the rows first and build the frame once: `DataFrame.append`
# no longer exists in pandas 2.x.
metric_rows = []
for suffix, estimator in fitted_models:
    for split_name, features, target in splits:
        predictions = estimator.predict(features)
        metric_rows.append({
            'CONFIG': f'{split_name}{suffix}',
            # RMSE is the square root of the MSE; taking the root explicitly
            # avoids relying on the `squared=` flag of mean_squared_error.
            'RMSE': np.sqrt(mean_squared_error(target, predictions)),
            'MAE': mean_absolute_error(target, predictions),
        })
error_metrics = pd.DataFrame(metric_rows, columns=['CONFIG', 'RMSE', 'MAE'])

fig = make_subplots(rows=1, cols=2)

# Left panel: observations of both splits.
fig.add_trace(go.Scatter(x=X_train.iloc[:, 0], y=y_train, mode='markers', name='train'), row=1, col=1)
fig.add_trace(go.Scatter(x=X_test.iloc[:, 0], y=y_test, mode='markers', name='validation'), row=1, col=1)

# Left panel: fitted lines, drawn over x values in ascending order so each
# line trace runs once from left to right.
X_line = X.sort_values(by=X.columns[0])
model_names = ["model", "model L1", "model L2"]
# The loop variable is deliberately NOT called `model`, so the global
# `model` keeps pointing at the un-regularised LinearRegression afterwards.
for trace_name, estimator in zip(model_names, [model, model_l1, model_l2]):
    fig.add_trace(go.Scatter(
        x=X_line.iloc[:, 0], y=estimator.predict(X_line), mode='lines', name=trace_name, line_shape='spline'
    ), row=1, col=1)

fig.update_yaxes(title_text="quality", row=1, col=1)
fig.update_xaxes(title_text="alcohol", row=1, col=1)

# Right panel: one bar group per row of the metrics table.
for _, metrics in error_metrics.iterrows():
    fig.add_trace(go.Bar(name=metrics['CONFIG'], x=["RMSE", "MAE"], y=metrics[['RMSE', 'MAE']]), row=1, col=2)

fig.show()

### BIC and AIC for Linear Regression

In [None]:
from math import log

def calculate_aic(n, mse, num_params):
    """Return the AIC of a least-squares fit, computed from its MSE.

    Args:
        n: number of observations the MSE was computed on.
        mse: mean squared error of the fit (must be > 0).
        num_params: number of fitted parameters.

    Returns:
        ``n * log(mse) + 2 * num_params``; lower is better.
    """
    goodness_of_fit = n * log(mse)
    complexity_penalty = 2 * num_params
    return goodness_of_fit + complexity_penalty

# Parameter count: the entries of `coef_` plus one for the intercept.
num_params = 1 + len(model.coef_)
print('Number of parameters: %d' % (num_params))

# Predict once and reuse the predictions for the error computation.
yhat = model.predict(X)
mse = mean_squared_error(Y, yhat)
print('MSE: %.3f' % mse)

# AIC of the fit, from the MSE and the parameter count.
aic = calculate_aic(len(Y), mse, num_params)
print('AIC: %.3f' % aic)

def calculate_bic(n, logloss, num_params):
    """Return the BIC of a least-squares fit, computed from its MSE.

    Uses the same goodness-of-fit term as the linear-regression AIC,
    ``n * log(mse)``, with the BIC penalty ``num_params * log(n)``.
    The previous expression ``-2 * log(mse)`` ignored ``n`` and had the
    opposite sign, so a lower error produced a *higher* (worse) score.

    Args:
        n: number of observations the error was computed on (must be > 0).
        logloss: mean squared error of the fit (must be > 0). The parameter
            keeps its historical name so keyword callers keep working.
        num_params: number of fitted parameters.

    Returns:
        ``n * log(mse) + num_params * log(n)``; lower is better.
    """
    return n * log(logloss) + num_params * log(n)

# BIC of the same fit: the MSE computed above is passed as the second argument.
bic = calculate_bic(len(Y), mse, num_params)
print('BIC: %.3f' % bic)

## Multiple Linear Regression

In [None]:
# Three-feature design matrix and the quality target.
X = wine[['alcohol', 'volatile acidity', 'sulphates']].copy()
Y = wine['quality']


from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

# Lasso regressor fitted on the training split.
lrmodel_l1 = Lasso(alpha=1, max_iter=10000, tol=1e-5).fit(X_train, y_train)

# Class-balanced logistic classifier that treats each quality score as a class.
logreg_model = LogisticRegression(
    class_weight='balanced',
    solver='newton-cg',
    max_iter=1000,
    C=0.001
)
logreg_model.fit(X_train, y_train)

# Training-set errors of both models. The rows are labelled "train", so they
# are computed on the training split rather than on the full data set.
# The frame is built in one go: `DataFrame.append` no longer exists in pandas 2.x.
metric_rows = []
for config, estimator in [('train', logreg_model), ('train (L1)', lrmodel_l1)]:
    predictions = estimator.predict(X_train)
    metric_rows.append({
        'CONFIG': config,
        # RMSE is the square root of the MSE (the original stored the MSE here).
        'RMSE': np.sqrt(mean_squared_error(y_train, predictions)),
        'MAE': mean_absolute_error(y_train, predictions),
    })
error_metrics = pd.DataFrame(metric_rows, columns=['CONFIG', 'RMSE', 'MAE'])


fig = make_subplots(rows=1, cols=2, specs=[[{'is_3d': True}, {'is_3d': False}]])

# Left panel: observations in (alcohol, volatile acidity, quality) space.
fig.add_trace(go.Scatter3d(
    x=X.iloc[:, 0],
    y=X.iloc[:, 1],
    z=Y,
    mode='markers',
), row=1, col=1)


# Prediction surface: the four corners of the (feature 0, feature 1) bounding
# box, with the third feature held at its mean.
X1 = np.array([X.iloc[:, 0].min(), X.iloc[:, 0].min(), X.iloc[:, 0].max(), X.iloc[:, 0].max()])
X2 = np.array([X.iloc[:, 1].min(), X.iloc[:, 1].max(), X.iloc[:, 1].min(), X.iloc[:, 1].max()])
X3 = np.full(4, X.iloc[:, 2].mean())
# Predict on a frame carrying the training column names, so the estimator
# receives the same feature layout it was fitted with.
corners = pd.DataFrame(np.column_stack([X1, X2, X3]), columns=X.columns)
Z = logreg_model.predict(corners)


fig.add_trace(go.Mesh3d(
    x=X1,
    y=X2,
    z=Z,
    opacity=0.8,
    name="model"
), row=1, col=1)

# Right panel: one bar group per row of the metrics table.
for _, metrics in error_metrics.iterrows():
    fig.add_trace(go.Bar(name=metrics['CONFIG'], x=["RMSE", "MAE"], y=metrics[['RMSE', 'MAE']]), row=1, col=2)

fig.show()

### AIC and BIC for Multiple Linear Regression

In [None]:
def calculate_aic(n, logloss, num_params):
    """Return the per-sample AIC of a probabilistic classifier.

    The per-sample AIC is ``(-2 / n) * log_likelihood + 2 * num_params / n``.
    ``logloss`` is expected to be the *mean negative log-likelihood*
    (what ``sklearn.metrics.log_loss`` returns by default), so
    ``log_likelihood = -n * logloss`` and the first term reduces to
    ``2 * logloss``. The previous code took ``log(logloss)``, i.e. the
    logarithm of a value that is already a log-likelihood.

    Args:
        n: number of observations the log-loss was computed on (must be > 0).
        logloss: mean negative log-likelihood per observation.
        num_params: number of fitted parameters.

    Returns:
        ``2 * logloss + 2 * num_params / n``; lower is better.
    """
    return 2 * logloss + 2 * num_params / n

# Number of fitted parameters: every entry of the coefficient matrix plus
# every intercept term. `len(coef_)` would only count the rows of the matrix.
num_params = logreg_model.coef_.size + logreg_model.intercept_.size
print('Number of parameters: %d' % (num_params))

# Predicted class probabilities.
yhat = logreg_model.predict_proba(X)

# Log-loss of the predicted probabilities.
logloss = log_loss(Y, yhat)
print('log_loss: %.3f' % logloss)

# AIC from the log-loss and the parameter count.
aic = calculate_aic(len(Y), logloss, num_params)
print('AIC: %.3f' % aic)

# Mean squared error of the predicted class labels, treated as numbers.
mse = mean_squared_error(Y, logreg_model.predict(X))
print('MSE: %.3f' % mse)

# Compute the BIC

from math import log

def calculate_bic(n, logloss, num_params):
    """Return the BIC of a probabilistic classifier.

    BIC is ``-2 * log_likelihood + num_params * log(n)``. ``logloss`` is
    expected to be the *mean negative log-likelihood* (what
    ``sklearn.metrics.log_loss`` returns by default), so
    ``log_likelihood = -n * logloss``. The previous code took
    ``log(logloss)``, i.e. the logarithm of a value that is already a
    log-likelihood.

    Args:
        n: number of observations the log-loss was computed on (must be > 0).
        logloss: mean negative log-likelihood per observation.
        num_params: number of fitted parameters.

    Returns:
        ``2 * n * logloss + num_params * log(n)``; lower is better.
    """
    return 2 * n * logloss + num_params * log(n)
# BIC is likelihood-based, so it is fed the log-loss of the classifier
# (not the MSE of its predicted labels).
bic = calculate_bic(len(Y), logloss, num_params)
print('BIC: %.3f' % bic)


### Logistic Regression

In [None]:
# Preview the first rows of the current `wine` frame.
wine.head()

In [None]:
# From here on, X / Y refer to the training split only.
X = X_train
Y = y_train

In [None]:
# Preview the first target values.
Y.head()


In [None]:
# Work on the training split only.
X = X_train
Y = y_train

# Three liblinear logistic classifiers: default settings, L1 penalty, L2 penalty.
logrmodel = LogisticRegression(random_state=0, solver='liblinear').fit(X, Y)

logrmodel_l1 = LogisticRegression(random_state=0, solver='liblinear', penalty='l1', C=1).fit(X, Y)

# NOTE(review): `penalty='l2'` and `C=1` are scikit-learn's documented defaults,
# so this model appears to be configured identically to `logrmodel` — confirm
# that is intended.
logrmodel_l2 = LogisticRegression(random_state=0, solver='liblinear', penalty='l2', C=1).fit(X, Y)

# The second subplot column is created but no trace is added to it in this cell.
fig = make_subplots(rows=1, cols=2)

# Training observations.
fig.add_trace(go.Scatter(x=X['alcohol'], y=Y, mode='markers', name='train'), row=1, col=1)

# Predicted class per model, drawn over rows ordered by alcohol so each line
# trace runs once from left to right instead of jumping back and forth.
X_sorted = X.sort_values(by='alcohol')
model_names = ["model", "model L1", "model L2"]
# The loop variable is deliberately not called `model`, so it does not
# rebind the global of that name.
for trace_name, classifier in zip(model_names, [logrmodel, logrmodel_l1, logrmodel_l2]):
    fig.add_trace(go.Scatter(
        x=X_sorted['alcohol'], y=classifier.predict(X_sorted), mode='lines', name=trace_name, line_shape='spline'
    ), row=1, col=1)

fig.update_yaxes(title_text="Quality", row=1, col=1)
fig.update_xaxes(title_text="Alcohol", row=1, col=1)

fig.show()

### AIC and BIC for Logistic Regression

In [None]:
# Number of fitted parameters: every entry of the coefficient matrix plus
# every intercept term. `len(coef_)` would only count the rows of the matrix.
num_params = logrmodel.coef_.size + logrmodel.intercept_.size
print('Number of parameters: %d' % (num_params))

# Predicted class probabilities.
yhat = logrmodel.predict_proba(X)

# Log-loss of the predicted probabilities.
logloss = log_loss(Y, yhat)
print('log_loss: %.3f' % logloss)

# AIC from the log-loss and the parameter count.
aic = calculate_aic(len(Y), logloss, num_params)
print('AIC: %.3f' % aic)

# Compute the BIC
def calculate_bic(n, logloss, num_params):
    """Return the BIC of a probabilistic classifier.

    BIC is ``-2 * log_likelihood + num_params * log(n)``. ``logloss`` is
    expected to be the *mean negative log-likelihood* (what
    ``sklearn.metrics.log_loss`` returns by default), so
    ``log_likelihood = -n * logloss``. The previous code took
    ``log(logloss)``, i.e. the logarithm of a value that is already a
    log-likelihood.

    Args:
        n: number of observations the log-loss was computed on (must be > 0).
        logloss: mean negative log-likelihood per observation.
        num_params: number of fitted parameters.

    Returns:
        ``2 * n * logloss + num_params * log(n)``; lower is better.
    """
    return 2 * n * logloss + num_params * log(n)
# BIC of the classifier, from the log-loss and parameter count computed above.
bic = calculate_bic(len(Y), logloss, num_params)
print('BIC: %.3f' % bic)

### Evaluation

In [None]:
# Mean accuracy of `logreg_model` on the training and the held-out split.
# NOTE(review): this scores `logreg_model` (the class-balanced classifier),
# not the `logrmodel*` classifiers — confirm that is the model meant here.
acc_train, acc_test = (
    logreg_model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print(f"accuracy: {acc_train}, accuracy (test): {acc_test}")

## Conclusion

**The best model is simple Linear Regression because it has the lowest AIC score.**