# Wine Quality Prediction

In this notebook, we will use Linear Regression to analyze the correlation between wine quality and several key factors of wines, such as pH and density.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge

In [2]:
train = pd.read_csv("wine_dataset/train.csv")
target_column = 'quality'

x_train = train.drop(columns=[target_column]) 
y_train = train[target_column]

valid = pd.read_csv('wine_dataset/val.csv')
x_val = valid.drop(columns=[target_column]) 
y_val = valid[target_column]


test = pd.read_csv('wine_dataset/test.csv')
x_test = test.drop(columns=[target_column])
y_test = test[target_column]

In [3]:
print(x_train.shape)
print(y_train.shape)

print(x_val.shape)
print(y_val.shape)

print(x_test.shape)
print(y_test.shape)

(959, 12)
(959,)
(400, 12)
(400,)
(240, 12)
(240,)


In [4]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

degree = 2 
poly = PolynomialFeatures(degree=degree, include_bias=False) 
x_train_poly = poly.fit_transform(x_train)
x_val_poly = poly.transform(x_val)
x_test_poly = poly.transform(x_test)


In [5]:
x_train_poly.shape

(959, 90)

In [6]:
model = LinearRegression()
model.fit(x_train_poly, y_train)

In [7]:
def compute_mse(y_pred, y_true):
    n = len(y_pred)
    sqaured_errors = (y_pred - y_true)**2
    mse = np.sum(sqaured_errors) / n
    return mse

In [8]:
def print_analysis(model, x, y, set_name):
    y_pred = model.predict(x)
    mse = compute_mse(y_pred, y)
    print(f"Mean Sqaured Error on {set_name} set: {mse}")


### Calculate Mean Squared Error

In [9]:
print_analysis(model, x_train_poly, y_train, 'training')
print_analysis(model, x_val_poly, y_val, 'validation')

Mean Sqaured Error on training set: 0.3306926858618119
Mean Sqaured Error on validation set: 0.4146219036505637


### Applying L2 Regularization (Ridge) to Prevent Overfitting

In [10]:
def compute_mse_given_alpha(alpha):
        model = Ridge(alpha=alpha)
        model.fit(x_train_poly, y_train)
    
        # Calculate MSE using hold-out dataset
        y_val_pred = model.predict(x_val_poly)
        mse = compute_mse(y_val_pred, y_val)
        print(f"Alpha: {alpha}\tMSE: {mse}")
        return mse

def alpha_tuning_using_binary_search(low, high):
    best_alpha = 0
    while low <= high:
        mid = (low + high) // 2
        alpha = low 
        low_mse = compute_mse_given_alpha(low)
        high_mse = compute_mse_given_alpha(mid)
        mid_mse = compute_mse_given_alpha(high)

        if mid_mse < low_mse:
            low = mid + 1
        else:
            high = mid - 1

        best_alpha = mid

    return best_alpha


def alpha_tuning_given_range(low, high):
    best_alpha = 0
    best_mse = float('inf')
    for i in range(low, high+1):
        mse = compute_mse_given_alpha(i)
        if mse < best_mse:
            best_mse = mse
            best_alpha = i
    
    return best_alpha


def alpha_tuning_given_set(alpha_values):
    best_alpha = 0
    best_mse = float('inf')
    for i in alpha_values:
        mse = compute_mse_given_alpha(i)
        if mse < best_mse:
            best_mse = mse
            best_alpha = i
    
    return best_alpha
     

### Hyper-Parameter Tuning: Identifying the best alpha values with 3 different approaches

In [11]:
bs_alpha = alpha_tuning_using_binary_search(0, 500)
print(f'Best alpha (0, 100) using Binary Search: {bs_alpha}')

Alpha: 0	MSE: 0.41296063919290676
Alpha: 250	MSE: 0.39101615028936654
Alpha: 500	MSE: 0.39088043377783777
Alpha: 251	MSE: 0.3910145121539794
Alpha: 375	MSE: 0.39088116743591234
Alpha: 500	MSE: 0.39088043377783777
Alpha: 376	MSE: 0.3908806719185368
Alpha: 438	MSE: 0.39086632930406023
Alpha: 500	MSE: 0.39088043377783777
Alpha: 439	MSE: 0.3908663465227892
Alpha: 469	MSE: 0.3908701843655839
Alpha: 500	MSE: 0.39088043377783777
Alpha: 439	MSE: 0.3908663465227892
Alpha: 453	MSE: 0.3908673511560283
Alpha: 468	MSE: 0.3908699563843424
Alpha: 439	MSE: 0.3908663465227892
Alpha: 445	MSE: 0.39086660391365796
Alpha: 452	MSE: 0.39086723286641617
Alpha: 439	MSE: 0.3908663465227892
Alpha: 441	MSE: 0.3908664030689765
Alpha: 444	MSE: 0.39086654279437494
Alpha: 439	MSE: 0.3908663465227892
Alpha: 439	MSE: 0.3908663465227892
Alpha: 440	MSE: 0.3908663711192183
Best alpha (0, 100) using Binary Search: 439


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [12]:
bf_alpha = alpha_tuning_given_range(0, 100)
print(f'Best alpha (0, 100) using brute force: {bf_alpha}')

Alpha: 0	MSE: 0.41296063919290676
Alpha: 1	MSE: 0.4003508951519521
Alpha: 2	MSE: 0.39849807963811285
Alpha: 3	MSE: 0.39710823775441156
Alpha: 4	MSE: 0.3960354138886136
Alpha: 5	MSE: 0.39519104775147873
Alpha: 6	MSE: 0.3945161802150375
Alpha: 7	MSE: 0.3939699156608134
Alpha: 8	MSE: 0.3935230381206978
Alpha: 9	MSE: 0.39315416775779694
Alpha: 10	MSE: 0.3928473395930267
Alpha: 11	MSE: 0.3925904244096163
Alpha: 12	MSE: 0.39237406850869777
Alpha: 13	MSE: 0.39219096377886176
Alpha: 14	MSE: 0.39203533413822855
Alpha: 15	MSE: 0.3919025672548485
Alpha: 16	MSE: 0.3917889459841287
Alpha: 17	MSE: 0.3916914495870035
Alpha: 18	MSE: 0.3916076046145666
Alpha: 19	MSE: 0.39153537170854363
Alpha: 20	MSE: 0.39147305871807453
Alpha: 21	MSE: 0.39141925336286504
Alpha: 22	MSE: 0.39137277055974573
Alpha: 23	MSE: 0.3913326108822984
Alpha: 24	MSE: 0.3912979275413002
Alpha: 25	MSE: 0.3912679999551137
Alpha: 26	MSE: 0.39124221245202334
Alpha: 27	MSE: 0.39122003699968727
Alpha: 28	MSE: 0.3912010191232184
Alpha: 29	

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [13]:
alpha_values = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]
set_alpha = alpha_tuning_given_set(alpha_values)
print(f'Best alpha (0, 5, 10, 15, 20, 25, 30, 35, 40) given values: {set_alpha}')

Alpha: 0	MSE: 0.41296063919290676
Alpha: 5	MSE: 0.39519104775147873
Alpha: 10	MSE: 0.3928473395930267
Alpha: 15	MSE: 0.3919025672548485
Alpha: 20	MSE: 0.39147305871807453
Alpha: 25	MSE: 0.3912679999551137
Alpha: 30	MSE: 0.3911709387086622
Alpha: 35	MSE: 0.391129061693851
Alpha: 40	MSE: 0.39111625764851793
Alpha: 45	MSE: 0.39111879203956773
Best alpha (0, 5, 10, 15, 20, 25, 30, 35, 40) given values: 40


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


### Identifying the best alpha

Let's identify the minimum MSE alpha value obtained from all 3 approaches

In [14]:
best_alpha = min(compute_mse_given_alpha(bs_alpha),
                 compute_mse_given_alpha(bf_alpha), 
                 compute_mse_given_alpha(set_alpha))

Alpha: 439	MSE: 0.3908663465227892
Alpha: 41	MSE: 0.391115843512288
Alpha: 40	MSE: 0.39111625764851793


In [15]:
# Taking the alpha with minimum MSE from all these approaches

model = Ridge(alpha=best_alpha)
model.fit(x_train_poly, y_train)
print_analysis(model, x_test_poly, y_test, 'test')

Mean Sqaured Error on test set: 0.4763901166183406
