In [2]:
import sys
# Add two relative parent directories to the module search path in one call
# (presumably so the project-local `libs` package can be imported).
sys.path.extend(['../../../', '../'])

In [3]:
import pandas as pd
import numpy as np
from libs.kNearestNeighbor import knn
from libs import metrics

In [4]:
# Remote CSV with the white-wine quality data; fields are separated by ';'.
path = 'https://raw.githubusercontent.com/spinosaphb/ufc-machine-learning/main/datasets/winequality-white.csv'
df = pd.read_csv(path, sep=';')
# Show the first rows as a quick check that the columns were parsed.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
# Count how many rows carry each `quality` label. The recorded output shows a
# strong imbalance (2198 rows for label 6 versus 5 rows for label 9).
df['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [6]:
# Reorder all rows randomly (fixed seed, so the order is reproducible), then
# separate the `quality` target from the remaining feature columns.
dfs = df.sample(frac=1, random_state=42)
y = dfs['quality']
X = dfs.drop(columns='quality')

# Min-max scaling: map every feature column onto the [0, 1] interval.
# NOTE(review): the bounds are taken over all rows, including the ones that
# are later placed in the test split - confirm this is intended.
minValueCol = X.min().to_numpy()
maxValueCol = X.max().to_numpy()
X_normal = (X - minValueCol) / (maxValueCol - minValueCol)
X_normal

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
4656,0.211538,0.205882,0.246988,0.156442,0.115727,0.184669,0.324826,0.127048,0.336364,0.430233,0.478495
3659,0.153846,0.441176,0.096386,0.032209,0.080119,0.111498,0.276102,0.027955,0.436364,0.360465,0.838710
907,0.317308,0.166667,0.234940,0.023006,0.080119,0.097561,0.266821,0.071139,0.509091,0.244186,0.677419
4352,0.336538,0.196078,0.210843,0.015337,0.133531,0.101045,0.322506,0.090033,0.418182,0.290698,0.435484
3271,0.259615,0.235294,0.204819,0.078221,0.103858,0.087108,0.190255,0.091190,0.509091,0.441860,0.645161
...,...,...,...,...,...,...,...,...,...,...,...
4426,0.230769,0.127451,0.313253,0.090491,0.112760,0.090592,0.264501,0.136302,0.454545,0.313953,0.306452
466,0.307692,0.058824,0.192771,0.128834,0.089021,0.181185,0.306265,0.163678,0.454545,0.244186,0.225806
3092,0.365385,0.186275,0.313253,0.039877,0.100890,0.090592,0.331787,0.080586,0.272727,0.360465,0.548387
3772,0.240385,0.156863,0.174699,0.200920,0.077151,0.177700,0.290023,0.165028,0.409091,0.186047,0.419355


## Dividing X and y by the `quality` label

In [7]:
# Distinct quality labels, in order of first appearance in the shuffled data.
values_quality = y.unique()

# Two parallel lookups keyed by quality label: the scaled feature rows ('X')
# and the matching target values ('y') of each class.
dfByQuality = {'X': {}, 'y': {}}
for quality in values_quality:
    is_this_quality = y.eq(quality)
    dfByQuality['X'][quality] = X_normal.loc[is_this_quality]
    dfByQuality['y'][quality] = y.loc[is_this_quality]

## Splitting `X` and `y` into training and testing in a stratified way

In [8]:
# Empty accumulators for the stratified split: the feature matrices start
# with zero rows and 11 columns, the label vectors start with no elements.
X_train = np.empty((0, 11))
X_test = np.empty((0, 11))
y_train = np.array([])
y_test = np.array([])

In [9]:
# Stratified hold-out split: within every quality class the first 70% of the
# (already shuffled) rows go to the training set and the remainder to the
# test set, so both sets keep the class proportions of the full data.
#
# The pieces are collected in local lists and concatenated once, so the
# resulting arrays are rebuilt from scratch on every execution. Appending to
# the previously defined arrays would duplicate every row each time this cell
# is re-run.
TRAIN_FRACTION = 0.7

X_train_parts, X_test_parts = [], []
y_train_parts, y_test_parts = [], []
for quality in values_quality:
    X_class = dfByQuality['X'][quality]
    y_class = dfByQuality['y'][quality]
    # Number of rows (not a percentage) of this class used for training.
    n_train = int(X_class.shape[0] * TRAIN_FRACTION)
    # `.iloc` makes the positional slicing explicit: the index still holds
    # the shuffled original row labels.
    X_train_parts.append(X_class.iloc[:n_train].to_numpy())
    X_test_parts.append(X_class.iloc[n_train:].to_numpy())
    y_train_parts.append(y_class.iloc[:n_train].to_numpy())
    y_test_parts.append(y_class.iloc[n_train:].to_numpy())

X_train = np.concatenate(X_train_parts)
X_test = np.concatenate(X_test_parts)
# Labels are kept as floats, the dtype they had when accumulated onto an
# empty float array.
y_train = np.concatenate(y_train_parts).astype(float)
y_test = np.concatenate(y_test_parts).astype(float)

## Calculating the prediction with the regression method

In [10]:
# One prediction per test row from the project's k-NN regression helper.
# The trailing argument 5 is presumably k, the neighbour count - confirm in
# libs.kNearestNeighbor.
y_pred = [
    knn.predict_regression(X_train, y_train, sample, 5)
    for sample in X_test
]

In [11]:
# Absolute error of every prediction against its true label.
difference = np.abs(np.asarray(y_pred) - y_test)
# Table with prediction, true label and absolute error side by side.
data = np.column_stack((y_pred, y_test, difference))
df_relation = pd.DataFrame(data=data, columns=["y_pred", "y_test", "diff"])

## Regression metrics

### `Mean absolute error`

In [12]:
# Mean absolute error between the true test labels and the k-NN predictions,
# computed by the project-local `metrics` module and printed.
mae = metrics.mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error: {mae}')

Mean absolute error: 0.5466032608695652


### `Mean squared error`

In [15]:
# Mean squared error between the true test labels and the predictions.
# NOTE(review): `squared=True` presumably selects the plain MSE and
# `squared=False` its square root, as in scikit-learn - confirm in
# libs.metrics.
mse = metrics.mean_squared_error(y_test, y_pred, squared=True)
print(f'Mean squared error: {mse}')

Mean squared error: 0.5455706521739131


### `Root mean squared error`

In [16]:
# Root mean squared error: the same helper as for the MSE, called with
# `squared=False`. The result gets its own name so it does not overwrite the
# MSE stored in `mse`, and the printed label names the metric actually shown.
# NOTE(review): `squared=False` presumably returns the square root of the
# MSE, as in scikit-learn - confirm in libs.metrics.
rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)
print(f'Root mean squared error: {rmse}')

Mean squared error: 0.7386275463140494
