In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [2]:
dataset = pd.read_csv("../NS2021_Dicer/wine-dataset.csv")

In [3]:
dataset

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [4]:
# kvalita vín je ohodnotená na stupnici 0 až 10
# najlepšie vína sú v datasete ohodnotené deviatimi bodmi a najhoršie tromi.
# Kedže tieto čísla nie su pre nás ideálne rozdelíme si ich do kategórií:
# nízka kvalita (0) , stredná kvalita (1) , vysoká kvalita (2)
bins= [0, 5.5, 7.5, 10] # vína s hodnotením 3-5 spadajú do kategórie nízka kvalita, 6-7 je stredná a 8-9 vysoká 
labels = [0, 1, 2]
dataset['quality'] = pd.cut(dataset['quality'], bins=bins, labels=labels)

In [5]:
dst = dataset.fillna(dataset.mean())
dst

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.450000,8.8,1
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.490000,9.5,1
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.440000,10.1,1
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,1
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.580000,10.5,0
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.531215,11.2,1
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.750000,11.0,1
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.710000,10.2,0


In [6]:
x = dst.drop(columns=['type','quality'])
y = dst['quality']
sc = StandardScaler()
x = sc.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=42)

In [7]:
for data in [y_train, y_test]:
    print(data.describe())

count     5197
unique       3
top          1
freq      3115
Name: quality, dtype: int64
count     1300
unique       3
top          1
freq       800
Name: quality, dtype: int64


In [8]:
n3 = KNeighborsClassifier(n_neighbors = 3)
n3.fit(x_train, y_train)
pred_n3 = n3.predict(x_test)
print(classification_report(y_test, pred_n3))
cross_val = cross_val_score(estimator=n3, X=x_train, y=y_train, cv=10)
print(cross_val.mean())

              precision    recall  f1-score   support

           0       0.68      0.66      0.67       468
           1       0.78      0.80      0.79       800
           2       0.25      0.19      0.21        32

    accuracy                           0.73      1300
   macro avg       0.57      0.55      0.56      1300
weighted avg       0.73      0.73      0.73      1300

0.7190703275529865


In [9]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
pred_dt = dt.predict(x_test)
print(classification_report(y_test, pred_dt))
cross_val = cross_val_score(estimator=dt, X=x_train, y=y_train, cv=10)
print(cross_val.mean())

              precision    recall  f1-score   support

           0       0.70      0.75      0.72       468
           1       0.83      0.78      0.80       800
           2       0.20      0.28      0.23        32

    accuracy                           0.76      1300
   macro avg       0.58      0.60      0.59      1300
weighted avg       0.76      0.76      0.76      1300

0.7248395583222174


In [10]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
pred_rf = rf.predict(x_test)
print(classification_report(y_test, pred_rf))
cross_val = cross_val_score(estimator=rf, X=x_train, y=y_train, cv=10)
print(cross_val.mean())

              precision    recall  f1-score   support

           0       0.78      0.76      0.77       468
           1       0.84      0.88      0.86       800
           2       1.00      0.25      0.40        32

    accuracy                           0.82      1300
   macro avg       0.87      0.63      0.68      1300
weighted avg       0.82      0.82      0.81      1300

0.8043085815918186
