In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

plt.style.use('ggplot')
sb.set_style('whitegrid')

In [2]:
red = pd.read_csv('winequality-red.csv', sep= ';')

In [3]:
red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
white = pd.read_csv('winequality-white.csv', sep= ';')

In [5]:
white.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [8]:
red['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

In [96]:
X = red.drop('quality', axis = 1)
y = red['quality']
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [14]:
from sklearn.preprocessing import StandardScaler

In [16]:
scaler = StandardScaler()

In [17]:
scaler.fit(Xr_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [18]:
Xr_train = scaler.transform(Xr_train)

In [19]:
Xr_test = scaler.transform(Xr_test)

In [22]:
from sklearn.linear_model import LinearRegression

In [97]:
lin = LinearRegression()

In [98]:
lin.fit(Xr_train,yr_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [99]:
lin_pr = lin.predict(Xr_test)

In [100]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report

In [101]:
print(mean_absolute_error(yr_test, lin_pr))

0.5163348830985564


In [102]:
print(np.sqrt(mean_squared_error(yr_test, lin_pr)))

0.6513095193379136


In [103]:
from sklearn.linear_model import LogisticRegression

In [104]:
log = LogisticRegression()

In [105]:
log.fit(Xr_train, yr_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [106]:
log_pred = log.predict(Xr_test)

In [107]:
print(classification_report(yr_test, log_pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        19
           5       0.62      0.76      0.68       217
           6       0.50      0.61      0.55       213
           7       0.57      0.06      0.10        70
           8       0.00      0.00      0.00         7

    accuracy                           0.56       528
   macro avg       0.28      0.24      0.22       528
weighted avg       0.54      0.56      0.52       528



  'precision', 'predicted', average, warn_for)


In [38]:
yr_test

803     6
124     5
350     6
682     5
1326    6
       ..
813     4
377     7
898     7
126     5
819     5
Name: quality, Length: 528, dtype: int64

In [74]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [50]:
from sklearn.model_selection import GridSearchCV

In [75]:
grid = GridSearchCV(estimator= KNeighborsRegressor(), param_grid= {'n_neighbors': [i for i in range(1,100)]}, verbose= 1)

In [76]:
grid.fit(Xr_train, yr_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 99 candidates, totalling 297 fits


[Parallel(n_jobs=1)]: Done 297 out of 297 | elapsed:    8.8s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30, ...]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [77]:
grid.best_params_

{'n_neighbors': 46}

In [109]:
knn = KNeighborsRegressor(n_neighbors=46)

In [110]:
knn.fit(Xr_train, yr_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=46, p=2,
                    weights='uniform')

In [111]:
k_pred = knn.predict(Xr_test)

In [112]:
print(classification_report(yr_test, k_pred))

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [113]:
print(mean_absolute_error(yr_test, k_pred))

0.6205533596837945


In [114]:
print(np.sqrt(mean_squared_error(yr_test, k_pred)))

0.7706202804611841


In [115]:
pred = pd.DataFrame(lin_pr, columns= ['Predicted'])

In [116]:
yr_test = yr_test.values

In [117]:
true = pd.DataFrame(yr_test, columns= ['True'])

In [118]:
res = pd.concat([true, pred], axis = 1)

In [119]:
res

Unnamed: 0,True,Predicted
0,6,5.353631
1,5,5.094950
2,6,5.585966
3,5,5.419458
4,6,5.744237
...,...,...
523,4,6.025793
524,7,6.819990
525,7,6.599179
526,5,4.842761


In [146]:
float(res.to_numpy()[4][1]

5.744237185147895