In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.linear_model import LinearRegression  
import warnings
warnings.filterwarnings('ignore')

In [3]:
np.random.seed(42)

In [4]:
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",sep = ';')

In [5]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [6]:
df.tail()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.5,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.9949,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7
4897,6.0,0.21,0.38,0.8,0.02,22.0,98.0,0.98941,3.26,0.32,11.8,6


In [7]:
df['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [8]:
df.shape

(4898, 12)

In [9]:
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [10]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [11]:
for i in df.columns:
    df[i] = df[i].fillna(np.mean(df[i]))

In [12]:
y = df.pop('quality')

In [13]:
y

0       6
1       6
2       6
3       6
4       6
       ..
4893    6
4894    5
4895    6
4896    7
4897    6
Name: quality, Length: 4898, dtype: int64

In [14]:
train_x, test_x, train_y, test_y = train_test_split(df, y, test_size =0.2)

In [None]:
def metric_score(actual, pred):
    print("confusion matrix", confusion_matrix(actual, pred))
    print("accuracy score", accuracy_score(actual, pred))
    print("recall_score", recall_score(actual, pred))
    print("precision score", precision_score(actual, pred))
    print("f1 score", f1_score(actual, pred))
    print("roc auc score", roc_auc_score(actual, pred))
    print("classification report", classification_report(actual, pred))

In [None]:
def run_model(model, train_X, train_y, test_X, sample, filename):
    model.fit(train_X, train_y)
    pd.Series(model.coef_, index =test_X.columns).plot(kind = 'bar')
    pred_train = model.predict(train_X)
    pred_test = model.predict(test_X)
    pred_test = abs(pred_test)
    print('MAE: ', mean_absolute_error(train_y, pred_train))
    #print('RMSE: ', np.sqrt(mean_squared_error(train_y, pred_train)))
    cv_score = cross_val_score(model, train_X, train_y, cv = 10, scoring ='neg_mean_absolute_error')
    print('cv_score: ', np.mean(np.abs(cv_score)))
    print('R-squared: ', r2_score(train_y, pred_train))
    sample["Item_Outlet_Sales"] = pred_test
    #sample.to_csv(filename, index = False)
    sample.head()

In [16]:
def run_model(model, train_X, train_y, test_X):
    model.fit(train_X, train_y)
    pd.Series(model.coef_, index =test_X.columns).plot(kind = 'bar')
    pred_train = model.predict(train_X)
    pred_test = model.predict(test_X)
    print("confusion matrix", confusion_matrix(actual, pred))
    print("accuracy score", accuracy_score(actual, pred))
    print('cv_score: ', np.mean(np.abs(cv_score)))
    print('R-squared: ', r2_score(train_y, pred_train))

In [17]:
run_model

<function __main__.run_model(model, train_X, train_y, test_X)>

In [18]:
def model_fit(alg, train_x, test_x, train_y, test_y, if_cv = True, cv_folds = 5):
    alg.fit(train_x, train_y)
    
    #Cross-val
    if if_cv:
        cv_score = cross_val_score(alg, train_x, train_y, cv = cv_folds, scoring = 'f1_macro')
    
    prediction = alg.predict(test_x)
    
    if if_cv:
        print('CV report: Mean - %.3g | Std - %.3g | Min - %.3g, Max- %.3g' %(np.mean(cv_score),
                                                                              np.std(cv_score), 
                                                                              np.min(cv_score), 
                                                                              np.max(cv_score)))
    print("Accuracy: ", accuracy_score(test_y, prediction))
    print('-'*100)
    
    cm = pd.DataFrame(confusion_matrix(test_y, prediction))
    print(cm)


In [19]:
lr = LogisticRegression()
model_fit(lr, train_x, test_x, train_y, test_y)

CV report: Mean - 0.144 | Std - 0.00647 | Min - 0.132, Max- 0.15
Accuracy:  0.45918367346938777
----------------------------------------------------------------------------------------------------
   0  1   2    3  4  5
0  1  0   2    2  0  0
1  0  0   8   17  0  0
2  0  0  93  197  1  0
3  0  0  78  353  1  0
4  0  0  20  169  3  0
5  0  0   1   34  0  0


In [20]:
knn = KNeighborsClassifier()
model_fit(knn, train_x, test_x, train_y, test_y)

CV report: Mean - 0.241 | Std - 0.0155 | Min - 0.22, Max- 0.257
Accuracy:  0.4826530612244898
----------------------------------------------------------------------------------------------------
   0  1    2    3   4  5
0  0  0    3    1   1  0
1  1  3   13    7   1  0
2  0  9  141  123  18  0
3  0  6  111  266  47  2
4  0  4   35   89  60  4
5  0  0    3   20   9  3


In [21]:
def KNN_predicts(train_x, test_x, train_y, test_y, scaler, neighbours, metric ="manhattan", weights = 'uniform'):
    train_scaled = scaler.fit_transform(train_x)
    test_scaled = scaler.transform(test_x)
    
    KNN = KNeighborsClassifier(n_neighbors = neighbours, metric = metric, weights = weights, n_jobs = -1)
    KNN.fit(train_scaled, train_y)
    prediction= KNN.predict(test_scaled)
    
    print("accuracy: ", accuracy_score(test_y, prediction))
    #print("accuracy: ", accuracy_score(train_x, prediction))
    print("-"*100)
    return KNN

In [22]:
KNN_predicts(train_x, test_x, train_y, test_y, StandardScaler(),1)

accuracy:  0.6479591836734694
----------------------------------------------------------------------------------------------------


KNeighborsClassifier(metric='manhattan', n_jobs=-1, n_neighbors=1)

In [None]:
## Neigbours tuning

In [23]:
for k in range(1, 11):
    print("Accuracy score on KNN using n_neighbors = {0}". format(k), end = ' ')
    KNN_predicts(train_x, test_x, train_y, test_y, StandardScaler(), k)

Accuracy score on KNN using n_neighbors = 1 accuracy:  0.6479591836734694
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using n_neighbors = 2 accuracy:  0.5724489795918367
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using n_neighbors = 3 accuracy:  0.5693877551020409
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using n_neighbors = 4 accuracy:  0.5551020408163265
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using n_neighbors = 5 accuracy:  0.5795918367346938
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using n_neighbors = 6 accuracy:  0.5561224489795918
---------------------------------------------------

In [24]:
#distance metric -- manhattan give better result

In [25]:
k = 5 
for metric in ['euclidean', 'minkowski', 'manhattan', 'chebyshev']:
    print("Accuracy score on KNN using {} metric and {} neighbors :" .format(metric, k), end = ' ')
    KNN_predicts(train_x, test_x, train_y, test_y, StandardScaler(), k, metric)

Accuracy score on KNN using euclidean metric and 5 neighbors : accuracy:  0.5428571428571428
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using minkowski metric and 5 neighbors : accuracy:  0.5428571428571428
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using manhattan metric and 5 neighbors : accuracy:  0.5795918367346938
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using chebyshev metric and 5 neighbors : accuracy:  0.5244897959183673
----------------------------------------------------------------------------------------------------


In [26]:
#weight parameters

In [27]:
for weight in ['uniform', 'distance']:
    print("Accuracy score on KNN using neighbors = {0}:" .format(weight), end = ' ')
    KNN_predicts(train_x, test_x, train_y, test_y, StandardScaler(), 5, metric = 'manhattan', weights = weight)

Accuracy score on KNN using neighbors = uniform: accuracy:  0.5795918367346938
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using neighbors = distance: accuracy:  0.6806122448979591
----------------------------------------------------------------------------------------------------


In [None]:
## Model pickle

In [None]:
import joblib
joblib.dump(KNN, 'KNNModel.pkl')

In [None]:
KNN = KNeighborsClassifier(n_neighbors = 5, metric = 'manhattan', weights = 'distance', n_jobs = -1)
#KNN.fit(train_scaled, train_y)

In [None]:
import os
os.getcwd()

In [None]:
KNN_pickled_model = joblib.load('KNNModel.pkl')

In [None]:
KNN_pickled_model.fit(train_x, train_y)
prediction = KNN_pickled_model.predict(test_x)