# Wine Quality Prediction

Implementation of a KNN classifier from scratch then using Cross Validation and Grid Search to improve the accuracy.

In [None]:
import numpy as np
import pandas as pd
import csv
import math
import warnings

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
def LoadData():
    df = pd.read_csv('wine_quality1.csv', names=['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality'])
    return df

In [3]:
def data_wrangling(df):
    
    df.dropna(thresh=None,axis=0,inplace=True)
    vc = df['quality'].value_counts()
    u  = [i not in set(vc[vc==1].index) for i in df['quality']]  #returns boolean value(true for all values having count>1)
    df = df[u]
    
    return df

In [4]:
def separate_quality_column(df):
    
    X = df.drop("quality",axis=1)
    Y = df["quality"]
    
    return X,Y

In [5]:
def split_Data(X,Y):
    
    Training_Set, Test_Set, Train_Set_Quality, Test_Set_Quality= train_test_split(X, Y, test_size = 0.2, random_state = 42, stratify=Y)
    return Training_Set, Test_Set, Train_Set_Quality, Test_Set_Quality

In [6]:
def scaler(Training_Set, Test_Set):
    
    sc = StandardScaler()
    Training_Set = sc.fit_transform(Training_Set)
    Test_Set = sc.fit_transform(Test_Set)
    
    return Training_Set, Test_Set

In [7]:
def KNN(K, Training_Set, Train_Set_Quality, Test_Set, Test_Set_Quality):
    
    classifier = KNeighborsClassifier(n_neighbors = K, metric = 'euclidean')

    classifier.fit(Training_Set, Train_Set_Quality)
    y_predicted = classifier.predict(Test_Set)

    print ('Accuracy obtained with KNN: {0} %'.format(accuracy_score(Test_Set_Quality,y_predicted) * 100))

    return classifier

In [8]:
def Cross_Validation(classifier, Training_Set, Train_Set_Quality, cv):
    
    cv_scores = cross_val_score(classifier, Training_Set, Train_Set_Quality, cv=5)
    print ('Accuracy obtained after cross validation: {0} %'.format(np.mean(cv_scores) * 100))

In [9]:
def Grid_Search_CV(K1, K2, Training_Set, Train_Set_Quality, cv):
    
    classifier_2 = KNeighborsClassifier()
    param_grid = {"n_neighbors": np.arange(K1, K2)}
    knn_gscv = GridSearchCV(classifier_2, param_grid, cv=5)
    knn_gscv.fit(Training_Set, Train_Set_Quality)
    print ("Optimized K: ",knn_gscv.best_params_)
    print ("Accuracy obtained after GridSearchCV: {0} %".format(knn_gscv.best_score_ * 100))

In [10]:
def main():
    
    df = LoadData()
    dflength = len(df)
    
    df = data_wrangling(df)
    
    X,Y = separate_quality_column(df)
    warnings.filterwarnings("ignore")
    
    Training_Set, Test_Set, Train_Set_Quality, Test_Set_Quality = split_Data(X, Y)
    Training_Set, Test_Set = scaler(Training_Set, Test_Set)

    classifier = KNN(4, Training_Set, Train_Set_Quality, Test_Set, Test_Set_Quality)
    
    
    cv=5
    Cross_Validation(classifier, Training_Set, Train_Set_Quality, cv)
    
    K1 = 1
    K2 = 26
    Grid_Search_CV(K1, K2, Training_Set, Train_Set_Quality, cv)
    
main()

Accuracy obtained with KNN: 52.755102040816325 %
Accuracy obtained after cross validation: 56.35520108426512 %
Optimized K:  {'n_neighbors': 1}
Accuracy obtained after GridSearchCV: 61.23054969114082 %
