In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn.model_selection as ms

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Resolve the red-wine CSV inside the Kaggle input directory and load it.
redWineFilePath = os.path.abspath(
    '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)
redWineData = pd.read_csv(redWineFilePath, engine='python')

# Print per-column summary statistics of the loaded frame.
print(redWineData.describe())


It seems there is no missing data.
Let's take a closer look at the data.

Let's divide the wines into 'good' (quality of 7 or higher) and 'bad' (quality below 7) to support feature selection and to make the plots more readable:

In [None]:
# Binary label: 1 for wines with quality 7 or higher, 0 for everything else.
redWineData['goodWine'] = redWineData['quality'].ge(7).astype('int64')
redWineData.head()

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sn
# Pairwise correlation between all columns currently in redWineData.
corr = redWineData.corr()

# Draw the annotated heatmap on a single, explicitly sized figure.
# The original opened a second, empty 10x10 figure after the heatmap was
# drawn, so plt.show() rendered an extra blank plot underneath it.
plt.figure(figsize=(16,16))
sn.heatmap(corr,annot=True)
plt.show()

In [None]:
# Apply seaborn's theme with color_codes enabled.
sn.set(color_codes=True)

# Plot every column except the target and the derived good/bad label;
# the label is only used to colour the points.
var = [column for column in redWineData.columns
       if column not in ('quality', 'goodWine')]
print(var)

sn.pairplot(redWineData, vars=var, hue='goodWine', palette="husl")
plt.show()

We see that fixed acidity has only a small influence on quality, so let's remove it.

In [None]:
# Build a reduced frame without 'fixed acidity'; redWineData itself is not modified.
redWineDataOptimized = redWineData.drop(columns=['fixed acidity'])

Split the data into the prediction target (`quality`) and the features to work on:

In [None]:
# Target: the wine quality score.
y = redWineDataOptimized['quality']

# Features: every column except the target AND the 'goodWine' flag.
# goodWine is computed directly from quality (quality >= 7), so keeping it in
# the feature matrix would leak the target into the model inputs.
x = redWineDataOptimized.drop(['quality', 'goodWine'], axis=1)
x.head()

Transform the features to a standard scale (zero mean, unit variance):

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler used to standardise the feature columns.
scale = StandardScaler()

# Fit the scaler on x and replace x with its scaled values.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split that follows, so validation rows influence the scaling statistics.
x = scale.fit(x).transform(x)

Split the data into training and validation sets:

In [None]:
# Hold out part of the rows for validation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
xTrain, xVal, yTrain, yVal = ms.train_test_split(
    x,
    y,
    random_state=1,
)

Use KNN with cross-validation to find the best value for k:

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier
import math
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

def crossVal(k_candy, features=None, target=None, folds=3):
    """Return the cross-validated mean absolute error of a KNN classifier.

    Parameters
    ----------
    k_candy : int
        Number of neighbours (``n_neighbors``) to evaluate.
    features, target : array-like, optional
        Data to cross-validate on. They default to the training split
        (``xTrain`` / ``yTrain``) so that the held-out validation rows never
        take part in choosing k. The original used the full ``x`` / ``y``.
    folds : int, default 3
        Number of cross-validation folds.

    Returns
    -------
    float
        Mean absolute error averaged over the folds (lower is better).
    """
    if features is None:
        features = xTrain
    if target is None:
        target = yTrain

    myPipeline = Pipeline(steps=[
        # Re-fit the scaler inside every fold so that the fold's held-out rows
        # do not contribute to the scaling statistics.
        ('scaler', StandardScaler()),
        ('mymodel', KNeighborsClassifier(n_neighbors=k_candy, weights='distance')),
    ])
    # The scorer returns the negated MAE (higher is better); flip the sign back.
    scores = -1 * cross_val_score(myPipeline, features, target, cv=folds,
                                  scoring='neg_mean_absolute_error')
    return scores.mean()

# Candidate neighbour counts: the odd numbers 1, 3, ..., 199.
neighbourCandy = list(range(1, 200, 2))
print(neighbourCandy)

# NOTE(review): bestModel is created with k=1 and is not updated anywhere in
# this cell - confirm that a later cell rebuilds it with nrNeighbours.
bestModel = KNeighborsClassifier(n_neighbors=1, weights='distance')

err = {}            # cross-validated error keyed by k
scores_list = []    # the same errors, in candidate order, for plotting
bestVal, nrNeighbours = math.inf, 0
for k in neighbourCandy:
    currentError = crossVal(k)
    err[k] = currentError
    scores_list.append(currentError)
    # Strict comparison: on a tie the smaller (earlier) k is kept.
    if currentError < bestVal:
        bestVal, nrNeighbours = currentError, k
print(bestVal, nrNeighbours)

Plot the cross-validated error for each value of k:

In [None]:

# Error curve: cross-validated error for each candidate k, drawn on the
# current axes.
errorAxes = plt.gca()
errorAxes.plot(neighbourCandy, scores_list)
errorAxes.set_xlabel("Value of K")
errorAxes.set_ylabel("mean absolute error")