In [None]:
#import libraries
import pandas as pd
# Show every column and every row when a frame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Turn off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None  # default='warn'

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans 
from sklearn import preprocessing
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

print('Libraries imported!')

In [None]:
# Load the red-wine quality CSV into a DataFrame and preview the first five rows.
pd.set_option('display.max_columns', None)
wine_csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df_wine = pd.read_csv(wine_csv_path, sep=',')
df_wine.head()

In [None]:
# Keep an independent snapshot of the data as loaded; df_wine is modified by later cells.
df_wine_original = df_wine.copy(deep=True)

## Data Exploration

In [None]:
# (number of rows, number of columns) of the full wine dataset
df_wine.shape

In [None]:
# Summary of each column: dtype and non-null count, plus overall memory usage
df_wine.info()

In [None]:
# Statistical summary of the dataframe.
# include="all" asks describe() to summarise every column regardless of dtype.
pd.set_option('display.max_columns', None)
df_wine.describe(include="all")

## Bivariate analysis

### Dependent Variable

In [None]:
# Count how many wines received each quality score (the target variable)
df_wine['quality'].value_counts()

There are 681 wines with a quality of 5, 638 with 6, 199 with 7, 53 with 4, 18 with 8 and 10 with 3.

In [None]:
# normalize=True returns each score's share of the rows instead of the raw count

df_wine['quality'].value_counts(normalize=True)

In [None]:
# Bar chart of the number of wines per quality score
quality_counts = df_wine['quality'].value_counts()
quality_counts.plot.bar()
plt.show()

### Independent Variables

In [None]:
# Alcohol: distribution plot (left panel) next to a box plot (right panel).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept
# here so the figure stays exactly as before.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.distplot(df_wine['alcohol'], ax=ax_dist)
df_wine['alcohol'].plot.box(ax=ax_box)
plt.show()

The distribution is roughly normal, but skewed to the right.

In [None]:
# Fixed acidity: distribution plot (left panel) next to a box plot (right panel)
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.distplot(df_wine['fixed acidity'], ax=ax_dist)
df_wine['fixed acidity'].plot.box(ax=ax_box)
plt.show()

The distribution is roughly normal, and there are a few outliers.

In [None]:
# Density: distribution plot (left panel) next to a box plot (right panel)
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.distplot(df_wine['density'], ax=ax_dist)
df_wine['density'].plot.box(ax=ax_box)
plt.show()

In [None]:
# Total sulfur dioxide: distribution plot (left panel) next to a box plot (right panel)
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.distplot(df_wine['total sulfur dioxide'], ax=ax_dist)
df_wine['total sulfur dioxide'].plot.box(ax=ax_box)
plt.show()

The distribution is skewed to the right and there are lots of outliers

In [None]:
# Free sulfur dioxide: distribution plot (left panel) next to a box plot (right panel)
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.distplot(df_wine['free sulfur dioxide'], ax=ax_dist)
df_wine['free sulfur dioxide'].plot.box(ax=ax_box)
plt.show()

The distribution is skewed to the right and there are lots of outliers

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
sns.set(font_scale=0.9)
matrix = df_wine.corr()
plt.figure(figsize=(12, 8))
heatmap_options = {
    'annot': True,              # print the coefficient inside each cell
    'annot_kws': {'size': 10},
    'fmt': '.1f',               # one decimal place
    'cmap': 'PiYG',
    'linewidths': .2,
}
ax = sns.heatmap(matrix, **heatmap_options)
plt.show()

Most positively correlated variables include (fixed acidity and citric acid), (fixed acidity and density), (total sulfur dioxide and free sulfur dioxide). Other positively correlated variables include (alcohol and quality).

Most negatively correlated variables include (fixed acidity and pH), (citric acid and volatile acidity). Other negatively correlated variables include (pH and citric acid), (alcohol and density).

In [None]:
# Binarize the target with an arbitrary cutoff on wine quality:
# scores of 7 or higher become 'good' (1), everything below becomes 'not good' (0).
#
# The threshold is applied to the untouched copy (df_wine_original) and assigned back
# as a whole column, for two reasons:
#   * re-running the cell is safe - replacing values in df_wine a second time would
#     map the freshly created 1s back to 0, because 1 is itself in the 0..6 range;
#   * an inplace replace on a selected column (df['col'].replace(..., inplace=True))
#     is not guaranteed to write through to the parent frame in current pandas.
GOOD_QUALITY_CUTOFF = 7
df_wine['quality'] = (df_wine_original['quality'] >= GOOD_QUALITY_CUTOFF).astype('int64')

In [None]:
# Number of wines in each class after binarization (0 = not good, 1 = good)
df_wine['quality'].value_counts()

In [None]:
# Bar chart of the two quality classes (0 = not good, 1 = good)
class_counts = df_wine['quality'].value_counts()
class_counts.plot.bar()
plt.show()

The proportion of wine classed as 'not good' (0) is far greater than the proportion classed as 'good' (1): 1,382 wines versus 217, so the two classes are heavily imbalanced.

In [None]:
# Bucket the alcohol values into three ranges and compare the quality mix per bucket.
bins = [0, 10, 12.5, 15]
group = ['Low', 'Average', 'High']
# add a column that labels each wine's alcohol content with the bin it falls in
df_wine['alcohol_bin'] = pd.cut(df_wine['alcohol'], bins=bins, labels=group)

# rows = alcohol bin, columns = quality class
alcohol_bin = pd.crosstab(df_wine['alcohol_bin'], df_wine['quality'])
# divide every row by its own total so each bar shows shares that sum to 1
row_totals = alcohol_bin.sum(axis=1).astype(float)
alcohol_share = alcohol_bin.div(row_totals, axis=0)

share_ax = alcohol_share.plot(kind='bar', stacked=True)
share_ax.set_xlabel('Alcohol')
share_ax.set_ylabel('Percentage')

plt.show()

The alcohol content is related to the quality of the wine: the lower the alcohol content, the smaller the share of wines classed as 'good'.

### Outlier Treatment

In [None]:
# The total sulfur dioxide distribution is skewed to the right (longer right tail).
# A log transformation reduces the skew: it barely changes small values
# but shrinks large values considerably.
total_so2_log = np.log(df_wine['total sulfur dioxide'])
df_wine['total sulfur dioxide_log'] = total_so2_log
df_wine['total sulfur dioxide_log'].hist(bins=20)

In [None]:
# Apply the same log transformation to free sulfur dioxide and plot its histogram
free_so2_log = np.log(df_wine['free sulfur dioxide'])
df_wine['free sulfur dioxide_log'] = free_so2_log
df_wine['free sulfur dioxide_log'].hist(bins=20)

In [None]:
# Remove the two raw columns that now have log-transformed versions,
# plus the helper alcohol_bin column that was only needed for the bar chart.
columns_to_drop = ['free sulfur dioxide', 'total sulfur dioxide', 'alcohol_bin']
df_wine = df_wine.drop(columns=columns_to_drop)

## Model Development

In [None]:
# Separate the predictors (X) from the target (y).
# `columns=` replaces the positional axis argument of drop('quality', 1),
# which pandas 2.0 and later no longer accept.
X = df_wine.drop(columns='quality')
# double brackets keep y as a single-column DataFrame
y = df_wine[['quality']]

In [None]:
#split the dataset into a training part and a held-out test part (70% / 30%)

from sklearn.model_selection import train_test_split

# random_state pins the shuffle, so every run produces the same split and the
# accuracies below are reproducible.
# stratify=y keeps the ratio of the two quality classes the same in both parts,
# which matters because the classes are heavily imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print ('Train set:', x_train.shape,  y_train.shape) 
print ('Test set:', x_test.shape,  y_test.shape) 

In [None]:
#import KNN library and accuracy
from sklearn.neighbors import KNeighborsClassifier 
from sklearn import metrics
from sklearn.metrics import accuracy_score

# start with k=4
k = 4
# Train the model. The labels are flattened to a 1-D array because scikit-learn
# expects 1-D targets; passing the single-column frame as-is triggers a
# data-conversion warning on every fit.
# NOTE(review): KNN is distance-based and the features are used unscaled here.
neigh = KNeighborsClassifier(n_neighbors=k).fit(x_train, np.ravel(y_train))
# predict the class of every wine in the test set
yhat = neigh.predict(x_test)
# Evaluate the accuracy on the data the model was fitted on and on the held-out data
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(x_train))) 
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat)) 

In [None]:
# Calculate the test-set accuracy for every k from 1 to Ks-1 (1..9 with Ks = 10).
# mean_acc[i] / std_acc[i] hold the accuracy and its standard error for k = i + 1.
Ks = 10
mean_acc = np.zeros(Ks - 1)
std_acc = np.zeros(Ks - 1)

# Flatten the labels once. Working with 1-D arrays removes the hard-coded
# reshape(480, 1), which only worked for one specific test-set size, and keeps
# the element-wise comparison below a plain boolean array.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

for n in range(1, Ks):
    # Train the model with n neighbours and predict the test set
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(x_train, y_train_flat)
    yhat = neigh.predict(x_test)
    mean_acc[n - 1] = metrics.accuracy_score(y_test_flat, yhat)
    # standard error of the accuracy: std of the hit/miss indicator over sqrt(sample size)
    is_correct = (yhat == y_test_flat)
    std_acc[n - 1] = np.std(is_correct) / np.sqrt(is_correct.size)
mean_acc
    

In [None]:
# Plot test accuracy against k, with shaded bands of +/- 1 and +/- 3 standard errors
k_values = range(1, Ks)
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(k_values, mean_acc, 'g')

band_1_low, band_1_high = mean_acc - 1 * std_acc, mean_acc + 1 * std_acc
band_3_low, band_3_high = mean_acc - 3 * std_acc, mean_acc + 3 * std_acc
acc_ax.fill_between(k_values, band_1_low, band_1_high, alpha=0.10)
acc_ax.fill_between(k_values, band_3_low, band_3_high, alpha=0.10, color="green")

acc_ax.legend(('Accuracy ', '+/- 1xstd', '+/- 3xstd'))
acc_ax.set_ylabel('Accuracy ')
acc_ax.set_xlabel('Number of Neighbors (K)')
acc_fig.tight_layout()
plt.show()

In [None]:
# Report the highest test accuracy and the k that produced it.
# mean_acc[i] belongs to k = i + 1, hence the +1 on the position of the maximum.
best_position = mean_acc.argmax()
best_accuracy = mean_acc[best_position]
print( "The best accuracy was with", best_accuracy, "with k=", best_position + 1)

Let's use k=2 to refit the KNN model and evaluate it on the held-out test set.

In [None]:
# NOTE(review): k is hard-coded; confirm it still matches the best k reported by
# the sweep whenever the train/test split changes.
k = 2
# Train the model. The labels are flattened to a 1-D array because scikit-learn
# expects 1-D targets; passing the single-column frame as-is triggers a
# data-conversion warning on every fit.
neigh = KNeighborsClassifier(n_neighbors=k).fit(x_train, np.ravel(y_train))
# predict the class of every wine in the test set
yhat = neigh.predict(x_test)
# Evaluate the accuracy on the data the model was fitted on and on the held-out data
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(x_train))) 
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat)) 