In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix,roc_curve, roc_auc_score, plot_roc_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Importing the dataset

In [None]:
# Read the wine-quality CSV into a DataFrame and preview its first rows.
WINE_CSV = "/kaggle/input/wine-quality/winequalityN.csv"
wine = pd.read_csv(WINE_CSV)
wine.head()

## Preprocessing

Let's have a look at the features and their data types

In [None]:
# Summarise the columns: names, non-null counts and dtypes.
wine.info()

Check whether there are any null or duplicate values, and drop those observations.

In [None]:
# Report the number of missing values per column, then drop every row
# that contains at least one missing value.
null_counts = wine.isna().sum()
print(null_counts)
wine = wine.dropna(axis=0, how='any')

In [None]:
# Report how many rows are exact duplicates, keep only the first occurrence
# of each (original index labels are preserved), and show the resulting shape.
n_duplicates = wine.duplicated().sum()
print(n_duplicates)
wine = wine.drop_duplicates()
wine.shape

# Exploratory Data Analysis

Let's get an idea of how many samples we have of each type, red and white

In [None]:
# Absolute number of samples for each wine type.
type_column = wine['type']
type_column.value_counts()

In [None]:
# Bar chart of the number of samples per wine type.
# The bar order is pinned explicitly so each bar receives the colour intended
# for it; for string categories seaborn otherwise uses order of first appearance.
type_order = sorted(wine['type'].unique())  # presumably ['red', 'white'] -- confirm against the data
plt.figure(figsize=(5, 5))
ax = sns.countplot(x=wine['type'], order=type_order, palette=["#a92323", '#FFFC96'])
ax.set_title("Count of wine types", size=20, color="black")
# The original `plt.xlabels = [...]` only created an unused attribute on the
# pyplot module; the tick labels have to be set on the axes instead.
ax.set_xticks(range(len(type_order)))
ax.set_xticklabels([str(t).capitalize() for t in type_order]);

In [None]:
# Pie chart of the share of each wine type.
# Counts are looked up by label: value_counts() sorts by frequency, so the
# positional [1]/[0] access used before did not refer to a fixed wine type and
# paired the "White" slice with the second-largest count.
# NOTE(review): assumes the raw labels are 'white' and 'red' and that this cell
# runs before 'type' is label-encoded -- confirm against the data.
type_counts = wine['type'].value_counts()
labels = ["White", "Red"]
sizes = [type_counts['white'], type_counts['red']]
colors = ['#FFFC96', '#a92323']
fig1, ax1 = plt.subplots()
ax1.pie(x=sizes, labels=labels, colors=colors, autopct='%1.2f%%')
plt.title("Proportion of wine type", size=20, color="black")
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.tight_layout()
plt.show()

Transform the categorical variable 'type' by encoding the labels

In [None]:
# Replace the categorical 'type' column with integer codes and display it.
encoder = LabelEncoder()
wine['type'] = encoder.fit_transform(wine['type'])
wine['type']

Looking into the relationship between each pair of feature variables

In [None]:
# Pairwise correlation matrix of all columns (including the encoded 'type').
correlation = wine.corr()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(20, 10))
corr_matrix = wine.corr()
sns.heatmap(corr_matrix, annot=True, square=True, cmap="YlOrRd");

The variable of interest here is the 'type' variable. Clearly, some variables (such as total sulfur dioxide, free sulfur dioxide, volatile acidity and residual sugar) are more strongly correlated with the wine type than others.

In [None]:
# Mean total sulfur dioxide per encoded wine type, shown as a two-column table.
total_so2_by_type = wine.groupby('type')['total sulfur dioxide'].mean()
total_so2_by_type.to_frame(name='mean').reset_index()

In [None]:
# Bar chart of the mean total sulfur dioxide for each encoded wine type.
total_so2_means = wine.groupby('type')['total sulfur dioxide'].mean()
total_so2_means.plot.bar(x='type', y='mean', color=["#a92323", "#FFFC96"])

In [None]:
# Box plot of total sulfur dioxide for the wines encoded as type 0
# (presumably red, since LabelEncoder sorts its classes -- confirm).
boxdata = wine.loc[wine['type'] == 0, ['total sulfur dioxide']]

plt.boxplot(boxdata, notch=None)
plt.show()

In [None]:
# Box plot of total sulfur dioxide for the wines encoded as type 1
# (presumably white, since LabelEncoder sorts its classes -- confirm).
boxdata2 = wine.loc[wine['type'] == 1, ['total sulfur dioxide']]

plt.boxplot(boxdata2, notch=None)
plt.show()

In [None]:
# Bar chart of the mean free sulfur dioxide for each encoded wine type.
free_so2_means = wine.groupby('type')['free sulfur dioxide'].mean()
free_so2_means.plot.bar(x='type', y='mean', color=["#a92323", "#FFFC96"])

In [None]:
# Bar chart of the mean residual sugar for each encoded wine type.
residual_sugar_means = wine.groupby('type')['residual sugar'].mean()
residual_sugar_means.plot.bar(x='type', y='mean', color=["#a92323", "#FFFC96"])

In [None]:
# Bar chart of the mean volatile acidity for each encoded wine type.
volatile_acidity_means = wine.groupby('type')['volatile acidity'].mean()
volatile_acidity_means.plot.bar(x='type', y='mean', color=["#a92323", "#FFFC96"])

In [None]:
# Bar chart of the mean chlorides for each encoded wine type.
chlorides_means = wine.groupby('type')['chlorides'].mean()
chlorides_means.plot.bar(x='type', y='mean', color=["#a92323", "#FFFC96"])

# Scaling the features

In [None]:
# Standardise the listed feature columns (zero mean, unit variance).
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates']
scaler = StandardScaler()
# Work on a copy: `df = wine` only aliased the frame, so scaling silently
# overwrote the raw values in `wine` as well.
df = wine.copy()
df[features] = scaler.fit_transform(df[features])
# NOTE(review): any column not listed in `features` (other than 'type') is left
# unscaled and will carry more weight in the distance-based models below.
# NOTE(review): the scaler is fitted on the full table before the train/test
# split, so test-set statistics leak into the training features.
df.head(3)

# Splitting the data 

In [None]:
# Separate the label column from the predictors, then hold out 20% of the
# rows for testing (fixed seed for a repeatable split).
target = df['type']
data = df.drop(columns=['type'])

x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    train_size=0.8,
    test_size=0.2,
    random_state=11,
)

In [None]:
# (rows, columns) of the training predictors.
x_train.shape

In [None]:
# Number of training labels; should match the row count of x_train.
y_train.shape


In [None]:
# (rows, columns) of the held-out test predictors.
x_test.shape

# Dummy Classifier
Building a baseline model for comparison purposes

In [None]:
# Baseline model: predicts classes at random following the training class
# distribution (seeded so the baseline is repeatable).
DummyC = DummyClassifier(random_state=0, strategy='stratified').fit(x_train, y_train)
DummyC

In [None]:
# Baseline class predictions for the held-out test set.
Dummy_pred = DummyC.predict(x_test)

<h3>Evaluation Metrics</h3> 

In [None]:
# Precision, recall and F1 per class for the baseline predictions.
dummy_report = classification_report(y_test, Dummy_pred)
print(dummy_report)

In [None]:
# Confusion matrix of the baseline predictions, drawn as an annotated heatmap.
confusion = confusion_matrix(y_test, Dummy_pred)
ax = sns.heatmap(
    confusion,
    annot=True,
    fmt=".0f",
    annot_kws={'size': 15},
    cmap="RdYlGn",
)
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

In [None]:
# ROC curve and AUC for the baseline classifier.
# The curve is built from the predicted probability of class 1 rather than
# from hard 0/1 predictions: hard labels give a single operating point, so the
# "curve" collapses to three points and the AUC reduces to balanced accuracy.
dummy_scores = DummyC.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, dummy_scores)
auc = roc_auc_score(y_test, dummy_scores)
plt.plot(fpr, tpr, color='g')
plt.title('ROC CURVE Dummy Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.grid(True)
plt.show()

# Scale before rounding so the percentage carries no floating-point tail
# (round(auc, 3) * 100 can yield values such as 50.300000000000004).
dummy_auc = round(auc * 100, 1)

print(f"The score for the ROC Curve is: {dummy_auc}%")

# K Neighbours Classifier

In [None]:
# K-nearest-neighbours with scikit-learn's default settings: fit on the
# training split, then predict the held-out test split.
knn = KNeighborsClassifier().fit(x_train, y_train)
knn_pred = knn.predict(x_test)


<h3>Evaluation Metrics</h3> 

In [None]:
# Precision, recall and F1 per class for the default KNN predictions.
knn_report = classification_report(y_test, knn_pred)
print(knn_report)

In [None]:
# Confusion matrix of the default KNN predictions as an annotated heatmap.
confusion = confusion_matrix(y_test, knn_pred)
ax = sns.heatmap(
    confusion,
    annot=True,
    fmt=".0f",
    annot_kws={'size': 15},
    cmap="RdYlGn",
)
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

In [None]:
# ROC curve and AUC for the default (untuned) KNN model.
# Scores are the predicted probability of class 1, not hard 0/1 predictions,
# so the curve reflects the model's ranking across all thresholds.
knn_scores = knn.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, knn_scores)
auc = roc_auc_score(y_test, knn_scores)
plt.plot(fpr, tpr, color='g')
plt.title('ROC CURVE KNN')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.grid(True)
plt.show()

# Stored under its own name: this cell previously assigned to `dummy_auc`,
# overwriting the baseline score that the final summary reports.
# Scale before rounding to avoid floating-point tails in the percentage.
knn_default_auc = round(auc * 100, 1)

print(f"The score for the ROC Curve is: {knn_default_auc}%")

In [None]:
# Hyperparameter grid for KNN.
k_range = [k for k in range(3, 20)]  # candidate neighbour counts: 3 through 19
weight_op = ['uniform', 'distance']  # neighbour weighting schemes to compare
d = dict(n_neighbors=k_range, weights=weight_op)

<h3>Hyperparameter Tuning using GridSearchCV</h3> 

In [None]:
# Exhaustive search over the grid `d` with 10-fold cross-validation, scored by
# accuracy. GridSearchCV clones `knn`, so the already fitted model is untouched.
grid_temp = GridSearchCV(knn, d, cv=10, scoring='accuracy')
# Tune on the training split only: fitting on the full data let the held-out
# test rows influence the chosen hyperparameters and inflated the later
# test-set scores.
grid_temp.fit(x_train, y_train)
print("Score:",grid_temp.best_score_," Parameters:",grid_temp.best_params_)

In [None]:
# Refit KNN with the hyperparameters selected by the grid search instead of
# values copied by hand, so the model stays in sync when the search is re-run.
# `grid_temp` must still hold the KNN search here (the name is reused later
# for the radius-neighbours search), i.e. run the cells in order.
knn = KNeighborsClassifier(**grid_temp.best_params_)
knn.fit(x_train, y_train)
knn_pred = knn.predict(x_test)

<h3>Evaluation Metrics</h3> 

In [None]:
# Confusion matrix of the tuned KNN predictions as an annotated heatmap.
confusion = confusion_matrix(y_test, knn_pred)
ax = sns.heatmap(
    confusion,
    annot=True,
    fmt=".0f",
    annot_kws={'size': 15},
    cmap="RdYlGn",
)
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

In [None]:
# ROC curve and AUC for the tuned KNN model.
# Scores are the predicted probability of class 1, not hard 0/1 predictions:
# hard labels yield a three-point curve whose AUC is just balanced accuracy.
knn_scores = knn.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, knn_scores)
auc = roc_auc_score(y_test, knn_scores)
plt.plot(fpr, tpr, color='g')
plt.title('ROC CURVE KNN Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.grid(True)
plt.show()

# Scale before rounding so the percentage carries no floating-point tail.
knn_auc = round(auc * 100, 1)

print(f"The score for the ROC Curve is: {knn_auc}%")

<h3>Radius Neighbors Classifier</h3>

In [None]:
from sklearn.neighbors import RadiusNeighborsClassifier

# Radius-based neighbours classifier; test points with no training neighbour
# inside the radius are assigned the most frequent class.
rad = RadiusNeighborsClassifier(outlier_label='most_frequent', radius=4)
rad.fit(x_train, y_train)

<h3>Evaluation Metrics</h3> 

In [None]:
# Predict the test split with the radius classifier and print the per-class
# precision, recall and F1.
rad_pred = rad.predict(x_test)
rad_report = classification_report(y_test, rad_pred)
print(rad_report)

In [None]:
# Confusion matrix of the radius-classifier predictions as an annotated heatmap.
confusion = confusion_matrix(y_test, rad_pred)
ax = sns.heatmap(
    confusion,
    annot=True,
    fmt=".0f",
    annot_kws={'size': 15},
    cmap="RdYlGn",
)
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

In [None]:
# Hyperparameter grid for the radius-neighbours classifier.
radius_range = [r for r in range(1, 10)]  # candidate radii: 1 through 9
weight_op = ['uniform', 'distance']  # neighbour weighting schemes to compare
d = dict(radius=radius_range, weights=weight_op)

In [None]:
# Exhaustive search over the grid `d` with 10-fold cross-validation, scored by
# accuracy. GridSearchCV clones `rad`, so the already fitted model is untouched.
grid_temp = GridSearchCV(rad, d, cv=10, scoring='accuracy')
# Tune on the training split only: fitting on the full data let the held-out
# test rows influence the chosen hyperparameters and inflated the later
# test-set scores.
grid_temp.fit(x_train, y_train)
print("Score:",grid_temp.best_score_," Parameters:",grid_temp.best_params_)

In [None]:
# Refit the radius classifier with the hyperparameters selected by the grid
# search instead of values copied by hand, so the model stays in sync when the
# search is re-run. `grid_temp` must hold the radius search here, i.e. run the
# cells in order.
rad = RadiusNeighborsClassifier(outlier_label='most_frequent', **grid_temp.best_params_)
rad.fit(x_train, y_train)
rad2_pred = rad.predict(x_test)

<h3>Evaluation Metrics</h3> 

In [None]:
# Precision, recall and F1 per class for the tuned radius classifier.
rad2_report = classification_report(y_test, rad2_pred)
print(rad2_report)

In [None]:
# Confusion matrix of the tuned radius-classifier predictions as a heatmap.
confusion = confusion_matrix(y_test, rad2_pred)
ax = sns.heatmap(
    confusion,
    annot=True,
    fmt=".0f",
    annot_kws={'size': 15},
    cmap="RdYlGn",
)
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

In [None]:
# ROC curve and AUC for the tuned radius-neighbours model.
# Scores are the predicted probability of class 1, not hard 0/1 predictions:
# hard labels yield a three-point curve whose AUC is just balanced accuracy.
rad_scores = rad.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, rad_scores)
auc = roc_auc_score(y_test, rad_scores)
plt.plot(fpr, tpr, color='g')
plt.title('ROC CURVE RNN Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.grid(True)
plt.show()

# Scale before rounding so the percentage carries no floating-point tail.
rad_auc = round(auc * 100, 1)

print(f"The score for the ROC Curve is: {rad_auc}%")

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the
# training split.
logreg = LogisticRegression()
logreg.fit(X=x_train, y=y_train)


In [None]:
# Predict the test split with logistic regression and print the per-class
# precision, recall and F1.
log_pred = logreg.predict(x_test)
log_report = classification_report(y_test, log_pred)
print(log_report)

In [None]:
# Confusion matrix of the logistic-regression predictions as a heatmap.
confusion = confusion_matrix(y_test, log_pred)
ax = sns.heatmap(
    confusion,
    annot=True,
    fmt=".0f",
    annot_kws={'size': 15},
    cmap="RdYlGn",
)
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")

<h3>Evaluation Metrics</h3> 

In [None]:
# Same logistic-regression confusion matrix, kept under the name `cm`.
cm = confusion_matrix(y_test, log_pred)
ax = sns.heatmap(
    cm,
    annot=True,
    fmt=".0f",
    annot_kws={'size': 15},
    cmap="RdYlGn",
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')

In [None]:
# Fraction of test samples that logistic regression classified correctly.
log_accuracy = metrics.accuracy_score(y_test, log_pred)
print(log_accuracy)

In [None]:
# ROC curve and AUC for logistic regression.
# Scores are the predicted probability of class 1, not hard 0/1 predictions:
# hard labels yield a three-point curve whose AUC is just balanced accuracy.
log_scores = logreg.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, log_scores)
auc = roc_auc_score(y_test, log_scores)
plt.plot(fpr, tpr, label="auc="+str(auc), color='orange')
plt.box(False)
plt.title('ROC CURVE Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.grid(True)
# The line's label is only rendered when a legend is drawn.
plt.legend()
plt.show()

# Scale before rounding so the percentage carries no floating-point tail.
lr_auc = round(auc * 100, 1)

print(f"The score for the ROC Curve is: {lr_auc}%")

In [None]:
# Side-by-side summary of the AUC percentages stored by the ROC cells above.
print("The score for the models:\n")
summary_rows = [
    ("Dummy Classifer:                  ", dummy_auc),
    ("KNN:                              ", knn_auc),
    ("Radius Nearest Neighbours:        ", rad_auc),
    ("Logistic Regression:              ", lr_auc),
]
for model_label, model_score in summary_rows:
    print(model_label, model_score, "%")

# KNN is the best-performing model for our dataset, with an AUC score of 98.7%.