# Importing all the required packages

In [None]:

import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix,roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# List every file under the Kaggle input directory so the dataset path can be located.
# (Nothing is loaded here; the CSV itself is read in a later cell.)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        




# Obtaining and pre-processing of data

In [None]:
# Load the wine-quality dataset from the Kaggle input directory.
wine = pd.read_csv("/kaggle/input/wine-quality/winequalityN.csv")
# Only the last expression of a cell is rendered automatically, so a bare
# `wine.head()` in the middle of the cell was evaluated and thrown away.
# Display the preview explicitly instead.
display(wine.head())
# Column dtypes and non-null counts (printed by pandas itself).
wine.info()

In [None]:
# Dimensions of the raw dataset as (rows, columns).
wine.shape

In [None]:
# Number of missing values in each column.
wine.isnull().sum()

In [None]:
# Drop every row (axis=0) that contains at least one missing value.
wine = wine.dropna(axis=0)

In [None]:
# Re-check after dropna: every per-column count should now be zero.
wine.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
wine.duplicated().sum()

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# Rebinding the name (instead of inplace=True) keeps the cell easy to chain and re-run.
wine = wine.drop_duplicates(keep='first')

In [None]:
# Number of distinct values in each column.
count_cat = wine.nunique()

In [None]:
# Display the per-column distinct-value counts computed above.
count_cat

In [None]:
# Relative frequency of each wine type (normalize=True returns proportions, not counts).
dis = wine['type'].value_counts(normalize = True)

In [None]:
# Display the wine-type proportions.
dis

In [None]:
# Bar chart of the number of samples per wine type.
type_palette = ['#808080', '#722F37']
plt.figure(figsize=(5, 5))
sns.countplot(x=wine['type'], palette=type_palette)

In [None]:
# Pie chart of the wine-type distribution.
# Labels, sizes and colours are all derived from one value_counts() result so they
# cannot drift out of step. The previous `value_counts()[0]` / `[1]` lookups relied on
# positional fallback indexing of a string-labelled Series, which pandas has deprecated.
type_counts = wine['type'].value_counts()
labels = [str(wine_type).capitalize() for wine_type in type_counts.index]
sizes = type_counts.tolist()
# Grey for white wine, burgundy for red. Assumes the `type` column holds
# 'white' / 'red' (as the original labels imply); anything else gets a neutral grey.
colour_by_type = {'white': '#808080', 'red': '#722F37'}
col = [colour_by_type.get(str(wine_type).lower(), '#C0C0C0') for wine_type in type_counts.index]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, colors=col, autopct='%1.1f%%')
ax1.axis('equal')  # equal aspect ratio keeps the pie circular
plt.tight_layout()
plt.show()


In [None]:
# Encode the wine type as an integer while keeping the readable label in `class`.
# Re-running the old cell re-encoded the already-numeric `type` column and overwrote
# `class` with integers; the guard below makes the cell safe to run more than once.
if 'class' not in wine.columns:
    wine['class'] = wine['type']
encoder = LabelEncoder()
# LabelEncoder assigns the codes 0..n-1 to the sorted unique labels.
wine['type'] = encoder.fit_transform(wine['class'])
# Show the label -> code mapping. The former bare `list(encoder.classes_)` in the
# middle of the cell was evaluated and discarded, so nothing was ever displayed.
{str(label): code for code, label in enumerate(encoder.classes_)}





# Exploratory Data Analysis

In [None]:
# Preview the frame after encoding: `type` now holds integer codes and `class` the original label.
wine.head()

In [None]:
# Correlation heatmap of the numeric columns.
# `class` still holds the original string labels, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, so restrict the frame to numeric dtypes first.
plt.figure(figsize=(20, 10))
numeric_corr = wine.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, square=True, annot=True, cmap='Purples')

As the heatmap shows, `quality` and `alcohol` are only very weakly correlated with `type`,
so we are going to drop them from our feature set.

In [None]:
# Mean total sulfur dioxide per wine type, as a two-column frame (class, mean).
(
    wine.groupby('class')['total sulfur dioxide']
    .mean()
    .rename('mean')
    .reset_index()
)

In [None]:
# Mean total sulfur dioxide per wine type.
# Series.plot.bar draws one bar per index entry, so the former x='Type' / y='mean'
# arguments had no effect; the axis labels are now set explicitly instead.
# Bars follow the sorted group keys, and the colours are applied in that order.
ax = wine.groupby('class')['total sulfur dioxide'].mean().plot.bar(color=["#722F37", "#808080"])
ax.set_xlabel('Type')
ax.set_ylabel('Mean total sulfur dioxide')
ax.set_title('Mean total sulfur dioxide by wine type')
plt.show()

In [None]:
# Box plot of total sulfur dioxide for the wines encoded as type 0.
# NOTE(review): code 0 is the alphabetically first label — presumably red; confirm via encoder.classes_.
boxdata1 = wine.loc[wine['type'] == 0, ['total sulfur dioxide']]

plt.boxplot(boxdata1)
plt.show()



In [None]:
# Box plot of total sulfur dioxide for the wines encoded as type 1.
# NOTE(review): code 1 is the alphabetically second label — presumably white; confirm via encoder.classes_.
boxdata2 = wine.loc[wine['type'] == 1, ['total sulfur dioxide']]

plt.boxplot(boxdata2)
plt.show()



In [None]:
# Mean free sulfur dioxide per wine type.
# Series.plot.bar draws one bar per index entry, so the former x='Type' / y='mean'
# arguments had no effect; the axis labels are now set explicitly instead.
ax = wine.groupby('class')['free sulfur dioxide'].mean().plot.bar(color=['#722F37', '#808080'])
ax.set_xlabel('Type')
ax.set_ylabel('Mean free sulfur dioxide')
ax.set_title('Mean free sulfur dioxide by wine type')
plt.show()

In [None]:
# Mean residual sugar per wine type.
# Series.plot.bar draws one bar per index entry, so the former x='Type' / y='mean'
# arguments had no effect; the axis labels are now set explicitly instead.
ax = wine.groupby('class')['residual sugar'].mean().plot.bar(color=['#722F37', '#808080'])
ax.set_xlabel('Type')
ax.set_ylabel('Mean residual sugar')
ax.set_title('Mean residual sugar by wine type')
plt.show()

In [None]:
# Mean volatile acidity per wine type.
# Series.plot.bar draws one bar per index entry, so the former x='Type' / y='mean'
# arguments had no effect; the axis labels are now set explicitly instead.
ax = wine.groupby('class')['volatile acidity'].mean().plot.bar(color=['#722F37', '#808080'])
ax.set_xlabel('Type')
ax.set_ylabel('Mean volatile acidity')
ax.set_title('Mean volatile acidity by wine type')
plt.show()

In [None]:
# Mean chlorides per wine type.
# Series.plot.bar draws one bar per index entry, so the former x='Type' / y='mean'
# arguments had no effect; the axis labels are now set explicitly instead.
ax = wine.groupby('class')['chlorides'].mean().plot.bar(color=['#722F37', '#808080'])
ax.set_xlabel('Type')
ax.set_ylabel('Mean chlorides')
ax.set_title('Mean chlorides by wine type')
plt.show()

# **Scaling**

In [None]:
# Summary statistics of the numeric columns; the differing ranges motivate the scaling below.
wine.describe()

In [None]:
# Columns that are standardised and used as model inputs.
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates']
ss = StandardScaler()
# Build the modelling frame as a new object instead of aliasing `wine`:
#  * `df = wine` made both names point at the same frame, so scaling silently
#    overwrote the raw values the EDA cells above had displayed;
#  * the in-place drop of `class` raised KeyError when the cell was run a second time.
df = wine.drop(columns='class', errors='ignore')
# NOTE(review): the scaler is fitted on the full dataset before the train/test split,
# so test-set statistics leak into the features. Consider fitting on the training
# rows only (e.g. with a Pipeline).
df[features] = ss.fit_transform(df[features])
df.head()

# **Splitting of Data**

In [None]:
# Target: the encoded wine type.
target = df['type']
# Use only the standardised feature columns. The previous `df.drop(['type'], axis=1)`
# kept `quality` and `alcohol` as inputs, unscaled, even though the analysis above
# says they are dropped and `features` deliberately leaves them out.
data = df[features]
# Default 75/25 split; the fixed random_state keeps it reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(data, target, random_state=11)

In [None]:
# (rows, features) of the training inputs.
X_train.shape

In [None]:
# (rows, features) of the held-out test inputs.
X_test.shape

In [None]:
# Number of training labels; should match the row count of X_train.
Y_train.shape

In [None]:
# Number of test labels; should match the row count of X_test.
Y_test.shape

# **Dummy Classifier**

In [None]:
# Baseline: always predict the most frequent class seen during training.
dm = DummyClassifier(strategy='most_frequent', random_state=11)
dm.fit(X_train, Y_train)

In [None]:
# Baseline predictions on the held-out test set.
Y_pred = dm.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the baseline.
# The most-frequent baseline never predicts the minority class, so that class's
# precision is 0/0. zero_division=0 reports it as 0 without an UndefinedMetricWarning.
print(classification_report(Y_test, Y_pred, zero_division=0))

In [None]:
# Confusion matrix of the baseline predictions, drawn as an annotated heatmap.
cm_dm = confusion_matrix(Y_test, Y_pred)
heatmap_ax = sns.heatmap(cm_dm, annot=True, fmt=".0f", annot_kws={'size': 15})
heatmap_ax.set_xlabel("Predict")
heatmap_ax.set_ylabel("Actual")

In [None]:
# ROC curve and AUC for the baseline classifier.
# A ROC curve needs a continuous score per sample; feeding hard 0/1 predictions
# collapses it to a single operating point. Use the predicted probability of the
# positive class (label 1, the second column of predict_proba) instead.
dummy_scores = dm.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, dummy_scores)
auc = roc_auc_score(Y_test, dummy_scores)
plt.plot(fpr, tpr, label=f"auc={auc:.3f}", color='r')
plt.title('ROC Curve Dummy Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(loc='lower right')  # the label passed to plot() is only shown once a legend is drawn
plt.grid(True)
plt.show()
# Round after scaling: round(auc, 3) * 100 can print artefacts such as 98.80000000000001.
dummy_auc = round(auc * 100, 1)
print(f"The score for the ROC Curve is: {dummy_auc}%")


# **KNN - K-Nearest Neighbours**

In [None]:
# k-nearest-neighbours classifier with k = 6, trained on the training split.
knn = KNeighborsClassifier(n_neighbors=6)
# fit() returns the estimator itself, so this last expression also displays it.
knn.fit(X_train, Y_train)

In [None]:
# KNN predictions on the held-out test set (overwrites the baseline's Y_pred).
Y_pred = knn.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the KNN predictions.
print(classification_report(Y_test, Y_pred))

# Hyperparameter Tuning

In [None]:
# Search space for KNN: k from 1 to 50, with uniform or distance-weighted votes.
k_range = [k for k in range(1, 51)]
weight_op = ['uniform', 'distance']
d = dict(n_neighbors=k_range, weights=weight_op)

In [None]:
# 5-fold cross-validated grid search over the KNN search space.
# Tune on the training split only: fitting on the full `data` / `target` let the
# held-out test rows influence the chosen hyperparameters.
grid_temp = GridSearchCV(knn, d, cv=5, scoring='accuracy')
grid_temp.fit(X_train, Y_train)
print("score:" ,grid_temp.best_score_, "params:" ,grid_temp.best_params_)

# Evaluation

In [None]:
# ROC curve and AUC for the KNN classifier.
# A ROC curve needs a continuous score per sample; feeding hard 0/1 predictions
# collapses it to a single operating point. Use the predicted probability of the
# positive class (label 1, the second column of predict_proba) instead.
knn_scores = knn.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, knn_scores)
auc = roc_auc_score(Y_test, knn_scores)
plt.plot(fpr, tpr, label=f"auc={auc:.3f}", color='r')
# The title was copy-pasted from the dummy-classifier cell.
plt.title('ROC Curve K-Nearest Neighbors')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(loc='lower right')  # the label passed to plot() is only shown once a legend is drawn
plt.grid(True)
plt.show()
# Stored under its own name so the baseline's `dummy_auc` is no longer overwritten.
# Round after scaling: round(auc, 3) * 100 can print artefacts such as 98.80000000000001.
knn_auc = round(auc * 100, 1)
print(f"The score for the ROC Curve is: {knn_auc}%")



# RNN - Radius Neighbors Classifier 

In [None]:
# Radius-neighbours classifier: votes come from all training points within radius 3.0,
# weighted by distance; samples with no neighbour in range get the most frequent label.
rnn = RadiusNeighborsClassifier(
    radius=3.0,
    weights='distance',
    algorithm='auto',
    outlier_label='most_frequent',
)
# fit() returns the estimator itself, so this last expression also displays it.
rnn.fit(X_train, Y_train)

In [None]:
# Radius-neighbours predictions on the held-out test set (overwrites the KNN Y_pred).
Y_pred = rnn.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the radius-neighbours predictions.
print(classification_report(Y_test, Y_pred))

# Hyperparameter Tuning

In [None]:
# Search space for the radius-neighbours classifier: integer radii from 1 to 30.
r_range = [radius for radius in range(1, 31)]
d = dict(radius=r_range)


In [None]:
# 5-fold cross-validated grid search over the radius values.
# Tune on the training split only: fitting on the full `data` / `target` let the
# held-out test rows influence the chosen hyperparameters.
grid_temp = GridSearchCV(rnn, d, cv=5, scoring='accuracy')
grid_temp.fit(X_train, Y_train)
print("score:" ,grid_temp.best_score_, "params:" ,grid_temp.best_params_)

# Evaluation

In [None]:
# Confusion matrix of the radius-neighbours predictions, drawn as an annotated heatmap.
cm_dm = confusion_matrix(Y_test, Y_pred)
heatmap_ax = sns.heatmap(cm_dm, annot=True, fmt=".0f", annot_kws={'size': 15})
heatmap_ax.set_xlabel("Predict")
heatmap_ax.set_ylabel("Actual")

In [None]:
# ROC curve and AUC for the radius-neighbours classifier.
# A ROC curve needs a continuous score per sample; feeding hard 0/1 predictions
# collapses it to a single operating point. Use the predicted probability of the
# positive class (label 1, the second column of predict_proba) instead.
rnn_scores = rnn.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, rnn_scores)
auc = roc_auc_score(Y_test, rnn_scores)
plt.plot(fpr, tpr, label=f"auc={auc:.3f}", color='r')
# The title was copy-pasted from the dummy-classifier cell.
plt.title('ROC Curve Radius Neighbors Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(loc='lower right')  # the label passed to plot() is only shown once a legend is drawn
plt.grid(True)
plt.show()
# Stored under its own name so the baseline's `dummy_auc` is no longer overwritten.
# Round after scaling: round(auc, 3) * 100 can print artefacts such as 98.80000000000001.
rnn_auc = round(auc * 100, 1)
print(f"The score for the ROC Curve is: {rnn_auc}%")




# **Logistic Regression**

In [None]:
# Logistic regression trained on the training split; fit() returns the estimator itself.
lr = LogisticRegression(random_state=0).fit(X_train, Y_train)
# Hard class predictions on the held-out test set.
predictions = lr.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
print(classification_report(Y_test, predictions))

# Evaluation

In [None]:
# Confusion matrix of the logistic-regression predictions, drawn as an annotated heatmap.
cm = confusion_matrix(Y_test, predictions)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='.0f', annot_kws={'size': 15})
heatmap_ax.set_xlabel("Predict")
heatmap_ax.set_ylabel("Actual")

In [None]:
# Overall accuracy: fraction of test samples whose predicted type matches the true type.
print(metrics.accuracy_score(Y_test, predictions))

In [None]:
# ROC curve and AUC for logistic regression.
# A ROC curve needs a continuous score per sample; feeding hard 0/1 predictions
# collapses it to a single operating point. Use the predicted probability of the
# positive class (label 1, the second column of predict_proba) instead.
lr_scores = lr.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, lr_scores)
auc = roc_auc_score(Y_test, lr_scores)
plt.plot(fpr, tpr, label=f"auc={auc:.3f}", color='r')
# The title was copy-pasted from the dummy-classifier cell.
plt.title('ROC Curve Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(loc='lower right')  # the label passed to plot() is only shown once a legend is drawn
plt.grid(True)
plt.show()
# Stored under its own name so the baseline's `dummy_auc` is no longer overwritten.
# Round after scaling: round(auc, 3) * 100 can print artefacts such as 98.80000000000001.
lr_auc = round(auc * 100, 1)
print(f"The score for the ROC Curve is: {lr_auc}%")




# The score for the ROC Curve for each algorithm

**K-Nearest Neighbors - 98.8 <br>
Radius Nearest Neighbors - 96.1 <br>
Logistic Regression - 98.6**