<h2><center>Red Wine EDA and Classification</center></h2>

Table of Contents:- <a id=100></a>
1. [Packages](#1)
2. [Understanding Data](#2)
3. [Exploratory Data Analysis](#3)
4. [Modelling](#4)

### 1. Packages <a id=1></a>
[back to top](#100)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Scaler
from sklearn.preprocessing import RobustScaler

# Train Test Split
from sklearn.model_selection import train_test_split

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb

#Metrics
from sklearn.metrics import accuracy_score, classification_report

# Cross Validation
from sklearn.model_selection import GridSearchCV

print("Packages imported successfully...")

### 2. Understanding Data <a id=2></a>
[back to top](#100)

#### 2.1 Importing the data

In [None]:
# Load the red-wine quality CSV into a DataFrame.
# NOTE(review): the path is hard-coded to the Kaggle input directory — adjust it when running elsewhere.
df = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
print("Data successfully imported...")

#### 2.2 Shape of the data

In [None]:
# Print the (rows, columns) shape, then display five randomly sampled rows.
print(df.shape)
df.sample(5)

#### 2.3 Data Info

In [None]:
# Summarise the columns of the frame (names, non-null counts, dtypes).
df.info()

#### 2.4 Target variable - `quality`
The target value `quality` can be treated either as numerical or as categorical. I've treated it as categorical.

In [None]:
# Distinct raw quality scores present in the data, in ascending order.
a = sorted(df["quality"].unique())
print(a)

#### 2.5 Helper function to bin the target variable
Binning the `quality` column into 0, 1 and 2.

In [None]:
def helper(row):
    """Bin a row's raw ``quality`` score into a class label.

    Scores below 4.5 map to 0, scores from 4.5 up to (but excluding) 6.5
    map to 1, and everything else maps to 2.
    """
    score = row.quality
    if score < 4.5:
        return 0
    return 1 if score < 6.5 else 2
# Replace the raw score with its class label, row by row, then show the class counts.
# NOTE: `quality` is overwritten in place, so running this cell a second time would
# re-bin the labels 0/1/2 (all below 4.5) into class 0 — re-load the data before re-running.
df["quality"] = df.apply(helper,axis=1)
df['quality'].value_counts()

### 3. Exploratory Data Analysis <a id=3></a>
[back to top](#100)

#### 3.1 Univariate Analysis

##### 3.1.1 Separating features and labels

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='quality')
# Target kept as a one-column DataFrame.
y = df[['quality']]

for frame in (X, y):
    print(frame.columns)

##### 3.1.2 Data stats

In [None]:
# Summary statistics of the features, transposed so that each feature is one row.
X.describe().transpose()

##### 3.1.3 Quality Distribution

In [None]:
# Bar chart of how many wines fall into each quality class.
class_colors = ['#FA5458','#FDD563','#5F63F1']
ax = sns.countplot(x='quality', data=df, palette=class_colors)
ax.set_xticklabels(['0','1','2'])
ax.set_title("Quality Distribution")
ax.tick_params(bottom=False)

Most of the data lies in the average-quality class (1). This gives an idea about the predicted values: a high number of the predictions will be 1 as well.

##### 3.1.4 Distribution of features

In [None]:

# Histogram (with KDE overlay) of every feature on a 6x2 grid.
# The first grid slot carries the figure title instead of a plot.
background_color = "#f6f5f5"
color_palette = ["#FA5458","#FDD563","#5F63F1"]

feature_names = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]

fig = plt.figure(figsize=(18,35))
gs = fig.add_gridspec(6,2)
gs.update(wspace=1, hspace=0.5)
fig.patch.set_facecolor(background_color)

# Row-major order: slot 0 is the title panel, slots 1..11 hold the features.
axes = [fig.add_subplot(gs[row, col]) for row in range(6) for col in range(2)]
for panel in axes:
    panel.set_facecolor(background_color)

# Title of the plot: an empty panel (no spines, ticks or labels) with centred text.
title_ax = axes[0]
for side in ("bottom", "left", "top", "right"):
    title_ax.spines[side].set_visible(False)
title_ax.tick_params(left=False, bottom=False)
title_ax.set_xticklabels([])
title_ax.set_yticklabels([])
title_ax.text(0.5,0.5,
              'Distribution plot\n__________',
              horizontalalignment='center',
              verticalalignment='center',
              fontsize=18, fontweight='bold',
              fontfamily='serif',
              color="#000000")

# One panel per feature, all drawn with identical settings.
for panel, feature in zip(axes[1:], feature_names):
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0,  dashes=(1,5))
    sns.histplot(ax=panel, x=df[feature], color="#3339FF", kde=True)
    x_left = panel.get_xlim()[0]
    y_top = panel.get_ylim()[1]
    # Caption with the feature name at the left edge, offset by 15% of the upper y-limit.
    panel.text(x_left, y_top+(y_top*0.15), feature, fontsize=14, fontweight='bold', fontfamily='serif')
    panel.set_xlabel("")
    panel.set_ylabel("")

None of the features seems to be heavily skewed. `Residual sugar` and `Chlorides` seem to be slightly right-skewed, but not by much, so this can be ignored.

##### 3.1.5 Boxenplot of features

In [None]:
# Boxenplot of every feature on a 6x2 grid.
# The first grid slot carries the figure title instead of a plot.
background_color = "#f6f5f5"
color_palette = ["#FA5458","#FDD563","#5F63F1"]

feature_names = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]

fig = plt.figure(figsize=(18,35))
gs = fig.add_gridspec(6,2)
gs.update(wspace=1, hspace=0.5)
fig.patch.set_facecolor(background_color)

# Row-major order: slot 0 is the title panel, slots 1..11 hold the features.
axes = [fig.add_subplot(gs[row, col]) for row in range(6) for col in range(2)]
for panel in axes:
    panel.set_facecolor(background_color)

# Title of the plot: an empty panel (no spines, ticks or labels) with centred text.
title_ax = axes[0]
for side in ("bottom", "left", "top", "right"):
    title_ax.spines[side].set_visible(False)
title_ax.tick_params(left=False, bottom=False)
title_ax.set_xticklabels([])
title_ax.set_yticklabels([])
title_ax.text(0.5,0.5,
              'Boxenplot plot\n__________',
              horizontalalignment='center',
              verticalalignment='center',
              fontsize=18, fontweight='bold',
              fontfamily='serif',
              color="#000000")

# One panel per feature, all drawn with identical settings.
for panel, feature in zip(axes[1:], feature_names):
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0,  dashes=(1,5))
    sns.boxenplot(ax=panel, x=df[feature], color="#FA5458")
    x_left = panel.get_xlim()[0]
    y_top = panel.get_ylim()[1]
    # Caption with the feature name at the left edge, offset by 15% of the upper y-limit.
    panel.text(x_left, y_top+(y_top*0.15), feature, fontsize=14, fontweight='bold', fontfamily='serif')
    panel.set_xlabel("")
    panel.set_ylabel("")

Almost all the features contain outliers. But with only 1599 rows in the dataset, removing the outliers will lead to a loss of training data. The accuracy of the model can be checked in both cases: before and after removal of the outliers.

#### 3.2 Bivariate Analysis

##### 3.2.1 Correlation matrix

In [None]:

# Heatmap of the pairwise feature correlations, lower triangle only.
fig = plt.figure(figsize=(10,10))
fig.patch.set_facecolor(background_color)
gs = fig.add_gridspec(1,1)
gs.update(wspace=0.3, hspace=0.15)
ax0 = fig.add_subplot(gs[0,0])
ax0.set_facecolor(background_color)

df_corr = X.corr().T
# Ones on and above the diagonal: those cells are hidden by the heatmap.
mask = np.triu(np.ones_like(df_corr))
ax0.text(2, -0.1, "Correlation Matrix",
         fontsize=22, fontweight='bold', fontfamily='serif', color="#000000")
sns.heatmap(df_corr, mask=mask, annot=True, fmt=".1f", cmap="coolwarm", ax=ax0)
plt.show()

The upper triangle of the matrix (including the diagonal) is masked, since it only mirrors the lower triangle. The remaining values show there's hardly any correlation between the features.

##### 3.2.2 Correlation values

In [None]:
# Correlation of every column with the binned target, largest value first.
corr = df.corr()
quality_corr = corr.T[["quality"]]
quality_corr.sort_values(by="quality", ascending=False)

In [None]:
# Scatter plot of every feature against the binned `quality` target on a 6x2 grid.
# The first grid slot carries the figure title instead of a plot.
background_color = "#f6f5f5"
color_palette = ["#FA5458","#FDD563","#5F63F1"]

feature_names = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]

fig = plt.figure(figsize=(18,35))
gs = fig.add_gridspec(6,2)
gs.update(wspace=1, hspace=0.5)
fig.patch.set_facecolor(background_color)

# Row-major order: slot 0 is the title panel, slots 1..11 hold the features.
axes = [fig.add_subplot(gs[row, col]) for row in range(6) for col in range(2)]
for panel in axes:
    panel.set_facecolor(background_color)

# Title of the plot: an empty panel (no spines, ticks or labels) with centred text.
title_ax = axes[0]
for side in ("bottom", "left", "top", "right"):
    title_ax.spines[side].set_visible(False)
title_ax.tick_params(left=False, bottom=False)
title_ax.set_xticklabels([])
title_ax.set_yticklabels([])
title_ax.text(0.5,0.5,
              'Scatter plot\n__________',
              horizontalalignment='center',
              verticalalignment='center',
              fontsize=18, fontweight='bold',
              fontfamily='serif',
              color="#000000")

# One panel per feature, all drawn with identical settings.
for panel, feature in zip(axes[1:], feature_names):
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0,  dashes=(1,5))
    sns.scatterplot(ax=panel, x=feature, y='quality', data=df, color="#ff8811")
    x_left = panel.get_xlim()[0]
    y_top = panel.get_ylim()[1]
    # Caption with the feature name at the left edge, offset by 15% of the upper y-limit.
    panel.text(x_left, y_top+(y_top*0.15), feature, fontsize=14, fontweight='bold', fontfamily='serif')
    panel.set_xlabel("")
    panel.set_ylabel("")

There's neither any correlation between different features nor any visible relationship between the features and the *quality* target.

##### 3.2.3 Shape

In [None]:
# Shapes of the full frame, the feature matrix and the target at this point of the notebook.
print(df.shape)
print(X.shape)
print(y.shape)

##### 3.2.4 Handling the outliers

Outliers - I'm removing the outliers, but they can be kept as removing them will reduce the training data. 

In [None]:
# Drop every row that violates at least one per-feature bound, then rebuild
# X and y from the filtered frame.
# NOTE(review): the thresholds were presumably read off the boxenplots above — confirm.
upper_bounds = {
    "fixed acidity": 14,
    "volatile acidity": 1.2,
    "citric acid": 0.9,
    "residual sugar": 10,
    "chlorides": 0.3,
    "free sulfur dioxide": 50,
    "total sulfur dioxide": 200,
    "density": 1.003,
    "pH": 3.8,
    "sulphates": 1.25,
    "alcohol": 14,
}
lower_bounds = {
    "density": 0.991,
    "pH": 2.8,
}

# A row is an outlier as soon as one feature lies strictly outside its bounds.
is_outlier = pd.Series(False, index=df.index)
for column, bound in upper_bounds.items():
    is_outlier |= df[column] > bound
for column, bound in lower_bounds.items():
    is_outlier |= df[column] < bound

df = df.loc[~is_outlier].copy()

# X and y were split off before the outliers were removed; without rebuilding
# them here the modelling cells would still train on the unfiltered rows.
X = df.drop('quality', axis=1)
y = df[['quality']]

print("Shape after removing the outliers...")
df.shape

### 4. Modelling <a id=4></a>
[back to top](#100)

#### 3.1 Scaling the data

In [None]:
# Split first, then fit the scaler on the training rows only, so that the
# held-out rows do not influence the scaling statistics (no test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
# The test rows are transformed with the statistics learned from the training rows.
X_test = scaler.transform(X_test)

print("The shape after train/test split and scaling...")
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

#### 3.2 Modelling

##### 3.2.1 Dictionary for accuracy

In [None]:
# Registry mapping each model's display name to its accuracy; filled in by the modelling cells.
models_accuracy = {}
print(models_accuracy)

##### 3.2.2 Logistic Regression

In [None]:
# Fit a default logistic regression and record its accuracy on the test split.
logreg = LogisticRegression()
# y_train is a one-column DataFrame; flatten it to the 1-D target scikit-learn expects.
logreg.fit(X_train, np.ravel(y_train))
y_pred_proba = logreg.predict_proba(X_test)
# argmax yields column positions; map them through classes_ to get the actual labels.
y_pred = logreg.classes_[np.argmax(y_pred_proba,axis=1)]
models_accuracy["Logistic Regression"] = accuracy_score(y_test, y_pred)
# The true labels go first; swapping the arguments would exchange precision and recall.
print(classification_report(y_test, y_pred))

##### 3.2.3 KNeighbors Classifier
I've set an extensive grid only for KNN. For all other models, just the basic parameters have been set.

In [None]:
# Grid-search a K-nearest-neighbours classifier with 5-fold CV and record the
# accuracy of the best model on the test split.
# NOTE(review): per the scikit-learn docs `leaf_size` affects the speed and memory of
# the neighbour search; it looks like it only multiplies the grid size here — confirm.
param_grid = {'n_neighbors':np.arange(1,50), 'weights':['uniform','distance'], 'leaf_size':np.arange(1,10)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn,param_grid,cv=5)
# y_train is a one-column DataFrame; flatten it to the 1-D target scikit-learn expects.
knn_cv.fit(X_train, np.ravel(y_train))
y_pred = knn_cv.predict(X_test)
print(knn_cv.best_params_)
print(knn_cv.best_score_)
models_accuracy["KNN"] = accuracy_score(y_test, y_pred)
# The true labels go first; swapping the arguments would exchange precision and recall.
print(classification_report(y_test, y_pred))

##### 3.2.4 Decision Tree

In [None]:
# Grid-search a decision tree with 10-fold CV and record the accuracy of the
# best model on the test split.
# min_samples_leaf is listed explicitly: np.arange(0.02, 0.1) uses the default
# step of 1 and therefore produced the single value 0.02.
param_grid  = {"max_depth":np.arange(2,10),
               "min_samples_leaf":[0.02, 0.04, 0.06, 0.08],
               "max_features":[0.2,0.4,0.6,0.8]}
dt = DecisionTreeClassifier()
grid_dt = GridSearchCV(estimator = dt,
                      param_grid = param_grid,
                      scoring="accuracy",
                      cv=10,
                      n_jobs=-1)
# y_train is a one-column DataFrame; flatten it to the 1-D target scikit-learn expects.
grid_dt.fit(X_train, np.ravel(y_train))
y_pred = grid_dt.predict(X_test)
print(grid_dt.best_params_)
print(grid_dt.best_score_)
models_accuracy["Decision Trees"] = accuracy_score(y_test, y_pred)
# The true labels go first; swapping the arguments would exchange precision and recall.
print(classification_report(y_test, y_pred))

##### 3.2.5 Random Forest

In [None]:
# Grid-search a random forest with 3-fold CV and record the accuracy of the
# best model on the test split.
params_rf = {'n_estimators':[100,200,300,400,500],
            'max_depth':[4,6,8,10,12,14],
            'max_features':['log2','sqrt']}
rf = RandomForestClassifier()
# Select by accuracy, the metric every model in this notebook is compared on
# (the previous 'neg_mean_squared_error' is a regression metric).
grid_rf = GridSearchCV(estimator = rf,
                      param_grid = params_rf,
                      cv=3,
                      scoring = 'accuracy',
                      verbose = 1,
                      n_jobs = -1)
# y_train is a one-column DataFrame; flatten it to the 1-D target scikit-learn expects.
grid_rf.fit(X_train, np.ravel(y_train))
y_pred = grid_rf.predict(X_test)
print(grid_rf.best_params_)
print(grid_rf.best_score_)
models_accuracy["Random Forest"] = accuracy_score(y_test, y_pred)
# The true labels go first; swapping the arguments would exchange precision and recall.
print(classification_report(y_test, y_pred))

##### 3.2.6 Voting Classifier

In [None]:
# Combine logistic regression, KNN and a decision tree in a VotingClassifier
# (default voting settings) and record its accuracy on the test split.
# The KNN and tree members are the grid-searched best estimators rather than
# the untuned `knn` / `dt` objects, so the ensemble benefits from the tuning above.
classifiers = [('Logistic Regression',logreg),
              ('K Nearest Neighbors', knn_cv.best_estimator_),
              ('Classification Tree', grid_dt.best_estimator_)]
# y_train is a one-column DataFrame; flatten it to the 1-D target scikit-learn expects.
y_train_flat = np.ravel(y_train)

# Accuracy of each member on its own, for comparison with the ensemble.
for clf_name,clf in classifiers:
    clf.fit(X_train, y_train_flat)
    y_pred = clf.predict(X_test)
    print(clf_name, accuracy_score(y_test,y_pred))

vc = VotingClassifier(estimators = classifiers)
vc.fit(X_train, y_train_flat)
y_pred = vc.predict(X_test)
voting_accuracy = accuracy_score(y_test, y_pred)
models_accuracy["Voting Classifier"] = voting_accuracy
print(voting_accuracy)

##### 3.2.7 Model performances

In [None]:
# Display the accuracy recorded for each model.
models_accuracy

In [None]:
# Split the accuracy registry into parallel lists (insertion order is preserved):
# model names and their accuracy scores.
model = list(models_accuracy)
accuracy = list(models_accuracy.values())
print(model)
print(accuracy)

In [None]:
# Plot the accuracy of every model as one point per model.
acc = pd.DataFrame({"Models":model, "Accuracy":accuracy})
plt.figure(figsize=(8,6))
# A single fixed colour is used, so no colormap argument is passed
# (the previous `cmap=True` was not a valid colormap).
sns.scatterplot(x = 'Models', y='Accuracy', data=acc, color='#3339FF')
plt.show()

Since an extensive Grid Search was set for KNN, it is clearly outperforming all other models according to the Scatterplot.

#### If you like the notebook, consider giving an upvote.