In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
# NOTE(review): `dirname` and `filename` stay bound to their last-iteration values
# after the loop finishes, so they remain visible to later cells.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Contents

    Reading Data
    EDA
    Up/Down Sampling
    Model fitting

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sn
from scipy.stats import norm, boxcox
from collections import Counter
from sklearn.preprocessing import StandardScaler

from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

### Reading Data

In [None]:
# Locate the dataset explicitly instead of depending on loop variables left over
# from an earlier cell: after a directory walk the leftover directory name can
# belong to a later directory than the leftover file name, giving a wrong path.
csv_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
    if name.lower().endswith('.csv')
]
if not csv_paths:
    raise FileNotFoundError("No CSV file found under /kaggle/input")

# Use the last CSV in walk order.
df = pd.read_csv(csv_paths[-1])

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Keep the column index of `df` in `columns` for the plotting loops, then display it.
columns=df.columns
columns

### EDA

In [None]:
# Number of missing values in each column.
df.isna().sum()

No null values are present in the dataset.

In [None]:
# Box plot of every column against wine quality, laid out on a three-wide grid.

n_grid_cols = 3
# Ceiling division: a column count that is not a multiple of three still gets
# enough rows, so no trailing column is silently left unplotted.
rows = max(1, (len(columns) + n_grid_cols - 1) // n_grid_cols)
# squeeze=False keeps `ax` two-dimensional even when the grid has a single row.
fig, ax = plt.subplots(rows, n_grid_cols, figsize=(24, 24), squeeze=False)
axes = ax.ravel()

for axis, column in zip(axes, columns):
    sn.boxplot(x=df["quality"], y=df[column], ax=axis)

# Hide grid cells that have no column to show.
for axis in axes[len(columns):]:
    axis.set_visible(False)

It looks like the residual sugar, chlorides and sulphates features have many outliers.

In [None]:
# Distribution of each column with a fitted normal curve overlaid.

n_grid_cols = 3
# Every column except the last one is plotted.
# NOTE(review): presumably the last column is the `quality` target - confirm.
plot_columns = columns[:-1]
# Ceiling division so the grid always has room for every plotted column.
rows = max(1, (len(plot_columns) + n_grid_cols - 1) // n_grid_cols)
# squeeze=False keeps `ax` two-dimensional even when the grid has a single row.
fig, ax = plt.subplots(rows, n_grid_cols, figsize=(24, 12), squeeze=False)
axes = ax.ravel()

for axis, column in zip(axes, plot_columns):
    # NOTE(review): `distplot` is deprecated in recent seaborn releases; plan a
    # move to `histplot`/`displot` when the environment is upgraded.
    sn.distplot(df[column], fit=norm, ax=axis)

# Hide grid cells that have no column to show.
for axis in axes[len(plot_columns):]:
    axis.set_visible(False)

outlier visualisation

In [None]:
# Residual sugar against alcohol; the marker style encodes the quality score.
fig, scatter_ax = plt.subplots(figsize=(12, 8))
sn.scatterplot(
    data=df,
    x="residual sugar",
    y="alcohol",
    style="quality",
    ax=scatter_ax,
)
scatter_ax.set_title("residual sugar vs alcohol")
scatter_ax.legend()
plt.show()

In [None]:
# Chlorides against alcohol; the marker style encodes the quality score.
fig, scatter_ax = plt.subplots(figsize=(12, 8))
sn.scatterplot(
    data=df,
    x="chlorides",
    y="alcohol",
    style="quality",
    ax=scatter_ax,
)
scatter_ax.set_title("chlorides vs alcohol")
scatter_ax.legend()
plt.show()

As seen in the plots above, there are many outliers.

Since Decision Tree and Random Forest models are robust to outliers, the outliers are not removed.

In [None]:
# Distinct quality scores present in the data, in ascending order.
x = np.sort(df['quality'].unique())
x

In [None]:
# Average of every column for each quality score.
df.groupby('quality').mean()

In [None]:
# Mean of every feature per quality score, one bar chart per feature.

n_grid_cols = 3
# Exclude the target by name so the selection does not depend on column order.
feature_columns = [column for column in columns if column != "quality"]
# Ceiling division so the grid always has room for every feature.
rows = max(1, (len(feature_columns) + n_grid_cols - 1) // n_grid_cols)
# squeeze=False keeps `ax` two-dimensional even when the grid has a single row.
fig, ax = plt.subplots(rows, n_grid_cols, figsize=(24, 24), squeeze=False)
axes = ax.ravel()

for axis, column in zip(axes, feature_columns):
    quality_means = df.groupby("quality")[column].mean()
    # Take the bar positions from the grouped index so they always match the
    # bars, without depending on a variable defined in another cell.
    sn.barplot(
        x=quality_means.index.to_numpy(),
        y=quality_means,
        palette="YlOrBr",
        ax=axis,
    )
    axis.set_title("{} vs quality".format(column))

# Hide grid cells that have no feature to show.
for axis in axes[len(feature_columns):]:
    axis.set_visible(False)

It looks like density has no effect on quality, so the density feature is removed. The remaining features have some effect on quality.

In [None]:
# Remove the density feature. Rebinding `df` with errors="ignore" (rather than an
# in-place drop) keeps this cell safe to re-run: a second run is a no-op instead
# of a KeyError for the already-missing column.
df = df.drop(columns="density", errors="ignore")
df.head()

In [None]:
# Collapse the quality score into three review classes:
#   1 -> quality 3
#   2 -> quality 4 to 7
#   3 -> quality 8

quality = df['quality']
reviews = np.select(
    [quality == 3, quality.between(4, 7), quality == 8],
    [1, 2, 3],
    default=0,  # sentinel for scores that match none of the classes
)

# Fail with a clear message when a score has no class, instead of ending up with
# a label list that is shorter than the frame.
unmapped = sorted(quality[reviews == 0].unique())
if unmapped:
    raise ValueError(
        "quality values with no review class: {}".format(unmapped)
    )

df['Reviews'] = reviews

In [None]:
# Re-read the column index from `df` so the following cells see its current columns.
columns=df.columns

Effect of each feature on quality after grouping into 3 classes

In [None]:
# Mean of every feature per review class, one bar chart per feature.

n_grid_cols = 3
# Exclude the raw score and the derived class by name, not by column position.
feature_columns = [
    column for column in columns if column not in ("quality", "Reviews")
]
# Ceiling division so the grid always has room for every feature.
rows = max(1, (len(feature_columns) + n_grid_cols - 1) // n_grid_cols)
# squeeze=False keeps `ax` two-dimensional even when the grid has a single row.
fig, ax = plt.subplots(rows, n_grid_cols, figsize=(24, 24), squeeze=False)
axes = ax.ravel()

for axis, column in zip(axes, feature_columns):
    review_means = df.groupby("Reviews")[column].mean()
    # Bar positions come from the classes actually present, so the plot still
    # works if one of the three classes has no rows.
    sn.barplot(
        x=review_means.index.to_numpy(),
        y=review_means,
        palette="Blues_r",
        ax=axis,
    )
    axis.set_title("{} vs quality".format(column))

# Hide grid cells that have no feature to show.
for axis in axes[len(feature_columns):]:
    axis.set_visible(False)

In [None]:
# Number of rows for each raw quality score.
df["quality"].value_counts()

In [None]:
# Number of rows for each review class.
df["Reviews"].value_counts()

In [None]:
# Feature matrix: every column except the raw score and the derived class label.
X = df.drop(["quality", "Reviews"], axis=1)

# Target: the review class.
y = df.loc[:, "Reviews"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    train_size=0.7,
    shuffle=True,
    random_state=42,
)

### Distribution after smote(up/down sampling)

The dataset is unbalanced and needs to be balanced.

In [None]:
# Balance the training split: oversample with SMOTE, then randomly undersample
# every class down to the same size. Both samplers are seeded so the resampled
# training set is the same on every run.
RESAMPLE_SEED = 42
SAMPLES_PER_CLASS = 531  # rows kept for each review class after undersampling

print("Train dataset:", Counter(y_train))
oversampler = SMOTE(random_state=RESAMPLE_SEED)
undersampler = RandomUnderSampler(
    sampling_strategy={label: SAMPLES_PER_CLASS for label in (1, 2, 3)},
    random_state=RESAMPLE_SEED,
)

x_train_over, y_train_over = oversampler.fit_resample(x_train, y_train)
print("After applying Smote :", Counter(y_train_over))

x_train_un, y_train_un = undersampler.fit_resample(x_train_over, y_train_over)
print("After applying Undersampling :", Counter(y_train_un))

checking the distribution after SMOTE

In [None]:
# Fixed acidity against alcohol for the oversampled training data; colour and
# marker style both encode the review class.
fig, scatter_ax = plt.subplots(figsize=(12, 8))
sn.scatterplot(
    data=x_train_over,
    x="fixed acidity",
    y="alcohol",
    hue=y_train_over,
    style=y_train_over,
    ax=scatter_ax,
)
scatter_ax.legend(title="Reviews")
plt.show()

Checking the distribution after applying SMOTE and Undersampler

In [None]:
# Fixed acidity against alcohol for the undersampled training data; colour and
# marker style both encode the review class.
fig, scatter_ax = plt.subplots(figsize=(12, 8))
sn.scatterplot(
    data=x_train_un,
    x="fixed acidity",
    y="alcohol",
    hue=y_train_un,
    style=y_train_un,
    ax=scatter_ax,
)
scatter_ax.legend(title="Reviews")
plt.show()

#### Normalising Data

In [None]:
# Standardise the features.
ss = StandardScaler()

# Fit the scaler on the training data only, then apply those same statistics to
# the test data. Re-fitting on the test set would scale it with its own mean and
# variance, so train and test features would no longer be on the same scale.
X_train = ss.fit_transform(x_train_un)
X_test = ss.transform(x_test)

#### fitting model

In [None]:
# Decision Tree

from sklearn import tree

# A fixed seed makes the fitted tree, and the scores printed below, repeatable
# from run to run.
model = tree.DecisionTreeClassifier(random_state=42)

model.fit(X_train, y_train_un)

# Accuracy on the resampled training data, then on the held-out test data.
print(model.score(X_train, y_train_un))
print(model.score(X_test, y_test))

Performing post-pruning with ccp_alpha to reduce overfitting

In [None]:
# Cost-complexity pruning path computed from the training data.
path = model.cost_complexity_pruning_path(X_train, y_train_un)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

In [None]:
# Accumulators for the accuracy recorded at each candidate ccp_alpha.
train_acc, test_acc = [], []

In [None]:
# Start from empty lists inside this cell so that re-running it does not append
# a second set of scores and leave the lists longer than `ccp_alphas`.
train_acc = []
test_acc = []

# Fit one tree per candidate alpha and record its train and test accuracy.
# NOTE(review): the test split is scored here and then used to choose ccp_alpha,
# so the test accuracy reported afterwards is optimistic; consider a separate
# validation split or cross-validation.
for cc_alpha in ccp_alphas:
    # A fixed seed keeps the sweep repeatable from run to run.
    model = tree.DecisionTreeClassifier(ccp_alpha=cc_alpha, random_state=42)

    model.fit(X_train, y_train_un)
    train_acc.append(model.score(X_train, y_train_un))
    test_acc.append(model.score(X_test, y_test))

In [None]:
# Train and test accuracy against ccp_alpha; the last 18 entries of each
# sequence are left out of the plot.
shown = slice(None, -18)

plt.figure(figsize=(12, 8))
acc_ax = plt.gca()
acc_ax.plot(ccp_alphas[shown], train_acc[shown], 'ro-', label="train accuracy")
acc_ax.plot(ccp_alphas[shown], test_acc[shown], 'bo-', label="test accuracy")
acc_ax.set_xlabel("ccp_alpha")
acc_ax.set_ylabel("Accuracy")
acc_ax.legend(title="Legend")
plt.show()

As seen in the graph, the test accuracy begins to drop after ccp_alpha = 0.00116, where the training accuracy reaches its maximum.

So setting ccp_alpha to that value and fitting the model

In [None]:
# Refit the decision tree with the chosen pruning strength. The fixed seed makes
# the fitted tree and the printed scores repeatable.
CHOSEN_CCP_ALPHA = 0.00116

model = tree.DecisionTreeClassifier(ccp_alpha=CHOSEN_CCP_ALPHA, random_state=42)

model.fit(X_train, y_train_un)

# Accuracy on the resampled training data, then on the held-out test data.
print(model.score(X_train, y_train_un))
print(model.score(X_test, y_test))

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

# A fixed seed makes the forest, and therefore the reported accuracy, repeatable
# from run to run.
model1 = RandomForestClassifier(random_state=42)

model1.fit(X_train, y_train_un)

# Accuracy on the held-out test data.
model1.score(X_test, y_test)

Grouping the data well results in good performance.

when grouped,

    1 = quality 3 and 4
    2= quality 5,6,7
    3 = quality 8

resulted in test accuracy around 70%

but when the data was grouped as,

    1 = quality 3 
    2= quality 4,5,6,7
    3 = quality 8
resulted in test accuracy around 85%