In [None]:
# Importing necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Read the red-wine quality CSV; `data` holds the parsed file and `df` is the
# DataFrame built from it that the cells below work on.
csv_path = "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
data = pd.read_csv(csv_path)
df = pd.DataFrame(data=data)

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Print, for every column, the percentage of its values that are NaN.

for column in df.columns:
    missing_pct = df[column].isna().mean() * 100
    print(column, "\t-\t", missing_pct)


> Since the data doesn't contain any null values, we can move on.

In [None]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Bar chart of how many samples fall into each quality class of the raw data.
plt.figure(figsize=(5,5))
ax = sns.countplot(x='quality', data=df)

# Label every bar with its count. The label is anchored at the top of the bar
# and nudged up by a few points, rather than by a fixed data-unit offset: an
# anchor above the y-axis limit is hidden by matplotlib's default annotation
# clipping, which would drop the labels of the tallest bars.
for p in ax.patches:
    count = p.get_height()
    ax.annotate(
        '{:.0f}'.format(count),
        (p.get_x() + p.get_width() / 2, count),
        xytext=(0, 3),
        textcoords='offset points',
        ha='center',
        va='bottom',
    )


> This bar graph clearly shows how imbalanced the data is: less than 1% of the samples are in class __3__. So, first, we have to balance the data in order to get more reliable predictions.
> For that we use both undersampling and oversampling.


> First, we undersample classes 5 and 6 down to the size of class 7.

In [None]:
# Undersample the two largest classes (5 and 6) down to the size of class 7.
# Every random draw is seeded (random_state=42) so the same rows are selected
# on each run and the results below are reproducible.
class_7 = df[df['quality'] == 7]                            # Reference class: 5 and 6 are cut down to its size.

class_5 = df[df['quality'] == 5].sample(n=len(class_7), random_state=42)   # Random subset of class 5
class_6 = df[df['quality'] == 6].sample(n=len(class_7), random_state=42)   # Random subset of class 6

# Classes 3, 4 and 8 are kept in full; the combined frame is then shuffled.
untouched = df[df['quality'].isin([3, 4, 8])]
new_df = pd.concat([untouched, class_5, class_6, class_7]).sample(frac=1, random_state=42)
new_df.head()

In [None]:
# Bar chart of the class counts after undersampling.
plt.figure(figsize=(5,5))
ax = sns.countplot(x='quality', data=new_df)

# Label every bar with its count. The label is anchored at the top of the bar
# and nudged up by a few points, rather than by a fixed data-unit offset: an
# anchor above the y-axis limit is hidden by matplotlib's default annotation
# clipping, which would drop the labels of the tallest bars.
for p in ax.patches:
    count = p.get_height()
    ax.annotate(
        '{:.0f}'.format(count),
        (p.get_x() + p.get_width() / 2, count),
        xytext=(0, 3),
        textcoords='offset points',
        ha='center',
        va='bottom',
    )


> Now we oversample the remaining, smaller classes up to that same level.

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate rows of the smaller classes until every class has as many
# samples as the largest one. Seeded so the duplicated rows are the same on
# every run.
# NOTE(review): this resampling runs before the train/test split further down,
# so copies of one original row can end up in both the training and the test
# set, which makes the reported accuracy optimistic.
oversample = RandomOverSampler(random_state=42)
x, y = oversample.fit_resample(new_df.drop(['quality'], axis=1), new_df['quality'])

# Rebuild a single frame: the resampled features plus the matching labels.
new_df = pd.DataFrame(x, columns=df.drop(['quality'], axis=1).columns)
new_df['quality'] = y

new_df.head()

In [None]:
# Bar chart of the class counts after oversampling.
plt.figure(figsize=(5,5))
ax = sns.countplot(x='quality', data=new_df)

# Label every bar with its count. The label is anchored at the top of the bar
# and nudged up by a few points, rather than by a fixed data-unit offset: an
# anchor above the y-axis limit is hidden by matplotlib's default annotation
# clipping, and with equally tall bars that would hide every label.
for p in ax.patches:
    count = p.get_height()
    ax.annotate(
        '{:.0f}'.format(count),
        (p.get_x() + p.get_width() / 2, count),
        xytext=(0, 3),
        textcoords='offset points',
        ha='center',
        va='bottom',
    )


> Here we can see that all the classes are balanced.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of the
# balanced frame, drawn on a dedicated 10x10 figure.
fig, ax = plt.subplots(figsize=(10, 10))
cormap = new_df.corr()
sns.heatmap(data=cormap, annot=True, ax=ax)

In [None]:
# Target: the quality label. Features: every other column.
y = new_df['quality']
X = new_df.drop(columns=['quality'])

In [None]:
# Standardise every feature column (StandardScaler removes the mean and scales
# to unit variance), keeping the original column names.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)
X.head()

In [None]:
# Hold out 10% of the rows as a test set.
# random_state makes the split reproducible; stratify=y keeps the class
# proportions the same in the training and test parts.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Build a decision tree with default settings and list its hyper-parameters,
# as a reference for the grid defined next.
clf = DecisionTreeClassifier()
clf.get_params(deep=True)

In [None]:
from sklearn.model_selection import GridSearchCV

# Number of features to consider at every split.
# 'auto' is intentionally not listed: for DecisionTreeClassifier it was an
# alias of 'sqrt' and is rejected by current scikit-learn releases, so keeping
# it only produced duplicate or failing candidates.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = list(range(10, 111, 10))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Parameter grid; every combination of these values is evaluated by the
# grid search (it is exhaustive, not a random sample).
grid = {'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        }

In [None]:
# Exhaustive 5-fold cross-validated search over `grid`.
# The tree is seeded because, with max_features below the full feature count,
# the features tried at each split are drawn at random; without a seed the
# selected best parameters change from run to run.
clf = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), param_grid=grid, cv=5)
clf.fit(X_train, y_train)   # fit() returns the search object itself, so no rebinding is needed
clf.best_params_

In [None]:
# `clf` was already fitted by the grid search in the cell above; running the
# identical search a second time would only repeat the same expensive work.
# Report the result of that search instead.
print("Best cross-validated score: {0}".format(clf.best_score_))
clf.best_params_

In [None]:
# Predict the held-out rows and put true and predicted labels side by side.
y_pred = clf.predict(X_test)

pred_df = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)
pred_df.head()


In [None]:
from sklearn.metrics import confusion_matrix

# Sorted set of quality classes present in the true or predicted labels; passed
# explicitly so the matrix rows/columns and the tick labels are guaranteed to
# line up.
class_labels = np.union1d(y_test, y_pred)

# Rows are the true classes, columns the predicted classes.
mat = confusion_matrix(y_test, y_pred, labels=class_labels)
fig, ax = plt.subplots(figsize=(5,5))
# fmt='d' prints the counts as plain integers instead of the default '.2g'.
sns.heatmap(mat, annot=True, fmt='d', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted quality')
ax.set_ylabel('Actual quality')

In [None]:
from sklearn import metrics

# Measure the accuracy score on the held-out test set.
# accuracy_score is documented as (y_true, y_pred); the value is the same in
# either order, but the documented order is used so the call stays correct if
# it is ever swapped for an asymmetric metric.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy score of the predictions: {0}".format(accuracy))
