In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the red wine quality CSV from the Kaggle input directory into a DataFrame
# and display it as the cell output.
df = pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
df

We have imported the data successfully. The dataset contains 1599 rows and 12 columns, including the target variable, `quality`. Next, we will explore the dataset.

Let's check the dataset's summary statistics, i.e. mean, median (50%), 25%, max, min, etc.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

In [None]:
# Plot a 50-bin histogram of every column to inspect the feature distributions.
df.hist(bins=50, figsize=(20, 18))


In the above histograms, we can see that most features are either skewed to the right or roughly symmetric.
Now, we will check the correlations in the data.

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
# The matrix is computed once and reused; the original computed it twice and
# discarded the first result (it was not the cell's last expression).
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corr_matrix, annot=True, ax=ax)

From the above correlation matrix, we can see that the variables alcohol, sulphates, citric acid and fixed acidity are more strongly correlated than the other variables.

We will check whether any null values are present in the dataset.

In [None]:
# Count the missing values in each column.
df.isnull().sum()

There are no null values in the dataset. Now we will check for outliers.

In [None]:
# Box plot of fixed acidity for each quality score.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional x/y call raises a TypeError there.
plt.figure(figsize=(12,10))
sns.boxplot(x=df['quality'], y=df['fixed acidity'], orient='v', color='grey')

In [None]:
# Box plot of volatile acidity for each quality score.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional x/y call raises a TypeError there.
plt.figure(figsize=(12,10))
sns.boxplot(x=df['quality'], y=df['volatile acidity'], orient='v', color='red')

In [None]:
# Box plot of citric acid for each quality score.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional x/y call raises a TypeError there.
plt.figure(figsize=(12,10))
sns.boxplot(x=df['quality'], y=df['citric acid'], orient='v', color='purple')

In [None]:
# Box plot of citric acid for each quality score (gold colour).
# NOTE(review): this plots the same column as the preceding citric acid cell,
# only in a different colour — confirm whether the duplicate is intended.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional x/y call raises a TypeError there.
plt.figure(figsize=(12,10))
sns.boxplot(x=df['quality'], y=df['citric acid'], orient='v', color='gold')

In [None]:
# Box plot of residual sugar for each quality score.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional x/y call raises a TypeError there.
plt.figure(figsize=(12,10))
sns.boxplot(x=df['quality'], y=df['residual sugar'], orient='v', color='blue')

In [None]:
# Box plot of chlorides for each quality score.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional x/y call raises a TypeError there.
plt.figure(figsize=(12,10))
sns.boxplot(x=df['quality'], y=df['chlorides'], orient='v', color='green')

In [None]:
# Box plot of free sulfur dioxide for each quality score.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional x/y call raises a TypeError there.
plt.figure(figsize=(12,10))
sns.boxplot(x=df['quality'], y=df['free sulfur dioxide'], orient='v', color='red')

In [None]:
# Box plot of total sulfur dioxide for each quality score.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional x/y call raises a TypeError there.
plt.figure(figsize=(12,10))
sns.boxplot(x=df['quality'], y=df['total sulfur dioxide'], orient='v', color='violet')

In [None]:
# Box plot of density for each quality score.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional x/y call raises a TypeError there.
plt.figure(figsize=(12,10))
sns.boxplot(x=df['quality'], y=df['density'], orient='v', color='indigo')

In [None]:
# Box plot of pH for each quality score.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional x/y call raises a TypeError there.
plt.figure(figsize=(12,10))
sns.boxplot(x=df['quality'], y=df['pH'], orient='v', color='khaki')

In [None]:
# Box plot of sulphates for each quality score.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional x/y call raises a TypeError there.
plt.figure(figsize=(12,10))
sns.boxplot(x=df['quality'], y=df['sulphates'], orient='v', color='lime')

In [None]:
# Box plot of alcohol for each quality score.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional x/y call raises a TypeError there.
plt.figure(figsize=(12,10))
sns.boxplot(x=df['quality'], y=df['alcohol'], orient='v', color='tomato')

**Let's remove the outliers using the Z-score**

In [None]:
# Absolute z-score of every value in df, used in the following cells to flag outliers.
# NOTE(review): the scores are computed over all columns of df, which still
# includes the `quality` target at this point — confirm that is intended.
from scipy import stats
z = np.abs(stats.zscore(df))
z

In [None]:
# Cut-off on the absolute z-score above which a value is treated as an outlier.
threshold = 3
# Row and column positions of every value whose |z| exceeds the cut-off.
# The named constant is used here instead of repeating the literal 3.
print(np.where(z > threshold))

In [None]:
# Keep only the rows in which every column's |z| is below the cut-off,
# reusing `threshold` (defined in the previous cell) rather than repeating the literal 3.
df_new = df[(z < threshold).all(axis=1)]

In [None]:
# Compare the row/column counts before and after the outlier filter.
for frame in (df, df_new):
    print(frame.shape)

In [None]:
# Continue with the filtered data: rebind `df` to an independent copy of df_new.
# NOTE(review): after this cell the unfiltered frame is no longer reachable as `df`.
df = df_new.copy()
df.shape

In [None]:
# Inspect how many wines fall into each quality score before turning
# `quality` into a two-class (binary) target in the next cell.
df['quality'].value_counts()

In [None]:
# Binarise `quality`: scores in (2, 6] get label 0 and scores in (6, 8] get label 1
# (pd.cut intervals are right-closed by default).
# NOTE(review): this overwrites df['quality'] in place, so the cell is meant to
# be run once per kernel session.
bins=[2,6,8]
labels=[0,1]
df['quality']=pd.cut(x=df['quality'], bins=bins, labels=labels)

In [None]:
# Class counts of the `quality` column as it now stands.
df['quality'].value_counts()

In [None]:
# Separate the feature columns (x) from the target column `quality` (y)
# and print both shapes as a sanity check.
x = df.drop(columns=['quality'])
y = df['quality']
print(x.shape)
print(y.shape)

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified on y — consider whether the class
# proportions should be preserved in both splits.
from sklearn.model_selection import train_test_split
x_tr, x_tst, y_tr, y_tst = train_test_split(x, y, test_size=0.2, random_state=3)

In [None]:
# The training classes are imbalanced, so resample the training split with SMOTE.
# Only x_tr / y_tr are resampled; the test split is left untouched.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=3)
x_tr_sm, y_tr_sm = sm.fit_resample(x_tr, y_tr)

In [None]:
# Show the class counts of the training target before and after resampling.
import collections
for caption, target in (("Before SMOTE:", y_tr), ("After SMOTE:", y_tr_sm)):
    print(caption, collections.Counter(target))

In [None]:
# Standardise the features: the scaler is fitted on the resampled training data only,
# and the same fitted transformation is then applied to the test features.
# NOTE(review): x_tr_sm and x_tst are overwritten with their scaled versions here.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_tr_sm = scaler.fit_transform(x_tr_sm)
x_tst = scaler.transform(x_tst)

In [None]:
# Fit a random forest classifier on the resampled, scaled training data.
# random_state is pinned to the same seed (3) used for the train/test split and
# SMOTE so that the fitted forest, and every metric derived from it, is
# reproducible across runs.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=18, bootstrap=False, random_state=3)
rf.fit(x_tr_sm, y_tr_sm)

In [None]:
# Predict the class label of every test sample with the fitted random forest.
y_pred_rf = rf.predict(x_tst)
y_pred_rf

In [None]:
# Accumulates the test accuracy of each model; the evaluation cells append to it.
# NOTE(review): re-running an evaluation cell without re-running this one
# appends a duplicate entry.
results = []

In [None]:
# Evaluate the random forest on the held-out test set: confusion matrix,
# accuracy, and a plotted confusion matrix.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# removed in scikit-learn 1.2. The plot_roc_curve / roc_curve imports were unused
# in this cell and are dropped because importing plot_roc_curve fails on those
# versions as well.
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)

cm = confusion_matrix(y_tst, y_pred_rf)

acc = accuracy_score(y_tst, y_pred_rf)
# rf.score on a classifier is also the mean accuracy; it is printed alongside acc.
score = rf.score(x_tst, y_tst)
results.append(acc)

print("Score : ", score)
print("RandomForestClassifier Acc : ", acc)

ConfusionMatrixDisplay.from_estimator(rf, x_tst, y_tst, cmap="pink")
plt.show()

In [None]:
# Text summary of the per-class classification metrics for the random forest on the test set.
print(" \t \t  RandomForestClassifier Classification Report")
print(classification_report(y_tst, y_pred_rf))

In [None]:
# Second model for comparison: logistic regression with default settings,
# fitted on the same resampled, scaled training data as the random forest.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_tr_sm, y_tr_sm)

In [None]:
# Predict the class label of every test sample with the fitted logistic regression.
y_pred_lr = lr.predict(x_tst)
y_pred_lr

In [None]:
# Evaluate the logistic regression on the held-out test set: confusion matrix,
# accuracy, and a plotted confusion matrix.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# removed in scikit-learn 1.2.
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)

cm_lr = confusion_matrix(y_tst, y_pred_lr)

acc_lr = accuracy_score(y_tst, y_pred_lr)
# lr.score on a classifier is also the mean accuracy; it is printed alongside acc_lr.
score_lr = lr.score(x_tst, y_tst)
results.append(acc_lr)

print("Score : ", score_lr)
print("Logistic Regression Classifier Acc : ", acc_lr)

ConfusionMatrixDisplay.from_estimator(lr, x_tst, y_tst, cmap="hot")
plt.show()

In [None]:
# Text summary of the per-class classification metrics for the logistic regression on the test set.
print(" \t \t  Logistic Regression  Classification Report")
print(classification_report(y_tst, y_pred_lr))

In [None]:
#ROC AUC Curve for Random Forest Classifier
from sklearn.metrics import plot_roc_curve
roc_rf = plot_roc_curve(rf, x_tst, y_tst)
roc_rf

In [None]:
#ROC AUC Curev for Logistic Regression Classifer
roc_lr = plot_roc_curve(lr, x_tst, y_tst)
roc_lr

**Result**:
From the above, we can see that the Random Forest Classifier performs best of the two algorithms, which concludes the analysis.
