In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<center> <img src="https://user-images.githubusercontent.com/47216809/86511259-ef33e380-be14-11ea-8ea3-3a3cc4ff5ce8.jpg" width=50%> </center>

The following algorithms are used in this notebook.

| Algorithms      | 
| ----------- | 
| Decision Tree      | 
| Random Forest   | 
| XGBoost     | 
| KNeighbours  | 
| SVM    | 
| AdaBoost   | 

* Importing all the libraries required for this notebook

In [None]:
import numpy as np
import pandas as pf
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")

* It is considered good practice to work on a copy of the original dataset.

In [None]:
# Read the CSV once into `main_df` and leave it untouched;
# all exploration and preprocessing happens on the deep copy `df`.
main_df = pd.read_csv(
    '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)
df = main_df.copy(deep=True)
df.head(5)

### Exploring dataset

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple

df.shape

In [None]:
# Names of all columns in the working copy

df.columns

In [None]:
# Per-column dtype and non-null count, plus the dataframe's memory usage

df.info()

In [None]:
# Number of distinct values in each column

df.nunique()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column

df.describe()

In [None]:
# Count of missing values in each column

df.isna().sum()

## Visualization 

In [None]:
# Visual missing-value check: draw the boolean null mask as a heatmap.
plt.figure(figsize=(7, 5))
null_mask = df.isna()
sns.heatmap(null_mask)

In [None]:
# Pairwise correlation between all columns, annotated with the coefficients.
plt.figure(figsize=(10, 8))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Grid of pairwise plots for every pair of columns in df.
sns.pairplot(df)

In [None]:
# Free vs. total sulfur dioxide, coloured by the quality score.
sulfur_scatter = px.scatter(
    df,
    x='free sulfur dioxide',
    y='total sulfur dioxide',
    color=df['quality'],
)
sulfur_scatter

In [None]:
# 2-D density of alcohol content against quality score.
diag = px.density_heatmap(
    df,
    x="alcohol",
    y="quality",
    nbinsx=25,
    nbinsy=10,
    color_continuous_scale="thermal",
)
diag.show()

In [None]:
# Donut chart of how many wines fall into each quality score.
# The slices are sized by row counts; passing values='quality' would instead
# sum the scores themselves, inflating the share of higher-scored wines.
quality_counts = df['quality'].value_counts().sort_index()
diag = px.pie(
    names=quality_counts.index,
    values=quality_counts.values,
    hole=0.5,
)
diag.show()

In [None]:
# Take labels and counts from the same value_counts() result so each slice is
# paired with its own count. unique() returns values in first-appearance order
# while value_counts() is sorted by frequency, so mixing the two only lines up
# by coincidence.
values = df['quality'].value_counts()
labels = values.index

# pull is given as a fraction of the pie radius
diag = go.Figure(data=[go.Pie(labels=labels, values=values, pull=[0, 0.1, 0.2, 0.2, 0.2])])
diag.show()

In [None]:
# Histogram of citric acid coloured by quality, with a violin marginal.
diag = px.histogram(
    df,
    x="citric acid",
    color="quality",
    marginal="violin",
)
diag.update_traces(opacity=0.9)
diag.show()

In [None]:
# Histogram of sulphates coloured by quality, with a violin marginal.
diag = px.histogram(
    df,
    x="sulphates",
    color="quality",
    marginal="violin",
)
diag.update_traces(opacity=0.9)
diag.show()

In [None]:
# Histogram of pH coloured by quality, with a violin marginal.
diag = px.histogram(
    df,
    x="pH",
    color="quality",
    marginal="violin",
)
diag.update_traces(opacity=0.9)
diag.show()

In [None]:
# Density contour of pH, one trace group per quality value.
diag = px.density_contour(
    df,
    x="pH",
    color="quality",
)
diag.update_traces(opacity=0.9)
diag.show()

In [None]:
# Histogram of volatile acidity coloured by quality.
diag = px.histogram(
    df,
    x="volatile acidity",
    color="quality",
)
diag.update_traces(opacity=0.9)
diag.show()

In [None]:
# Box plot of every column side by side.
# `factorplot` was renamed to `catplot` and its `size` argument to `height`;
# the old names are deprecated and no longer exist in recent seaborn releases.
sns.catplot(data=df, kind='box', height=10, aspect=1.5)

In [None]:
# Bar plot of quality for each fixed-acidity value.
# `factorplot`/`size` are the deprecated names of `catplot`/`height`.
# Columns are referenced by name because `data=df` is supplied.
sns.catplot(x='fixed acidity', y='quality', data=df,
            kind='bar', height=12, aspect=1.5)

In [None]:
# Violin plot of volatile acidity, one violin per quality value.
diag = px.violin(
    df,
    x="volatile acidity",
    color="quality",
)
diag.update_traces(opacity=0.9)
diag.show()

### Preprocessing data

In [None]:
# Collapse the numeric quality score into two classes (intervals are right-closed):
#   (2, 6.5] -> 'not good'      (6.5, 8] -> 'good'
# The bins are applied to the untouched scores in `main_df` rather than to
# `df['quality']`, so re-running this cell (or running it after the label
# encoding step) reproduces the same labels instead of turning an
# already-converted column into NaN.
bins = (2, 6.5, 8)
group_names = ['not good', 'good']
df['quality'] = pd.cut(main_df['quality'], bins=bins, labels=group_names)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoder used to turn the string quality labels into integer class ids.
label_encod = LabelEncoder()

In [None]:
# Replace the 'good' / 'not good' labels with integer codes in place.
# NOTE(review): LabelEncoder numbers classes in sorted label order, so this
# should give 'good' -> 0 and 'not good' -> 1; confirm with label_encod.classes_
# before reading the per-class metrics further down.
df.quality = label_encod.fit_transform(df.quality)

In [None]:
# Features are every column except the target; the target is the quality column.
X = df.drop(columns='quality')
y = df['quality']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Model Building 

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree limited to depth 4, with a fixed seed for repeatable results

model_dt = DecisionTreeClassifier(max_depth=4, random_state=42)

In [None]:
# Fit the decision tree on the training split.
model_dt.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split.
pred_dt = model_dt.predict(X_test)

In [None]:
# Test-set accuracy of the decision tree as a percentage, rounded to 2 decimals.
dt = round(100 * accuracy_score(y_test, pred_dt), 2)
print(dt)

In [None]:
# Per-class precision, recall and F1 for the decision tree on the test split.
print(classification_report(y_test, pred_dt))

In [None]:
# NOTE(review): this cell prints the same decision-tree report as the cell
# directly before it; one of the two can be removed.
print(classification_report(y_test, pred_dt))

In [None]:
# Confusion matrix for the decision tree, shown as each cell's share of all test rows.
cm2 = confusion_matrix(y_test, pred_dt)
cm2_share = cm2 / cm2.sum()
sns.heatmap(cm2_share, annot=True, fmt='0.2%', cmap='Reds')
plt.title("Decision Tree Classifier Confusion Matrix", fontsize=12)
plt.show()

###  Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 300 trees and a fixed seed.
# NOTE(review): a float min_samples_leaf is treated by scikit-learn as a fraction
# of the training samples, so every leaf must hold at least 16% of them — this
# looks very restrictive; confirm it is intended.
model_rf = RandomForestClassifier(n_estimators=300,min_samples_leaf=0.16, random_state=42)

In [None]:
# Fit the random forest on the training split
model_rf.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split
pred_rf = model_rf.predict(X_test)

In [None]:
# Test-set accuracy of the random forest as a percentage, rounded to 2 decimals.
rf = round(100 * accuracy_score(y_test, pred_rf), 2)
print(rf)

In [None]:
# Per-class precision, recall and F1 for the random forest on the test split.
print(classification_report(y_test,pred_rf))

In [None]:
# Confusion matrix for the random forest, shown as each cell's share of all test rows.
cm3 = confusion_matrix(y_test, pred_rf)
cm3_share = cm3 / cm3.sum()
sns.heatmap(cm3_share, annot=True, fmt='0.2%', cmap='Reds')
plt.title("RandomForest Classifier Confusion Matrix", fontsize=12)
plt.show()

### XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees: 125 estimators of depth up to 8 with a 0.03 learning
# rate, a fixed seed and n_jobs=5

model_xgb = XGBClassifier(max_depth= 8, n_estimators= 125, random_state= 0,  learning_rate= 0.03, n_jobs=5)

In [None]:
# Fit the XGBoost classifier on the training split

model_xgb.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split

pred_xgb = model_xgb.predict(X_test)

In [None]:
# Test-set accuracy of XGBoost as a percentage, rounded to 2 decimals.
# NOTE: the name `xgb` holds this score, not the xgboost module.

xgb = round(100 * accuracy_score(y_test, pred_xgb), 2)
print(xgb)

In [None]:
# Per-class precision, recall and F1 for XGBoost on the test split.
print(classification_report(y_test,pred_xgb))

In [None]:
# Confusion matrix for XGBoost, shown as each cell's share of all test rows.
cm4 = confusion_matrix(y_test, pred_xgb)
cm4_share = cm4 / cm4.sum()
sns.heatmap(cm4_share, annot=True, fmt='0.2%', cmap='Reds')
plt.title("XGBoost Classifier Confusion Matrix", fontsize=12)
plt.show()

### KNeighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier voting over the 9 closest training rows.
# NOTE(review): no cell in this notebook scales the features before this
# distance-based model — consider standardising X first.
model_kn = KNeighborsClassifier(n_neighbors=9, leaf_size=20)

In [None]:
# Fit the k-nearest-neighbours classifier on the training split
model_kn.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split
pred_kn = model_kn.predict(X_test)

In [None]:
# Test-set accuracy of k-nearest-neighbours as a percentage, rounded to 2 decimals.
kn = round(100 * accuracy_score(y_test, pred_kn), 2)
print(kn)

In [None]:
# Per-class precision, recall and F1 for k-nearest-neighbours on the test split.
print(classification_report(y_test,pred_kn))

In [None]:
# Confusion matrix for k-nearest-neighbours, shown as each cell's share of all test rows.
cm5 = confusion_matrix(y_test, pred_kn)
cm5_share = cm5 / cm5.sum()
sns.heatmap(cm5_share, annot=True, fmt='0.2%', cmap='Reds')
plt.title("KN Classifier Confusion Matrix", fontsize=12)
plt.show()

### SVM

In [None]:
from sklearn.svm import SVC, LinearSVC

In [None]:
# Support-vector classifier with an RBF kernel.
# NOTE(review): per the scikit-learn docs, SVC's random_state only affects the
# probability estimates (probability=True), so it likely has no effect here.
model_svm = SVC(kernel='rbf', random_state = 42)

In [None]:
# Fit the SVM on the training split.
model_svm.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split

pred_svm = model_svm.predict(X_test)

In [None]:
# Test-set accuracy of the SVM as a percentage, rounded to 2 decimals.
sv = round(100 * accuracy_score(y_test, pred_svm), 2)
print(sv)

In [None]:
# Per-class precision, recall and F1 for the SVM on the test split.
# This previously reported `pred_kn`, i.e. the k-nearest-neighbours predictions,
# under the SVM heading.
print(classification_report(y_test, pred_svm))

In [None]:
# Confusion matrix for the SVM, shown as each cell's share of all test rows.
cm6 = confusion_matrix(y_test, pred_svm)
cm6_share = cm6 / cm6.sum()
sns.heatmap(cm6_share, annot=True, fmt='0.2%', cmap='Reds')
plt.title("SVM Classifier Confusion Matrix", fontsize=12)
plt.show()

### AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost ensemble of 205 estimators with a fixed seed.
# NOTE(review): learning_rate=0.002 is very small — confirm this value was tuned.
model_ada = AdaBoostClassifier(learning_rate= 0.002,n_estimators= 205,random_state=42)

In [None]:
# Fit AdaBoost on the training split.
model_ada.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split

pred_ada = model_ada.predict(X_test)

In [None]:
# Test-set accuracy of AdaBoost as a percentage, rounded to 2 decimals.

ada = round(100 * accuracy_score(y_test, pred_ada), 2)
print(ada)

In [None]:
# Per-class precision, recall and F1 for AdaBoost on the test split.
print(classification_report(y_test,pred_ada))

In [None]:
# Confusion matrix for AdaBoost, shown as each cell's share of all test rows.
cm7 = confusion_matrix(y_test, pred_ada)
cm7_share = cm7 / cm7.sum()
sns.heatmap(cm7_share, annot=True, fmt='0.2%', cmap='Reds')
plt.title("Adaboost Classifier Confusion Matrix", fontsize=12)
plt.show()

In [None]:
# Collect every model's test accuracy (in percent) in one table.
models = pd.DataFrame({
    'Model':[ 'Decision Tree', 'Random Forest', 'XGBoost', 'KNeighbours', 'SVM', 'AdaBoost'],
    'Accuracy_score' :[dt, rf, xgb, kn, sv, ada]
})

# Rank once and reuse the result so the chart and the table list the models
# in the same best-to-worst order. The stray bare `models` expression that
# used to sit mid-cell displayed nothing and has been dropped.
models_ranked = models.sort_values(by='Accuracy_score', ascending=False)

sns.barplot(x='Accuracy_score', y='Model', data=models_ranked)
plt.show()

models_ranked

#### Conclusion: The bar chart and table above compare the test-set accuracy of the different models used in this notebook.