# Observe the Data

Read Datasets

In [None]:
import pandas as pd

# Read the red-wine quality CSV from the relative input path into a DataFrame
# and show its first rows as a sanity check.
df = pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Print each column's dtype and non-null count.
df.info()

Check Null Values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

Check Na Values

In [None]:
# isna() is pandas' alias for isnull(), so this reports the same per-column
# missing-value counts as the isnull() check.
df.isna().sum()

# EDA

Import Library

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fixed acidity aggregated per quality score, drawn on a 10x6 inch figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='fixed acidity', ax=fig.gca())

In [None]:
# Volatile acidity aggregated per quality score, drawn on a 10x6 inch figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='volatile acidity', ax=fig.gca())

In [None]:
# Citric acid aggregated per quality score, drawn on a 10x6 inch figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='citric acid', ax=fig.gca())

In [None]:
# Residual sugar aggregated per quality score, drawn on a 10x6 inch figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='residual sugar', ax=fig.gca())

In [None]:
# Chlorides aggregated per quality score, drawn on a 10x6 inch figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='chlorides', ax=fig.gca())

In [None]:
# Free sulfur dioxide aggregated per quality score, drawn on a 10x6 inch figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='free sulfur dioxide', ax=fig.gca())

In [None]:
# Total sulfur dioxide aggregated per quality score, drawn on a 10x6 inch figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='total sulfur dioxide', ax=fig.gca())

In [None]:
# Density aggregated per quality score, drawn on a 10x6 inch figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='density', ax=fig.gca())

In [None]:
# pH aggregated per quality score, drawn on a 10x6 inch figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='pH', ax=fig.gca())

In [None]:
# Sulphates aggregated per quality score, drawn on a 10x6 inch figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='sulphates', ax=fig.gca())

In [None]:
# Alcohol aggregated per quality score, drawn on a 10x6 inch figure.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='alcohol', ax=fig.gca())

# Data Preprocessing

In [None]:
# Turn the numeric quality score into a binary target with two categories (bad, good).
# pd.cut uses right-closed intervals by default, so scores in (2, 6.5] are
# labelled 'bad' and scores in (6.5, 8] are labelled 'good'; a score outside
# (2, 8] would become NaN.
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
df['quality'] = pd.cut(df['quality'], bins = bins, labels = group_names)

In [None]:
# Preview the frame now that quality holds the 'bad'/'good' labels.
df.head()

Label Encoder

In [None]:
# Encoder used to map the categorical quality labels to integers.
from sklearn.preprocessing import StandardScaler, LabelEncoder
le = LabelEncoder()

In [None]:
# LabelEncoder numbers the classes in sorted order, so 'bad' -> 0 and 'good' -> 1.
df['quality'] = le.fit_transform(df['quality'])
df.head()

In [None]:
# Class balance of the encoded quality target.
# The column is passed by keyword: seaborn >= 0.12 reserves the first positional
# argument for `data`, so a bare Series is no longer interpreted as `x`.
sns.countplot(x='quality', data=df)

In [None]:
# Exact number of samples in each encoded quality class.
df['quality'].value_counts()

In [None]:
# Separate the encoded quality target (y) from the remaining feature columns (X).
y = df['quality']
X = df.drop(columns='quality')

# Modelling

## The model we will use

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

Split Data

In [None]:
# 80% : 20%
# Stratify on y so the train and test sets keep the same class ratio; a purely
# random split can shift the minority-class share in the 20% test set. This also
# matches the stratified folds cross_val_score uses for classifiers.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Import cross_val_score and accuracy_score

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

## Random Forest Classifier

![](https://miro.medium.com/max/1170/1*VY3lEFysaQ0nnV_zkxyU-w.png)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: fit on the training split, then record the hold-out
# accuracy and the mean accuracy over 10 cross-validation folds of the training data.
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)

rfc_acc = accuracy_score(y_test, rfc.predict(X_test))
rfc_cv = np.mean(
    cross_val_score(rfc, X_train, y_train, cv=10, scoring='accuracy')
)

print('Random Forest Classifier accuracy: {:.5f}'.format(rfc_acc))
print('Random Forest Classifier Cross Val Score : {:.5f}'.format(rfc_cv))

## Bagging Classifier

![](https://vitalflux.com/wp-content/uploads/2020/09/Screenshot-2020-09-08-at-4.17.30-PM.png)

In [None]:
from sklearn.ensemble import BaggingClassifier

# Baseline bagging ensemble. The estimator is seeded so that the hold-out and
# cross-validation scores are reproducible between runs, as they are for the
# random forest baseline.
bgc = BaggingClassifier(random_state=42)
bgc.fit(X_train, y_train)

# Hold-out accuracy on the test split and mean accuracy over 10 CV folds of the training split.
bgc_acc = accuracy_score(y_test, bgc.predict(X_test))
bgc_cv = cross_val_score(bgc, X_train, y_train, cv=10, scoring='accuracy').mean()

print('BaggingClassifier accuracy: {:.5f}'.format(bgc_acc))
print('BaggingClassifier Cross Val Score : {:.5f}'.format(bgc_cv))

## LGBM Classifier

![](https://images.akira.ai/glossary/lightgbm-boosting%20framework-akira-ai.png)

In [None]:
from lightgbm import LGBMClassifier

# Baseline LightGBM with default settings: fit, then record the hold-out accuracy
# and the mean accuracy over 10 cross-validation folds of the training data.
lgb = LGBMClassifier().fit(X_train, y_train)

lgb_acc = accuracy_score(y_test, lgb.predict(X_test))
lgb_cv = np.mean(
    cross_val_score(lgb, X_train, y_train, cv=10, scoring='accuracy')
)

print('LightGBM accuracy: {:.5f}'.format(lgb_acc))
print('LGBM Cross Val Score : {:.5f}'.format(lgb_cv))

## Extra Tree Classifier

![](https://media.springernature.com/original/springer-static/image/chp%3A10.1007%2F978-3-030-26142-9_11/MediaObjects/482374_1_En_11_Fig1_HTML.png)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Baseline extra-trees ensemble. The estimator is seeded so that the hold-out
# and cross-validation scores are reproducible between runs, as they are for
# the random forest baseline.
etc = ExtraTreesClassifier(random_state=42)
etc.fit(X_train, y_train)

# Hold-out accuracy on the test split and mean accuracy over 10 CV folds of the training split.
etc_acc = accuracy_score(y_test, etc.predict(X_test))
etc_cv = cross_val_score(etc, X_train, y_train, cv=10, scoring='accuracy').mean()

print('ExtraTreeClassifier accuracy: {:.5f}'.format(etc_acc))
print('ExtraTreeClassifier Cross Val Score : {:.5f}'.format(etc_cv))

## XGB Classifier

![](https://www.researchgate.net/publication/343566690/figure/fig2/AS:932532258369537@1599344424444/Comparison-between-XGBoost-level-wise-horizontal-tree-growth-and-LightGBM-vertical.png)

In [None]:
from xgboost import XGBClassifier

# Baseline XGBoost with default settings: fit, then record the hold-out accuracy
# and the mean accuracy over 10 cross-validation folds of the training data.
xgb = XGBClassifier().fit(X_train, y_train)

xgb_acc = accuracy_score(y_test, xgb.predict(X_test))
xgb_cv = np.mean(
    cross_val_score(xgb, X_train, y_train, cv=10, scoring='accuracy')
)

print('XGB accuracy: {:.5f}'.format(xgb_acc))
print('XGB Cross Val Score : {:.5f}'.format(xgb_cv))

## Conclusion

In [None]:
# Recap of every baseline model: hold-out accuracy followed by mean 10-fold CV
# accuracy, one model per ruled-off section. Emitted as one print call whose
# arguments are joined by newlines.
print(
    '---------------------------------------------------------',
    'Random Forest Classifier accuracy: {:.5f}'.format(rfc_acc),
    'Random Forest Classifier Cross Val Score : {:.5f}'.format(rfc_cv),
    '---------------------------------------------------------',
    'BaggingClassifier accuracy: {:.5f}'.format(bgc_acc),
    'BaggingClassifier Cross Val Score : {:.5f}'.format(bgc_cv),
    '---------------------------------------------------------',
    'LightGBM accuracy: {:.5f}'.format(lgb_acc),
    'LGBM Cross Val Score : {:.5f}'.format(lgb_cv),
    '---------------------------------------------------------',
    'ExtraTreeClassifier accuracy: {:.5f}'.format(etc_acc),
    'ExtraTreeClassifier Cross Val Score : {:.5f}'.format(etc_cv),
    '---------------------------------------------------------',
    'XGB accuracy: {:.5f}'.format(xgb_acc),
    'XGB Cross Val Score : {:.5f}'.format(xgb_cv),
    sep='\n',
)

# Hyperparameter Tuning

=> Models with about 90% accuracy will go through hyperparameter tuning.

## Randomized Search CV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

## Random Forest Classifier

In [None]:
# Candidate values sampled by the randomized search for the forest-style models.
n_estimators = list(range(100, 1201, 100))  # 100, 200, ..., 1200 trees
# 'auto' was removed in scikit-learn 1.3 and, for classifiers, meant the same
# as 'sqrt'; offer the two distinct, still-supported string options instead.
max_features = ['sqrt', 'log2']
max_depth = list(range(5, 31, 5))  # 5, 10, ..., 30
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Search space for the randomized search, keyed by estimator parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
# Seed the forest as well as the search: the search's random_state only fixes
# which parameter combinations are drawn, not the randomness inside each forest,
# so an unseeded estimator makes best_score_ vary between runs.
rf = RandomForestClassifier(random_state=42)
# 10 sampled candidates, each scored by 10-fold cross-validated accuracy.
rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring='accuracy',
    n_iter=10,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the randomized search for the random forest on the training split.
rf.fit(X_train,y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
rf.best_params_

In [None]:
# Mean cross-validated accuracy of the best random forest candidate
# (a CV score on the training split, not a test-set accuracy).
rfbs = rf.best_score_
print("Random Forest Classifier Best Score : ", rfbs)

## LGBM Classifier

In [None]:
# LightGBM-specific search space. The forest grid is not reused here because
# `max_features` and `min_samples_split` are scikit-learn forest parameters that
# LightGBM does not define, so sampling them would not tune the booster.
lgb_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'num_leaves': [15, 31, 63, 127],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'min_child_samples': [5, 10, 20, 50],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Seeded so that the search result is reproducible between runs.
lgb = LGBMClassifier(random_state=42)
# Wrap the model in the randomized search: 10 sampled candidates, each scored
# by 10-fold cross-validated accuracy.
lgb = RandomizedSearchCV(
    estimator=lgb,
    param_distributions=lgb_grid,
    scoring='accuracy',
    n_iter=10,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the randomized search for LightGBM on the training split.
lgb.fit(X_train,y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
lgb.best_params_

In [None]:
# Mean cross-validated accuracy of the best LightGBM candidate
# (a CV score on the training split, not a test-set accuracy).
lgbbs = lgb.best_score_
print("LightGBM Classifier Best Score : ", lgbbs)

## Extra Tree Classifier

In [None]:
# Reuse the random forest search space for the extra-trees model.
# Seed the estimator as well as the search: the search's random_state only
# fixes which parameter combinations are drawn, not the randomness inside each
# ensemble, so an unseeded estimator makes best_score_ vary between runs.
etc = ExtraTreesClassifier(random_state=42)
# 10 sampled candidates, each scored by 10-fold cross-validated accuracy.
etc = RandomizedSearchCV(
    estimator=etc,
    param_distributions=random_grid,
    scoring='accuracy',
    n_iter=10,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the randomized search for the extra-trees model on the training split.
etc.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
etc.best_params_

In [None]:
# Mean cross-validated accuracy of the best extra-trees candidate
# (a CV score on the training split, not a test-set accuracy).
etcbs = etc.best_score_
print("Extra Tree Classifier Best Score : ", etcbs)

## XGB Classifier

In [None]:
from scipy.stats import uniform, randint
# Fresh XGBoost classifier with default settings; rebinds the name `xgb`.
xgb=XGBClassifier()

In [None]:
# Sampling distributions for the XGBoost randomized search.
# scipy's uniform(loc, scale) draws from [loc, loc + scale]; randint(low, high)
# draws integers from low up to, but not including, high.
params = dict(
    gamma=uniform(0, 0.5),             # 0.0 .. 0.5
    learning_rate=uniform(0.03, 0.3),  # 0.03 .. 0.33
    max_depth=randint(2, 6),           # 2 .. 5
    n_estimators=randint(100, 150),    # 100 .. 149
    subsample=uniform(0.6, 0.4),       # 0.6 .. 1.0
)

In [None]:
# Randomized search over the XGBoost distributions: 10 sampled candidates,
# each scored by 10-fold cross-validated accuracy.
xgb = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=params,
    scoring='accuracy',
    n_iter=10,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the randomized search for XGBoost on the training split.
xgb.fit(X_train,y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
xgb.best_params_

In [None]:
# Mean cross-validated accuracy of the best XGBoost candidate
# (a CV score on the training split, not a test-set accuracy).
xgbbs = xgb.best_score_
print("XGBoost Best Score : ", xgbbs)

# Conclusion

In [None]:
# Final recap for the four tuned models. For each model three numbers are printed:
#   - "accuracy": hold-out accuracy of the untuned baseline on the test split,
#   - "Cross Val Score": mean 10-fold CV accuracy of the untuned baseline,
#   - "After Hyperparameter Tuning Best Score": best_score_ of the randomized search,
#     i.e. a mean CV accuracy on the training split.
# NOTE(review): the tuned figure is a cross-validation score, so it is comparable
# to the "Cross Val Score" line rather than to the test-set "accuracy" line.
print('---------------------------------------------------------')
print('---------------------------------------------------------')
print('---------------------------------------------------------')
print('Random Forest Classifier accuracy                               : {:.5f}'.format(rfc_acc))
print('Random Forest Classifier Cross Val Score                        : {:.5f}'.format(rfc_cv))
print("Random Forest Classifier After Hyperparameter Tuning Best Score : {:.5f}".format(rfbs))
print('---------------------------------------------------------')
print('---------------------------------------------------------')
print('---------------------------------------------------------')
print('LightGBM Classifier accuracy                                    : {:.5f}'.format(lgb_acc))
print('LightGBM Classifier Cross Val Score                             : {:.5f}'.format(lgb_cv))
print("LightGBM Classifier After Hyperparameter Tuning Best Score      : {:.5f}".format(lgbbs))
print('---------------------------------------------------------')
print('---------------------------------------------------------')
print('---------------------------------------------------------')
print('Extra Tree Classifier accuracy                                  : {:.5f}'.format(etc_acc))
print('Extra Tree Classifier Cross Val Score                           : {:.5f}'.format(etc_cv))
print("Extra Tree Classifier After Hyperparameter Tuning Best Score    : {:.5f}".format(etcbs))
print('---------------------------------------------------------')
print('---------------------------------------------------------')
print('---------------------------------------------------------')
print('XGB Classifier accuracy                                         : {:.5f}'.format(xgb_acc))
print('XGB Classifier Cross Val Score                                  : {:.5f}'.format(xgb_cv))
print("XGB Classifier After Hyperparameter Tuning Best Score           : {:.5f}".format(xgbbs))

In [None]:
# Gather the metrics of the four tuned models column by column, then turn them
# into one comparison table (one row per model).
results = {}
results['Model'] = ['Random Forest', 'LightGBM', 'Extra Tree', 'XGB']
results['Accuracy'] = [rfc_acc, lgb_acc, etc_acc, xgb_acc]
results['Cross Val Score'] = [rfc_cv, lgb_cv, etc_cv, xgb_cv]
results['After Tuning Score'] = [rfbs, lgbbs, etcbs, xgbbs]

result_model = pd.DataFrame(data=results)

In [None]:
# Display the comparison table.
result_model