# 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import shapiro
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# 2. Load Data

In [None]:
# Load the red-wine quality CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
# Show the first rows as the cell output.
df.head()

# 3. Exploratory Data Analysis

# Shape of Data

In [None]:
# df.shape is a (rows, columns) tuple.
shape = df.shape
print('Data Set contains', shape[0], 'rows and', shape[1], 'columns')

# Getting description of the data

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Print the column names, dtypes and non-null counts.
df.info()

# Check Distribution of data

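Using the Shapiro-Wilk test (null hypothesis: the data are normally distributed, so p > 0.05 means normality is not rejected)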

In [None]:
def col_dist(frame, alpha=0.05):
    """Report, per numeric column, whether a Shapiro-Wilk test rejects normality.

    The null hypothesis of the Shapiro-Wilk test is that the sample was drawn
    from a normal distribution. A column is therefore reported as normally
    distributed only when its p-value is greater than ``alpha`` (the test
    fails to reject normality), for integer and float columns alike.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to test. Only ``int64`` and ``float64`` columns are tested;
        columns of any other dtype are skipped.
    alpha : float, default 0.05
        Significance level used for the decision.

    Returns
    -------
    None
        One line is printed per tested column, in column order.
    """
    for name in frame.select_dtypes(include=['int64', 'float64']).columns:
        _, p_value = shapiro(frame[name])
        if p_value > alpha:
            print(name, 'is normally distributed')
        else:
            print(name, 'is not normally distributed')
                
# Run the normality report on the loaded data set.
col_dist(df)

# Univariate Analysis

In [None]:
def univariant(frame):
    """Plot the distribution of each column of ``frame``, one figure per column.

    ``int64`` and ``float64`` columns are drawn as a density histogram with a
    KDE overlay; object-dtype columns are drawn as a count plot. Columns of
    any other dtype are skipped.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data whose columns are plotted.
    """
    for col in frame.columns:
        dtype = frame[col].dtype
        if dtype in ('int64', 'float64'):
            # histplot replaces the deprecated sns.distplot; stat='density'
            # keeps the y-axis on the density scale that distplot used.
            sns.histplot(x=frame[col], kde=True, stat='density')
        elif dtype == 'O':
            # The variable is passed by keyword because recent seaborn
            # releases no longer accept it positionally.
            sns.countplot(x=frame[col])
        else:
            continue
        plt.show()

univariant(df)

# Observation

1. None of the features is normally distributed except density and pH.
2. Our target feature (quality) is imbalanced, so we need to balance it to get good results.

# Bivariate Analysis

In [None]:
def bivariant(frame, target='quality'):
    """Box-plot every numeric column of ``frame`` against the target column.

    One figure is shown per ``int64``/``float64`` column, with the target on
    the x-axis and the column on the y-axis. The target is read from
    ``frame`` itself rather than from a global, so the function works on any
    frame that contains the ``target`` column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to plot; must contain the ``target`` column.
    target : str, default 'quality'
        Name of the column used for the x-axis grouping.
    """
    for col in frame.columns:
        if frame[col].dtype in ('int64', 'float64'):
            # x/y are passed by keyword because recent seaborn releases
            # reject positional data vectors.
            sns.boxplot(x=frame[target], y=frame[col])
            plt.show()

bivariant(df)

# Multivariate Analysis

In [None]:
# Grid of pairwise plots over the columns of df.
sns.pairplot(df)

# Detecting Outliers

In [None]:
def det_outlier(frame):
    """Show a box plot of each numeric column to expose outliers visually.

    One horizontal box plot is shown per ``int64``/``float64`` column of
    ``frame``; columns of any other dtype are skipped.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data whose numeric columns are plotted.
    """
    for col in frame.columns:
        if frame[col].dtype in ('int64', 'float64'):
            # The data vector is passed as x= by keyword because recent
            # seaborn releases no longer accept it positionally.
            sns.boxplot(x=frame[col])
            plt.show()

det_outlier(df)

# Checking null values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

# 4. Splitting Data into Train and Test

In [None]:
# Features: every column except the target.
x = df.drop('quality', axis=1)
# Target: the 'quality' column.
y = df['quality']

In [None]:
# Hold out 30% of the rows for testing; random_state=1 fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)

# Balance target feature using SMOTE
I will apply SMOTE to the best model

In [None]:
# Oversample the minority classes of the training split only, so the test
# split keeps the original class distribution. random_state is fixed (same
# seed as the train/test split) so the resampled set, and every score derived
# from it, is reproducible across runs.
smote = SMOTE(random_state=1)
X_Train, Y_Train = smote.fit_resample(X_train, Y_train)

# 5. Let's Build our Models

# 1. Logistic Regression

In [None]:
# Baseline linear classifier. max_iter is raised to 15000, presumably so the
# solver can converge on the unscaled features (verify).
lr = LogisticRegression(max_iter=15000)
lr.fit(X_train, Y_train)

In [None]:
# Score the logistic regression on the held-out test split.
lr_pred = lr.predict(X_test)
print(classification_report(Y_test, lr_pred))

# 2. XGBoost Classifier

In [None]:
# XGBoost with default hyperparameters on the original (unbalanced) training split.
# NOTE(review): Y_train holds the raw quality scores; recent xgboost releases
# expect class labels encoded as 0..n_classes-1. Confirm the installed version
# accepts them.
xgb = XGBClassifier()
xgb.fit(X_train, Y_train)

In [None]:
# Score the XGBoost model on the held-out test split.
xgb_pred = xgb.predict(X_test)
print(classification_report(Y_test, xgb_pred))

# Apply Smote

In [None]:
# Same default XGBoost model, trained on the SMOTE-resampled split (X_Train / Y_Train).
sxgb = XGBClassifier()
sxgb.fit(X_Train, Y_Train)

In [None]:
# Evaluated on the untouched test split, so it is comparable with the non-SMOTE run.
sxgb_pred = sxgb.predict(X_test)
print(classification_report(Y_test, sxgb_pred))

After applying SMOTE, my model's accuracy decreased.

# 3. Ada Boost Classifier

In [None]:
# AdaBoost with default hyperparameters on the original training split.
ada = AdaBoostClassifier()
ada.fit(X_train, Y_train)

In [None]:
# Score the AdaBoost model on the held-out test split.
ada_pred = ada.predict(X_test)
print(classification_report(Y_test, ada_pred))

# 4. Random Forest Classifier

In [None]:
# Random forest with default hyperparameters. This fitted `rf` is reused later
# by the hyperparameter search and by the feature-selection loop.
rf = RandomForestClassifier()
rf.fit(X_train, Y_train)

In [None]:
# Score the default random forest on the held-out test split.
rf_pred = rf.predict(X_test)
print(classification_report(Y_test, rf_pred))

# After HyperParameter Tuning 

In [None]:
# Random forest with hand-entered hyperparameters, presumably copied from the
# best_estimator_ of the randomized search run later in the notebook (verify).
arf = RandomForestClassifier(max_depth=4, max_features='log2', n_estimators=650)
arf.fit(X_train, Y_train)

In [None]:
# Score the tuned random forest on the held-out test split.
arf_pred = arf.predict(X_test)
print(classification_report(Y_test, arf_pred))

# 5. Gradient Boosting Classifier

In [None]:
# Gradient boosting with default hyperparameters on the original training split.
gb = GradientBoostingClassifier()
gb.fit(X_train, Y_train)

In [None]:
# Score the gradient boosting model on the held-out test split.
gb_pred = gb.predict(X_test)
print(classification_report(Y_test, gb_pred))

# Stacking Classifier

In [None]:
# Stack three default tree ensembles. No final_estimator is passed, so
# StackingClassifier uses its default meta-learner.
stack = StackingClassifier([
    ('xgb', XGBClassifier()),
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier())
])
stack.fit(X_train, Y_train)

In [None]:
# Score the stacked ensemble on the held-out test split.
stack_pred = stack.predict(X_test)
print(classification_report(Y_test, stack_pred))

# 6. HyperParameter Tuning

In [None]:
# Search space for the random-forest randomized search.
rf_grid = {
    # 300, 350, ..., 950 trees.
    'n_estimators': list(range(300, 1000, 50)),
    'criterion': ['gini', 'entropy'],
    # Shallow trees only.
    'max_depth': [2, 3, 4],
    # 'auto' is intentionally omitted: for classifiers it was an alias of
    # 'sqrt' (so it only duplicated that candidate), and current scikit-learn
    # releases reject it as an invalid value.
    'max_features': ['sqrt', 'log2'],
}

In [None]:
# Randomized search over rf_grid: 50 sampled configurations, each scored by
# 5-fold cross-validated accuracy on the training split, using 4 parallel jobs.
# random_state=10 fixes which configurations are sampled.
rf_cv = RandomizedSearchCV(estimator=rf,
                          param_distributions=rf_grid,
                          verbose=7,
                          n_iter=50,
                          n_jobs=4,
                          random_state=10,
                          scoring='accuracy',
                          cv=5)

rf_cv.fit(X_train, Y_train)

In [None]:
# Display the best configuration found by the search.
rf_cv.best_estimator_

# 7. Feature Selection

In [None]:
# Use each feature importance of the already-fitted `rf` as a threshold, in
# ascending order, so each iteration keeps the features whose importance is at
# least that threshold.
th = np.sort(rf.feature_importances_)
for g in th:
    # prefit=True: reuse the fitted `rf` instead of refitting it inside the selector.
    select = SelectFromModel(rf, threshold = g, prefit = True)
    x_train = select.transform(X_train)
    # Train a fresh default random forest on the reduced feature set.
    model = RandomForestClassifier()
    model.fit(x_train, Y_train)
    # Apply the same column selection to the test split before scoring.
    x_test = select.transform(X_test)
    y_pred = model.predict(x_test)
    score = accuracy_score(Y_test, y_pred)
    print('Threshold:', g, 'Model Score:', score)

# 8. Feature Engineering

In [None]:
# Build the engineered frame in one chain: append 'acidity' as the sum of the
# two acidity measurements, then remove the two source columns. `df` itself is
# left untouched.
new_df = (
    df
    .assign(acidity=df['fixed acidity'] + df['volatile acidity'])
    .drop(columns=['fixed acidity', 'volatile acidity'])
)

In [None]:
# Features and target of the engineered frame.
x1 = new_df.drop('quality', axis=1)
y1 = new_df['quality']
# Same test_size and random_state as the split of the original features.
X1_train, X1_test, Y1_train, Y1_test = train_test_split(x1, y1, test_size = 0.3, random_state = 1)

In [None]:
frf = RandomForestClassifier()
frf.fit(X_train, Y_train)

In [None]:
frf_pred = frf.predict(X_test)
print(classification_report(Y_test, frf_pred))

In [None]:
# Annotated correlation heatmap of the original columns, on a wide 25x12 figure.
plt.figure(figsize=(25,12))
sns.heatmap(df.corr(), annot=True)
plt.show()

After hyperparameter tuning, feature selection, and feature engineering, my model's accuracy did not increase.

My best model is Random Forest Classifier