In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the red wine quality CSV from the Kaggle input directory into a DataFrame.
dataset = pd.read_csv("/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

## Data Analysis

In [None]:
# Summarize the columns: names, dtypes, and non-null counts.
dataset.info()

From above, we can observe that

1. There are 12 cols and 1599 data rows.
2. The data doesn't contain any null values.
3. quality col is our class label.

In [None]:
# Preview the first few rows to get a feel for the feature values.
dataset.head()

In [None]:
# List the distinct values present in the target column.
dataset['quality'].unique()

There are 6 different qualities of wine. 

## Data Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Show the number of samples per quality score, first as a table and then as a bar chart.
quality_counts = dataset['quality'].value_counts()
print(quality_counts)

ax = sns.countplot(data=dataset, x='quality')
ax.set_title('Wine Quality Count')
plt.show()

### The data is heavily imbalanced across the wine quality classes: there are far fewer samples for qualities 3, 4, 7 and 8 than for qualities 5 and 6.

In [None]:
# Per-sample alcohol content, grouped by quality score.
sns.catplot(
    data=dataset,
    x="quality",
    y="alcohol",
)
plt.show()

In [None]:
# Per-sample fixed acidity, grouped by quality score.
sns.catplot(
    data=dataset,
    x="quality",
    y="fixed acidity",
)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns (target included).
fig, ax = plt.subplots(figsize=(20, 10))
correlations = dataset.corr()
sns.heatmap(correlations, annot=True, ax=ax)
plt.show()

From this heatmap we can say that most of the wine features are not strongly correlated with each other.

## Data Modeling

In [None]:
# Separate the target column from the feature matrix.
Y = dataset['quality']
X = dataset.drop(columns=['quality'])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from pandas import DataFrame
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgbm
import catboost as cb

In [None]:
# Single seed shared by the train/test split and every seeded estimator below,
# so that repeated runs produce the same results.
SEED = 42

### Since the Red Wine dataset is highly imbalanced, we will oversample the data so that the classes are balanced

## Over-Sampling the Imbalanced Data

In [None]:
## oversampling
from imblearn.over_sampling import SMOTE

# The sampler is named `smote` (not `os`) so it does not shadow the standard-library
# `os` module imported at the top of the notebook.
# It is seeded with SEED so the synthetic samples are the same on every run.
smote = SMOTE(random_state=SEED)

# `fit_resample` is the current imbalanced-learn API; the older `fit_sample` alias
# has been removed from the library.
# NOTE(review): resampling happens before the train/test split, so the test set will
# contain synthetic points and reported accuracies are likely optimistic. Consider
# splitting first and oversampling only the training portion.
X_res, y_res = smote.fit_resample(X, Y)

In [None]:
# Count the samples per quality class after oversampling.
y_res.value_counts()

### From above we can observe that the wine quality data is now properly balanced.

In [None]:
# Split the oversampled dataset into 80% train, 20% test (test_size=0.2), seeded for repeatability.
# NOTE(review): the data being split was oversampled beforehand, so the test set is not
# made up of original samples only; treat the resulting scores with caution.
X_train, X_test, y_train, y_test= train_test_split(X_res, y_res, test_size=0.2, random_state=SEED)

## Feature Scaling

### Normalize the data

In [None]:
# Learn the min-max scaling parameters from the training data only,
# scaling the training set in the same step.
norm = MinMaxScaler()
X_train_norm = norm.fit_transform(X_train)

# Apply the training-set parameters to the test data.
X_test_norm = norm.transform(X_test)

### Standardize the data

In [None]:
# Learn the standardization parameters from the training data only,
# standardizing the training set in the same step.
stdscale = StandardScaler()
X_train_std = stdscale.fit_transform(X_train)

# Apply the training-set parameters to the test data.
X_test_std = stdscale.transform(X_test)

## Model Selection

In [None]:
# Candidate models. Every estimator that accepts a seed gets SEED so runs are repeatable.

# Linear, distance-based and kernel models
lr = LogisticRegression(max_iter=500, n_jobs=-1, random_state=SEED)
knn = KNN()
svc = SVC(random_state=SEED)

# Tree-based models
rf = RandomForestClassifier(random_state=SEED)
dt = DecisionTreeClassifier(random_state=SEED)

# Gradient-boosting models
xgbc = xgb.XGBClassifier(random_state=SEED)
lgbmc = lgbm.LGBMClassifier(random_state=SEED)
cbc = cb.CatBoostClassifier(random_state=SEED, verbose=False)
gbc = GradientBoostingClassifier(random_state=SEED)

# (display name, estimator) pairs, in the order they are evaluated below
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('SVM', svc),
    ('Random Forest Classifier', rf),
    ('Decision Tree', dt),
    ('XGBClassifier', xgbc),
    ('LGBMClassifier', lgbmc),
    ('CatBoostClassifier', cbc),
    ('GradientBoostingClassifier', gbc),
]

## Models prediction without any normalization or standardization

In [None]:
# Baseline: train each candidate on the raw (unscaled) features and report its test accuracy.
for name, model in classifiers:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f'{name:s} : {acc:.3f}')

## Models prediction with Normalized data

In [None]:
# Train each candidate on the min-max normalized features and report its test accuracy.
for name, model in classifiers:
    model.fit(X_train_norm, y_train)
    y_pred = model.predict(X_test_norm)
    acc = accuracy_score(y_test, y_pred)
    print(f'{name:s} : {acc:.3f}')

## Models prediction with Standardized data

In [None]:
# Train each candidate on the standardized features and report its test accuracy.
for name, model in classifiers:
    model.fit(X_train_std, y_train)
    y_pred = model.predict(X_test_std)
    acc = accuracy_score(y_test, y_pred)
    print(f'{name:s} : {acc:.3f}')

From above we can see that the CatBoost Classifier gives us the best result, regardless of whether the data is normalized or standardized.

## Combining various Models - Voting Classifier

In [None]:
# Soft-voting ensemble of three of the tree-based models, trained on the raw features.
votingC = VotingClassifier(
    estimators=[
        ('Random Forest', rf),
        ('LightGBM', lgbmc),
        ('Catboost', cbc),
    ],
    voting='soft',
    n_jobs=-1,
)
votingC.fit(X_train, y_train)

# Test-set accuracy of the ensemble (shown as the cell output)
y_pred = votingC.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Soft-voting ensemble of three of the tree-based models, trained on the normalized features.
votingC = VotingClassifier(
    estimators=[
        ('Random Forest', rf),
        ('LightGBM', lgbmc),
        ('Catboost', cbc),
    ],
    voting='soft',
    n_jobs=-1,
)
votingC.fit(X_train_norm, y_train)

# Test-set accuracy of the ensemble (shown as the cell output)
y_pred = votingC.predict(X_test_norm)
accuracy_score(y_test, y_pred)

In [None]:
# Soft-voting ensemble of three of the tree-based models, trained on the standardized features.
votingC = VotingClassifier(
    estimators=[
        ('Random Forest', rf),
        ('LightGBM', lgbmc),
        ('Catboost', cbc),
    ],
    voting='soft',
    n_jobs=-1,
)
votingC.fit(X_train_std, y_train)

# Test-set accuracy of the ensemble (shown as the cell output)
y_pred = votingC.predict(X_test_std)
accuracy_score(y_test, y_pred)

From above we can say that Voting Classifier marginally improves the accuracy compared to the CatBoost Classifier. 