In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import warnings

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from catboost import CatBoostClassifier
from imblearn.combine import SMOTETomek
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBClassifier
# Silence every warning raised by the libraries used in this notebook.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden too.
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#### Import Data

In [None]:
# Load the red wine quality CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

#### EDA

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
plt.figure(figsize=(10,6))
# Long-form call (data= plus column names). Passing the Series positionally makes
# newer seaborn releases treat it as wide-form data, which rejects a separate `hue`.
sns.kdeplot(data=df, x='fixed acidity', hue='quality')
# Fixed acidity appears approximately normally distributed.

In [None]:
plt.figure(figsize=(10,6))
# `sns.distplot` is deprecated; a density-normalised histogram with a KDE overlay
# is its documented replacement and draws the same kind of figure.
sns.histplot(data=df, x='volatile acidity', kde=True, stat='density', kde_kws=dict(cut=3))
# Volatile acidity appears approximately normally distributed.

In [None]:
# Frequency of each distinct citric acid value.
df['citric acid'].value_counts()

In [None]:
plt.figure(figsize=(10,6))
# `sns.distplot` is deprecated; a density-normalised histogram with a KDE overlay
# is its documented replacement and draws the same kind of figure.
sns.histplot(data=df, x='citric acid', kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Frequency of each distinct residual sugar value.
df['residual sugar'].value_counts()

In [None]:
# Mean of every feature within each quality score.
df.groupby('quality').mean()

**Observations:**
* Fixed acidity falls in the same range for all quality levels.
* The lower the volatile acidity, the higher the quality of the wine.
* The higher the citric acid level, the higher the quality of the wine.
* There is no significant difference in residual sugar between the quality levels.
* The lower the chloride level, the higher the quality of the wine.
* There is a relationship between free sulfur dioxide and total sulfur dioxide.
* The lower the pH value, the higher the quality of the wine.
* The higher the sulphates level, the higher the quality of the wine.
* Alcohol values fall in the same range for all quality levels.

In [None]:
# Summary statistics for each column.
df.describe()

In [None]:
# Pairwise correlation between all columns, including the `quality` target.
corr = df.corr()
corr

In [None]:
# Annotated heatmap of the correlation matrix held in `corr`.
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(corr, annot=True, linewidths=3, ax=ax)

**Observation:**
* There is collinearity among several of the independent variables.

### Scaling

In [None]:
sc = StandardScaler()

# Standardise every feature (all columns except the `quality` target).
# The column labels and index are taken from the feature frame itself rather than
# from a positional slice of `df.columns`, so the labels stay correct even if
# `quality` is not the last column of the CSV.
features = df.drop('quality', axis=1)
df_scaled = pd.DataFrame(sc.fit_transform(features), columns=features.columns, index=features.index)

df_scaled.head()

### Train Test Split

In [None]:
# Split the raw features first, then fit the scaler on the training rows only, so
# that no test-set statistics (mean / variance) leak into the training features.
# The rows assigned to each split are the same as when splitting pre-scaled data,
# because the split depends only on the number of rows and `random_state`.
features = df.drop('quality', axis=1)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, df['quality'], test_size=0.3, random_state=100
)

split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(split_scaler.transform(X_train_raw), columns=features.columns, index=X_train_raw.index)
X_test = pd.DataFrame(split_scaler.transform(X_test_raw), columns=features.columns, index=X_test_raw.index)

In [None]:
# Number of training rows per quality score.
y_train.value_counts()

### Modelling

In [None]:
def metrics(y_true, y_pred):
    """Print the confusion matrix, accuracy score and classification report.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels, in the same order as ``y_true``.
    """
    # Each entry pairs the printed heading with the scorer that produces its body.
    sections = (
        ('Confusion Matrix:\n', confusion_matrix),
        ('\n\nAccuracy Score:\n', accuracy_score),
        ('\n\nClassification Report: \n', classification_report),
    )
    for heading, scorer in sections:
        print(heading, scorer(y_true, y_pred))


def predictions(model, X_train=None, X_test=None, y_train=None, y_test=None):
    """Fit ``model`` on the training split and print train and test metrics.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test, y_train, y_test : optional
        Data splits to use. Any argument left as ``None`` falls back to the
        notebook-level variable of the same name, looked up when the function
        is *called*. Binding the splits as default values instead would freeze
        whatever they held when this cell last ran, so re-running only the
        split cell would silently keep evaluating on stale data.

    Returns
    -------
    None. The model is fitted in place and the metrics are printed.
    """
    notebook_vars = globals()
    if X_train is None:
        X_train = notebook_vars['X_train']
    if X_test is None:
        X_test = notebook_vars['X_test']
    if y_train is None:
        y_train = notebook_vars['y_train']
    if y_test is None:
        y_test = notebook_vars['y_test']

    model.fit(X_train, y_train)

    # Predict on both splits before printing anything.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    reports = (
        ('----Train Metrics----', y_train, train_pred),
        ('----Test Metrics----', y_test, test_pred),
    )
    for banner, actual, predicted in reports:
        print(banner)
        metrics(actual, predicted)

### Logistic Regression

In [None]:
# `multi_class='ovr'` is deprecated in recent scikit-learn releases; wrapping the
# binary classifier in OneVsRestClassifier is the supported one-vs-rest form.
lg = OneVsRestClassifier(LogisticRegression())



In [None]:
# Fit the logistic regression model and print its train and test metrics.
predictions(lg)

### KNN

In [None]:
# k-nearest-neighbours classifier with the library's default settings.
knn = KNeighborsClassifier()

In [None]:
# Fit the KNN model and print its train and test metrics.
predictions(knn)

#### Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings: fit it and report train/test metrics.
nb = GaussianNB()
predictions(model=nb)

#### Decision Tree

In [None]:
# Seed the estimator (same seed as the train/test split) so the reported metrics
# are reproducible from one run of the notebook to the next.
dtree = DecisionTreeClassifier(random_state=100)

predictions(dtree)

#### Bagging

In [None]:
# Seed the bootstrap sampling (same seed as the train/test split) so the reported
# metrics are reproducible from one run of the notebook to the next.
bag = BaggingClassifier(random_state=100)

In [None]:
# Fit the bagging ensemble and print its train and test metrics.
predictions(bag)

#### Random Forest

In [None]:
# Seed the forest (same seed as the train/test split) so the reported metrics are
# reproducible from one run of the notebook to the next.
rf = RandomForestClassifier(random_state=100)

predictions(rf)

#### Gradient Boosting

In [None]:
# Seed the booster (same seed as the train/test split) so the reported metrics are
# reproducible from one run of the notebook to the next.
gb = GradientBoostingClassifier(random_state=100)

predictions(gb)

#### XG Boost

In [None]:
# XGBoost's scikit-learn wrapper only accepts class labels 0..n_classes-1, while
# the quality scores start at 3, so map them to consecutive integers before fitting.
quality_encoder = LabelEncoder().fit(y_train)

xgb = XGBClassifier()

# NOTE: the metrics printed for this model use the encoded labels
# (0 = lowest quality score present in the training split).
predictions(
    xgb,
    y_train=quality_encoder.transform(y_train),
    y_test=quality_encoder.transform(y_test),
)

#### Light GBM

In [None]:
# LightGBM classifier with default settings: fit it and report train/test metrics.
lgbm = LGBMClassifier()
predictions(model=lgbm)

#### CAT Boost

In [None]:
# By default CatBoost prints one log line per boosting iteration, which buries the
# metrics under a wall of output; `verbose=0` keeps the cell output readable.
cat = CatBoostClassifier(verbose=0)

predictions(cat)