## 1. Load Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler , StandardScaler
import seaborn as sns
sns.set(color_codes = True)
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 2. Load Data

In [None]:
# Read the red-wine quality CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
df.head() # Show the first 5 rows as a quick sanity check of the load

## 3. Visualize Data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# One histogram per numeric column; the trailing semicolon hides the text repr of the returned axes.
df.hist(figsize = (10, 10));

In [None]:
# Compute the pairwise correlation matrix once and reuse it for the heatmap
# (it is also read by the following cell), instead of calling df.corr() twice.
plt.figure(figsize=(12,10))
cor = df.corr()
sns.heatmap(cor, annot = True);  # trailing semicolon hides the Axes repr

In [None]:
# Correlation of every column with 'quality', largest value first
cor['quality'].sort_values(ascending=False)

We can conclude that alcohol has the highest correlation with quality.

In [None]:
# Bar plot of alcohol (y axis) for each quality score (x axis)
sns.barplot(x = 'quality', y = 'alcohol', data = df)

## 4. Preprocessing of Data

In [None]:
# Summary statistics of the target column before it is bucketed into levels
print(df['quality'].describe())

In [None]:
def create_level(x):
    """Map a numeric wine quality score to a coarse level label.

    Scores up to and including 5 are "low", scores above 5 and below 7 are
    "medium", and every other value is "high".
    """
    if x <= 5:
        return "low"
    if x < 7:
        return "medium"
    return "high"

In [None]:
# Derive the categorical 'level' column from the numeric quality score
df['level'] = df['quality'].apply(create_level)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string levels as integers; LB is kept so that predictions can be
# mapped back to their string labels later on.
LB = LabelEncoder().fit(df['level'])
LB_encoded = LB.transform(df['level'])
print(LB.classes_)
print(LB_encoded)

## 5. Split Data in Training & Testing samples

In [None]:
# Hold out 20% of the rows for testing. The first 11 columns are the features and
# the encoded level is the target; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :11],
    LB_encoded,
    test_size=0.20,
    random_state=21,
)

# Report the shape of each of the four resulting pieces.
for message, part in (
    ('Shape of Training Xs:{}', x_train),
    ('Shape of Test Xs:{}', x_test),
    ('Shape of Training y:{}', y_train),
    ('Shape of Test y:{}', y_test),
):
    print(message.format(part.shape))

## 6. Check k-fold cross validation before building model

In [None]:
from sklearn.model_selection import KFold
# 5-fold splitter with otherwise default settings.
# NOTE(review): `kf` is not referenced by any later cell visible here; the model
# comparison below scores each model on the single train/test split instead —
# confirm whether cross-validation was meant to be run.
kf = KFold(n_splits=5)
kf

In [None]:
def get_score(model, x_train, x_test, y_train, y_test):
    """Fit `model` on the training data and return its score on the test data.

    The model is fitted in place via `model.fit(x_train, y_train)`; the returned
    value is whatever `model.score(x_test, y_test)` yields.
    """
    model.fit(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return test_score

In [None]:
# Logistic Regression (max_iter raised well above the default iteration cap)
from sklearn.linear_model import LogisticRegression
get_score(
    model=LogisticRegression(max_iter=10000),
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
# Fixed seed (same value as the train/test split) so the reported score is
# reproducible across runs instead of changing on every execution.
get_score(DecisionTreeClassifier(random_state=21),x_train, x_test, y_train, y_test)

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# Fixed seed (same value as the train/test split) so the reported score is
# reproducible across runs instead of changing on every execution.
get_score(RandomForestClassifier(random_state=21),x_train, x_test, y_train, y_test)

In [None]:
# Support Vector Machine
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
# SVC is not scale invariant, so standardize the features first. Putting the
# scaler in a pipeline means it is fitted on the training split only, when
# get_score calls fit, so no test-set information leaks into the scaling.
get_score(make_pipeline(StandardScaler(), SVC()),x_train, x_test, y_train, y_test)

Random Forest achieved the highest score among the models compared, so we proceed with the Random Forest Classifier.

## 7. Build Random Forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it raises an invalid-parameter error.
# random_state makes the fitted forest (and therefore the score) reproducible.
clf = RandomForestClassifier(criterion="entropy", max_depth=50, max_features='sqrt',
                             n_estimators=500, random_state=21)
clf.fit(x_train, y_train)
y_predicted = clf.predict(x_test)   # encoded class predicted for every test row
score = clf.score(x_test, y_test)   # score of the fitted forest on the held-out split

In [None]:
# Show the test score followed by the encoded predictions, one per line
print(score, y_predicted, sep='\n')

## 8. Predict outcome using the Test Data

In [None]:
# Map the encoded predictions back to their string level names and preview the first ten
y_predicted_labels = LB.inverse_transform(y_predicted)
y_predicted_labels[:10]

In [None]:
# Map the encoded test targets back to their string level names and preview the first ten
true_labels = LB.inverse_transform(y_test)
true_labels[:10]

## 9. Confusion Matrix

In [None]:
# Compute confusion matrix
from sklearn.metrics import confusion_matrix
cnf_matrix = confusion_matrix(true_labels, y_predicted_labels)
np.set_printoptions(precision=2)
cnf_matrix

In [None]:
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; rows are drawn as true labels, columns as predicted labels.
    classes : sequence
        Tick labels for both axes, one per row/column of `cm`.
    normalize : bool, default False
        When True, each row is divided by its row sum before printing and plotting.
        A row whose sum is zero is shown as all zeros rather than NaN.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the cell shading.

    Returns
    -------
    None. The matrix is printed and drawn; no figure is created or shown here.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Guarded divide: rows with no samples would otherwise produce 0/0 = NaN
        # (and a RuntimeWarning), which also breaks the text-colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Two decimals for normalized values, plain integers for raw counts.
    fmt = '.2f' if normalize else 'd'
    # Cells above half the maximum get white text; the rest get black text.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Draw the raw-count matrix first, then the row-normalized one, each on its own figure.
for use_normalized, heading in ((False, 'Confusion matrix, without normalization'),
                                (True, 'Normalized confusion matrix')):
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=LB.classes_, normalize=use_normalized,
                          title=heading)

plt.show()