In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Libraries used in this project
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns

# Never truncate the column list when a DataFrame is rendered
pd.options.display.max_columns = None

# Report the library versions this notebook was run with
for version_label, library in (("Pandas version: ", pd),
                               ("Matplotlib version: ", matplotlib)):
    print(version_label, library.__version__)

In [None]:
# Load the wine-quality CSV into the raw (never modified) dataframe
WINE_CSV_PATH = '../input/wine-quality/winequalityN.csv'
raw_data = pd.read_csv(WINE_CSV_PATH)

In [None]:
# 1. DATA INVESTIGATION

In [None]:
# DATA INVESTIGATION: Descriptive statistics
# Show the first ten rows in tabular form. Other quick checks to try interactively:
#   raw_data.shape     - total rows and columns
#   raw_data.columns   - list of columns
#   raw_data.info()    - datatype of each column
raw_data.iloc[:10]

In [None]:
# DATA INVESTIGATION: Overall descriptive statistics of the raw data
# The frame still contains the string column `type`. Reducing over it raises a
# TypeError on pandas >= 2.0, so the statistics are computed on the numeric
# columns only.
numeric_data = raw_data.select_dtypes(include='number')

# One row per statistic, one column per numeric feature
pd.DataFrame([numeric_data.mean(),
              numeric_data.median(),
              numeric_data.std(),
              numeric_data.var()],
             index=['Mean', 'Median', 'Std. dev', 'Variance'])

In [None]:
# - The highest variances were observed in
# 'residual sugar', 'free sulfur dioxide', and 'total sulfur dioxide'

In [None]:
# DATA INVESTIGATION: Is the categorical variable `type` unbalanced?
raw_data.loc[:, "type"].value_counts()
# The result is yes.

In [None]:
# DATA INVESTIGATION: Descriptive statistics for each categorical label
stats_by_type = raw_data.groupby("type").describe()
stats_by_type

# White wine : red wine is about 3:1, so there is a concern of an unbalanced dataset.
# The max standard deviation for red wine includes 'free sulfur dioxide', 'total sulfur dioxide', 'pH', 'alcohol'.
# The max standard deviation for white wine includes 'residual sugar', 'free sulfur dioxide', 'total sulfur dioxide', 'pH', 'alcohol'.

In [None]:
# DATA INVESTIGATION: Visualise the class imbalance in this dataset
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which is deprecated in recent pandas releases.
ax = raw_data['type'].value_counts().plot.bar()
ax.set_title('Wine class histogram')
ax.set_xlabel('Wine')
ax.set_ylabel('Frequency')

In [None]:
# DATA INVESTIGATION: Check if any row is duplicated
# Total number of duplicated rows. The defaults compare all columns and
# flag every repeat after the first occurrence.
raw_data.duplicated().sum()

In [None]:
# DATA INVESTIGATION: Descriptive statistics of duplicated rows for both red and white wine
is_duplicate = raw_data.duplicated()
duplicate_rows = raw_data.loc[is_duplicate].copy()
duplicate_rows.groupby("type").describe()

In [None]:
# DATA INVESTIGATION: Check if there are any NAs in the dataset
# Number of missing values per column, largest first
raw_data.isna().sum().sort_values(ascending=False)

In [None]:
# Missing values per column as a percentage of all rows, largest first
raw_na_counts = raw_data.isna().sum()
raw_na_percent = raw_na_counts * 100 / len(raw_data)
raw_na_percent.round(2).sort_values(ascending=False)

In [None]:
# 2. DATA PREPROCESSING

# DATA PREPROCESSING: Handling duplicated rows
# - Duplicates are an extreme case of non-random sampling.
# - They bias the fitted model.
# - Including them essentially leads the model to overfit.

In [None]:
# DATA PREPROCESSING: Remove the duplicated rows
# Keep the first occurrence of each row and drop every later repeat
is_repeat = raw_data.duplicated(keep='first')
without_duplicate = raw_data.loc[~is_repeat].copy()
without_duplicate.head(10)

In [None]:
# DATA PREPROCESSING: Descriptive statistics after duplicates are removed
dedup_by_type = without_duplicate.groupby("type")
dedup_by_type.describe()

In [None]:
# DATA PREPROCESSING: check NAs in percentage after duplicates are removed
dedup_na_counts = without_duplicate.isna().sum()
dedup_na_percent = dedup_na_counts * 100 / len(without_duplicate)
dedup_na_percent.round(2).sort_values(ascending=False)

# Since the NA values are less than 1%, they can safely be removed (rule of thumb is 5%)

In [None]:
# DATA PREPROCESSING: Visualise NAs after removing duplicated rows, as a pattern showing their location
na_mask = without_duplicate.isna()
sns.heatmap(na_mask, cbar=True)

# NAs are located very sparsely.

In [None]:
# DATA PREPROCESSING: Remove all NAs
# Any row containing at least one empty cell is dropped entirely.
# The result is bound to a new name so the previous stage stays available.
without_dup_na = without_duplicate.dropna(axis=0, how="any").copy()

In [None]:
# DATA PREPROCESSING: Check if any NA is left after cleaning
without_dup_na.isna().to_numpy().any()

In [None]:
# DATA PREPROCESSING: Again visualise the categorical variable after removing duplicates and NAs
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which is deprecated in recent pandas releases.
ax = without_dup_na['type'].value_counts().plot.bar()
ax.set_title('Wine class histogram')
ax.set_xlabel('Wine')
ax.set_ylabel('Frequency')

# The dataset is still unbalanced.
# The next task is to balance the dataset.

In [None]:
# 3. DATA PREPARATION

In [None]:
# DATA PREPARATION: Prepare target variable and independent variables
# `type` is the target; every other column is a predictor.
is_target_column = without_dup_na.columns == 'type'
X = np.array(without_dup_na.loc[:, ~is_target_column])
y = np.array(without_dup_na.loc[:, is_target_column])  # stays 2-D: one column

for shape_message, array in (("Shape of X(PREDICTORS): {}", X),
                             ("Shape of y(TARGET): {}", y)):
    print(shape_message.format(array.shape))

In [None]:
# DATA PREPARATION: Create training/testing sets
from sklearn.model_selection import train_test_split

# 70/30 shuffled split with a fixed seed
split_parts = train_test_split(X, y, test_size=0.3, random_state=10, shuffle=True)
X_train, X_test, y_train, y_test = split_parts

for size_label, part in (("Number of X_train dataset: ", X_train),
                         ("Number of y_train dataset: ", y_train),
                         ("\nNumber of X_test dataset: ", X_test),
                         ("Number of y_test dataset: ", y_test)):
    print(size_label, part.shape)

In [None]:
# DATA PREPARATION: Handling Unbalanced dataset
# Perform SMOTE algorithm to handle the unbalanced dataset
# https://www.kaggle.com/qianchao/smote-with-imbalance-data

# y_train is a single-column 2-D array, so the built-in sum() would iterate it
# row by row in Python and print the count as a one-element array ("[N]").
# np.count_nonzero returns a plain scalar count whatever the array's shape.
print("Before OverSampling of training dataset, counts of label 'white': {}".format(np.count_nonzero(y_train == "white")))
print("Before OverSampling of training dataset, counts of label 'red': {} \n".format(np.count_nonzero(y_train == "red")))

In [None]:
# DATA PREPARATION: Handling Unbalanced dataset by oversampling label red wine

from imblearn.over_sampling import SMOTE

# Only the training split is resampled; labels are flattened to 1-D first
sm = SMOTE(random_state=10)
flat_train_labels = y_train.ravel()
X_train_res, y_train_res = sm.fit_resample(X_train, flat_train_labels)

print('After OverSampling, the shape of train_X_res: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y_res: {} \n'.format(y_train_res.shape))

# Per-class counts after resampling
white_count = (y_train_res == "white").sum()
red_count = (y_train_res == "red").sum()
print("After OverSampling, counts of label 'white': {}".format(white_count))
print("After OverSampling, counts of label 'red': {} \n".format(red_count))

In [None]:
# 4. MODEL - PIPELINE CREATION

In [None]:
# MODEL - PIPELINE: Libraries needed
from sklearn.pipeline import Pipeline
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier  

In [None]:
# 4. MODEL - PIPELINE CREATION - Create pipelines with or without PCA
#
# Every stochastic estimator gets a fixed random_state so that
# "Restart & Run All" reproduces the same fitted models and scores.
# Logistic regression was already seeded; the tree-based models were not.

## First create three different pipelines without PCA
# Pipeline for logistic regression (features are standardised first)
pipeline_lr = Pipeline([('scalar1', StandardScaler()),
                        ('lr_classifier', LogisticRegression(random_state=0))])

# Pipeline for decision tree
pipeline_dt = Pipeline([('dt_classifier', DecisionTreeClassifier(random_state=0))])

# Pipeline for random forest classification
pipeline_rf = Pipeline([('rf_classifier', RandomForestClassifier(random_state=0))])

## Now create pipelines with PCA (standardise, project onto 2 components, classify)

# Pipeline for logistic regression
pipeline_pca_lr = Pipeline([('scalar1', StandardScaler()),
                            ('pca1', PCA(n_components=2)),
                            ('lr_classifier', LogisticRegression(random_state=0))])

# Pipeline for decision tree
pipeline_pca_dt = Pipeline([('scalar2', StandardScaler()),
                            ('pca2', PCA(n_components=2)),
                            ('dt_classifier', DecisionTreeClassifier(random_state=0))])

# Pipeline for random forest classification
pipeline_pca_rf = Pipeline([('scalar3', StandardScaler()),
                            ('pca3', PCA(n_components=2)),
                            ('rf_classifier', RandomForestClassifier(random_state=0))])

# Create the list of pipelines for classifier. The order matters: later cells
# look up display names by position in this list.
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_pca_lr, pipeline_pca_dt, pipeline_pca_rf]

In [None]:
# 4. MODEL - PIPELINE CREATION: Train every candidate pipeline on the oversampled training data
for candidate in pipelines:
    candidate.fit(X_train_res, y_train_res)

In [None]:
# 5. MODEL EVALUATION: Dictionary of pipelines and classifier type for ease of reference
# Keys are positions in the `pipelines` list.
pipe_dict = {0: 'Logistic Regression only', 1: 'Decision Tree classifier only', 2: 'Random Forest Classifier only',3: 'First PCA and Logistic Regression', 4: 'First PCA and Decision Tree classifier', 5: 'First PCA and Random Forest Classifier'}

# Model evaluation in training dataset
# The built-in round() works whether score() returns a NumPy scalar or a plain
# Python float. The .round() method exists only on NumPy scalars.
for i, model in enumerate(pipelines):
    train_accuracy = round(model.score(X_train_res, y_train_res), 4)
    print("{} Training Accuracy: {}".format(pipe_dict[i], train_accuracy))

In [None]:
# 5. MODEL EVALUATION : Model evaluation in testing dataset
# The built-in round() works whether score() returns a NumPy scalar or a plain
# Python float. The .round() method exists only on NumPy scalars.
for i, model in enumerate(pipelines):
    test_accuracy = round(model.score(X_test, y_test), 4)
    print("{} Test Accuracy: {}".format(pipe_dict[i], test_accuracy))

In [None]:
# 5. MODEL EVALUATION : Best accuracy in testing dataset
# NOTE(review): the winner is picked on the test split itself, so its reported
# test accuracy is an optimistic estimate.

# Initializer
best_accuracy = 0.0
best_classifier = 0   # position of the winner in `pipelines` / key into pipe_dict
best_pipeline = None  # becomes the winning Pipeline object

for i, model in enumerate(pipelines):
    # Score once per pipeline; each call re-predicts the whole test set
    current_accuracy = model.score(X_test, y_test)
    # Strict '>' keeps the earliest pipeline when accuracies tie
    if current_accuracy > best_accuracy:
        best_accuracy = current_accuracy
        best_pipeline = model
        best_classifier = i
print("Classifier with best accuracy in test dataset: {}".format(pipe_dict[best_classifier]))

In [None]:
# 6. PERFORMANCE METRICS OF BEST ALGORITHM

# Logistic Regression and Random Forest both work really well, so we will do a detailed performance-metrics analysis of these two algorithms.
# Also, performing PCA before applying the LR or Random Forest algorithms did not enhance the performance of the model.

In [None]:
# 6. PERFORMANCE METRICS OF BEST ALGORITHM: Predictions from the logistic regression and random forest pipelines

# Logistic Regression only (Note: NO PCA): fit on the oversampled training data, then predict the test set
y_pred_LR = pipeline_lr.fit(X_train_res, y_train_res).predict(X_test)

# Random forest classification only (Note: NO PCA): fit on the oversampled training data, then predict the test set
y_pred_RF = pipeline_rf.fit(X_train_res, y_train_res).predict(X_test)

In [None]:
# 6. DETAILED PERFORMANCE METRICS OF BEST ALGORITHM: LOGISTIC REGRESSION WITHOUT PCA

# All metric helpers used in this cell
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)

confusion = confusion_matrix(y_test, y_pred_LR)
print('Confusion Matrix\n')
print(confusion)

# '{:.2f}' already formats to two decimals, so no separate rounding is needed.
# The earlier .round(4) call fails when the score is a plain Python float.
print('\nAccuracy score: {:.2f}\n'.format(accuracy_score(y_test, y_pred_LR)))

# Micro average: metrics computed from the total counts over both classes
print('Micro Precision: {:.2f}'.format(precision_score(y_test, y_pred_LR, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_test, y_pred_LR, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred_LR, average='micro')))

# Macro average: unweighted mean of the per-class metrics
print('Macro Precision: {:.2f}'.format(precision_score(y_test, y_pred_LR, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_test, y_pred_LR, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred_LR, average='macro')))

# Weighted average: per-class metrics weighted by class support
print('Weighted Precision: {:.2f}'.format(precision_score(y_test, y_pred_LR, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_test, y_pred_LR, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_test, y_pred_LR, average='weighted')))

# target_names are applied in sorted label order: 'red' first, then 'white'
print('\nClassification Report\n')
print(classification_report(y_test, y_pred_LR, target_names=['Red wine', 'White wine']))

In [None]:
# 6. DETAILED PERFORMANCE METRICS OF BEST ALGORITHM: RANDOM FOREST CLASSIFICATION WITHOUT PCA

# Import every metric helper used here so the cell does not rely on another cell's imports
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)

confusion = confusion_matrix(y_test, y_pred_RF)
print('Confusion Matrix\n')
print(confusion)

# '{:.2f}' already formats to two decimals, so no separate rounding is needed.
# The earlier .round(4) call fails when the score is a plain Python float.
print('\nAccuracy score: {:.2f}\n'.format(accuracy_score(y_test, y_pred_RF)))

# Micro average: metrics computed from the total counts over both classes
print('Micro Precision: {:.2f}'.format(precision_score(y_test, y_pred_RF, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_test, y_pred_RF, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred_RF, average='micro')))

# Macro average: unweighted mean of the per-class metrics
print('Macro Precision: {:.2f}'.format(precision_score(y_test, y_pred_RF, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_test, y_pred_RF, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred_RF, average='macro')))

# Weighted average: per-class metrics weighted by class support
print('Weighted Precision: {:.2f}'.format(precision_score(y_test, y_pred_RF, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_test, y_pred_RF, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_test, y_pred_RF, average='weighted')))

# target_names are applied in sorted label order: 'red' first, then 'white'
print('\nClassification Report\n')
print(classification_report(y_test, y_pred_RF, target_names=['Red wine', 'White wine']))

In [None]:
# 7. FINAL EVALUATION

In [None]:
# 7. FINAL EVALUATION: Logistic regression and random forest work equally well.
# Logistic regression is used here because it is the simpler of the two.

# Confusion matrix of the logistic-regression predictions on the test set
cm = confusion_matrix(y_test, y_pred_LR)

# Wrap it in a labelled DataFrame so the heatmap axes show class names
class_labels = ['Red wine', 'White wine']
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

# Draw the annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(5.5, 4))
sns.heatmap(cm_df, annot=True, fmt='d', annot_kws={"size": 20}, ax=ax)
ax.set_title('Logistic Regression \nAccuracy:{0:.3f}'.format(accuracy_score(y_test, y_pred_LR)))
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
plt.show()

In [None]:
# Final conclusion:
# Because the data look close to ideal, Logistic Regression as well as Random Forest gave the best results.
