# Table of Contents

### Libraries & Environment Setup
### Data Loading & Formatting
### Missing Values
### Target Variable Binarization
### Exploratory Data Analysis
    1. Distribution of the target variable (Original vs Binary)   
    2. How do our variables interact with one another? (Correlation Matrix)
    3. How does one of the strongest relationships fare between wine types?
    4. How do the different types of acidity vary between good and bad wines?
    5. What are predictors of pH of any type of wine?
### Modeling 
    1. Target Variable Comparison
        a. MultiClass Using the Original Target Variable  
        b. Binary Using the newly created column 
        c. Three classes using the newly created column
    2. Find best sample Split
    3. Find best k
    4. Hyperparameter Tuning
    5. Check for overfitting

# Libraries & Environment Setup

In [None]:
import numpy as np
import pandas as pd 

import utils_clf_models as clf
import utils_data_prepping as udp
import utils_eda as eda

%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns
# Default figure size (width, height) for plots that do not set their own.
sns.set(rc={'figure.figsize':(12, 10)})

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Loading the data

In [None]:
# Load the red-wine quality CSV through the project helper `udp.loading`
# and preview the first rows.
df = udp.loading('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
df.head()

In [None]:
# Distinct values taken by the original `quality` target.
df['quality'].unique()

# Checking for any missing values

In [None]:
# Missing-value check delegated to the project helper `udp.missing_values`
# (its implementation and output format are not shown in this notebook).
udp.missing_values(df)

# Modifying the target variable

In [None]:
# A wine is labelled "good" when its quality score is 6 or higher, "bad" otherwise.
# The same threshold mask feeds both the text label and its 1/0 encoding.
is_good = df['quality'] >= 6
df['quality_binary'] = np.where(is_good, 'good', 'bad')
df['quality_binary_num'] = np.where(is_good, 1, 0)
df.head()

In [None]:
# Three-tier target: scores 3-4 -> tier 0, 5-6 -> tier 1, 7-8 -> tier 2.
# Scores missing from the mapping become NaN after `.map`.
tier_labels = ['bad', 'good', 'very good']

d = {3: 0, 4: 0, 5: 1, 6: 1, 7: 2, 8: 2}
# Text labels are derived from the numeric tiers so both mappings stay in sync.
d1 = {score: tier_labels[tier] for score, tier in d.items()}

df['quality_tri_num'] = df['quality'].map(d)
df['quality_tri'] = df['quality'].map(d1)
df.head()

# Exploratory Data Analysis

## 1. Distribution of the target variable (Original vs Binary)

In [None]:
# One histogram per target encoding, side by side on a shared y-axis.
target_columns = ['quality', 'quality_binary', 'quality_tri']

fig, axes = plt.subplots(
    1, 3,
    figsize=(16, 12),
    sharex=False,
    sharey=True,
    gridspec_kw={'hspace': 0.1, 'wspace': 0.1},
)
fig.suptitle('Histograms for Target Variable (Original vs Binary)')

for ax, column in zip(axes, target_columns):
    sns.histplot(ax=ax, data=df, x=column)

plt.show()

## 2. How do our variables interact with one another? (Correlation Matrix)

In [None]:
# Correlation matrix drawn by the project helper `eda.corr_matrix`.
# NOTE(review): `df` also holds the string label columns (`quality_binary`,
# `quality_tri`) at this point — confirm the helper ignores non-numeric columns.
eda.corr_matrix(df)
plt.show()

## 3. How does one of the strongest relationships fare between wine types?

In [None]:
# Alcohol vs density, with points coloured by the binary quality label.
sns.scatterplot(data=df, x='alcohol', y='density', hue='quality_binary')
plt.show()

In [None]:
# Same alcohol vs density view, coloured by the three-class quality label.
sns.scatterplot(data=df, x='alcohol', y='density', hue='quality_tri')
plt.show()

## 4. How do the different types of acidity vary between good and bad wines?

In [None]:
# 2x2 scatter grid of citric acid against the other acidity measures.
#   columns: fixed acidity | volatile acidity
#   rows:    binary quality label | three-class quality label
fig = plt.figure(figsize=(16, 12))
gs = fig.add_gridspec(2, 2, hspace=0.2, wspace=0.2)
axes = gs.subplots(sharex=False, sharey=True)
# Title describes these scatter panels (not the target-variable histograms).
fig.suptitle('Citric Acid vs Fixed and Volatile Acidity by Wine Quality (Binary vs Three Classes)')

for row_axes, hue_column in zip(axes, ['quality_binary', 'quality_tri']):
    for ax, acidity_column in zip(row_axes, ['fixed acidity', 'volatile acidity']):
        sns.scatterplot(ax=ax, data=df, x=acidity_column, y='citric acid', hue=hue_column)

plt.show()

## 5. What are predictors of pH of any type of wine?

In [None]:
# 2x2 scatter grid of pH against two candidate predictors.
#   columns: fixed acidity | citric acid
#   rows:    binary quality label | three-class quality label
fig = plt.figure(figsize=(16, 12))
gs = fig.add_gridspec(2, 2, hspace=0.2, wspace=0.2)
axes = gs.subplots(sharex=False, sharey=True)
# Title describes these scatter panels (not the target-variable histograms).
fig.suptitle('pH vs Fixed Acidity and Citric Acid by Wine Quality (Binary vs Three Classes)')

for row_axes, hue_column in zip(axes, ['quality_binary', 'quality_tri']):
    for ax, predictor_column in zip(row_axes, ['fixed acidity', 'citric acid']):
        sns.scatterplot(ax=ax, data=df, x=predictor_column, y='pH', hue=hue_column)

plt.show()

# Modeling 

## 1. Target Variable Comparison 

### a. MultiClass Using the Original Target Variable

In [None]:
# Baseline KNN on the original `quality` target: the four derived label
# columns are dropped before the frame is handed to `udp.pre_processing`.
df1 = df.drop(columns=[
    'quality_binary', 'quality_binary_num',
    'quality_tri', 'quality_tri_num',
])
X, y = udp.pre_processing(df1, 'quality')

knn = clf.Classifier(X, y, 'knn')
knn.preprocess_split(size=0.2, state=12)
knn.fit_predict()

# Accuracy entry of the evaluation returned by the project classifier wrapper.
multi = knn.metrics().ev['Accuracy']

### b. Binary Using the newly created column

In [None]:
# Baseline KNN on the binary target `quality_binary_num`: the original score
# and the other derived label columns are dropped before pre-processing.
df2 = df.drop(columns=[
    'quality_binary', 'quality',
    'quality_tri', 'quality_tri_num',
])
X, y = udp.pre_processing(df2, 'quality_binary_num')

knn = clf.Classifier(X, y, 'knn')
knn.preprocess_split(size=0.2, state=12)
knn.fit_predict()

# Accuracy entry of the evaluation returned by the project classifier wrapper.
binary = knn.metrics().ev['Accuracy']

### c. Three classes using the newly created column

In [None]:
# Baseline KNN on the three-class target `quality_tri_num`: the original score
# and the other derived label columns are dropped before pre-processing.
df3 = df.drop(columns=[
    'quality_binary', 'quality',
    'quality_tri', 'quality_binary_num',
])
X, y = udp.pre_processing(df3, 'quality_tri_num')

knn = clf.Classifier(X, y, 'knn')
knn.preprocess_split(size=0.2, state=12)
knn.fit_predict()

# Accuracy entry of the evaluation returned by the project classifier wrapper.
trio = knn.metrics().ev['Accuracy']

In [None]:
# Accuracy of the three target encodings side by side.
# `met` keeps the raw (positionally labelled) table; the renamed copy on the
# last line is what the cell displays.
met = pd.DataFrame([multi, binary, trio]).T
met.rename(index={0: 'Accuracy'},
           columns={0: "multi_class", 1: "binary_class", 2: "three_classes"})

#### After multiple attempts, it appears that splitting the target variable into three classes works better. We will proceed to find the best k to maximize accuracy.

# Modeling Phase 2: Find best sample Split

In [None]:
# Works using utils_clf_models (See Documentation)
def search_best_split(clf, iters):
    """Return the (size, state) pair that yields the highest accuracy.

    Every split size from 0.2 upwards in steps of 0.05 (stopping before 0.45)
    is combined with every state in ``0..iters`` inclusive. For each pair the
    classifier is re-split, re-fitted and scored.

    Parameters
    ----------
    clf : object
        Must expose ``preprocess_split(size=..., state=...)``,
        ``fit_predict()`` and ``metrics()`` whose result has an ``ev``
        mapping with an ``'Accuracy'`` entry. Note that inside this function
        the parameter name shadows the module alias ``clf``.
    iters : int
        Largest state value to try (states ``0`` through ``iters``).

    Returns
    -------
    tuple
        ``(size, state)`` of the first pair reaching the maximum accuracy.
    """
    split_sizes = [round(step, 2) for step in np.arange(0.2, 0.45, 0.05)]

    accuracy_by_split = {}
    for size in split_sizes:
        for state in range(iters + 1):
            clf.preprocess_split(size=size, state=state)
            clf.fit_predict()
            accuracy_by_split[(size, state)] = clf.metrics().ev['Accuracy']

    # max() keeps the earliest inserted key on ties.
    return max(accuracy_by_split, key=accuracy_by_split.get)
# Label columns removed before pre-processing for the `quality_tri_num` target.
label_columns = ['quality_binary', 'quality', 'quality_tri', 'quality_binary_num']

# Search split sizes and states 0..500 for the most accurate split.
data = df.drop(columns=label_columns)
X, y = udp.pre_processing(data, 'quality_tri_num')
knn = clf.Classifier(X, y, 'knn')
s, t = search_best_split(knn, 500)

# Rerunning it with the following sampling parameters
data = df.drop(columns=label_columns)
X, y = udp.pre_processing(data, 'quality_tri_num')
knn = clf.Classifier(X, y, 'knn')
knn.preprocess_split(size=s, state=t)
knn.fit_predict()
trio = knn.metrics().ev['Accuracy']
trio

#### From the loop above, we obtain the best sampling parameters: the split size and the random state, respectively.

# Modeling Phase 3: Find best k

In [None]:
def elbow_method(clf, iters):
    """Return the ``n_neighbors`` value with the highest accuracy.

    Despite the name, no elbow is located: every k in ``2..iters`` inclusive
    is fitted on the classifier's current split and the k with the maximum
    accuracy is returned (the smallest such k on ties).

    Parameters
    ----------
    clf : object
        Must expose ``fit_predict(params)`` accepting a dict with an
        ``'n_neighbors'`` key, and ``metrics()`` whose result has an ``ev``
        mapping with an ``'Accuracy'`` entry. Note that inside this function
        the parameter name shadows the module alias ``clf``.
    iters : int
        Largest k to try; must be at least 2.

    Returns
    -------
    int
        The best-scoring k.

    Raises
    ------
    ValueError
        If ``iters`` is below 2, i.e. there is no k to evaluate.
    """
    if iters < 2:
        raise ValueError('iters must be >= 2 so that at least one k is evaluated')

    scores = {}
    for k in range(2, iters + 1):
        clf.fit_predict({'n_neighbors': k})
        # Evaluate once per k; the metrics were previously computed twice and
        # the first result thrown away.
        scores[k] = clf.metrics().ev['Accuracy']
    return max(scores, key=scores.get)

# Label columns removed before pre-processing for the `quality_tri_num` target.
label_columns = ['quality_binary', 'quality', 'quality_tri', 'quality_binary_num']

# Scan k = 2..30 on the split (s, t) found earlier and keep the most accurate k.
data = df.drop(columns=label_columns)
X, y = udp.pre_processing(data, 'quality_tri_num')
knn = clf.Classifier(X, y, 'knn')
knn.preprocess_split(s, t)
k_best = elbow_method(knn, 30)

# Rerunning it with the following sampling parameters
data = df.drop(columns=label_columns)
X, y = udp.pre_processing(data, 'quality_tri_num')
knn = clf.Classifier(X, y, 'knn')
knn.preprocess_split(size=s, state=t)
knn.fit_predict({'n_neighbors': k_best})
trio = knn.metrics().ev['Accuracy']
trio

# Modeling Phase 4: Hyperparameter Tuning

In [None]:
# Using Grid Search (Exhaustive) with cross validation
# The class is imported under its own name so it does not rebind `knn`, which
# the other cells use for the project classifier wrapper.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

data = df.drop(['quality_binary', 'quality', 
               'quality_tri', 'quality_binary_num'], axis=1)
X, y = udp.pre_processing(data, 'quality_tri_num')

# scikit-learn requires leaf_size >= 1 and p > 0. A 0 in either list makes
# every candidate that uses it fail to fit, so the lower bound is 1.
# NOTE(review): n_jobs does not change predictions, only timing; it is kept in
# the grid because the results are ranked by score time below.
parameters = {'weights': ('uniform', 'distance'),
              'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
              'leaf_size': [1, 300], 'p': [1, 300], 'n_jobs': (-1, 1)}

# Tune around the k selected in the previous phase, since that is the k these
# parameters are combined with afterwards.
knn_estimator = KNeighborsClassifier(n_neighbors=k_best)
search = GridSearchCV(knn_estimator, parameters)
search.fit(X, y)

# Keep the top-ranked candidates and order them by score-time variability.
res = pd.DataFrame(search.cv_results_)
best = res[res['rank_test_score'] == 1]
best.sort_values(by=['std_score_time'], ascending=True)

#### Now that our hyperparameters are set, we will rerun the model with them before proceeding to the overfitting check

In [None]:
# Rerunning the model using the given parameters
# NOTE(review): apart from `k_best`, these values are typed in by hand rather
# than read from the grid-search result (`search.best_params_`) — confirm they
# still match the search output whenever the grid changes.
data = df.drop(columns=['quality_binary', 'quality',
                        'quality_tri', 'quality_binary_num'])
X, y = udp.pre_processing(data, 'quality_tri_num')

knn = clf.Classifier(X, y, 'knn')
knn.preprocess_split(size=s, state=t)

knn_params = dict(
    n_neighbors=k_best,
    weights='uniform',
    algorithm='kd_tree',
    leaf_size=300,
    p=300,
    n_jobs=1,
)
knn.fit_predict(knn_params)
trio = knn.metrics().ev['Accuracy']
trio

# Modeling Phase 5: Check for Overfitting