## Importing the necessary libraries

In [None]:
import operator as op
import random
random.seed(123)

import numpy as np 
import pandas as pd 
from math import sqrt
from random import randrange
from sklearn.model_selection import train_test_split
from scipy.stats import mode

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix
import sklearn.metrics as skm

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(rc={'figure.figsize': (12,8)})

import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

## Data import and first exploration

In [None]:
# Load the wine dataset from the Kaggle input directory into a DataFrame
# and display its first rows as a quick sanity check.
df = pd.read_csv('/kaggle/input/wine-quality-binary-classification/wine.csv')
df.head()

In [None]:
# Summarise the columns: names, non-null counts and dtypes.
df.info()

In [None]:
# Encode the `quality` label as integer category codes in a new column,
# `quality_cat`, which is used as the numeric target further down.
# NOTE(review): for a two-class label this presumably yields 0/1 - confirm
# which class maps to which code before interpreting the metrics.
df['quality_cat'] = df['quality'].astype('category').cat.codes
df.head()

In [None]:
# Correlation heatmap of the numeric columns, showing the lower triangle only.
# Restrict to numeric columns explicitly: `quality` appears to hold
# non-numeric labels, and recent pandas versions raise instead of silently
# dropping such columns in DataFrame.corr().
corr = df.select_dtypes(include='number').corr()
# Hide the upper triangle and the diagonal - they only mirror the lower half.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, annot=True, mask=mask)
plt.show()

In [None]:
# Scatter plot of density against alcohol, coloured by the quality label.
# NOTE(review): the original note calls this the most strongly correlated
# pair - verify against the heatmap above.
sns.scatterplot(data=df, x='density', y='alcohol', hue='quality')
plt.show()

# K-Nearest Neighbors
#### A machine learning model that uses the Euclidean distance between data points. In other words, each new point is compared with its closest training points (its nearest neighbours), and their labels are used to make the prediction.

In [None]:
# Registry of results: maps a model name to its accuracy on the test split.
models = {}

In [None]:
class kNN():
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    The model is lazy: ``fit`` only stores the training data and every
    prediction scans the whole training set.
    """

    def __init__(self, num_k):
        """Store ``num_k``, the number of nearest neighbours that vote."""
        self.num_k = num_k

    def fit(self, X_train, Y_train):
        """Memorise the training set.

        X_train: array-like of shape (m, n) - m samples with n features.
        Y_train: array-like of length m with the class label of each sample.
        """
        # Coerce to arrays so positional indexing also works for DataFrames
        # and lists.
        self.X_train = np.asarray(X_train)
        self.Y_train = np.asarray(Y_train)

        self.m, self.n = self.X_train.shape

    def predict(self, X_test):
        """Return a float array with the predicted label of each row of X_test.

        Each prediction is the most frequent label among the ``num_k``
        nearest training samples; ties go to the smallest label.
        """
        self.X_test = np.asarray(X_test)
        self.m1 = self.X_test.shape[0]
        preds = np.zeros(self.m1)
        for i, x in enumerate(self.X_test):
            # np.unique returns the labels sorted, so argmax (first maximum)
            # resolves ties towards the smallest label. This does not depend
            # on the return shape of scipy.stats.mode, which changed between
            # SciPy releases.
            labels, votes = np.unique(self.get_neighbors(x), return_counts=True)
            preds[i] = labels[np.argmax(votes)]

        return preds

    def accuracy(self, Y_test, Y_pred):
        """Return the fraction of predictions in Y_pred that equal Y_test."""
        return np.mean(np.asarray(Y_test) == np.asarray(Y_pred))

    def get_neighbors(self, x):
        """Return the labels of the ``num_k`` training samples closest to x."""
        # Distance from x to every training row in one vectorised pass.
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))

        # A stable sort keeps equidistant samples in training order, so the
        # selected neighbours are deterministic.
        nearest = np.argsort(distances, kind='stable')[:self.num_k]

        return self.Y_train[nearest]

    def euclidean_distance(self, X1, X2):
        """Return the Euclidean distance between two vectors (data points)."""
        return np.sqrt(np.sum((X1 - X2) ** 2))

# Logistic Regression
#### Based on the sigmoid function, which outputs a value (a probability) between 0 and 1. Using gradient descent, we iterate through the epochs with step size alpha to minimize the error, in order to approach the global minimum of the cost function if possible.

In [None]:
class LogReg():
    """Binary logistic regression trained with batch gradient descent."""

    def __init__(self, alpha, epochs):
        """Store the step size ``alpha`` and the number of iterations ``epochs``."""
        self.alpha = alpha
        self.epochs = epochs

    def fit(self, X, y):
        """Train the model on X (shape (m, n)) and binary labels y (length m).

        Weights start at one. The learned weights are kept in ``self.theta``
        and the cost after every epoch in ``self.cost_history``.
        """
        self.X = np.asarray(X)
        self.y = np.asarray(y)
        self.m, self.n = self.X.shape
        self.theta = np.ones(self.n)
        self.cost_history, self.theta = self.gradient_descent(
            self.X, self.y, self.theta, self.alpha, self.epochs)

    def gradient_descent(self, X, y, theta, alpha, epochs):
        """Run ``epochs`` batch gradient-descent steps of size ``alpha``.

        ``theta`` is updated in place. Returns ``(J, theta)`` where ``J`` is
        the list of costs: the initial one followed by one per epoch.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        m = X.shape[0]
        J = [self.cost(X, y, theta)]
        for _ in range(epochs):
            # Gradient of the mean log-loss w.r.t. theta is X^T (h - y) / m;
            # all weights are updated simultaneously from the same h.
            error = self.hypothesis(X, theta) - y
            theta -= (alpha / m) * (X.T @ error)
            J.append(self.cost(X, y, theta))
        return J, theta

    def cost(self, X, y, theta):
        """Return the mean binary cross-entropy of ``theta`` on (X, y)."""
        y = np.asarray(y)
        # Keep probabilities strictly inside (0, 1) so a saturated sigmoid
        # yields a large finite cost instead of nan/inf.
        eps = 1e-15
        h = np.clip(self.hypothesis(X, theta), eps, 1 - eps)
        return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))

    def hypothesis(self, X, theta):
        """Return sigmoid(X . theta): the predicted probability of class 1."""
        z = np.dot(theta, np.asarray(X).T)
        # Beyond |z| = 500 the sigmoid is already 0 or 1 to double precision;
        # clipping only prevents overflow in exp().
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def predict(self, X, threshold=0.5):
        """Return a list of 0/1 labels: 1 where the probability >= threshold.

        Uses the weights learned by ``fit`` as they are. Predicting never
        trains the model further, so repeated calls give identical results.
        """
        h = self.hypothesis(X, self.theta)
        return [1 if p >= threshold else 0 for p in h]

    def accuracy(self, y_test, y_pred):
        """Return the fraction of entries of y_pred that equal y_test."""
        # Convert first: comparing two plain lists with == would yield a
        # single bool rather than an element-wise result.
        return np.mean(np.asarray(y_test) == np.asarray(y_pred))

In [None]:
# Visualizing for the binary categories
def visualize(Y_test, pred_knn, pred_log):
    """Draw ROC curves for a baseline, the k-NN and the logistic regression.

    The baseline predicts class 0 for every sample. The AUC of each of the
    three predictors is shown in the plot title. The curves are drawn on the
    current matplotlib figure; the caller adds the legend and shows it.
    """
    baseline = [0] * len(Y_test)
    series = (
        ('Benchmark', baseline, {'linestyle': '--'}),
        ('kNN', pred_knn, {'marker': '.'}),
        ('Log Reg', pred_log, {'marker': '.'}),
    )

    # Compute every score and curve before drawing anything.
    aucs = []
    curves = []
    for _, preds, _ in series:
        aucs.append(roc_auc_score(Y_test, preds))
        fpr, tpr, _ = roc_curve(Y_test, preds)
        curves.append((fpr, tpr))

    for (label, _, line_style), (fpr, tpr) in zip(series, curves):
        plt.plot(fpr, tpr, label=label, **line_style)

    plt.title('BM =%.2f, K-NN =%.2f, Logistic Regression =%.2f' % tuple(aucs))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

In [None]:
# Features: every column except the raw and the encoded label.
# Target: the encoded label.
X = df.drop(['quality_cat', 'quality'], axis=1).values
Y = df['quality_cat'].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=2)

# Standardise the features using statistics of the training split only (no
# information leaks from the test split). Without this, the columns with the
# largest raw scale dominate the Euclidean distance used by k-NN and saturate
# the sigmoid of the logistic regression.
mu = X_train.mean(axis=0)
sigma = X_train.std(axis=0)
sigma[sigma == 0] = 1.0  # constant columns: avoid dividing by zero
X_train = (X_train - mu) / sigma
X_test = (X_test - mu) / sigma

# Running k-NN with a hand-picked k = 4 (no elbow search is performed here)
clf = kNN(4)
clf.fit(X_train, Y_train)
pred_knn = clf.predict(X_test)
models['k-NN'] = clf.accuracy(Y_test, pred_knn)

# Running Logistic Regression: prepend a column of ones so that theta[0]
# acts as the intercept
inter_0 = np.ones((X_train.shape[0], 1))
inter_1 = np.ones((X_test.shape[0], 1))
X_train = np.hstack((inter_0, X_train))
X_test = np.hstack((inter_1, X_test))

model = LogReg(0.001, 10000)
model.fit(X_train, Y_train)
pred_log = model.predict(X_test, 0.5)
# Record the accuracy next to the k-NN one so both models can be compared.
models['Logistic Regression'] = model.accuracy(Y_test, pred_log)

visualize(Y_test, pred_knn, pred_log)

plt.legend()
plt.show()