## Import the Libraries

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

import warnings
warnings.filterwarnings('ignore')

## Import Dataset

In [60]:
# Location of the semicolon-separated red-wine CSV.
# Set the WINE_DATA_PATH environment variable to point at the file on another
# machine; when it is unset the original local path is used.
DATA_PATH = os.environ.get(
    'WINE_DATA_PATH',
    'C:/Users/Shikhar/Desktop/ML/Asgn-2/data/winequality-red.csv',
)
data = pd.read_csv(DATA_PATH, sep=';')

## Convert the `quality` attribute to a binary label: 0 (bad) if the value is less than or equal to 6, otherwise 1 (good)

In [61]:
# Binarise the label in one vectorised step: quality <= 6 -> 0 (bad), else 1 (good).
# The result is assigned back onto the frame itself; writing element by element
# through a separate `x = data['quality']` handle is chained assignment and is
# not guaranteed to modify `data` (it does not under pandas Copy-on-Write).
quality = data['quality']
data['quality'] = np.where(quality <= 6, 0, 1).astype(quality.dtype)
x = data['quality']          # kept so the name `x` still refers to the label column

## MIN-MAX Scaling

In [62]:
## Min-max scaling of every feature column (the 'quality' label is not scaled)

for s in [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]:
    # Map the column linearly so its minimum becomes 0 and its maximum becomes 1.
    data[s] = (data[s] - data[s].min()) / (data[s].max() - data[s].min())

# Dataset is ready for Logistic Regression

## Feature matrix and target for the sigmoid (logistic) model

In [63]:
#Dataset to be used

# Names of the eleven input columns taken from `data`.
feature_columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# X holds the inputs, y is a one-column frame holding the 'quality' label.
X = pd.DataFrame(data=data, columns=feature_columns)
y = pd.DataFrame(data=data, columns=['quality'])

In [64]:
# Constant column of ones (x0 = 1) so the first parameter acts as the intercept.
x0=np.ones(X.shape[0])
# DataFrame.insert raises ValueError if the column already exists, so guard the
# call to keep this cell safe to re-run.
if 'X_ref' not in X.columns:
    X.insert(0, 'X_ref', x0)

## Basic Model for Logistic Regression

In [65]:
def model(X, y, X_test=None, y_test=None, num_iter=10000, learning_rate=0.01):
    """Fit logistic regression by batch gradient descent and return accuracy.

    Parameters
    ----------
    X : pandas.DataFrame
        Training inputs, one row per example. No intercept is added here; the
        caller supplies a constant column if one is wanted.
    y : pandas.DataFrame
        Frame with a 'quality' column holding the 0/1 label of each row of X.
    X_test, y_test : pandas.DataFrame, optional
        Rows to score the fitted parameters on, in the same layout as X and y.
        When both are omitted, accuracy is computed on the training rows
        (the original behaviour).
    num_iter : int, default 10000
        Number of gradient-descent steps.
    learning_rate : float, default 0.01
        Step size of each update.

    Returns
    -------
    float
        Accuracy in percent, (TP + TN) * 100 / (TP + TN + FP + FN), using a
        0.5 threshold on the sigmoid output.

    Raises
    ------
    ValueError
        If X has no rows, or if only one of X_test / y_test is given.
    """

    def sigmoid(z):
        return 1 / (1 + np.exp(-z))

    if (X_test is None) != (y_test is None):
        raise ValueError("X_test and y_test must be provided together")

    # Convert once to plain arrays: the loop then avoids per-iteration pandas
    # overhead and no longer depends on the frames' index labels.
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y['quality'], dtype=float)

    m = features.shape[0]                         # number of training examples
    if m == 0:
        raise ValueError("X must contain at least one row")

    theta = np.zeros(features.shape[1])

    for _ in range(num_iter):
        y_predicted = sigmoid(np.dot(features, theta))
        # Gradient of the mean log-loss with respect to theta.
        gradient = (1 / m) * np.dot(features.T, y_predicted - labels)
        theta = theta - learning_rate * gradient

    # Rows to score: the held-out set when given, otherwise the training rows.
    if X_test is None:
        eval_features, eval_labels = features, labels
    else:
        eval_features = np.asarray(X_test, dtype=float)
        eval_labels = np.asarray(y_test['quality'], dtype=float)

    # Confusion-matrix counts, computed for all rows at once.
    probabilities = sigmoid(np.dot(eval_features, theta))
    predicted_good = probabilities >= 0.5
    predicted_bad = probabilities < 0.5
    actual_good = eval_labels == 1
    actual_bad = eval_labels == 0

    True_Positive = int(np.count_nonzero(predicted_good & actual_good))
    True_Negative = int(np.count_nonzero(predicted_bad & actual_bad))
    False_Positive = int(np.count_nonzero(predicted_good & actual_bad))
    False_Negative = int(np.count_nonzero(predicted_bad & actual_good))

    acc = ((True_Positive+True_Negative)*100.0)/(True_Positive+True_Negative+False_Positive+False_Negative)

    return acc

    
# Fit on the full dataset; the displayed value is the accuracy (in percent)
# measured on those same rows.
model(X, y)

86.42901813633522

## k-fold cross validation from scratch

In [66]:
k=3

# Three training splits, each built by leaving out one contiguous third of the
# rows. `.loc` slices are label-based and inclusive at both ends; the index is
# renumbered from 0 afterwards because model() looks rows up by position label.

# Split 1: rows 533 onward (rows 0-532 left out).
X_train_1 = X.loc[533:1599].reset_index(drop=True)
y_train_1 = y.loc[533:1599].reset_index(drop=True)

# Split 2: rows 0-532 plus rows 1066-1598 (rows 533-1065 left out).
X_train_2 = pd.concat([X.loc[0:532], X.loc[1066:1598]], axis=0).reset_index(drop=True)
y_train_2 = pd.concat([y.loc[0:532], y.loc[1066:1598]], axis=0).reset_index(drop=True)

# Split 3: rows 0-1065 (rows 1066 onward left out).
X_train_3 = X.loc[0:1065].reset_index(drop=True)
y_train_3 = y.loc[0:1065].reset_index(drop=True)

In [67]:
# Mean accuracy over the three training splits.
# model(X, y) fits on the rows it is given and scores those same rows, so each
# value below is a training-set accuracy for that split.
acc1, acc2, acc3 = (
    model(X_train_1, y_train_1),
    model(X_train_2, y_train_2),
    model(X_train_3, y_train_3),
)

acc = (acc1 + acc2 + acc3) / 3

print(acc)                      # average of the three per-split accuracies

86.42901813633522


## Logistic Regression using ScikitLearn Library

In [68]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [69]:
# Fit scikit-learn's logistic regression on the same data to compare results
# with the from-scratch model.
logistic_regression = LogisticRegression()
# Pass the label as a 1-D Series: a single-column DataFrame makes scikit-learn
# reshape it internally and emit a DataConversionWarning.
logistic_regression.fit(X, y['quality'])
y_predict = logistic_regression.predict(X)
acc = metrics.accuracy_score(y['quality'], y_predict)
print("The accuracy from sklearn is :",acc*100.0)

The accuracy from sklearn is : 87.74233896185115


## k-fold cross validation using scikit learn

In [70]:
# Per-fold accuracies from scikit-learn's 3-fold cross-validation, then their mean.
k_fold_acc = cross_val_score(estimator=logistic_regression, X=X, y=y, cv=3)
k_fold_mean = k_fold_acc.mean()
print(
    "The accuracy for the Logistic Regression model using k-fold cross validation is :",
    k_fold_mean*100,
)

The accuracy for the Logistic Regression model using k-fold cross validation is : 86.74250387300826


In [71]:
# Confusion matrix of the scikit-learn predictions against the true labels.
sklearn_confusion = confusion_matrix(y, y_predict)
print(sklearn_confusion)

[[1358   24]
 [ 172   45]]


## Metrics (Accuracy score, Precision Score and Recall) from the Model

In [72]:
# Accuracy, precision and recall of the scikit-learn model on the full dataset.
for metric_label, metric_fn in (
    ("Accuracy Score :", accuracy_score),
    ("Prcision Score :", precision_score),
    ("Recall Score :", recall_score),
):
    print(metric_label, metric_fn(y, y_predict))

Accuracy Score : 0.8774233896185115
Prcision Score : 0.6521739130434783
Recall Score : 0.2073732718894009
