# INTRODUCTION

In this kernel, I will explain how to implement the logistic regression algorithm both from scratch and with the sklearn library.
    
* [Logistic Regression From Scratch](#1)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the red-wine quality CSV into a DataFrame and preview its first rows.
data = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
data.head()

In [None]:
# Show the distinct values that occur in the quality column.
data.quality.unique()

There are more than two quality values, but binary logistic regression distinguishes only two classes. We therefore need to reduce quality to two classes: 0 and 1.

In [None]:
# Binarize the target: wines rated above 7 become class 1, all others class 0.
data["quality"] = (data["quality"] > 7).astype("int64")

To split quality into classes 0 and 1, I used the condition 'greater than 7'. I tried two different thresholds, 6 and 7, and 7 gave better accuracy on both the training and the test set.

In [None]:
# Preview the first rows of the DataFrame.
data.head()

In [None]:
# Print a summary of the DataFrame's columns.
data.info()

Now, we can start implementing the logistic regression.

In [None]:
# Cast the quality labels to floating point.
data["quality"] = data["quality"].astype("float64")

<a id = '1'></a>
## Logistic Regression From Scratch

In this part, I will implement the logistic regression method from scratch. You can see the steps below.

* Train - Test Splitting
* Initialize w and b and Sigmoid Function

### Train - Test Splitting

To split the data into train and test sets, I will use the train_test_split function from the sklearn library.

In [None]:
# Features are the first 11 columns; the target is column 11, selected with a
# list index so that it stays two-dimensional (one column).
x = data.iloc[:, :11].to_numpy()
y = data.iloc[:, [11]].to_numpy()

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility), then
# transpose each of the four arrays so that samples lie along the columns.
x_train, x_test, y_train, y_test = (
    part.T
    for part in train_test_split(x, y, test_size=0.2, random_state=42)
)

### Initialize w and b and Sigmoid Function

To initialize the weights and bias, I implemented a function named 'initialize_w_b' that takes one parameter: dimension.

In [None]:
def initialize_w_b(dimension):
    """Create the starting parameters for logistic regression.

    Parameters
    ----------
    dimension : int
        Number of rows of the weight column vector.

    Returns
    -------
    tuple
        ``(w, b)`` where ``w`` is a ``(dimension, 1)`` array with every
        entry equal to 0.1 and ``b`` is the float 0.0.
    """
    weights = np.full(shape=(dimension, 1), fill_value=0.1)
    bias = 0.0
    return weights, bias

# Initializes a weight vector of shape (dimension, 1) filled with 0.1 and a bias of 0.0, then returns both.

In [None]:
def sigmoid(x):
    """Apply the logistic function ``1 / (1 + e**(-x))`` element-wise.

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        The logistic function of ``x``, with the same shape as ``x``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

### Forward and Backward Propagation
In this step, I will implement both forward and backward propagations.

In [None]:
def forward_backward_propagation(w, b, x_train, y_train):
    """Run one forward and one backward pass of logistic regression.

    Parameters
    ----------
    w : numpy.ndarray
        Weight column vector; ``w.T`` is multiplied with ``x_train``.
    b : float
        Bias added to every linear score.
    x_train : numpy.ndarray
        Inputs with one sample per column (``x_train.shape[1]`` samples).
    y_train : numpy.ndarray
        Labels (0 or 1) laid out to broadcast against the ``(1, n_samples)``
        score row.

    Returns
    -------
    cost : float
        Mean binary cross-entropy over all samples.
    gradients : dict
        ``"derivative_weight"`` (same shape as ``w``) and
        ``"derivative_bias"`` (scalar): gradients of the cost with respect
        to ``w`` and ``b``.
    """
    sample_count = x_train.shape[1]

    # Forward pass: linear scores and predicted probabilities.
    z = np.dot(w.T, x_train) + b
    y_head = sigmoid(z)

    # Binary cross-entropy, written as log(1 + e^z) - y*z. This is
    # algebraically identical to -y*log(y_head) - (1-y)*log(1-y_head) but
    # never evaluates log(0), so the cost stays finite even when the sigmoid
    # saturates to exactly 0 or 1 (large |z|, e.g. with unscaled features).
    loss = np.logaddexp(0, z) - y_train * z
    cost = np.sum(loss) / sample_count

    # Backward pass: gradients of the mean cost.
    error = y_head - y_train
    derivative_weight = np.dot(x_train, error.T) / sample_count
    derivative_bias = np.sum(error) / sample_count

    gradients = {"derivative_weight" : derivative_weight, "derivative_bias" : derivative_bias}
    return cost, gradients

We need to repeat the propagation for the chosen number of iterations, and after each pass we must update the weights and bias using the learning rate.

In [None]:
def update(w, b, x_train, y_train, learning_rate, epochs):
    """Train the weights and bias with batch gradient descent.

    Runs ``epochs`` forward/backward passes, applying one gradient step after
    each. The cost is printed every 10th iteration, and those sampled costs
    are plotted against the iteration number once training finishes.

    Parameters
    ----------
    w : numpy.ndarray
        Initial weight column vector.
    b : float
        Initial bias.
    x_train : numpy.ndarray
        Training inputs, one sample per column.
    y_train : numpy.ndarray
        Training labels.
    learning_rate : float
        Step size multiplied with each gradient.
    epochs : int
        Number of gradient-descent iterations.

    Returns
    -------
    param : dict
        Learned parameters under the keys ``"w"`` and ``"b"``.
    gradients : dict
        Gradients computed in the final iteration (before its update was
        applied); an empty dict if ``epochs`` is 0 or negative.
    costs : list
        Cost recorded at every iteration.
    """
    costs = []
    sampled_costs = []
    sampled_iterations = []
    # Defined up front so the return statement cannot hit an unbound name
    # when the loop body never runs (epochs <= 0).
    gradients = {}

    for iteration in range(epochs):

        cost, gradients = forward_backward_propagation(w, b, x_train, y_train)
        costs.append(cost)

        # Gradient-descent step.
        w = w - (learning_rate * gradients["derivative_weight"])
        b = b - (learning_rate * gradients["derivative_bias"])

        # Keep every 10th cost for the progress log and the plot.
        if iteration % 10 == 0:
            sampled_costs.append(cost)
            sampled_iterations.append(iteration)
            print ("Cost after iteration %i: %f" %(iteration, cost))

    param = {"w" : w, "b" : b}
    plt.plot(sampled_iterations, sampled_costs)
    plt.xticks(sampled_iterations, rotation = 90)
    plt.xlabel("Num of Iterations")
    plt.ylabel("Costs")
    plt.show()

    return param, gradients, costs


In [None]:
def predict(w, b, x_test):
    """Predict binary class labels for the given samples.

    Parameters
    ----------
    w : numpy.ndarray
        Weight column vector; ``w.T`` is multiplied with ``x_test``.
    b : float
        Bias term.
    x_test : numpy.ndarray
        Inputs with one sample per column.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(1, n_samples)`` holding 1.0 where the
        predicted probability is greater than 0.5 and 0.0 otherwise.
    """
    probabilities = sigmoid(np.dot(w.T, x_test) + b)
    # Threshold the probabilities directly. The earlier loop wrote its results
    # into the probability array (0 in both branches) and then returned an
    # untouched all-zero array, so every sample was predicted as class 0.
    y_pred = (probabilities > 0.5).astype(float)
    return y_pred


In [None]:
def logistic_regression(x_train, y_train, x_test, y_test, learning_rate, epochs):
    """Train a logistic regression model and print its train/test accuracy.

    The weights and bias are initialized with ``initialize_w_b``, trained with
    ``update``, and then used by ``predict`` on both splits. Accuracy is
    reported as ``100 - mean(|prediction - label|) * 100``.

    Parameters
    ----------
    x_train, x_test : numpy.ndarray
        Inputs with one sample per column; ``x_train.shape[0]`` sets the
        length of the weight vector.
    y_train, y_test : numpy.ndarray
        Labels for the corresponding split.
    learning_rate : float
        Step size forwarded to ``update``.
    epochs : int
        Number of training iterations forwarded to ``update``.

    Returns
    -------
    None
        The accuracies are printed, not returned.
    """
    start_w, start_b = initialize_w_b(x_train.shape[0])
    param, _, _ = update(start_w, start_b, x_train, y_train, learning_rate, epochs)
    trained_w, trained_b = param["w"], param["b"]

    def accuracy_percent(predictions, targets):
        # Percentage of positions where prediction and label agree.
        return 100 - np.mean(np.abs(predictions - targets)) * 100

    test_predictions = predict(trained_w, trained_b, x_test)
    train_predictions = predict(trained_w, trained_b, x_train)

    print("train accuracy: {} %".format(accuracy_percent(train_predictions, y_train)))
    print("test accuracy: {} %".format(accuracy_percent(test_predictions, y_test)))

In [None]:
# Train the from-scratch model with a learning rate of 0.01 for 300 iterations.
logistic_regression(
    x_train,
    y_train,
    x_test,
    y_test,
    learning_rate=0.01,
    epochs=300,
)

## Logistic Regression with sklearn

In [None]:
# scikit-learn logistic regression with a fixed seed and max_iter set to 150.
lr = linear_model.LogisticRegression(max_iter=150, random_state=42)

In [None]:
# The arrays were transposed after the split, so transpose x_train back to one
# sample per row for scikit-learn. The labels are flattened to 1-D because
# passing them as a column vector triggers a DataConversionWarning.
lr.fit(x_train.T, y_train.ravel())

In [None]:
# Score on the held-out test split (the previous call scored the training
# data) and give the format string a placeholder so the value is printed.
print("Test accuracy: {}".format(lr.score(x_test.T, y_test.ravel())))