# Implementation of Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Logistic Regression on binary data

Recall that logistic regression has the form: **1 / (1 + e^z) where z = (b) + (w1)(x1) + ... (wn)(xn)**

In [88]:
class Logistic_Regression():
    
    def __init__(self):
        self.bias = 0
        self.theta = None
        self.lr = 0.001
        self.epochs = 1000
        self.decision_threshold = 0.5
        
    def split_train_test(self):
        y = self.df.as_matrix(columns=[self.target_feature])
        X = self.df.drop(self.target_feature, axis=1).as_matrix()
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=self.test_size, random_state=42)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        
    def cost_function(self):
        preds = self.predict(self.X_train)
        error = (self.y_train * np.log(preds)) + ((1 - self.y_train) * np.log(1 - preds))
        return -error.sum() / len(self.X_train)
        
    def sigmoid(self, z):
        return (1 / (1 + np.exp(-z)))
    
    def predict(self, X):
        return self.sigmoid(np.dot(X, self.theta))
        
    def gradient_descent(self):
        preds = self.predict(self.X_train)
        error = preds - self.y_train
        gradient = np.dot(np.transpose(self.X_train), error)
        gradient /= len(self.X_train)
        gradient *= self.lr
        self.theta -= gradient
        
    def decision_threshold(self, y):
        return 1 if y >= self.decision_threshold else 0
    
    def classify(self, X):
        raise NotImplementedError
        
    def train(self, df, target_feature, decision_threshold=0.5, test_size=0.2, learning_rate=0.01, epochs=100):
        self.df = df
        self.decision_threshold = decision_threshold
        self.test_size = test_size
        self.target_feature = target_feature
        self.lr = learning_rate
        self.epochs = epochs
        self.theta = np.zeros((len(df.columns) - 1, 1), dtype=np.float32)
        self.split_train_test()
        for i in range(0, self.epochs):
            self.gradient_descent()
            print("Loss:", self.cost_function())

In [89]:
def run_logistic_regression():
    df = pd.read_csv("wine.csv")
    df = df.dropna() 
    
    # keep only wines of quality 5 or 6 since they are mostly balanced
    # and to show how logistic regression works for binary classification
    print(df['quality'].value_counts(), "\n")
    df = df[df['quality'].isin(['5', '6'])]
    
    # map 5 -> 0 and 6 -> 1 for readability
    df.loc[df['quality'] == 5, 'quality'] = 0
    df.loc[df['quality'] == 6, 'quality'] = 1
    df = df[['pH', 'alcohol', 'quality']]
    print(df.head(), "\n")
    
    logistic_regression = Logistic_Regression()
    logistic_regression.train(df, 'quality')
    
run_logistic_regression()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64 

     pH  alcohol  quality
0  3.51      9.4        0
1  3.20      9.8        0
2  3.26      9.8        0
3  3.16      9.8        1
4  3.51      9.4        0 

Loss: 0.6929991547310735
Loss: 0.6929111537444642
Loss: 0.6928532619561972
Loss: 0.6928104770444732
Loss: 0.6927752754033148
Loss: 0.6927438836653939
Loss: 0.6927144091840419
Loss: 0.6926859025858222
Loss: 0.6926578876033251
Loss: 0.6926301253122141
Loss: 0.6926024958408306
Loss: 0.6925749390597307
Loss: 0.6925474247935092
Loss: 0.6925199378707199
Loss: 0.6924924707290985
Loss: 0.6924650195405884
Loss: 0.6924375823931092
Loss: 0.6924101583126226
Loss: 0.6923827468121357
Loss: 0.6923553476556973
Loss: 0.6923279607052336
Loss: 0.6923005859072454
Loss: 0.692273223203863
Loss: 0.6922458726078888
Loss: 0.692218534052508
Loss: 0.6921912075700503
Loss: 0.692163893175009
Loss: 0.6921365908159994
Loss: 0.6921093004970315
Loss: 0.6920820222601552
Loss: 0.692