In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('classification.csv')

In [3]:
df.iloc[0], df.iloc[0].shape

(age                     41
 ed          college degree
 employ                  17
 address                 12
 income                 176
 debtinc                9.3
 creddebt         11.359392
 othdebt           5.008608
 default                  1
 Name: 0, dtype: object,
 (9,))

In [4]:
"""
h_beta(X) = 1 / (1 + e**(-beta.dot(X)))    # Prediction

"""

'\nh_beta(X) = 1 / (1 + e**(-beta.dot(X)))    # Prediction\n\n'

In [5]:
import numpy as np


class LogisticRegression1:
    """
    Implement logistic regression with GD.
    """
    def __init__(self, lr=0.1, max_iter=3000, metric_change=0.000001):
        """
        :param lr: learning rate
        """
        self._lr = lr
        self._weights = None
        self.max_iter = max_iter
        self.metric_change = metric_change

        self._x = None
        self._y = None


    def loss_function(self):
        """
        TODO: calculate the loss function
        """
        if self._x is None or self._y is None:
            raise ValueError('All methods can be called after fit method is called.')
        _loss = np.sum((1 - self._y) * np.log(1 - self.sigmoid(self._x)) + self._y * np.log(self.sigmoid(self._x)))
        return _loss
        

    def gradient(self, x, y):
        """
        TODO: Calculate the gradient of the loss function.
        """
        grad = []
        for i in range(x.shape[1]):
            grad.append(np.sum(x[:,i]*(self.sigmoid(x) - y)))
        
        return np.array(grad)

    def sigmoid(self, x):
        """
        :param x: feature matrix.
        :returns: sigmoid vector of all features.
        TODO: Add ones to feature matrix and compute sigmoid.
        """
        if self._weights is None:
            raise ValueError('All methods can be called after fit method is called.')

        _sigmoid = 1 / (1 + np.exp(-x.dot(self._weights.T)))

        return _sigmoid
    
    def update_weights(self):
        grad = self.gradient(self._x, self._y)
        self._weights = self._weights - self._lr * grad
        self.coef_ = [self._weights[1:]]
        self.intercept_ = self._weights[0]
        return


    def fit(self, x, y):
        """
        TODO: normalize the data and fit the logistic regression.
        :param x: features matrix
        :param y: labels
        :returns: None if can't fit, weights, if fitted.
        """
        # TODO: initialize weights here
        l = x.shape[0]
        x = np.concatenate([np.ones(l).reshape((-1, 1)), x], axis=1)
        self._x = x
        self._y = y
        self._weights = np.zeros(self._x.shape[1])

        # TODO: SGD code here
        self.L = self.loss_function()
        
        for _ in range(self.max_iter):
            self.update_weights()
            temp_change = np.abs(self.L - self.loss_function())
            self.L = self.loss_function()

            if temp_change < self.metric_change:
                break

        return self._weights

    def predict(self, x, threshold=.5):
        """
        Predict which class is each data in x
        :param x: features matrix
        """
        l = x.shape[0]
        x = np.concatenate([np.ones(l).reshape((-1, 1)), x], axis=1)
        return np.where(self.predict_proba(x) >= threshold, 1, 0)

    def predict_proba(self, x):
        """
        Predict the probability, that x is of class 1.
        """
        return self.sigmoid(x)
    
    def perf_measure(self, y_pred, y):
        a = np.unique(y_pred + y, return_counts=True)
        b = np.unique(y_pred - y, return_counts=True)
        print(a)
        print(b)
        TP = a[1][2]
        TN = a[1][0]
        FP = b[1][2]
        FN = b[1][0]
        print(TP, TN, FP, FN)
        return (TP, FP, TN, FN)
    
    def calculate_metrics(self, tp, fp, tn, fn):
#         Following 4 metrics should be high so the model is good.
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        specificity = tn / (tn + fp)
        npv = tn / (tn + fn)
    
        accuracy = (tp + tn)/(tp + tn + fp + fn)
        f1_score = (2 * precision * recall) / (precision + recall)

        print("Precision:                ", precision)
        print("Recall:                   ", recall)
        print("Specificity:              ", specificity)
        print("Negative predictive value:", npv)
        print("Accuracy:                 ", accuracy)
        print("f1_score:                 ", f1_score)

        return (accuracy, f1_score)

    def score(self, x_test, y):
        """
        TODO: Compute the accuracy and f1 score here.
        """
        tp, fp, tn, fn = self.perf_measure(self.predict(x_test), y)
        
        return self.calculate_metrics(tp, fp, tn, fn)



In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

df = pd.read_csv('classification.csv')
x = df.iloc[:, 5:7].to_numpy()
y = df.iloc[:, -1].to_numpy()

scaler = MinMaxScaler()
x = scaler.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=0)

model = LogisticRegression1(lr=0.01, max_iter=5000)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# TODO: Calculate metrics here.
scores = model.score(x_test, y_test)


# TODO: Compare your results with sklearn.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(x_train, y_train)
clf.score(x_test, y_test)


(array([0, 1, 2], dtype=int64), array([97, 31, 12], dtype=int64))
(array([-1,  0,  1], dtype=int64), array([ 27, 109,   4], dtype=int64))
12 97 4 27
Precision:                 0.75
Recall:                    0.3076923076923077
Specificity:               0.9603960396039604
Negative predictive value: 0.782258064516129
Accuracy:                  0.7785714285714286
f1_score:                  0.4363636363636364


0.7642857142857142

In [7]:
model.coef_, model.intercept_

([array([4.87338254, 1.71892885])], -2.4941478302197417)