In [392]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.preprocessing import scale

In [393]:
# Load the dataset; every column except the last is treated as a feature and
# the last column as the target label.
df = pd.read_csv('balanced_credit.csv')
# NOTE(review): scale() standardises with statistics computed over the whole
# dataset, and the train/test split happens only afterwards, so test-set
# statistics leak into the training features. Consider computing the
# mean/std on the training split only.
X = scale(df.iloc[:, :-1].values)
y = df.iloc[:, -1].values

In [394]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [395]:
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The exponential is only ever evaluated at ``-|x|``, so it cannot overflow.
    The naive form computes ``exp(-x)``, which overflows to ``inf`` (and emits
    a RuntimeWarning) for large-magnitude negative inputs.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Element-wise sigmoid: a scalar for scalar input, otherwise an array
        with the same shape as ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # never exceeds 1, so no overflow is possible
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically equal.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return result[()]  # unwrap 0-d arrays so scalar input yields a scalar

def cross_entropy_gradient(X, y, theta):
    """
    Gradient of the cross-entropy loss with respect to ``theta``.

    The prediction error ``sigmoid(X . theta) - y`` is projected back onto the
    columns of ``X`` and scaled by ``1 / len(X)``.
    """
    residual = sigmoid(X.dot(theta)) - y
    return (1/len(X)) * (X.T @ residual)

def gradient_descent(X, y, theta, alpha, epochs):
    """Run batch gradient descent on the cross-entropy loss.

    Parameters
    ----------
    X : feature matrix passed to ``cross_entropy_gradient``.
    y : target vector passed to ``cross_entropy_gradient``.
    theta : initial parameter vector. It is not modified in place; each step
        builds a new array.
    alpha : learning rate (step size).
    epochs : number of full-batch update steps.

    Returns
    -------
    The parameter vector after ``epochs`` updates. With ``epochs == 0`` the
    initial ``theta`` is returned unchanged (previously this raised
    UnboundLocalError because the result variable was only bound inside the
    loop).
    """
    for _ in tqdm(range(epochs)):
        theta = theta - alpha * cross_entropy_gradient(X, y, theta)
    return theta

def predict(X, theta):
    """Return hard 0/1 predictions: 1 where ``sigmoid(X . theta)`` is at least 0.5."""
    probabilities = sigmoid(X.dot(theta))
    is_positive = probabilities >= 0.5
    return is_positive.astype(int)

In [396]:
def confusion_matrix(y_true, y_pred, labels=None):
    """Generate a confusion matrix.

    Parameters
    ----------
    y_true : iterable of actual outcomes (hashable labels, e.g. 0, 1, 2, ...).
    y_pred : iterable of predicted outcomes, same length as ``y_true``.
    labels : optional iterable of labels fixing the row/column order (and the
        matrix size). Defaults to the sorted union of the labels that occur
        in ``y_true`` and ``y_pred``.

    Returns
    -------
    numpy.ndarray of ints with shape ``(n_labels, n_labels)``; entry
    ``[i, j]`` counts samples whose actual label is ``labels[i]`` and whose
    predicted label is ``labels[j]``.

    Raises
    ------
    ValueError : if ``y_true`` and ``y_pred`` differ in length.
    KeyError : if ``labels`` is given and a value outside it is encountered.
    """
    y_true = list(y_true)
    y_pred = list(y_pred)
    if len(y_true) != len(y_pred):
        raise ValueError(
            "y_true and y_pred must have the same length "
            f"(got {len(y_true)} and {len(y_pred)})"
        )

    if labels is None:
        labels = sorted(set(y_true) | set(y_pred))
    else:
        labels = list(labels)

    # Map each label to its row/column position. Indexing by the raw label
    # value only works when the labels happen to be exactly 0..n-1, and breaks
    # for e.g. {0, 2}, a single class {1}, or float-typed labels.
    position = {label: idx for idx, label in enumerate(labels)}

    matrix = np.zeros(shape=(len(labels), len(labels)), dtype=int)
    for actual, predicted in zip(y_true, y_pred):
        matrix[position[actual], position[predicted]] += 1

    return matrix


def metrics(y_true, y_pred, places=4):
    """Generate accuracy scores for a binary (0/1) classifier.

    Parameters
    ----------
    y_true : actual outcomes (0 or 1).
    y_pred : predicted outcomes (0 or 1).
    places : number of decimal places each score is rounded to.

    Returns
    -------
    dict with keys ``'accuracy'``, ``'sensitivity'``, ``'specificity'``,
    ``'precision'`` and ``'f1_score'`` (plain Python floats). A score whose
    denominator is zero is reported as 0.0 instead of NaN.

    Raises
    ------
    ValueError : if the confusion matrix is not 2x2, i.e. the labels seen are
        not exactly the two classes 0 and 1.
    """
    C = confusion_matrix(y_true, y_pred)
    if C.shape != (2, 2):
        raise ValueError(
            "metrics() needs both binary classes 0 and 1 to be present; "
            f"got a confusion matrix of shape {C.shape}"
        )

    # Plain Python ints: avoids numpy divide-by-zero warnings / NaN results.
    TN, FP, FN, TP = (int(count) for count in C.ravel())

    def _ratio(numerator, denominator):
        # Undefined ratios (empty denominator) are reported as 0.0.
        return round(numerator / denominator, places) if denominator else 0.0

    scores = {}
    scores['accuracy'] = _ratio(TP + TN, TP + TN + FP + FN)
    scores['sensitivity'] = _ratio(TP, TP + FN)
    scores['specificity'] = _ratio(TN, TN + FP)
    scores['precision'] = _ratio(TP, TP + FP)
    # F1 straight from the counts (2TP / (2TP + FP + FN)) so that it is not
    # derived from already-rounded precision and sensitivity.
    scores['f1_score'] = _ratio(2 * TP, 2 * TP + FP + FN)

    return scores

In [397]:
# One coefficient per feature column, starting from zero. Sized from the data
# rather than a hard-coded feature count so it follows the dataset.
init_theta = np.zeros(X_train.shape[1])

In [398]:
# Fit the hand-written logistic regression with full-batch gradient descent.
theta = gradient_descent(X_train, y_train, init_theta, alpha=0.01, epochs=10000)

100%|██████████| 10000/10000 [00:04<00:00, 2498.24it/s]


In [399]:
# Hard 0/1 predictions of the hand-written model on the held-out split.
y_pred = predict(X_test, theta)

In [400]:
# Score the hand-written model's test-set predictions.
metrics(y_test, y_pred)

{'accuracy': 0.6637,
 'sensitivity': 0.6213,
 'specificity': 0.7062,
 'precision': 0.6801,
 'f1_score': 0.6494}

In [401]:
# Inspect the learned coefficient vector.
theta

array([-0.09152163, -0.04912723, -0.05870996, -0.09922221,  0.05018414,
        0.64654538,  0.12847906,  0.10883705,  0.02578754,  0.05540393,
       -0.04391429, -0.21568092,  0.03407758,  0.03169506,  0.03597359,
        0.00587533, -0.00438431, -0.14955998, -0.159734  , -0.0964345 ,
       -0.05919302, -0.02773972, -0.03291276])

In [402]:
from sklearn.linear_model import LogisticRegression

In [404]:
# Reference model: scikit-learn's LogisticRegression with default settings,
# fitted on the same training split as the hand-written model.
# NOTE(review): the library defaults presumably include an intercept and L2
# regularisation, neither of which the hand-written model has -- confirm the
# comparison is like-for-like.
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [406]:
# NOTE: this rebinds y_pred, replacing the hand-written model's predictions
# with the scikit-learn model's predictions.
y_pred = clf.predict(X_test)

In [407]:
# Score the scikit-learn model (y_pred now holds clf's predictions).
metrics(y_test, y_pred)

{'accuracy': 0.6603,
 'sensitivity': 0.6206,
 'specificity': 0.7002,
 'precision': 0.6754,
 'f1_score': 0.6468}