In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.preprocessing import scale
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score

In [24]:
# Load the dataset and shuffle its rows. The fixed seed keeps the shuffle (and
# every result computed from it) the same across kernel restarts.
df = pd.read_csv('balanced_credit.csv').sample(frac=1, random_state=42)
# Features: every column except the last, standardised by sklearn's `scale`.
# NOTE(review): scaling is applied to the full dataset before any train/test
# split, so held-out rows contribute to the scaling statistics — consider
# fitting the scaler on training rows only.
X = scale(df.iloc[:, :-1].values)
# Target: the last column.
y = df.iloc[:, -1].values

In [25]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
def sigmoid(x):
    """
    Numerically stable logistic function 1 / (1 + exp(-x)).

    x may be a scalar or an array-like; it is converted to a float array.
    Returns a NumPy float scalar for scalar input, otherwise an ndarray of
    the same shape as x.

    The naive form overflows in exp(-x) for large negative x (emitting a
    RuntimeWarning). Here exp is only evaluated on non-positive values.
    """
    x = np.asarray(x, dtype=float)
    # e = exp(-|x|) lies in [0, 1], so it can never overflow.
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) — the same value.
    out = np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # np.where yields a 0-d array for scalar input; [()] unwraps it to a scalar
    # and leaves higher-dimensional arrays unchanged.
    return out[()]

def cross_entropy_gradient(X, y, theta):
    """
    Gradient of the cross-entropy loss with respect to theta.

    Computes (1/len(X)) * X.T @ (sigmoid(X @ theta) - y), i.e. the gradient
    averaged over the rows of X.
    """
    n_rows = len(X)
    # Difference between the predicted probabilities and the labels.
    residual = sigmoid(X.dot(theta)) - y
    return (1/n_rows)*np.matmul(X.T, residual)

def gradient_descent(X, y, theta, alpha, epochs):
    """
    Minimise the cross-entropy loss with plain gradient descent.

    X, y   : data and labels passed through to cross_entropy_gradient.
    theta  : initial parameter vector (not modified in place).
    alpha  : step size applied to each gradient.
    epochs : number of update steps; a tqdm progress bar tracks them.

    Returns the parameter vector after `epochs` updates. With epochs == 0
    the initial theta is returned unchanged.
    """
    for _ in tqdm(range(epochs)):
        # Rebinding (not in-place update) leaves the caller's array untouched.
        theta = theta - alpha*cross_entropy_gradient(X, y, theta)
    # Returning `theta` instead of a loop-only variable keeps the zero-epoch
    # case from raising UnboundLocalError.
    return theta

def predict(X, theta):
    """
    Predict 0/1 labels for the rows of X.

    A row is labelled 1 when sigmoid(X @ theta) is at least 0.5, else 0.
    """
    probabilities = sigmoid(X.dot(theta))
    is_positive = probabilities >= 0.5
    return is_positive.astype(int)

In [27]:
# Start from all-zero weights, one per feature column of the training matrix.
# Sizing from the data avoids a hard-coded feature count that would break
# if the dataset's columns change.
init_theta = np.zeros(X_train.shape[1])

In [28]:
# Fit the weights on the training split: step size 0.01, 10,000 update steps.
theta = gradient_descent(
    X_train,
    y_train,
    init_theta,
    alpha=.01,
    epochs=10000,
)

100%|██████████| 10000/10000 [00:05<00:00, 1949.17it/s]


In [29]:
# Hard 0/1 predictions for the test rows using the fitted weights.
y_pred = predict(X=X_test, theta=theta)

In [30]:
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6764595103578155

In [38]:
# Number of cross-validation folds
k = 5

# Splitter that partitions the rows of X into k folds
kf = KFold(n_splits=k)

# Per-fold scores, averaged after the loop
accuracy_scores = []
f1_scores = []

# The per-fold data, initial weights and predictions use fold-specific names
# so this loop does not overwrite the hold-out split (X_train, X_test,
# y_train, y_test), the fitted `theta`, or `y_pred` that other cells read.
for train_index, test_index in kf.split(X):
    # Training and validation rows for this fold
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]

    # Train the logistic regression model from zero weights using gradient descent
    theta_init = np.zeros(X_fold_train.shape[1])
    theta_final = gradient_descent(X_fold_train, y_fold_train, theta_init, alpha=0.01, epochs=10000)

    # Score this fold's held-out rows
    y_fold_pred = predict(X_fold_test, theta_final)
    accuracy = accuracy_score(y_fold_test, y_fold_pred)
    f1 = f1_score(y_fold_test, y_fold_pred)

    # Record the fold's scores
    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

print('Average cross-validated accuracy score:', np.mean(accuracy_scores))
print('Average cross-validated f1 score:', np.mean(f1_scores))

100%|██████████| 10000/10000 [00:04<00:00, 2119.54it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1918.72it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2127.27it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1890.69it/s]
100%|██████████| 10000/10000 [00:04<00:00, 2037.85it/s]

Average cross-validated accuracy score: 0.6705095531458041
Average cross-validated f1 score: 0.6590448802290313





In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
# Reference model: scikit-learn's LogisticRegression with default settings,
# fitted on whatever X_train / y_train are bound to when this cell runs.
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [34]:
# Labels predicted by the scikit-learn model for the current test rows.
y_pred = clf.predict(X=X_test)

In [35]:
# NOTE(review): `metrics` is not defined or imported in any cell visible here —
# confirm it is defined before this cell runs, otherwise a fresh kernel raises NameError.
metrics(y_test, y_pred)

{'accuracy': 0.6631,
 'sensitivity': 0.65,
 'specificity': 0.6758,
 'precision': 0.6571,
 'f1_score': 0.6535}