### Import libraries

In [6]:
from sklearn.utils import resample
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed as `random_state` to the resampling, splitting and model-fitting
# calls in the cells below.
seed = 1234
np.random.seed(seed)

### Process data

In [21]:
# Load the numerically encoded dataset from disk
raw = pd.read_csv('data_as_numerical.csv')

# Columns retained for modelling: the candidate predictors followed by the
# target column ('default'); every other column in the file is dropped.
keep_cols = [
    'sex', 'has_car', 'has_property', 'num_children', 'annual_income',
    'income_type', 'education_type', 'marital_status', 'housing_type',
    'age', 'days_employed', 'has_work_phone', 'has_phone', 'has_email',
    'occupation_type', 'num_fam_members', 'default',
]
data = raw[keep_cols]

# Target vector and predictor matrix (everything except the target)
y = data["default"]
X = data.drop(columns='default')

In [22]:
# Feature selection using the ANOVA F-test as scoring function (best 5)
selector = SelectKBest(f_classif, k=5)
# Fit only, without overwriting X: the transformed array is not needed by the
# later cells, and keeping X as a DataFrame makes this cell safe to re-run
# (refitting on a bare ndarray would report generic names such as x0..x4).
# NOTE(review): the selector is fit on every row, including rows that end up
# in the test split later on; consider fitting it on the training rows only.
selector.fit(X, y)
selector.get_feature_names_out()

array(['sex', 'has_property', 'marital_status', 'days_employed',
       'occupation_type'], dtype=object)

In [23]:
# Oversampling
data = data[['sex', 'has_property', 'marital_status', 'days_employed',
       'occupation_type', 'default']]

df_majority = data[data['default']==0]
df_minority = data[data['default']==1]
df_minority_unsampled = resample(df_minority,
                                 replace = True, # sample with replacement
                                 n_samples = df_majority.shape[0], # match majority class
                                 random_state = seed) #reproducable results

df_unsampled = pd.concat([df_minority_unsampled, df_majority])
y = df_unsampled['default']
X = df_unsampled.loc[:, df_unsampled.columns != 'default']

In [24]:
# Split the data
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.20, random_state=seed, stratify=y)

# Fit the logistic regression model
model1 = LogisticRegression(random_state=seed)
model1.fit(X_tr, y_tr)
Y_predict = model1.predict(X_te)

print('The Accuracy Score for Logistic Regression is {:.5}'.format(accuracy_score(y_te, Y_predict)))
print('The F1 Score for Logistic Regression is {:.5}'.format(f1_score(y_te, Y_predict)))

The Accuracy Score for Logistic Regression is 0.56648
The F1 Score for Logistic Regression is 0.54242
