In [16]:
import pandas as pd
# Step 1 Clean the data
df = pd.read_csv("../loan_data.csv")

# See if there are rows with nulls that need to be removed
rows_with_nulls = df.isna().any(axis=1).sum()
print(f"Number of rows with null values: {rows_with_nulls}")

Number of rows with null values: 0


In [17]:
# Step 2 Some basic EDA
print(df.describe())

         person_age  person_income  person_emp_exp     loan_amnt  \
count  45000.000000   4.500000e+04    45000.000000  45000.000000   
mean      27.764178   8.031905e+04        5.410333   9583.157556   
std        6.045108   8.042250e+04        6.063532   6314.886691   
min       20.000000   8.000000e+03        0.000000    500.000000   
25%       24.000000   4.720400e+04        1.000000   5000.000000   
50%       26.000000   6.704800e+04        4.000000   8000.000000   
75%       30.000000   9.578925e+04        8.000000  12237.250000   
max      144.000000   7.200766e+06      125.000000  35000.000000   

       loan_int_rate  loan_percent_income  cb_person_cred_hist_length  \
count   45000.000000         45000.000000                45000.000000   
mean       11.006606             0.139725                    5.867489   
std         2.978808             0.087212                    3.879702   
min         5.420000             0.000000                    2.000000   
25%         8.590000  

In [18]:
# View df
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [19]:
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Step 3 Preprocess

# Convert yes no column to boolean
loans_bool = (np.where(df["previous_loan_defaults_on_file"] == "Yes", True, False)).reshape(-1, 1)

# Seperate out dependent variable
y = df["loan_status"]


# Numerical Variables
numerical = df[["person_age", "person_income", "person_emp_exp", "loan_amnt", "loan_int_rate", "loan_percent_income", "cb_person_cred_hist_length", "credit_score"]]

# Min Max scale numerical data
scaler = preprocessing.MinMaxScaler()
numerical_scaled = scaler.fit_transform(numerical)


# Categorical Variables
categorical = df[['person_gender', 'person_education', 'person_home_ownership', 'loan_intent']]
categorical_dummies = pd.get_dummies(categorical, drop_first=True)
categorical_dummies_array = np.array(categorical_dummies)

# Column names / order
columns = list(numerical.columns) + list(categorical_dummies.columns) + ["previous_loan_defaults_on_file"]

# Create Phi with numerical, categorical, and boolean variables
Phi = np.hstack((numerical_scaled, categorical_dummies_array, loans_bool))

print(f"Number of columns after preprocessing: {Phi.shape[1]}")

X_train, X_test, y_train, y_test = train_test_split(Phi, y, test_size=0.2, random_state=23)

Number of columns after preprocessing: 22


In [20]:
# KNN
import numpy as np
import matplotlib.pyplot as plt

def run_knn(X_train, y_train, X_test, y_test, k):
    predictions = []

    for test_point in X_test:
        # L2 distances
        distances = np.sqrt(np.sum((X_train - test_point) ** 2, axis=1))

        # K nearest neighbors
        knn_indices = np.argsort(distances)[:k]
        knn_labels = y_train[knn_indices]

        # Calculate mode
        unique_values, counts = np.unique(knn_labels, return_counts=True)
        mode_index = np.argmax(counts)
        predicted_class = unique_values[mode_index]

        predictions.append(predicted_class)

    # Prediction accuracy
    y_pred = np.array(predictions)
    accuracy = np.mean(y_pred == y_test)
    return accuracy, y_pred

In [21]:
# Params
n = X_train.shape[0]
sqrt_n = int(np.sqrt(n).round())  # 190

k_values = [5, 10, 25, 50, 75, 100, 125, 150, 175, sqrt_n, 200, 225, 250]   # Some test ks

# Convert all to arrays to avoid errors
X_train, y_train, X_test, y_test = np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)

# Run KNN for different k values in a for loop
accuracies = []
for k in k_values:
    accuracy, y_pred = run_knn(X_train, y_train, X_test, y_test, k)
    accuracies.append(accuracy)
    print(f"K = {k}, Accuracy: {accuracy:.4f}")

# Plot k vs accuracy
plt.plot(k_values, accuracies)
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Accuracy')
plt.title('KNN Accuracy vs Number of Neighbors')
plt.show()

K = 5, Accuracy: 0.8881
K = 10, Accuracy: 0.8914
K = 25, Accuracy: 0.8950
K = 50, Accuracy: 0.8909
K = 75, Accuracy: 0.8854
K = 100, Accuracy: 0.8742
K = 125, Accuracy: 0.8697


In [49]:
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

# Best k was 25
k = 25
accuracy, predictions = run_knn(X_train, y_train, X_test, y_test, k)
y_pred = np.array(predictions)
print(f"K = {k}, Accuracy: {accuracy:.4f}")

# Precision, recall, and F1 score
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


K = 25, Accuracy: 0.8950
Precision: 0.8333
Recall: 0.6608
F1 Score: 0.7371

Confusion Matrix:
[[6730  265]
 [ 680 1325]]

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.96      0.93      6995
           1       0.83      0.66      0.74      2005

    accuracy                           0.90      9000
   macro avg       0.87      0.81      0.84      9000
weighted avg       0.89      0.90      0.89      9000



The best k was clearly 25 (though this is also very dependent on the random state), and the results were decently strong with an accuracy of 90% correct predictions. There is clearly a strong class imbalance with more 0s than 1s, and the model is not handling that very well with significantly lower prediction accuracies for predicting a 1 than a 0. This could potentially be improved by imputing some dummy data points. 

In [56]:
from sklearn.ensemble import RandomForestClassifier
# Random Forest
# We can use the same train test as scaling should not matter for random forest

rf = RandomForestClassifier(max_depth=None, random_state=23, criterion='entropy')

# Train the model
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)

# Model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


# Feature importance
feature_importance = rf.feature_importances_
feature_names = [f"Feature {i}" for i in range(Phi.shape[1])]

# Make feature importance and display top 10
importance_df = pd.DataFrame({'Feature': columns, 'Importance': feature_importance})
importance_df = importance_df.sort_values('Importance', ascending=False)
print("\nFeature Importance:")
print(importance_df.head(10))



Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      6995
           1       0.90      0.75      0.82      2005

    accuracy                           0.93      9000
   macro avg       0.91      0.86      0.89      9000
weighted avg       0.92      0.93      0.92      9000


Confusion Matrix:
[[6818  177]
 [ 493 1512]]

Feature Importance:
                           Feature  Importance
21  previous_loan_defaults_on_file    0.284019
4                    loan_int_rate    0.140659
5              loan_percent_income    0.139325
1                    person_income    0.113094
3                        loan_amnt    0.059230
7                     credit_score    0.053026
15      person_home_ownership_RENT    0.047536
0                       person_age    0.031821
2                   person_emp_exp    0.028236
6       cb_person_cred_hist_length    0.026445


The model does better at predicting 0s (defaulting the loan) than 1s (completing the payment). This is likely due to a strong class imbalance with more 0s than 1s in the dataset. Despite the strong imbalance, the model results were relatively good with an accuracy of 93% correct predictions. The model is clearly better than the knn, and significantly outperforms it for predicting 1s.

The most important features that provided the best splits were the persons yearly income, the loan percent of the income, the loan interest rate, and by far if the person had previous loan defaults on file. These all make a lot of sense as they seem incredibly relevant to the prediction. All of these statistics are very strongly correlated, so I wonder if they are not all necessary in a simplified model.