In [1]:
# Librarires
#!pip install catboost
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score


In [2]:
# Loading datasets
train_uncleaned = pd.read_csv("Data/train.csv")
test = pd.read_csv("Data/test.csv")

In [3]:
# A look into the dataset
train_uncleaned.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [4]:
# Searching for any null values within the training dataset
train_uncleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    593994 non-null  int64  
 1   annual_income         593994 non-null  float64
 2   debt_to_income_ratio  593994 non-null  float64
 3   credit_score          593994 non-null  int64  
 4   loan_amount           593994 non-null  float64
 5   interest_rate         593994 non-null  float64
 6   gender                593994 non-null  object 
 7   marital_status        593994 non-null  object 
 8   education_level       593994 non-null  object 
 9   employment_status     593994 non-null  object 
 10  loan_purpose          593994 non-null  object 
 11  grade_subgrade        593994 non-null  object 
 12  loan_paid_back        593994 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 58.9+ MB


In [5]:
# Finding the number of values for each catagorical variable
cat_cols = ["gender","marital_status","education_level","employment_status","loan_purpose","grade_subgrade"]
for col in cat_cols:
    print(col, train_uncleaned[col].unique())

gender ['Female' 'Male' 'Other']
marital_status ['Single' 'Married' 'Divorced' 'Widowed']
education_level ['High School' "Master's" "Bachelor's" 'PhD' 'Other']
employment_status ['Self-employed' 'Employed' 'Unemployed' 'Retired' 'Student']
loan_purpose ['Other' 'Debt consolidation' 'Home' 'Education' 'Vacation' 'Car'
 'Medical' 'Business']
grade_subgrade ['C3' 'D3' 'C5' 'F1' 'D1' 'D5' 'C2' 'C1' 'F5' 'D4' 'C4' 'D2' 'E5' 'B1'
 'B2' 'F4' 'A4' 'E1' 'F2' 'B4' 'E4' 'B3' 'E3' 'B5' 'E2' 'F3' 'A5' 'A3'
 'A1' 'A2']


In [6]:
# Transforming categorical values into numercials for easy categorisation and predictions
for col in cat_cols:
    train_uncleaned[col] = train_uncleaned[col].astype("category").cat.codes

In [7]:
# dropping ID colomn
train_cleaning = train_uncleaned.drop("id",axis=1)
train_cleaning

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,29367.99,0.084,736,2528.42,13.67,0,2,1,2,6,12,1.0
1,22108.02,0.166,636,4593.10,12.92,1,1,2,0,2,17,0.0
2,49566.20,0.097,694,17005.15,9.76,1,2,1,0,2,14,1.0
3,46858.25,0.065,533,4682.48,16.10,0,2,1,0,2,25,1.0
4,25496.70,0.053,665,12184.43,10.21,1,1,1,0,6,15,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
593989,23004.26,0.152,703,20958.37,10.92,0,2,1,0,0,12,1.0
593990,35289.43,0.105,559,3257.24,14.62,1,2,0,0,2,29,1.0
593991,47112.64,0.072,675,929.27,14.13,0,1,0,0,2,10,1.0
593992,76748.44,0.067,740,16290.40,9.87,1,2,0,0,2,6,1.0


In [8]:
# Now let us split the training data into training and validation sets
X = train_cleaning.drop("loan_paid_back",axis=1)
y = train_cleaning["loan_paid_back"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [9]:
# Outlier removal using IsolationForest
iso = IsolationForest(contamination=0.1, random_state=42)
y_pred = iso.fit_predict(X_train)

mask = y_pred == 1
X_train_clean = X_train[mask]
y_train_clean = y_train[mask]

print("Original:", X_train.shape)
print("After IsolationForest:", X_train_clean.shape)

Original: (475195, 11)
After IsolationForest: (427675, 11)


In [None]:
# Normalizing all the values within the data frame using MinMaxScaler()
cat_cols = ["gender","marital_status","education_level",
            "employment_status","loan_purpose",
            "grade_subgrade","loan_paid_back"]
num_cols = train_cleaning.columns.difference(cat_cols)

scaler = MinMaxScaler()

X_train_scaled = X_train_clean.copy()
X_val_scaled = X_val.copy()

X_train_scaled[num_cols] = pd.DataFrame(
    scaler.fit_transform(X_train_clean[num_cols]),
    columns=num_cols,
    index=X_train_clean.index
)

# Transform validation using the same scaler

X_val_scaled[num_cols] = pd.DataFrame(
    scaler.transform(X_val[num_cols]),
    columns=num_cols,
    index=X_val.index
)


In [None]:
# Creating training pools and validation pools 
train_pool = Pool(X_train_scaled, y_train_clean)
val_pool = Pool(X_val_scaled, y_val)

# Creating CatBoostClassifier model

model = CatBoostClassifier(
    iterations=3500,

    learning_rate=0.02,
    depth=8,
    l2_leaf_reg=3,
    max_leaves= 27,
    grow_policy="Lossguide",

    loss_function="Logloss",
    eval_metric="AUC",

    bootstrap_type="Bernoulli",   
    subsample=0.8,               

    random_seed=43,
    verbose=100,
    use_best_model=True,
    early_stopping_rounds=200
)

model.fit(train_pool, eval_set=val_pool)

# Predict AUC on validation

val_pred = model.predict_proba(X_val_scaled)[:, 1]
auc = roc_auc_score(y_val, val_pred)
print("Validation AUC:", auc)


0:	test: 0.8551592	best: 0.8551592 (0)	total: 173ms	remaining: 10m 4s
100:	test: 0.9113044	best: 0.9113044 (100)	total: 17.4s	remaining: 9m 47s
200:	test: 0.9149435	best: 0.9149435 (200)	total: 36.3s	remaining: 9m 56s
300:	test: 0.9161811	best: 0.9161811 (300)	total: 57.7s	remaining: 10m 12s
400:	test: 0.9170325	best: 0.9170325 (400)	total: 1m 17s	remaining: 10m 1s
500:	test: 0.9176024	best: 0.9176032 (499)	total: 1m 39s	remaining: 9m 57s
600:	test: 0.9180517	best: 0.9180517 (600)	total: 2m 1s	remaining: 9m 44s
700:	test: 0.9187524	best: 0.9187524 (700)	total: 2m 23s	remaining: 9m 32s
800:	test: 0.9193771	best: 0.9193771 (800)	total: 2m 46s	remaining: 9m 20s
900:	test: 0.9198376	best: 0.9198376 (900)	total: 3m 8s	remaining: 9m 3s
1000:	test: 0.9202370	best: 0.9202370 (1000)	total: 3m 30s	remaining: 8m 46s
1100:	test: 0.9205062	best: 0.9205062 (1100)	total: 3m 52s	remaining: 8m 27s
1200:	test: 0.9207914	best: 0.9207914 (1200)	total: 4m 14s	remaining: 8m 7s
1300:	test: 0.9210074	best: 0.

In [12]:
# Normalizing test data set
cat_cols = ["gender","marital_status","education_level","employment_status","loan_purpose","grade_subgrade"]
for col in cat_cols:
    test[col] = test[col].astype("category").cat.codes
test_normal = test.drop("id",axis=1)
num_cols = test_normal.columns.difference(cat_cols)
scaler = MinMaxScaler()
test_normal[num_cols] = pd.DataFrame( scaler.fit_transform(test_normal[num_cols]),
                            columns = num_cols,
                            index = test_normal.index)

test_normal

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,0.060776,0.061688,0.508811,0.226198,0.637369,0,2,1,0,6,19
1,0.108409,0.133117,0.742291,0.309378,0.533444,0,1,2,0,6,10
2,0.130640,0.577922,0.475771,0.068023,0.557767,1,2,0,0,2,15
3,0.052404,0.160714,0.607930,0.125348,0.352128,0,2,0,0,2,12
4,0.051136,0.113636,0.645374,0.354872,0.530680,0,1,4,0,0,10
...,...,...,...,...,...,...,...,...,...,...,...
254564,0.231752,0.092532,0.768722,0.602650,0.568270,0,2,0,0,2,6
254565,0.114335,0.129870,0.526432,0.408267,0.352681,0,1,1,0,2,18
254566,0.039122,0.137987,0.711454,0.534212,0.320619,1,2,2,0,2,13
254567,0.074987,0.134740,0.757709,0.218892,0.365395,1,2,0,0,0,11


In [None]:
# Creation of submission dataframe and fitting test data into model for prediction
id_col = 'id'
submission = test[[id_col]].copy()
test_probs = model.predict_proba(test_normal)[:, 1]
submission["loan_paid_back_prob"] = test_probs

submission.to_csv("Data/loan_paid_back_predictions.csv", index=False)

# viewing first rows
print(submission.head())

       id  loan_paid_back_prob
0  593994             0.980349
1  593995             0.983704
2  593996             0.471399
3  593997             0.936369
4  593998             0.986051
