In [1]:
#imports
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw accepted-loans export into memory.
# low_memory=False has pandas parse the file in one pass instead of in chunks,
# which avoids mixed-type column inference.
raw_csv_path = "../data/accepted_2007_to_2018Q4.csv"
df = pd.read_csv(raw_csv_path, low_memory=False)

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


In [4]:
# Restrict the frame to the modelling inputs plus the loan outcome column.
use_cols = [
    "loan_amnt",
    "term",
    "int_rate",
    "installment",
    "annual_inc",
    "dti",
    "revol_util",
    "open_acc",
    "grade",
    "home_ownership",
    "purpose",
    "loan_status",
]

# .copy() detaches the selection from the wide raw frame.
df = df.loc[:, use_cols].copy()
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,annual_inc,dti,revol_util,open_acc,grade,home_ownership,purpose,loan_status
0,3600.0,36 months,13.99,123.03,55000.0,5.91,29.7,7.0,C,MORTGAGE,debt_consolidation,Fully Paid
1,24700.0,36 months,11.99,820.28,65000.0,16.06,19.2,22.0,C,MORTGAGE,small_business,Fully Paid
2,20000.0,60 months,10.78,432.66,63000.0,10.78,56.2,6.0,B,MORTGAGE,home_improvement,Fully Paid
3,35000.0,60 months,14.85,829.9,110000.0,17.06,11.6,13.0,C,MORTGAGE,debt_consolidation,Current
4,10400.0,60 months,22.45,289.91,104433.0,25.37,64.5,12.0,F,MORTGAGE,major_purchase,Fully Paid


In [5]:
# Keep only the two main classes (Fully Paid vs Charged Off) and create a binary
# target: 0 = Fully Paid, 1 = Charged Off.
# A single mapping drives both the row filter and the target values, so the two
# cannot drift apart.
status_to_target = {
    "Fully Paid": 0,
    "Charged Off": 1
}

# .copy() makes the filtered rows an independent frame; assigning a new column to
# a boolean-mask selection otherwise triggers pandas' SettingWithCopyWarning.
df = df.loc[df["loan_status"].isin(list(status_to_target))].copy()

# Every remaining status is a key of the mapping, so map() produces no NaN here.
df["target"] = df["loan_status"].map(status_to_target)

df[["loan_status", "target"]].head()

Unnamed: 0,loan_status,target
0,Fully Paid,0
1,Fully Paid,0
2,Fully Paid,0
4,Fully Paid,0
5,Fully Paid,0


In [6]:
# Integer-encode the categorical feature columns, keeping each fitted encoder in
# `encoder` (keyed by column name) so the codes can be mapped back to labels later.
cat_cols = ["grade", "home_ownership", "purpose"]

# Remove rows with a missing category *before* encoding. Casting with astype(str)
# would otherwise turn NaN into the literal label "nan", which LabelEncoder treats
# as a real class and which a later dropna() can no longer detect.
# .copy() gives the column assignments below an independent frame to write to.
df = df.dropna(subset=cat_cols).copy()

encoder = {}

for col in cat_cols:
    le = LabelEncoder()
    # Codes follow the sorted order of the string labels (e.g. grade "B" -> 1, "C" -> 2).
    df[col] = le.fit_transform(df[col].astype(str))
    encoder[col] = le

df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,annual_inc,dti,revol_util,open_acc,grade,home_ownership,purpose,loan_status,target
0,3600.0,36 months,13.99,123.03,55000.0,5.91,29.7,7.0,2,1,2,Fully Paid,0
1,24700.0,36 months,11.99,820.28,65000.0,16.06,19.2,22.0,2,1,11,Fully Paid,0
2,20000.0,60 months,10.78,432.66,63000.0,10.78,56.2,6.0,1,1,4,Fully Paid,0
4,10400.0,60 months,22.45,289.91,104433.0,25.37,64.5,12.0,5,1,6,Fully Paid,0
5,11950.0,36 months,13.44,405.18,34000.0,10.2,68.4,5.0,2,5,2,Fully Paid,0


In [10]:
# Drop incomplete rows first: the integer cast below cannot represent NaN, so a
# missing "term" has to be removed before the column is converted.
df = df.dropna().reset_index(drop=True)

# Convert "term" from strings like "36 months" to integer 36.
# - The dtype guard keeps the cell safe to re-run: once "term" is already integer
#   the .str accessor is unavailable, so the conversion is skipped.
# - The raw string avoids Python's invalid-escape warning for "\d".
# - expand=False makes extract return a Series rather than a one-column DataFrame.
if not pd.api.types.is_integer_dtype(df["term"]):
    df["term"] = df["term"].str.extract(r"(\d+)", expand=False).astype(int)

In [11]:
# Separate features from the label, then hold out 20% of the rows for testing.
# "loan_status" is removed together with "target" because the target is derived from it.
X = df.drop(columns=["loan_status", "target"])
y = df.loc[:, "target"]

# random_state pins the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class ratio must match exactly between train and test.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

(X_train.shape, X_test.shape)

((1075264, 11), (268816, 11))

In [12]:
# Write each split to its own CSV under ../data, omitting the index column.
outputs = [
    (X_train, "../data/X_train.csv"),
    (X_test, "../data/X_test.csv"),
    (y_train, "../data/y_train.csv"),
    (y_test, "../data/y_test.csv"),
]
for split_data, destination in outputs:
    split_data.to_csv(destination, index=False)