In [6]:
#imports
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report

In [7]:
# Load the dataset.
# Only the columns listed below are needed, so they are passed to `usecols`
# and the parser never materialises the other columns of the CSV.
use_cols = ["loan_amnt", "term", "int_rate", "installment", "annual_inc", "dti",
            "revol_util", "open_acc", "grade", "home_ownership", "purpose", "loan_status"]

df = pd.read_csv(
    "../data/accepted_2007_to_2018Q4.csv",
    usecols=use_cols,
    low_memory=False,
)

# `usecols` keeps the file's own column order; indexing with the list restores
# the order declared in `use_cols` and yields an independent frame.
df = df[use_cols].copy()

In [8]:
# Keep only the two main classes (Fully Paid vs Charged Off) and create a
# binary target: 0 = Fully Paid, 1 = Charged Off.
# The explicit .copy() makes the filtered frame independent of the frame it was
# selected from, so adding the "target" column below does not trigger pandas'
# SettingWithCopyWarning.
df = df.loc[df["loan_status"].isin(["Fully Paid", "Charged Off"])].copy()
df["target"] = df["loan_status"].map({
    "Fully Paid": 0,
    "Charged Off": 1
})

In [9]:
# Clean the selected columns: drop the raw label, drop rows with missing
# values, integer-encode the categorical columns, convert "term" to an integer,
# and save the result.
cat_cols = ["grade", "home_ownership", "purpose"]

# The raw status string is no longer needed once the "target" column exists.
df = df.drop(columns=["loan_status"])

# Drop rows with missing values BEFORE encoding/parsing:
#  - `astype(str)` below turns NaN into the literal string "nan", which
#    LabelEncoder would treat as a real category, so those rows would never
#    be removed by a later dropna();
#  - `.astype(int)` on "term" raises if any value is missing.
df = df.dropna().reset_index(drop=True)

# Fitted encoders are kept per column so the integer codes can be decoded later.
encoder = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoder[col] = le

# Convert "term" from strings like "36 months" to integer 36.
# Raw string avoids the invalid "\d" escape warning; expand=False returns a
# Series (not a one-column DataFrame) for the single capture group.
df["term"] = df["term"].str.extract(r"(\d+)", expand=False).astype(int)

# Save the clean data.
df.to_csv("../data/clean_data.csv", index=False)