In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from autosklearn.classification import AutoSklearnClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from janitor import clean_names
#from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import OneHotEncoder
#from sklearn.compose import ColumnTransformer

In [None]:
# Read both splits and tag every row with its origin, so the two frames can be
# stacked for shared preprocessing and separated again afterwards.
train = pd.read_csv('../data/train.csv').assign(label='train')
test = pd.read_csv('../data/test.csv').assign(label='test')

# Stack train on top of test with a fresh 0..n-1 index.
df = pd.concat([train, test], ignore_index=True)

## Feature Preprocessing

In [None]:
# Feature preprocessing on the combined train+test frame, so both splits end up
# with identical columns. Every step assigns its result back to `df`: chained
# `df[col].method(..., inplace=True)` calls act on a temporary object and are
# not guaranteed to write back to the frame.

# Drop identifier features
df = df.drop(columns=['Loan_ID'])

# Correct mistyped features: remove ',' characters so the column parses as numeric
df['Loan_Amount_Requested'] = pd.to_numeric(
    df['Loan_Amount_Requested'].replace({',': ''}, regex=True)
)

# 'Months_Since_Deliquency' contains too many missing values, possibly because
# the borrower has no delinquency history. The column is dropped here; a
# possible alternative is to derive a binary flag before dropping it:
#   df['Has_Deliquency'] = df['Months_Since_Deliquency'].notna().astype(int)
# (comparing with `== np.nan` would not work: NaN never compares equal)
df = df.drop(columns=['Months_Since_Deliquency'])

# For categorical features, missing values are replaced with "MISSING".
# The remaining categorical features with missing values are 'Length_Employed'
# and 'Home_Owner'. The result must be assigned back: fillna(inplace=True) on a
# `.loc[...]` selection only fills a temporary copy and leaves `df` unchanged.
categorical_with_nans = ['Length_Employed', 'Home_Owner']
df[categorical_with_nans] = df[categorical_with_nans].fillna("MISSING")

# Rename the '< 1 year' level
# (presumably to keep '<' out of the generated dummy column names - TODO confirm)
df['Length_Employed'] = df['Length_Employed'].replace({'< 1 year': 'less'})

# Replace missing values in Annual Income with the median
# NOTE(review): the median is computed over train and test rows together.
df['Annual_Income'] = df['Annual_Income'].fillna(df['Annual_Income'].median())

# Disabled experiments (kept as a note only): z-score outlier removal,
# standard scaling of numeric features, and concatenated categorical crosses
# (Length_Employed + Home_Owner, Home_Owner + Purpose_Of_Loan,
# Income_Verified + Purpose_Of_Loan).

# New manual features
df['Monthly_Income'] = df['Annual_Income'] / 12

df['Accounts_Ratio'] = df['Number_Open_Accounts'] / df['Total_Accounts']

df['Loan_Income_Ratio'] = df['Loan_Amount_Requested'] / df['Annual_Income']

# Open accounts per unit of requested loan (inverse of "loan per open account")
df['Inv_Loan_per_Active_Account'] = df['Number_Open_Accounts'] / df['Loan_Amount_Requested']

df['Loan_per_Total_Account'] = df['Loan_Amount_Requested'] / df['Total_Accounts']

# Reference: Namvar, Anahita. Credit Risk Prediction in an Imbalanced Social Lending Environment
df['NMRA'] = df['Debt_To_Income'] * df['Monthly_Income']

# Reference: Namvar, Anahita. Credit Risk Prediction in an Imbalanced Social Lending Environment
# NOTE(review): NMRA / Monthly_Income cancels back to Debt_To_Income, so this
# column duplicates 'Debt_To_Income' up to floating-point error wherever
# Monthly_Income is non-zero.
df['NDTI'] = df['NMRA'] / df['Monthly_Income']

# One-hot encoding for categorical features
df = pd.get_dummies(
    df,
    columns=[
        'Length_Employed', 'Home_Owner',
        'Income_Verified', 'Purpose_Of_Loan', 'Gender',
    ],
)

# Clean column names
df = clean_names(df, remove_special=True)

# Reset index
df = df.reset_index(drop=True)

## Final State of Data

In [None]:
# (rows, columns) of the combined train+test frame
df.shape

In [None]:
# Column labels of the combined frame
df.columns

In [None]:
# Absolute correlation of every feature with the target, weakest first.
# `df` still carries the string 'label' column at this point, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so the
# frame is restricted to numeric and boolean columns first.
df.select_dtypes(include=['number', 'bool']).corr()['interest_rate'].abs().sort_values()

In [None]:
# Print a per-column summary of the combined frame
df.info()

## Data Split

In [None]:
# Undo the earlier concatenation using the 'label' marker column.
is_train_row = df['label'] == 'train'
is_test_row = df['label'] == 'test'

# Training rows keep the target; only the marker column is removed.
train = (
    df.loc[is_train_row]
    .drop(columns=['label'])
    .reset_index(drop=True)
)

# Test rows lose both the marker and the target column.
test = (
    df.loc[is_test_row]
    .drop(columns=['interest_rate', 'label'])
    .reset_index(drop=True)
)

In [None]:
# Print a per-column summary of the training frame
train.info()

In [None]:
# Print a per-column summary of the test frame
test.info()

## Export Dataset

In [None]:
# Write the processed frames to CSV without the index column.
# Comment this cell out to skip the export.
for frame, csv_path in (
    (train, '../data/train_ready.csv'),
    (test, '../data/test_ready.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Preview the first rows of the training frame
train.head()

## Train / Validation Split

In [None]:
# Separate the features from the target.
X = train.drop(columns=['interest_rate'])
y = train['interest_rate']

# Hold out part of the labelled data for evaluation.
# random_state pins the shuffle so the reported scores are reproducible across
# runs; stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

# Optional resampling of the training partition only (left disabled):
# resampler = RandomUnderSampler()
# X_train, y_train = resampler.fit_resample(X_train, y_train)

## Model Comparison

In [None]:
# One seed shared by all estimators so repeated runs give comparable scores.
SEED = 42

# Candidate classifiers, all with default hyper-parameters apart from the seed.
classifier_models = {
    'sgdc': SGDClassifier(random_state=SEED),
    'ridge': RidgeClassifier(random_state=SEED),
    'rf': RandomForestClassifier(random_state=SEED),
    'autoskl': AutoSklearnClassifier(memory_limit=8*1024, seed=SEED),
    'gbc': GradientBoostingClassifier(random_state=SEED),
    'hgbc': HistGradientBoostingClassifier(random_state=SEED),
    'xgboost': XGBClassifier(random_state=SEED),
    'lgbm': LGBMClassifier(random_state=SEED),
    'catboost': CatBoostClassifier(silent=True, random_seed=SEED),
}

# Fit each model on the training partition and report per-class metrics on the
# held-out partition.
for name, model in classifier_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(name)
    print(classification_report(y_test, y_pred, digits=4))