## Load Necessary Libraries

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

## Read Data

### Load Data

In [26]:
df = pd.read_stata('lendingclub_train.dta')
# df.head()
# df.describe()
columns_to_drop = [
    "index",
    "depvar",
    "total_acc", 
    "out_prncp", 
    "out_prncp_inv", 
    "total_pymnt", 
    "total_pymnt_inv", 
    "total_rec_prncp", 
    "total_rec_int", 
    "total_rec_late_fee", 
    "recoveries", 
    "collection_recovery_fee",
    "last_pymnt_amnt", 
    "last_fico_range_high", 
    "last_fico_range_low", 
    "tot_coll_amt", 
    "tot_cur_bal", 
    "initial_list_status1", 
    "initial_list_status2", 
    "elapsed_t",
    "purpose1",
    "addr_state1",
    "elapsed_t",
    "debt_settlement_flag1",
    "term1",
    "mths_since_last_delinq1",
    "mths_since_last_major_derog1",
    "mths_since_last_record1",
    "mths_since_rcnt_il1",
    "mths_since_recent_bc1",
    "mths_since_recent_bc_dlq1",
    "mths_since_recent_inq1",
    "mths_since_recent_revol_delinq1"
]

issue_d_count = 1
while(issue_d_count <= 118):
    word_tmp = "issue_d" + str(issue_d_count)
    columns_to_drop.append(word_tmp)
    issue_d_count += 1


### Undersampling
필요없으면 건너뛰어도 무방

In [27]:
# Separate majority and minority classes
df_majority = df[df['depvar'] == 0]
df_minority = df[df['depvar'] == 1]

# Count number of instances in the minority class
minority_count = len(df_minority)

# Undersample the majority class
df_majority_undersampled = df_majority.sample(n=minority_count, random_state=42)

# Combine minority class with the undersampled majority class
df_undersampled = pd.concat([df_majority_undersampled, df_minority])

# Shuffle the dataset
df_undersampled = df_undersampled.sample(frac=1, random_state=42)

# Now split into features and target, and then into training and testing sets
y = df_undersampled['depvar']
X_original = df_undersampled
X = df_undersampled.drop(columns=columns_to_drop)

### Get Fico hat

In [28]:
import joblib 

rf_fico_high = joblib.load('random_forest_fico_high.joblib')
rf_fico_low = joblib.load('random_forest_fico_low.joblib')

In [30]:
fico_high_threshold = 0.5
fico_low_threshold = 0.5
last_fico_high_hat= rf_fico_high.predict_proba(X)[:,1]
last_fico_high_onehot = pd.DataFrame((last_fico_high_hat >= fico_high_threshold).astype(int))
last_fico_high_onehot.columns= ['']
last_fico_high_onehot = last_fico_high_onehot.to_numpy()
# last_fico_onehot = pd.DataFrame(last_fico_high_hat[])
# last_fico_high_hat= rf_fico_high.predict(X_original)

last_fico_low_hat= rf_fico_low.predict_proba(X)[:,1]
last_fico_low_onehot = pd.DataFrame((last_fico_low_hat >= fico_low_threshold).astype(int))
last_fico_low_onehot.columns= ['']
last_fico_low_onehot = last_fico_low_onehot.to_numpy()

# X['last_fico_high_hat'] = last_fico_high_onehot
# X['last_fico_low_hat'] = last_fico_low_onehot


X['last_fico_high_hat'] = last_fico_high_hat
X['last_fico_low_hat'] = last_fico_low_hat

KeyboardInterrupt: 

In [6]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

### Drop Unnecessary Columns

In [16]:
# y = df['depvar']
# X = df.drop(columns=columns_to_drop)
# X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 874335 entries, 0 to 874334
Columns: 188 entries, loan_amnt to mths_since_recent_revol_delinq11
dtypes: float64(6), int16(5), int32(6), int8(171)
memory usage: 211.0 MB


### Hyperparameter Tuning

In [17]:
# Assuming X and y are your features and target variable
# X = your_data_features
# y = your_data_target

# Splitting the data into training, validation and testing sets
# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [5, 10]  ###############################<==이부분을 각자 하나씩 맡으면 됩니다!!
}

# Create a Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Evaluate on the validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("Validation Accuracy: ", val_accuracy)
print("Validation Classification Report: \n", val_report)


## Model Training

In [7]:
# Creating a Random Forest Classifier
# rf = RandomForestClassifier(n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], min_samples_split=best_params['min_samples_split'], random_state=42)
rf = RandomForestClassifier(n_estimators=300, max_depth=20, min_samples_split=10, random_state=42)

# Training the model
rf.fit(X_train, y_train)


## Evaluation

In [8]:
# Predicting the Test set results
y_pred = rf.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(accuracy)
print(report)


0.659932244062533
              precision    recall  f1-score   support

         0.0       0.66      0.66      0.66     28263
         1.0       0.66      0.66      0.66     28411

    accuracy                           0.66     56674
   macro avg       0.66      0.66      0.66     56674
weighted avg       0.66      0.66      0.66     56674



In [9]:
y_pred_proba = rf.predict_proba(X_test)
y_pred_proba

array([[0.31595816, 0.68404184],
       [0.37685502, 0.62314498],
       [0.37579307, 0.62420693],
       ...,
       [0.72587175, 0.27412825],
       [0.7929639 , 0.2070361 ],
       [0.65435306, 0.34564694]])

## Save Model

In [10]:
import joblib

# save
joblib.dump(rf, "random_forest_final.joblib")

['random_forest_final.joblib']

## Setting Threshold

In [11]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

In [12]:
y_val_pred = rf.predict_proba(X_val)
y_val_pred1 = y_val_pred[:,1]

In [13]:
auc_score = roc_auc_score(y_val,y_val_pred1)
auc_score

0.7145156396992403

In [22]:
#Column drop 안된 원래 애들 생성
X_original_train, X_original_temp, y_train, y_temp = train_test_split(X_original, y, test_size=0.4, random_state=42)
X_original_val, X_original_test, y_val, y_test = train_test_split(X_original_temp, y_temp, test_size=0.5, random_state=42)

In [23]:
# Gain domain(금액 모두 돌려받고 추가로 이자소득이 나옴)
# = TN*펀딩액*이자율 + TP*펀딩액*미국국채이자율
# Loss domain(이자소득을 따지기 이전에 원금이 안돌아옴)
# = FN * (펀딩금액 * (1+국채이자율) - 부도전까지갚은금액 - 부도전까지갚은이자) + FP*펀딩액*(랜딩클럽이자율 - 미국국채이자율)

thresholds = np.arange(0.0, 1.01, 0.01)
gov_bond_rate = 0.04


X_val_thres = X_original_val
X_val_thres['tn_gain'] = X_val_thres['funded_amnt'] * X_val_thres['int_rate'] * 36
X_val_thres['tp_gain'] = X_val_thres['funded_amnt'] * gov_bond_rate
X_val_thres['fn_loss'] = X_val_thres['funded_amnt'] * (1+gov_bond_rate) - X_val_thres['total_rec_prncp'] 
X_val_thres['fp_loss'] = X_val_thres['funded_amnt'] * (X_val_thres['int_rate'] - gov_bond_rate)
X_val_thres['rf_pred_proba'] = rf.predict_proba(X_val)[:,1]

metrics_df = pd.DataFrame(columns=['threshold', 'gain_or_loss'])

for threshold in thresholds:

    # X_val_thres['rf_pred_proba']
    X_val_thres['rf_pred_onehot'] = X_val_thres['rf_pred_proba'].apply(lambda row: 1 if row > threshold else 0)

    X_val_thres['predict_result'] = X_val_thres.apply(lambda row: 'tp' if ((row['depvar'] == row['rf_pred_onehot']) and (row['rf_pred_onehot'] == 1))
                                            else 'tn' if ((row['depvar'] == row['rf_pred_onehot']) and (row['rf_pred_onehot'] == 0))
                                            else 'fp' if (row['rf_pred_onehot'] == 1) 
                                            else 'fn', axis=1)

    X_val_thres['gain_or_loss'] = X_val_thres.apply(lambda row: row['tn_gain'] if (row['predict_result'] == 'tn')
                                                    else row['tp_gain'] if (row['predict_result'] == 'tp')
                                                    else row['fn_loss']*-1 if (row['predict_result'] == 'fn')
                                                    else row['fp_loss']*-1, axis=1)

    X_val_thres['gain_or_loss'].sum()

    metrics_df.loc[len(metrics_df.index)] = [threshold, X_val_thres['gain_or_loss'].sum()]


display(metrics_df)

  X_val_thres['tn_gain'] = X_val_thres['funded_amnt'] * X_val_thres['int_rate'] * 36
  X_val_thres['tp_gain'] = X_val_thres['funded_amnt'] * gov_bond_rate
  X_val_thres['fn_loss'] = X_val_thres['funded_amnt'] * (1+gov_bond_rate) - X_val_thres['total_rec_prncp']
  X_val_thres['fp_loss'] = X_val_thres['funded_amnt'] * (X_val_thres['int_rate'] - gov_bond_rate)
  X_val_thres['rf_pred_proba'] = rf.predict_proba(X_val)[:,1]
  X_val_thres['rf_pred_onehot'] = X_val_thres['rf_pred_proba'].apply(lambda row: 1 if row > threshold else 0)
  X_val_thres['predict_result'] = X_val_thres.apply(lambda row: 'tp' if ((row['depvar'] == row['rf_pred_onehot']) and (row['rf_pred_onehot'] == 1))
  X_val_thres['gain_or_loss'] = X_val_thres.apply(lambda row: row['tn_gain'] if (row['predict_result'] == 'tn')


Unnamed: 0,threshold,gain_or_loss
0,0.00,-1.616925e+07
1,0.01,-1.616925e+07
2,0.02,-1.616925e+07
3,0.03,-1.616925e+07
4,0.04,-1.616925e+07
...,...,...
96,0.96,1.447171e+09
97,0.97,1.447171e+09
98,0.98,1.447171e+09
99,0.99,1.447171e+09


In [24]:
optimal_threshold = metrics_df.threshold[metrics_df.gain_or_loss.idxmax()]
print(optimal_threshold)
print(metrics_df.gain_or_loss.max())

0.8300000000000001
1447196455.4699998
