## Load Necessary Libraries

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

## Read Data

### Load Data

In [48]:
df = pd.read_stata('lendingclub_train.dta')
# df.head()
# df.describe()
columns_to_drop = [
    "index",
    "depvar",
    "total_acc", 
    "out_prncp", 
    "out_prncp_inv", 
    "total_pymnt", 
    "total_pymnt_inv", 
    "total_rec_prncp", 
    "total_rec_int", 
    "total_rec_late_fee", 
    "recoveries", 
    "collection_recovery_fee",
    "last_pymnt_amnt", 
    "last_fico_range_high", 
    "last_fico_range_low", 
    "tot_coll_amt", 
    "tot_cur_bal", 
    "initial_list_status1", 
    "initial_list_status2", 
    "elapsed_t",
    "purpose1",
    "addr_state1",
    "elapsed_t",
    "debt_settlement_flag1",
    "term1",
    "mths_since_last_delinq1",
    "mths_since_last_major_derog1",
    "mths_since_last_record1",
    "mths_since_rcnt_il1",
    "mths_since_recent_bc1",
    "mths_since_recent_bc_dlq1",
    "mths_since_recent_inq1",
    "mths_since_recent_revol_delinq1",
    'last_fico_range_high_dummy',
    'last_fico_range_low_dummy'
]

issue_d_count = 1
while(issue_d_count <= 118):
    word_tmp = "issue_d" + str(issue_d_count)
    columns_to_drop.append(word_tmp)
    issue_d_count += 1

df['last_fico_range_high']


0         584
1         724
2         684
3         639
4         709
         ... 
874330    699
874331    664
874332    804
874333    624
874334    634
Name: last_fico_range_high, Length: 874335, dtype: int16

### Fico Dummy

In [49]:
df['last_fico_range_high_dummy'] = df.apply(lambda x: 0 if (x['last_fico_range_high'] > x['fico_range_high']) else 1, axis=1)
df['last_fico_range_low_dummy'] = df.apply(lambda x: 0 if (x['last_fico_range_low'] > x['fico_range_low']) else 1, axis=1)

  df['last_fico_range_high_dummy'] = df.apply(lambda x: 0 if (x['last_fico_range_high'] > x['fico_range_high']) else 1, axis=1)
  df['last_fico_range_low_dummy'] = df.apply(lambda x: 0 if (x['last_fico_range_low'] > x['fico_range_low']) else 1, axis=1)


In [50]:
# len(df[df['last_fico_range_high_dummy'] == 0])
len(df[df['last_fico_range_high_dummy'] == 1])

480722

# Fico High

### Undersampling
필요없으면 건너뛰어도 무방

In [51]:
# Separate majority and minority classes
df_majority = df[df['last_fico_range_high_dummy'] == 1]
df_minority = df[df['last_fico_range_high_dummy'] == 0]

# Count number of instances in the minority class
minority_count = len(df_minority)

# Undersample the majority class
df_majority_undersampled = df_majority.sample(n=minority_count, random_state=42)

# Combine minority class with the undersampled majority class
df_undersampled = pd.concat([df_majority_undersampled, df_minority])

# Shuffle the dataset
df_undersampled = df_undersampled.sample(frac=1, random_state=42)

# Now split into features and target, and then into training and testing sets
y = df_undersampled['last_fico_range_high_dummy']
X_original = df_undersampled
X = df_undersampled.drop(columns=columns_to_drop)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

## Model Training

In [52]:
# Creating a Random Forest Classifier
# rf = RandomForestClassifier(n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], min_samples_split=best_params['min_samples_split'], random_state=42)
rf = RandomForestClassifier(n_estimators=300, max_depth=20, min_samples_split=10, random_state=42)

# Training the model
rf.fit(X_train, y_train)


## Evaluation

In [54]:
# Predicting the Test set results
y_pred = rf.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(accuracy)
print(report)


0.60588902191552
              precision    recall  f1-score   support

           0       0.60      0.61      0.61    157389
           1       0.61      0.60      0.60    157502

    accuracy                           0.61    314891
   macro avg       0.61      0.61      0.61    314891
weighted avg       0.61      0.61      0.61    314891



In [25]:
y_pred_proba = rf.predict_proba(X_test)
y_pred_proba

array([[0.48022699, 0.51977301],
       [0.46892742, 0.53107258],
       [0.55357248, 0.44642752],
       ...,
       [0.37635777, 0.62364223],
       [0.4782239 , 0.5217761 ],
       [0.4647883 , 0.5352117 ]])

## Save Model

In [55]:
import joblib

# save
joblib.dump(rf, "random_forest_fico_high.joblib")

['random_forest_fico_high.joblib']

# Fico Low

### Undersampling
필요없으면 건너뛰어도 무방

In [56]:
# Separate majority and minority classes
df_majority = df[df['last_fico_range_low_dummy'] == 1]
df_minority = df[df['last_fico_range_low_dummy'] == 0]

# Count number of instances in the minority class
minority_count = len(df_minority)

# Undersample the majority class
df_majority_undersampled = df_majority.sample(n=minority_count, random_state=42)

# Combine minority class with the undersampled majority class
df_undersampled = pd.concat([df_majority_undersampled, df_minority])

# Shuffle the dataset
df_undersampled = df_undersampled.sample(frac=1, random_state=42)

# Now split into features and target, and then into training and testing sets
y = df_undersampled['last_fico_range_high_dummy']
X_original = df_undersampled
X = df_undersampled.drop(columns=columns_to_drop)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

## Model Training

In [57]:

# Creating a Random Forest Classifier
# rf = RandomForestClassifier(n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], min_samples_split=best_params['min_samples_split'], random_state=42)
rf = RandomForestClassifier(n_estimators=300, max_depth=20, min_samples_split=10, random_state=42)

# Training the model
rf.fit(X_train, y_train)


## Evaluation

In [58]:
# Predicting the Test set results
y_pred = rf.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(accuracy)
print(report)


0.6062141940728885
              precision    recall  f1-score   support

           0       0.61      0.61      0.61     78822
           1       0.61      0.60      0.60     78624

    accuracy                           0.61    157446
   macro avg       0.61      0.61      0.61    157446
weighted avg       0.61      0.61      0.61    157446



In [None]:
y_pred_proba = rf.predict_proba(X_test)
y_pred_proba

array([[0.9794646 , 0.0205354 ],
       [0.72461854, 0.27538146],
       [0.97009209, 0.02990791],
       ...,
       [0.94075408, 0.05924592],
       [0.95923372, 0.04076628],
       [0.91520273, 0.08479727]])

## Save Model

In [59]:
import joblib

# save
joblib.dump(rf, "random_forest_fico_low.joblib")

['random_forest_fico_low.joblib']