## Load Necessary Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

## Read Data

### Load Data

In [2]:
# Read the Stata file 'lendingclub_train.dta' into a DataFrame. The path is
# relative, so the file must sit in the notebook's working directory.
df = pd.read_stata('lendingclub_train.dta')
# Optional quick inspection of the loaded frame (uncomment while exploring):
# df.head()
# df.describe()
# Columns removed from `df` to build the feature matrix. "depvar" is the
# target column (other cells read it as `y`), so it must not stay in X.
# Every name is listed exactly once.
columns_to_drop = [
    "index",
    "depvar",
    "total_acc",
    "out_prncp",
    "out_prncp_inv",
    "total_pymnt",
    "total_pymnt_inv",
    "total_rec_prncp",
    "total_rec_int",
    "total_rec_late_fee",
    "recoveries",
    "collection_recovery_fee",
    "last_pymnt_amnt",
    # The four names below are intentionally NOT dropped; they stay commented
    # out so the decision remains visible.
    # NOTE(review): confirm these do not carry post-outcome information.
    # "last_fico_range_high",
    # "last_fico_range_low",
    # "tot_coll_amt",
    # "tot_cur_bal",
    "initial_list_status1",
    "initial_list_status2",
    "elapsed_t",
    "purpose1",
    "addr_state1",
    "debt_settlement_flag1",
    "term1",
    "mths_since_last_delinq1",
    "mths_since_last_major_derog1",
    "mths_since_last_record1",
    "mths_since_rcnt_il1",
    "mths_since_recent_bc1",
    "mths_since_recent_bc_dlq1",
    "mths_since_recent_inq1",
    "mths_since_recent_revol_delinq1",
]

# Also drop the numbered columns issue_d1 ... issue_d118 (inclusive).
N_ISSUE_D_COLUMNS = 118
columns_to_drop.extend(
    "issue_d" + str(number) for number in range(1, N_ISSUE_D_COLUMNS + 1)
)


### Undersampling
Optional: this step can be skipped if undersampling is not needed.

In [None]:
# Build an undersampled data set: keep every minority row (depvar == 1) and
# at most MAJORITY_PER_MINORITY majority rows (depvar == 0) per minority row,
# then split it 80/20 into train and test sets.
#
# NOTE: this cell assigns X, y, X_train, X_test, y_train and y_test. The
# "Drop Unnecessary Columns" cell rebuilds X and y from the full `df`, so
# whichever cell ran last decides what the later cells train on.
MAJORITY_PER_MINORITY = 2

# Separate majority and minority classes
df_majority = df[df['depvar'] == 0]
df_minority = df[df['depvar'] == 1]

# Count number of instances in the minority class
minority_count = len(df_minority)

# Undersample the majority class. Sampling without replacement raises
# ValueError when n exceeds the available rows, so cap n at the class size.
majority_sample_size = min(len(df_majority), minority_count * MAJORITY_PER_MINORITY)
df_majority_undersampled = df_majority.sample(n=majority_sample_size, random_state=42)

# Combine minority class with the undersampled majority class
df_undersampled = pd.concat([df_majority_undersampled, df_minority])

# Shuffle the dataset so the two classes are interleaved
df_undersampled = df_undersampled.sample(frac=1, random_state=42)

# Now split into features and target, and then into training and testing sets
y = df_undersampled['depvar']
X = df_undersampled.drop(columns=columns_to_drop)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the first rows of the undersampled feature matrix
X.head()


### Drop Unnecessary Columns

In [3]:
# Full-data target and feature matrix: `y` is the "depvar" column and `X` is
# `df` without the columns named in `columns_to_drop`.
y = df.loc[:, 'depvar']
X = df.drop(labels=columns_to_drop, axis='columns')

# Print a summary of the remaining feature columns.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 874335 entries, 0 to 874334
Columns: 188 entries, loan_amnt to mths_since_recent_revol_delinq11
dtypes: float64(6), int16(5), int32(6), int8(171)
memory usage: 211.0 MB


### Hyperparameter Tuning

In [14]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split

# Expects the feature matrix `X` and target `y` to exist already.

# 60 / 20 / 20 split: hold out 40% first, then halve that hold-out into a
# validation set and a test set. X_test / y_test are not used in this cell.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Base estimator whose hyperparameters are searched.
rf = RandomForestClassifier(random_state=42)

# Search space: 3 x 3 x 3 = 27 candidates.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],  # <== each team member can take one of these
)

# Exhaustive search with 5-fold cross-validation on all cores
# (27 candidates x 5 folds = 135 fits; expect a long run on the full data).
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination.
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Score the refitted best estimator on the validation split.
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("Validation Accuracy: ", val_accuracy)
print("Validation Classification Report: \n", val_report)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=12.2min
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=12.4min
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=12.4min
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=12.4min
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=12.5min
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=25.3min
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=25.6min
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=26.1min
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=24.8min
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=25.0min
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=11.6min


KeyboardInterrupt: 

## Model Training

In [4]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible. This reassigns X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest with 100 trees. n_jobs=-1 builds the trees on all available
# cores, matching the grid-search cell; it only affects parallelism, and with
# a fixed random_state the fitted forest should be the same as a
# single-threaded fit.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Training the model
rf.fit(X_train, y_train)


0.8874344501821384
              precision    recall  f1-score   support

         0.0       0.92      0.95      0.93    146555
         1.0       0.67      0.59      0.63     28312

    accuracy                           0.89    174867
   macro avg       0.80      0.77      0.78    174867
weighted avg       0.88      0.89      0.88    174867



## Evaluation

In [None]:
# Predict labels for the held-out test rows with the fitted forest `rf`.
y_pred = rf.predict(X_test)

# Overall accuracy and the per-class precision / recall / F1 table.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Accuracy on the first line, followed by the report.
print(accuracy, report, sep="\n")
