# Loan 데이터 분류모형 예측성능 비교

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the loan data set from a CSV file located relative to the notebook's
# working directory. Later cells read the 'job' and 'y' columns from it.
df = pd.read_csv('../Data/loan.csv')

In [4]:
# Replace each 'job' label with an integer code.
# NOTE(review): the codes are arbitrary, so any model that treats its inputs
# as ordered numbers will read an ordering into them -- consider one-hot
# encoding if that matters for the comparison.
df['job'] = df['job'].replace(
    to_replace={
        'Office': 0,
        'ProfExe': 1,
        'Other': 2,
        'Mgr': 3,
        'Self': 4,
        'Sales': 5,
    }
)

In [5]:
# Separate the target column 'y' from the remaining (predictor) columns.
y = df['y']
X = df.drop(columns=['y'])

# Labels kept for later plotting/reporting: predictor names and class names.
# NOTE(review): presumably y == 0 means 'Normal' and y == 1 means 'Bad' -- confirm.
xname = X.columns
yname = ['Normal', 'Bad']

## Train/Test 데이터 분할

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows as the test set. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=0,
)

## Bagging 방법

In [7]:
from sklearn.ensemble import BaggingClassifier

In [8]:
# Bagging ensemble of 100 estimators using BaggingClassifier's default base
# learner, fitted on the training split.
bag_tree = BaggingClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

# Per-class probability estimates for the test rows, i.e. a look at how the
# ensemble "voted" on each observation.
bag_prob = bag_tree.predict_proba(X_test)
bag_prob

array([[0.19, 0.81],
       [1.  , 0.  ],
       [0.92, 0.08],
       ...,
       [0.02, 0.98],
       [0.87, 0.13],
       [0.51, 0.49]])

In [9]:
# Apply bagging to logistic regression.
from sklearn.linear_model import LogisticRegression

# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, whereas the first positional slot works on every version.
# max_iter=1000 raises the solver's iteration cap above the library default
# (the original note says the value was picked to keep the run time down).
bag_logit = BaggingClassifier(
    LogisticRegression(max_iter=1000),
    n_estimators=100,
    random_state=0,
)
bag_logit.fit(X_train, y_train)

BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000),
                  n_estimators=100, random_state=0)

In [None]:
# Per-class probability estimates from the bagged logistic-regression model.
# Note that this rebinds `bag_prob`, so the bagged-tree probabilities computed
# in the earlier cell are no longer reachable under that name.
bag_prob = bag_logit.predict_proba(X=X_test)
bag_prob

## Random Forest 방법

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; each split considers sqrt(n_features)
# randomly chosen candidate features.
rf_sqrt = RandomForestClassifier(
    max_features="sqrt",
    n_estimators=100,
    random_state=0,
)
rf_sqrt.fit(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; each split considers 5 randomly chosen
# candidate features (compare with the sqrt-based forest).
rf_five = RandomForestClassifier(
    max_features=5,
    n_estimators=100,
    random_state=0,
)
rf_five.fit(X_train, y_train)

## 다른 분류방법과의 비교

### 로지스틱 회귀분석

In [None]:
from sklearn.linear_model import LogisticRegression

# Single (non-bagged) logistic regression used as a baseline classifier,
# fitted on the unscaled training predictors.
logit = LogisticRegression(
    max_iter=1000,
    random_state=0,
)
logit.fit(X_train, y_train)

### 신경망분석

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Standardise the predictors. The scaler is fitted on the training split only
# and then applied unchanged to the test split, so no test information leaks
# into the fitted mean/variance.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Neural network with one hidden layer of 5 units. The trailing comma matters:
# `(5)` is just the integer 5, while `(5,)` is the one-element tuple that
# `hidden_layer_sizes` is documented to take (one entry per hidden layer).
c_nn = MLPClassifier(hidden_layer_sizes=(5,), random_state=0, max_iter=1000)
c_nn.fit(X_train_scaled, y_train)

### SVM

In [None]:
from sklearn import svm

# Support vector classifier with an RBF kernel, fitted on the standardised
# training predictors.
c_svm = svm.SVC(
    random_state=0,
    kernel='rbf',
)
c_svm.fit(X_train_scaled, y_train)

### ROC 곡선 및 AUC

In [None]:
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `RocCurveDisplay.from_estimator` is its replacement.
from sklearn.metrics import RocCurveDisplay

# The first curve creates the figure/axes; `m1_roc.ax_` is then reused so that
# every model is drawn on the same plot. Explicit names are supplied because
# the default legend label is the estimator's class name, which would make the
# two bagging models (and the two forests) indistinguishable.
m1_roc = RocCurveDisplay.from_estimator(
    bag_tree, X_test, y_test, name="Bagging (tree)"
)

# (legend label, fitted model, test features in the scale the model was trained on)
remaining_models = [
    ("Bagging (logistic)", bag_logit, X_test),
    ("Random forest (sqrt)", rf_sqrt, X_test),
    ("Random forest (5)", rf_five, X_test),
    ("Logistic regression", logit, X_test),
    ("Neural network", c_nn, X_test_scaled),
    ("SVM (RBF)", c_svm, X_test_scaled),
]
for curve_label, fitted_model, X_eval in remaining_models:
    RocCurveDisplay.from_estimator(
        fitted_model, X_eval, y_test, name=curve_label, ax=m1_roc.ax_
    )

plt.title("ROC curve comparison")
plt.show()

## 변수중요도 (랜덤포레스트)

In [None]:
# Display the fitted sqrt-feature forest's importance score for each predictor
# (one value per column of the training data).
rf_sqrt.feature_importances_

In [None]:
# Horizontal bar chart: one bar per predictor (y axis), with bar length equal
# to that predictor's importance in the sqrt-feature forest (x axis).
sns.barplot(
    y=xname,
    x=rf_sqrt.feature_importances_,
)
plt.show()

In [None]:
# Importance scores of the 5-feature forest. They are printed explicitly
# because a bare expression that is not the last statement of a notebook cell
# is evaluated and then discarded without being displayed.
print(rf_five.feature_importances_)

# Horizontal bar chart: one bar per predictor, bar length = importance.
sns.barplot(x=rf_five.feature_importances_, y=xname)
plt.show()