# Loan 데이터 분류모형 예측성능 비교

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the loan data set from 'loan.csv' (path is relative to the current
# working directory) into a DataFrame.
df = pd.read_csv('loan.csv')

In [None]:
# Recode the categorical 'job' column as integer codes so it can be passed
# to the estimators together with the other predictors.
# NOTE(review): the codes are arbitrary labels, yet the linear / neural-net /
# SVM models will read them as an ordered numeric scale -- consider one-hot
# encoding if that is not intended.
df['job'] = df['job'].replace(
    {
        'Office': 0,
        'ProfExe': 1,
        'Other': 2,
        'Mgr': 3,
        'Self': 4,
        'Sales': 5,
    }
)

In [None]:
# Target vector: the 'y' column.
y = df['y']
# Predictor matrix: every column except the target.
X = df.drop(columns=['y'])

# Predictor names (used later to label the importance plot) and display
# labels for the two classes.
xname = X.columns
yname = ['Normal', 'Bad']  # presumably 0 -> Normal, 1 -> Bad; confirm against the data

## Train/Test 데이터 분할

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows as the test set.  Stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=0,
)

## AdaBoost 방법

In [None]:
from sklearn.ensemble import AdaBoostClassifier

#### Classifier 가 의사결정나무인 경우

In [None]:
# AdaBoost using scikit-learn's default base learner (a decision tree),
# with 100 boosting rounds and a fixed seed.
ada_tree = AdaBoostClassifier(
    n_estimators=100,
    random_state=0,
)
ada_tree.fit(X_train, y_train)

#### Classifier 가 로지스틱 회귀분석인 경우

In [None]:
from sklearn.linear_model import LogisticRegression

# AdaBoost with logistic regression as the base learner instead of a tree.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, so the positional form runs on both old and new
# releases.
ada_logit = AdaBoostClassifier(
    LogisticRegression(max_iter=1000),
    n_estimators=100,
    random_state=0,
)
ada_logit.fit(X_train, y_train)

## Gradient Boosting 방법

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

#### Default 옵션: max_depth=3

In [None]:
# Gradient boosting with 100 trees; tree depth is left at the library
# default.  This model is also the one used for the importance plot.
gb_tree = GradientBoostingClassifier(
    n_estimators=100,
    random_state=0,
)
gb_tree.fit(X_train, y_train)

#### 트리의 크기를 크게, max_depth=7

In [None]:
# Same gradient-boosting setup, but with deeper individual trees
# (max_depth=7) to compare against the default-depth model.
gb_depth = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=7,
    random_state=0,
)
gb_depth.fit(X_train, y_train)

## Random Forest 방법

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; each split considers sqrt(n_features)
# candidate predictors.
rf_sqrt = RandomForestClassifier(
    n_estimators=100,
    max_features="sqrt",
    random_state=0,
)
rf_sqrt.fit(X_train, y_train)

## 다른 분류방법과의 비교

### 로지스틱 회귀분석

In [None]:
from sklearn.linear_model import LogisticRegression

# Plain logistic regression on the unscaled training data.  The iteration
# cap is raised to 1000.
logit = LogisticRegression(
    max_iter=1000,
    random_state=0,
)
logit.fit(X_train, y_train)

### 신경망분석

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Standardize the predictors.  The scaler is fitted on the training split
# only and then applied to the test split, so no test-set statistics leak
# into training.  The scaled matrices are reused by the SVM cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Multi-layer perceptron with a single hidden layer of 5 units.
# `(5,)` is a real one-element tuple; the previous `(5)` was just the int 5,
# which scikit-learn happens to accept but which hides the intent.
c_nn = MLPClassifier(
    hidden_layer_sizes=(5,),
    random_state=0,
    max_iter=1000,
)
c_nn.fit(X_train_scaled, y_train)

### SVM

In [None]:
from sklearn import svm

# RBF-kernel support vector classifier, trained on the standardized
# training matrix (X_train_scaled).
c_svm = svm.SVC(
    kernel='rbf',
    random_state=0,
)
c_svm.fit(X_train_scaled, y_train)

### ROC 곡선 및 AUC

In [None]:
# `plot_roc_curve` was removed in scikit-learn 1.2; the supported
# replacement (available since 1.0) is `RocCurveDisplay.from_estimator`.
from sklearn.metrics import RocCurveDisplay

# Draw the first curve on a fresh figure; `m1_roc` keeps the display object
# whose axes all the other curves are added to.
m1_roc = RocCurveDisplay.from_estimator(
    ada_tree, X_test, y_test, name="AdaBoost (tree)"
)

# Remaining models as (legend label, fitted model, test features).
# Explicit labels are needed because the two AdaBoost and the two Gradient
# Boosting models would otherwise share the same class-name legend entry.
# The neural network and the SVM were trained on standardized features, so
# they are scored on the standardized test matrix.
roc_models = [
    ("AdaBoost (logit)", ada_logit, X_test),
    ("GB (max_depth=3)", gb_tree, X_test),
    ("GB (max_depth=7)", gb_depth, X_test),
    ("Random Forest", rf_sqrt, X_test),
    ("Logistic Regression", logit, X_test),
    ("Neural Network", c_nn, X_test_scaled),
    ("SVM (RBF)", c_svm, X_test_scaled),
]
for roc_label, roc_model, roc_X in roc_models:
    RocCurveDisplay.from_estimator(
        roc_model, roc_X, y_test, name=roc_label, ax=m1_roc.ax_
    )

m1_roc.ax_.set_title("ROC curve comparison")
plt.show()

## 변수중요도 (Gradient Boosting)

In [None]:
# Show the feature importances of the default-depth gradient-boosting model
# (one value per predictor column).
gb_tree.feature_importances_

In [None]:
# Horizontal bar chart of the gradient-boosting feature importances:
# one bar per predictor name, bar length = importance value.
sns.barplot(
    y=xname,
    x=gb_tree.feature_importances_,
)
plt.show()