# Loan 데이터 분류모형 예측성능 비교

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the loan dataset from the CSV file in the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='loan.csv')

In [3]:
# Integer code assigned to each job category.
job_codes = {'Office': 0, 'ProfExe': 1, 'Other': 2, 'Mgr': 3, 'Self': 4, 'Sales': 5}

# Encode with an element-wise lookup instead of Series.replace: swapping strings
# for ints via replace() depends on an implicit object -> integer downcast that
# pandas deprecated in 2.2 (it emits a FutureWarning there).
# Any value without a code (NaN, or an already-encoded integer) is passed through
# unchanged, so the result matches what replace() produced and re-running this
# cell leaves the column as it is.
df['job'] = df['job'].map(lambda job: job_codes.get(job, job))

In [4]:
# Separate the target column 'y' from the remaining (feature) columns.
y = df['y']
X = df.drop(columns=['y'])

# Keep the feature names and the display labels for the two target classes.
xname = X.columns
yname = ['Normal', 'Bad']

## Train/Test 데이터 분할

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; stratify on y so both partitions keep
# the same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=0,
)

## 로지스틱 회귀분석

In [6]:
from sklearn.linear_model import LogisticRegression

# max_iter bounds how many iterations the solver may run (1000 here).
m1 = LogisticRegression(max_iter=1000, random_state=0).fit(X_train, y_train)

# Predicted class probabilities for the test set (one column per class).
m1_prob = m1.predict_proba(X_test)
pd.DataFrame(data=m1_prob)

Unnamed: 0,0,1
0,0.797334,0.202666
1,0.772756,0.227244
2,0.765714,0.234286
3,0.756511,0.243489
4,0.642193,0.357807
...,...,...
2792,0.551095,0.448905
2793,0.740449,0.259551
2794,0.485864,0.514136
2795,0.413873,0.586127


### cut-off=0.1

In [7]:
# Cut-off applied to the predicted probability of class 1.
threshold = 0.1

# Predict 1 where the class-1 probability is strictly above the cut-off, else 0.
m1_pred = np.greater(m1_prob[:, 1], threshold).astype(int)
m1_pred

array([1, 1, 1, ..., 1, 1, 1])

In [8]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the thresholded predictions.
m1_cm = confusion_matrix(y_true=y_test, y_pred=m1_pred)
pd.DataFrame(data=m1_cm)

Unnamed: 0,0,1
0,426,1695
1,45,631


In [9]:
# Accuracy: correctly classified cases (the diagonal) over all test cases.
accu1 = np.diag(m1_cm).sum() / len(y_test)

# Per-class recall: diagonal over row totals.
# The first entry is kept as specificity, the second as sensitivity.
spec1, sens1 = m1_cm.diagonal() / m1_cm.sum(axis=1)

# Precision: cell [1, 1] over the total of column 1.
prec1 = m1_cm[1, 1] / m1_cm[:, 1].sum()

# F1-score: harmonic mean of precision and sensitivity.
f1_1 = 2*prec1*sens1 / (prec1+sens1)

# Report every metric rounded to two decimals.
print('정확도    :', accu1.round(2))
print('특이도    :', spec1.round(2))
print('민감도    :', sens1.round(2))
print('정밀도    :', prec1.round(2))
print('F1-score :', f1_1.round(2))

정확도    : 0.38
특이도    : 0.2
민감도    : 0.93
정밀도    : 0.27
F1-score : 0.42


### cut-off=0.3

In [10]:
# Cut-off applied to the predicted probability of class 1.
threshold = 0.3

# Predict 1 where the class-1 probability is strictly above the cut-off, else 0.
m1_pred = np.greater(m1_prob[:, 1], threshold).astype(int)
m1_pred

array([0, 0, 0, ..., 1, 1, 1])

In [11]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the thresholded predictions.
m1_cm = confusion_matrix(y_true=y_test, y_pred=m1_pred)
pd.DataFrame(data=m1_cm)

Unnamed: 0,0,1
0,1583,538
1,233,443


In [12]:
# Accuracy: correctly classified cases (the diagonal) over all test cases.
accu1 = np.diag(m1_cm).sum() / len(y_test)

# Per-class recall: diagonal over row totals.
# The first entry is kept as specificity, the second as sensitivity.
spec1, sens1 = m1_cm.diagonal() / m1_cm.sum(axis=1)

# Precision: cell [1, 1] over the total of column 1.
prec1 = m1_cm[1, 1] / m1_cm[:, 1].sum()

# F1-score: harmonic mean of precision and sensitivity.
f1_1 = 2*prec1*sens1 / (prec1+sens1)

# Report every metric rounded to two decimals.
print('정확도 :', accu1.round(2))
print('특이도 :', spec1.round(2))
print('민감도 :', sens1.round(2))
print('정밀도 :', prec1.round(2))
print('F1-score :', f1_1.round(2))

정확도 : 0.72
특이도 : 0.75
민감도 : 0.66
정밀도 : 0.45
F1-score : 0.53


### cut-off=0.5

In [13]:
# Cut-off applied to the predicted probability of class 1.
threshold = 0.5

# Predict 1 where the class-1 probability is strictly above the cut-off, else 0.
m1_pred = np.greater(m1_prob[:, 1], threshold).astype(int)
m1_pred

array([0, 0, 0, ..., 1, 1, 0])

In [14]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the thresholded predictions.
m1_cm = confusion_matrix(y_true=y_test, y_pred=m1_pred)
pd.DataFrame(data=m1_cm)

Unnamed: 0,0,1
0,2003,118
1,520,156


In [15]:
# Accuracy: correctly classified cases (the diagonal) over all test cases.
accu1 = np.diag(m1_cm).sum() / len(y_test)

# Per-class recall: diagonal over row totals.
# The first entry is kept as specificity, the second as sensitivity.
spec1, sens1 = m1_cm.diagonal() / m1_cm.sum(axis=1)

# Precision: cell [1, 1] over the total of column 1.
prec1 = m1_cm[1, 1] / m1_cm[:, 1].sum()

# F1-score: harmonic mean of precision and sensitivity.
f1_1 = 2*prec1*sens1 / (prec1+sens1)

# Report every metric rounded to two decimals.
print('정확도 :', accu1.round(2))
print('특이도 :', spec1.round(2))
print('민감도 :', sens1.round(2))
print('정밀도 :', prec1.round(2))
print('F1-score :', f1_1.round(2))

정확도 : 0.77
특이도 : 0.94
민감도 : 0.23
정밀도 : 0.57
F1-score : 0.33
