In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mglearn

from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

%matplotlib inline

# data 가져오기

In [4]:
df = pd.read_csv('loans_tr.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [5]:
# index와 unnamed column이 똑같으므로 index_col=0오로 read한다.
df = pd.read_csv('loans_tr.csv', index_col=0)
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8578 entries, 0 to 9576
Data columns (total 14 columns):
credit.policy        8578 non-null int64
purpose              8578 non-null object
int.rate             8578 non-null float64
installment          8578 non-null float64
log.annual.inc       8578 non-null float64
dti                  8578 non-null float64
fico                 8578 non-null int64
days.with.cr.line    8578 non-null float64
revol.bal            8578 non-null int64
revol.util           8578 non-null float64
inq.last.6mths       8578 non-null int64
delinq.2yrs          8578 non-null int64
pub.rec              8578 non-null int64
not.fully.paid       8578 non-null int64
dtypes: float64(6), int64(7), object(1)
memory usage: 1005.2+ KB


In [7]:
df.describe()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
count,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0
mean,0.803567,0.122617,318.093373,10.932431,12.622325,710.749709,4547.756538,17065.79,46.757076,1.571695,0.164607,0.062252,0.159594
std,0.397323,0.026817,206.99427,0.615859,6.881663,38.088227,2494.931515,34504.48,29.00033,2.197341,0.550745,0.264656,0.366251
min,0.0,0.06,15.67,7.547502,0.0,612.0,178.958333,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.1039,163.57,10.555813,7.2425,682.0,2792.78125,3193.25,22.6,0.0,0.0,0.0,0.0
50%,1.0,0.1221,267.74,10.928884,12.66,707.0,4110.041667,8690.0,46.2,1.0,0.0,0.0,0.0
75%,1.0,0.1407,430.75,11.294022,17.96,737.0,5729.958333,18433.75,70.9,2.0,0.0,0.0,0.0
max,1.0,0.2164,940.14,14.528354,29.96,827.0,17639.95833,1207359.0,119.0,33.0,13.0,5.0,1.0


# Corr

In [10]:
df.corr()['not.fully.paid'].sort_values(ascending=False)
#강한 상관관계가 있는 0.5보다 크거나 -0.5보다 작은 feature가 보이지 않는다.

not.fully.paid       1.000000
int.rate             0.155071
inq.last.6mths       0.142262
revol.bal            0.107971
revol.util           0.083546
pub.rec              0.056377
installment          0.031963
dti                  0.015943
days.with.cr.line   -0.002703
log.annual.inc      -0.008408
delinq.2yrs         -0.019143
credit.policy       -0.104684
fico                -0.110169
Name: not.fully.paid, dtype: float64

In [12]:
# 상관관계가없는 feature들을 삭제한다. ex) dti, days.with.cr.line
df.drop(['dti', 'days.with.cr.line', 'log.annual.inc'], axis=1, inplace= True)

# Null값 처리

In [9]:
df.isnull().sum()
# Null값은 없다.

credit.policy        0
purpose              0
int.rate             0
installment          0
log.annual.inc       0
dti                  0
fico                 0
days.with.cr.line    0
revol.bal            0
revol.util           0
inq.last.6mths       0
delinq.2yrs          0
pub.rec              0
not.fully.paid       0
dtype: int64

# categorical data 전처리

In [10]:
# purpose column이 categrical data다.
df['purpose'].value_counts()

debt_consolidation    3547
all_other             2082
credit_card           1143
home_improvement       556
small_business         547
major_purchase         395
educational            308
Name: purpose, dtype: int64

In [11]:
purpose = pd.get_dummies(df['purpose'])
purpose.head()
# 이후 X데이터에 concat한다.

Unnamed: 0,all_other,credit_card,debt_consolidation,educational,home_improvement,major_purchase,small_business
0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0


# X(feature)와 y(label) 구분

In [12]:
# 0과 1로 label이 붙어있다.
# 분류(classify) 문제임을 알 수 있다. ex)LogisticRegression
df['not.fully.paid'].value_counts()

0    7209
1    1369
Name: not.fully.paid, dtype: int64

In [13]:
x_drop = df.drop(['not.fully.paid', 'purpose'], axis=1)
X = pd.concat([x_drop, purpose], axis=1)
y = df['not.fully.paid']

# train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 1차 모델 : Pipeline 사용

In [15]:
# scaler, model, cross validation까지 한 번에 돌려본다.
df = pd.read_csv('loans_tr.csv', index_col=0)
purpose = pd.get_dummies(df['purpose'])
x_drop = df.drop(['not.fully.paid', 'purpose'], axis=1)
X = pd.concat([x_drop, purpose], axis=1)
y = df['not.fully.paid']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR', LogisticRegression())])))
pipelines.append(('ScaledLDA', Pipeline([('Scaler', StandardScaler()),('LDA', LinearDiscriminantAnalysis())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsClassifier())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeClassifier())])))
pipelines.append(('ScaledNB', Pipeline([('Scaler', StandardScaler()),('NB', GaussianNB())])))
pipelines.append(('ScaledRF', Pipeline([('Scaler', StandardScaler()),('RF', RandomForestClassifier())])))

results = []
for name, pipeline in pipelines:
    kfold = KFold(n_splits = 10)
    score = cross_val_score(pipeline, X_train, y_train, scoring= 'accuracy', cv=kfold)
    results.append((name, score.mean()))

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt 

  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, *

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt 

  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, *

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt 

  Xt = transform.transform(Xt)


In [22]:
results
# LogisticRegression이 가장 높게 나왔으므로 다음 모델에서 따로 만들어본다.

[('ScaledLR', 0.8400984470327233),
 ('ScaledLDA', 0.8399312257348864),
 ('ScaledKNN', 0.8244442595673875),
 ('ScaledCART', 0.7381619523017193),
 ('ScaledNB', 0.7742989462007765),
 ('ScaledRF', 0.8312753743760399)]

# 2차 모델 (LogisticRegression)


In [20]:
df = pd.read_csv('loans_tr.csv', index_col=0)
df.drop(['dti', 'days.with.cr.line', 'log.annual.inc'], axis=1, inplace= True)

purpose = pd.get_dummies(df['purpose'])
x_drop = df.drop(['not.fully.paid', 'purpose'], axis=1)
X = pd.concat([x_drop, purpose], axis=1)
y = df['not.fully.paid']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
#Scaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler. transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
lr.score(X_test_scaled, y_test)

  return self.partial_fit(X, y)


0.8368298368298368

# Test data

In [22]:
df = pd.read_csv('loans_ts.csv', index_col=0)
df.drop(['dti', 'days.with.cr.line', 'log.annual.inc'], axis=1, inplace= True)
purpose = pd.get_dummies(df['purpose'])
x_drop = df.drop(['not.fully.paid', 'purpose'], axis=1)
X = pd.concat([x_drop, purpose], axis=1)
y = df['not.fully.paid']
scaler = MinMaxScaler()
scaler.fit(X)
X_test_scaled = scaler.transform(X)

lr.score(X_test_scaled, y)

  return self.partial_fit(X, y)


0.832

# 3차 모델 : SVM





In [2]:
df = pd.read_csv('loans_tr.csv', index_col=0)
df.drop(['dti', 'days.with.cr.line', 'log.annual.inc'], axis=1, inplace= True)
purpose = pd.get_dummies(df['purpose'])
x_drop = df.drop(['not.fully.paid', 'purpose'], axis=1)
X = pd.concat([x_drop, purpose], axis=1)
y = df['not.fully.paid']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [3]:
#Scaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler. transform(X_test)

  return self.partial_fit(X, y)


In [None]:
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train_scaled, y_train)
        score = svm.score(X_test_scaled, y_test)
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
print(best_score)
print(best_parameters)

In [4]:
# best_prams_에 나온 것을 사용해서 평가해본다.
score = cross_val_score(SVC(C=0.001, gamma=0.001), X_train_scaled, y_train)
score.mean()



0.8417722277722278

In [5]:
svm = SVC(C=0.001, gamma=0.001)
svm.fit(X_train_scaled, y_train)
svm.score(X_test_scaled, y_test)

0.8372183372183373

# Test Data

In [7]:
df = pd.read_csv('loans_ts.csv', index_col=0)
df.drop(['dti', 'days.with.cr.line', 'log.annual.inc'], axis=1, inplace= True)
purpose = pd.get_dummies(df['purpose'])
x_drop = df.drop(['not.fully.paid', 'purpose'], axis=1)
X = pd.concat([x_drop, purpose], axis=1)
y = df['not.fully.paid']
scaler = MinMaxScaler()
scaler.fit(X)
X_test_scaled = scaler.transform(X)

svm.score(X_test_scaled, y)

  return self.partial_fit(X, y)


0.836