In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier 

%matplotlib inline

In [2]:
# loans_tr 데이터를 load
dataset = pd.read_csv('loans_tr.csv')

In [3]:
# Exploratory Data Aanalysis
dataset.head()

Unnamed: 0.1,Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [4]:
dataset.drop('Unnamed: 0', axis = 1, inplace = True)

In [5]:
dataset.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [6]:
# Object type data 
dataset = pd.get_dummies(dataset)
dataset.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,0,1,0,0,0,0,0


In [7]:
print(dataset.shape)

(8578, 20)


In [8]:
# X_data, y_data division
X = dataset.drop('not.fully.paid', axis = 1)
y = dataset['not.fully.paid']

In [9]:
# train data, test data division
validation_size = 0.20
seed = 15

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = validation_size, random_state=seed) 

In [10]:
num_folds = 10
#num_instances = len(X_train)
scoring = 'accuracy'

In [11]:
# Compare Algorithms - model selection
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier()))
models.append(('SVC',SVC()))

In [12]:
import warnings
warnings.simplefilter('ignore')

models

[('LR',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=None, solver='warn',
            tol=0.0001, verbose=0, warm_start=False)),
 ('KNN',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=None, n_neighbors=5, p=2,
             weights='uniform')),
 ('CART',
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, presort=False, random_state=None,
              splitter='best')),
 ('NB', GaussianNB(priors=None, var_smoothing=1e-09)),
 ('RF',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_d

In [13]:
# cross validaton을 통해 optimal model selection
result = []
names = []
best_model_mean = 0
best_model_std = 0
best_name = ''

for name, model in models:
    kf = KFold(n_splits= num_folds, random_state= seed)
    cv_result = cross_val_score(model, X_train, y_train, cv = kf, scoring = scoring)
    result.append(cv_result)
    names.append(name)
    if cv_result.mean() > best_model_mean:
        best_model_mean = cv_result.mean()
        best_model_std = cv_result.std()
        best_name = name
    msg = "%s: %f (%f)" % (name, cv_result.mean(), cv_result.std())
    print(msg)
    
print("best_model: %s: %f (%f)" %(best_name, best_model_mean, best_model_std))

LR: 0.840278 (0.016748)
KNN: 0.821771 (0.014297)
CART: 0.742200 (0.015593)
NB: 0.824247 (0.012752)
RF: 0.832847 (0.015148)
SVC: 0.841736 (0.016274)
best_model: SVC: 0.841736 (0.016274)


In [14]:
# pipeline을 활용해서 scaling하고 optimal model selection

In [15]:
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR', LogisticRegression())])))
pipelines.append(('ScaledLDA', Pipeline([('Scaler', StandardScaler()),('LDA', LinearDiscriminantAnalysis())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsClassifier())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeClassifier())])))
pipelines.append(('ScaledNB', Pipeline([('Scaler', StandardScaler()),('NB', GaussianNB())])))
pipelines.append(('ScaledRF', Pipeline([('Scaler', StandardScaler()),('RF', RandomForestClassifier())])))

results = []
names = []

best_model_mean2 = 0
best_model_std2 = 0
best_name2 = ''

for name, model in pipelines:
    kfold = KFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    
    if cv_results.mean() > best_model_mean2:
        best_model_mean2 = cv_results.mean()
        best_model_std2 = cv_results.std()
        best_name2 = name
    
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
print("best_model: %s: %f (%f)" %(best_name2, best_model_mean2, best_model_std2))

ScaledLR: 0.840278 (0.015428)
ScaledLDA: 0.838967 (0.015020)
ScaledKNN: 0.824101 (0.014794)
ScaledCART: 0.738414 (0.014281)
ScaledNB: 0.768723 (0.016297)
ScaledRF: 0.833867 (0.015600)
best_model: ScaledLR: 0.840278 (0.015428)


In [16]:
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', MinMaxScaler(feature_range=(0, 1))),('LR', LogisticRegression())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', MinMaxScaler(feature_range=(0, 1))),('KNN', KNeighborsClassifier())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', MinMaxScaler(feature_range=(0, 1))),('CART', DecisionTreeClassifier())])))
pipelines.append(('ScaledNB', Pipeline([('Scaler', MinMaxScaler(feature_range=(0, 1))),('NB', GaussianNB())])))
pipelines.append(('ScaledRF', Pipeline([('Scaler', MinMaxScaler(feature_range=(0, 1))),('RF', RandomForestClassifier())])))
results = []
names = []
best_model_mean3 = 0
best_model_std3 = 0
best_name3 = ''
for name, model in pipelines:
    kfold = KFold( n_splits=num_folds, random_state=seed)
    cv_results1 = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results1)
    names.append(name)
    
    if cv_results.mean() > best_model_mean3:
        best_model_mean3 = cv_results1.mean()
        best_model_std3 = cv_results1.std()
        best_name3 = name
    
    msg = "%s: %f (%f)" % (name, cv_results1.mean(), cv_results1.std())
    print(msg)
print("best_model: %s: %f (%f)" %(best_name3, best_model_mean3, best_model_std3))

ScaledLR: 0.842027 (0.015357)
ScaledKNN: 0.821186 (0.018027)
ScaledCART: 0.742640 (0.015928)
ScaledNB: 0.768723 (0.016297)
ScaledRF: 0.834888 (0.017204)
best_model: ScaledLR: 0.842027 (0.015357)


In [17]:
pipelines

[('ScaledLR', Pipeline(memory=None,
       steps=[('Scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('LR', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=None, solver='warn',
            tol=0.0001, verbose=0, warm_start=False))])),
 ('ScaledKNN', Pipeline(memory=None,
       steps=[('Scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('KNN', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=None, n_neighbors=5, p=2,
             weights='uniform'))])),
 ('ScaledCART', Pipeline(memory=None,
       steps=[('Scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('CART', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,

In [18]:
# ensemble 과 grid search를 활용한 model selection
clflog = LogisticRegression(random_state=seed) 
clfdt = DecisionTreeClassifier(random_state=seed) 
clfgn = GaussianNB() 
eclf_h = VotingClassifier(estimators=[('lr', clflog), ('rf', clfdt), ('gnb', clfgn)], voting='hard') 
eclf_s = VotingClassifier(estimators=[('lr', clflog), ('rf', clfdt), ('gnb', clfgn)], voting='soft') 

models = [clflog, clfdt, clfgn, eclf_h, eclf_s] 


In [19]:
 for model in models: 
        model.fit(X_train, y_train) 
        pred = model.predict(X_test) 
        score = model.score(X_test, y_test) 
        print(model) 
        print(score) 
        print('-'*20) 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=15, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
0.8321678321678322
--------------------
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=15,
            splitter='best')
0.75
--------------------
GaussianNB(priors=None, var_smoothing=1e-09)
0.8228438228438228
--------------------
VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=15, solver=

In [25]:
from sklearn.model_selection import cross_val_score 
for model in models: 
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(scores) 
    print(scores.mean()) 
    print('-'*20)

[0.83988355 0.83965015 0.84183673 0.84110787 0.84110787]
0.8407172351161301
--------------------
[0.72270742 0.74125364 0.75218659 0.73542274 0.73906706]
0.738127490547061
--------------------
[0.83042213 0.83090379 0.8287172  0.82507289 0.81413994]
0.8258511888847865
--------------------
[0.83406114 0.83965015 0.83600583 0.83746356 0.8287172 ]
0.8351795740130112
--------------------
[0.8216885  0.83381924 0.83746356 0.82944606 0.82142857]
0.8287691870260268
--------------------


In [26]:
clf1 = LogisticRegression(random_state=seed) 
clf2 = DecisionTreeClassifier(random_state=seed) 
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('dt', clf2)], voting='hard') 
eclf2 = VotingClassifier(estimators=[('lr', clf1), ('dt', clf2)], voting='soft')


In [27]:
eclf = VotingClassifier(estimators=[('lr', clf1), ('dt', clf2)], voting='hard')
c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]
params ={ 
        "lr__solver" : ['liblinear'], "lr__penalty" : ["l2"], 
        "lr__C" : c_params, "dt__criterion" : ["gini", "entropy"], 
        "dt__max_depth" : [10,8,7,6,5,4,3,2], 
        "dt__min_samples_leaf": [1,2,3,4,5,6,7,8,9] 
        }

grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5) 
grid = grid.fit(X, y)

KeyboardInterrupt: 

In [None]:
grid.best_score_

In [None]:
grid.best_params_

In [None]:
 c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]
    
params ={"solver" : ['liblinear'], "penalty" : ["l2"], "C" : c_params} 
grid = GridSearchCV(clf1, param_grid=params, cv=5) 
grid = grid.fit(X, y)

In [None]:
grid.best_score_

In [None]:
 grid.best_params_

In [None]:
# final test score - test data

In [20]:
dataset_test = pd.read_csv('loans_ts.csv')

In [21]:
#pipelines.append(('ScaledLR', Pipeline([('Scaler', MinMaxScaler(feature_range=(0, 1))),('LR', LogisticRegression())])))
dataset_test.drop('Unnamed: 0', axis = 1, inplace = True)
X = dataset_test.drop('not.fully.paid', axis = 1)
y = dataset_test['not.fully.paid']
dataset_test = pd.get_dummies(dataset_test)













In [34]:
lgfinal = LogisticRegression()
lgfinal.fit(X_train,y_train)
lgfinal.predict(X)



ValueError: could not convert string to float: 'small_business'