In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


%matplotlib inline

In [2]:
# loans_tr 데이터를 load
loan = pd.read_csv('loans_tr.csv')

In [3]:
# Exploratory Data Aanalysis
loan.head()

Unnamed: 0.1,Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [4]:
loan = loan.drop('Unnamed: 0', axis = 1)
loan.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [5]:
loan= pd.get_dummies(loan)

In [6]:
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8578 entries, 0 to 8577
Data columns (total 20 columns):
credit.policy                 8578 non-null int64
int.rate                      8578 non-null float64
installment                   8578 non-null float64
log.annual.inc                8578 non-null float64
dti                           8578 non-null float64
fico                          8578 non-null int64
days.with.cr.line             8578 non-null float64
revol.bal                     8578 non-null int64
revol.util                    8578 non-null float64
inq.last.6mths                8578 non-null int64
delinq.2yrs                   8578 non-null int64
pub.rec                       8578 non-null int64
not.fully.paid                8578 non-null int64
purpose_all_other             8578 non-null uint8
purpose_credit_card           8578 non-null uint8
purpose_debt_consolidation    8578 non-null uint8
purpose_educational           8578 non-null uint8
purpose_home_improvement      8

In [7]:
loan.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,0,1,0,0,0,0,0


In [126]:
# Object type data 

In [8]:
# X_data, y_data division
X_data = loan.drop('not.fully.paid', axis = 1)
y_data = loan['not.fully.paid']

In [11]:
# train data, test data division
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size = 0.3, random_state = 77)

In [14]:
# Compare Algorithms - model selection
models = []
models.append(('LR', LogisticRegression()))
models.append(('SVC', SVC()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier()))

In [15]:
import warnings
warnings.simplefilter('ignore')

models

[('LR',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=None, solver='warn',
            tol=0.0001, verbose=0, warm_start=False)),
 ('SVC', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)),
 ('LDA',
  LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                solver='svd', store_covariance=False, tol=0.0001)),
 ('KNN',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=None, n_neighbors=5, p=2,
             weights='uniform')),
 ('CART',
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_le

In [21]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [25]:
f1score_results = []
ac_score_results= []
names = []
for name, model in models:
    
    cv_fits = model.fit(X_train,y_train)
    cv_preds = model.predict(X_test)
    score = f1_score(y_test, cv_preds)
    ac_score= accuracy_score(y_test, cv_preds)
    
    f1score_results.append(score)
    ac_score_results.append(ac_score)
    names.append(name)
    msg = "%s: f1_score= %f, accuracy_score = %f" % (name, score, ac_score)
    print(msg)
    


LR: f1_score= 0.019048, accuracy_score = 0.839938
SVC: f1_score= 0.000000, accuracy_score = 0.839549
LDA: f1_score= 0.079823, accuracy_score = 0.838772
KNN: f1_score= 0.094862, accuracy_score = 0.822067
CART: f1_score= 0.216783, accuracy_score = 0.738928
NB: f1_score= 0.108000, accuracy_score = 0.826729
RF: f1_score= 0.069414, accuracy_score = 0.833333


In [29]:
results = []
names = []
for name, model in models:
    
   
    cv_results = cross_val_score(model, X_train, y_train)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    

LR: 0.839773 (0.001496)
SVC: 0.840773 (0.000161)
LDA: 0.837606 (0.004213)
KNN: 0.817457 (0.004213)
CART: 0.743340 (0.005319)
NB: 0.822950 (0.007286)
RF: 0.829446 (0.004762)


In [20]:
# cross validaton을 통해 optimal model selection


In [30]:
# pipeline을 활용해서 scaling하고 optimal model selection
# Standardize the dataset
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR', LogisticRegression())])))
pipelines.append(('ScaledSVC', Pipeline([('Scaler', StandardScaler()),('SVC', SVC())])))
pipelines.append(('ScaledLDA', Pipeline([('Scaler', StandardScaler()),('LDA', LinearDiscriminantAnalysis())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsClassifier())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeClassifier())])))
pipelines.append(('ScaledNB', Pipeline([('Scaler', StandardScaler()),('NB', GaussianNB())])))
pipelines.append(('ScaledRF', Pipeline([('Scaler', StandardScaler()),('RF', RandomForestClassifier())])))

In [31]:
pipelines

[('ScaledLR', Pipeline(memory=None,
       steps=[('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('LR', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=None, solver='warn',
            tol=0.0001, verbose=0, warm_start=False))])),
 ('ScaledSVC', Pipeline(memory=None,
       steps=[('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('SVC', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False))])),
 ('ScaledLDA', Pipeline(memory=None,
       steps=[('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('LDA', LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                so

In [34]:
scores = []

for name, model in pipelines:
    
    cv_fits = model.fit(X_train,y_train)
    score = model.score(X_test, y_test)
    
    
    scores.append(score)
    msg = "%s: score= %f" % (name, score)
    print(msg)
    


ScaledLR: score= 0.840326
ScaledSVC: score= 0.839161
ScaledLDA: score= 0.838772
ScaledKNN: score= 0.820124
ScaledCART: score= 0.740093
ScaledNB: score= 0.772727
ScaledRF: score= 0.832168


In [38]:
# ensemble 과 grid search를 활용한 model selection

param_grid = [
{'classifier': [SVC()], 'preprocessing': [StandardScaler()],
'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},
{'classifier': [RandomForestClassifier(n_estimators=100)],
'preprocessing': [None], 'classifier__max_features': [1, 2, 3]}]


pipe = Pipeline([('preprocessing', StandardScaler()),('classifier', SVC())])
pipe2 = Pipeline([('preprocessing', StandardScaler()),('classifier',RandomForestClassifier())])
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
grid2 = GridSearchCV(pipe2, param_grid, cv=5)
grid2.fit(X_train, y_train)
print(" SVC best_param:\n{}\n".format(grid.best_params_))
print(" SVC best_score: {:.2f}".format(grid.best_score_))
print(" SVC score: {:.2f}".format(grid.score(X_test, y_test)))

print("RF best_param :\n{}\n".format(grid2.best_params_))
print("RF best_score : {:.2f}".format(grid2.best_score_))
print("RF score : {:.2f}".format(grid2.score(X_test, y_test)))

 SVC best_param:
{'classifier': SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), 'classifier__C': 1, 'classifier__gamma': 0.1, 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

 SVC best_score: 0.84
 SVC score: 0.84
RF best_param :
{'classifier': SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), 'classifier__C': 1, 'classifier__gamma': 0.1, 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

RF best_score : 0.84
RF score : 0.84


In [39]:
# final test score - test data
loan_test = pd.read_csv('loans_ts.csv')
loan_test.head()

Unnamed: 0.1,Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,8150,0,small_business,0.1299,505.34,11.407565,10.24,672,2250.0,4162,68.4,14,0,1,1
1,7086,1,credit_card,0.1062,618.64,11.289782,13.29,737,5070.0,14462,36.1,1,0,0,0
2,9374,0,all_other,0.1531,139.27,10.555813,8.34,667,3810.041667,1151,9.8,6,0,0,0
3,7439,1,home_improvement,0.157,280.09,10.874266,7.36,662,4770.0,10973,87.8,0,1,0,1
4,5345,1,credit_card,0.1531,870.39,11.727372,16.65,707,10590.0,40413,85.6,0,0,0,0


In [42]:
loan_test = loan_test.drop('Unnamed: 0', axis =1)


KeyError: "['Unnamed: 0'] not found in axis"

In [43]:
loan_test = pd.get_dummies(loan_test)

In [44]:
loan_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
credit.policy                 1000 non-null int64
int.rate                      1000 non-null float64
installment                   1000 non-null float64
log.annual.inc                1000 non-null float64
dti                           1000 non-null float64
fico                          1000 non-null int64
days.with.cr.line             1000 non-null float64
revol.bal                     1000 non-null int64
revol.util                    1000 non-null float64
inq.last.6mths                1000 non-null int64
delinq.2yrs                   1000 non-null int64
pub.rec                       1000 non-null int64
not.fully.paid                1000 non-null int64
purpose_all_other             1000 non-null uint8
purpose_credit_card           1000 non-null uint8
purpose_debt_consolidation    1000 non-null uint8
purpose_educational           1000 non-null uint8
purpose_home_improvement      10

In [45]:
X_test = loan_test.drop('not.fully.paid', axis =1)
Y_test = loan_test['not.fully.paid']

In [47]:
grid.score(X_test, Y_test)

0.836

In [48]:
grid2.score(X_test, Y_test)

0.836

In [51]:
X_test.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,0,0.1299,505.34,11.407565,10.24,672,2250.0,4162,68.4,14,0,1,0,0,0,0,0,0,1
1,1,0.1062,618.64,11.289782,13.29,737,5070.0,14462,36.1,1,0,0,0,1,0,0,0,0,0
2,0,0.1531,139.27,10.555813,8.34,667,3810.041667,1151,9.8,6,0,0,1,0,0,0,0,0,0
3,1,0.157,280.09,10.874266,7.36,662,4770.0,10973,87.8,0,1,0,0,0,0,0,1,0,0
4,1,0.1531,870.39,11.727372,16.65,707,10590.0,40413,85.6,0,0,0,0,1,0,0,0,0,0
