In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


%matplotlib inline

In [22]:
loans = pd.read_csv('loans_tr.csv')

In [1]:
# loans_tr 데이터를 load

In [23]:
loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8578 entries, 0 to 8577
Data columns (total 15 columns):
Unnamed: 0           8578 non-null int64
credit.policy        8578 non-null int64
purpose              8578 non-null object
int.rate             8578 non-null float64
installment          8578 non-null float64
log.annual.inc       8578 non-null float64
dti                  8578 non-null float64
fico                 8578 non-null int64
days.with.cr.line    8578 non-null float64
revol.bal            8578 non-null int64
revol.util           8578 non-null float64
inq.last.6mths       8578 non-null int64
delinq.2yrs          8578 non-null int64
pub.rec              8578 non-null int64
not.fully.paid       8578 non-null int64
dtypes: float64(6), int64(8), object(1)
memory usage: 1005.3+ KB


In [24]:
loans.head()

Unnamed: 0.1,Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [25]:
pd.get_dummies(loans['purpose'], prefix = 'purpose').head()

Unnamed: 0,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0


In [26]:
pd.get_dummies(loans['purpose'], prefix = 'purpose').columns[1:]

Index(['purpose_credit_card', 'purpose_debt_consolidation',
       'purpose_educational', 'purpose_home_improvement',
       'purpose_major_purchase', 'purpose_small_business'],
      dtype='object')

In [27]:
for i, col in enumerate(pd.get_dummies(loans['purpose'], prefix = 'purpose').columns[1:]):
    loans[col] = pd.get_dummies(loans['purpose'], prefix = 'purpose')[pd.get_dummies(loans['purpose'], prefix = 'purpose').columns[i+1]]

In [28]:
loans.head()

Unnamed: 0.1,Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,...,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,...,0,0,0,0,0,1,0,0,0,0
1,1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,...,0,0,0,0,1,0,0,0,0,0
2,2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,...,1,0,0,0,0,1,0,0,0,0
3,3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,...,1,0,0,0,0,1,0,0,0,0
4,4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,...,0,1,0,0,1,0,0,0,0,0


In [29]:
del loans['purpose']

In [30]:
loans.head()

Unnamed: 0.1,Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,1,0,0,0,0
1,1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,1,0,0,0,0,0
2,2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0,1,0,0,0,0
3,3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0,1,0,0,0,0
4,4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,1,0,0,0,0,0


In [31]:
del loans[loans.columns[0]]

In [32]:
loans.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,1,0,0,0,0,0


In [33]:
loans.describe()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
count,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0,8578.0
mean,0.803567,0.122617,318.093373,10.932431,12.622325,710.749709,4547.756538,17065.79,46.757076,1.571695,0.164607,0.062252,0.159594,0.133248,0.4135,0.035906,0.064817,0.046048,0.063768
std,0.397323,0.026817,206.99427,0.615859,6.881663,38.088227,2494.931515,34504.48,29.00033,2.197341,0.550745,0.264656,0.366251,0.339862,0.49249,0.186066,0.246217,0.209601,0.244353
min,0.0,0.06,15.67,7.547502,0.0,612.0,178.958333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.1039,163.57,10.555813,7.2425,682.0,2792.78125,3193.25,22.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.1221,267.74,10.928884,12.66,707.0,4110.041667,8690.0,46.2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.1407,430.75,11.294022,17.96,737.0,5729.958333,18433.75,70.9,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,1.0,0.2164,940.14,14.528354,29.96,827.0,17639.95833,1207359.0,119.0,33.0,13.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [2]:
# Exploratory Data Aanalysis

In [34]:
loans.columns

Index(['credit.policy', 'int.rate', 'installment', 'log.annual.inc', 'dti',
       'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'not.fully.paid',
       'purpose_credit_card', 'purpose_debt_consolidation',
       'purpose_educational', 'purpose_home_improvement',
       'purpose_major_purchase', 'purpose_small_business'],
      dtype='object')

In [35]:
loans.columns[12]

'not.fully.paid'

In [42]:
y_data = loans[loans.columns[12]]

In [43]:
loans.columns[0:12], loans.columns[13:]

(Index(['credit.policy', 'int.rate', 'installment', 'log.annual.inc', 'dti',
        'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
        'inq.last.6mths', 'delinq.2yrs', 'pub.rec'],
       dtype='object'),
 Index(['purpose_credit_card', 'purpose_debt_consolidation',
        'purpose_educational', 'purpose_home_improvement',
        'purpose_major_purchase', 'purpose_small_business'],
       dtype='object'))

In [44]:
X_col = []

for c in loans.columns[0:12]:
    X_col.append(c)
for b in loans.columns[13:]:
    X_col.append(b)

X_col

['credit.policy',
 'int.rate',
 'installment',
 'log.annual.inc',
 'dti',
 'fico',
 'days.with.cr.line',
 'revol.bal',
 'revol.util',
 'inq.last.6mths',
 'delinq.2yrs',
 'pub.rec',
 'purpose_credit_card',
 'purpose_debt_consolidation',
 'purpose_educational',
 'purpose_home_improvement',
 'purpose_major_purchase',
 'purpose_small_business']

In [45]:
X_data = loans[X_col]

In [46]:
X_data.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,1,0,0,0,0,0


In [126]:
# Object type data 

In [None]:
# X_data, y_data division

In [47]:
from sklearn.model_selection import train_test_split

In [54]:
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data)

In [None]:
# train data, test data division

In [57]:
from sklearn.svm import SVC

In [58]:
LR = LogisticRegression()
SVC = SVC()
KNN = KNeighborsClassifier()
CART = DecisionTreeClassifier()
NB = GaussianNB()
RF = RandomForestClassifier()

In [59]:
models = [('LR', LR), ('SVC', SVC), ('KNN', KNN), ('CART', CART), ('NB', NB), ('RF', RF)]

In [60]:
import warnings
warnings.simplefilter('ignore')

models

[('LR',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=None, solver='warn',
            tol=0.0001, verbose=0, warm_start=False)),
 ('SVC', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)),
 ('KNN',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=None, n_neighbors=5, p=2,
             weights='uniform')),
 ('CART',
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_

In [None]:
# Compare Algorithms - model selection

In [145]:
import warnings
warnings.simplefilter('ignore')

models

[('LR',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=None, solver='warn',
            tol=0.0001, verbose=0, warm_start=False)),
 ('SVC', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)),
 ('KNN',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=None, n_neighbors=5, p=2,
             weights='uniform')),
 ('CART',
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_

In [68]:
scores = []
for model_name, model in models:
    model.fit(X_train, y_train)
    scores.append( ( model_name, model.score(X_val, y_val) ) )

print(scores)

[('LR', 0.8368298368298368), ('SVC', 0.8368298368298368), ('KNN', 0.8223776223776224), ('CART', 0.7375291375291375), ('NB', 0.8247086247086247), ('RF', 0.8293706293706293)]


In [70]:
from sklearn.svm import SVC

In [80]:
ScaledLR = Pipeline([('Scaler', StandardScaler()), ("LR", LogisticRegression())])
ScaledSVC = Pipeline([('Scaler', StandardScaler()), ("SVC", SVC())])
ScaledKNN = Pipeline([('Scaler', StandardScaler()), ("KNN", KNeighborsClassifier())])
ScaledCART = Pipeline([('Scaler', StandardScaler()), ("CART",DecisionTreeClassifier())])
ScaledNB = Pipeline([('Scaler', StandardScaler()), ("NB",GaussianNB())])
ScaledRF = Pipeline([('Scaler', StandardScaler()), ("RF",RandomForestClassifier())])

In [81]:
pipelines = [('ScaledLR', ScaledLR), ('ScaledSVC', ScaledSVC), ('ScaledKNN', ScaledKNN), ('ScaledCART', ScaledCART), ('ScaledNB', ScaledNB), ('ScaledRF', ScaledRF)]

In [None]:
# cross validaton을 통해 optimal model selection

In [3]:
# pipeline을 활용해서 scaling하고 optimal model selection

In [82]:
pipelines

[('ScaledLR', Pipeline(memory=None,
       steps=[('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('LR', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=None, solver='warn',
            tol=0.0001, verbose=0, warm_start=False))])),
 ('ScaledSVC', Pipeline(memory=None,
       steps=[('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('SVC', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False))])),
 ('ScaledKNN', Pipeline(memory=None,
       steps=[('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('KNN', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_

In [149]:
pipelines

[('ScaledLR', Pipeline(memory=None,
       steps=[('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('LR', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=None, solver='warn',
            tol=0.0001, verbose=0, warm_start=False))])),
 ('ScaledSVC', Pipeline(memory=None,
       steps=[('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('SVC', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False))])),
 ('ScaledKNN', Pipeline(memory=None,
       steps=[('Scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('KNN', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_

In [83]:
scores_s = []
for model_name, model in pipelines:
    model.fit(X_train, y_train)
    scores_s.append( ( model_name, model.score(X_val, y_val) ) )

print(scores_s)

[('ScaledLR', 0.8386946386946387), ('ScaledSVC', 0.8382284382284382), ('ScaledKNN', 0.8228438228438228), ('ScaledCART', 0.7277389277389278), ('ScaledNB', 0.7892773892773893), ('ScaledRF', 0.8335664335664336)]


In [90]:
from sklearn.svm import SVC

In [91]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', LogisticRegression())])

In [95]:
param_grid = [
    {'classifier': [LogisticRegression()], 'preprocessing': [StandardScaler()],
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    {'classifier': [SVC()], 'preprocessing': [StandardScaler()],
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100], 'classifier__gamma' : [0.001, 0.01, 0.1, 1, 10, 100] },
    {'classifier': [KNeighborsClassifier()], 'preprocessing': [StandardScaler()],
    'classifier__n_neighbors': [2, 4, 8, 16, 32, 64]},
    {'classifier': [DecisionTreeClassifier()], 'preprocessing': [StandardScaler()],
    'classifier__max_depth': [4, 8, 16, 32, 64, 128]},
    {'classifier': [GaussianNB()], 'preprocessing': [StandardScaler()]},
    {'classifier': [RandomForestClassifier()], 'preprocessing': [StandardScaler()],
    'classifier__n_estimators': [5, 10, 20, 40, 80, 160], 'classifier__max_depth': [2, 4, 8, 16, 32, 64]}
]

In [96]:
grid = GridSearchCV(pipe, param_grid, cv=5)

In [97]:
grid.fit(X_data, y_data)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'classifier': [LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)], 'prepr...lassifier__n_estimators': [5, 10, 20, 40, 80, 160], 'classifier__max_depth': [2, 4, 8, 16, 32, 64]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
     

In [98]:
grid.best_params_

{'classifier': SVC(C=0.001, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False),
 'classifier__C': 0.001,
 'classifier__gamma': 0.001,
 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

In [99]:
grid.best_score_

0.8404056889717882

In [100]:
pipe_SVC = Pipeline([('preprocessing', StandardScaler()), ('classifier', SVC())])

param_grid_SVC = [{'preprocessing': [StandardScaler(), MinMaxScaler()], 'classifier': [SVC()], 'classifier__C': [0.001, 0.0005, 0.0001, 0.00005, 0.00001], 'classifier__gamma' : [0.001, 0.0005, 0.0001, 0.00005, 0.00001]}]

grid_SVC = GridSearchCV(pipe_SVC, param_grid, cv=5)

grid_SVC.fit(X_data, y_data)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'classifier': [LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)], 'prepr...lassifier__n_estimators': [5, 10, 20, 40, 80, 160], 'classifier__max_depth': [2, 4, 8, 16, 32, 64]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [101]:
grid_SVC.best_score_

0.8404056889717882

In [102]:
grid_SVC.best_params_

{'classifier': SVC(C=0.001, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False),
 'classifier__C': 0.001,
 'classifier__gamma': 0.001,
 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

In [114]:
test = pd.read_csv('loans_ts.csv')

In [115]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
Unnamed: 0           1000 non-null int64
credit.policy        1000 non-null int64
purpose              1000 non-null object
int.rate             1000 non-null float64
installment          1000 non-null float64
log.annual.inc       1000 non-null float64
dti                  1000 non-null float64
fico                 1000 non-null int64
days.with.cr.line    1000 non-null float64
revol.bal            1000 non-null int64
revol.util           1000 non-null float64
inq.last.6mths       1000 non-null int64
delinq.2yrs          1000 non-null int64
pub.rec              1000 non-null int64
not.fully.paid       1000 non-null int64
dtypes: float64(6), int64(8), object(1)
memory usage: 117.3+ KB


In [116]:
test.head()

Unnamed: 0.1,Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,8150,0,small_business,0.1299,505.34,11.407565,10.24,672,2250.0,4162,68.4,14,0,1,1
1,7086,1,credit_card,0.1062,618.64,11.289782,13.29,737,5070.0,14462,36.1,1,0,0,0
2,9374,0,all_other,0.1531,139.27,10.555813,8.34,667,3810.041667,1151,9.8,6,0,0,0
3,7439,1,home_improvement,0.157,280.09,10.874266,7.36,662,4770.0,10973,87.8,0,1,0,1
4,5345,1,credit_card,0.1531,870.39,11.727372,16.65,707,10590.0,40413,85.6,0,0,0,0


In [117]:
test.describe()

Unnamed: 0.1,Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,4702.098,0.817,0.122841,327.63345,10.929426,12.47247,711.675,4672.372625,15611.572,47.16088,1.627,0.156,0.061,0.164
std,2756.941306,0.38686,0.027118,207.638384,0.60606,6.90572,36.953822,2512.498319,26460.949572,29.147029,2.225494,0.505885,0.23945,0.370461
min,22.0,0.0,0.06,25.02,8.29405,0.0,642.0,181.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2367.0,1.0,0.1025,168.09,10.571317,6.8725,682.0,2961.760417,3169.25,22.2,0.0,0.0,0.0,0.0
50%,4654.5,1.0,0.1229,283.385,10.927721,12.69,707.0,4289.979166,8071.5,47.35,1.0,0.0,0.0,0.0
75%,7096.75,1.0,0.1411,450.74,11.289794,17.855,737.0,5790.0,17032.5,71.05,2.0,0.0,0.0,0.0
max,9577.0,1.0,0.2121,916.95,13.543702,29.7,822.0,15089.95833,275925.0,106.4,18.0,5.0,1.0,1.0


In [118]:
pd.get_dummies(test['purpose'], prefix = 'purpose').head()

Unnamed: 0,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0
3,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0


In [119]:
pd.get_dummies(test['purpose'], prefix = 'purpose').columns[1:]

Index(['purpose_credit_card', 'purpose_debt_consolidation',
       'purpose_educational', 'purpose_home_improvement',
       'purpose_major_purchase', 'purpose_small_business'],
      dtype='object')

In [120]:
for i, col in enumerate(pd.get_dummies(test['purpose'], prefix = 'purpose').columns[1:]):
    test[col] = pd.get_dummies(test['purpose'], prefix = 'purpose')[pd.get_dummies(test['purpose'], prefix = 'purpose').columns[i+1]]

In [121]:
test.head()

Unnamed: 0.1,Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,...,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,8150,0,small_business,0.1299,505.34,11.407565,10.24,672,2250.0,4162,...,14,0,1,1,0,0,0,0,0,1
1,7086,1,credit_card,0.1062,618.64,11.289782,13.29,737,5070.0,14462,...,1,0,0,0,1,0,0,0,0,0
2,9374,0,all_other,0.1531,139.27,10.555813,8.34,667,3810.041667,1151,...,6,0,0,0,0,0,0,0,0,0
3,7439,1,home_improvement,0.157,280.09,10.874266,7.36,662,4770.0,10973,...,0,1,0,1,0,0,0,1,0,0
4,5345,1,credit_card,0.1531,870.39,11.727372,16.65,707,10590.0,40413,...,0,0,0,0,1,0,0,0,0,0


In [122]:
del test[test.columns[0]]

In [123]:
del test['purpose']

In [124]:
test.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,0,0.1299,505.34,11.407565,10.24,672,2250.0,4162,68.4,14,0,1,1,0,0,0,0,0,1
1,1,0.1062,618.64,11.289782,13.29,737,5070.0,14462,36.1,1,0,0,0,1,0,0,0,0,0
2,0,0.1531,139.27,10.555813,8.34,667,3810.041667,1151,9.8,6,0,0,0,0,0,0,0,0,0
3,1,0.157,280.09,10.874266,7.36,662,4770.0,10973,87.8,0,1,0,1,0,0,0,1,0,0
4,1,0.1531,870.39,11.727372,16.65,707,10590.0,40413,85.6,0,0,0,0,1,0,0,0,0,0


In [125]:
test.describe()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.817,0.122841,327.63345,10.929426,12.47247,711.675,4672.372625,15611.572,47.16088,1.627,0.156,0.061,0.164,0.119,0.41,0.035,0.073,0.042,0.072
std,0.38686,0.027118,207.638384,0.60606,6.90572,36.953822,2512.498319,26460.949572,29.147029,2.225494,0.505885,0.23945,0.370461,0.323951,0.492079,0.183872,0.260267,0.20069,0.258617
min,0.0,0.06,25.02,8.29405,0.0,642.0,181.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.1025,168.09,10.571317,6.8725,682.0,2961.760417,3169.25,22.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.1229,283.385,10.927721,12.69,707.0,4289.979166,8071.5,47.35,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.1411,450.74,11.289794,17.855,737.0,5790.0,17032.5,71.05,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,1.0,0.2121,916.95,13.543702,29.7,822.0,15089.95833,275925.0,106.4,18.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [126]:
test.columns

Index(['credit.policy', 'int.rate', 'installment', 'log.annual.inc', 'dti',
       'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'not.fully.paid',
       'purpose_credit_card', 'purpose_debt_consolidation',
       'purpose_educational', 'purpose_home_improvement',
       'purpose_major_purchase', 'purpose_small_business'],
      dtype='object')

In [127]:
test.columns[12]

'not.fully.paid'

In [128]:
y_test = test[test.columns[12]]

In [129]:
X_col_test = []

In [139]:
for c in test.columns[0:12]:
    X_col_test.append(c)
for b in test.columns[13:]:
    X_col_test.append(b)

In [142]:
X_col_test

['credit.policy',
 'int.rate',
 'installment',
 'log.annual.inc',
 'dti',
 'fico',
 'days.with.cr.line',
 'revol.bal',
 'revol.util',
 'inq.last.6mths',
 'delinq.2yrs',
 'pub.rec',
 'purpose_credit_card',
 'purpose_debt_consolidation',
 'purpose_educational',
 'purpose_home_improvement',
 'purpose_major_purchase',
 'purpose_small_business']

In [143]:
X_test = test[X_col_test]

In [144]:
grid_SVC.score(X_test,y_test)

0.836

In [4]:
# ensemble 과 grid search를 활용한 model selection

SVC로 모델을 선택하겠습니다.

In [None]:
# final test score - test data