In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


%matplotlib inline

In [None]:
# Load the loans training data (loans_tr.csv) and preview the first rows
loans = pd.read_csv('loans_tr.csv')
loans.head()

In [None]:
# Exploratory Data Analysis: print the column dtypes and non-null counts
loans.info()

In [None]:
# Object-typed columns: convert them to indicator (dummy) columns with
# pd.get_dummies, then re-inspect the resulting frame.
loans_clean = pd.get_dummies(loans)
loans_clean.info()

In [None]:
from sklearn.model_selection import train_test_split

# Features = every column except the target 'not.fully.paid'.
# 'Unnamed: 0' is excluded here as well: the notebook only deletes it from
# loans_clean in a later cell, which would leave it inside x as a feature.
# NOTE(review): 'Unnamed: 0' is presumably a saved row index — confirm.
# errors='ignore' keeps this cell re-runnable if the column is already gone.
x = loans_clean.drop(['not.fully.paid', 'Unnamed: 0'], axis=1, errors='ignore')
y = loans_clean['not.fully.paid']

# 70/30 hold-out split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=101)

In [None]:
# Remove the 'Unnamed: 0' column from the encoded frame.
# drop(..., errors='ignore') makes the cell idempotent: re-running it no longer
# raises KeyError once the column has already been removed.
loans_clean = loans_clean.drop('Unnamed: 0', axis=1, errors='ignore')
loans_clean.head()

In [None]:
# Compare algorithms - model selection.
# Each entry pairs a short label with an untrained classifier using default
# hyperparameters.
models = [
    ('LR', LogisticRegression()),
    ('SVC', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('RF', RandomForestClassifier()),
]

In [None]:
import warnings
# Ignore every warning from this point on.
# NOTE(review): this also hides deprecation and convergence warnings raised
# while fitting the models — consider a narrower filter.
warnings.simplefilter('ignore')

# Display the list of (label, estimator) candidates
models

In [None]:
# Select the best baseline model via 5-fold cross-validated accuracy.
# `results` collects the per-fold score arrays and `names` the matching labels,
# both in the same order as `models`.
results, names = [], []
for name, model in models:
    score = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=5)
    names.append(name)
    results.append(score)
    # label, mean fold accuracy, standard deviation across folds
    print(name, ':', score.mean(), score.std())
    

In [None]:
# Box plot of the per-fold accuracies, one box per entry in `names`
fig, ax = plt.subplots()
ax.boxplot(results)
ax.set_xticklabels(names)

In [None]:
# Use pipelines to scale the features before model selection.
# Every candidate is wrapped as StandardScaler -> classifier; the pipeline is
# labelled 'Scaled<step>' and the classifier step keeps the short label.
pipelines = [
    ('Scaled' + step, Pipeline([('scaler', StandardScaler()), (step, estimator)]))
    for step, estimator in [
        ('LR', LogisticRegression()),
        ('SVC', SVC()),
        ('KNN', KNeighborsClassifier()),
        ('DT', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('RF', RandomForestClassifier()),
    ]
]


In [None]:
# Display the list of (label, Pipeline) candidates
pipelines

In [None]:
# Select the best model via 5-fold cross-validated accuracy, this time on the
# scaled pipelines. The loop must iterate over `pipelines`: iterating over
# `models` would only repeat the unscaled baseline evaluation and leave the
# StandardScaler pipelines unused.
results = []
names = []
for name, model in pipelines:
    names.append(name)
    # The scaler is part of the pipeline, so cross_val_score fits it together
    # with the classifier on each training fold.
    score = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')
    results.append(score)
    # label, mean fold accuracy, standard deviation across folds
    print(name, ':', score.mean(), score.std())

In [None]:
# Box plot of the per-fold accuracies from the most recent cross-validation
# run, one box per entry in `names`
fig, ax = plt.subplots()
ax.boxplot(results)
ax.set_xticklabels(names)

In [None]:
# SVC: hyperparameter tuning with grid search

In [None]:
# Grid search over the SVC's C and gamma (same five candidate values for each),
# scored by 5-fold cross-validated accuracy.
model = SVC()
params = {key: [0.01, 0.1, 1, 10, 100] for key in ('C', 'gamma')}
# fit() returns the fitted search object, so the call can be chained.
grid = GridSearchCV(model, param_grid=params, scoring='accuracy', cv=5).fit(x_train, y_train)

In [None]:
# Best mean cross-validated accuracy found by the SVC grid search
grid.best_score_

In [None]:
# C / gamma combination that achieved the best cross-validated score
grid.best_params_

In [None]:
# NOTE(review): C and gamma are hard-coded — presumably copied from
# grid.best_params_ of an earlier run; confirm they still match.
svc = SVC(C= 0.01,gamma= 0.01)

In [None]:
# Fit the SVC on the training split
svc.fit(x_train,y_train)

In [None]:
# Score the fitted SVC on the hold-out split (x_test / y_test)
svc.score(x_test, y_test)

In [None]:
# Model selection using ensembles and grid search
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over decision trees. n_estimators and learning_rate given here are
# only starting values: both are overridden by the grid below.
adab = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    learning_rate=0.1,
)

# 'base_estimator__*' entries tune the wrapped tree; the others tune AdaBoost.
params = dict(
    base_estimator__criterion=['gini', 'entropy'],
    base_estimator__max_features=[7, 8],
    base_estimator__max_depth=[3, 5, 7],
    n_estimators=[20, 50, 100],
    learning_rate=[0.4, 0.6, 0.8],
)

# 5-fold grid search with the estimator's default scorer
grid_adab = GridSearchCV(estimator=adab, param_grid=params, cv=5)
grid_adab.fit(x_train, y_train)


In [None]:
# Best mean cross-validated score found by the AdaBoost grid search
grid_adab.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score
grid_adab.best_params_

In [None]:
# Final AdaBoost model with explicitly chosen hyperparameters.
# NOTE(review): values are hard-coded — presumably taken from
# grid_adab.best_params_; confirm they still match.
adab = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(criterion='gini', max_depth=3, max_features=7),
    n_estimators=50,
    learning_rate=0.4,
)

In [None]:
# Fit on the training split, then score on the hold-out split
# (fit() returns the fitted estimator, so the two calls can be chained).
adab.fit(x_train, y_train).score(x_test, y_test)

In [None]:

from sklearn.ensemble import GradientBoostingClassifier

# Base gradient-boosting model. learning_rate is the only setting here that the
# grid below does not override.
gbr = GradientBoostingClassifier(
    n_estimators=200,
    subsample=0.5,
    max_depth=2,
    learning_rate=0.55,
)

# Search space; subsample is kept at a single value (0.5).
params = dict(
    n_estimators=[20, 50, 100],
    subsample=[0.5],
    max_depth=[3, 5, 7],
    min_samples_leaf=[3, 5],
    max_features=[0.4, 0.6, 0.8, 1.0],
)

# 5-fold grid search with the estimator's default scorer; n_jobs=-1 runs the
# candidate fits in parallel.
grid_grb = GridSearchCV(estimator=gbr, param_grid=params, cv=5, n_jobs=-1)
grid_grb.fit(x_train, y_train)

In [None]:
# Best mean cross-validated score found by the gradient-boosting grid search
grid_grb.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score
grid_grb.best_params_

In [None]:
# Final gradient-boosting model.
# learning_rate=0.55 and subsample=0.5 are the values every grid-search
# candidate was trained with; they are set explicitly so this model matches the
# cross-validated configuration instead of falling back to library defaults.
# NOTE(review): max_depth, max_features, min_samples_leaf and n_estimators are
# hard-coded — presumably taken from grid_grb.best_params_; confirm.
grb = GradientBoostingClassifier(
    learning_rate=0.55,
    subsample=0.5,
    max_depth=3,
    max_features=0.4,
    min_samples_leaf=3,
    n_estimators=50,
)

In [None]:
# Fit on the training split and print the score on the hold-out split
# (fit() returns the fitted estimator, so the two calls can be chained).
print(grb.fit(x_train, y_train).score(x_test, y_test))

In [None]:
# final test score - test data


In [None]:
# Final evaluation on the separate test file.
loans_ts = pd.read_csv('loans_ts.csv')

# NOTE(review): assumes the last column of loans_ts.csv is the target
# ('not.fully.paid') — confirm against the file.
# A 1-D Series is used for the labels rather than a one-column DataFrame.
y_ts = loans_ts.iloc[:, -1]
x_ts = pd.get_dummies(loans_ts.iloc[:, :-1])

# Align the test features with the training features: same columns, same order.
# Dummy columns missing from the test file are filled with 0 and columns the
# models never saw are dropped.
x_ts = x_ts.reindex(columns=x_train.columns, fill_value=0)

# adab and grb were fitted on the unscaled x_train, so the test features must
# stay unscaled too; standardizing only the test set would feed the models
# values on a different scale than they were trained on.

# No `rfc` is defined in any earlier cell, so a random forest is fitted here to
# keep the cell runnable from a fresh kernel.
# NOTE(review): default hyperparameters with a fixed seed — replace with the
# intended random-forest configuration if there was one.
rfc = RandomForestClassifier(random_state=101).fit(x_train, y_train)

print('rfc :', rfc.score(x_ts, y_ts))
print('adab :', adab.score(x_ts, y_ts))
print('grb :', grb.score(x_ts, y_ts))