In [60]:
# SET PARAMETER
# Name of the evaluation metric handed to get_model_score() to rank the pipelines.
# Names that function handles: 'auc', 'f1', 'precision', 'accuracy', 'recall'.
SCORE = 'precision'

In [61]:
# The following is the base code, before any extension.
# import basic APIs
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score, roc_auc_score

# Load the sample data used to train the models: features go into X,
# the target goes into a single-column frame y.
dataset = load_breast_cancer()
X = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
y = pd.DataFrame(data=dataset.target, columns=['y'])

# Hold-out validation: keep 20% of the rows aside as the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=1,
)

# One pipeline per candidate algorithm (six in total). Every pipeline
# standardises the features ('scl') before the estimator step ('est').
pipe_knn = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('est', KNeighborsClassifier()),
])
pipe_logistic = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('est', LogisticRegression(random_state=1)),
])
pipe_rf = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('est', RandomForestClassifier(random_state=1)),
])
pipe_gbc = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('est', GradientBoostingClassifier(random_state=1)),
])
pipe_mlpc = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('est', MLPClassifier(max_iter=500, random_state=1)),
])
pipe_svc = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('est', LinearSVC(random_state=1)),
])



# Evaluation function
def get_model_score(y_true, X_test, pipeline, score_type):
    """Score a fitted pipeline on held-out data with the requested metric.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels for ``X_test``.
    X_test : array-like or DataFrame
        Features to predict on; passed straight to the pipeline.
    pipeline : fitted estimator / Pipeline
        Must provide ``predict``. For ``'auc'`` it must provide either
        ``predict_proba`` or ``decision_function``.
    score_type : str
        One of ``'auc'``, ``'f1'``, ``'precision'``, ``'accuracy'``,
        ``'recall'``.

    Returns
    -------
    The metric value, or ``None`` when ``score_type`` is not recognised.
    """
    if score_type == 'auc':
        # AUC needs a continuous ranking score rather than hard labels.
        # Margin-only models such as LinearSVC expose no predict_proba, so
        # fall back to decision_function instead of raising AttributeError.
        if hasattr(pipeline, 'predict_proba'):
            y_score = pipeline.predict_proba(X_test)[:, 1]
        else:
            y_score = pipeline.decision_function(X_test)
        return roc_auc_score(y_true, y_score)

    # The remaining metrics all compare hard label predictions.
    if score_type == 'f1':
        metric = f1_score
    elif score_type == 'precision':
        metric = precision_score
    elif score_type == 'accuracy':
        metric = accuracy_score
    elif score_type == 'recall':
        metric = recall_score
    else:
        # Unknown metric name: keep the original contract and return None.
        return None
    return metric(y_true, pipeline.predict(X_test))

# Fit & Evaluation
# Train every candidate pipeline on the training split, score it on the
# hold-out split with the metric selected by SCORE, and collect the scores
# in the same order as pipe_lines.
pipe_names = ['pipe_knn', 'pipe_logistic', 'pipe_rf', 'pipe_gbc', 'pipe_mlpc', 'pipe_svc']
pipe_lines = [pipe_knn, pipe_logistic, pipe_rf, pipe_gbc, pipe_mlpc, pipe_svc]
model_score = []
for (i, pipe) in enumerate(pipe_lines):
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
    # ndarray on old and new pandas. ravel() flattens the single-column label
    # frame into the 1-D array the estimators expect.
    pipe.fit(X_train, y_train.values.ravel())
    score = get_model_score(y_test.values.ravel(), X_test, pipe, SCORE)
    print(i, ':', pipe_names[i], '-->', score)
    model_score.append(score)
# Added training and evaluation of the pipelines.
# Evaluation is carried out under the specified metric (SCORE).

0 : pipe_knn --> 0.935064935064935
1 : pipe_logistic --> 0.972972972972973
2 : pipe_rf --> 0.9459459459459459
3 : pipe_gbc --> 0.9473684210526315
4 : pipe_mlpc --> 0.9473684210526315
5 : pipe_svc --> 0.9726027397260274


In [62]:
# Get the best model(s) and save
# Reference: http://oppython.hatenablog.com/entry/2016/05/31/205746
# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the
# standalone joblib package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Compute the maximum once rather than once per element; every pipeline
# tied for the top score is kept.
best_score = max(model_score)
index_at_max_score = [i for i, x in enumerate(model_score) if x == best_score]
print(index_at_max_score)
for index in index_at_max_score:
    # Refit the winning pipeline on the full dataset before persisting it.
    # as_matrix() was removed in pandas 1.0; .values is the portable equivalent.
    pipe_lines[index].fit(X, y.values.ravel())
    # Written to '<pipeline name>.pkl' in the current working directory.
    joblib.dump(pipe_lines[index], pipe_names[index] + '.pkl')


[1]
