# sklearnで、交差検証ありのコード

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint

from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# データセットを取得

In [2]:
# データの読み込み
iris=load_iris()
X,y=iris.data,iris.target

# 行・列数の確認
print(X.shape)
print(y.shape)

(150, 4)
(150,)


# 交差検証で、foldごとにモデルを訓練

In [3]:
# データ
kf = KFold(n_splits=5, shuffle=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
                      
scores=[]
#scoring = {"acc":"accuracy", "prc": "precision_macro","rec": "recall_macro", "f1":"f1_macro", "auc":"roc_auc"}
scoring = {"acc":"accuracy", "prc": "precision_macro","rec": "recall_macro", "f1":"f1_macro"}


# KFoldで、分割して作成されたデータセットのインデックスを取得できる
for train_id, test_id in kf.split(X):
    train_x = X[train_id]
    train_y = y[train_id]
    test_x = X[test_id]
    test_y = y[test_id]
    
    print("==="*40)
    # モデルを設定
    clf=DecisionTreeClassifier(max_depth=3, random_state=94)
    # モデルの学習
    clf.fit(train_x, train_y)
    
    # 推論
    pred_y = clf.predict(test_x)
    # 精度指標の取得
    #scores_cv = cross_validate(clf, iris.data, iris.target, cv=skf, scoring=scoring)
    scores_cv = cross_validate(clf, X=train_x, y=train_y, cv=kf, scoring=scoring)
    pprint(scores_cv)
    
    score=accuracy_score(test_y, pred_y)
    scores.append(score)

#
print("==="*40)
print("\n")

scores = np.array(scores)
print(scores.mean())

{'fit_time': array([0.        , 0.00099993, 0.        , 0.        , 0.00099993]),
 'score_time': array([0.0019989 , 0.00100064, 0.00200009, 0.00200033, 0.00099969]),
 'test_acc': array([0.95833333, 0.91666667, 1.        , 0.95833333, 0.875     ]),
 'test_f1': array([0.95681511, 0.91111111, 1.        , 0.95213675, 0.9047619 ]),
 'test_prc': array([0.95238095, 0.91111111, 1.        , 0.95238095, 0.91666667]),
 'test_rec': array([0.96666667, 0.91111111, 1.        , 0.95833333, 0.91666667])}
{'fit_time': array([0.        , 0.        , 0.        , 0.        , 0.00100017]),
 'score_time': array([0.00199986, 0.00199962, 0.00200009, 0.00200033, 0.00099993]),
 'test_acc': array([0.91666667, 0.95833333, 0.91666667, 1.        , 0.91666667]),
 'test_f1': array([0.92207792, 0.95213675, 0.91071429, 1.        , 0.90740741]),
 'test_prc': array([0.94444444, 0.95238095, 0.92592593, 1.        , 0.9047619 ]),
 'test_rec': array([0.91666667, 0.95833333, 0.91666667, 1.        , 0.93333333])}
{'fit_time': a

# Pycaretと組み合わせる

In [4]:
#!pip install pycaret

In [16]:
# パッケージの読み込み
import pandas as pd
from pycaret.classification import *
#from pycaret.regression import *
from pycaret.datasets import get_data

#boston = get_data('boston')
#exp1 = setup(boston_data, target = 'medv')

# 使用するデータの読み込み 
from pycaret.datasets import get_data 
data = get_data('employee')

# 95％を学習データ、5%をテストデータ(Unseen Dataと呼ぶ)に分ける
employee_data = data.sample(frac =0.95, random_state = 786).reset_index(drop=True)
employee_data_unseen = data.drop(employee_data.index).reset_index(drop=True)
print('Data for Modeling: ' + str(employee_data.shape))
print('Unseen Data For Predictions: ' + str(employee_data_unseen.shape))

In [17]:
df_x = pd.DataFrame(X)
df_ = pd.concat([df_x, pd.DataFrame(y)])

In [18]:
df_.head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [19]:
iris = load_iris()
X_a = pd.DataFrame(iris.data, columns=iris.feature_names)
y_a = pd.DataFrame(iris.target, columns=["target"])
df_a = pd.concat([X_a,y_a], axis=1)
df_a.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [20]:
# PyCaretを起動
exp1 = setup(df_a, target = 'target')
#exp1 = setup(df_x, target=df_y, ignore_features = None)
# PyCaretを起動（データ型を変更する場合）
#exp1 = setup(employee_data, target = 'left', ignore_features = None, numeric_features = ['time_spend_company'])

Unnamed: 0,Description,Value
0,session_id,4434
1,Target,target
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(150, 5)"
5,Missing Values,False
6,Numeric Features,4
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [21]:
# モデルの比較
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.026
qda,Quadratic Discriminant Analysis,0.9909,1.0,0.9917,0.9927,0.9908,0.9862,0.9873,0.003
lda,Linear Discriminant Analysis,0.9909,1.0,0.9917,0.9927,0.9908,0.9862,0.9873,0.003
dt,Decision Tree Classifier,0.99,0.9917,0.9889,0.992,0.9896,0.9846,0.986,0.003
ada,Ada Boost Classifier,0.9818,1.0,0.9833,0.9855,0.9815,0.9725,0.9747,0.013
lr,Logistic Regression,0.9809,0.9969,0.9806,0.9847,0.9803,0.9709,0.9733,0.007
knn,K Neighbors Classifier,0.9809,0.9845,0.9806,0.9847,0.9803,0.9709,0.9733,0.006
rf,Random Forest Classifier,0.9718,0.9969,0.9722,0.9775,0.9711,0.9571,0.9607,0.036
et,Extra Trees Classifier,0.9718,1.0,0.9722,0.9775,0.9711,0.9571,0.9607,0.031
lightgbm,Light Gradient Boosting Machine,0.9618,0.9948,0.9667,0.9705,0.9615,0.9426,0.9474,0.056


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=4434, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)