In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
from sklearn.utils import all_estimators

# CSV読み込み
csv = pd.read_csv('iris.csv', encoding='utf-8')
print(csv)

SepalLength  SepalWidth  PetalLength  PetalWidth            Name
0            5.1         3.5          1.4         0.2     Iris-setosa
1            4.9         3.0          1.4         0.2     Iris-setosa
2            4.7         3.2          1.3         0.2     Iris-setosa
3            4.6         3.1          1.5         0.2     Iris-setosa
4            5.0         3.6          1.4         0.2     Iris-setosa
..           ...         ...          ...         ...             ...
145          6.7         3.0          5.2         2.3  Iris-virginica
146          6.3         2.5          5.0         1.9  Iris-virginica
147          6.5         3.0          5.2         2.0  Iris-virginica
148          6.2         3.4          5.4         2.3  Iris-virginica
149          5.9         3.0          5.1         1.8  Iris-virginica

[150 rows x 5 columns]


In [5]:
# ラベルとデータに分離
y = csv.loc[:, 'Name']
x = csv.loc[:, ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']]

# 学習データとテストに分離
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, train_size=0.8, shuffle=True)
print(f'train: {len(x_train)}, {len(y_train)}')
print(f'test: {len(x_test)}, {len(y_test)}')

train: 120, 120
test: 30, 30


In [6]:
# 全てのclassifierアルゴリズム
warnings.filterwarnings('ignore')
all = all_estimators(type_filter='classifier')

In [19]:
# 評価
alg_list = []
for name, alg in all:
    try:
        clf = alg()
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        alg_list.append((name, accuracy_score(y_test, y_pred)))
    except Exception as err:
        print(f'{name}: {err}')

alg_list = sorted(alg_list, key=lambda x: x[1], reverse=True)
for name, acc in alg_list:
    print(f'{name} の正解率 = {acc}')

ClassifierChain: __init__() missing 1 required positional argument: 'base_estimator'
MultiOutputClassifier: __init__() missing 1 required positional argument: 'estimator'
OneVsOneClassifier: __init__() missing 1 required positional argument: 'estimator'
OneVsRestClassifier: __init__() missing 1 required positional argument: 'estimator'
OutputCodeClassifier: __init__() missing 1 required positional argument: 'estimator'
StackingClassifier: __init__() missing 1 required positional argument: 'estimators'
VotingClassifier: __init__() missing 1 required positional argument: 'estimators'
LinearDiscriminantAnalysis の正解率 = 1.0
LinearSVC の正解率 = 1.0
LogisticRegression の正解率 = 1.0
LogisticRegressionCV の正解率 = 1.0
QuadraticDiscriminantAnalysis の正解率 = 1.0
BaggingClassifier の正解率 = 0.9666666666666667
CalibratedClassifierCV の正解率 = 0.9666666666666667
CategoricalNB の正解率 = 0.9666666666666667
ExtraTreeClassifier の正解率 = 0.9666666666666667
ExtraTreesClassifier の正解率 = 0.9666666666666667
GaussianNB の正解率 = 0.966

In [21]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np

# クロスバリデーション
kfold = KFold(n_splits=5, shuffle=True)

# 評価
alg_list_cv = []
for name, alg in all:
    try:
        clf = alg()
        # scoreが存在するアルゴリズムだけ
        if hasattr(clf, 'score'):
            scores = cross_val_score(clf, x, y, cv=kfold)
            alg_list_cv.append((name, scores))
    except Exception as err:
        print(f'{name}: {err}')

alg_list_cv = sorted(alg_list_cv, key=lambda x: np.average(x[1]), reverse=True)
alg_list = sorted(alg_list, key=lambda x: x[1], reverse=True)
for name, scores in alg_list_cv:
    print(f'{name} の正解率 = {scores}')

ClassifierChain: __init__() missing 1 required positional argument: 'base_estimator'
MultiOutputClassifier: __init__() missing 1 required positional argument: 'estimator'
OneVsOneClassifier: __init__() missing 1 required positional argument: 'estimator'
OneVsRestClassifier: __init__() missing 1 required positional argument: 'estimator'
OutputCodeClassifier: __init__() missing 1 required positional argument: 'estimator'
StackingClassifier: __init__() missing 1 required positional argument: 'estimators'
VotingClassifier: __init__() missing 1 required positional argument: 'estimators'
LinearDiscriminantAnalysis の正解率 = [0.96666667 0.96666667 0.96666667 1.         1.        ]
LogisticRegressionCV の正解率 = [0.96666667 1.         0.93333333 1.         0.96666667]
MLPClassifier の正解率 = [0.93333333 1.         0.96666667 0.96666667 1.        ]
QuadraticDiscriminantAnalysis の正解率 = [0.96666667 1.         0.93333333 0.96666667 0.96666667]
LogisticRegression の正解率 = [0.93333333 1.         0.96666667 0.9

In [24]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# グリッドサーチ
params = [
    {"C": [1, 10, 100, 1000], 'kernel': ['linear']},
    {"C": [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]},
    {"C": [1, 10, 100, 1000], 'kernel': ['sigmoid'], 'gamma': [0.001, 0.0001]}
]
clf = GridSearchCV(SVC(), params, cv=kfold)
clf.fit(x_train, y_train)
print(f'最適なパラメータ = {clf.best_estimator_}')

最適なパラメータ = SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [25]:
# 最適なパラメータで評価
y_pred = clf.predict(x_test)
print(f'評価時の正解率 = {accuracy_score(y_test, y_pred)}')

評価時の正解率 = 1.0


In [28]:
from sklearn.model_selection import RandomizedSearchCV

# グリッドサーチ
params = [
    {"C": [1, 10, 100, 1000], 'kernel': ['linear']},
    {"C": [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]},
    {"C": [1, 10, 100, 1000], 'kernel': ['sigmoid'], 'gamma': [0.001, 0.0001]}
]
clf = RandomizedSearchCV(SVC(), params, cv=kfold)
clf.fit(x_train, y_train)
print(f'最適なパラメータ = {clf.best_estimator_}')

最適なパラメータ = SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [29]:
# 最適なパラメータで評価
y_pred = clf.predict(x_test)
print(f'評価時の正解率 = {accuracy_score(y_test, y_pred)}')

評価時の正解率 = 0.9666666666666667
