In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [None]:
import platform

# Choose a font that can render the Japanese axis labels used by the plots.
# Previously only Linux and Windows were handled, which left `_FONT` undefined
# (and caused a NameError in the plotting cell) on every other platform.
def select_font(system_name):
    """Return the matplotlib font family to use on the given platform.

    Parameters
    ----------
    system_name : str
        Platform name as returned by ``platform.system()``.

    Returns
    -------
    str
        The font family name for that platform, or the generic
        ``'sans-serif'`` family when the platform is not recognised.
    """
    fonts_by_system = {
        'Linux': 'IPAexGothic',
        'Windows': 'MS Gothic',
        # NOTE(review): assumes the stock macOS Japanese font is installed
        # under this name -- confirm on a Mac.
        'Darwin': 'Hiragino Sans',
    }
    return fonts_by_system.get(system_name, 'sans-serif')


SYSTEM = platform.system()
_FONT = select_font(SYSTEM)

In [None]:
# Load the training and test data. The CSV files are read with header=None,
# so their columns end up labelled by integer position (0, 1, 2, ...).
#
# Other dataset files that appear as commented-out alternatives in the
# original cell:
#   ../datasets/randomforest/handsign_dataset.csv
#   ../datasets/randomforest/mediapipe_train.csv
#   ../datasets/randomforest/mediapipe_test.csv
TRAIN_CSV = "../datasets/randomforest/handsign_train.csv"
TEST_CSV = "../datasets/randomforest/handsign_test.csv"

mr_train = pd.read_csv(TRAIN_CSV, header=None)
mr_test = pd.read_csv(TEST_CSV, header=None)

In [None]:
# Display names of the model's input features: fifteen "angleN" entries
# followed by six named descriptors. Used later as tick labels for the
# feature-importance plot and as node labels in the exported tree.
# NOTE(review): "is_plam_facing" looks like a misspelling of "is_palm_facing";
# it is kept verbatim because it is emitted as a label at runtime.
feature_name = ["angle{}".format(angle_index) for angle_index in range(15)]
feature_name += [
    "direction_angle",
    "is_plam_facing",
    "distance_to_thumb_and_index_finger",
    "distance_to_thumb_and_middle_finger",
    "distance_to_index_and_middle_finger",
    "is_intersect_to_index_and_middle",
]

In [None]:
def split_labels_and_features(frame, label_col=0, first_feature_col=7):
    """Split a header-less data frame into class labels and feature rows.

    Replaces two copy-pasted ``iterrows()`` loops. Besides being slow,
    ``iterrows()`` hands back every row as a single-dtype Series, so integer
    labels stored next to float features would be silently upcast to floats.
    Selecting whole columns keeps each column's own dtype.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data read with ``header=None`` (columns labelled by integer position).
    label_col : int, default 0
        Label of the column holding the ground-truth class.
    first_feature_col : int, default 7
        Label of the first feature column; every column from this one to the
        last is treated as a feature.
        NOTE(review): what columns 1-6 contain is not visible here -- they are
        skipped exactly as in the original code.

    Returns
    -------
    (list, list of list)
        The labels, and one list of feature values per row.
    """
    labels = frame[label_col].tolist()
    # .loc slices by column *label*, matching the original `row.loc[7:]`.
    features = frame.loc[:, first_feature_col:].values.tolist()
    return labels, features


# Ground-truth labels and feature rows for the training and the test set.
label_train, data_train = split_labels_and_features(mr_train)
label_test, data_test = split_labels_and_features(mr_test)

In [None]:
# Hyper-parameter grid for the search. random_state is a one-element list,
# so every candidate forest is built with the same seed.
search_params = dict(
    n_estimators=[5, 10, 20, 30, 50, 100, 200],
    # max_features=[3, 5, 10, 15, 20],
    random_state=[4],
    max_depth=[3, 5, 10],
)

# Exhaustive grid search over the parameters above.
gs = GridSearchCV(
    estimator=RandomForestClassifier(),  # model being tuned
    param_grid=search_params,            # candidate parameter values
    cv=3,                                # number of cross-validation folds
    verbose=True,                        # print progress
    n_jobs=-1,                           # run in parallel
)
gs.fit(data_train, label_train)

# Keep the best estimator found by the search for the following cells.
clf = gs.best_estimator_
print(clf)

In [None]:
# Predict the test rows with the tuned model.
predict = clf.predict(data_test)

# Compare the predictions with the ground-truth test labels.
ac_score = metrics.accuracy_score(y_true=label_test, y_pred=predict)
cl_report = metrics.classification_report(y_true=label_test, y_pred=predict)

# Print the accuracy first, then the per-class report.
for caption, value in (("正解率=", ac_score), ("レポート=\n", cl_report)):
    print(caption, value)

In [None]:
# Plot the feature importances of the tuned random forest.
import numpy as np
import matplotlib.pyplot as plt

feature_importances = clf.feature_importances_

# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so pick whichever one this
# installation provides instead of hard-coding 'seaborn-whitegrid'.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)

scores = feature_importances
subjects = np.arange(len(scores))
plt.barh(subjects, scores, align="center")
# Keep the original 0-0.1 window, but widen it when an importance exceeds it;
# with a fixed upper limit of 0.1 such bars were cut off at the plot edge.
plt.xlim([0, max(0.1, float(np.max(scores)) * 1.05)])
plt.yticks(subjects, feature_name)
plt.xlabel("使用率", fontname=_FONT)
plt.ylabel("特徴量", fontname=_FONT)
#plt.savefig('../log/randomforest_feature_importances.png', dpi=300)
plt.show()

In [None]:
# Export the first decision tree of the trained forest as a .png image.
import os
import pydotplus
from sklearn import tree

# Class names passed to export_graphviz.
# NOTE(review): export_graphviz expects these in ascending order of the class
# labels; that this list lines up with clf.classes_ cannot be checked from
# this notebook alone -- confirm against the dataset.
label_name = [  "あ","い","う","え","お",
                "か","き","く","け","こ",
                "さ","し","す","せ","そ",
                "た","ち","つ","て","と",
                "な","に","ぬ","ね",
                "は","ひ","ふ","へ","ほ",
                "ま","み","む","め","も",
                "や","ゆ","よ",
                "ら","る","れ","ろ",
                "わ"]
file_name = "../output/randomforest/visualization/png/tree_visualization.png"
# NOTE(review): dot_name is never used below; presumably it was meant for
# saving the DOT source next to the image -- confirm before removing.
dot_name = "../output/randomforest/visualization/dot/tree.dot"

dot_data = tree.export_graphviz(
    clf.estimators_[0],
    out_file=None,              # return the DOT source as a string
    feature_names=feature_name,
    class_names=label_name,
    filled=True,
    proportion=True)
graph = pydotplus.graph_from_dot_data(dot_data)

# Writing the image fails when the target directory is missing (e.g. on a
# fresh checkout), so create it first.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
graph.write_png(file_name)

In [None]:
# Disabled cell: the triple-quoted string below is only a string literal, so
# none of the code inside it runs. It holds a snippet that would build a
# sklearn_porter `Porter` for `clf` with language='java', export it with
# embed_data=True, and write the result to
# ../output/randomforest/RandomForestClassifier.java.
# When run as a notebook cell the bare string is simply evaluated (and may be
# echoed as the cell's output).
"""
# 保存したモデルからJava用のコードを生成
from sklearn_porter import Porter
# Export:
porter = Porter(clf, language='java')
output = porter.export(embed_data=True)
#output = porter.export(export_data=True)

f = open('../output/randomforest/RandomForestClassifier.java','w')
f.write(output)
f.close()
"""

In [None]:
# Learning curve: training and validation accuracy of the tuned model as the
# number of training samples grows.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# Evaluate at 10 evenly spaced fractions (10% .. 100%) of the training data,
# using 10-fold cross-validation at each size.
train_sizes, train_scores, valid_scores = learning_curve(estimator=clf,
                                              X=data_train, y=label_train,
                                              train_sizes=np.linspace(0.1, 1.0, 10),
                                              cv=10, n_jobs=1)

# Mean and standard deviation across the folds for each training size.
train_mean = np.mean(train_scores, axis=1)
train_std  = np.std(train_scores, axis=1)
valid_mean = np.mean(valid_scores, axis=1)
valid_std  = np.std(valid_scores, axis=1)

# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so pick whichever one this
# installation provides instead of hard-coding 'seaborn-whitegrid'.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)

# Training scores: mean line with a +/- one standard deviation band.
plt.plot(train_sizes, train_mean, color='orange', marker='o', markersize=5, label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.1, color='orange')

# Validation scores: mean line with a +/- one standard deviation band.
plt.plot(train_sizes, valid_mean, color='darkblue', marker='o', markersize=5,label='validation accuracy')
plt.fill_between(train_sizes, valid_mean + valid_std,valid_mean - valid_std, alpha=0.1, color='darkblue')

plt.xlabel('training samples')
plt.ylabel('accuracy')
plt.legend(loc='lower right')
plt.ylim([0.0, 1.01])
# The figure is written to the current working directory.
plt.savefig('learning_curve.png', dpi=300)
plt.show()