# メインのノートブック (データマイニングのアンサンブル学習)

## 必要モジュールの読み込み

In [2]:
import datetime
import logging
import argparse
import json
import numpy as np
import sys
import pickle
import pandas as pd
import xgboost as xgbt
import warnings
import pydotplus as pdp
from prettytable import PrettyTable
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_curve, auc, accuracy_score)
from sklearn import tree
from models.randomforest import RF_train_and_predict
from models.lgbm import LGBM_train_and_predict
from models.xgboost import XGB_train_and_predict
from models.catboost import CAT_train_and_predict
from models.adaboost import AD_train_and_predict
from __init__ import *
from tools.lime import *

ModuleNotFoundError: No module named 'models.randomforest'

## 説明変数と目的変数の読み込み

In [4]:
# Suppress all warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Load the run configuration. The context manager guarantees the file handle
# is closed even if JSON parsing fails.
file_path = '../config/default.json'
with open(file_path, 'r', encoding="shift_jis") as config_file:
    config = json.load(config_file)

# One timestamped log file per run; the path itself is the first log entry.
now = datetime.datetime.now()
log_path = '../logs/log_{0:%Y%m%d%H%M%S}.log'.format(now)
logging.basicConfig(filename=log_path, level=logging.DEBUG)
logging.debug(log_path)

# Names of the explanatory variables to load.
feats = config['features']
logging.debug('feats: {}'.format(feats))

# Name of the target variable.
# NOTE(review): a recorded run of this cell stopped with
# KeyError: 'target_name' -- confirm the key is present in default.json.
target_name = config['target_name']
logging.debug('target_name: {}'.format(target_name))

X_train_all = load_datasets(feats)
logging.debug('X_train_all.shape: {}'.format(X_train_all.shape))

y_train_all = load_target(target_name)
logging.debug('y_train_all.shape: {}'.format(y_train_all.shape))

# Fixed seed so the train/test split is reproducible.
random_state = 0

# Hold out 20% of the rows as the test split.
(train_x, test_x, train_y, test_y) = train_test_split(
    X_train_all, y_train_all, test_size=0.2, random_state=random_state
)

KeyError: 'target_name'

In [None]:
# Build the first-level models (RandomForest, LightGBM, CatBoost, XGBoost, AdaBoost).
#
# The models are fitted on the training split only. Fitting them on
# X_train_all / y_train_all would let them see the hold-out rows (test_x /
# test_y) that are later used to score the stacked model, which leaks labels
# and inflates the reported accuracy.
model_rf = RF_train_and_predict(train_x, train_y)
model_lgbm = LGBM_train_and_predict(train_x, train_y)
model_cat = CAT_train_and_predict(train_x, train_y)
model_xgb = XGB_train_and_predict(train_x, train_y)
model_ada = AD_train_and_predict(train_x, train_y)

# Persist every fitted model as ../models/pickle/<short name>_model.pickle.
first_level_models = {
    'rf': model_rf,
    'lgbm': model_lgbm,
    'cat': model_cat,
    'xgb': model_xgb,
    'ada': model_ada,
}
for short_name, fitted_model in first_level_models.items():
    pickle_path = '../models/pickle/{}_model.pickle'.format(short_name)
    with open(pickle_path, mode='wb') as fp:
        pickle.dump(fitted_model, fp)

# Collect each model's feature importances into one table, one row per feature.
feature_dataframe = pd.DataFrame({
    'features': feats,
    'Random Forest feature importances': model_rf.feature_importances_,
    'Lightgbm  feature importances': model_lgbm.feature_importances_,
    'CatBoost feature importances': model_cat.feature_importances_,
    'XGBoost feature importances': model_xgb.feature_importances_,
    'AdaBoost feature importances': model_ada.feature_importances_,
})

feature_dataframe.to_csv('../data/interim/feature_dataframe.csv', encoding="shift_jis")


In [None]:
def _stack_base_predictions(features):
    """Return a DataFrame with one column of predictions per first-level model."""
    return pd.DataFrame({
        'RandomForest': model_rf.predict(features),
        'Lightgbm': model_lgbm.predict(features),
        # CatBoost output is flattened to 1-D so it fits a DataFrame column.
        'CatBoost': model_cat.predict(features).ravel(),
        'XGBoost': model_xgb.predict(features),
        'AdaBoost': model_ada.predict(features),
    })


# Training data for the second-level model: first-level predictions on the training split.
base_predictions_train = _stack_base_predictions(train_x)
base_predictions_train.to_csv('../data/interim/base_predictions_train.csv')

# Test data for the second-level model: first-level predictions on the hold-out split.
base_predictions_test = _stack_base_predictions(test_x)
base_predictions_test.to_csv('../data/interim/base_predictions_test.csv')

# Second-level (stacking) model: an XGBoost classifier fitted on the
# first-level predictions.
gbm = xgbt.XGBClassifier(
    # learning_rate=0.02,  (left disabled)
    random_state=0,
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    # gamma=1,  (left disabled)
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
).fit(base_predictions_train, train_y)

predictions = gbm.predict(base_predictions_test)

logging.debug('predictions.shape: {}'.format(predictions.shape))

with open('../data/interim/stacking_model.pickle', mode='wb') as fp:
    pickle.dump(gbm, fp)

# Write the stacked predictions to a CSV file.
StackingSubmission = pd.DataFrame({'決裁区分': predictions})
StackingSubmission.to_csv('../data/processed/StackingSubmission.csv', index=False, encoding="shift_jis")

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
logging.debug('accuracy_score: {}'.format(accuracy_score(test_y, predictions)))


In [None]:
# LIME explanations for each first-level model.

# Class names (for this use case "agree" / "conditionally agree"), read from the JSON config.
class_names = config['class_name']

# Adjustable explanatory variables (e.g. requested amount, interest rate).
# The LIME output is restricted to the variables listed here; read from the JSON config.
variable_features = config['variable_features']

# Categorical variables from the JSON config are currently not passed to LIME:
# categorical_features = config['categorical_features']

# Single sample to explain.
test_sample = test_x[2:3]

# Arguments that are identical for every model's LIME run.
_lime_shared_args = dict(
    X_train_all=train_x,
    y_train_all=train_y,
    x_test=test_sample,
    feature_names=feats,
    num_features=len(feats),
    class_names=class_names,
    discretize_continuous=False,
    variable_features=variable_features,
    # categorical_features=categorical_features
)

# LIME result for the RandomForest model.
lime_result_rf = lime_predict(model=model_rf, **_lime_shared_args)

# LIME result for the XGBoost model.
lime_result_xgb = lime_predict(model=model_xgb, **_lime_shared_args)

# LIME result for the LightGBM model.
lime_result_lgbm = lime_predict(model=model_lgbm, **_lime_shared_args)

# LIME result for the CatBoost model.
lime_result_cat = lime_predict(model=model_cat, **_lime_shared_args)

# LIME result for the AdaBoost model.
lime_result_ada = lime_predict(model=model_ada, **_lime_shared_args)


In [None]:
# Result 1
# Show, as proportions, whether the sample is judged "agree" or
# "needs review / consultation".

# Every first-level model takes part in the summary, in this order.
model_list = [model_rf, model_lgbm, model_cat, model_xgb, model_ada]

model_summary_table(model_list, test_sample)


In [None]:
# Result 2
# Show, as a ranking, the grounds LIME gives for the decision in result 1.
ranking_table = PrettyTable()
ranking_table.field_names = ['ランキング', '判断根拠']
# Ranks are 1-based; the first element of each LIME entry is used as the ground.
for rank, lime_entry in enumerate(lime_result_rf, start=1):
    ranking_table.add_row([rank, lime_entry[0]])
print(ranking_table.get_string())


In [None]:
# Result 3
# Fetch the DataFrame used for plotting.
df_x_train = df_load_datasets(feats)
# Sample used for the comparison.
# NOTE(review): this takes row 2 of the full dataset, whereas the LIME result
# was computed for test_x[2:3] (row 2 of the shuffled hold-out split) --
# presumably these should be the same record; verify.
df_test_sample = df_x_train[2:3]

# Plot the features picked out by LIME so the sample can be compared with the training data.
graph_lime_predict_features(lime_result_rf, df_x_train, df_test_sample)


In [None]:
# Result 4
# Print the numbers of the consultation documents similar to the sample,
# based on cosine similarity.

print('今回の協議と類似しているものは')
similar_cases = cosine_similar(df_test_sample, 3)
print(similar_cases)