# 二重矢板の予測

In [None]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
from sklearn import preprocessing
import japanize_matplotlib

## 1.CSVファイルの読み込み

In [None]:
# Load the data.
# Ask for the folder that holds the CSV files, strip trailing whitespace and
# normalise it to a forward-slash path that ends with "/".
data_folder = input("データファイルのあるフォルダまでのパス").rstrip().replace("\\", "/") + "/"

file1 = f"{data_folder}train_data1.csv"
file2 = f"{data_folder}train_labels.csv"

# Both files are read as cp932 and then joined side by side (column-wise).
df1, df2 = (pd.read_csv(csv_path, encoding="cp932") for csv_path in (file1, file2))
df = pd.concat([df1, df2], axis=1)

In [None]:
# Show up to 30 columns when a DataFrame is displayed, then preview the head of df.
pd.set_option('display.max_columns',30)
df.head()

## カテゴリ変数のエンコーディング（One-Hot エンコーディング）

In [None]:
# Convert every value in these two columns to str.
# NOTE(review): presumably done so the later pd.get_dummies call treats them
# as categorical columns — confirm against the raw CSV dtypes.
df['矢板型'] = df['矢板型'].apply(str)
df['矢板材料'] = df['矢板材料'].apply(str)

In [None]:
# Replace df with its dummy-encoded (one-hot) version and print the new shape.
df = pd.get_dummies(df)
print(df.shape)

In [None]:
# Raise the display limit to 35 columns and preview the head of the encoded df.
pd.set_option('display.max_columns',35)
df.head()

## 相関を調べる

In [None]:
# Name of the target (objective) column; used below to split features from the target.
pur = "遮水効果"

In [None]:
# Heatmap of the full correlation matrix of df, colour scale fixed to [-1, 1].
# NOTE(review): figsize (200, 200) is ten times the (20, 20) used for the
# next heatmap — confirm that this size is intended.
sns.set(font='Yu Gothic',rc = {'figure.figsize':(200,200)})
sns.heatmap(df.corr(),square=True, vmax=1, vmin=-1, center=0,cmap='coolwarm')

In [None]:
# Heatmap of each column's correlation with the target, sorted in descending order.
# The [1:] slice drops the first row after sorting — presumably the target's
# own self-correlation; verify that it really sorts first.
sns.set(font='Yu Gothic',rc = {'figure.figsize':(20,20)})
sns.heatmap(df.corr()[[pur]].sort_values(by=pur, ascending=False)[1:],cmap='coolwarm', annot=True)

In [None]:
# Display the DataFrame.
df

## モデルの作成

### データの分割

In [None]:
# Split the data.
# Hold out 20% of the rows as test data (test_size=0.2).

from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2, random_state = 3)

### 交差検証

In [None]:
# Split the training data with K-fold (plain KFold; no stratification is applied).
from imblearn.under_sampling import RandomUnderSampler
# Separate the target from the explanatory variables.
X = train.drop([pur], axis = 1) # every column except the target is used as a feature
y = train.loc[:,pur]

# Build the folds.
# Import the splitter.
from sklearn.model_selection import KFold

fold = KFold(n_splits=5, shuffle=True, random_state=3) # split the data into 5 folds
kf = fold.split(X, y)
kf_cv = list(kf)  # keep the (train, validation) index pairs in a list so they can be iterated more than once

In [None]:
# Print the index arrays of every fold followed by the two sizes; with five
# folds the validation:training sizes should come out at roughly 1:4.
for i, (idx_train, idx_val) in enumerate(kf_cv):
    print(
        f'fold {i}',
        idx_train,
        idx_val,
        '==' * 30,
        f'{len(idx_train)} {len(idx_val)}',
        '==' * 30,
        sep='\n',
    )

## xgboost

In [None]:
# from xgboost.callback import early_stop
import xgboost as xgb
from sklearn import metrics # 正解率を出すためのライブラリ
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,StratifiedKFold,cross_val_score

### ハイパーパラメータチューニング

In [None]:
import optuna
def objective(trial,df_X,df_y):
    """Optuna objective: mean cross-validated RMSE of an XGBRegressor.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        df_X: Feature matrix passed to ``cross_val_score``.
        df_y: Target values passed to ``cross_val_score``.

    Returns:
        Mean of the per-fold RMSE values from 5-fold cross-validation.
    """
    # Every hyperparameter is registered under its own name. Suggesting the
    # same name twice in one trial hands back the value already sampled, so
    # reusing 'subsample' for the regularisation terms would tie them to
    # subsample and keep them out of study.best_params.
    # NOTE(review): the lower bounds of 0 for subsample and learning_rate are
    # kept from the original search space; XGBoost documents subsample as
    # (0, 1] — consider a small positive lower bound.
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 15),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'subsample': trial.suggest_float('subsample', 0, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'learning_rate': trial.suggest_float('learning_rate', 0, 1),
    }

    model = xgb.XGBRegressor(n_estimators=100,
                             verbosity=0,
                             n_jobs=-1,
                             random_state=42,
                             **params)

    # Cross-validation: the scorer returns negated MSE, so flip the sign
    # before taking the square root to obtain the RMSE of each fold.
    scores = cross_val_score(model, df_X, df_y, scoring='neg_mean_squared_error', cv=5)
    rmse = np.sqrt(-scores)

    return np.mean(rmse)

In [None]:
# Create an optuna study instance with optuna.create_study().
study = optuna.create_study()

# Optimise by handing the objective (bound to X and y) to study.optimize(),
# called with n_trials=200 and timeout=300.
study.optimize(lambda trial: objective(trial,X,y),n_trials=200, timeout=300)

In [None]:
# Show the best hyperparameters found by the study.
print(study.best_params)    

In [None]:
# Show the best objective value of the study (the objective returns the mean CV RMSE).
print(study.best_value)

In [None]:
# Fixed XGBoost settings; the tuned hyperparameters are added in the next cell.
xgb_params = {
    'objective': 'reg:squarederror',  # regression task
    'eval_metric': 'rmse',       # metric evaluated during training
}

In [None]:
# Copy every tuned hyperparameter into the training parameters. The objective
# passes each suggested value straight to xgb.XGBRegressor, so every key in
# study.best_params is a valid constructor argument. Copying them all keeps
# this cell in step with the search space instead of a hand-maintained key list.
xgb_params.update(study.best_params)

### 学習開始

In [None]:
from sklearn.metrics import average_precision_score,mean_absolute_error
import shap

def fit_xgb(X, y, cv, params: dict=None):
    """Train one XGBRegressor per fold and collect out-of-fold predictions.

    For every fold a model is fitted on the training rows with early stopping
    evaluated on the validation rows, SHAP bar and beeswarm plots are drawn
    for the training rows, and R^2 / MAE figures are printed.

    Args:
        X: Feature DataFrame; rows are selected positionally with ``.iloc``.
        y: Target Series aligned row-for-row with ``X``.
        cv: Iterable of ``(train_indices, validation_indices)`` pairs of
            positional indices into ``X`` and ``y``.
        params: Keyword arguments for ``xgb.XGBRegressor``. ``None`` means
            no extra arguments.

    Returns:
        Tuple ``(oof, models)``. ``oof`` is an array of length ``len(X)``
        holding, for each row, the prediction made by the fold in which the
        row was a validation row (rows never validated stay 0). ``models``
        is the list of fitted models in fold order.
    """
    if params is None:
        params = {}

    models = []
    fold_mae = []
    oof = np.zeros(len(X))

    # Iterate over the folds passed by the caller, not a notebook global.
    for idx_train, idx_val in cv:
        X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]  # training features / target
        X_val, y_val = X.iloc[idx_val], y.iloc[idx_val]

        reg = xgb.XGBRegressor(**params)
        # NOTE(review): recent XGBoost releases expect early_stopping_rounds
        # on the estimator constructor instead of fit() — confirm against the
        # installed version before upgrading.
        model = reg.fit(X_train, y_train,
                        eval_set=[(X_val, y_val)],
                        early_stopping_rounds=20,
                        verbose=2)

        pred = model.predict(X_val)
        oof[idx_val] = pred
        models.append(model)

        # SHAP summary plots for this fold, explained on the training rows.
        explainer = shap.TreeExplainer(model=model, data=X_train, feature_perturbation="interventional")
        shap_values = explainer(X_train)
        shap.plots.bar(shap_values=shap_values, max_display=40)
        shap.plots.beeswarm(shap_values, max_display=40)

        print('r2_train:', reg.score(X_train, y_train))
        print('r2_val:', reg.score(X_val, y_val))
        mae = mean_absolute_error(y_val, pred)
        print('MAE_val: ', mae)
        fold_mae.append(mae)

    print(f'平均のMAE：{np.mean(fold_mae)}')
    return oof, models

In [None]:
# Train one model per fold; keep the out-of-fold predictions and the fitted models.
oof,models = fit_xgb(X, y, kf_cv, xgb_params)

### テストデータ

In [None]:
# Split the held-out test rows into features and target.
# inference_xgb builds the same two objects internally and returns them.
X_test = test.drop([pur], axis=1)
y_test = test[pur]

In [None]:
# from sklearn.metrics import average_precision_score,mean_absolute_error
from sklearn.metrics import mean_squared_error, mean_absolute_error

def inference_xgb(models):
    """Average the predictions of the fitted models on the held-out test set.

    Reads the notebook-level ``test`` DataFrame and the target column name
    ``pur``. Prints the MAE of the averaged prediction and the mean of the
    per-model R^2 scores.

    Args:
        models: Non-empty sequence of fitted models exposing ``predict`` and
            ``score``.

    Returns:
        Tuple ``(pred_test, X_test, y_test)``: the averaged predictions, the
        test features and the test target.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("models must contain at least one fitted model")

    # Run inference on the test data.
    X_test = test.drop([pur], axis=1)
    y_test = test[pur]

    # One prediction vector per model. Averaging over however many models
    # were passed in avoids assuming exactly five folds.
    fold_preds = [model.predict(X_test) for model in models]
    r2 = [model.score(X_test, y_test) for model in models]
    pred_test = np.mean(fold_preds, axis=0, dtype=np.float64)

    print('MAE_test: ',mean_absolute_error(y_test, pred_test))
    print('r2_test_average:',np.mean(r2))

    return pred_test,X_test,y_test

In [None]:
# Predict on the test set with the fold models and keep the averaged prediction.
pred_test,X_test,y_test = inference_xgb(models)

### 全体の確認

In [None]:
# Attach the out-of-fold predictions to the training rows and display them.
# NOTE(review): from here on `train` carries a "predict" column, so re-running
# the cell that builds X with train.drop([pur]) would include it as a feature.
train["predict"] = oof
train

In [None]:
# Attach the averaged test predictions to the test rows and display them.
# NOTE(review): from here on `test` carries a "predict" column, so re-running
# inference_xgb (which uses test.drop([pur])) would feed it to the models.
test["predict"] = pred_test
test

In [None]:
# Stack the training and test rows (both now carrying "predict") and sort by index.
df_pred = pd.concat([train,test])
df_pred = df_pred.sort_index()
df_pred

## 予測結果の可視化

In [None]:
# Scatter plot: x axis is the predicted value, y axis is the actual value.
import matplotlib.pyplot as plt
import numpy as np
plt.scatter(df_pred["predict"],df_pred[pur], alpha = 0.5)
# Red y = x reference line drawn from 0 to 6.
# NOTE(review): the 0-6 range is hard-coded — confirm it covers the target's range.
plt.plot(np.linspace(0, 6, 6), np.linspace(0, 6, 6), "red")
plt.show()

###  特徴量重要度

In [None]:
# Each fold model reports its own importances, so their spread per feature is
# shown as a boxen plot.
def plot_importance(model, X):
    """Plot per-feature importances across fitted models as a boxen plot.

    Args:
        model: A fitted model exposing ``feature_importances_``, or an
            iterable of such models (for example one per CV fold).
        X: DataFrame whose columns name the features.

    Returns:
        Tuple ``(fig, ax)`` with the Matplotlib figure and axes.
    """
    # Use the argument rather than a notebook global; a single model is also accepted.
    fitted_models = [model] if hasattr(model, "feature_importances_") else list(model)

    # Build one frame per model and concatenate once at the end.
    per_fold = []
    for fold_no, fitted in enumerate(fitted_models, start=1):
        _df = pd.DataFrame()
        _df["feature_importance"] = fitted.feature_importances_
        _df["column"] = X.columns
        _df["fold"] = fold_no
        per_fold.append(_df)
    feature_importance_df = pd.concat(per_fold, axis=0, ignore_index=True)

    # Order features by summed importance over all models and keep the top 50.
    order = feature_importance_df.groupby("column").sum()[["feature_importance"]].sort_values("feature_importance", ascending=False).index[:50]

    fig, ax = plt.subplots(figsize=(8, max(6, len(order) * .25)))
    sns.boxenplot(data=feature_importance_df,
                  x="feature_importance",
                  y="column",
                  order=order,
                  ax=ax,
                  palette=None,
                  orient="h")
    ax.tick_params(axis="x")
    ax.set_title("Feature Importance")
    ax.grid()
    fig.tight_layout()
    return fig, ax

# Draw the feature-importance plot for the fold models, using X's column names.
fig, ax = plot_importance(models, X)

### shap値個々の値についてしらべる

In [None]:
def shap_part(val):
    """Draw a SHAP waterfall plot for one row of ``df_pred``, once per model.

    Reads the notebook-level ``df_pred``, ``pur`` and ``models``.

    Args:
        val: Positional index of the row in ``df_pred`` to explain.
    """
    X_predict = df_pred.drop([pur,"predict"], axis=1)
    # Only the requested row is explained; the full frame is still passed to
    # the explainer as its background data.
    row = X_predict.iloc[[val]]

    for model in models:
        explainer = shap.TreeExplainer(model = model,data=X_predict,feature_perturbation="interventional")
        shap_values = explainer(row)
        shap.plots.waterfall(shap_values=shap_values[0],max_display=20)

In [None]:
# Pass the positional index of the row whose details you want to inspect.
# One waterfall plot is drawn per model, and the plots differ between models.
# f(x): the model's prediction
# E[f(x)]: the mean of the model's predictions
shap_part(1)