# PyCaret 2 Classification Example
This notebook was created using PyCaret 2.0. Last updated: 31-07-2020

In [None]:
# Check the installed PyCaret version
from pycaret.utils import version

version()

# 1. Data Repository

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:96% !important; }</style>"))# デフォルトは75%

import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from tqdm import tqdm_notebook as tqdm
import pycaret

sns.set()

# 常に全ての列（カラム）を表示
pd.set_option("display.max_columns", None)

!pwd
sys.executable

In [None]:
# Input directory
data_dir = "../../../data/orig/home-credit-default-risk"

# Output directory (created if it does not exist yet)
output_dir = "model"
os.makedirs(output_dir, exist_ok=True)

# Target variable
target = "TARGET"

# Random seed
session_id = 123

# Number of cross-validation folds
fold = 5

# metric
optimize = "AUC"

# models
choice_ms = ["catboost", "lightgbm"]#, "lda"]
#choice_ms = ['nb', 'lightgbm', "rf"]  # for testing

# Number of tuning iterations
n_iter = 100
#n_iter = 3  # for testing

# compare_models() takes a long time, so it is switched off
is_compare_models = False

# Whether to save the setup() result as CSV (it takes roughly 0.5 GB of disk)
is_save_setup_csv = False
#is_save_setup_csv = True

# Columns excluded from training
ignore_features = ["SK_ID_CURR"]

In [None]:
# Parameters
# NOTE(review): these assignments override the values set in the configuration
# cell (n_iter 100 -> 400). This looks like a papermill-injected parameters
# cell — confirm before editing by hand.
n_iter = 400
is_compare_models = False
is_save_setup_csv = False


# data load

In [None]:
df_train = pd.read_csv(os.path.join(data_dir, "application_train.csv"))
df_test = pd.read_csv(os.path.join(data_dir, "application_test.csv"))

# Same overview for both frames: schema, first rows, summary statistics.
for frame in (df_train, df_test):
    # DataFrame.info() prints its report itself and returns None, so wrapping it
    # in print() only adds a stray "None" line to the output.
    frame.info()
    display(frame.head().style.background_gradient(cmap="Pastel1"))
    display(frame.describe().style.background_gradient(cmap="Pastel1"))

In [None]:
# Base keyword arguments passed to setup() by imbalance_setup()
params = dict(
    target=target,
    session_id=session_id,
    silent=True,
    ignore_features=ignore_features,
)

def imbalance_setup(df, target, params=params, fold=None, experiment_name=None):
    """Run PyCaret ``setup()`` on ``df``, optionally with random under-sampling.

    Enabling the imbalance option changes the number of rows, so ``setup()``
    has to be run again whenever the sampling strategy changes.

    Args:
        df: DataFrame handed to ``setup()``.
        target: Name of the target column. It is only read for ``fold="all"``,
            to count the smallest class.
        params: Base keyword arguments for ``setup()``. The dict is copied and
            never modified, so options added for one call do not leak into
            the next one.
        fold: Selects the under-sampling strategy for classes ``0`` and ``1``:
            ``"all"`` samples both down to the size of the smallest class in
            ``df``; ``10`` and ``5`` use the hard-coded per-class counts 15639
            and 13901; any other value (including ``None``) adds no imbalance
            correction of its own.
        experiment_name: When given, experiment logging (MLflow files) is
            switched on under this name.

    Returns:
        Whatever ``setup()`` returns.
    """
    # Copy first: `params` defaults to the shared module-level dict, and writing
    # into it would keep fix_imbalance / logging enabled for every later call.
    setup_params = dict(params)

    # Create MLflow files only when an experiment name is supplied
    if experiment_name is not None:
        setup_params["log_experiment"] = True
        setup_params["experiment_name"] = experiment_name

    strategy = None

    if fold == "all":
        # No CV: balance both classes to the size of the smallest one
        n_min = df[target].value_counts().min()
        strategy = {0: n_min, 1: n_min}

    elif fold == 10:
        # Maximum per-class count usable with 10 folds
        n_fold10 = 15639
        strategy = {0: n_fold10, 1: n_fold10}

    elif fold == 5:
        # Maximum per-class count usable with 5 folds
        n_fold5 = 13901
        strategy = {0: n_fold5, 1: n_fold5}

    # Under-sampling via imbalanced-learn
    if strategy is not None:
        setup_params["fix_imbalance"] = True
        setup_params["fix_imbalance_method"] = RandomUnderSampler(
            sampling_strategy=strategy, random_state=session_id
        )

    return setup(df, **setup_params)

# 2. Initialize Setup

In [None]:
# from pycaret.classification import *
# help(setup)

In [None]:
%%time

from pycaret.classification import *

if is_save_setup_csv:
    # setup() needs a target column, so attach a placeholder one to the test frame
    _df_test = df_test.copy()
    _df_test[target] = 0

    clf1 = imbalance_setup(_df_test, target)
    display(clf1[0].head(3))

    # Keep the preprocessed test set on disk, just in case
    test_setup_csv = os.path.join(output_dir, "test_setup.csv")
    test_setup_frame = pd.concat([clf1[0], clf1[1]], axis=1)
    test_setup_frame.to_csv(test_setup_csv, index=False)

# Setup on the training data, using the under-sampling strategy chosen by `fold`
clf1 = imbalance_setup(df_train, target, fold=fold)
display(clf1[0].head(3))

if is_save_setup_csv:
    # Keep the preprocessed train set on disk, just in case
    train_setup_csv = os.path.join(output_dir, "train_setup.csv")
    train_setup_frame = pd.concat([clf1[0], clf1[1]], axis=1)
    train_setup_frame.to_csv(train_setup_csv, index=False)

In [None]:
# test
#lr = create_model('lr', fold=fold)

# 3. Compare Baseline

In [None]:
%%time
# The baseline comparison only runs when is_compare_models is True;
# otherwise best_model is left undefined.
if is_compare_models:
    best_model = compare_models(sort=optimize, fold=fold)

# 4. Create Model

In [None]:
# Show PyCaret's table of available models
models()

In [None]:
# Index values (model IDs) of the rows returned for type="ensemble"
models(type="ensemble").index.tolist()

In [None]:
#ensembled_models = compare_models(
#    whitelist=models(type="ensemble").index.tolist(), fold=3
#)

# 5. Tune Hyperparameters

In [None]:
%%time

tune_models = []
for model_id in choice_ms:
    # Fit the baseline with cross-validation, then tune its hyperparameters.
    # A separate name for the estimator keeps the model id (a string) from
    # being overwritten by the model object.
    base_model = create_model(model_id, fold=fold)

    tuned_m = tune_model(
        base_model,
        fold=fold,
        optimize=optimize,
        n_iter=n_iter,
    )
    tune_models.append(tuned_m)

# Look the tuned LightGBM model up by id instead of a fixed position, so a
# change in the order or length of choice_ms cannot pick the wrong model or
# raise IndexError. The result is None when "lightgbm" is not in choice_ms.
tuned_lightgbm = dict(zip(choice_ms, tune_models)).get("lightgbm")
print(tune_models)

In [None]:
#help(finalize_model)

In [None]:
%%time

# Refit on all data: no CV split, hold-out data included

# imbalance_setup() requires the DataFrame and the target column as its first
# two arguments; calling it with fold= only raises TypeError.
clf1 = imbalance_setup(df_train, target, fold=None)

f_tune_models = []
for m, name in zip(tune_models, choice_ms):
    f_m = finalize_model(m)
    f_tune_models.append(f_m)
    # One file per tuned model, named after its model id
    save_model(f_m, model_name=os.path.join(output_dir, "pycaret_tuned_" + name))

print(f_tune_models)

# 6. Ensemble Model は省略

# 7. Blend Models

In [None]:
#help(blend_models)

In [None]:
# CatBoost Classifierはエラーになる

#clf1 = imbalance_setup(fold=fold)
#
#blender = blend_models(estimator_list=tune_models, 
#                       fold=fold,
#                       optimize=optimize,
#                       method="soft", 
#                       choose_better=True,  # 精度改善しなかった場合、create_model()で作ったモデルを返す
#                      )
#save_model(blender, model_name=os.path.join(output_dir, "pycaret_blender"))

# 8. Stack Models

In [None]:
#help(stack_models)

In [None]:
# imbalance_setup() requires the DataFrame and the target column as its first
# two arguments; calling it with fold= only raises TypeError.
# fold must still be specified even though finalize=True is used below.
clf1 = imbalance_setup(df_train, target, fold=fold)

# Every tuned model except the last is a base estimator; the last one is the meta model
stacker = stack_models(estimator_list=tune_models[:-1],
                       meta_model=tune_models[-1],
                       fold=fold,
                       optimize=optimize,
                       finalize=True,  # refit on all data: no CV split, hold-out data included
                      )
save_model(stacker, model_name=os.path.join(output_dir, "pycaret_stacker"))

# 9. Analyze Model

In [None]:
#from pycaret.classification import *
#help(plot_model)

In [None]:
# Default plot for tune_models[1], the tuned model of the second entry in choice_ms
plot_model(tune_models[1])

In [None]:
# Confusion matrix for the same tuned model
plot_model(tune_models[1], plot="confusion_matrix")

In [None]:
# "boundary" plot for the same tuned model
plot_model(tune_models[1], plot="boundary")

In [None]:
plot_model(tune_models[1], plot="feature")  # raises an error for catboost

In [None]:
# Only the "pr" plot takes an unusually long time, so it is commented out
#plot_model(tune_models[1], plot="pr")

In [None]:
# Classification report plot for the same tuned model
plot_model(tune_models[1], plot="class_report")

In [None]:
# Run evaluate_model() on tune_models[1], the tuned model of the second entry in choice_ms
evaluate_model(tune_models[1])

# 10. Interpret Model

In [None]:
#catboost = create_model("catboost", cross_validation=False)

## DockerではShapはつかえない。Docker imageが壊れるらしいのでインストールしていない

In [None]:
#interpret_model(catboost)

In [None]:
#interpret_model(catboost, plot="correlation")

In [None]:
#interpret_model(catboost, plot="reason", observation=12)

# 11. AutoML()

In [None]:
# help(automl)

In [None]:
#%%time
## なんかエラーになる。。。
#automl = automl(optimize=optimize)
#save_model(automl, model_name=os.path.join(output_dir, "pycaret_automl"))
#print(automl)

# 12. Predict Model

In [None]:
#pred_holdouts = predict_model(f_tune_models[0])
#pred_holdouts.head()

In [None]:
#new_data = df_test.copy()
#predict_new = predict_model(f_tune_models[0], data=new_data)
#predict_new.head()

In [None]:
# Reload the saved tuned models (same order as choice_ms) and the stacker
f_tune_models = [
    load_model(os.path.join(output_dir, f"pycaret_tuned_{name}"))
    for name in choice_ms
]
# Plain string: there is nothing to interpolate, so no f-prefix is needed
stacker = load_model(os.path.join(output_dir, "pycaret_stacker"))

In [None]:
def make_submit(df_test, model, data_dir: str, output_dir: str, csv_name: str):
    """Score ``df_test`` with ``model`` and write a submission CSV.

    The predictions are left-joined onto ``sample_submission.csv`` (read from
    ``data_dir``) by ``SK_ID_CURR``, the ``Score`` column is renamed to
    ``TARGET``, and the result is written to ``{output_dir}/{csv_name}.csv``.
    The first rows of the written frame are displayed.
    """
    scored = predict_model(model, data=df_test)

    # The sample submission supplies the ids (and their order) for the output
    template = pd.read_csv(f"{data_dir}/sample_submission.csv")
    merged = template.merge(scored, how="left", on="SK_ID_CURR")

    df_sub = merged[["SK_ID_CURR", "Score"]].rename(columns={"Score": "TARGET"})
    df_sub.to_csv(f"{output_dir}/{csv_name}.csv", index=False)
    display(df_sub.head())
    
    
# Write one submission CSV per reloaded tuned model, then one for the stacker
for m, name in zip(f_tune_models, choice_ms):
    make_submit(df_test, m, data_dir, output_dir, f"{name}_submission")
#make_submit(df_test, blender, data_dir, output_dir, "blender_submission")
make_submit(df_test, stacker, data_dir, output_dir, "stacker_submission")
#make_submit(df_test, automl, data_dir, output_dir, "automl_submission")

# 13. Save / Load Model

In [None]:
#save_model(best, model_name=os.path.join(output_dir, "pycaret_automl"))

In [None]:
#loaded_bestmodel = load_model(os.path.join(output_dir, "pycaret_automl"))
#print(loaded_bestmodel)

In [None]:
from sklearn import set_config

set_config(display="diagram")
# `loaded_bestmodel` is never assigned (its load_model() call is commented out),
# which made this cell fail with NameError. Inspect the reloaded stacker instead.
# NOTE(review): element 0 is presumably the preprocessing part of the saved
# object — confirm for the installed PyCaret version.
stacker[0]

In [None]:
# This import binds the name `set_config` to scikit-learn's function, replacing
# any `set_config` bound earlier in the notebook.
from sklearn import set_config

# Switch scikit-learn's estimator display to plain text
set_config(display="text")

# 14. Deploy Model

In [None]:
#deploy_model(best, model_name="best-aws", authentication={"bucket": "pycaret-test"})

# 15. Get Config / Set Config

In [None]:
# Fetch the experiment's "X_train" via get_config() and preview the first rows
X_train = get_config("X_train")
X_train.head()

In [None]:
# Current value of the experiment's "seed" setting
get_config("seed")

In [None]:
# Re-import PyCaret's set_config: earlier cells bound the name `set_config`
# to scikit-learn's function via `from sklearn import set_config`.
from pycaret.classification import set_config

# Overwrite the experiment's "seed" setting
set_config("seed", 999)

In [None]:
# Read the "seed" setting back after the set_config() call
get_config("seed")

# 16. MLFlow UI

In [None]:
# !mlflow ui

# End
Thank you. For more information / tutorials on PyCaret, please visit https://www.pycaret.org