# LightGBM の基礎
## 今回の目的 : とりあえず、LightGBMを流して、submitまでしてみる
## 流している間に、LightGBMとは何なのかを解説

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. input dataの追加
## https://www.kaggle.com/rohanrao/riiid-train-data-multiple-formats を追加する

In [None]:
%%time
# Load the training data from the pickle file in the attached dataset.
traindf = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")

## pd.read_csvでは読み込みに5分以上かかるが、pklだと1分未満で読み込み可能
## ※ GPU使用したcudfだと、17 sec程度

In [None]:
# Show the loaded frame as the cell output.
traindf

## 約1億行のデータ。使う列だけにスリミングする

In [None]:
# Keep only the columns used below: the grouping key, the three features and the label.
traindf = traindf[["user_id","content_id","task_container_id","answered_correctly","prior_question_had_explanation"]]

## 講義を除去

In [None]:
# Drop the lecture rows, identified here by answered_correctly == -1.
# Inference filters on content_type_id == 0 instead; filtering on the label here
# means content_type_id does not have to be kept, saving memory and compute.
traindf = traindf[traindf["answered_correctly"]!=-1]

In [None]:
# Show the frame after the lecture rows are removed.
traindf

# 2. trainデータとvalidationデータの生成。
### 今回はユーザーごとに、24個問題と回答結果を抽出。
### うち18個をtrainデータ。6個をvalidationデータとする

In [None]:
# Group the remaining rows by user so a per-user tail can be taken.
usergroup = traindf.groupby("user_id")

In [None]:
# Keep the last 24 rows of each user (users with fewer rows keep all of them).
train = usergroup.tail(24)

In [None]:
# Show the sampled frame.
train

In [None]:
# Spot-check the rows kept for a single user (user_id 115).
train[train["user_id"]==115]

## メモリ確保のために1億行データを削除

In [None]:
# The full frame is no longer needed; drop the name so its memory can be reclaimed.
del traindf

In [None]:
# Free memory: force a garbage-collection pass after deleting the large frame.
import gc
gc.collect()

In [None]:
# Show the sampled frame again.
train

# prior_question_had_explanationのnanデータ置換

In [None]:
# Replace missing values with False, then cast the column to bool.
train["prior_question_had_explanation"] = train["prior_question_had_explanation"].fillna(False).astype("bool")

## trainデータ24個から後ろ6個をvalidationにして抽出して、そのindexを除去することにより、train 18個、validation 6個とする

In [None]:
# Regroup the per-user sample so the validation rows can be split off.
usergroup2 = train.groupby("user_id")

In [None]:
# Validation set: the last 6 rows of each user.
val = usergroup2.tail(6)

In [None]:
# Remove the validation rows from train by index label, leaving at most 18 rows per user.
train = train.drop(val.index)

In [None]:
# Show the training frame.
train

In [None]:
# Show the validation frame.
val

## 念のため、reset index

In [None]:
# Renumber both frames from 0; drop=True discards the old index labels.
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

# 3. Modeling

## 学習させるcolumnをFEATURES, ラベルをTARGETとする

In [None]:
# Columns fed to the model as features.
FEATURES = ["content_id","task_container_id","prior_question_had_explanation"]
# Column holding the label to predict.
TARGET = "answered_correctly"

In [None]:
import lightgbm as lgb

In [None]:
# Show the feature columns of the training frame.
train[FEATURES]

## 3.1 データセット

In [None]:
# Build the LightGBM datasets: features and label for training and for validation.
lgb_train = lgb.Dataset(train[FEATURES], train[TARGET])
lgb_eval = lgb.Dataset(val[FEATURES], val[TARGET])

## 3.2 パラメータ設定

In [None]:
# Basic LightGBM hyperparameters.
# NOTE(review): the training call in this notebook passes lgbm_params2, so this
# dict appears to be unused — confirm before removing.
lgbm_params = {
    # Binary classification problem
    'objective': 'binary',
    # Aim to maximize AUC
    'metric': 'auc',
    # Only log fatal messages
    'verbosity': -1,
}

In [None]:
# Parameters borrowed from another notebook; these are the ones passed to lgb.train.
lgbm_params2 = {
    'objective': 'binary',
    'seed': 42,
    'metric': 'auc',
    'learning_rate': 0.05,
    'max_bin': 800,
    'num_leaves': 80
}

# 4.学習 -この間に解説-

In [None]:
# Train the model with lgbm_params2, evaluating on the validation dataset.
# NOTE(review): verbose_eval and early_stopping_rounds were removed from lgb.train
# in LightGBM 4.0 in favor of callbacks — confirm the installed version before upgrading.
model = lgb.train(lgbm_params2, lgb_train, valid_sets=lgb_eval,
                  verbose_eval=50,  # print the evaluation result every 50 iterations
                  num_boost_round=1000,  # maximum number of boosting iterations
                  early_stopping_rounds=100  # stop after 100 rounds without improvement on the validation metric
                 )

In [None]:
# How to save and reload the model, in case it is needed later.

import pickle

model_name = "LGBMmodel.bin"

# Save the model. The context manager closes the file handle even if pickling fails.
with open(model_name, 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back.
# NOTE: only unpickle files this notebook wrote itself; pickle can execute
# arbitrary code when loading an untrusted file.
with open(model_name, 'rb') as model_file:
    estimator = pickle.load(model_file)

# estimator can now be used in place of model above.

## feature importanceの表示

In [None]:
import matplotlib.pyplot as plt

In [None]:
# model.save_model(f'model.txt')
# Plot the importance of each feature, measured by gain.
lgb.plot_importance(model, importance_type='gain')
plt.show()

#### 以下は参考

In [None]:
# Predict on the validation frame, using the best iteration recorded during training.
y_pred = model.predict(val[FEATURES], num_iteration=model.best_iteration)

In [None]:
# Compute the AUC (Area Under the ROC Curve) of the validation predictions.
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(val[TARGET], y_pred)
auc = metrics.auc(fpr, tpr)
print(auc)

# Plot the ROC curve, with the AUC shown in the legend.
plt.plot(fpr, tpr, label='ROC curve (area = %.2f)'%auc)
plt.legend()
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)

# 5. inference

In [None]:
# Practice the inference steps on the example test data.

In [None]:
# Load the example test file shipped with the competition data.
ex_test = pd.read_csv("../input/riiid-test-answer-prediction/example_test.csv")

In [None]:
# Show the loaded example rows.
ex_test

In [None]:
# Keep only the question rows (content_type_id == 0); the lecture rows are dropped.
# .copy() makes ex_test an independent frame so the column assignment below does
# not write into a slice of the loaded frame (pandas SettingWithCopyWarning).
ex_test = ex_test[ex_test["content_type_id"]==0].copy()
# Same preprocessing as for the training frame: missing -> False, then cast to bool.
ex_test["prior_question_had_explanation"] = ex_test["prior_question_had_explanation"].fillna(False).astype("bool")

In [None]:
ex_train = ex_test[FEATURES]

In [None]:
ex_train

## ここで予測

In [None]:
ex_train["answered_correctly"] = model.predict(ex_train,model.best_iteration)

In [None]:
ex_train

In [None]:
# The submission needs row_id, so bring it in from the source frame.
ex_train["row_id"] = ex_test["row_id"]

In [None]:
# Show the frame with row_id attached.
ex_train

## submit fileの形式にする

In [None]:
# The submission consists of exactly these two columns, in this order.
submission = ex_train[['row_id', 'answered_correctly']]

In [None]:
# Show the submission frame.
submission

## ここまでの流れを関数化しておくと楽

In [None]:
def predictdf(ex_test, booster=None, features=None):
    """Build the submission frame for one batch of test rows.

    Args:
        ex_test: DataFrame containing ``row_id``, ``content_type_id``,
            ``prior_question_had_explanation`` and the feature columns.
            It is not modified.
        booster: Trained model exposing ``predict`` and ``best_iteration``.
            Defaults to the notebook-level ``model``.
        features: Feature column names to feed to the model. Defaults to the
            notebook-level ``FEATURES``.

    Returns:
        DataFrame with the columns ``row_id`` and ``answered_correctly`` (the
        model output), one row per question row of ``ex_test``, keeping that
        row's index label. Empty when the batch has no question rows.
    """
    if booster is None:
        booster = model
    if features is None:
        features = FEATURES

    # Keep only the question rows (content_type_id == 0). .copy() detaches the
    # selection so the assignment below is not a write into a slice of the
    # caller's frame (pandas SettingWithCopyWarning).
    questions = ex_test[ex_test["content_type_id"] == 0].copy()
    questions["prior_question_had_explanation"] = (
        questions["prior_question_had_explanation"].fillna(False).astype("bool")
    )

    submission = questions[["row_id"]].copy()
    if questions.empty:
        # Nothing to predict: return the right columns without calling the model
        # on an empty frame.
        submission["answered_correctly"] = pd.Series(dtype="float64")
        return submission

    # best_iteration must go in by keyword: in LightGBM >= 3.0 the second
    # positional parameter of Booster.predict is start_iteration.
    submission["answered_correctly"] = booster.predict(
        questions[features], num_iteration=booster.best_iteration
    )
    return submission



In [None]:
# Try the helper on the example data; the returned frame is shown as the cell output.
predictdf(ex_test)

# 6. submit

## お決まりのやり方と思ってもらえれば良い。

In [None]:
import riiideducation
# Create the competition environment and the iterator over its test batches.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
# Predict every batch served by the environment and hand the result back.
for (test_df, sample_prediction_df) in iter_test:
    # .copy() gives an independent frame, so the assignment below is not a
    # write into a column slice (pandas SettingWithCopyWarning).
    submission = predictdf(test_df).copy()
    # Defensive fallback: replace any missing prediction with a constant.
    # NOTE(review): 0.707 is not explained in this notebook — presumably a
    # prior for the correct-answer rate; confirm.
    submission["answered_correctly"] = submission["answered_correctly"].fillna(0.707)

    env.predict(submission)