# Competition

・[Riiid Answer Correctness Prediction](https://www.kaggle.com/c/riiid-test-answer-prediction)

# Overview

・TOEICの学習アプリに関して、生徒が問題に正答できるかどうかを予測する

# Evaluation

・[Area under the ROC curve (AUC)](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)：予測確率と正解ラベルから求めたROC曲線の下の面積で評価する

# Module

In [None]:
import gc
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
import riiideducation
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from matplotlib.ticker import FuncFormatter
from sklearn.model_selection import train_test_split
from pandas.api.types import is_categorical_dtype
from pandas.api.types import is_datetime64_any_dtype as is_datetime

# Datasets

#### train：学習データ

In [None]:
train = pd.read_pickle('../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip')
print(train.shape)
train.head(10)

In [None]:
# Data dictionary for the train table (column name -> meaning), shown as a table.
# The target column is spelled `answered_correctly`, matching the code below.
pd.DataFrame([['row_id', '行ID'],
              ['timestamp', '新規登録からイベント完了までの時間（ミリ秒単位）'],
              ['user_id', 'ユーザID'],
              ['content_id', 'コンテンツID'],
              ['content_type_id', 'イベント（問題なら0、講義なら1）'],
              ['task_container_id', '問題または講義のバッチID'],
              ['user_answer', 'ユーザが回答した数字（イベントが講義なら-1）'],
              ['answered_correctly', 'ユーザの正答状況（イベントが講義なら-1）'],
              ['prior_question_elapsed_time', '前の問題群（大問）の回答にかかった平均時間（ミリ秒単位）'],
              ['prior_question_had_explanation', '前の大問に回答した後、解説を見たかどうか']],
              columns=['カラム', '意味'])

#### questions：問題データ

In [None]:
questions = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
print(questions.shape)
questions.head(10)

In [None]:
# Data dictionary for the questions table (column name -> meaning).
# `question_id` is joined against train's `content_id` later in this notebook,
# and the tag column is read as `questions.tags` (plural).
pd.DataFrame([['question_id', 'train「content_id」の外部キー（content_type_idが0＝問題のとき）、小問の番号'],
              ['bundle_id', '大問の番号'],
              ['correct_answer', '問題の解答、train「user_answer」と照合可'],
              ['content_id', 'コンテンツID'],
              ['part', 'TOEICテストのセクション'],
              ['tags', '問題のタグ']],
              columns=['カラム', '意味'])

#### lectures：講義データ

In [None]:
lectures = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')
print(lectures.shape)
lectures.head(10)

In [None]:
pd.DataFrame([['lecture_id', '講義ID'],
              ['part', '講義のカテゴリーコード'],
              ['tag', '講義のタグ'],
              ['type_of', '講義の目的']],
              columns=['カラム', '意味'])

# Info

#### train

In [None]:
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].astype('boolean')
train.dtypes

In [None]:
train.isnull().sum()

In [None]:
train.nunique()

#### questions

In [None]:
questions.dtypes

In [None]:
questions.isnull().sum()

In [None]:
questions.nunique()

#### lectures

In [None]:
lectures.dtypes

In [None]:
lectures.isnull().sum()

In [None]:
lectures.nunique()

# Memory

In [None]:
train.memory_usage(deep=True)

In [None]:
questions.memory_usage(deep=True)

In [None]:
# Memory footprint of the lectures table (the previous cell already covers questions).
lectures.memory_usage(deep=True)

# EDA

In [None]:
sns.set()

#### train

・コンテンツIDのうち、講義でなく問題であるものは、13523個ある<br>
・問題または講義のバッチIDは、10000個ある

In [None]:
print('content ids: ', train.content_id.nunique())
print('questions: ', train[train.content_type_id == False].content_id.nunique())

・コンテンツIDのうち、頻出度の高い上位30個をプロットする

In [None]:
cids = train.content_id.value_counts()[:30]

fig = plt.figure(figsize=(12,6))
ax = cids.plot.bar()
plt.title('Thirty most used content ids')
plt.xticks(rotation=90)
plt.show()

・ユーザが回答した数字（イベントが講義なら-1）について、それぞれの出現回数

In [None]:
train['user_answer'].value_counts()

・1ミリ秒は、1秒の1000分の1である

In [None]:
fig = plt.figure(figsize=(12,6))
train['timestamp'].plot.hist(bins=100)
plt.title("Histogram of timestamp")
plt.xticks(rotation=0)
plt.show()

・ユーザ数に対して、timestampが0であるユーザの数

In [None]:
print(train['user_id'].nunique())
print(train.query('timestamp == 0')['user_id'].nunique())

・正答できた数と正答できなかった数

In [None]:
correct = train[train.answered_correctly != -1].answered_correctly.value_counts(ascending=True)

fig = plt.figure(figsize=(12,4))
correct.plot.barh()
for i, v in zip(correct.index, correct.values):
    plt.text(v, i, '{:,}'.format(v), color='white', fontweight='bold', fontsize=14, ha='right', va='center')
plt.title('Questions answered correctly')
plt.xticks(rotation=0)
plt.show()

・timestampの分位点で5つのビンに分け、ビンごとの正解率を求める

In [None]:
bin_labels_5 = ['Bin_1', 'Bin_2', 'Bin_3', 'Bin_4', 'Bin_5']
train['ts_bin'] = pd.qcut(train['timestamp'], q=5, labels=bin_labels_5)

def correct(field):
    """Return the share of correctly answered questions per value of ``field``.

    Reads the module-level ``train`` frame, drops rows whose
    ``answered_correctly`` is -1, and averages the 0/1 ``answered_correctly``
    flag within each group. The mean of a 0/1 flag equals
    right / (right + wrong), and it stays defined for groups that contain only
    right or only wrong answers (a count pivot would yield NaN there).

    Args:
        field: Name of the ``train`` column to group by.

    Returns:
        A Series named ``Percent_correct``, indexed by the values of ``field``,
        rounded to 2 decimals and sorted from highest to lowest.
    """
    answered = train[train.answered_correctly != -1]
    # observed=False keeps empty categories (as NaN) when ``field`` is categorical.
    rate = answered.groupby(field, observed=False)['answered_correctly'].mean()
    rate = rate.round(2).rename('Percent_correct')
    return rate.sort_values(ascending=False)

bins_correct = correct('ts_bin')
bins_correct = bins_correct.sort_index()

fig = plt.figure(figsize=(12,6))
plt.bar(bins_correct.index, bins_correct.values)
for i, v in zip(bins_correct.index, bins_correct.values):
    plt.text(i, v, v, color='white', fontweight='bold', fontsize=14, va='top', ha='center')
plt.title('Percent answered_correctly for 5 bins of timestamp')
plt.xticks(rotation=0)
plt.show()

・task_container_id（問題または講義のバッチID）ごとに正解率を求め、その分布をヒストグラムで表示する

In [None]:
task_id_correct = correct('task_container_id')

fig = plt.figure(figsize=(12,6))
task_id_correct.plot.hist(bins=40)
plt.title('Histogram of percent_correct grouped by task_container_id')
plt.xticks(rotation=0)
plt.show()

In [None]:
train['task_container_id'].unique()

・ユーザごとに、正解率（answered_correctlyの平均）と回答数を集約する

In [None]:
user_percent = train[train.answered_correctly != -1].groupby('user_id')['answered_correctly'].agg(Mean='mean', Answers='count')
print(user_percent.Answers.max())

・回答数が1000以下のユーザに絞って、回答数と正解率をプロットする

In [None]:
user_percent = user_percent.query('Answers <= 1000').sample(n=200, random_state=1)

fig = plt.figure(figsize=(12,6))
x = user_percent.Answers
y = user_percent.Mean
plt.scatter(x, y, marker='o')
plt.title('Percent answered correctly versus number of questions answered User')
plt.xticks(rotation=0)
plt.xlabel('Number of questions answered')
plt.ylabel('Percent answered correctly')
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
plt.plot(x,p(x),"r--")

plt.show()

・content_id も同様に、正解率（answered_correctlyの平均）と回答数を集約する

In [None]:
content_percent = train[train.answered_correctly != -1].groupby('content_id')['answered_correctly'].agg(Mean='mean', Answers='count')
print(content_percent.Answers.max())
print(len(content_percent[content_percent.Answers > 25000]))

・回答数が25000を超えるものは除去し、残りの中から200サンプルを抽出して、正解率と回答数をプロットする

In [None]:
content_percent = content_percent.query('Answers <= 25000').sample(n=200, random_state=1)

fig = plt.figure(figsize=(12,6))
x = content_percent.Answers
y = content_percent.Mean
plt.scatter(x, y, marker='o')
plt.title("Percent answered correctly versus number of questions answered Content_id")
plt.xticks(rotation=0)
plt.xlabel("Number of questions answered")
plt.ylabel("Percent answered correctly")
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
plt.plot(x,p(x),"r--")

plt.show()

・prior_question_had_explanation（前の大問に回答した後、解説を見たかどうか）をグループ化し、正解率を比較する

In [None]:
pq = train[train.answered_correctly != -1].groupby(['prior_question_had_explanation'], dropna=False).agg({'answered_correctly': ['mean', 'count']})
print(pq.iloc[:,1])
pq = pq.iloc[:,0]

fig = plt.figure(figsize=(12,4))
pq.plot.barh()
plt.title("Answered_correctly versus Prior Question had explanation")
plt.xlabel("Percent answered correctly")
plt.ylabel("Prior question had explanation")
plt.xticks(rotation=0)
plt.show()

・prior_question_elapsed_time（前の大問の回答にかかった平均時間）とanswered_correctly（ユーザの正答状況）について、プロットする

In [None]:
mean_pq = train.prior_question_elapsed_time.astype("float64").mean()

condition = ((train.answered_correctly != -1) & (train.prior_question_elapsed_time.notna()))
pq = train[condition][['prior_question_elapsed_time', 'answered_correctly']].sample(n=200, random_state=1)
pq = pq.set_index('prior_question_elapsed_time').iloc[:,0]

fig = plt.figure(figsize=(12,6))
x = pq.index
y = pq.values
plt.scatter(x, y, marker='o')
plt.title("Answered_correctly versus prior_question_elapsed_time")
plt.xticks(rotation=0)
plt.xlabel("Prior_question_elapsed_time")
plt.ylabel("Answered_correctly")
plt.vlines(mean_pq, ymin=-0.1, ymax=1.1, color='black')
plt.text(x= 27000, y=0.4, s='mean')
plt.text(x=80000, y=0.6, s='trend')
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
plt.plot(x,p(x),"r--")
plt.show()

#### questions

・questionsのtagsには、欠損値が1つある

In [None]:
questions[questions.tags.isna()]

・trainでは、この質問は1度しかされていないことが分かる

In [None]:
train.query('content_id == 10033 and answered_correctly != -1')

In [None]:
questions['tags'] = questions['tags'].astype(str)

tags = [x.split() for x in questions[questions.tags != "nan"].tags.values]
tags = [item for elem in tags for item in elem]
tags = set(tags)
tags = list(tags)
print(f'There are {len(tags)} different tags')

・questionsについて、正答と誤答の個数を調べる

In [None]:
tags_list = [x.split() for x in questions.tags.values]
questions['tags'] = tags_list
questions.head()

correct = train[train.answered_correctly != -1].groupby(["content_id", 'answered_correctly'], as_index=False).size()
correct = correct.pivot(index= "content_id", columns='answered_correctly', values='size')
correct.columns = ['Wrong', 'Right']
correct = correct.fillna(0)
correct[['Wrong', 'Right']] = correct[['Wrong', 'Right']].astype(int)
questions = questions.merge(correct, left_on = "question_id", right_on = "content_id", how = "left")
questions.head()

・tag（問題タグ）ごとに正答数と誤答数をまとめ、正解率を求める

In [None]:
# Aggregate right/wrong answer counts per tag.
# Rows are collected in a list and turned into a frame once, because
# DataFrame.append no longer exists in pandas 2.x.
tag_rows = []
for tag in tags:
    # questions.tags holds a list of tag strings per question.
    tagged = questions[questions.tags.apply(lambda tag_list: tag in tag_list)]
    wrong = tagged['Wrong'].sum()
    right = tagged['Right'].sum()
    tag_rows.append({'tag': tag,
                     'Wrong': wrong,
                     'Right': right,
                     'Total_questions': wrong + right,
                     'Question_ids_with_tag': len(tagged)})

# Explicit column order keeps the layout defined even when `tags` is empty.
tags_df = pd.DataFrame(tag_rows,
                       columns=['tag', 'Wrong', 'Right', 'Total_questions', 'Question_ids_with_tag'])
tags_df = tags_df.set_index('tag')

tags_df[['Wrong', 'Right', 'Total_questions']] = tags_df[['Wrong', 'Right', 'Total_questions']].astype(int)
tags_df['Percent_correct'] = tags_df.Right/tags_df.Total_questions
tags_df = tags_df.sort_values(by = "Percent_correct")

tags_df.head()

・tagごとの正解率をプロットする

In [None]:
# tags_df is sorted by Percent_correct ascending, so the head holds the hardest
# tags and the tail the easiest. Take up to ten from each end without assuming
# a fixed number of tags.
n_each = min(10, len(tags_df) // 2)
tags_select = pd.concat([tags_df.head(n_each), tags_df.tail(n_each)])['Percent_correct']

fig = plt.figure(figsize=(12,6))
# Red for tags answered correctly less than 60% of the time, green otherwise.
clrs = ['red' if pct < 0.6 else 'green' for pct in tags_select.values]
tags_select.plot.bar(color=clrs)
plt.title("Ten hardest and ten easiest tags")
plt.xlabel("Tag")
plt.ylabel("Percent answers correct of questions with the tag")
plt.xticks(rotation=90)
plt.show()

・最も正解率の低いタグには、約25万件の回答しかないことが分かる

In [None]:
tags_select = tags_df.sort_values(by = "Total_questions", ascending = False).iloc[:30,:]
tags_select = tags_select["Total_questions"]

fig = plt.figure(figsize=(12,6))
ax = tags_select.plot.bar()
plt.title("Thirty tags with most questions answered")
plt.xticks(rotation=90)
plt.ticklabel_format(style='plain', axis='y')
ax.get_yaxis().set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ','))) #add thousands separator
plt.show()

・TOEICのパートは、次のような構成となっている

・リスニングセクション：パート1～4（リスニングセクション（約45分、100問））<br>
・リーディングセクション：パート5～7（リーディングセクション（75分、100問））

・パートごとのquestion_idのカウント数と正解率を表示する

In [None]:
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
ax1 = questions.groupby("part").count()['question_id'].plot.bar()
plt.title("Counts of part")
plt.xlabel("Part")
plt.xticks(rotation=0)

part = questions.groupby('part').agg({'Wrong': ['sum'], 'Right': ['sum']})
part['Percent_correct'] = part.Right/(part.Right + part.Wrong)
part = part.iloc[:,2]

ax2 = fig.add_subplot(212)
plt.bar(part.index, part.values)
for i, v in zip(part.index, part.values):
    plt.text(i, v, round(v,2), color='white', fontweight='bold', fontsize=14, va='top', ha='center')

plt.title("Percent_correct by part")
plt.xlabel("Part")
plt.xticks(rotation=0)
plt.tight_layout(pad=2)
plt.show()

#### Lectures

# Reset

・ノートブックを全てリセットする

In [None]:
%reset -f

# Module

In [None]:
import gc
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
import riiideducation
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from matplotlib.ticker import FuncFormatter
from sklearn.model_selection import train_test_split
from pandas.api.types import is_categorical_dtype
from pandas.api.types import is_datetime64_any_dtype as is_datetime

# Datasets

#### ・train：学習データ

In [None]:
cols_to_load = ['row_id',
                'user_id',
                'answered_correctly',
                'content_id',
                'prior_question_had_explanation',
                'prior_question_elapsed_time']

In [None]:
train = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")[cols_to_load]
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].astype('boolean')
print(train.shape)
train.head(10)

In [None]:
# Data dictionary for the train table (column name -> meaning), shown as a table.
# The target column is spelled `answered_correctly`, matching the code below.
pd.DataFrame([['row_id', '行ID'],
              ['timestamp', '新規登録からイベント完了までの時間（ミリ秒単位）'],
              ['user_id', 'ユーザID'],
              ['content_id', 'コンテンツID'],
              ['content_type_id', 'イベント（問題なら0、講義なら1）'],
              ['task_container_id', '問題または講義のバッチID'],
              ['user_answer', 'ユーザが回答した数字（イベントが講義なら-1）'],
              ['answered_correctly', 'ユーザの正答状況（イベントが講義なら-1）'],
              ['prior_question_elapsed_time', '前の問題群（大問）の回答にかかった平均時間（ミリ秒単位）'],
              ['prior_question_had_explanation', '前の大問に回答した後、解説を見たかどうか']],
              columns=['カラム', '意味'])

#### ・question：問題データ

In [None]:
# Load the questions table (this cell previously read lectures.csv by mistake).
question = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
print(question.shape)
question.head(10)

In [None]:
# Data dictionary for the questions table (column name -> meaning).
# `question_id` corresponds to train's `content_id` for question rows, and the
# tag column of questions.csv is named `tags` (plural).
pd.DataFrame([['question_id', 'train「content_id」の外部キー（content_type_idが0＝問題のとき）、小問の番号'],
              ['bundle_id', '大問の番号'],
              ['correct_answer', '問題の解答、train「user_answer」と照合可'],
              ['content_id', 'コンテンツID'],
              ['part', 'TOEICテストのセクション'],
              ['tags', '問題のタグ']],
              columns=['カラム', '意味'])

#### ・lecture：講義データ

In [None]:
lecture = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')
print(lecture.shape)
lecture.head(10)

In [None]:
pd.DataFrame([['lecture_id', '講義ID'],
              ['part', '講義のカテゴリーコード'],
              ['tag', '講義のタグ'],
              ['type_of', '講義の目的']],
              columns=['カラム', '意味'])

# Feature Engineering

・ユーザごとの回答数と正解率（answered_correctlyの平均）

In [None]:
user_df = train[train.answered_correctly != -1].groupby('user_id').agg({'answered_correctly': ['count', 'mean']}).reset_index()
user_df.columns = ['user_id', 'user_questions', 'user_mean']
user_df.head()

・ユーザごとに、講義を1回以上視聴したかどうかのフラグ（watches_lecture）

In [None]:
# Flag each user with 1 if they have at least one lecture row
# (answered_correctly == -1), else 0.
# Computed as a per-user boolean "any" rather than via a count cast to int8,
# which would wrap around for users with more than 127 lecture rows, and
# without assuming that -1, 0 and 1 all occur in the data.
user_lect = (
    train['answered_correctly'].eq(-1)
    .groupby(train['user_id'])
    .any()
    .astype('int8')
    .rename('watches_lecture')
    .reset_index()
)
user_lect.head()

In [None]:
user_df = user_df.merge(user_lect, on = "user_id", how = "left")
del user_lect
user_df.head()

・コンテンツごとの回答数と正解率（answered_correctlyの平均）

In [None]:
content_df = train[train.answered_correctly != -1].groupby('content_id').agg({'answered_correctly': ['count', 'mean']}).reset_index()
content_df.columns = ['content_id', 'content_questions', 'content_mean']
content_df.head()

・バリデーションした行IDを、Kaggleのデータセットから引っ張ってくる

In [None]:
cv2_train = pd.read_pickle("../input/riiid-cross-validation-files/cv2_train.pickle")['row_id']
cv2_valid = pd.read_pickle("../input/riiid-cross-validation-files/cv2_valid.pickle")['row_id']

・trainのイベントが講義でないものに絞り込む

In [None]:
# Keep only rows with a real answer (answered_correctly == -1 is dropped).
train = train[train.answered_correctly != -1]
# Mean elapsed time over all remaining rows; used later as the fill value for
# missing prior_question_elapsed_time in train, validation and test.
mean_prior = train['prior_question_elapsed_time'].astype("float64").mean()

# Split by the row_id lists loaded from the cross-validation files.
# NOTE(review): user_df and content_df were aggregated from `train` before this
# split, so they also contain the validation rows' labels — the validation
# score may be optimistic (target leakage); confirm whether this is intended.
validation = train[train['row_id'].isin(cv2_valid)]
train = train[train['row_id'].isin(cv2_train)]

# row_id was only needed for the split.
validation = validation.drop(columns = 'row_id')
train = train.drop(columns = 'row_id')

# Free the row_id lists before the merges that follow.
del cv2_train, cv2_valid
gc.collect()

・trainとvalidationに同じ処理をする

In [None]:
# Attach per-user and per-content aggregates to every training row.
train = train.merge(user_df, on = "user_id", how = "left")
train = train.merge(content_df, on = "content_id", how = "left")

# Fill gaps with neutral defaults in one non-inplace call.
# `df[col].fillna(..., inplace=True)` is chained assignment: it warns on recent
# pandas and silently does nothing under copy-on-write.
train = train.fillna({
    'content_questions': 0,
    'content_mean': 0.5,
    'watches_lecture': 0,
    'user_questions': 0,
    'user_mean': 0.5,
    'prior_question_elapsed_time': mean_prior,
    'prior_question_had_explanation': False,
})

# Encode the boolean flag as integers; the fitted encoder is reused for
# validation and test.
label_enc = preprocessing.LabelEncoder()
label_enc.fit(train['prior_question_had_explanation'])
train['prior_question_had_explanation'] = label_enc.transform(train['prior_question_had_explanation'])
train[['content_questions', 'user_questions']] = train[['content_questions', 'user_questions']].astype(int)
train.head()

In [None]:
# Same feature preparation as for train, applied to the validation rows.
validation = validation.merge(user_df, on = "user_id", how = "left")
validation = validation.merge(content_df, on = "content_id", how = "left")

# Non-inplace fill with the same defaults as train; chained
# `df[col].fillna(..., inplace=True)` is unreliable on recent pandas.
validation = validation.fillna({
    'content_questions': 0,
    'content_mean': 0.5,
    'watches_lecture': 0,
    'user_questions': 0,
    'user_mean': 0.5,
    'prior_question_elapsed_time': mean_prior,
    'prior_question_had_explanation': False,
})

# Reuse the encoder fitted on train so both frames share the same codes.
validation['prior_question_had_explanation'] = label_enc.transform(validation['prior_question_had_explanation'])
validation[['content_questions', 'user_questions']] = validation[['content_questions', 'user_questions']].astype(int)
validation.head()

# Preprocess

In [None]:
# Columns fed to the model.
features = ['user_questions', 'user_mean', 'content_questions', 'content_mean', 'prior_question_elapsed_time']

# Down-sample the training rows to at most 10M; capping at len(train) avoids
# the ValueError that sample() raises when fewer rows are available.
train = train.sample(n=min(10000000, len(train)), random_state = 1)

# Separate targets from feature matrices.
y_train = train['answered_correctly']
train = train[features]

y_val = validation['answered_correctly']
validation = validation[features]

print(train.shape)
print(validation.shape)

In [None]:
print(y_train.shape)
print(y_val.shape)

# Modeling

In [None]:
'''
def create_model(trial):
    num_leaves = trial.suggest_int('num_leaves', 2, 30)
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    min_child_samples = trial.suggest_int('min_child_samples', 100, 1200)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 5, 90)
    bagging_freq = trial.suggest_int('bagging_freq', 1, 7)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.0001, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0.0001, 1.0)
    subsample = trial.suggest_uniform('subsample', 0.1, 1.0)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.1, 1.0)
    
    model = lgb.LGBMClassifier(
        num_leaves=num_leaves,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth, 
        min_child_samples=min_child_samples, 
        min_data_in_leaf=min_data_in_leaf,
        bagging_freq=bagging_freq,
        bagging_fraction=bagging_fraction,
        feature_fraction=feature_fraction,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=666)
    return model

def objective(trial):
    model = create_model(trial)
    model.fit(train, y_train)
    y_pred = model.predict_proba(validation)[:,1]
    roc = roc_auc_score(y_val, y_pred)
    return roc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40)
params = study.best_params
print(params)
'''

In [None]:
# Hyperparameters passed to lgb.LGBMClassifier. The keys match the ones tuned in
# the disabled Optuna search above; the values were presumably copied from
# study.best_params of an earlier run — TODO confirm.
# NOTE(review): according to the LightGBM parameter docs, several keys here look
# like aliases of each other (min_child_samples / min_data_in_leaf,
# subsample / bagging_fraction, colsample_bytree / feature_fraction) yet carry
# different values — confirm which one LightGBM actually applies.
params = {'num_leaves': 30,
          'n_estimators': 130,
          'learning_rate': 0.2063142389801971,
          'max_depth': 10,
          'min_child_samples': 859,
          'min_data_in_leaf': 12,
          'bagging_freq': 5,
          'bagging_fraction': 0.49172530042862095,
          'feature_fraction': 0.8948054696257811,
          'subsample': 0.8841461600513105,
          'colsample_bytree': 0.8277289106661925,
          'random_state': 666}

In [None]:
cls = lgb.LGBMClassifier(**params)
cls.fit(train, y_train)

In [None]:
sns.set()
lgb.plot_importance(cls)
plt.show()

# Prediction

In [None]:
exsample_test = pd.read_csv('../input/riiid-test-answer-prediction/example_test.csv')
print(exsample_test.shape)
exsample_test.head()

In [None]:
example_submission = pd.read_csv('../input/riiid-test-answer-prediction/example_sample_submission.csv')
print(example_submission.shape)
example_submission.head()

In [None]:
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
# Score every batch served by the competition API.
for (test, sample_prediction) in iter_test:
    # Attach the per-user and per-content aggregates computed from train.
    test = test.merge(user_df, on = 'user_id', how = 'left')
    test = test.merge(content_df, on = 'content_id', how = 'left')

    # Same neutral defaults as for train/validation, applied non-inplace
    # (chained `df[col].fillna(..., inplace=True)` is unreliable on recent pandas).
    test = test.fillna({
        'content_questions': 0,
        'content_mean': 0.5,
        'watches_lecture': 0,
        'user_questions': 0,
        'user_mean': 0.5,
        'prior_question_elapsed_time': mean_prior,
        'prior_question_had_explanation': False,
    })

    # Cast to plain bool so the encoder fitted on train sees only True/False.
    test['prior_question_had_explanation'] = label_enc.transform(test['prior_question_had_explanation'].astype(bool))
    test[['content_questions', 'user_questions']] = test[['content_questions', 'user_questions']].astype(int)

    # The metric is ROC AUC, so submit the probability of the positive class
    # rather than hard 0/1 labels from cls.predict().
    test['answered_correctly'] = cls.predict_proba(test[features])[:, 1]

    # Only question rows (content_type_id == 0) are submitted.
    env.predict(test.loc[test['content_type_id'] == 0, ['row_id', 'answered_correctly']])

・content_typeを指定して、講義データを除去している

# References

・[Riiid: Comprehensive EDA + Baseline](https://www.kaggle.com/erikbruin/riiid-comprehensive-eda-baseline)<br>