# Riiid! Answer Correctness Prediction

## In-depth Introduction

> **timestamp**: (int64) the time in milliseconds between this user interaction and the first event completion from that user.

> content_type_id: (int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.

> task_container_id : (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

> user_answer : (int8) the user's answer to the question, if any. Read -1 as null, for lectures.

> answered_correctly : (int8) if the user responded correctly. Read -1 as null, for lectures.

> prior_question_elapsed_time : (float32) The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.

> prior_question_had_explanation: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import riiideducation
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import lightgbm as lgb
from scipy.stats import pearsonr
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import pandas_profiling
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
import plotly_express as px
from collections import Counter
from catboost import CatBoostClassifier
import shap
sns.set_style(style="whitegrid")

### Training data is in the competition dataset as usual
It's larger than will fit in memory with default settings, so we'll specify more efficient datatypes and only load a subset of the data for now.

In [None]:
# Compact dtypes keep the training sample small in memory.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

# Only the first 10**5 rows are read; row_id carries no signal and is dropped.
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    low_memory=False,
    nrows=10**5,
    dtype=train_dtypes,
)
train_df.drop(columns="row_id", inplace=True)

# Question and lecture metadata tables.
questions = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/questions.csv")
lectures = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/lectures.csv")

## Data Visualization 

### Training set

In [None]:
# Scatter of prior_question_elapsed_time against the row index.
fig = px.scatter(train_df["prior_question_elapsed_time"])
fig.show()

In [None]:
# 2x2 grid of value counts for the main categorical columns.
fig, ax = plt.subplots(figsize=(11, 7), nrows=2, ncols=2)
count_columns = [
    "user_answer",
    "answered_correctly",
    "prior_question_had_explanation",
    "content_type_id",
]
# ax.ravel() walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for column, axis in zip(count_columns, ax.ravel()):
    # Pass the series as `x=` explicitly; a positional Series is interpreted
    # as `data` by newer seaborn releases and is deprecated in older ones.
    sns.countplot(x=train_df[column], ax=axis)

In [None]:
# Mean prior_question_elapsed_time for each value of prior_question_had_explanation.
sns.barplot(
    data=train_df,
    x="prior_question_had_explanation",
    y="prior_question_elapsed_time",
)

In [None]:
# Mean answered_correctly for each value of prior_question_had_explanation.
sns.barplot(
    data=train_df,
    x="prior_question_had_explanation",
    y="answered_correctly",
)

### Questions

In [None]:
# Display the raw questions table.
questions

In [None]:
# Split the space-separated tag string into one column per tag position.
tag = questions["tags"].str.split(" ", expand=True)
# Name the columns from the actual width (tag1, tag2, ...) instead of assuming
# exactly six, so a different maximum tag count does not raise a length mismatch.
tag.columns = [f"tag{position}" for position in range(1, tag.shape[1] + 1)]

In [None]:
# Missing tag slots become 0, then every tag id is cast to an integer.
tag = tag.fillna(0).astype(int)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode each distinct tag combination (as a string) into an integer label.
questions['tags'] = questions['tags'].astype(str)
le = LabelEncoder().fit(questions["tags"])
questions["labels"] = le.transform(questions["tags"])

In [None]:
# Display questions with the new "labels" column.
questions

In [None]:
# Append the per-position tag columns to the questions table (same row index).
questions = questions.join(tag)

In [None]:
# Display questions with the tag columns attached.
questions

In [None]:
# Keep the tags of each question as a list, then flatten all lists into one
# sequence so individual tag frequencies can be counted.
questions["tag"] = questions["tags"].astype(str).str.split()
tags = [single_tag for tag_list in questions.tag for single_tag in tag_list]

In [None]:
# Frequency of every individual tag, as a two-column frame for plotting.
tag_count = Counter(tags)
tag, count = list(tag_count), list(tag_count.values())
tag_counts = pd.DataFrame({"tag": tag, "count": count})

In [None]:
# Horizontal bar chart: one bar per tag, bar length = number of occurrences.
fig = px.bar(tag_counts, y='tag', x='count', orientation='h', width=800, height=900)
fig.show()

In [None]:
# Number of questions in each "part".
questions["part"].value_counts()

In [None]:
# Keep question rows only (content_type_id == 0) and attach question metadata.
question_rows = train_df[train_df['content_type_id'] == 0]
train_df = question_rows.merge(
    questions,
    how='left',
    left_on='content_id',
    right_on='question_id',
)
# Drop the join key duplicate and the raw/list tag columns.
train_df.drop(columns=["question_id", "bundle_id", "tags", "tag"], inplace=True)

## Feature Engineering

In [None]:
# Count missing values in each column.
train_df.isna().sum()

In [None]:
# Fill missing elapsed times with the column mean, then scale the column by
# its mean so values are expressed relative to the average.
# The filled series is assigned back explicitly: calling fillna(inplace=True)
# on a column selection is chained assignment and does not modify train_df
# when pandas Copy-on-Write is active.
elapsed_time = train_df["prior_question_elapsed_time"]
elapsed_time = elapsed_time.fillna(elapsed_time.mean())
train_df["prior_question_elapsed_time"] = elapsed_time / elapsed_time.mean()

In [None]:
# Missing "had explanation" flags are treated as False, then cast to plain bool.
had_explanation = train_df["prior_question_had_explanation"].fillna(value=False)
train_df["prior_question_had_explanation"] = had_explanation.astype(bool)

In [None]:
# Encode the boolean prior_question_had_explanation flag as 1 / 0.
train_df["prior_question_had_explanation"] = train_df["prior_question_had_explanation"].map({True:1, False:0})

In [None]:
# Replace each part id (1-7) with a fixed number.
# NOTE(review): presumably the number of questions in that test section --
# confirm the source of these constants. The same mapping is not applied to
# test batches in the submission loop.
train_df["part"] = train_df["part"].map({1:6, 2:25, 3:39, 4:30, 5:30, 6:16, 7:54})

In [None]:
# Remove rows whose answered_correctly is -1 (the null marker used for lectures).
lecture_rows = train_df.index[train_df["answered_correctly"] == -1]
train_df.drop(index=lecture_rows, inplace=True)

In [None]:
# Per-question statistics of answered_correctly, keyed by content_id.
# NOTE(review): np.cumsum yields one value per row rather than one per group,
# unlike the other entries; check what `content_cumsum` actually holds after
# this agg and whether it is aligned with content_id.
# NOTE(review): these statistics are derived from the target over all loaded
# rows, including rows that later land in the hold-out split.
content_id = train_df.groupby('content_id')
grouped_answer = content_id.agg({ 'answered_correctly': [np.mean, np.std, np.median, np.cumsum, 'count', "sum", "skew"]}).copy()
# Flatten the two-level column index into plain feature names.
grouped_answer.columns = ["Content_Mean", "content_Std", "content_Median", 'content_cumsum', 'question_asked', "Sum_by_content", "skew_content"]
grouped_answer.index.names = ['content_id']

In [None]:
# Per-user statistics of answered_correctly, keyed by user_id.
# NOTE(review): np.cumsum yields one value per row rather than one per group,
# unlike the other entries; check what `user_cumsum` actually holds after
# this agg and whether it is aligned with user_id.
user_id = train_df.groupby('user_id')
grouped_user_id = user_id.agg({'answered_correctly':[np.mean, np.std, np.median, np.cumsum, 'count', "sum", "skew" ]}).copy()
# Flatten the two-level column index into plain feature names.
grouped_user_id.columns = ["user_Mean", "user_Std", "user_Median", 'user_cumsum', "question_answered", "Sum_by_user", "skew_user"]
grouped_user_id.index.names = ['user_id']

In [None]:
# Per-user mean and sum of the (remapped) "part" column.
part_id = train_df.groupby('user_id')
part_user_id = part_id["part"].agg([np.mean, "sum"])
part_user_id.columns = ["Mean_questions", "Total_questions"]

In [None]:
# Per-task-container statistics of answered_correctly, keyed by task_container_id.
# NOTE(review): np.cumsum yields one value per row rather than one per group,
# unlike the other entries; check what `task_cumsum` actually holds after
# this agg and whether it is aligned with task_container_id.
task_container = train_df.groupby("task_container_id")
task_id = task_container.agg({'answered_correctly':[np.mean, np.std, np.cumsum, 'count', 'sum', "skew"]})
# Flatten the two-level column index into plain feature names.
task_id.columns = ["task_Mean", "task_std", 'task_cumsum', 'count_by_task', "Sum_by_task", "skew_task"]
task_id.index.names = ['task_container_id']

In [None]:
# Attach every aggregate table to the training rows via a left join on its key.
aggregate_tables = (
    (grouped_answer, 'content_id'),
    (grouped_user_id, 'user_id'),
    (part_user_id, "user_id"),
    (task_id, "task_container_id"),
)
for stats, key in aggregate_tables:
    train_df = train_df.merge(stats, how='left', on=key)

In [None]:
# Share of a user's answers that were correct, and its complement.
user_ratio = train_df["Sum_by_user"] / train_df["question_answered"]
train_df["user_correctness"] = user_ratio
train_df["user_uncorrectness"] = 1 - user_ratio

In [None]:
# Share of correct answers per question, and its complement.
train_df["content_correctness"] = train_df["Sum_by_content"].div(train_df["question_asked"])
train_df["content_uncorrectness"] = 1 - train_df["content_correctness"]

In [None]:
# Share of correct answers per task container, and its complement.
task_ratio = train_df["Sum_by_task"] / train_df["count_by_task"]
train_df = train_df.assign(
    task_correctness=task_ratio,
    task_uncorrectness=1 - task_ratio,
)

In [None]:
# Express timestamps relative to the mean timestamp of the training sample.
timestamp_mean = train_df["timestamp"].mean()
train_df["timestamp"] = train_df["timestamp"].div(timestamp_mean)

In [None]:
# Display the training frame with the engineered columns.
train_df

### Lag feature

In [None]:
# lag_k holds the timestamp found k rows earlier in the frame.
# The shift runs over the whole frame and is not grouped by user.
for lag in range(1, 7):
    train_df[f"lag_{lag}"] = train_df["timestamp"].shift(lag)

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Autocorrelation and partial autocorrelation of the timestamp column, up to lag 10.
plot_acf(train_df["timestamp"], lags=10)
plot_pacf(train_df["timestamp"], lags=10)

### Sliding window

In [None]:
# Mean timestamp over a trailing window of six rows.
timestamp_window = train_df["timestamp"].rolling(6)
train_df["rolling_mean"] = timestamp_window.mean()

In [None]:
# Running mean of all timestamps seen so far; needs at least two observations.
train_df["expanding_mean"] = train_df["timestamp"].expanding(min_periods=2).mean()

In [None]:
# Replace every remaining missing value (lag/rolling warm-up rows, undefined std/skew) with 0.
train_df = train_df.fillna(0)

In [None]:
# content_type_id is constant after the question-only filter, so it is dropped.
train_df = train_df.drop(columns="content_type_id")

In [None]:
# List the columns available for feature selection.
train_df.columns

In [None]:
# Columns fed to the models, in the order the models see them.
features = [
    # raw / normalised inputs
    'timestamp', 'prior_question_elapsed_time',
    # per-question aggregates
    'Content_Mean', 'content_Std', 'question_asked', 'Sum_by_content', 'skew_content',
    # per-user aggregates
    'user_Mean', 'question_answered', 'skew_user', 'Mean_questions',
    # per-task-container aggregates
    'task_Mean', 'task_std', 'skew_task',
    # derived ratios
    'content_uncorrectness', 'task_uncorrectness',
    # timestamp lag / window features
    'lag_2', 'rolling_mean', 'expanding_mean',
    # cumulative-sum columns
    'content_cumsum', 'user_cumsum', 'task_cumsum',
]

In [None]:
# Number of selected features.
len(features)

In [None]:
# Pairwise correlation of the selected features, drawn as a heatmap.
train_corr = train_df[features].corr()
plt.figure(figsize=(10, 10))
sns.heatmap(train_corr)

## Modelling

In [None]:
# reset_index returns a new frame; assign it back so the reset actually
# takes effect (the original call discarded the result).
train_df = train_df.reset_index(drop=True)

# Split into predictors and the binary target.
X = train_df.drop(columns="answered_correctly")
y = train_df["answered_correctly"]

In [None]:
# Random 70/30 hold-out split on the selected features.
# NOTE(review): the aggregate features were computed from answered_correctly
# over all rows before this split, so hold-out scores include target leakage.
x_train, x_test, y_train, y_test = train_test_split(X[features], y, test_size=0.3, random_state=42)

### LightGBM

In [None]:
# Wrap the train / hold-out splits in LightGBM's Dataset container.
train = lgb.Dataset(data=x_train, label=y_train)
test = lgb.Dataset(data=x_test, label=y_test)

In [None]:
# Baseline booster settings: binary objective scored by AUC.
params = {
    'objective': 'binary',
    'seed': 42,
    'metric': 'auc',
    'learning_rate': 0.001,
    'max_bin': 1500,
    'num_leaves': 80,
}

# Train for up to 5000 rounds, with a 50-round early-stopping patience,
# logging every 50 rounds.
model_lgb = lgb.train(
    params=params,
    train_set=train,
    num_boost_round=5000,
    valid_sets=[train, test],
    early_stopping_rounds=50,
    verbose_eval=50,
    feature_name=features,
)

In [None]:
# Hold-out AUC of the baseline booster.
roc_auc_score(y_test, model_lgb.predict(x_test))

In [None]:
# Feature importance of the baseline booster.
lgb.plot_importance(model_lgb)

In [None]:
%%time
# SHAP values of the baseline booster over every row of X[features].
explainer = shap.TreeExplainer(model_lgb)
shap_values = explainer.shap_values(X[features])

In [None]:
# Summary plot of the SHAP values per feature.
shap.summary_plot(shap_values, X[features])

### LightGBM Classifier

In [None]:
# Small, shallow configuration for the scikit-learn style classifier.
# NOTE(review): 'lambda' is presumably accepted as an alias of lambda_l2 --
# confirm against the LightGBM parameter list.
params = {
    'num_leaves': 10, 
    'n_estimators': 100, 
    'min_data_in_leaf': 10, 
    'max_depth': 5, 
    'lambda': 0.0, 
    'feature_fraction': 1.0
}

# Fit on the training split only.
model_clf = LGBMClassifier(**params)
model_clf.fit(x_train, y_train)

In [None]:
# 5-fold stratified cross-validation, repeated 3 times; seeded like the rest
# of the notebook so the folds are reproducible.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
# Score on the same feature subset the model was fitted with. The full X also
# contains user_answer and id columns that are not model inputs, so using it
# would not evaluate the model built above.
scores = cross_val_score(model_clf, X[features], y, scoring="accuracy", cv=cv)

In [None]:
# Mean cross-validated accuracy across all folds and repeats.
print(scores.mean())

In [None]:
# Hold-out AUC from the predicted probability of the positive class.
roc_auc_score(y_test, model_clf.predict_proba(x_test)[:, 1])

In [None]:
# Feature importance of the classifier.
lgb.plot_importance(model_clf)

In [None]:
# SHAP values of the classifier over every row of X[features].
# This rebinds `explainer` and `shap_values` from the earlier booster cell.
explainer = shap.TreeExplainer(model_clf)
shap_values = explainer.shap_values(X[features])

In [None]:
# Summary plot of the classifier's SHAP values per feature.
shap.summary_plot(shap_values, X[features])

## Hyperparameter Tuning Using Optuna

In [None]:
import optuna

In [None]:
# def create_model(trial):
#     num_leaves = trial.suggest_int('num_leaves', 10, 100)
#     n_estimators = trial.suggest_int("n_estimators", 100, 3000)
#     min_data_in_leaf = trial.suggest_int("min_data_in_leaf", 5, 100)
#     learning_rate = trial.suggest_uniform("learning_rate", 0.0001, 0.99)
#     bagging_fraction = trial.suggest_uniform("bagging_fraction", 0.0001, 1)
#     feature_fraction = trial.suggest_uniform("feature_fraction", 0.0001, 1)
#     max_depth = trial.suggest_int("max_depth", 5, 20)
    
#     model = LGBMClassifier(num_leaves=num_leaves,
#                            n_estimators=n_estimators,
#                            learning_rate=learning_rate,
#                            bagging_fraction=bagging_fraction,
#                            feature_fraction= feature_fraction,
#                            min_data_in_leaf=min_data_in_leaf,
#                            max_depth=max_depth)
    
#     return model

# def objective(trial):
#     model = create_model(trial)
# #     model = lgb.train(
# #         params, 
# #         train, 
# #         num_boost_round=2500, 
# #         valid_sets=[train, test], 
# #         early_stopping_rounds=20, 
# #         verbose_eval=50,
# #         feature_name = features
# #         )
#     model.fit(x_train, y_train)
#     y_pred = model.predict_proba(x_test)[:, 1]
#     score = roc_auc_score(y_test, y_pred)
#     return score

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=50)

In [None]:
# Uncomment for hyperparameter tuning

# import optuna.integration.lightgbm as lgbm
# def objective(trial):
#     param = {
#         'objective': 'binary',
#         'metric': 'auc',
#         'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
#         'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#         'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
#         'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
#     }
 
#     gbm = lgbm.train(param, train, valid_sets=[train, test] ,early_stopping_rounds=20)
#     preds = gbm.predict(x_test)
#     accuracy = roc_auc_score(y_test, preds)
#     return accuracy
 
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=5)
 
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
# param = study.best_params
# param

### Hyperparameters tuned for LightGBM

In [None]:
# Booster settings: the baseline values plus regularisation, leaf and
# bagging parameters taken from the tuning study.
param_lgb = {
  'objective': 'binary',
  'seed': 42,
  'metric': 'auc',
  'learning_rate': 0.001,
  'max_bin': 1500,
 'lambda_l1': 5.793574585607526,
 'lambda_l2': 0.5835207139734166,
 'num_leaves': 183,
 'feature_fraction': 0.6392575149246273,
 'bagging_fraction': 0.7925551905109522,
 'bagging_freq': 6,
 'min_child_samples': 43
}

# Rebinds model_lgb: the submission loop predicts with this tuned booster.
model_lgb = lgb.train(
        param_lgb, 
        train, 
        num_boost_round=5000, 
        valid_sets=[train, test], 
        early_stopping_rounds=50, 
        verbose_eval=50,
        feature_name = features
        )

In [None]:
# Feature importance of the tuned booster.
lgb.plot_importance(model_lgb)

In [None]:
# Tuned settings for the scikit-learn style LGBMClassifier.
params = dict(
    num_leaves=10,
    n_estimators=437,
    min_data_in_leaf=22,
    learning_rate=0.051498290819129294,
    bagging_fraction=0.1604505638073827,
    feature_fraction=0.8585127657321616,
    max_depth=11,
)

In [None]:
# Fit the tuned classifier on the training split.
# NOTE(review): params sets bagging_fraction without bagging_freq; bagging may
# therefore be inactive -- confirm against the LightGBM parameter docs.
model_lgbm_clf = LGBMClassifier(**params)
model_lgbm_clf.fit(x_train, y_train)

In [None]:
# Hold-out AUC of the tuned classifier.
roc_auc_score(y_test, model_lgbm_clf.predict_proba(x_test)[:, 1])

In [None]:
# Feature importance of the tuned classifier.
lgb.plot_importance(model_lgbm_clf)

## Submission

In [None]:
# Create the competition environment and the iterator that yields test batches.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
# Loop-invariant: computed once instead of once per test batch.
# NOTE(review): train_df's elapsed-time column was already divided by its own
# mean during feature engineering, so this divisor is ~1 and test values stay
# on the raw millisecond scale. Likewise the training timestamp was divided by
# its mean while the test timestamp below is used raw. Fixing both needs the
# pre-normalisation training means, which this cell cannot recover.
train_elapsed_mean = train_df["prior_question_elapsed_time"].mean()

# Aggregate tables learned on the training sample, with their join keys.
aggregate_tables = (
    (grouped_answer, 'content_id'),
    (grouped_user_id, 'user_id'),
    (part_user_id, "user_id"),
    (task_id, "task_container_id"),
)

for (test_df, sample_prediction_df) in iter_test:
    # Keep question rows only and attach question metadata.
    test_df = pd.merge(test_df[test_df['content_type_id'] == 0],
                       questions,
                       how='left',
                       left_on='content_id',
                       right_on='question_id')

    # Fill missing elapsed times with the batch mean. The result is assigned
    # back explicitly: fillna(inplace=True) on a column selection is chained
    # assignment and has no effect under pandas Copy-on-Write.
    elapsed_time = test_df["prior_question_elapsed_time"]
    elapsed_time = elapsed_time.fillna(elapsed_time.mean())
    test_df["prior_question_elapsed_time"] = elapsed_time / train_elapsed_mean

    # Missing flags are filled with False, exactly as in training (the
    # original loop filled them with True, which disagreed with training).
    had_explanation = test_df["prior_question_had_explanation"].fillna(value=False).astype(bool)
    test_df["prior_question_had_explanation"] = had_explanation.map({True: 1, False: 0})

    # Attach the training-time aggregates; unseen keys yield NaN (zeroed below).
    for stats, key in aggregate_tables:
        test_df = test_df.merge(stats, how='left', on=key)

    test_df["user_correctness"] = test_df["Sum_by_user"] / test_df["question_answered"]
    # Same definition as in training: the complement of the correct ratio
    # (the original loop stored a count of wrong answers here instead).
    test_df["user_uncorrectness"] = 1 - test_df["user_correctness"]

    test_df["content_correctness"] = test_df["Sum_by_content"] / test_df["question_asked"]
    test_df["content_uncorrectness"] = 1 - test_df["content_correctness"]

    test_df["task_correctness"] = test_df["Sum_by_task"] / test_df["count_by_task"]
    test_df["task_uncorrectness"] = 1 - test_df["task_correctness"]

    # Lag features: timestamp found k rows earlier within the batch.
    for lag in range(1, 7):
        test_df[f"lag_{lag}"] = test_df["timestamp"].shift(lag)

    # Sliding-window features over the batch.
    test_df["rolling_mean"] = test_df["timestamp"].rolling(window=6).mean()
    test_df["expanding_mean"] = test_df["timestamp"].expanding(2).mean()

    test_df.fillna(0, inplace=True)

    # Predict with the most recently trained booster and submit the batch.
    test_df['answered_correctly'] = model_lgb.predict(test_df[features])
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])