In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
from typing import List, Dict, Optional
import numpy as np
from sklearn.model_selection import RepeatedKFold
import pandas as pd
from sklearn.model_selection import train_test_split
import math
import time
import random
import lightgbm as lgb
import gc
import os
from collections import defaultdict
import datatable as dt
from sklearn.preprocessing import LabelEncoder
from numba import jit
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
import riiideducation

# Tell NumPy to stay silent on divide-by-zero and invalid floating-point operations.
_ = np.seterr(invalid='ignore', divide='ignore')

import os
# Print the full path of every file found under the competition input directory.
for dirname, _, filenames in os.walk('/kaggle/input/riiid-test-answer-prediction'):
    for full_path in (os.path.join(dirname, fname) for fname in filenames):
        print(full_path)

train.csv

1. **row_id**: (int64) ID code for the row.

2. **timestamp**: (int64) the time in milliseconds between this user interaction and the first event completion from that user.

    **user_id**: (int32) ID code for the user.

3. **content_id**: (int16) ID code for the user interaction

4. **content_type_id**: (int8) 0 if the event was a question being posed to the user, 1 if the event was 
    the user watching a lecture.

5. **task_container_id**: (int16) Id code for the batch of questions or lectures. For example, a user might 
    see three questions in a row before seeing the explanations for any of them. Those three would
    all share a task_container_id. Monotonically increasing for each user.

6. **user_answer**: (int8) the user's answer to the question, if any. Read -1 as null, for lectures.

7. **answered_correctly**: (int8) if the user responded correctly. Read -1 as null, for lectures.

8. **prior_question_elapsed_time**: (float32) How long it took a user to answer their previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Note that the time is the total time a user took to 
   solve all the questions in the previous bundle.

9. **prior_question_had_explanation**: (bool) Whether or not the user saw an explanation and the correct 
    response(s) after answering the previous question bundle, ignoring any lectures in between. 
    The value is shared across a single question bundle, and is null for a user's first question 
    bundle or lecture. Typically the first several questions a user sees were part of an onboarding
    diagnostic test where they did not get any feedback.


In [None]:
# Columns of train.csv that the notebook works with, paired with their intended dtypes.
# 'user_answer' (int8) is not part of the selection.
data_types_dict = dict([
    ('timestamp', 'int64'),
    ('user_id', 'int32'),
    ('content_id', 'int16'),
    ('content_type_id', 'int8'),
    ('task_container_id', 'int16'),
    ('answered_correctly', 'int8'),
    ('prior_question_elapsed_time', 'float32'),
    ('prior_question_had_explanation', 'bool'),
])

# Name of the label column.
target = 'answered_correctly'

In [None]:
# Read train.csv with datatable and convert the result to a pandas DataFrame.
# Only the keys of data_types_dict are used here (as the column selection);
# the dtype strings stored in the dict are not passed to fread.
train_df = dt.fread(
    '../input/riiid-test-answer-prediction/train.csv',
    columns=set(data_types_dict),
).to_pandas()


In [None]:
# Report the (rows, columns) shape of the loaded training frame.
print('Train size: ',train_df.shape)

In [None]:
# Per-column memory footprint in bytes; deep=True also measures the contents of object columns.
train_df.memory_usage(deep=True)

In [None]:
# Summary of the training frame: column names, dtypes and memory usage.
train_df.info()

In [None]:
# Convert prior_question_had_explanation to pandas' nullable 'boolean' dtype
# (the column is expected to arrive as object dtype), which keeps missing values as <NA>.
train_df['prior_question_had_explanation']=train_df['prior_question_had_explanation'].astype('boolean')

# Re-check the per-column memory footprint after the dtype change.
train_df.memory_usage(deep=True)

In [None]:
%%time

# Load the four auxiliary competition files in one pass; the order of the paths
# matches the order of the names on the left-hand side.
questions, lectures, example_test, example_sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/riiid-test-answer-prediction/questions.csv',
        '/kaggle/input/riiid-test-answer-prediction/lectures.csv',
        '/kaggle/input/riiid-test-answer-prediction/example_test.csv',
        '/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv',
    )
)

In [None]:
# Preview the first 10 rows of the training frame.
train_df.head(10)

In [None]:
# Number of distinct user_id values in the training data.

train_df['user_id'].nunique()

In [None]:
# Number of distinct content_id values (all rows, whatever their content_type_id).
train_df['content_id'].nunique()

In [None]:
# Compare the number of distinct content ids overall with the number seen on
# question rows (content_type_id equal to 0 / False).
n_content_ids = train_df['content_id'].nunique()
n_question_ids = train_df.loc[train_df['content_type_id'] == False, 'content_id'].nunique()
print(f"We have {n_content_ids} content ids of which {n_question_ids} are questions ")

In [None]:
# Number of distinct task_container_id values.
train_df['task_container_id'].nunique()

In [None]:
# Row counts per answered_correctly value (the data description above reads -1 as null, for lectures).
train_df['answered_correctly'].value_counts()

Timestamp
`timestamp` is important because it is the time between a user interaction and the first event from
that user, so it is measured relative to each user: the starting time can be different for every user.

In [None]:
# Histogram of the timestamp column over all rows, using 40 bins.
# Axis labels and a title are set so the figure can be read on its own.
plt.hist(train_df['timestamp'], bins=40)
plt.xlabel('timestamp')
plt.ylabel('number of rows')
plt.title('Distribution of timestamp');

In [None]:
# Preview the first rows of the questions metadata loaded from questions.csv.
questions.head()

In [None]:
# Preview the first rows of the lectures metadata loaded from lectures.csv.
lectures.head()

In [None]:
# Largest timestamp reached by each user, sorted descending; shows the top five users.
train_df.groupby(['user_id'])['timestamp'].max().sort_values(ascending=False).head()

Feature engineering

In [None]:
# Keep only rows with a real label: answered_correctly == -1 is the null marker
# used for lectures (see the data description). Statements are kept separate so
# each intermediate frame can be released as soon as train_df is rebound.
train_df = train_df[train_df['answered_correctly'] != -1].reset_index(drop=True)

# These two columns are not used by any later step.
train_df = train_df.drop(columns=['timestamp', 'content_type_id'])

# Missing "had explanation" flags are treated as False, then stored as plain bool.
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation'].fillna(value=False).astype(bool)
)

In [None]:
# Per-user statistics over every row currently in train_df:
# share of correct answers and number of answered questions.
user_answers_df = (
    train_df.groupby('user_id')['answered_correctly']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'mean_user_accuracy', 'count': 'questions_answered'})
)

# The same two statistics per content_id (i.e. per question).
content_answers_df = (
    train_df.groupby('content_id')['answered_correctly']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'mean_accuracy', 'count': 'question_asked'})
)

In [None]:
# Keep only the rows from position 90,000,000 onward (all columns) for model training.
# Not idempotent: re-running this cell slices the already-sliced frame again.
train_df = train_df.iloc[90_000_000:]

In [None]:
# Left-join the per-user aggregates first, then the per-question aggregates,
# each on its own key column.
for stats_df, join_key in ((user_answers_df, 'user_id'), (content_answers_df, 'content_id')):
    train_df = train_df.merge(stats_df, how='left', on=join_key)

In [None]:
# Replace every remaining missing value, in any column, with 0.5.
train_df = train_df.fillna(value=0.5)

In [None]:
# Preview the first 10 rows of train_df after the preceding transformations.
train_df.head(10)

In [None]:
# Fit the encoder on the training values, then replace the column with its integer codes.
# `le` is kept as a global because the inference loop uses it again.
le = LabelEncoder()
le.fit(train_df["prior_question_had_explanation"])
train_df["prior_question_had_explanation"] = le.transform(train_df["prior_question_had_explanation"])

In [None]:
# Order the rows by user_id so each user's rows sit together.
train_df = train_df.sort_values(by='user_id')

In [None]:
# Label vector.
y = train_df['answered_correctly']

# Feature columns extracted from train_df to build X.
columns = [
    'mean_user_accuracy',
    'questions_answered',
    'mean_accuracy',
    'question_asked',
    'prior_question_had_explanation',
]

In [None]:
# Feature matrix: all rows of train_df, restricted to the selected feature columns.
X = train_df.loc[:, columns]

In [None]:
# Drop the reference to the training frame; the cells that follow work from X and y.
del train_df

In [None]:
# Accumulators filled by the training loop: one validation score and one fitted
# model per fold, plus a long-format table of per-fold feature importances.
scores, models = [], []
feature_importance = pd.DataFrame()

In [None]:
# Keyword arguments forwarded to lgb.LGBMClassifier in the training loop.
params = dict([
    ('num_leaves', 32),
    ('max_bin', 300),
    ('objective', 'binary'),
    ('max_depth', 13),
    ('learning_rate', 0.03),
    ('boosting_type', 'gbdt'),
    ('metric', 'auc'),
])

In [None]:
# Features actually fed to the model. This overrides the earlier, longer list:
# 'prior_question_had_explanation', 'mean_diff1' and 'mean_diff2' are disabled.
columns = [
    'mean_user_accuracy',
    'questions_answered',
    'mean_accuracy',
    'question_asked',
]

In [None]:
# Stratified 5-fold split without shuffling. The `break` at the bottom stops the
# loop after the first fold, so exactly one model is trained and one score recorded.
folds = StratifiedKFold(n_splits=5, shuffle=False)
for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print(f'Fold {fold_n} started at {time.ctime()}')

    # Restrict to the active feature list before taking the fold's rows.
    X_train, y_train = X[columns].iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X[columns].iloc[valid_index], y.iloc[valid_index]

    model = lgb.LGBMClassifier(**params, n_estimators=700, n_jobs=1)
    # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keywords depend on the
    # installed LightGBM version (newer releases expect callbacks instead) — confirm.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric='auc',
        verbose=1000,
        early_stopping_rounds=10,
    )
    # 'valid_1' is the second eval_set entry, i.e. the held-out part of the fold.
    score = max(model.evals_result_['valid_1']['auc'])

    scores.append(score)
    models.append(model)

    # One row per feature for this fold, appended to the running importance table.
    fold_importance = pd.DataFrame({
        "feature": columns,
        "importance": model.feature_importances_,
        "fold": fold_n + 1,
    })
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    break

In [None]:
# Mean and standard deviation of the recorded validation scores.
cv_mean, cv_std = np.mean(scores), np.std(scores)
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(cv_mean, cv_std))

In [None]:
# Dividing by 1 leaves the values unchanged (true division, so the result is float).
feature_importance["importance"] = feature_importance["importance"] / 1

# Rank features by their mean importance and keep at most the top 50 names.
mean_importance = feature_importance[["feature", "importance"]].groupby("feature").mean()
cols = mean_importance.sort_values(by="importance", ascending=False).head(50).index

best_features = feature_importance.loc[feature_importance["feature"].isin(cols)]

# Horizontal bar chart of the selected features, most important first.
plt.figure(figsize=(16, 12));
sns.barplot(
    x="importance",
    y="feature",
    data=best_features.sort_values(by="importance", ascending=False),
);
plt.title('LGB Features (avg over folds)');

In [None]:
# Drop the references to the training features and labels before the inference cells.
del X,y

In [None]:
# Create the competition environment object from the riiideducation module.
# NOTE(review): presumably this may only be created once per kernel session — confirm
# against the competition API documentation.
env = riiideducation.make_env()

In [None]:
# Iterator over the test batches; the inference loop unpacks each item as
# (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
# Inference loop: for every batch served by the competition iterator, rebuild the
# training-time features, average the fitted models' probabilities and submit them.
for (test_df, sample_prediction_df) in iter_test:
    # Only question rows (content_type_id == 0) are predicted; drop lectures first so
    # the merges below do not process rows that would be discarded anyway.
    test_df = test_df.loc[test_df['content_type_id'] == 0].reset_index(drop=True)

    # Attach the per-user and per-question aggregates computed on the training data.
    test_df = test_df.merge(user_answers_df, how='left', on='user_id')
    test_df = test_df.merge(content_answers_df, how='left', on='content_id')

    # Same preprocessing as training: missing flag -> False, stored as plain bool.
    test_df['prior_question_had_explanation'] = (
        test_df['prior_question_had_explanation'].fillna(value=False).astype(bool)
    )
    # Users / questions absent from the aggregates (and any other missing value) get 0.5,
    # mirroring the fill applied to the training frame.
    test_df = test_df.fillna(value=0.5)

    # Reuse the encoder fitted on the training data instead of re-fitting it on each
    # batch: a re-fit makes the codes depend on which values happen to be in the batch.
    # The result overwrites the column, exactly as was done for the training frame.
    test_df['prior_question_had_explanation'] = le.transform(
        test_df['prior_question_had_explanation']
    )

    # Probability of the positive class from every fitted model, at its best iteration.
    y_preds = [
        model.predict_proba(test_df[columns], num_iteration=model.best_iteration_)[:, 1]
        for model in models
    ]

    # Element-wise average across models.
    test_df['answered_correctly'] = np.mean(y_preds, axis=0)

    # Lectures were already removed above, so every remaining row is submitted.
    env.predict(test_df[['row_id', 'answered_correctly']])
