In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
style.use('fivethirtyeight')
import seaborn as sns
import os
from matplotlib.ticker import FuncFormatter

import os
# Print the full path of every file found under the competition input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input/riiid-test-answer-prediction')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
%%time

# Load the pre-pickled training set and report its (rows, columns) shape.
train_pickle_path = "../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip"
train = pd.read_pickle(train_pickle_path)

print("Train size:", train.shape)

In [None]:
# Convert the column to pandas' nullable 'boolean' dtype, overwriting the
# existing column on `train`.
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].astype('boolean')

# Optional memory check, left disabled:
# train.memory_usage(deep=True)

In [None]:
%%time

# Read the auxiliary competition tables; all four live in the same input directory.
competition_dir = '/kaggle/input/riiid-test-answer-prediction'

questions = pd.read_csv(f'{competition_dir}/questions.csv')
lectures = pd.read_csv(f'{competition_dir}/lectures.csv')
example_test = pd.read_csv(f'{competition_dir}/example_test.csv')
example_sample_submission = pd.read_csv(f'{competition_dir}/example_sample_submission.csv')

In [None]:
# Preview the first rows of the example test set loaded from example_test.csv.
example_test.head()

# Sampling the data

In [None]:
# Pick a random set of users whose combined number of interactions just reaches
# SAMPLE_SIZE.
#
# Users are shuffled once (seeded, so the notebook is reproducible) and taken
# in that order until the running total of interactions first reaches
# SAMPLE_SIZE. Drawing without replacement guarantees that no user is counted
# twice, so `n_sampled` equals the number of rows that the sampled users
# really contribute.
SAMPLE_SIZE = 100000
SAMPLE_SEED = 42

user_interactions = train.user_id.value_counts()

# Random permutation of the users, keeping each user's interaction count.
shuffled_interactions = user_interactions.sample(frac=1, random_state=SAMPLE_SEED)
running_total = shuffled_interactions.cumsum()

# Users needed = those still below the target, plus the one that crosses it.
# Slicing caps this at the number of available users if the target is never reached.
n_users_needed = int((running_total < SAMPLE_SIZE).sum()) + 1
sampled_interactions = shuffled_interactions.iloc[:n_users_needed]

sampled_users = sampled_interactions.index.tolist()
n_sampled = int(sampled_interactions.sum())
print(sampled_users[:10], '\n', n_sampled)


In [None]:
# Keep only the rows that belong to the sampled users, then show the new shape.
belongs_to_sample = train['user_id'].isin(sampled_users)
train = train[belongs_to_sample]
train.shape

## Cleaning the data

Remove questions whose correct-answer ratio is very low or very high.

In [None]:
# Per-question share of correct answers.
#
# Rows with answered_correctly == -1 are not graded answers (the notebook drops
# them further down) and are still present at this point. They must be left
# out here: otherwise the unstacked table gets a leading -1 column, a
# positional `iloc[:, 1]` would pick the *incorrect* counts, and the -1 rows
# would also inflate the denominator.
graded_answers = train.loc[train.answered_correctly != -1]

# Rows: content_id; columns: 0 (incorrect) and 1 (correct); values: counts.
# Outcomes that never occur for a question are stored as 0 rather than NaN,
# and both columns are guaranteed to exist.
answered_questions = (
    graded_answers
    .groupby(['content_id', 'answered_correctly'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)

# Select the "correct" column by label so column order cannot affect the result.
correct_ratio = answered_questions[1].divide(answered_questions.sum(axis=1))
correct_ratio.plot.hist(bins=100)

In [None]:
# Questions are kept only if their correct-answer ratio lies between the two
# thresholds (both bounds inclusive); everything else is dropped from `train`.
problematic_question_th, easy_question_th = 0.5, 0.95

ratio_in_range = (correct_ratio >= problematic_question_th) & (correct_ratio <= easy_question_th)
normal_questions = correct_ratio[ratio_in_range].index

is_normal_question = train['content_id'].isin(normal_questions)
train = train[is_normal_question]
train.shape

## Aggregated history

In [None]:
# Narrow `train` down to the columns used for the per-user history.
cols = [
    'user_id',
    'answered_correctly',
    'prior_question_had_explanation',
]
train = train[cols]
train.head()

In [None]:
# Discard rows whose answered_correctly is -1, then show the remaining shape.
has_graded_answer = train['answered_correctly'].ne(-1)
train = train.loc[has_graded_answer]
train.shape

# 2. Baseline model

In [None]:
# #this clears everything loaded in RAM, including the libraries
# %reset -f

In [None]:
import numpy as np
import pandas as pd
import riiideducation
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
style.use('fivethirtyeight')
import seaborn as sns
import os
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import gc
import sys
# Lift pandas' row limit for displayed objects (None = no truncation).
# NOTE(review): with this set, displaying a large frame prints every row;
# prefer .head()/.sample() on big objects.
pd.set_option('display.max_rows', None)

In [None]:
# %%time
# cols_to_load = ['row_id', 'user_id', 'answered_correctly', 'content_id', 'prior_question_had_explanation', 'prior_question_elapsed_time']
# train = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")[cols_to_load]
# train['prior_question_had_explanation'] = train['prior_question_had_explanation'].astype('boolean')

# print("Train size:", train.shape)

In [None]:
# %%time

# questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
# lectures = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
# example_test = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_test.csv')
# example_sample_submission = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv')

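`current_user_data`: per-user number of answered questions (`n_questions`) and fraction answered correctly (`ratio_q`).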

In [None]:
# Per-user history used as the baseline predictor:
#   n_questions - number of rows recorded for the user
#   ratio_q     - fraction of those rows answered correctly
#
# Built from `train`, the cleaned frame produced by the cells above. The
# previously referenced `train_train` is not defined anywhere in the visible
# notebook, so the cell failed on a fresh kernel.
# NOTE(review): if a separate training split was intended, define it
# explicitly before this cell and use it here.
answers_by_user = train.groupby('user_id')['answered_correctly']
total_q = answers_by_user.size()
n_correct = answers_by_user.sum()
ratio_q = n_correct.divide(total_q)
current_user_data = pd.DataFrame({'n_questions': total_q, 'ratio_q': ratio_q})
current_user_data.head()

# Submission

In [None]:
# Create the competition environment object; it supplies the test iterator and
# receives the predictions via env.predict() further down.
env = riiideducation.make_env()

In [None]:
# Iterator over the test batches; the submission loop below unpacks each item
# as (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
# Prediction used for users with no recorded history.
DEFAULT_PREDICTION = 0.5


def _parse_prior_answers(raw):
    """Turn the previous batch's answer string into a list of ints.

    Accepts both a list-style string ("[1, 0, 1]") and a plain
    whitespace-separated one ("1 0 1"). Returns an empty list when `raw` is
    not a string (e.g. NaN) or holds no values.
    """
    if not isinstance(raw, str):
        return []
    tokens = raw.strip().strip('[]').replace(',', ' ').split()
    return [int(token) for token in tokens]


def _fold_in_answers(user_stats, prev_batch, answers):
    """Return `user_stats` updated with the graded answers of `prev_batch`.

    `answers` is matched to the rows of `prev_batch` by position. Only question
    rows (content_type_id == 0) with a 0/1 outcome are counted. If the lengths
    do not match, the answers cannot be aligned and `user_stats` is returned
    unchanged.
    """
    if len(answers) != len(prev_batch):
        return user_stats

    # Positional arrays on purpose: the batch index is not assumed to be unique.
    graded = pd.DataFrame({
        'user_id': prev_batch['user_id'].to_numpy(),
        'is_question': (prev_batch['content_type_id'] == 0).to_numpy(),
        'answered_correctly': answers,
    })
    graded = graded[graded['is_question'] & graded['answered_correctly'].isin([0, 1])]
    if graded.empty:
        return user_stats

    batch_stats = graded.groupby('user_id')['answered_correctly'].agg(['count', 'sum'])

    # Previous totals for the users in this batch (zero for unseen users).
    known = user_stats.reindex(batch_stats.index)
    questions_before = known['n_questions'].fillna(0)
    correct_before = (known['ratio_q'] * known['n_questions']).fillna(0)

    questions_now = questions_before + batch_stats['count']
    refreshed = pd.DataFrame({
        'n_questions': questions_now,
        'ratio_q': (correct_before + batch_stats['sum']) / questions_now,
    })
    # Refreshed rows take precedence; all other users keep their old stats.
    return refreshed.combine_first(user_stats)


prev_test_df = None
for test_df, sample_prediction_df in iter_test:
    # Updating knowledge: the answers to the previous batch are read from the
    # first row of the current one, so fold them in before predicting.
    raw_prior_answers = (
        test_df['prior_group_answers_correct'].iloc[0] if len(test_df) else None
    )
    prior_answers = _parse_prior_answers(raw_prior_answers)
    if prev_test_df is not None and prior_answers:
        current_user_data = _fold_in_answers(current_user_data, prev_test_df, prior_answers)

    # Making predictions: each user's historical correct ratio, or the default
    # for unknown users. A single vectorised lookup replaces the row-by-row
    # loop and avoids label-based writes, which hit several rows at once if
    # the batch index contains duplicates.
    test_df['answered_correctly'] = (
        test_df['user_id']
        .map(current_user_data['ratio_q'])
        .fillna(DEFAULT_PREDICTION)
    )
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

    # Remember this batch so its answers can be matched up on the next iteration.
    prev_test_df = test_df.copy()