In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl

# Download from internet
# !pip install datatable==0.11.0 > /dev/null

In [None]:
import riiideducation
import gc
import numpy as np
import pandas as pd
from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt

# Read Train

In [None]:
%%time
# Load train.csv with datatable (reading it with pandas would be too slow)
# and convert the result to a pandas DataFrame.
train_csv = '../input/riiid-test-answer-prediction/train.csv'
train_df = dt.fread(train_csv).to_pandas()
train_df.shape

In [None]:
%%time
# Find the max value of each column to determine data types
train_df.max()

In [None]:
train_df.info()

In [None]:
train_df.memory_usage(deep=True)

In [None]:
%%time
# Reduce memory use by casting every column to the smaller dtype chosen for it.
train_data_types = dict(
    row_id='int32',
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    task_container_id='int16',
    user_answer='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='bool',
)
# Cast one column at a time, overwriting it in train_df.
for col_name in train_data_types:
    train_df[col_name] = train_df[col_name].astype(train_data_types[col_name])
# Display the per-column memory usage after the casts.
train_df.memory_usage(deep=True)

In [None]:
train_df.info()

# Explore Train

In [None]:
train_df.head()

1. Relationship between timestamp and answered_correctly

In [None]:
# Histogram of the timestamp column after rescaling it by the per-day factor
# derived from ms_per_year.
ms_per_year = 1000 * 60 * 60 * 24 * 365
ms_per_day = ms_per_year / 365
ts = train_df['timestamp'] / ms_per_day

fig = plt.figure(figsize=(12, 6))
hist_ax = ts.plot.hist(bins=100)
hist_ax.set_title("Histogram of timestamp")
hist_ax.set_xlabel("Days between this user interaction and the first event completion from that user")
plt.xticks(rotation=0)
plt.show()
# Release the temporary series.
del ts

In [None]:
def correct(field, df=None):
    """Return the share of correctly answered questions per value of ``field``.

    Rows with ``answered_correctly == -1`` are excluded before counting.

    Args:
        field: Name of the column to group by.
        df: Frame to analyse; it must contain ``field`` and
            ``answered_correctly``. Defaults to the notebook-level
            ``train_df`` when omitted.

    Returns:
        A Series named ``Percent_correct``, indexed by the values of
        ``field``, rounded to two decimals and sorted in descending order.
    """
    source = train_df if df is None else df
    answered = source[source.answered_correctly != -1]
    counts = answered.groupby([field, 'answered_correctly'], as_index=False).size()
    counts = counts.pivot(index=field, columns='answered_correctly', values='size')
    # Make sure both outcome columns exist and treat a missing combination as
    # zero answers; otherwise a group with only right (or only wrong) answers
    # would end up with NaN instead of 1.0 (or 0.0).
    counts = counts.reindex(columns=[0, 1]).fillna(0)
    percent = (counts[1] / (counts[0] + counts[1])).round(2)
    return percent.rename('Percent_correct').sort_values(ascending=False)

In [None]:
# Split timestamp into six equal-frequency groups and plot the share of
# correct answers in each group.
group_labels_6 = [f'Group_{k}' for k in range(1, 7)]
train_df['timestamp_group'] = pd.qcut(train_df['timestamp'], q=6, labels=group_labels_6)

# Order the bars by group label rather than by percentage.
ts_correct = correct("timestamp_group").sort_index()

fig = plt.figure(figsize=(12, 6))
bar_ax = ts_correct.plot.bar()
bar_ax.set_title("Percentage of answered_correctly for 6 groups of timestamp")
plt.xticks(rotation=0)
plt.show()
del ts_correct

**Users in Group_1 have the lowest percentage of correct answers. The differences in performance among the other groups are small.**

In [None]:
# Flag the rows that fall into the lowest timestamp group ("Group_1"), then
# remove the helper column that is no longer needed.
train_df['new_users'] = (train_df['timestamp_group'] == 'Group_1').to_numpy()
del train_df['timestamp_group']

2. Relationship between the number of questions answered per user and answered_correctly

In [None]:
# Per user: mean correctness and number of answers, ignoring rows marked -1.
user_percent = (
    train_df[train_df['answered_correctly'].ne(-1)]
    .groupby('user_id')['answered_correctly']
    .agg(Mean='mean', Answers='count')
)

In [None]:
# Keep users with at most 2000 answers and draw a reproducible sample of 1000.
user_percent = user_percent.query('Answers <= 2000').sample(n=1000, random_state=1)

n_answers = user_percent.Answers
pct_correct = user_percent.Mean

fig = plt.figure(figsize=(12, 6))
plt.scatter(n_answers, pct_correct, marker='o')
plt.title("Percent answered correctly versus number of questions answered")
plt.xlabel("Number of questions answered")
plt.ylabel("Percent answered correctly")
plt.xticks(rotation=0)

# Overlay a first-degree polynomial (straight line) fitted to the sample.
trend = np.poly1d(np.polyfit(n_answers, pct_correct, 1))
plt.plot(n_answers, trend(n_answers), "r--")

plt.show()
del user_percent

**Increasing the number of questions answered only slightly increases the percentage of correct answers.**

3. Relationship between prior_question_elapsed_time and answered_correctly

In [None]:
# Treat a missing elapsed time as 0, split the column into five
# equal-frequency bins and plot the share of correct answers per bin.
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].fillna(0)
elapse_labels_5 = [f'Bin_{k}' for k in range(1, 6)]
train_df['elapse_bin'] = pd.qcut(train_df['prior_question_elapsed_time'], q=5, labels=elapse_labels_5)

# Order the bars by bin label rather than by percentage.
elapse_correct = correct("elapse_bin").sort_index()

fig = plt.figure(figsize=(12, 6))
bar_ax = elapse_correct.plot.bar()
bar_ax.set_title("Percent answered_correctly for 5 bins of prior_question_elapsed_time")
plt.xticks(rotation=0)
plt.show()
del elapse_correct

**'prior_question_elapsed_time' does not have a strong correlation with 'answered_correctly'.**

In [None]:
# Remove the temporary binning column now that the plot has been drawn.
del train_df["elapse_bin"]

4. Relationship between prior_question_had_explanation and answered_correctly

In [None]:
# Mean correctness of the answered rows (rows marked -1 excluded), split by
# whether the prior question had an explanation.
pq = train_df[train_df.answered_correctly != -1].groupby(['prior_question_had_explanation']).agg({'answered_correctly': ['mean']})

# DataFrame.plot opens a figure of its own unless it is handed an Axes, which
# would leave a separately created figure empty and its size unused. Create
# the Axes explicitly and draw on it.
fig, ax = plt.subplots(figsize=(12, 10))
pq.plot.bar(legend=None, ax=ax)
ax.set_title("Answered_correctly versus Prior_question_had_explanation")
ax.set_xlabel("Prior question had explanation")
ax.set_ylabel("Percent answered correctly")
plt.xticks(rotation=0)
plt.show()
del pq

**Users answer correctly more often when the prior question had an explanation.**

In [None]:
print(f"There are {train_df.user_id.nunique()} unique users in Train.")

In [None]:
gc.collect()

# Read Lectures

In [None]:
# Load the lectures metadata and summarise its columns, dtypes and non-null counts.
lectures = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')
lectures.info()

In [None]:
lectures.head(10)

# Explore Lectures

In [None]:
# Number of lectures for each value of the type_of column.
lect_type_of = lectures['type_of'].value_counts()

fig = plt.figure(figsize=(12, 6))
type_ax = lect_type_of.plot.barh()
type_ax.set_title("Counts of different types of lectures")
type_ax.set_xlabel("Count of lectures")
plt.xticks(rotation=0)
plt.show()

1. Relationship between whether a user watched lectures and answered_correctly

In [None]:
# Count rows per user for each answered_correctly value; one column per value.
user_lect = (
    train_df
    .groupby(["user_id", "answered_correctly"])
    .size()
    .unstack()
)
# The value columns [-1, 0, 1] are relabelled ['Lecture', 'Wrong', 'Right'].
user_lect.columns = ['Lecture', 'Wrong', 'Right']
# A user without any -1 row has no lecture count; record that as 0.
user_lect['Lecture'] = user_lect['Lecture'].fillna(0)

# Nullable integers keep the counts integral even where a value is missing.
user_lect = user_lect.astype('Int64')
# True for users with at least one lecture row.
user_lect['Watched_lecture'] = user_lect['Lecture'].gt(0).astype(bool)

In [None]:
# Total wrong and right answers for users who did / did not watch a lecture.
watched_l = user_lect.groupby("Watched_lecture")[['Wrong', 'Right']].sum()

# value_counts() orders its result by frequency, not by label, so look each
# count up by key; positional unpacking could swap the two numbers.
watch_counts = user_lect.Watched_lecture.value_counts()
t = watch_counts.get(True, 0)
f = watch_counts.get(False, 0)
print(f"Watched lecture(s): \t{t}\nNot watched lecture(s): {f}")

# Share of correct answers within each group.
watched_l = (watched_l['Right'] / (watched_l['Right'] + watched_l['Wrong'])).rename('Percent_correct')

fig = plt.figure(figsize=(8,6))
watched_l.plot.bar()
plt.title("User Watched Lectures Versus Percent of Correctness")
plt.xlabel("User watched at least one lecture")
plt.ylabel("Percent of correctness")
plt.xticks(rotation=0)
plt.show()
del watched_l

**Users who watched at least one lecture answer questions correctly more often.**

In [None]:
# Turn the user_id index back into a regular column so it can be used as a merge key.
user_lect = user_lect.reset_index()

In [None]:
# Attach the per-user Watched_lecture flag to every row of train_df.
# The join key is given once; listing 'user_id' twice adds nothing to the join.
train_df = pd.merge(train_df, user_lect[['user_id', 'Watched_lecture']], how='left', on='user_id')
del user_lect

# Read Questions

In [None]:
# Load the questions metadata and summarise its columns, dtypes and non-null counts.
questions = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
questions.info()

In [None]:
questions.head(10)

# Explore Questions

In [None]:
# Show the questions whose tags value is missing.
questions[questions.tags.isna()]

Counting tags

In [None]:
# Cast tags to str (a missing value becomes the string "nan"), then collect
# every whitespace-separated tag from the rows that actually have tags.
questions['tags'] = questions['tags'].astype(str)

tagged = questions.loc[questions.tags != "nan", 'tags']
tags = [tag for tag_string in tagged.values for tag in tag_string.split()]
print(f'There are {len(set(tags))} different tags')

In [None]:
# Replace each whitespace-separated tags string with a list of tag tokens.
tags_list = [x.split() for x in questions.tags.values]
questions['tags'] = tags_list

# Count wrong / right answers per content_id (rows marked -1 excluded).
# The result is kept under its own name: binding it to `correct` and deleting
# it afterwards would remove the helper function `correct` from the namespace.
content_correct = train_df[train_df.answered_correctly != -1].groupby(["content_id", 'answered_correctly'], as_index=False).size()
content_correct = content_correct.pivot(index="content_id", columns='answered_correctly', values='size')
content_correct.columns = ['Wrong', 'Right']
# A content_id that has only one of the two outcomes gets 0 for the other.
content_correct = content_correct.fillna(0)
content_correct[['Wrong', 'Right']] = content_correct[['Wrong', 'Right']].astype(int)
questions = questions.merge(content_correct, left_on="question_id", right_on="content_id", how="left")
del content_correct

# Last expression of the cell, so the enriched frame is actually displayed.
questions.head()

------------------

In [None]:
train_df.head()

In [None]:
# Drop the columns that are not used from here on, then reclaim the memory.
unused_columns = [
    'timestamp',
    'content_type_id',
    'task_container_id',
    'user_answer',
    'prior_question_elapsed_time',
]
train_df = train_df.drop(columns=unused_columns)
gc.collect()

In [None]:
%%time
# Load a precomputed train/validation split (one of the validation sets composed by tito).
# NOTE(review): unpickling can execute arbitrary code -- only load these files from a trusted dataset.
cv_train = pd.read_pickle("../input/cv-index-for-riiid/train_index.pkl")
cv_valid = pd.read_pickle("../input/cv-index-for-riiid/valid_index.pkl")

In [None]:
cv_train

In [None]:
%%time
# Split the rows into validation and training sets by row_id membership in the
# loaded index objects; row_id is dropped once it has served as the key.
validation_df = train_df.loc[train_df['row_id'].isin(cv_valid)].drop(columns="row_id")
train_df = train_df.loc[train_df['row_id'].isin(cv_train)].drop(columns="row_id")

# The index objects are no longer needed.
del cv_train, cv_valid
gc.collect()

In [None]:
train_df.head()

In [None]:
def merge_fill_na(df):
    """Fill missing values in the columns the model consumes.

    Questions and lectures are not merged into the frame at present, so the
    only step is replacing missing ``prior_question_had_explanation`` values
    with ``True``.

    Args:
        df: DataFrame containing a ``prior_question_had_explanation`` column.

    Returns:
        The same DataFrame object, with that column filled.
    """
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on the selected column: the in-place form operates on an intermediate
    # Series and is not guaranteed to write through to df.
    df['prior_question_had_explanation'] = df['prior_question_had_explanation'].fillna(True)
    return df

In [None]:
%%time
# Load the questions and lectures metadata plus the two example test files.
riiid_dir = '/kaggle/input/riiid-test-answer-prediction'
questions = pd.read_csv(f'{riiid_dir}/questions.csv')
lectures = pd.read_csv(f'{riiid_dir}/lectures.csv')
example_test = pd.read_csv(f'{riiid_dir}/example_test.csv')
example_sample_submission = pd.read_csv(f'{riiid_dir}/example_sample_submission.csv')

In [None]:
%%time
# Fill missing values in the training frame; the same call for validation_df
# is currently disabled.
train_df = merge_fill_na(train_df)
# validation_df = merge_fill_na(validation_df)

In [None]:
train_df

In [None]:
train_df.dtypes

In [None]:
%%time
# Build the final train / validation feature matrices and label vectors.
features = ['content_id', 'prior_question_had_explanation']

# answered_correctly == -1 marks rows that are not answered questions (the
# exploration cells exclude them the same way). They carry no 0/1 label, so
# they are removed before anything reaches the binary objective.
train_df = train_df[train_df['answered_correctly'] != -1]
validation_df = validation_df[validation_df['answered_correctly'] != -1]

# Reproducible subsample of the training rows, capped at the rows available so
# the cell also works on a reduced frame.
train_df = train_df.sample(n=min(10000000, len(train_df)), random_state = 1)
y_train = train_df['answered_correctly']
train = train_df[features]

y_val = validation_df['answered_correctly']
validation = validation_df[features]

In [None]:
# LightGBM training parameters.
params = dict(
    objective='binary',
    metric='auc',
    seed=2020,
    learning_rate=0.1,      # default
    boosting_type='gbdt',   # default
)

In [None]:
# Wrap the train and validation matrices in LightGBM datasets;
# prior_question_had_explanation is declared as a categorical feature.
lgb_train = lgb.Dataset(
    train,
    label=y_train,
    categorical_feature=['prior_question_had_explanation'],
)
lgb_eval = lgb.Dataset(
    validation,
    label=y_val,
    categorical_feature=['prior_question_had_explanation'],
)
# Drop the notebook-level references to the source frames and collect garbage.
del train, y_train, validation, y_val
gc.collect()

In [None]:
%%time
# Train the booster for up to 10000 rounds, evaluating on both the training
# and the validation dataset, logging every 50 rounds and using an
# early-stopping patience of 8 rounds.
# NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments are
# specific to older LightGBM releases -- confirm against the installed version
# before upgrading the environment.
model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=50,
    num_boost_round=10000,
    early_stopping_rounds=8
)

In [None]:
# Plot the feature importances of the trained model.
lgb.plot_importance(model)
plt.show()

In [None]:
# You can only call make_env() once, so don't lose it!
env = riiideducation.make_env()

In [None]:
iter_test = env.iter_test()

In [None]:
# Score every batch served by the competition API and submit the predictions.
for test_df, sample_prediction_df in iter_test:
    # Same missing-value handling as for the training data.
    test_df = merge_fill_na(test_df)
    explanation_flag = test_df['prior_question_had_explanation'].astype('bool')
    test_df['prior_question_had_explanation'] = explanation_flag
    test_df['answered_correctly'] = model.predict(test_df[features])
    # Only rows with content_type_id == 0 are submitted.
    is_question = test_df['content_type_id'] == 0
    env.predict(test_df.loc[is_question, ['row_id', 'answered_correctly']])