1. Problem Identification

2. Data Wrangling
    * Data Collection: loading and joining
    * Data Definition
        * Column names
        * Data types
        * Count/Percent of unique values
    * Data Cleaning
        * NA or missing data
        * Duplicates
3. Exploratory Data Analysis
4. Pre-processing, Training Data Development and Modeling
5. Documentation

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
!pip install seaborn_qqplot
from seaborn_qqplot import pplot

import gc

# import riiideducation
# env = riiideducation.make_env()

# # datatable installation with internet
!pip install datatable==0.11.0 > /dev/null
import datatable as dt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory



import os
# Print the full path of every file found under the read-only Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**2.1. Data loading and joining**

Combine training, lectures and questions datasets for more insight.
The training file is too large so datatable was used to read train.csv.


In [None]:
%%time
# train_df = dt.fread('/kaggle/input/riiid-test-answer-prediction/train.csv')
# train_df = train_df.to_pandas()
# meta_lectures = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
# meta_lectures = meta_lectures.to_pandas()
# meta_questions = dt.fread('/kaggle/input/riiid-test-answer-prediction/questions.csv')
# meta_questions = meta_questions.to_pandas()

# train_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv', 
#                       dtype=dtypes)
# Load the first 10**6 training rows plus the full lecture and question metadata tables.
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    nrows=10**6,
    low_memory=False,
)
lectures = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')

# Report the (rows, columns) shape of each table.
for caption, table in (
    ("Train size:", train_df),
    ("Lectures size:", lectures),
    ("Questions size:", questions),
):
    print(caption, table.shape)


In [None]:
# Preview the first rows of the loaded training sample.
train_df.head()

In [None]:
# Preview the first rows of the lecture metadata.
lectures.head()

In [None]:
# Preview the first rows of the question metadata.
questions.head()

In [None]:
# Check that every record has metadata associated with it: lecture rows
# (content_type_id == 1) are looked up in lectures.lecture_id and question rows
# (content_type_id == 0) in questions.question_id.
lecture_ids = train_df.loc[train_df.content_type_id == 1, 'content_id']
question_ids = train_df.loc[train_df.content_type_id == 0, 'content_id']
n_lec = lecture_ids.isin(lectures.lecture_id).sum()
n_ques = question_ids.isin(questions.question_id).sum()
print("number of lectures that has a match: {}".format(n_lec))
print("number of questions that has a match: {}".format(n_ques))
# Compare against the number of rows actually loaded rather than a hard-coded 10**6,
# so the check stays correct if the sample size changes or the file is shorter.
print("total number equal to number of records: ", (n_lec + n_ques) == len(train_df))

In [None]:
# Per-column memory footprint in bytes; deep=True also measures object-dtype contents.
train_df.memory_usage(deep=True)

In [None]:
# chunk_list = []  # append each chunk df here 
# ids = set()
# # Each chunk is in df format
# for chunk in df_chunk:  
#     # perform data filtering 
#     chunk = chunk.drop_duplicates(['timestamp'])
#     chunk = chunk[~chunk['timestamp'].isin(ids)]
#     ids.update(chunk['timestamp'].values)
#     # Once the data filtering is done, append the chunk to list
#     chunk_list.append(chunk)
    
# # concat the list into dataframe 
# df_concat = pd.concat(chunk_list)

In [None]:
# Column dtypes and non-null counts of the training sample.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the lecture metadata.
lectures.info()

In [None]:
# Column dtypes and non-null counts of the question metadata.
questions.info()

In [None]:
# Print the fully duplicated rows of each table; an empty frame means there are none.
for table in (train_df, lectures, questions):
    duplicated_rows = table[table.duplicated()]
    print(duplicated_rows)

In [None]:
# train_df.drop_duplicates(inplace=True)
# lectures.drop_duplicates(inplace=True)
# questions.drop_duplicates(inplace=True)
# print(train_df.shape)
# print(lectures.shape)
# printquestions.shape)

Got a memory error when trying to apply drop_duplicates to the full train_df.
The first 10**6 records don't contain duplicates.

In [None]:
# Join the lecture rows (content_type_id == 1) with their metadata, keeping only the
# lecture type from the metadata table.
train_lec = train_df.loc[
    train_df.content_type_id == 1,
    ['user_id', 'timestamp', 'content_id', 'task_container_id'],
]
train_lec_merge = (
    pd.merge(train_lec, lectures, how='left', left_on=['content_id'], right_on=['lecture_id'])
    .drop(columns=['lecture_id', 'content_id', 'part', 'tag'])
    .rename(columns={'type_of': 'type_of_lec'})
)

# num_lec_watched: running count of lecture rows per user in chronological order,
# i.e. the number of lectures watched up to each record.
train_df = train_df.sort_values(by=['user_id', 'timestamp'])
train_df['num_lec_watched'] = train_df.groupby('user_id')['content_type_id'].cumsum()
# TODO: decide how to include the lecture types.

In [None]:
# Keep only the question rows; content_type_id is constant afterwards, so drop it.
is_question = train_df.content_type_id == 0
train_q = train_df.loc[is_question].drop(columns=['content_type_id'])

In [None]:
# # combine train_df, lectures, questions
# lectures["content_type_id"] = 1
# questions["content_type_id"] = 0
# train_1 = train_df.merge(lectures, how='left',left_on=['content_id','content_type_id'], right_on = ['lecture_id','content_type_id'])\
#                     .drop(columns=['lecture_id'])
# train = train_1.merge(questions,how='left', left_on=['content_id','content_type_id'], right_on = ['question_id','content_type_id'])\
#                     .drop(columns=['question_id','content_type_id'])
# gc.collect()
# train = train.rename(columns={'part_x':'part_lec', 'part_y':'part_ques', 'tag':'tag_lec', 'tags':'tag_ques'})
# train.head()

In [None]:
# Collect the columns that hold at most one distinct (non-null) value.
distinct_counts = train_q.nunique()
one_value = distinct_counts[distinct_counts <= 1].index.tolist()
print(f"Number of columns with only one unique value: {len(one_value)}")

**2.2. Data definition**

In [None]:
# (rows, columns) of the question-only training frame.
print(f"Training set shape: {train_q.shape}")

In [None]:
# Column dtypes and non-null counts of the question-only frame.
train_q.info()

In [None]:
%%time
# Downcast the question rows to compact dtypes.
train_q['task_container_id'] = train_q['task_container_id'].astype('int16')
train_q['user_answer'] = train_q['user_answer'].astype('int8')
train_q['answered_correctly'] = train_q['answered_correctly'].astype('int8')
# Missing elapsed times are replaced with 0 before the float32 downcast.
train_q['prior_question_elapsed_time'] = (
    train_q['prior_question_elapsed_time'].fillna(0).astype('float32')
)

# Remember which rows had no explanation flag at all before the gaps are filled.
had_explanation = train_q['prior_question_had_explanation']
train_q['first_bundle'] = had_explanation.isnull()
# Fill the gaps with False and assign the result back to the frame. The previous
# in-place replace on a selected column is chained assignment (no effect under pandas
# copy-on-write) and relied on np.NaN, which was removed in NumPy 2.0.
train_q['prior_question_had_explanation'] = (
    had_explanation.where(had_explanation.notna(), False).astype('bool')
)
gc.collect()

In [None]:
# Preview the question rows after the dtype conversions.
train_q.head()

In [None]:
# Number of missing values in each column of the question-only frame.
train_q.isnull().sum()

In [None]:
# Show floats with two decimals, then summarise the two time columns (one row per column).
pd.set_option("display.float_format", "{:.2f}".format)
time_columns = ['timestamp', 'prior_question_elapsed_time']
train_q[time_columns].describe().transpose()

In [None]:
# Number of distinct users and the mean number of question records per user.
n_users = train_q.user_id.nunique()
print(f"Number of unique users: {n_users}")
print(f"Average number of records per user: {len(train_q.index) / n_users:.2f}")

In [None]:
# print("Number of lectures watched: {}". format((~train.tag_lec.isnull()).sum()))
# print("Number of questions asked: {}". format((train.tag_lec.isnull()).sum()))
# print("Percentage lectures in the samples: {:.2%}".format((~train.tag_lec.isnull()).sum()/train.shape[0]))

In [None]:
# Count the rows whose answered_correctly flag is 1 and 0 respectively.
n_correct = (train_q.answered_correctly == 1).sum()
n_incorrect = (train_q.answered_correctly == 0).sum()
print(f"Number of correct answers: {n_correct}")
print(f"Number of incorrect answers: {n_incorrect}")

Imbalanced dataset: there are about half as many incorrect records as correct records.

In [None]:
# print("Number of unique content ids: {}". format(train[["content_id","tag_lec"]].nunique()))
# print("Number of unique lectures: {}". format(train[train.tag_ques.isnull()].content_id.nunique()))
# print("Number of unique questions: {}". format(train[train.tag_lec.isnull()].content_id.nunique()))

In [None]:
# Print the number of distinct content ids among the question rows. Formatting the
# scalar count (instead of a one-column frame's nunique() Series) keeps the message on
# one line without the Series index and dtype footer.
print("Number of unique content ids: {}".format(train_q["content_id"].nunique()))

Some lectures and questions share the same content ids.

In [None]:
# Distinct ids available in each metadata table.
n_lecture_ids = lectures.lecture_id.nunique()
n_question_ids = questions.question_id.nunique()
print(f"Number of unique content ids in meta lectures: {n_lecture_ids}")
print(f"Number of unique content ids in meta questions: {n_question_ids}")

Not all the lectures and questions in the metadata are included in the sampled dataset.

In [None]:
# Distinct task_container_id values among the question rows.
print(f"Number of question/lecture bundles: {train_q.task_container_id.nunique()}")

In [None]:
# Share of question rows whose prior question had an explanation, and share of rows
# flagged as first_bundle.
n_rows = len(train_q.index)
n_explained = (train_q.prior_question_had_explanation == True).sum()
n_first_bundle = (train_q.first_bundle == True).sum()
print(f"Percentage of question that had explanation: {n_explained / n_rows:.2%}")
print(f"Percentage of question that were in first bundle: {n_first_bundle / n_rows:.2%}")
gc.collect()

In [None]:
# Percentage of questions that are neither in the first bundle nor preceded by an
# explained question. The mask is spelled out explicitly; the earlier
# `had_explanation != ~first_bundle` form depended on operator precedence and would also
# have counted any first-bundle row flagged as explained.
not_first_not_explained = (~train_q.first_bundle) & (~train_q.prior_question_had_explanation)
print("Percentage of questions that's not in the first bundle and not explained: {:.2%}".format(
    not_first_not_explained.sum() / len(train_q.index)))

In [None]:
# Mean of answered_correctly split by whether the prior question had an explanation.
explained_mask = train_q.prior_question_had_explanation == True
not_explained_mask = train_q.prior_question_had_explanation == False
cor_rate_ques_explained = train_q.loc[explained_mask, 'answered_correctly'].mean()
cor_rate_ques_not_explained = train_q.loc[not_explained_mask, 'answered_correctly'].mean()
print(f"Correctness rate with prior question explained: {cor_rate_ques_explained:.2%}")
print(f"Correctness rate with prior question not explained: {cor_rate_ques_not_explained:.2%}")

Whether the prior question was explained might have an effect on correctness. However, the percentage of questions in the first bundle is small, so it can be ignored. Drop the *first_bundle* column.

In [None]:
# Remove the helper flag; it is not carried into the exported data.
train_q = train_q.drop(columns=['first_bundle'])

**Export processed data to new file**

In [None]:
# Final dtypes and non-null counts of the frame that is about to be exported.
train_q.info()

In [None]:
# Write the cleaned question rows to train_cleaned.csv in the current directory,
# without the index column; the EDA section reads this file back.
train_q.to_csv("train_cleaned.csv", index=False)
# train.to_pickle("/kaggle/working/train_cleaned.pkl")

**3. Exploratory Data Analysis**

In [None]:
# Column name -> dtype mapping passed to pd.read_csv when reloading the cleaned CSV.
dtypes = dict(
    row_id='int64',
    timestamp='int64',
    user_id='int32',
    content_id='int32',
    task_container_id='int16',
    user_answer='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='boolean',
    num_lec_watched='int16',
)

In [None]:
# Reload the cleaned question data written earlier, applying the explicit dtypes mapping.
df = pd.read_csv("./train_cleaned.csv", dtype=dtypes)
# df = pd.read_pickle("./train_cleaned.pkl")

In [None]:
# Preview the reloaded data.
df.head()

In [None]:
# df[df.user_id==124]

In [None]:
# Confirm the dtypes applied on reload and the non-null counts.
df.info()

Whether the question is answered correctly might be associated with:
* Length of user interaction -- timestamp
* Number of questions in a bundle -- task_container_id
* Number of lectures in a bundle -- task_container_id
* Question-lecture ratio in a bundle -- task_container_id
* Length of time user took to answer the previous bundle -- prior_question_elapsed_time
* Whether an explanation was given for the previous question bundle -- prior_question_had_explanation

Regarding the questions file:
* The number of correct answer -- correct_answer
* The section of test -- part
* Questions with certain tags -- tags

Regarding the lectures file:
* Type of lecture provided -- type_of
* Certain lecture -- tag

In [None]:
# Count plot of the answered_correctly values.
sns.catplot(data=df, x="answered_correctly", kind="count")
plt.title('Correct/Incorrect answers')

**Lectures**

Whether having watched lectures increases correctness rate?

In [None]:
# Distribution of the per-record count of lectures watched so far.
lectures_watched = df["num_lec_watched"]
sns.distplot(lectures_watched, bins=20)
plt.title("distribution plot of number of watched lectures")
plt.ylabel("probability")

The histogram is highly skewed, so we perform log transformation and plot again, separating correctly/incorrectly answered questions.

In [None]:
# Log-scale distributions of num_lec_watched, drawn for correct answers first and
# incorrect answers second; zero counts are excluded because log(0) is undefined.
for outcome in (1, 0):
    selected = (df["answered_correctly"] == outcome) & (df['num_lec_watched'] != 0)
    sns.distplot(np.log(df.loc[selected, 'num_lec_watched']), bins=20)
plt.legend(['Correct','Incorrect'])
plt.title("log-histogram of number of watched lectures")
plt.ylabel("probability")

The log-transformed distributions for correctly and incorrectly answered questions are almost the same, and neither follows a normal distribution.

**Timestamp**

Whether timestamp is associated with correctness rate?

In [None]:
# sns.distplot(temp_[temp_["answered_correctly"]==1].timestamp,
#              hist=False,rug=True)
# sns.distplot(temp_[temp_["answered_correctly"]==0].timestamp,
#              hist=False,rug=True)
# sns.distplot(temp_[temp_["answered_correctly"]==-1].timestamp,
#              hist=False,rug=True)

# Density curve and rug of timestamp, drawn for correct answers first and incorrect
# answers second.
for outcome in (1, 0):
    outcome_timestamps = df.loc[df["answered_correctly"] == outcome, "timestamp"]
    sns.distplot(outcome_timestamps, hist=False, rug=True)
plt.legend(['Correct','Incorrect'])
plt.title("rug plot of timestamp")
plt.ylabel("Probability")
plt.xlabel("timestamp")

gc.collect()

In [None]:
# The timestamp data is skewed, so plot its log instead (correct answers first,
# incorrect answers second); zero timestamps are excluded because log(0) is undefined.
positive_time = df['timestamp'] != 0
for outcome in (1, 0):
    selected = (df["answered_correctly"] == outcome) & positive_time
    sns.distplot(np.log(df.loc[selected, 'timestamp']))

# ax.set_xlim(0,4*10**10)
plt.legend(['Correct','Incorrect'])
plt.title("log-histogram of timestamp")
plt.ylabel("Probability")
plt.xlabel("timestamp after log transformation")
gc.collect()

Incorrectly answered questions have a higher density at smaller timestamp values. There is no other obvious timestamp-related difference between the two categories. The log timestamp does not follow a normal distribution.

**Previous elapsed time**

Whether previous elapsed time is associated with correctness rate?

In [None]:
# Density curve and rug of prior_question_elapsed_time, drawn for correct answers
# first and incorrect answers second.
for outcome in (1, 0):
    outcome_elapsed = df.loc[df["answered_correctly"] == outcome, "prior_question_elapsed_time"]
    sns.distplot(outcome_elapsed, hist=False, rug=True)
plt.legend(['Correct','Incorrect'])
plt.title("rug plot of prior_question_elapsed_time")
plt.ylabel("Probability")
plt.xlabel("prior_question_elapsed_time")

gc.collect()

Plot relationship between task_container_id and correctness
* Number of questions in a bundle -- task_container_id
* Number of lectures in a bundle -- task_container_id
* Question-lecture ratio in a bundle -- task_container_id

In [None]:
# Log-scale distributions of prior_question_elapsed_time for correct and incorrect
# answers; zero values are excluded because log(0) is undefined.
nonzero_elapsed = df['prior_question_elapsed_time'] != 0
sns.distplot(np.log(df.loc[(df["answered_correctly"] == 1) & nonzero_elapsed, 'prior_question_elapsed_time']))
sns.distplot(np.log(df.loc[(df["answered_correctly"] == 0) & nonzero_elapsed, 'prior_question_elapsed_time']))

# Only two series are drawn, so the legend has two entries (the stale third
# 'Lectures' label was removed).
plt.legend(['Correct','Incorrect'])
plt.title("log-histogram of prior_question_elapsed_time")
plt.ylabel("Probability")
plt.xlabel("prior_question_elapsed_time after log transformation")
gc.collect()

**Task containers**

In [None]:
# What are the frequencies of question bundles being visited?
# size() yields one row count per task_container_id. The previous count() returned a
# whole DataFrame of per-column non-null counts, which is not a single sample to plot.
container_freq = df.groupby("task_container_id").size()
sns.distplot(container_freq)
plt.title("histogram of task_container_id frequency")
plt.ylabel("Probability")
plt.xlabel("Number of times task_container_id frequency being visited")

In [None]:
# print(df.groupby(["content_id"]).size().nlargest(10))
# print(df.groupby(["content_id"]).size().nsmallest(10))

In [None]:
# Rows sharing a user and a timestamp count as one visit: keep the first row of each
# (user_id, timestamp) group, then list the most and least visited containers.
df_unique_tasks = df.groupby(['user_id','timestamp']).first()
df_unique_tasks.head()
visits_per_container = df_unique_tasks.groupby('task_container_id').size()
print(visits_per_container.nlargest(10))
print(visits_per_container.nsmallest(10))

The number of visits varies from 3824 down to 1. The 10 most visited task containers have smaller id numbers than the least visited ones.

Whether the times of visit and task container id are correlated?

In [None]:
# Visits per task container as a two-column frame (task_container_id, freq), plotted
# as a scatter of id against frequency.
count_container = (
    df_unique_tasks.groupby('task_container_id')
    .size()
    .reset_index(name="freq")
)
sns.scatterplot(data=count_container, x='task_container_id', y="freq", linewidth=0, s=2)

We can see that the question bundles with higher id number are visited less.

In [None]:
# Distribution of the log of the per-container visit counts.
visit_counts = df_unique_tasks.groupby('task_container_id').size()
log_visit_counts = np.log(visit_counts)
sns.distplot(log_visit_counts)
plt.title("log-histogram of task_container_id frequency")
plt.ylabel("Probability")
plt.xlabel("Number of times task_container_id frequency being visited(log)")

Most question bundles are only visited once. Could be because of the sampling.

In [None]:
# sns.scatterplot(df.task_container_id, df.groupby(["task_container_id"]).size(),size=1, linewidth=0)
# plt.xlabel("task container id")
# plt.ylabel("number of times visited")
# plt.title("Scatter plot of task container id vs number of times visited")

In [None]:
# Turn the (user_id, timestamp) index levels back into ordinary columns, then plot
# task_container_id against timestamp.
df_unique_tasks = df_unique_tasks.reset_index()
p = sns.jointplot(data=df_unique_tasks, x='task_container_id', y='timestamp', s=1)
p.set_axis_labels("task container id", "timestamp")
p.fig.suptitle("Scatter plot of task container id vs timestamp")

Tasks are visited less often when the task container id is larger, but task container id and timestamp are not linearly correlated. This means that a user who has been learning for longer is not necessarily assigned question bundles with higher task container ids.

However, short straight lines with different slopes are visible in the task_container_id vs timestamp plot. We can guess that the learning speed of each user is different.

So we derive current_question_elapsed_time from prior_question_elapsed_time and plot task_container_id vs current_question_elapsed_time to verify our guess.

In [None]:
# df.sort_values(by=['user_id','timestamp'])
# df["current_question_elapsed_time"] = df.groupby(['user_id']).prior_question_elapsed_time.shift(-1)
# df[df.user_id==124][["prior_question_elapsed_time","current_question_elapsed_time"]]

In [None]:
# Build a per-visit table in chronological order for each user.
time_table = df_unique_tasks.sort_values(by=['user_id','timestamp'])[
    ['user_id','prior_question_elapsed_time','task_container_id','timestamp']
]
# The next row's prior_question_elapsed_time of the same user is taken as the elapsed
# time of the current row; each user's last row has no next row and becomes NaN.
time_table["current_question_elapsed_time"] = (
    time_table.groupby('user_id')['prior_question_elapsed_time'].shift(-1)
)
# Fill the unknown values with the user's own mean. transform('mean') keeps the
# original row index, so the result aligns on assignment; groupby().apply() returns a
# group-keyed MultiIndex on recent pandas versions and breaks that alignment.
user_mean_elapsed = (
    time_table.groupby('user_id')['current_question_elapsed_time'].transform('mean')
)
time_table['current_question_elapsed_time'] = (
    time_table['current_question_elapsed_time'].fillna(user_mean_elapsed)
)
time_table = time_table.drop(columns=['prior_question_elapsed_time'])

In [None]:
# Attach current_question_elapsed_time to every question row of the matching visit.
visit_keys = ['user_id', 'task_container_id', 'timestamp']
df = pd.merge(df, time_table, how='left', on=visit_keys)
df.head()

In [None]:
# Joint scatter of task_container_id against the derived current question elapsed time.
p = sns.jointplot(data=df, x='task_container_id', y='current_question_elapsed_time', s=1)
p.set_axis_labels("task container id", "question_elapsed_time")
p.fig.suptitle("Scatter plot of task container id vs question elapsed time")

There is an edge at task container id 5600~. One guess is that not many students proceeded to questions bundles above that number.

In [None]:
# Distinct users having at least one row with task_container_id >= 5500.
n_users_high_bundle = df.loc[df.task_container_id >= 5500, 'user_id'].nunique()
print(f"Number of students finished bundles above 5500: {n_users_high_bundle}")

In [None]:
# Distinct users having at least one row with current_question_elapsed_time >= 295000.
n_users_slow = df.loc[df.current_question_elapsed_time >= 295000, 'user_id'].nunique()
print(f"Number of students finished one question using more that 295000ms: {n_users_slow}")

**Task container id**

Is there a correctness rate difference in different question bundles?


In [None]:
# Correctness rate per task_container_id, excluding container ids above 5500.
fig = plt.figure(figsize=(20,6))
ax1 = fig.add_subplot(121)
# Use the vectorised group mean rather than a per-group Python lambda; the result is
# the same two-column frame (task_container_id, answered_correctly).
cor_container_avg = df.groupby('task_container_id').answered_correctly.mean().reset_index()
cor_container_avg_5k = cor_container_avg[cor_container_avg.task_container_id<=5500]
# Left panel: per-container rate against container id with a regression line.
ax1 = sns.regplot(x='task_container_id', y='answered_correctly',data=cor_container_avg_5k,line_kws={'color':'magenta'}, scatter_kws={'s':1},ci=99)
plt.title("Task container id vs correctness rate")
plt.ylabel("Correctness rate")
plt.xlabel("Task container id")
# Right panel: distribution of the per-container correctness rates.
ax2 = fig.add_subplot(122)
ax2 = sns.distplot(cor_container_avg_5k.answered_correctly, bins=50)
plt.title("Histogram")
plt.ylabel("Frequency")
plt.xlabel("Correctness rate")

The correctness rate varies from around 0.65(smaller ids) to 0.4-0.8(larger ids). For questions in bundle number over 5500, since they are visited only once, the correctness rate is either 0 or 1. There is no evidence that correctness is correlated with task container id.  

In [None]:
# Whether questions in a bundle have higher correctness?
# A row belongs to a multi-question bundle when a neighbouring row has the same
# task_container_id AND the same user_id. Without the user check, the last row of one
# user and the first row of the next could be paired just because their container ids
# happen to match. This relies on the rows of each user being adjacent in df.
same_as_next = (df.task_container_id.shift(-1) == df.task_container_id) & (df.user_id.shift(-1) == df.user_id)
same_as_prev = (df.task_container_id.shift(1) == df.task_container_id) & (df.user_id.shift(1) == df.user_id)
df_bundle = df.loc[same_as_next | same_as_prev]
print("The correctness of questions in bundle: {:.2%}".format(df_bundle.answered_correctly.mean()))

The correctness of questions in bundle is not higher than the average of all questions.

In [None]:
# Inspect the first ten rows selected as belonging to multi-question bundles.
df_bundle.head(10)

**Content id**

Is there a correctness rate difference in different questions?

In [None]:
# Correctness rate per content_id, using only rows whose task_container_id is at most 5500.
fig = plt.figure(figsize=(20,6))
ax1 = fig.add_subplot(121)
# Use the vectorised group mean rather than a per-group Python lambda; the result is
# the same two-column frame (content_id, answered_correctly).
cor_content_avg_5k = df[df.task_container_id<=5500].groupby('content_id').answered_correctly.mean().reset_index()
# Left panel: per-question rate against content id with a regression line.
ax1 = sns.regplot(x='content_id', y='answered_correctly',data=cor_content_avg_5k,line_kws={'color':'magenta'}, scatter_kws={'s':1},ci=99)
plt.title("Content id vs correctness rate")
plt.ylabel("Correctness rate")
plt.xlabel("Content id")
# Right panel: distribution of the per-question correctness rates.
ax2 = fig.add_subplot(122)
ax2 = sns.distplot(cor_content_avg_5k.answered_correctly, bins=50)
plt.title("Histogram")
plt.ylabel("Frequency")
plt.xlabel("Correctness rate")

There is no pattern of correctness across content id numbers, but some questions have a 100% correctness rate while others have 0%. We can investigate how many records there are for those 0% or 100% correct questions.

In [None]:
# Content ids whose correctness rate is exactly 1 or exactly 0.
cor_100_id = cor_content_avg_5k[cor_content_avg_5k.answered_correctly==1].content_id
print("Number of 100% correct questions: {}".format(len(cor_100_id)))
cor_0_id = cor_content_avg_5k[cor_content_avg_5k.answered_correctly==0].content_id
print("Number of 0% correct questions: {}".format(len(cor_0_id)))

fig = plt.figure(figsize=(20,6))
# plot the sample size of 100% correct questions
# size() gives one record count per content_id. The previous count() returned a
# DataFrame of per-column non-null counts, which is not a single sample to plot.
ax1 = fig.add_subplot(121)
cor_100_num = df[df.content_id.isin(cor_100_id)].groupby('content_id').size()
ax1 = sns.distplot(cor_100_num, bins=60)
plt.title("Histogram")
plt.ylabel("Frequency")
plt.xlabel("Number of samples making up 100% correctness")
# plot the sample size of 0% correct questions
ax2 = fig.add_subplot(122)
cor_0_num = df[df.content_id.isin(cor_0_id)].groupby('content_id').size()
ax2 = sns.distplot(cor_0_num)
plt.title("Histogram")
plt.ylabel("Frequency")
plt.xlabel("Number of samples making up 0% correctness")


Most of the 0% or 100% correct questions have only 1 record, so predictions for those questions might not be accurate.

In [None]:
# Plot the pairwise correlations as a dot matrix: the correlation frame is reshaped to
# long form (level_0, level_1, correlation) and both dot colour and size encode the value.
pairwise_corr = df.corr().stack()
corr_mat = pairwise_corr.reset_index(name="correlation")
dot_options = dict(
    x="level_0",
    y="level_1",
    hue="correlation",
    size="correlation",
    palette="Blues",
    edgecolor=".7",
    height=10,
    sizes=(100, 400),
    size_norm=(-.2, .8),
)
g = sns.relplot(data=corr_mat, **dot_options)
plt.xticks(rotation=90)

From the correlation matrix we can see that answered_correctly is not significantly correlated with any other single variable.

**4. Preprocessing and Feature Engineering**

* Create new features
* Select specific features
* Standardize numeric features
* (Split into testing and training datasets)
* Resampling training dataset

In [None]:
# Placeholder for feature selection. DataFrame.drop() with no labels raises
# ValueError, so the columns to remove are passed explicitly; the list is empty until
# the feature set is decided, which leaves df unchanged.
# TODO: list the columns that should be excluded from modeling.
columns_to_drop = []
df = df.drop(columns=columns_to_drop)

**Oversampling: SMOTE**

One approach to deal with imbalanced datasets is to oversample the minority class, which is incorrectly answered questions in this case. A widely used approach is Synthetic Minority Oversampling Technique (SMOTE) for the minority class.