In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [None]:
# NOTE(review): `riiideducation` is presumably the Kaggle-provided competition
# API module; it is imported here rather than with the general-purpose imports
# at the top of the notebook.
import riiideducation

# Build the competition environment once and keep it as a notebook-level global.
# NOTE(review): later cells presumably use `env` to iterate over the test set
# and submit predictions — confirm against the rest of the notebook.
env = riiideducation.make_env()

# Data Import

In [None]:
# Load the training data from a pre-pickled copy of the dataset.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
train_df = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")

# Print (rows, columns) as a quick sanity check that the load succeeded.
print("Train size:", train_df.shape)

In [None]:
# Show the dtype of every column as loaded.
print(train_df.dtypes)

# Convert the flag to a plain bool column. Missing values are filled with False
# *before* the cast: casting NaN straight to bool silently yields True (NaN is
# truthy), and a nullable "boolean" column that holds <NA> refuses the cast
# altogether. Filling first also makes this cell safe to re-run.
# NOTE(review): False is chosen on the assumption that a missing value means
# there was no prior question (hence no explanation) — confirm.
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation'].fillna(False).astype('bool')
)

In [None]:
# Read the four auxiliary competition tables in one pass; the order of the
# paths matches the order of the names they are unpacked into.
questions, lectures, example_test, example_sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/riiid-test-answer-prediction/questions.csv',
        '/kaggle/input/riiid-test-answer-prediction/lectures.csv',
        '/kaggle/input/riiid-test-answer-prediction/example_test.csv',
        '/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv',
    )
)

# Data exploration

## Train dataset

### First look

In [None]:
# Number of missing values in each column of the training data.
train_df.isnull().sum()

In [None]:
# Headline cardinalities of the training data: total rows, then the number of
# distinct values in the user, content and task-container id columns.
print("Number of rows : {}".format(len(train_df)))
print("Number of individual user : {}".format(train_df["user_id"].nunique()))
print("Number of individual question : {}".format(train_df["content_id"].nunique()))
print("Number of individual tasks: {}".format(train_df["task_container_id"].nunique()))

In [None]:
# Per-user count of non-null values in every column. Only `row_id` is used
# below, as the number of rows recorded for each user.
groupby_user_id = train_df.groupby("user_id").count()
rows_per_user = groupby_user_id["row_id"]

# Upper bound (exclusive) on rows per user for the zoomed-in right-hand panel.
threshold = 200

fig, (ax_all, ax_zoom) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: every user.
sns.distplot(rows_per_user, kde=False, ax=ax_all)
ax_all.set_title("Distribution of number of questions per users")

# Right panel: only users below the threshold, with finer bins.
sns.distplot(rows_per_user[rows_per_user < threshold], kde=False, bins=100, ax=ax_zoom)
ax_zoom.set_title("Distribution of number of questions per users (<{})".format(threshold))

# Summary statistics of the same per-user counts.
print("Average number of questions per user : {}".format(rows_per_user.mean()))
print("Median number of questions per user : {}".format(rows_per_user.median()))
print("75% quantile : {}".format(rows_per_user.quantile(0.75)))

In [None]:
# One month expressed in the units of `timestamp`, taken as 1/12 of a 365-day
# year (365 * 24 * 3600 * 1000 = 31,536,000,000).
# NOTE(review): assumes `timestamp` is in milliseconds — confirm.
MS_PER_MONTH = 31536000000/12

plt.figure(figsize=(15,5))

# Left panel: timestamp of every row, converted to months.
plt.subplot(1,2,1)

ts = train_df['timestamp']/MS_PER_MONTH
sns.distplot(ts, kde=False, bins=100, color="b")
plt.title("Timestamp histogram")
plt.xlabel("Month between first user interaction & current interaction")

# Right panel: each user's largest timestamp, converted to months.
plt.subplot(1,2,2)

# Select the column *before* aggregating so that only `timestamp` is reduced;
# the result is the same Series as taking the max of every column and then
# picking `timestamp`, without the wasted work on the other columns.
ts = train_df.groupby('user_id')['timestamp'].max()/MS_PER_MONTH

sns.distplot(ts, kde=False, bins=100, color="b")
plt.title("Histogram of last registered interactions")
plt.xlabel("Month between first & last interaction for each user")

It seems that most of the users don't stay active for a long time.

In [None]:
# Count how often each answer choice was selected.
plt.figure(figsize=(15,7))
# Pass the column as `x=` explicitly: a positional vector is deprecated in
# seaborn 0.11 and is interpreted as `data` from 0.12 on, which no longer
# produces one bar per answer value.
sns.countplot(x=train_df.user_answer)
plt.title("User answers count")

We would expect a uniform distribution across answers 0, 1, 2 and 3, but answer #2 appears to be chosen significantly less often.

In [None]:
# Count the rows for each value of `answered_correctly`.
plt.figure(figsize=(15,7))
# Pass the column as `x=` explicitly: a positional vector is deprecated in
# seaborn 0.11 and is interpreted as `data` from 0.12 on.
sns.countplot(x=train_df.answered_correctly, orient="v")
plt.title("Question answered correctly")

Approximately one third of the answers are incorrect.

In [None]:
# Keep only the rows whose recorded answer is not -1, then compare the counts
# of each `answered_correctly` value for every answer choice.
# NOTE(review): -1 presumably marks rows that are not answered questions — confirm.
train_df_f = train_df.loc[train_df["user_answer"].ne(-1)]

plt.figure(figsize=(15, 7))
sns.countplot(data=train_df_f, x="user_answer", hue="answered_correctly")

As expected, the proportion of right and wrong answers is the same for each answer number.

In [None]:
# Elapsed-time values with the missing entries removed. Dropping NaN directly
# on the column avoids boolean-indexing (and copying) the whole DataFrame just
# to read one column, and replaces the `isna() == False` comparison.
time_prior_question = train_df.prior_question_elapsed_time.dropna()
plt.figure(figsize=(15,7))

sns.distplot(time_prior_question, kde=False, color="b")
plt.title("Prior question elapsed time histogram")

In [None]:
# Counts of each `answered_correctly` value, split by whether the prior
# question had an explanation. Uses the filtered frame `train_df_f` built in
# an earlier cell.
plt.figure(figsize=(15, 7))
sns.countplot(
    data=train_df_f,
    x="prior_question_had_explanation",
    hue="answered_correctly",
)

The proportion of incorrect answers is higher when the prior question did not have an explanation.

## Question Dataset

In [None]:
# Preview the first five rows of the questions table, then its (rows, columns).
print(questions.head())
print(questions.shape)

In [None]:
# Count how many questions have each choice as their correct answer.
plt.figure(figsize=(15,7))
# Pass the column as `x=` explicitly: a positional vector is deprecated in
# seaborn 0.11 and is interpreted as `data` from 0.12 on.
sns.countplot(x=questions.correct_answer)
plt.title("Number of questions per correct answer")

This explains why answer #2 was picked less often than the others: it is also the correct answer for fewer questions.

In [None]:
# Count how many questions fall into each value of `part`.
plt.figure(figsize=(15,7))
# Pass the column as `x=` explicitly: a positional vector is deprecated in
# seaborn 0.11 and is interpreted as `data` from 0.12 on.
sns.countplot(x=questions.part)
plt.title("Number of questions per part")