In [None]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load train.csv in chunks and stitch the chunks into one frame.
#
# Explicit dtypes follow the feature description given in this notebook
# (row_id int64, user_id int32, content_id int16, ...). Without them pandas
# stores every numeric column as 64-bit, which inflates memory use on a file
# this large. prior_question_had_explanation is deliberately left to pandas'
# inference because it contains missing values.
train_dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "int8",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
}

chunk = 100000  # rows parsed per chunk
# Passing `chunksize` already makes read_csv return an iterable chunk reader,
# so the extra `iterator=True` flag is not needed.
df = pd.read_csv(
    '../input/riiid-test-answer-prediction/train.csv',
    chunksize=chunk,
    dtype=train_dtypes,
)
train = pd.concat(df, ignore_index=True)

In [None]:
# Read the two metadata tables (questions, then lectures) that accompany
# train.csv; each CSV is loaded in full with pandas' default settings.
questions, lectures = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/riiid-test-answer-prediction/questions.csv",
        "../input/riiid-test-answer-prediction/lectures.csv",
    )
)

In [None]:
# Report the (rows, columns) shape of each of the three loaded tables.
shape_report = f"Train Shape: {train.shape}\nQuestions Shape: {questions.shape}\nLectures Shape: {lectures.shape}"
print(shape_report)

<h2>Since the data set is huge, we'll take a random sample (10% of the total data set) that fairly represents it</h2>

In [None]:
# Draw a reproducible set of row positions covering 10% of the training
# frame. random.sample picks without replacement, so no position repeats.
random.seed(11)
n_rows = len(train)
sample_size = int(0.1 * n_rows)
samp = random.sample(range(n_rows), sample_size)
print(f"No of Samples: {len(samp)}")

In [None]:
# Materialise the sampled rows as an independent frame, then drop the full
# frame so its memory can be reclaimed.
# NOTE: after this cell runs, `train` no longer exists; re-running this cell
# requires re-running the loading cell first.
train_samp = train.iloc[samp].copy()
del train
print(f"Shape of Train Sample: {train_samp.shape}")

<h2>Training Sample EDA</h2>

<h3>Feature description of train.csv</h3>

row_id: (int64) ID code for the row.

timestamp: (int64) the time between this user interaction and the first event completion from that user.

user_id: (int32) ID code for the user.

content_id: (int16) ID code for the user interaction

content_type_id: (int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.

task_container_id: (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

user_answer: (int8) the user's answer to the question, if any. Read -1 as null, for lectures.

answered_correctly: (int8) if the user responded correctly. Read -1 as null, for lectures.

prior_question_elapsed_time: (float32) The average time it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.

prior_question_had_explanation: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

In [None]:
# Preview the first five sampled rows.
train_samp.head(n=5)

In [None]:
# Summary statistics for the numeric columns, transposed so that each
# feature is a row and each statistic a column.
train_samp.describe().transpose()

In [None]:
# Storage dtype of every column in the sample.
train_samp.dtypes

In [None]:
# Number of distinct non-null values in each column.
train_samp.nunique(axis=0, dropna=True)

<h2>% of missing values per feature</h2>

In [None]:
# Share of null entries per column, expressed as a percentage rounded to
# two decimal places.
missing_share = train_samp.isnull().mean()
(missing_share * 100).round(2)

In [None]:
# Columns that are analysed as categorical variables in the cells below.
categorical_features = [
    "content_type_id",
    "user_answer",
    "answered_correctly",
    "prior_question_had_explanation",
]

<h3>Though most of the features above are stored as integers (prior_question_had_explanation is boolean), they can all be treated as nominal categorical variables</h3>

In [None]:
# For every categorical column print its dtype, its number of distinct
# values and the frequency of each value (value_counts omits nulls by default).
for col in categorical_features:
    feature = train_samp[col]
    print(f"{col} | dtype: {feature.dtypes} | nunique: {feature.nunique()}\n{feature.value_counts()}\n\n")

In [None]:
# Draw one count plot per categorical column, each on its own figure.
for col in categorical_features:
    fig, ax = plt.subplots()
    # The column must be passed through the `x` keyword: seaborn deprecated
    # positional data vectors in 0.11, and from 0.12 the only positional
    # argument is `data`, so `sns.countplot(series)` no longer means "count x".
    sns.countplot(x=train_samp[col], ax=ax)
    ax.set_title(col)
    plt.show()

<h3>Inferences From The Above Distribution Analysis</h3>

content_type_id: In the majority of cases the event was a question being posed to the user.

user_answer: Analyzing this variable on its own, without additional information, is not meaningful. Hence skipping it for now.

answered_correctly: Most of the questions were answered correctly.

prior_question_had_explanation: In the majority of cases the prior question had an explanation.






In [None]:
# Relationship between "prior question had an explanation" and correctness.
# Keep only rows with a real answer: answered_correctly is -1 (null) for lectures.
prior_correct = train_samp.loc[
    train_samp["answered_correctly"] >= 0,
    ["prior_question_had_explanation", "answered_correctly"],
].copy()

# Row-normalised contingency table: each row (explanation False / True) sums
# to 1, so the cells are the share of incorrect / correct responses.
ct = pd.crosstab(
    index=prior_correct["prior_question_had_explanation"],
    columns=prior_correct["answered_correctly"],
    normalize="index",
)

# `ct` holds fractions in [0, 1]; scale by 100 for plotting so the axis really
# shows percentages. Each row of the data is one response, not one student,
# so the axis is labelled accordingly.
ax = (ct * 100).plot(kind="bar", stacked=True, figsize=(10, 5))
ax.set_ylabel("% of responses");

<h3>When the prior question had an explanation, most of the questions were answered correctly. When there was no explanation for the prior question, only around 50% of them were answered correctly</h3>

<h1>WORK IN PROGRESS</h1>