Still in development.  
Used this [notebook](https://www.kaggle.com/ilialar/simple-eda-and-baseline) as inspiration.

In [None]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Number of rows of train.csv loaded for the in-memory analyses below.
NROWS = 10_000_000

# Complete description

In [None]:
# Explicit column types keep the memory footprint of train.csv small.
dtype = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

# With `chunksize`, read_csv returns an iterator of DataFrames of `nrows` rows each
# rather than a single DataFrame.
nrows = 10**7
train = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    dtype=dtype,
    chunksize=nrows,
)

In [None]:
# Walk through the chunked reader and show per-column info for every chunk.
# After the loop `i` is the last chunk and `j` is the number of chunks that
# came before it; the following cell uses both to compute the total row count.
j = 0
for i in train:
    # `show_counts` replaces the `null_counts` keyword (deprecated in pandas 1.2,
    # removed in 2.0). DataFrame.info() prints its report itself and returns None,
    # so it is not wrapped in print().
    i.info(show_counts=True)
    j += 1
j -= 1

In [None]:
# Total rows = (chunks before the last one) * chunk size + length of the last chunk.
# The count is an integer, so it is formatted with %d rather than %f.
print('Number of rows: %d' % (j*nrows+len(i)))

# Analysis of questions ('content_type_id'=0)


We can see that there are very few lectures compared to questions.  
We will assess the dataset without the rows containing missing values.

In [None]:
# Load the first NROWS rows, drop rows with any missing value, and discard
# rows whose answered_correctly is -1.
train = (
    pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv', nrows=NROWS, dtype=dtype)
    .dropna()
    .loc[lambda frame: frame['answered_correctly'] != -1]
)
train

No missing values in the questions.

In [None]:
# Keep only the rows flagged as questions (content_type_id == 0).
is_question = train['content_type_id'].eq(0)
train = train.loc[is_question]
train

In [None]:
# Summary statistics of the retained question rows.
train_summary = train.describe()
train_summary

In [None]:
# Share of the NROWS loaded rows that remain after filtering, and number of
# distinct questions relative to the number of retained rows.
retained_rows = len(train)
distinct_questions = train['content_id'].nunique()
print('Proportion of questions:%f' % (retained_rows/NROWS))
print('Proportion of unique questions:%f' % (distinct_questions/retained_rows))

### 1. timestamp column

Histogram of timestamp

In [None]:
# Coarse view (10 bins) of the timestamp distribution.
train['timestamp'].hist(bins=10)

In [None]:
# Finer view (100 bins) of the timestamp distribution.
train['timestamp'].hist(bins=100)

Histogram of timestamp by user

In [None]:
# One timestamp histogram per user, restricted to the first 10**5 rows.
train.head(10**5).groupby('user_id')['timestamp'].hist(bins=10)

In [None]:
# Same per-user timestamp histograms with 100 bins.
train.head(10**5).groupby('user_id')['timestamp'].hist(bins=100)

As expected, we can see different behaviour for different users.

### 2. user_id column

Questions descriptions by user

In [None]:
# Number of retained question rows per user, computed once and summarised.
questions_per_user = train.groupby('user_id')['row_id'].count()
print('Average number of question per student: %f' % questions_per_user.mean())
print('Standard deviation of number of question per student: %f' % questions_per_user.std())

In [None]:
# Full distribution summary of the number of question rows per user.
rows_per_user = train.groupby('user_id')['row_id'].count()
print(rows_per_user.describe())

Again, there is a lot of variation between users: a few users answer many questions, while most of them answer only a few.

In [None]:
# Histogram (10 bins) of the number of question rows per user.
train.groupby('user_id')['row_id'].count().hist(bins=10)

In [None]:
# Histogram (100 bins) of the number of question rows per user.
train.groupby('user_id')['row_id'].count().hist(bins=100)

### 3. task_container_id column

In [None]:
# Distinct task_container_id values relative to the rows actually present in `train`.
# `train` has been filtered since loading, so the denominator is len(train) rather
# than NROWS, matching how the content_id proportions are computed.
print('Proportion of unique task_container_id: %f' % (train.task_container_id.nunique()/len(train)))

In [None]:
# Distribution summary of the number of rows per task_container_id.
train.groupby('task_container_id')['row_id'].count().describe()

### 4. content_id column

In [None]:
# Fraction of distinct content_id values that occur in exactly one row.
rows_per_content = train.groupby('content_id')['row_id'].count()
print('Percentage of questions that appears only once: %f' % rows_per_content.eq(1).mean())

In [None]:
# Number of distinct content_id groups relative to the number of rows.
distinct_content = train.groupby('content_id').ngroups
print('Percentage of unique question: %f' % (distinct_content/len(train)))

In [None]:
# Coarse view (10 bins) of how often content_id ranges occur.
train['content_id'].hist(bins=10)

In [None]:
# Finer view (100 bins) of how often content_id ranges occur.
train['content_id'].hist(bins=100)

Some questions appear more often than others, and the ids are of course not ordered. We should be careful not to treat this feature as ordinal in the model.

### 5. prior_question_elapsed_time column

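Average, per student, of `prior_question_elapsed_time` (the time spent on the previous question bundle according to the competition's data description), and its standard deviation across students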

In [None]:
# Per-user mean of prior_question_elapsed_time, then the mean and standard
# deviation of those per-user means.
mean_elapsed_per_user = train.groupby('user_id')['prior_question_elapsed_time'].mean()
print(mean_elapsed_per_user.mean())
print(mean_elapsed_per_user.std())

In [None]:
# Coarse view (10 bins) of prior_question_elapsed_time.
train['prior_question_elapsed_time'].hist(bins=10)

In [None]:
# Finer view (100 bins) of prior_question_elapsed_time.
train['prior_question_elapsed_time'].hist(bins=100)

In [None]:
# One prior_question_elapsed_time histogram per user over the first 10**5 rows.
train.head(10**5).groupby('user_id')['prior_question_elapsed_time'].hist(bins=10)

In [None]:
# Same per-user prior_question_elapsed_time histograms with 100 bins.
train.head(10**5).groupby('user_id')['prior_question_elapsed_time'].hist(bins=100)

### 6. prior_question_had_explanation column


In [None]:
# Share of True/False values of prior_question_had_explanation among the rows of `train`.
# normalize=True divides by the number of rows actually counted; dividing by NROWS
# would understate both shares because `train` has been filtered since loading.
print('Percentage of questions that had an explanation: \n%s' % train.prior_question_had_explanation.value_counts(normalize=True))

In [None]:
# Row counts per (user, prior_question_had_explanation) pair, computed once.
explanation_counts = (
    train.groupby(['user_id', 'prior_question_had_explanation'])
    .count()
    .row_id
    .reset_index()
)
# One row per user that has at least one True value.
users_with_true = explanation_counts[explanation_counts.prior_question_had_explanation == True]
print('Description of True prior_question_had_explanation per user: \n%s' % users_with_true.describe().row_id)
print('Number of user without True value: %d' % (len(train.user_id.unique()) - len(users_with_true)))

In [None]:
# Row counts per (user, prior_question_had_explanation) pair, computed once.
explanation_flag_counts = (
    train.groupby(['user_id', 'prior_question_had_explanation'])
    .count()
    .row_id
    .reset_index()
)
# One row per user that has at least one False value.
users_with_false = explanation_flag_counts[explanation_flag_counts.prior_question_had_explanation == False]
print('Description of False prior_question_had_explanation per user: \n%s' % users_with_false.describe().row_id)
print('Number of user without False value: %d' % (len(train.user_id.unique()) - len(users_with_false)))

### 7. answered_correctly column

In [None]:
# Overall mean of answered_correctly and its histogram.
correct_flags = train['answered_correctly']
print('Percentage of questions answered correctly: %f' % correct_flags.mean())
correct_flags.hist(bins=10)

Histograms of the percentage of questions answered correctly by user

In [None]:
# Row counts per (user, answered_correctly) pair; the rows with answered_correctly == 1
# give the number of correct answers for every user that has at least one.
answer_counts = train.groupby(['user_id', 'answered_correctly']).count().reset_index()
count_answered_correctly_true_per_user = answer_counts[answer_counts.answered_correctly == 1].set_index('user_id')

# `results.row_id` holds the number of correct answers per user: it starts at 0 for
# everyone and is overwritten for the users present in the table above.
results = train.groupby('user_id').count()
results['row_id'] = 0
results.loc[count_answered_correctly_true_per_user.index, 'row_id'] = count_answered_correctly_true_per_user.row_id

In [None]:
# Correct answers divided by answered questions, per user (10 bins).
correct_rate_per_user = results['row_id'] / train.groupby('user_id')['row_id'].count()
correct_rate_per_user.hist(bins=10)

In [None]:
# Correct answers divided by answered questions, per user (100 bins).
correct_rate_by_user = results['row_id'] / train.groupby('user_id')['row_id'].count()
correct_rate_by_user.hist(bins=100)

In [None]:
# Per-user correctness rate restricted to users with fewer than 50 correct answers.
under_50_correct = results.row_id < 50
answered_per_user = train.groupby('user_id').count()
(results.loc[under_50_correct, 'row_id'] / answered_per_user.loc[under_50_correct, 'row_id']).hist(bins=100)

In [None]:
# Per-user correctness rate restricted to users with at least 50 correct answers.
# The same mask is applied to numerator and denominator; with `>=50` on one side and
# `>50` on the other, users with exactly 50 correct answers became NaN and were
# silently left out of the histogram.
at_least_50_correct = results.row_id >= 50
answers_by_user = train.groupby('user_id').count()
(results.loc[at_least_50_correct, 'row_id'] / answers_by_user.loc[at_least_50_correct, 'row_id']).hist(bins=100)

In [None]:
# Per-user correctness rate restricted to users with more than 1000 correct answers.
# The original numerator was filtered with `>=50` and the denominator with `>1000`;
# the plot only showed the `>1000` users because the others turned into NaN during
# index alignment. Using one mask on both sides makes the selection explicit.
over_1000_correct = results.row_id > 1000
answers_per_user = train.groupby('user_id').count()
(results.loc[over_1000_correct, 'row_id'] / answers_per_user.loc[over_1000_correct, 'row_id']).hist(bins=100)

# Analysis of lecture ('content_type_id'=1)

In [None]:
# Reload the first NROWS rows and keep only the lecture rows (content_type_id == 1).
lectures = (
    pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv', nrows=NROWS, dtype=dtype)
    .loc[lambda frame: frame['content_type_id'] == 1]
)

In [None]:
# Histogram (100 bins) of the number of lecture rows per user.
lectures.groupby('user_id')['row_id'].count().hist(bins=100)

It might be interesting to have a feature that keeps information about the lectures a student followed (number of lectures, type of lectures, how long ago they attended them, ...).

In [None]:
# Reload the first NROWS rows and keep complete rows that carry a real answer.
questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv', nrows=NROWS, dtype=dtype)
questions = questions.dropna()
questions = questions[questions['answered_correctly']!=-1]

# Mean of answered_correctly per user.
correctness_rate = questions.groupby('user_id').answered_correctly.mean()

# Number of lecture rows per user, aligned on the users of `correctness_rate`.
# reindex() fills users without lectures with 0 and ignores lecture users that
# have no retained question row; assigning through .loc with such unknown labels
# would raise a KeyError.
lectures_count = (
    lectures.groupby('user_id').row_id.count()
    .reindex(correctness_rate.index, fill_value=0)
)

plt.scatter(correctness_rate, lectures_count)
plt.xlabel("Correctness rate per student")
plt.ylabel("Number of lectures attended per student")
plt.show()

We can see that students who attend lectures have a higher correctness rate.

# Questions

In [None]:
# Question metadata table.
questions_csv = '/kaggle/input/riiid-test-answer-prediction/questions.csv'
questions_type = pd.read_csv(questions_csv)
questions_type

In [None]:
# Rows whose question_id differs from their bundle_id, their share of all
# questions, and the distribution of bundle sizes.
differs_from_bundle = questions_type.question_id != questions_type.bundle_id
bundled_questions = questions_type[differs_from_bundle]
print(bundled_questions)
print('\nPercentage of questions served together: %f\n' % (len(bundled_questions)/len(questions_type)))
bundle_sizes = questions_type.bundle_id.value_counts()
print('Description of unique value of bundle_id:\n%s\n' % bundle_sizes.describe())

In [None]:
# Distribution of the number of questions per `part` value.
questions_per_part = questions_type['part'].value_counts()
print('Description of unique value of part:\n%s\n' % questions_per_part.describe())

In [None]:
# Flatten the whitespace-separated `tags` strings into one list of tags.
# Missing values are dropped first: casting NaN to str would yield the literal
# 'nan', which would then be counted as if it were a real tag.
all_tags = [
    tag
    for tag_string in questions_type['tags'].dropna().astype(str)
    for tag in tag_string.split()
]
print('Description of unique value of tags:\n%s\n' % pd.Series(all_tags).value_counts().describe())

Some tags appear very often while others don't: the tag distribution is very unbalanced.

# Lectures

In [None]:
# Lecture metadata table.
lectures_csv = '/kaggle/input/riiid-test-answer-prediction/lectures.csv'
lectures_type = pd.read_csv(lectures_csv)
lectures_type

In [None]:
# Distribution of the number of lectures per `part` and per `tag`, followed by
# the raw counts of each lecture type.
print('Description of unique value of part:\n%s\n' % lectures_type.part.value_counts().describe())
print('Description of unique value of tag: \n%s\n' % lectures_type.tag.value_counts().describe())
# The counted column is the lecture `type_of`, so the label refers to lectures
# (the original message said "questions").
print('Value counts of the type of lectures: \n%s' % lectures_type.type_of.value_counts())

There are very few "intention" and "starter" lectures. We should check whether many students attended these types of lectures and be careful that the model doesn't overfit on them. The same reasoning applies to the tag feature, since some tags appear only once.