In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file shipped under the Kaggle input directory,
# printing each one as a full path.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install datatable

# Introduction

In this notebook I want to provide an exploratory data analysis. We read the data from the train.csv file and examine its connections with the questions.csv and lectures.csv files. 
They appear to be database extracts organised in a "star" schema. 

## Train.csv

From the challenge description, we get the following information:

>* row_id: ID code for the row. 

*It is an index column for the dataset*

>* timestamp: the time between this user interaction and the first event completion from that user. 

*I should check if it is in milliseconds.*

>* user_id: ID code for the user. 
>* content_id: ID code for the user interaction
>* content_type_id: 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.

*The content_type_id can be used as an integer even though it is an id. I can use it as a one-hot column for lectures.*

>* task_container_id: Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.
>* user_answer: the user's answer to the question, if any. Read -1 as null, for lectures.
>* answered_correctly: if the user responded correctly. Read -1 as null, for lectures.
>* prior_question_elapsed_time: The average time it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.

*I should again check if it is in milliseconds.*

>* prior_question_had_explanation: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

The first step is to understand what we have in the Train dataset.

In [None]:
%%time

# reading the dataset from raw csv file
import datatable as dt
import plotly.express as px
import matplotlib.pyplot as plt

# Alternative full-file read through datatable, left disabled:
#df = dt.fread("../input/riiid-test-answer-prediction/train.csv").to_pandas()
# Read only the first N_ROWS rows of train.csv with pandas and report the shape.
TRAIN_CSV = '../input/riiid-test-answer-prediction/train.csv'
N_ROWS = 1000000
df = pd.read_csv(TRAIN_CSV, nrows=N_ROWS)
print(df.shape)

In [None]:
# Preview the first ten rows of the loaded train sample.
df.head(10)

In [None]:
# Column names, non-null counts and dtypes as parsed by read_csv.
df.info()

In [None]:
# Target dtype per train.csv column, passed to DataFrame.astype before
# describe(). Identifier-like columns are mapped to 'object'; the remaining
# columns keep a numeric dtype.
d = dict(
    row_id='int64',
    timestamp='int64',
    user_id='object',
    content_id='object',
    content_type_id='int64',
    task_container_id='object',
    user_answer='object',
    answered_correctly='int64',
    prior_question_elapsed_time='float64',
    prior_question_had_explanation='object',
)


In [None]:
# Summary statistics over the first 100,000 rows, after casting the columns
# to the dtypes listed in `d`.
typed_head = df.head(100000).astype(d)
typed_head.describe(include='all')

We want to measure students' performance on question-type content. For this reason, we exclude lectures.

In [None]:
# Build one summary row per user from question events only
# (rows with answered_correctly == -1 are lectures and are dropped).

# Divisors used to rescale the time columns below.
MS_PER_YEAR = 60*60*24*1000*365   # timestamp is treated as milliseconds -> years
SECONDS_PER_DAY = 60*60*24        # elapsed time is treated as seconds -> days
# NOTE(review): the elapsed-time divisor assumes prior_question_elapsed_time is
# in seconds. Confirm against the competition data description; if the field
# is in milliseconds the rescaled values are off by a factor of 1000.

question_events = df[df['answered_correctly'] != -1]

# Aggregations are given by name ('max', 'sum', 'nunique') rather than as the
# builtin/unbound callables: pandas 2.1+ emits a FutureWarning for the latter,
# while the computed values are the same.
user_profile = question_events.groupby(
    'user_id', as_index=False
    ).agg(age_as_learner = pd.NamedAgg(column = 'timestamp', aggfunc = 'max'),
          num_contents = pd.NamedAgg(column = 'content_id', aggfunc = 'nunique'),
          num_events = pd.NamedAgg(column = 'row_id', aggfunc = 'count'),
          num_true_answers = pd.NamedAgg(column = 'answered_correctly', aggfunc = 'sum'),
          perc_true_answers = pd.NamedAgg(column = 'answered_correctly', aggfunc = 'mean'),
          std_true_answers = pd.NamedAgg(column = 'answered_correctly', aggfunc = 'std'),
          #skew_true_answers = pd.NamedAgg(column = 'answered_correctly', aggfunc = 'skew'),
          avg_prior_question_elapsed_time = pd.NamedAgg(column = 'prior_question_elapsed_time', aggfunc = 'mean'),
          std_prior_question_elapsed_time = pd.NamedAgg(column = 'prior_question_elapsed_time', aggfunc = 'std'),
          )

# user_id is an identifier: cast to object so describe() skips it.
user_profile['user_id'] = user_profile['user_id'].astype('object')

# Rescale with explicit column assignment instead of in-place attribute access.
user_profile['age_as_learner'] = user_profile['age_as_learner'] / MS_PER_YEAR
user_profile['avg_prior_question_elapsed_time'] = (
    user_profile['avg_prior_question_elapsed_time'] / SECONDS_PER_DAY
)
user_profile['std_prior_question_elapsed_time'] = (
    user_profile['std_prior_question_elapsed_time'] / SECONDS_PER_DAY
)

print('Number of Users: ', len(user_profile))
user_profile.describe()

I'm not sure, but it seems that timestamp is in milliseconds while the elapsed time is in seconds. I rescale them so that we get years and days, respectively. 
Next, we draw a scatter matrix to get an overall picture of the dataset and to spot hidden patterns. 

In [None]:
# Pairwise scatter plots (histograms on the diagonal) of the numeric
# user_profile columns; the low alpha keeps overlapping points readable.
numeric_profile = user_profile.select_dtypes(include='number')
pd.plotting.scatter_matrix(numeric_profile, alpha=0.1, figsize=(30, 30))

In [None]:
# Count events per content type by looking up the type code explicitly
# (0 = question, 1 = lecture). Unpacking value_counts() positionally depends on
# the frequency ordering and on exactly two distinct values being present, so
# it can swap the labels or fail outright.
n = len(df.content_type_id)
type_counts = df.content_type_id.value_counts()
a = type_counts.get(0, 0)  # question events
b = type_counts.get(1, 0)  # lecture events
print('The number of lecture events is : {} ({}%)'.format(b,100*b/n))
print('The number of question events is : {} ({}%)'.format(a,100*a/n))

Let's see who are the "oldest" students and how they are characterized

In [None]:
# Ten users with the largest age_as_learner value.
by_age = user_profile.sort_values(by='age_as_learner', ascending=False)
by_age.head(10)

Let's see which students have answered the most distinct questions

In [None]:
# Ten users with the highest number of distinct contents answered.
by_contents = user_profile.sort_values(by='num_contents', ascending=False)
by_contents.head(10)

In [None]:
# Ten users with the highest count of correct answers.
by_true_answers = user_profile.sort_values(by='num_true_answers', ascending=False)
by_true_answers.head(10)

In [None]:
# Ten users with the highest share of correct answers, restricted to users
# whose standard deviation of answered_correctly is defined (not null).
has_std = user_profile.std_true_answers.notnull()
by_accuracy = user_profile[has_std].sort_values(by='perc_true_answers', ascending=False)
by_accuracy.head(10)

In [None]:
# Running per-user performance over time.
# The original cell displayed `user_series` while every line defining it was
# commented out, so it raised NameError on a fresh kernel. The computation is
# restored here with a per-group cumsum.

# Lectures carry answered_correctly == -1 and would corrupt the running sums,
# so only question events are kept (same filter as the user profile cell).
question_rows = df[df['answered_correctly'] != -1]

# One row per (user, timestamp): number of correct answers and number of events.
user_series = (
    question_rows
    .groupby(['user_id', 'timestamp'], as_index=False)
    .agg(answered_correctly=('answered_correctly', 'sum'),
         row_id=('row_id', 'count'))
)

# Turn both columns into running totals within each user; rows are already
# ordered by user_id then timestamp because groupby sorts its keys.
running_totals = user_series.groupby('user_id')[['answered_correctly', 'row_id']].cumsum()
user_series[['answered_correctly', 'row_id']] = running_totals

# Running share of correct answers up to and including each timestamp.
user_series['avg_correct'] = user_series['answered_correctly'] / user_series['row_id']
user_series.head(10)

### To Do:

* Do lectures have an impact on performance? Do explanations too?
* Build timeseries to track students performance
* Track impact of lectures on students' performance

## Questions

>* question_id: foreign key for the train/test content_id column, when the content type is question (0).
>* bundle_id: code for which questions are served together.

*The  bundle_id key should be added to the train.csv dataset.*

>* correct_answer: the answer to the question. Can be compared with the train user_answer column to check if the user was right.
>* part: the relevant section of the TOEIC test.
>* tags: one or more detailed tag codes for the question. The meaning of the tags will not be provided, but these codes are sufficient for clustering the questions together.

*This tags field seems interesting.*

In [None]:
# Read questions.csv in full (the row limit is left disabled: nrows = 1000000).
QUESTIONS_CSV = '../input/riiid-test-answer-prediction/questions.csv'
questions = pd.read_csv(QUESTIONS_CSV)

In [None]:
# Preview the first ten question rows.
questions.head(10)

In [None]:
# Column names, non-null counts and dtypes of the questions table.
questions.info()

Be aware there's a troublesome missing tag value. 

In [None]:
# Show the question rows whose tags value is missing.
missing_tags = questions['tags'].isnull()
questions[missing_tags]

In [None]:
# Columns of the questions table that are cast to 'object' before describe(),
# so they are summarised as categories rather than as numbers.
dq = dict(
    question_id='object',
    bundle_id='object',
    correct_answer='object',
    part='object',
)

# Summary statistics of the questions table after the casts listed in `dq`.
questions_typed = questions.astype(dq)
questions_typed.describe()

In [None]:
# Share of questions per correct_answer value, drawn as a bar chart.
answer_share = questions.correct_answer.value_counts() / len(questions)
answer_share.plot.bar()

The correct answers seem to be fairly balanced across the options, so choosing answers at random is not a good strategy for a student. 

In [None]:
# Share of questions per part value, drawn as a bar chart.
part_share = questions.part.value_counts() / len(questions)
part_share.plot.bar()

We now study the tags. Remember to fill the na values.

In [None]:
# Replace missing tags with the placeholder string '-1'.
# Only the tags column is filled: a frame-wide fillna('-1') would also write a
# string into any numeric column that happened to contain a null, silently
# turning it into an object column. Plain assignment replaces inplace=True.
questions['tags'] = questions['tags'].fillna('-1')

In [None]:
from collections import Counter

# Count how many questions carry each tag code.
# The counter starts empty: seeding it with the first row's tags and then
# looping over every row (including the first) counted those tags twice.
tags = Counter()
for tag_list in questions.tags.str.split(' '):
    # A missing tags value splits to NaN (a float), not to a list. Report and
    # skip it explicitly instead of hiding every error behind a bare except.
    if not isinstance(tag_list, list):
        print('Found exception: tags field equal to ', tag_list)
        continue
    tags.update(tag_list)

In [None]:
# Turn the tag counter into a two-column frame (tag code, number of
# occurrences), ordered from the most to the least frequent tag.
tag_counts_frame = pd.DataFrame.from_dict(dict(tags), orient='index')
tags = (
    tag_counts_frame
    .reset_index()
    .sort_values(by=0, ascending=False)
    .set_axis(['tag', 'count_tag'], axis=1)
)

Which are the most common tags?

In [None]:
# Bar chart of the 20 most frequent tags.
# x/y are named explicitly so the axis shows the tag codes; by default the
# bars were labelled with the frame's row index, which says nothing about
# which tag each bar represents.
tags.head(20).plot.bar(x='tag', y='count_tag')

What is the relationship with the part field?

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# One-hot encode the tag lists: one row per question_id, one 0/1 column per
# distinct tag code found by the binarizer.
mlb = MultiLabelBinarizer()

tag_lists = questions.tags.str.split(' ')
tag_indicators = mlb.fit_transform(tag_lists)
tags_of_parts = pd.DataFrame(
    tag_indicators,
    columns=mlb.classes_,
    index=questions.question_id,
)

In [None]:
# Size and dtypes of the one-hot tag matrix.
tags_of_parts.info()

In [None]:
# Attach each question's part to its one-hot tag row, then sum the tag
# indicators per part: the result counts questions per (part, tag) pair.
question_parts = questions[['question_id', 'part']]
tagged_questions = pd.merge(question_parts, tags_of_parts, on='question_id')
test = tagged_questions.drop(['question_id'], axis=1).groupby('part').sum()

In [None]:
import seaborn as sns

# Wide figure so that the tag columns stay distinguishable.
f, ax = plt.subplots(figsize=(34, 5))

# Heatmap of question counts per (part, tag), drawn on the axes created above.
ax = sns.heatmap(test, ax=ax)

This is a cool finding, even if the plot is probably not the best one. It seems there are tags strongly related to specific parts. 

In [None]:
# Long format of the part/tag count table: one row per (part, tag) pair,
# with every column cast to int64.
part_tag_long = test.reset_index().melt(
    id_vars=['part'],
    var_name='tag',
    value_name='count',
)
part_tag_long.astype('int64')

I spent a bit of time working out how to restructure questions.tags properly. In the end I found my strategy thanks to [this post on brandonrose.org](http://brandonrose.org/clustering?ref=dzone#K-means-clustering).

In [None]:
import nltk, re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans


def tokenize_and_stem(text):
    """Tokenize ``text`` with NLTK and keep only tokens containing a digit.

    The text is split into sentences, each sentence into word tokens, and a
    token is kept when it contains at least one character in 0-9. Despite the
    function name, no stemming is applied.

    Returns the kept tokens as a list, in their original order.
    """
    has_digit = re.compile('[0-9]')
    words = (
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [word for word in words if has_digit.search(word)]

from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=1.0, max_features=200000,
                                 min_df=0.0,use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(questions.tags) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

tags = tfidf_vectorizer.get_feature_names()


dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
# Elbow scan: fit KMeans for several cluster counts and record the inertia
# (within-cluster sum of squared distances) of each fit.
# random_state makes the curve reproducible across runs; n_init=10 pins the
# number of restarts instead of relying on the library default, which differs
# between scikit-learn versions.
RANDOM_STATE = 42

sse = {}
for k in range(2,30,3):
    km = KMeans(n_clusters=k, n_init=10, random_state=RANDOM_STATE)
    km.fit(tfidf_matrix)
    sse[k] = km.inertia_

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

# Cluster counts that were evaluated.
sse.keys()

Using the elbow method, it seems that 11 or 23 are good candidates. However, I prefer to fix the number of clusters at 11: fewer clusters make the following analysis much simpler to read. 

Be aware that results could slightly change at every run. 

In [None]:
NUM_CLUSTERS = 11


km = KMeans(n_clusters=NUM_CLUSTERS)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

n_words = 5

order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(NUM_CLUSTERS):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :n_words]: 
        print(' %s' % tags[ind], end=' ')
    print() #add whitespace
    
questions['kmean_cluster'] = clusters

In [None]:
# Preview the questions table, now including the kmean_cluster column.
questions.head(10)

In [None]:
# Register the cluster id as a categorical ('object') column in the cast map,
# then show the column summary of the questions table under those casts.
dq.update({'kmean_cluster': 'object'})
questions_with_clusters = questions.astype(dq)
questions_with_clusters.info()

In [None]:
# Share of questions that fall into each cluster.
cluster_sizes = questions.kmean_cluster.value_counts()
cluster_sizes / len(questions)

In [None]:
# Number of questions per (cluster, part) pair; combinations with no question
# are shown as 0.
cluster_by_part = pd.pivot_table(
    questions,
    values='question_id',
    index=['kmean_cluster'],
    columns=['part'],
    aggfunc='count',
)
cluster_by_part.fillna(0).astype('int64')

How is bundle related to the other attributes?

In [None]:
# Per bundle: number of questions, number of distinct parts and number of
# distinct clusters. describe() then summarises these three per-bundle values.
bundle_aggregations = {
    'question_id': 'count',
    'part': pd.Series.nunique,
    'kmean_cluster': pd.Series.nunique,
}
per_bundle = questions.groupby('bundle_id').agg(bundle_aggregations)
per_bundle.describe()

Every bundle is associated with a single part; at least 75% of the bundles contain a single question, and at least 75% are associated with a single cluster.

In [None]:
# Histograms of bundle size (questions per bundle) and of the number of
# distinct clusters per bundle.
bundle_stats = questions.groupby('bundle_id').agg({
    'question_id': 'count',
    'part': pd.Series.nunique,
    'kmean_cluster': pd.Series.nunique,
})
bundle_stats[['question_id', 'kmean_cluster']].hist()

## Lectures

> lectures.csv: metadata for the lectures watched by users as they progress in their education.
>
> lecture_id: foreign key for the train/test content_id column, when the content type is lecture (1).
> part: top level category code for the lecture.
> tag: one tag codes for the lecture. The meaning of the tags will not be provided, but these codes are sufficient for clustering the lectures together.
> type_of: brief description of the core purpose of the lecture

Its content is similar to questions.csv, so we will provide a similar analysis.

In [None]:
# Read lectures.csv in full (the row limit is left disabled: nrows = 1000000).
lectures = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')
# info() prints its report itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
lectures.info()
lectures.head(10)

In [None]:
# Number of lectures per (type_of, part) pair; empty combinations become 0.
type_by_part = pd.pivot_table(
    lectures,
    values='lecture_id',
    index=['type_of'],
    columns=['part'],
    aggfunc='count',
).fillna(0)
print(type_by_part)
print()
# Bar chart of the number of lectures per part.
lectures_per_part = lectures.part.value_counts()
lectures_per_part.plot.bar()

In [None]:
# Bar chart of the number of lectures per type_of value.
lectures_per_type = lectures.type_of.value_counts()
lectures_per_type.plot.bar()

In [None]:
# Bar chart of the 25 tags attached to the most lectures.
# The plot call is no longer wrapped in print(), which only printed the Axes
# object's repr; the trailing semicolon keeps that repr out of the output.
lectures_per_tag = (
    lectures.groupby('tag')
    .agg({'lecture_id': 'count'})
    .sort_values(by='lecture_id', ascending=False)
)
lectures_per_tag[:25].plot.bar();

In [None]:
# Histogram of the number of lectures per tag.
# The plot call is no longer wrapped in print(), which only printed the array
# of Axes objects; the trailing semicolon keeps that repr out of the output.
lectures.groupby('tag').agg({'lecture_id':'count'}).hist();

### To Do:

* Cluster of Tags (LDA approach)
* Sankey Plot of Tags and Parts
* Improve charts layout