In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# train.csv

In [None]:
# Explicit column dtypes for train.csv, chosen to keep the loaded sample small in memory.
# prior_question_elapsed_time is a duration in milliseconds and is read as float32:
# float16 cannot represent anything above 65504, so longer durations would turn into inf.
data_types_dict = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'  # pandas nullable boolean (keeps missing values)
}

In [None]:
# Load only the first 10**7 rows of train.csv, applying the explicit dtypes from data_types_dict.
train_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv',
                       low_memory=False,
                       nrows=10**7,
                       dtype=data_types_dict, 
                      )

In [None]:
train_df

In [None]:
# Missing values: fraction of null entries in every column of train_df
print('Part of missing values for every column')
print(train_df.isnull().sum() / len(train_df))

In [None]:
train_df.info()

In [None]:
train_df.describe().T

In [None]:
# Print the number of distinct (non-null) values of every train_df column.
cols = train_df.columns
unique_counts = train_df.nunique()

for col in cols:
    print(f'Unique values in {col} : {unique_counts[col]}')

categoricalカテゴリ変数

content_type_id, user_answer , answered_correctly ,prior_question_had_explanation

These columns take only a handful of distinct integer values, so we will convert content_type_id, user_answer, answered_correctly and prior_question_had_explanation to a categorical format when we train a model.

### timestamp

In [None]:
train_df['timestamp'].hist(bins=100);

timestamp represents the time from the first user interaction to the current one. It is expected that the distribution looks like this.

timestamp・・・ユーザーとの対話からそのイベント終了までの時間

timestamp is defined as "the time between this user interaction and the first event from that user"

prior_question_elapsed_time is defined as "How long it took a user to answer their previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Note that the time is the total time a user took to solve all the questions in the previous bundle"

The timestamp column shows when an activity is finished, not when it started. 

The timestamp timer starts after first question is answered or lecture is finished.

prior_question_elapsed_time timer starts when the user starts doing the previous question and it ends when the user moves to another question.

The timestamp is most likely measured in milliseconds; it cannot be in seconds.

https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/189351

In [None]:
grouped_by_user_df = train_df.groupby('user_id')

In [None]:
grouped_by_user_df.agg({'timestamp':'max'}).hist(bins=100);

↑各ユーザーの最大のtimestampの分布・・・ほとんどのユーザーがすぐにプラットフォームを離れるようだ。

The distribution of the max timestamp for each user looks similar. It seems most users leave the platform quite soon (at least based on partial data we analyze).

### Answered correctly
 ユーザーが正しく応答したかどうか。講義と質問がある。講義（lectures）の場合は、-1をnullとして読み取ります。質問の場合は、正答１、誤答０

In [None]:
# Share of lecture rows: the mean of the boolean mask (answered_correctly == -1)
(train_df['answered_correctly'] == -1).mean()

In [None]:
# Share of each content type (0 = question, 1 = lecture) among all loaded rows.
# normalize=True yields the fractions directly instead of dividing the counts by hand.
ds = train_df['content_type_id'].value_counts(normalize=True).reset_index()
ds.columns = ['content_type_id', 'percent']

fig = px.pie(
    ds,
    names='content_type_id',
    values='percent',
    title='Lectures & questions',
    height=500,
    width=600
)

fig.show()

trainデータの約2%は、「講義」である。→回答分析から除外する必要がある。

2% of activities are lectures, we should exclude them for answers analysis.

In [None]:
# Keep question rows only (rows with answered_correctly == -1 are dropped),
# then display the overall share of correctly answered questions.
train_questions_only_df = train_df[train_df['answered_correctly'] != -1]
train_questions_only_df['answered_correctly'].mean()

In [None]:
# Share of every answered_correctly value among ALL rows of train_df.
# NOTE(review): this is computed on train_df, not train_questions_only_df, so the
# -1 (lecture) rows appear as their own slice of the pie.
ds = train_df['answered_correctly'].value_counts().reset_index()
ds.columns = ['answered_correctly', 'percent_of_answers']
ds['percent_of_answers'] /= len(train_df)
ds = ds.sort_values(['percent_of_answers'])

fig = px.pie(
    ds, 
    names='answered_correctly', 
    values='percent_of_answers', 
    title='Percent of correct answers', 
    height=500, 
    width=600
)

fig.show()

In [None]:
correct = train_df[train_df.answered_correctly != -1].answered_correctly.value_counts()

fig = plt.figure(figsize=(12,4))

correct.plot.barh()
plt.title("Questions answered correctly")
plt.xticks(rotation=0)
plt.show()

平均して、ユーザーは最大６６％の質問に正しく答えている。 →ユーザーごとにどのくらい違うかも見てみる

On average users answer ~66% questions correctly. Let's look how it is different from user to user.

「講義」を除外した、answered_correctlyをみてみると、１／３は質問に間違えている。

When looking at the numbers of answered_correctly, we see the same number of missing answers. Without looking at the lecture interactions, we see about 1/3 of the questions was answered incorrectly.

### Answers by users

In [None]:
grouped_by_user_df = train_questions_only_df.groupby('user_id')

In [None]:
# Per user: share of correct answers ('mean') and number of answered questions ('count')
user_answers_df = grouped_by_user_df.agg({'answered_correctly': ['mean', 'count']})
user_answers_df[('answered_correctly', 'mean')].hist(bins=100); # bins = number of bars

In [None]:
user_answers_df

In [None]:
user_answers_df[('answered_correctly', 'count')].hist(bins=100);

In [None]:
(user_answers_df[('answered_correctly','count')]< 50).mean()

ユーザーの54％が、50未満の質問に回答。 → すべてのユーザーを「初心者」と「アクティブユーザー」に分けてみる。

54% of users answered less than 50 questions. Let's divide all users into novices and active users.

In [None]:
# Mean per-user accuracy of novices (users with fewer than 50 answered questions)
user_answers_df[user_answers_df[('answered_correctly', 'count')] < 50][('answered_correctly', 'mean')].mean()

In [None]:
user_answers_df[user_answers_df[('answered_correctly', 'count')] < 50][('answered_correctly', 'mean')].hist(bins=100);

In [None]:
# Mean per-user accuracy of active users (users with 50 or more answered questions)
user_answers_df[user_answers_df[('answered_correctly', 'count')] >= 50][('answered_correctly', 'mean')].mean()

In [None]:
user_answers_df[user_answers_df[('answered_correctly', 'count')] >= 50][('answered_correctly', 'mean')].hist(bins=100);

アクティブユーザーは、初心者よりもはるかに優れている。

全体の平均66%　しかし、平均ユーザースコアは、正解の全体の66％よりも低くなっている。→これは、ヘビーユーザーのスコアがさらに高くなることを意味する。

We can see that active users do much better than novices. But anyway average user score is lower than the overall % of correct answers. It means heavy users have even better scores. Let's look at them.

In [None]:
# Share of heavy users: users who answered 500 or more questions
(user_answers_df[('answered_correctly','count')] >= 500).mean()

In [None]:
# Distribution of per-user accuracy among heavy users (500 or more answered questions)
user_answers_df[user_answers_df[('answered_correctly', 'count')] >= 500][('answered_correctly', 'mean')].hist(bins=100);

In [None]:
# Mean per-user accuracy of heavy users (500 or more answered questions)
user_answers_df[user_answers_df[('answered_correctly', 'count')] >= 500][('answered_correctly', 'mean')].mean()

In [None]:
plt.scatter(x = user_answers_df[('answered_correctly', 'count')], y = user_answers_df[('answered_correctly', 'mean')]);

### これまでのまとめ
・Timestamp, ・アクティブユーザーの平均スコア, ・回答された質問の数、はベースラインの作成に役立ちそう。

Timestamp, the average score for the active user, and the number of questions answered can be useful for baseline.

### Answers by content

In [None]:
grouped_by_content_df = train_questions_only_df.groupby('content_id')

In [None]:
content_answers_df = grouped_by_content_df.agg({'answered_correctly': ['mean', 'count']})

In [None]:
content_answers_df

In [None]:
content_answers_df[('answered_correctly', 'count')].hist(bins=100);

In [None]:
content_answers_df[('answered_correctly', 'mean')].hist(bins=100);

質問(content_id)が異なれば、answered_correctlyも異なるため、ベースラインに使えそう。

Different questions have different popularity and complexity, and it can also be used in the baseline.

In [None]:
content_answers_df[content_answers_df[('answered_correctly','count')]>50][('answered_correctly','mean')].hist(bins = 100);

### Top 40 users by number of actions

In [None]:
# Number of rows (actions) per user, shown for the 40 users with the most rows.
ds = train_df['user_id'].value_counts().reset_index()
ds.columns = ['user_id', 'count']

# Turn the ids into strings (with a '-' suffix) before plotting them on the y axis
ds['user_id'] = ds['user_id'].astype(str) + '-'
ds = ds.sort_values(['count'])

fig = px.bar(
    ds.tail(40),
    x='count',
    y='user_id',
    orientation='h', # horizontal bar chart
    title='Top40 users by number of actions',
    height=900,
    width=700
)

fig

一番多いユーザーで、15,871回データ出現している。

### User action distribution

In [None]:
ds = train_df['user_id'].value_counts().reset_index()
ds.columns = ['user_id', 'count']
ds = ds.sort_values('user_id')

fig = px.line(
    ds, 
    x='user_id', 
    y='count', 
    title='User action distribution', 
    height=600, 
    width=900
)

fig.show()

### Top 40 most useful content_ids

In [None]:
ds = train_df['content_id'].value_counts().reset_index()
ds.columns = ['content_id', 'count']
ds['content_id'] = ds['content_id'].astype(str) + '-'
ds = ds.sort_values(['count'])

fig = px.bar(
    ds.tail(40), 
    x='count', 
    y='content_id', 
    orientation='h', 
    title='Top40 most useful content_ids', 
    height=900, 
    width=700
)

fig.show()

In [None]:
c_ids = train_df.content_id.value_counts()[:40]

fig = plt.figure(figsize=(12,8))

c_ids.plot.bar()
plt.title("Top 40 most used content id's")
plt.xticks(rotation=90)
plt.show()

### content_id action distribution

In [None]:
ds = train_df['content_id'].value_counts().reset_index()
ds.columns = ['content_id', 'count']
ds = ds.sort_values('content_id')

fig = px.line(
    ds, 
    x='content_id', 
    y='count', 
    title='content_id action distribution', 
    height=600, 
    width=900
)

fig.show()

### Top 40 most useful task_container_id

task_container_id: (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

質問や講義のひとかたまりを表したIDコード

例：　説明を見る前に、3つの質問を見たらそれらをtask_container_idとしてシェアしておく。

⇨つまり、trainの「user_answer(ユーザーの回答)」だけではなく、(その答える時に「他の選択肢」も含んだ)がわかるIDカラム(外部キー)

In [None]:
ds = train_df['task_container_id'].value_counts().reset_index()
ds.columns = ['task_container_id', 'count']
ds['task_container_id'] = ds['task_container_id'].astype(str) + '-'
ds = ds.sort_values(['count'])

fig = px.bar(
    ds.tail(40), 
    x='count', 
    y='task_container_id', 
    orientation='h', 
    title='Top 40 most useful task_container_ids', 
    height=900, 
    width=700
)

fig.show()

### task_container_id action distribution

In [None]:
ds = train_df['task_container_id'].value_counts().reset_index()
ds.columns = ['task_container_id', 'count']
ds = ds.sort_values('task_container_id')

fig = px.line(
    ds, 
    x='task_container_id', 
    y='count', 
    title='task_container_id action distribution', 
    height=600, 
    width=800
)

fig.show()

task_container_id が小さいほど、データの出現回数も多い。

In [None]:
# Question rows only: size() counts the rows in every
# (task_container_id, answered_correctly) group.
task_id_correct = train_df[train_df.answered_correctly != -1].\
groupby(["task_container_id", 'answered_correctly'], as_index=False).size()

task_id_correct

In [None]:
# One row per task_container_id, one column per outcome (0 = incorrect, 1 = correct).
# A container that only ever produced one outcome has no count for the other one;
# that missing count is 0, not NaN, otherwise its share of correct answers is undefined.
task_id_correct = (
    task_id_correct
    .pivot(index='task_container_id', columns='answered_correctly', values='size')
    .reindex(columns=[0, 1])
    .fillna(0)
    .astype('int64')
)

# Share of correct answers per container (columns addressed by label, not by position)
task_id_correct['Percent Correct'] = round(task_id_correct[1] / (task_id_correct[0] + task_id_correct[1]), 2)

# Sort by that share, highest first
task_id_correct = task_id_correct.sort_values(by = "Percent Correct", ascending = False)

task_id_correct

In [None]:
ds = task_id_correct['Percent Correct'].value_counts().reset_index()
ds.columns = ['Percent Correct', 'count']
ds = ds.sort_values('Percent Correct')

fig = px.line(
    ds, 
    x='Percent Correct', 
    y='count', 
    title='Percent Correct action distribution of task_container_id', 
    height=600, 
    width=900
)

fig.show()

In [None]:
# Rebuild the per-container outcome counts from the question rows of train_df.
task_id_correct = train_df[train_df.answered_correctly != -1].\
groupby(["task_container_id", 'answered_correctly'], as_index=False).size()

# Columns 0 / 1 = number of incorrect / correct answers; an outcome that never
# occurred for a container counts as 0 instead of NaN.
task_id_correct = (
    task_id_correct
    .pivot(index='task_container_id', columns='answered_correctly', values='size')
    .reindex(columns=[0, 1])
    .fillna(0)
    .astype('int64')
)

task_id_correct['Percent Correct'] = round(task_id_correct[1] / (task_id_correct[0] + task_id_correct[1]), 2)
task_id_correct = task_id_correct.sort_values(by = "Percent Correct", ascending = False)

# Keep only the share column (task_container_id -> share of correct answers) ...
task_id_correct = task_id_correct['Percent Correct']

# ... and the 40 containers with the HIGHEST share, i.e. the easiest batches
task_id_correct = task_id_correct[:40]

fig = plt.figure(figsize=(12,6))
task_id_correct.plot.bar()
plt.title("Top 40 easiest batches of questions")
plt.xticks(rotation=90)
plt.show()

回答が正解になりやすい質問たち(簡単な問題？）の割合

you can see the Top-40 of question batches with the highest percentage of questions answered correct.

### Percent of user answers for every option ユーザーが回答した「選択肢」の割合

In [None]:
ds = train_df['user_answer'].value_counts().reset_index()
ds.columns = ['user_answer', 'percent_of_answers']
ds['percent_of_answers'] /= len(train_df)
ds = ds.sort_values(['percent_of_answers'])

fig = px.bar(
    ds, 
    x='user_answer', 
    y='percent_of_answers', 
    orientation='v', 
    title='Percent of user answers for every option', 
    height=500, 
    width=600
)

fig.show()

-1 は、lecture(講義)のため、null扱い

### Percent of correct answers for every option 「選択肢」ごとの回答正解率

これまでのやつ

* [全体] 平均して、ユーザーは最大６６％の質問に正しく答えている。
* [初心者] 正解率 約48%
* [アクティブユーザー] 正解率 約62%

In [None]:
# For every answer option, show how its rows split into the three outcomes
# (-1 = lecture row, 0 = incorrect, 1 = correct), as counts with percentage labels.
outcomes = [-1, 0, 1]

# Tabulate all (user_answer, answered_correctly) pairs once instead of
# re-filtering the whole frame for every bar and every label.
counts_table = (
    pd.crosstab(train_df['user_answer'], train_df['answered_correctly'])
    .reindex(columns=outcomes, fill_value=0)
)
option_totals = train_df['user_answer'].value_counts()

traces = []
for item in train_df['user_answer'].unique().tolist():
    counts = [int(counts_table.at[item, outcome]) for outcome in outcomes]
    total = int(option_totals[item])  # never 0: item was taken from the column itself
    traces.append(
        go.Bar(
            x=outcomes,
            y=counts,
            name='Option: ' + str(item),
            text=[str(round(100 * count / total, 2)) + '%' for count in counts],
            textposition='auto'
        )
    )

# Three panels per row; at least the original 2 rows, more if more options show up
n_cols = 3
n_rows = max(2, -(-len(traces) // n_cols))
fig = make_subplots(rows=n_rows, cols=n_cols)

for i, trace in enumerate(traces):
    fig.add_trace(trace, row=(i // n_cols) + 1, col=(i % n_cols) + 1)

fig.update_layout(
    title_text='Percent of correct answers for every option',
    height=600,
    width=900
)

fig.show()

「選択肢」ごとの正答率

* option0 65.93%
* option1 64.93%
* option2 66.95%
* option3 66.00%
* option-1 NaN

### prior_question_elapsed_time distribution

prior_question_elapsed_time: (float32) The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.

前の質問に回答してから、どのくらいミリセカンド秒経ったか。 NULL = 講義か初めての質問の場合。 このカラムは、「前の質問にどのくらい解決時間を要したか」の参考になる

In [None]:
fig = px.histogram(
    train_df, 
    x="prior_question_elapsed_time",
    nbins=100,
    width=700,
    height=500,
    title='prior_question_elapsed_time distribution'
)

fig.show()

10k = 10,000 milliseconds = 10 seconds(秒)

分布見た感じ、16~25秒くらいがボリュームゾーン

# Questions.csv

In [None]:
# Load questions.csv using the same absolute input path as train.csv and lectures.csv,
# so the cell does not depend on the notebook's current working directory.
questions_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')

In [None]:
questions_df

In [None]:
# Missing values: fraction of null entries in every column of questions_df
print('Part of missing values for every column')
print(questions_df.isnull().sum() / len(questions_df))

In [None]:
print(f"There are {len(questions_df['part'].unique())} different parts")

In [None]:
questions_df['tags'].values[-1] # Why look at the last row's tags? Only to inspect what a raw value of the column looks like

In [None]:
# Collect the distinct tag codes over all questions (tags are whitespace-separated).
# Missing tags values are dropped first; stringifying them would add a bogus 'nan' tag.
unique_tags = set().union(*(tags.split() for tags in questions_df['tags'].dropna().astype(str)))

print(f"There are {len(unique_tags)} different tags")

In [None]:
# question_id: foreign key for the train/test content_id column when content_type_id is question (0)
# bundle_id: code for which questions are served together
# Below: share of questions whose bundle_id differs from their question_id
(questions_df['question_id'] != questions_df['bundle_id']).mean()

### Number of correct answers per group 正解のナンバーの割合

In [None]:
ds = questions_df['correct_answer'].value_counts().reset_index()
ds.columns = ['correct_answer', 'number_of_answers']
ds['correct_answer'] = ds['correct_answer'].astype(str) + '-'
ds = ds.sort_values(['number_of_answers'])

fig = px.bar(
    ds, 
    x='number_of_answers', 
    y='correct_answer', 
    orientation='h', 
    title='Number of correct answers per group', 
    height=400, 
    width=700
)

fig.show()

### Parts distribution Partの分布

In [None]:
ds = questions_df['part'].value_counts().reset_index()
ds.columns = ['part', 'count']
ds['part'] = ds['part'].astype(str) + '-'
ds = ds.sort_values(['count'])

fig = px.bar(
    ds, 
    x='count', 
    y='part', 
    orientation='h', 
    title='Parts distribution', 
    height=500, 
    width=700
)

fig.show()

part5のquestionが多い。

### Number tags distribution tagsの数の分布

tags: one or more detailed tag codes for the question. The meaning of the tags will not be provided, but these codes are sufficient for clustering the questions together.

tags 意味は特にないけど、これらのコードは質問と一緒にクラスタリングする際に効果的らしい。

In [None]:
# Build questions_df_copy: the question table plus a tags_number column holding
# how many tags each question has.
# The count is derived directly from the space-separated tags string, so questions_df
# itself is never modified (a plain `questions_df_copy = questions_df` only creates
# an alias, and the helper column would end up in questions_df as well).
tags_per_question = (
    questions_df['tags']
    .str.split(' ')
    .str.len()
    .fillna(0)        # a question without tags has 0 tags
    .astype('int64')
)

questions_df_copy = (
    questions_df[['question_id', 'bundle_id', 'correct_answer', 'part', 'tags']]
    .assign(tags_number=tags_per_question)
)

questions_df_copy

In [None]:
ds = questions_df_copy['tags_number'].value_counts().reset_index()
ds.columns = ['tags_number', 'count']
ds['tags_number'] = ds['tags_number'].astype(str) + '-'
ds = ds.sort_values(['tags_number'])

fig = px.bar(
    ds, 
    x='count', 
    y='tags_number', 
    orientation='h', 
    title='Number tags distribution', 
    height=400, 
    width=700
)

fig.show()

### Top 40 most useful tags tagsの出現回数

In [None]:
# Count how often every tag occurs over all questions and plot the 40 most frequent.
# Series.explode() takes no column name (its only parameter is ignore_index),
# so it is called without arguments here.
check = questions_df['tags'].str.split(' ').explode().value_counts().reset_index()

check.columns = ['tag', 'count']
# Turn the tags into strings (with a '-' suffix) before plotting them on the y axis
check['tag'] = check['tag'].astype(str) + '-'
check = check.sort_values(['count'])

fig = px.bar(
    check.tail(40),
    x='count',
    y='tag',
    orientation='h',
    title='Top 40 most useful tags',
    height=900,
    width=700
)

fig.show()

# lectures.csv

講義内容の詳細データ

In [None]:
lectures_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
lectures_df

In [None]:
# Missing values: fraction of null entries in every column of lectures_df
print('Part of missing values for every column')
print(lectures_df.isnull().sum() / len(lectures_df))

### Top 40 lectures by number of tags 講義タグの数ランキング

In [None]:
ds = lectures_df['tag'].value_counts().reset_index()
ds.columns = ['tag', 'count']
ds['tag'] = ds['tag'].astype(str) + '-'
ds = ds.sort_values(['count'])

fig = px.bar(
    ds.tail(40), 
    x='count', 
    y='tag', 
    orientation='h', 
    title='Top 40 lectures by number of tags', 
    height=800, 
    width=700
)

fig.show()

### Parts distribution

part: top level category code for the lecture.

In [None]:
ds = lectures_df['part'].value_counts().reset_index()
ds.columns = ['part', 'count']
ds['part'] = ds['part'].astype(str) + '-'
ds = ds.sort_values(['count'])

fig = px.bar(
    ds, 
    x='count', 
    y='part', 
    orientation='h', 
    title='Parts distribution', 
    height=500, 
    width=700
)

fig.show()

### type_of column distribution 講義(種類・内容)の内訳

In [None]:
ds = lectures_df['type_of'].value_counts().reset_index()
ds.columns = ['type_of', 'count']
ds = ds.sort_values(['count'])

fig = px.bar(
    ds, 
    x='count', 
    y='type_of', 
    orientation='h', 
    title='type_of column distribution', 
    height=500, 
    width=700
)

fig.show()

In [None]:
corr = train_df.corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(10, 10))
    ax = sns.heatmap(corr,mask=mask,square=True,linewidths=.8,cmap="viridis",annot=True)

In [None]:
train_df.corr().style.background_gradient(cmap='Oranges')

この中で高めな相関があったのは、[task_container_id - timestamp] と [content_type_id - answered_correctly]

とはいえ、有効そうな相関はないって感じ。

we can see 2 correlations which have some high values:

* task_container_id is correlated with the timestamp.

task_container_idは、ユーザーごとに単調に増えているから、timestampと相関が高めにでる。

(task_container_id: Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id. Monotonically（単調に） increasing for each user.)

This might help us explain why it has a good correlation with the timestamp

* content_type_id is correlated with answered_correctly
(content_type_id: 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.)

0: 質問 1: 講義

講義をみたユーザーの方が、回答率が高いのは想像つくから、これはいい相関である

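A tempting reading is that watching a lecture increases the chance of answering correctly. Note, however, that lecture rows (content_type_id = 1) always carry answered_correctly = -1, so this (negative) correlation mostly reflects how lectures are encoded rather than a learning effect; lecture rows should be excluded before interpreting it.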

### prior_question_had_explanationと時間(timestamp)の流れ

prior_question_had_explanation: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

### prior_question_had_explanation(ユーザーの質問回答後の反応)とanswaered_correctly(正解か)の関係を見る

In [None]:
# Three stacked panels (early / later / latest timestamps) showing, per distinct timestamp,
# whether the prior question had an explanation, coloured by answered_correctly.
plt.figure(figsize=(20,12))
sns.set_style('dark')

# sort_values already returns a new frame, so no extra full copy of train_df is needed.
# Only the first row of every distinct timestamp is kept.
mini_df = train_df.sort_values(by=['timestamp']).drop_duplicates('timestamp')

min_df = mini_df.head(1000)      # Start: the 1000 smallest timestamps
mid_df = mini_df[50000:51100]    # Mid: a slice further along the sorted timestamps
max_df = mini_df.tail(1000)      # End: the 1000 largest timestamps

panels = [
    (min_df, 'yellow', 'Start_time'),
    (mid_df, 'orange', 'Middle_time'),
    (max_df, 'red', 'End_time'),
]

for position, (panel_df, panel_color, panel_title) in enumerate(panels, start=1):
    plt.subplot(3, 1, position)
    sns.pointplot(x=panel_df['timestamp'],y=panel_df['prior_question_had_explanation'],hue= panel_df['answered_correctly'],
                  linestyle='--',color=panel_color,markers='x')
    plt.title(panel_title)
    plt.xticks([])   # one tick per timestamp would be unreadable
    plt.yticks([0,1])

prior_question_had_explanation

0: 回答したあと、ユーザーは無視している。 不真面目？　・・・　でも、回答に(簡単すぎて)正解したら、回答後に解説を確認しない。復習で、その質問に何回か答えていたら、解説を飛ばすよね。

1: 回答したあと、ユーザーは解説を見ている。　真面目？

prior_question_had_explanation

ユーザーが、(前の質問バンドルに"答えた"後or質問間の"講義"を無視した後)説明をみたか≒正しい反応をしたかどうか。※nullは、ユーザーにとって最初の質問or講義である。基本は、最初講義らしい。(通常、ユーザーに表示される最初のいくつかの質問は、フィードバックが得られなかった診断テストの一部)

(bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

* 【3つのプロットから推測できること】

スタート: prior_explanation ない 0　・・・　最初の質問だから0だと思われる。・・・それにしても、結構回答を間違えている0ユーザーが多い。

ミドル: prior_explanation ない0　一部ある1　・・・　ほとんど正解1している。(解説を振り返らなくてもいいほど)簡単な問題なのか？　一部、きちんと解説を見てる(真面目!!)

ファイナル: prior_explanation ほとんど全部ある1　・・・　(まぁ、ほとんど正解1しているが、)みんな回答後に、(正解1しても)解説の説明を見ている！ (なんでだ？)

→　timestamp時間が経つほど、ユーザーは回答後の解説を見ている(ほとんど正解1しているけど)・・・問題が難しいのかな？

※　注意として、全ての質問に、「回答後の解説」がある訳ではない。(と思われる)

There are many things that can be inferred from the above 3 plots

・First thing you can see that in the early stages there are no prior_explanation.

・In the final stages you can see almost all had prior_explanation.

・Notice that in starting time there are a lot of question that are answered incorrectly (marked by black x)

・In the middle time session the questions that did not have prior explanation were answered wrong (look bottom of chart-2)

・Final stages had nearly all answers correct

In [None]:
# sns.set_style('white')
# plt.figure(figsize=(10,6));
# sns.set_style('whitegrid');
# sns.scatterplot(x ='timestamp', y='prior_question_elapsed_time', data = train_df, hue='prior_question_had_explanation',alpha=0.8
#                 ,linewidth=0,palette='viridis');
# plt.legend(loc="best");

ほとんどの質問が、解説つきである。　右下のプロットの塊は、おそらく大勢の生徒が同時に、「試験」をしたことを意味するだろう。

From the above plot we can see most of the questions had an explanation. Also we can see near right bottom some points in groups. This maybe because a large number of students took their test at the same time.

ブルーのライン(x軸が0)は、timestampが0だから＝最初のやつはprior explanationsが0である決まりなので。

Another thing that we can notice is a faint blue line along the y-axis where x is 0. This is where the timestamp is 0 and there were no prior explanations.

https://www.kaggle.com/nitindatta/eda-with-r3-id

In [None]:
# plt.figure(figsize=(10,6));
# sns.set_style('darkgrid');
# sns.scatterplot(x = train_df['task_container_id'], y= train_df['prior_question_elapsed_time'], hue=train_df['user_id'],palette='plasma',linewidth=0, size=train_df['user_id'] ,alpha=1);
# plt.legend(loc='best');

In [None]:
user_answers_df.sort_values(('answered_correctly', 'count'), ascending=False)

In [None]:
plt.scatter(x = user_answers_df[('answered_correctly', 'count')], y = user_answers_df[('answered_correctly', 'mean')]);

In [None]:
how_good = train_df[train_df['answered_correctly'] != -1].groupby('user_id').mean()

In [None]:
plt.figure(figsize = (15,6))

ax = sns.distplot(how_good['answered_correctly'], color='darkcyan',bins=50)

ax.set_xlabel("Plot of the ratio of correct to incorrect answers by user",fontsize=18)
ax.set_xlim(0,1)

values = np.array([rec.get_height() for rec in ax.patches])

norm = plt.Normalize(values.min(), values.max())

colors = plt.cm.jet(norm(values))

for rec, col in zip(ax.patches, colors):
    rec.set_color(col)

plt.show();

In [None]:
print("The best score is: %.1f" % (how_good['answered_correctly'].max()*100), "%")
print("The mean score is:  %.1f" % (how_good['answered_correctly'].mean()*100), "%")

### 学生の数

In [None]:
print("No of students = ", len(train_df['user_id'].unique()))

### 学生ごとにサンプルの数の分布を見る

In [None]:
# distribution of number of samples per student
sns.set()
fig = plt.figure(figsize=(15,6))
fig = sns.kdeplot(train_df.groupby(by='user_id').count()['row_id'], shade=True, gridsize=50, color='g', legend=False)
fig.figure.suptitle("User_id distribution", fontsize = 20)
plt.xlabel('User_id counts', fontsize=16)
plt.ylabel('Probability', fontsize=16);

ほとんどの学生が、2000未満のデータを持っている。　train内で登場する回数を、ユーザーごとにカウント

In [None]:
train_df.groupby(by='user_id').count()['row_id'].sort_values()

### 学生ごとに、どのくらい質問に回答を試みたか分布を見る

In [None]:
# How many questions does each student attempt?
df = train_df[train_df['content_type_id'] == 0]  # question rows only

# Rows per user; every column then holds that user's non-null count
df = df.groupby(by='user_id').count()

fig = plt.figure(figsize=(15,6))
fig = sns.kdeplot(df['row_id'], shade=True, gridsize=50, color='r', legend=False)  # fig is now the Axes
fig.figure.suptitle("User attempted questions distribution", fontsize = 20)
plt.xlabel('Questions counts', fontsize=16)
plt.ylabel('Probability', fontsize=16)
# Only one curve is drawn, so only one legend label is supplied
plt.legend(['Questions Attempted']);

学生ごとにサンプルの数の分布と似ている。　ほとんどの学生が、2000未満の質問に回答している。

In [None]:
# distribution of correct and incorrect and no answers
df = train_df[train_df['content_type_id'] == 0]

df2 = df[df['answered_correctly'] == 1]
df3 = df[df['answered_correctly'] == 0]

df2 = df2.groupby(by='user_id').count()
df3 = df3.groupby(by='user_id').count()

fig = plt.figure(figsize=(15,6))
fig = sns.kdeplot(df2['row_id'], shade=True, gridsize=50, color='b', legend=False)
fig = sns.kdeplot(df3['row_id'], shade=True, gridsize=50, color='r', legend=False)

fig.figure.suptitle("User attempted questions distribution", fontsize = 20)
plt.xlabel('Questions counts', fontsize=16)
plt.ylabel('Probability', fontsize=16)
plt.legend(['Correctly answered','Incorrectly answered'])

correctly answered の正解したユーザーの方が、全体的に質問に回答している割合が多いと見れる。

### 学生ごとに、どのくらいのユーザーが解説を見ているか

In [None]:
# What percent of each student's questions were preceded by an explanation?
# values holds one fraction per user, in ascending user_id order.

df = train_df[train_df['content_type_id'] == 0]  # question rows only

values = (
    df['prior_question_had_explanation']
    .fillna(False)            # a missing flag is not counted as "had explanation"
    .astype(bool)
    .groupby(df['user_id'])
    .mean()                   # mean of a boolean = share of True rows
    .tolist()
)

In [None]:
# Preview only the first few per-user fractions instead of dumping the whole list
values[:10]

In [None]:
# Histogram of the per-user share of questions that were preceded by an explanation
plt.figure(figsize=(10,6))
sns.distplot(values, kde=False)
plt.title('Distribution of students who see x percent of explanations')
plt.xlabel('Percent explanation seen out of attempted questions')
plt.ylabel('Counts');

割合的には、prior explanations(事前の説明)を見たことがなく、正解した生徒がかなりいます。これ以上の読み取りは、timestampの側面を考慮する必要がある。

There is a considerable amount of students who never watched prior explanations and yet answered correctly. Any further than this we will need to use timestamps or other files.

In [None]:
# distribution of tags
#
# total  - every tag occurrence over all questions
# keys   - the distinct tags
# final  - {tag: number of occurrences}
# values - (tag, count) pairs, most frequent first
# d      - the counts alone, most frequent first
#
# Questions without tags are skipped instead of being counted as a literal 'nan' tag,
# and the counting is done in one pass rather than one list scan per distinct tag.

tag_tokens = (
    questions_df['tags']
    .dropna()
    .astype(str)
    .str.strip()
    .str.split(' ')
    .explode()
)

total = tag_tokens.tolist()
keys = set(total)
final = {tag: int(n) for tag, n in tag_tokens.value_counts().items()}

values = sorted(final.items(), key=lambda item: item[1], reverse=True)
d = [count for _, count in values]

In [None]:
# Tag counts, most frequent first. This is a plotly figure, so no matplotlib
# figure is opened (it would only render as an extra empty plot).
px.line(d, title='Tags distribution')

タグの分布は、非常に偏っている。80％を超える割合で発生するタグは40個だけです。

The distribution of tags is very skewed. Only 40 tags account for more than about 80% of all occurrences. If we want to decrease the sparsity of our data we could use only the top 100 tags, which would cover more than 95% of all tag occurrences with only 50% sparsity.

In [None]:
from wordcloud import WordCloud
# Most common tags: word cloud sized by the tag frequencies in `final`
tags = WordCloud().generate_from_frequencies(final)
px.imshow(tags, title='Most frequent Tags')

### 適当に8人の学生の傾向を追ってみる。

In [None]:
# We follow the first 8 students (in ascending user_id order) to look for trends.
# For every student, one list entry per answered question, in timestamp order:
#   question_attempted_l          - running number of attempted questions (1, 2, 3, ...)
#   correctly_answered_l          - running number of correct answers
#   scores                        - running percentage of correct answers
#   prior_questions_explanations  - running number of questions whose prior question had an explanation
no_students = 8
scores = []
user_ids = []
question_attempted_l = []
correctly_answered_l = []
prior_questions_explanations = []

for count, (group, frame) in enumerate(train_df.groupby(by='user_id')):

    if count == no_students:
        break

    frame = frame.sort_values(by='timestamp')

    # Question rows only
    df = frame[frame['content_type_id'] == 0]

    is_correct = (df['answered_correctly'] == 1).to_numpy(dtype=bool)
    # Only this flag needs its missing values handled; filling with False (instead of
    # filling the whole frame with the integer 0) keeps the nullable boolean column type-safe.
    had_explanation = df['prior_question_had_explanation'].fillna(False).to_numpy(dtype=bool)

    # Running totals via cumulative sums instead of hand-maintained counters
    question_attempted = list(range(1, len(df) + 1))
    correctly_answered = np.cumsum(is_correct).astype(int).tolist()
    explanations = np.cumsum(had_explanation).astype(int).tolist()
    percentage = [
        correct_answers / attempted * 100
        for correct_answers, attempted in zip(correctly_answered, question_attempted)
    ]

    scores.append(percentage)
    user_ids.append(group)
    question_attempted_l.append(question_attempted)
    correctly_answered_l.append(correctly_answered)
    prior_questions_explanations.append(explanations)

In [None]:
# Per-student trend: attempted questions, correct answers, running accuracy and
# explanations seen, one panel per student on a 4 x 2 grid.

plt.figure(figsize=(15,20))

for panel in range(8):
    plt.subplot(4, 2, panel + 1)
    attempts = question_attempted_l[panel]
    plt.plot(attempts, attempts, label='Questions attempted')
    plt.plot(attempts, correctly_answered_l[panel], label='Questions correctly answered')
    plt.plot(attempts, scores[panel], label='Percentage correctly answered')
    plt.plot(attempts, prior_questions_explanations[panel], label='Prior_questions_explanations')
    plt.legend()
    # Zoom in on the first 50 questions / values up to 100
    plt.ylim(0,100)
    plt.xlim(0,50)
    plt.tight_layout(pad = 2)
    plt.title(f'user_id: {user_ids[panel]}')
非常に多くの傾向とパターンがある。

質問の回答数が多くなるからといって、正答率が上がるとはいえない。


There is a lot to see here: many trends and patterns. Students who had prior explanations tended to get better results. The trends come in several shapes: sudden spikes (positive and negative), consistency, continuous increase and continuous decrease.
Bad students: almost no one started watching explanations until they started performing badly.

In [None]:
# Cumulative time spent on prior questions, for the first 8 students.
# time_spend_l gets one list per student with the running sum of
# prior_question_elapsed_time over that student's question rows.

no_students = 8
time_spend_l = []

for count, (group, frame) in enumerate(train_df.groupby(by='user_id')):

    if count == no_students:
        break

    frame = frame.sort_values(by='timestamp')
    elapsed_times = frame.loc[frame['content_type_id'] == 0, 'prior_question_elapsed_time']

    time_spends = 0
    total_time_spends = []

    for time_spend in elapsed_times:
        # Missing and non-positive durations add nothing and produce no entry
        if not time_spend > 0:
            continue
        time_spends += time_spend
        total_time_spends.append(time_spends)

    time_spend_l.append(total_time_spends)

In [None]:
# Rescale every student's cumulative times by 1/10000.
# The per-student lists can have different lengths, so they are kept as a list of
# 1-D arrays; packing ragged lists into a single np.array is an error on recent NumPy.
# NOTE: this rebinds time_spend_l, so re-running the cell rescales the values again.
time_spend_l = [np.asarray(spends, dtype=float) / 10000 for spends in time_spend_l]

In [None]:
# Per-student trend: correct answers, rescaled cumulative time and running accuracy,
# one panel per student on a 4 x 2 grid.

plt.figure(figsize=(15,20))

for panel in range(8):
    plt.subplot(4, 2, panel + 1)
    attempts = question_attempted_l[panel]
    plt.plot(attempts, correctly_answered_l[panel], label='Questions correctly answered')
    # The time series is plotted against the attempts from the second question on
    plt.plot(attempts[1:], time_spend_l[panel], label='time spend in 10000')
    plt.plot(attempts, scores[panel], label='Percentage correctly answered')
    plt.legend()
    plt.ylim(0,100)
    plt.xlim(0,50)
    plt.tight_layout(pad = 2)
    plt.title(f'user_id: {user_ids[panel]}')

prior_question_elapsed_time は、前の質問の回答にどのくらいの時間を要したか。

There is mostly a linear increase in prior question time elapsed.

### answered_correctlyの上位と下位を比較したい。

#### timestamp について

It is important to remember that this is the time between this user interaction and the first event from that user, so the starting time can be different for each user.

In [None]:
train_df.groupby(['user_id'])['timestamp'].max().sort_values(ascending=False).head(20)

Some users have really huge activity time!

#### content_id
Id of the content - question or lecture

In [None]:
train_df['content_id'].value_counts()

In [None]:
train_df.loc[train_df['content_id'] == 6116]

In [None]:
train_df.loc[train_df['content_id'] == 6116, 'user_answer'].value_counts()

In [None]:
questions_df.loc[questions_df['question_id'] == 6116]

We can see that a lot of people made mistakes answering this question.