In [None]:
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'darkgrid' style for the plots drawn in this notebook.
sns.set(style='darkgrid')

In [None]:
# Load the interaction log, restricted to the columns used in this notebook,
# together with the question metadata table.
TRAIN_COLUMNS = ['user_id', 'timestamp', 'user_answer', 'answered_correctly']
train = pd.read_feather(
    '../input/riiid-train-data-multiple-formats/riiid_train.feather',
    columns=TRAIN_COLUMNS,
)
questions = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')

# Time elapsed since the user started using the app

In [None]:
# Largest timestamp per user, converted to days and ordered from longest to shortest.
# The divisor treats `timestamp` as milliseconds (1000 ms * 60 s * 60 min * 24 h).
MS_PER_DAY = 1000 * 60 * 60 * 24
last_timestamp = train.groupby('user_id')['timestamp'].max()
agg_df = (
    last_timestamp.reset_index()
    .sort_values('timestamp', ascending=False)
    .assign(day=lambda frame: frame['timestamp'] / MS_PER_DAY)
)
agg_df['day'].describe()

Most users have used the app for only a few days or months,  
but the longest-running users have been using it for more than 1000 days!

In [None]:
agg_df.head()

In [None]:
# Distribution of users by elapsed days (the `day` column of agg_df).
# An explicit Axes is used instead of the implicit pyplot state, and the cell
# ends with plt.show() so the Text repr of the last label call is not echoed
# as cell output.
fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(agg_df['day'])
ax.set_xlabel('elapsed days')
ax.set_ylabel('number of users')
plt.show()

These very long-term users are either continuous learners or, if they're not, they may be the app's developers.

# Users who answered every question correctly

In [None]:
# Keep only rows whose answered_correctly is non-negative
# (negative values are presumably placeholders for non-question rows — confirm
# against the dataset description).
has_valid_label = train['answered_correctly'] >= 0
train = train[has_valid_label]

# Per user: number of answers, number of correct answers, and their ratio.
agg_df = (
    train.groupby('user_id')['answered_correctly']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total_answer_cnt', 'sum': 'correct_cnt'})
    .reset_index()
)
agg_df['correct_rate'] = agg_df['correct_cnt'] / agg_df['total_answer_cnt']

# Highest correct rate first; ties are ordered by the larger answer count.
agg_df.sort_values(['correct_rate', 'total_answer_cnt'], ascending=False).head(20)

user_id:48102728 answered all 41 of their questions correctly.  
But that number isn't surprising. (The real TOEIC test has 200 questions.)

# users who keep choosing the same answer

In [None]:
# Calculate the percentage of each answer number selected by each user.
# For every user, compute what fraction of their answers went to each answer option.
# Step 1: count rows per (user_id, user_answer) pair.
agg_df = (
    train.groupby(['user_id', 'user_answer'])['answered_correctly']
    .count()
    .reset_index(name='answer_cnt')
)

# Step 2: broadcast each user's total answer count back onto their rows.
# The column is selected *before* transform so that only answer_cnt is summed;
# transforming the whole group would also sum the user_answer codes and then
# throw that result away.
agg_df['total_answer_cnt'] = agg_df.groupby('user_id')['answer_cnt'].transform('sum')

# Step 3: share of the user's answers that went to this option.
agg_df['selection_rate'] = agg_df['answer_cnt'] / agg_df['total_answer_cnt']
agg_df

In [None]:
# Rows with the highest selection rate first; ties go to the larger answer count.
ranked_by_selection = agg_df.sort_values(
    by=['selection_rate', 'total_answer_cnt'],
    ascending=[False, False],
)
ranked_by_selection.head(20)

There are quite a few users who always pick the same answer number.  
Have they given up on trying to answer properly?

# By the way,  
I noticed something in the investigation above.  
Many users have a total_answer_cnt of 30.  
Let's check one last thing.

In [None]:
# Number of recorded answers per user, as a two-column frame
# (user_id, total_answer_cnt).
agg_df = (
    train.groupby('user_id')['user_answer']
    .count()
    .rename('total_answer_cnt')
    .reset_index()
)

In [None]:
# How many users share each total answer count, most frequent totals first.
answers_per_user = agg_df['total_answer_cnt']
vc = answers_per_user.value_counts()
vc.iloc[:20]

In [None]:
# Number of users for each total answer count, with the x-axis limited to 0-100.
# An explicit Axes is used instead of the implicit pyplot state, and the cell
# ends with plt.show() so the Text repr of the last label call is not echoed
# as cell output.
fig, ax = plt.subplots(figsize=(15, 4))
ax.bar(vc.index, vc)
ax.set_xlim(0, 100)
ax.set_xlabel('total answer count')
ax.set_ylabel('number of users')
plt.show()

The number of users with total_answer_cnt=30 is remarkably high.  
There are also peaks at round numbers such as 40 and 50.  

Perhaps there is a set of 30 questions that tests a user's skill when they first start using the app.  
And are there basically 10 questions per attempt?