In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reading data

We will read half of the training data (the first 50 million rows), but all of the preprocessing shown here can be applied to the other half as well.

In [None]:
%%time

# Explicit, compact column dtypes so the large training sample fits in memory.
dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "boolean",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
    "prior_question_had_explanation": "boolean",
}

# Only the first 50,000,000 rows of train.csv are read.
data = pd.read_csv(
    "../input/riiid-test-answer-prediction/train.csv",
    dtype=dtypes,
    nrows=50_000_000,
)

print("Train size:", data.shape)

In [None]:
# Preview the first rows of the training sample.
data.head()

In [None]:
# Load the question metadata, lecture metadata and the example test set.
qdata, ldata, test_data = map(
    pd.read_csv,
    (
        '../input/riiid-test-answer-prediction/questions.csv',
        '../input/riiid-test-answer-prediction/lectures.csv',
        '../input/riiid-test-answer-prediction/example_test.csv',
    ),
)

In [None]:
# Preview the questions metadata (read from questions.csv).
qdata.head()

In [None]:
# Preview the lectures metadata (read from lectures.csv).
ldata.head()

In [None]:
# Preview the example test set (read from example_test.csv).
test_data.head()

## Feature description

Let's look at every feature in the training set in more detail.

row_id: (int64) ID code for the row.

In [None]:
# Check uniqueness directly: value_counts() would build a Series with one entry
# per distinct row_id just to show that every count is 1.
data.row_id.is_unique

We can see that ID is unique.

timestamp: (int64) the time in milliseconds between this user interaction and the first event completion from that user.

In [None]:
# Histogram of the timestamp column over all loaded interactions.
data.timestamp.hist()

Let's look at the timestamp distribution of a few individual users:

In [None]:
# Timestamp distribution of the first user appearing in the data.
# NOTE(review): distplot is deprecated in newer seaborn releases — consider
# histplot once the environment's seaborn version is confirmed.
first_user_id = data.user_id.unique()[0]
sns.distplot(data.loc[data.user_id == first_user_id, 'timestamp'])

In [None]:
# Timestamp distribution of the second user appearing in the data.
# NOTE(review): distplot is deprecated in newer seaborn releases — consider
# histplot once the environment's seaborn version is confirmed.
second_user_id = data.user_id.unique()[1]
sns.distplot(data.loc[data.user_id == second_user_id, 'timestamp'])

In [None]:
# Timestamp distribution of the third user appearing in the data.
# NOTE(review): distplot is deprecated in newer seaborn releases — consider
# histplot once the environment's seaborn version is confirmed.
third_user_id = data.user_id.unique()[2]
sns.distplot(data.loc[data.user_id == third_user_id, 'timestamp'])

Or let's have a look at the timestamp distribution of the smartest users:

In [None]:
# Timestamp histogram for the user with the most correct answers.
# Lecture rows are encoded as answered_correctly == -1, so a plain sum would
# subtract one per lecture watched; clip them to 0 to count only correct answers.
correct_per_user = data.answered_correctly.clip(lower=0).groupby(data.user_id).sum()
top_user_id = correct_per_user.idxmax()
data.loc[data.user_id == top_user_id, 'timestamp'].hist()

In [None]:
# Timestamp histogram for the user with the second-most correct answers.
# Lecture rows are encoded as answered_correctly == -1, so a plain sum would
# subtract one per lecture watched; clip them to 0 to count only correct answers.
correct_per_user = data.answered_correctly.clip(lower=0).groupby(data.user_id).sum()
runner_up_id = correct_per_user.nlargest(2).index[1]
data.loc[data.user_id == runner_up_id, 'timestamp'].hist()

Smart guys spend a lot of time on the platform

user_id: (int32) ID code for the user.

In [None]:
# Histogram of user_id values over all loaded rows.
data.user_id.hist()

content_id: (int16) ID code for the user interaction

In [None]:
# Histogram of content_id values over all loaded rows.
data.content_id.hist()

content_type_id: (int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.

In [None]:
# Count of question rows vs. lecture rows.
# Pass the column as `x=`: seaborn 0.11 deprecated positional data vectors and
# seaborn >= 0.12 interprets the first positional argument as `data`.
sns.countplot(x=data.content_type_id)

We can see that there are far more questions than lectures.

task_container_id: (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

user_answer: (int8) the user's answer to the question, if any. Read -1 as null, for lectures.

In [None]:
# Distribution of the answers users picked (-1 marks lecture rows).
# Pass the column as `x=`: seaborn 0.11 deprecated positional data vectors and
# seaborn >= 0.12 interprets the first positional argument as `data`.
sns.countplot(x=data.user_answer)

In [None]:
# Distribution of the correct answer option across questions, for comparison
# with what users actually pick.
# Pass the column as `x=`: seaborn 0.11 deprecated positional data vectors and
# seaborn >= 0.12 interprets the first positional argument as `data`.
sns.countplot(x=qdata.correct_answer)

Strangely, users tend to avoid choosing the second answer.

answered_correctly: (int8) if the user responded correctly. Read -1 as null, for lectures.

In [None]:
# Distribution of the target column (-1 marks lecture rows).
# Pass the column as `x=`: seaborn 0.11 deprecated positional data vectors and
# seaborn >= 0.12 interprets the first positional argument as `data`.
sns.countplot(x=data.answered_correctly)

About 2/3 of the answers are correct and about 1/3 are wrong. Let's have a look at the top of the smartest users:

In [None]:
# Horizontal bar chart of the 30 users with the most correct answers.
# Lecture rows are encoded as answered_correctly == -1; clip them to 0 so they
# do not reduce a user's count. The per-user totals are computed once instead
# of running the same groupby twice.
correct_per_user = data.answered_correctly.clip(lower=0).groupby(data.user_id).sum()
# Sort ascending so the largest bar ends up at the top of the barh plot.
correct_per_user.nlargest(30).sort_values().plot(kind='barh')

The median value of answered_correctly:

In [None]:
# Median number of correct answers per user. Lecture rows (-1) are clipped to 0
# so they do not lower the per-user totals.
data.answered_correctly.clip(lower=0).groupby(data.user_id).sum().median()

And the mean:

In [None]:
# Mean number of correct answers per user. Lecture rows (-1) are clipped to 0
# so they do not lower the per-user totals.
data.answered_correctly.clip(lower=0).groupby(data.user_id).sum().mean()

Let's create a new feature, is_smart. We will consider a user to be smart if they answered more questions correctly than the 75% quantile over all users:

In [None]:
# Flag every row whose user is above the 75% quantile of correct answers.
# Lecture rows are encoded as answered_correctly == -1; clip them to 0 so that
# watching lectures does not lower a user's total.
sums = data.answered_correctly.clip(lower=0).groupby(data.user_id).sum()
smart_users = sums.index[sums > sums.quantile(0.75)]
# One vectorized assignment; int64 matches the dtype produced by `= 0` then `= 1`.
data['is_smart'] = data.user_id.isin(smart_users).astype('int64')
# Free the intermediates; they are not needed by later cells.
del sums
del smart_users

In [None]:
# Row counts per is_smart value (this counts interactions, not distinct users).
data.is_smart.value_counts()

Also, let's compute each user's total time on the platform relative to the total time of all users.

In [None]:
# Per-user sum of the timestamp column and the grand total over all rows;
# both names are consumed by the following cell.
# NOTE(review): timestamp is described as time since the user's first event, so
# this sum grows with the number of interactions as well — confirm that is intended.
sum_time = data['timestamp'].groupby(data['user_id']).sum()
all_sum = data['timestamp'].sum()

In [None]:
# Each row gets its user's share of the summed timestamps.
# Series.map with the pre-divided per-user Series is a vectorized lookup; the
# previous row-wise apply ran a Python lambda once per row of the training frame.
data['sum_timestamp'] = data['user_id'].map(sum_time / all_sum)
# Free the intermediates; they are recomputed for the test set later on.
del sum_time
del all_sum

In [None]:
# Mean sum_timestamp over the rows of each is_smart group, shown as the two
# groups' relative shares (the pair sums to 1).
mean0 = data.loc[data.is_smart == 0, 'sum_timestamp'].mean()
mean1 = data.loc[data.is_smart == 1, 'sum_timestamp'].mean()
total = mean0 + mean1
(mean0 / total, mean1 / total)

As we can see, smart users spend a lot of time on the platform.

prior_question_elapsed_time: (float32) The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.

In [None]:
# Histogram of prior_question_elapsed_time over all loaded rows.
data.prior_question_elapsed_time.hist()

prior_question_had_explanation: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

In [None]:
# Count of rows by whether the previous question bundle had an explanation.
# Pass the column as `x=`: seaborn 0.11 deprecated positional data vectors and
# seaborn >= 0.12 interprets the first positional argument as `data`.
sns.countplot(x=data.prior_question_had_explanation)

Let's check whether any users from the training set also appear in the test set:

In [None]:
# User IDs present in both the test set and the training sample.
# Intersect the unique IDs with NumPy instead of building Python sets from the
# full training column; the result is a sorted list of plain Python ints.
traintest_users = np.intersect1d(
    test_data.user_id.unique(),
    data.user_id.unique(),
).tolist()

In [None]:
# Show the user IDs found in both train and test.
traintest_users

In [None]:
# Number of users shared between train and test.
len(traintest_users)

Yes, there are! With a larger dataset there will be even more of them. So now we will apply the same preprocessing to the test data.

In [None]:
# Default every test row to "not smart"; the next cell overwrites users known from train.
test_data['is_smart'] = 0
# Same normalization as for train, but computed from the test rows only.
sum_time = test_data['timestamp'].groupby(test_data['user_id']).sum()
all_sum = sum_time.sum()
test_data['sum_timestamp'] = test_data['user_id'].map(sum_time / all_sum)

In [None]:
# Per-user is_smart flag from train, restricted to users that also occur in test.
in_both_train = data.user_id.isin(traintest_users)
smart_guys = data.loc[in_both_train].groupby('user_id')['is_smart'].max()
# Copy the train flag onto the matching test rows; other rows keep their value.
in_both_test = test_data.user_id.isin(traintest_users)
test_data.loc[in_both_test, 'is_smart'] = test_data.loc[in_both_test, 'user_id'].map(smart_guys)

In [None]:
# Display the test frame, now including the is_smart and sum_timestamp columns.
test_data

So here we have two new features named is_smart and sum_timestamp. They are somewhat correlated:

In [None]:
# Correlation matrix of the two engineered features, computed over rows
# (so each user is weighted by their number of interactions).
data[['is_smart','sum_timestamp']].corr()

What I suggest is to compute traintest_users from the other part of the data as well and apply all the same preprocessing. Then, for the users in traintest_users, transfer the is_smart feature from train to test. For the remaining users, who are in test but not in traintest_users, we will predict the is_smart feature.

## To be continued...