In [None]:
import gc
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Explicit per-column dtypes handed to read_csv for train.csv.
dtypes = dict(
    row_id="int64",
    timestamp="int64",
    user_id="int32",
    content_id="int16",
    content_type_id="int16",
    task_container_id="int16",
    user_answer="int8",
    answered_correctly="int8",
    prior_question_elapsed_time="float32",
    # pandas nullable boolean dtype, so missing entries are kept as <NA>.
    prior_question_had_explanation="boolean",
)

train_csv = "../input/riiid-test-answer-prediction/train.csv"
data = pd.read_csv(train_csv, dtype=dtypes)

In [None]:
# Print a concise summary of the training frame (columns, dtypes, memory usage).
data.info()

In [None]:
# Number of missing values in each column of the training frame.
data.isna().sum()

## prior_question_elapsed_time

In [None]:
# Rows with no prior_question_elapsed_time whose timestamp is not 0.
missing_elapsed = data["prior_question_elapsed_time"].isna()
nonzero_timestamp = data["timestamp"] != 0
temp_ = data.loc[missing_elapsed & nonzero_timestamp]

In [None]:
# Pie chart of how the selected rows split across content_type_id values.
content_type_counts = temp_["content_type_id"].value_counts()
content_type_counts.plot.pie(autopct='%0.3f')
plt.title("Content Type when timestamp not equal to zero and prior_question_elapsed_time is null")

In [None]:
# Histogram with a KDE overlay of the timestamps of the rows held in `temp_`.
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat="density" is its documented replacement.
sns.histplot(temp_["timestamp"], kde=True, stat="density")

In [None]:
# Rows at timestamp 0 whose prior_question_elapsed_time is exactly 0.
at_timestamp_zero = data["timestamp"] == 0
zero_elapsed = data["prior_question_elapsed_time"] == 0
data.loc[at_timestamp_zero & zero_elapsed]

In [None]:
# Rows of user 369317294 that either lack prior_question_elapsed_time
# or sit at timestamp 0.
is_user = data["user_id"] == 369317294
lacks_elapsed = data["prior_question_elapsed_time"].isna()
at_timestamp_zero = data["timestamp"] == 0
data.loc[is_user & (lacks_elapsed | at_timestamp_zero)]

In [None]:
# Compare the answered_correctly split for two groups, side by side:
#   left  - rows whose prior_question_elapsed_time is exactly 0
#   right - rows whose prior_question_elapsed_time is null and content_type_id is 0
temp_ = data.loc[data["prior_question_elapsed_time"] == 0]
null_elapsed_rows = data.loc[
    data["prior_question_elapsed_time"].isna() & (data["content_type_id"] == 0)
]

fig, (ax_zero, ax_null) = plt.subplots(1, 2, figsize=(8, 20))
# pyplot has no `subtitles`; the figure-level heading is set with `suptitle`.
fig.suptitle('when prior_question_elapsed_time')

ax_zero.set_title("Is 0")
temp_["answered_correctly"].value_counts().plot(kind='pie', autopct='%0.3f', ax=ax_zero)

ax_null.set_title('Is null')
null_elapsed_rows["answered_correctly"].value_counts().plot(kind='pie', autopct='%0.3f', ax=ax_null)

In [None]:
# Run a full garbage-collection pass (presumably to reclaim memory from the
# temporary frames created above; objects still referenced are not freed).
gc.collect()

The distribution of the target feature when prior_question_elapsed_time equals zero matches its distribution when prior_question_elapsed_time is NaN, so in my view replacing NaN with 0 when content_type_id = 0 and with -1 when content_type_id = 1 should not affect the results.

## Null Values

In [None]:
# Every row whose prior_question_elapsed_time is missing.
elapsed_is_missing = data["prior_question_elapsed_time"].isna()
prior_q_e_time = data[elapsed_is_missing]

In [None]:
# Preview the first rows of the missing-elapsed-time subset.
prior_q_e_time.head()

In [None]:
# Fill missing prior_question_elapsed_time following the rule stated in the
# notebook text: -1 where content_type_id == 1, 0 everywhere else.
# Filling every row with 0 would make the two content types indistinguishable.
elapsed_missing = data['prior_question_elapsed_time'].isna()
is_content_type_1 = data['content_type_id'] == 1
data.loc[elapsed_missing & is_content_type_1, 'prior_question_elapsed_time'] = -1
data['prior_question_elapsed_time'] = data['prior_question_elapsed_time'].fillna(0)

In [None]:
# The column is loaded with the nullable "boolean" dtype, which only accepts
# boolean fill values; an integer 0 can raise TypeError on recent pandas.
# Missing entries are therefore filled with False.
data['prior_question_had_explanation'] = data['prior_question_had_explanation'].fillna(False)

In [None]:
# Encode the flag as a compact 0/1 integer column (True -> 1, False -> 0).
# An explicit cast gives a deterministic int8 result, whereas the dtype
# produced by replace() on a nullable boolean column varies across pandas
# versions. The column must be free of <NA> here, otherwise the cast raises.
data['prior_question_had_explanation'] = data['prior_question_had_explanation'].astype('int8')

In [None]:
# Run a full garbage-collection pass (presumably to reclaim memory after the
# column rewrites above).
gc.collect()

In [None]:
# Load the question metadata table.
questions_csv = r'../input/riiid-test-answer-prediction/questions.csv'
question_data = pd.read_csv(questions_csv)

In [None]:
# Questions whose tags entry is missing.
tags_missing = question_data["tags"].isna()
question_data[tags_missing]

In [None]:
 from collections import Counter
    
tags = question_data['tags'].loc[question_data['part'] == 6]
tags = tags.loc[tags.isnull() == False]
tags = tags.str.split(' ')
tags = [j for i in tags.values for j in i]
total_count = len(tags)
tags = Counter(tags)
x = list(tags.keys())
y = np.array(list(tags.values()))
y = y / total_count

In [None]:
# Horizontal bar chart of the part-6 tag shares held in x (tags) and y (shares).
plt.figure(figsize=(15, 15))
plt.barh(x, y)
# Dashed red reference line at a share of 0.5.
# NOTE(review): ymin is the category label '179' while ymax is a numeric
# position (len(tags)) - confirm this is the intended vertical span.
plt.vlines(x=0.5, ymin = '179', ymax = len(tags), color = 'red',  linestyles = 'dashed')
# y was divided by the total number of tag occurrences, so the bars show
# shares rather than raw counts; the labels say so.
plt.title('Share of individual tags in part - 6')
plt.xlabel('Share of tag occurrences')
plt.ylabel('Tag')

In [None]:
# All training rows that reference content_id 10033.
is_content_10033 = data['content_id'].eq(10033)
data[is_content_10033]

In [None]:
# Ten-row positional window: five rows before the anchor position and five
# rows starting at it.
# NOTE(review): 62750278 is presumably the position of the content_id 10033
# row shown earlier - confirm.
anchor_position = 62750278
temp_ = data.iloc[anchor_position - 5:anchor_position + 5]

In [None]:
# Attach question metadata to the window by matching content_id to question_id.
pd.merge(temp_, question_data, left_on='content_id', right_on='question_id')

In [None]:
# Rows of user 1333688829 in task containers 1126 and 1128, joined with the
# question metadata.
is_user = data['user_id'] == 1333688829
in_containers = data['task_container_id'].isin([1126, 1128])
data.loc[is_user & in_containers].merge(
    question_data, left_on='content_id', right_on='question_id'
)

In [None]:
# Relative tag frequencies over the part-6 rows of user 1333688829
# (content_type_id == 0 rows only), joined with the question metadata.
user_rows = data.loc[(data['user_id'] == 1333688829) & (data['content_type_id'] == 0)]
temp_ = user_rows.merge(question_data, left_on='content_id', right_on='question_id')

# Space-separated tag strings -> one flat list of individual tags.
part6_tag_lists = temp_.loc[temp_['part'] == 6, 'tags'].dropna().str.split(' ')
tags = [tag for tag_list in part6_tag_lists for tag in tag_list]
total_count = len(tags)

# `tags` becomes a tag -> occurrence-count mapping.
tags = Counter(tags)
x1 = list(tags.keys())
y1 = np.array(list(tags.values())) / total_count

In [None]:
# Pie chart of answered_correctly over the part-6 rows in `temp_`.
answer = temp_.loc[temp_['part'] == 6, 'answered_correctly']
answer_counts = answer.value_counts()
# value_counts() orders by frequency, so take the labels from its index
# rather than assuming that 1 always comes first.
plt.pie(answer_counts, labels=answer_counts.index, autopct='%0.2f')
plt.legend()

In [None]:
# Load the lecture metadata table.
lectures_csv = '../input/riiid-test-answer-prediction/lectures.csv'
lecture_data = pd.read_csv(lectures_csv)

In [None]:
# Number of missing values in each column of the lecture table.
lecture_data.isna().sum()

In [None]:
# Outer-join questions and lectures on question_id == lecture_id, then show
# the row(s) for question 10033.
temp_ = pd.merge(
    question_data,
    lecture_data,
    how='outer',
    left_on='question_id',
    right_on='lecture_id',
)
temp_[temp_['question_id'] == 10033]

In [None]:
# First joined rows that actually carry a lecture_id.
has_lecture = temp_['lecture_id'].notna()
temp_[has_lecture].head()

There is only one NaN value in the tags column of the questions dataset. To find the best replacement for the NaN tag, I compared it against the train and lecture datasets. That question was asked only once in the whole train dataset, and it was asked in between part 5 questions even though the NaN-tag question belongs to part 6.

The questions asked before and after the NaN-tag question have tag "8", so we might suspect that it also belongs to tag 8, but there is no evidence that it does.

I checked whether a question and a lecture that share the same id also share the same tags, but they do not.

On the other hand, when I checked the distribution of tags among the part-6 questions asked to user_id "1333688829", it was almost the same as the distribution of part-6 question tags in the questions dataset.

The main problem is whether to remove the NaN-tag question from both the train and question datasets or to mark it with -1.

I decided to mark the tag as -1.

In [None]:
# Mark the missing tags entry with the sentinel "-1".
# The sentinel is a string because tags holds space-separated tag strings:
# an integer -1 would make `.str.split(' ')` return NaN for that row and
# break the tag-flattening code if it is run again.
question_data['tags'] = question_data['tags'].fillna('-1')

## Analysing the First Interaction of each User

In [None]:
# First row (in file order) of every user, indexed by user_id.
# groupby(...).first() returns the first NON-NULL value of each column
# separately, so it can combine values from different rows when a column has
# missing entries. Keeping the first occurrence of each user_id returns the
# actual first row.
first_interaction = (
    data.drop_duplicates(subset='user_id', keep='first')
    .set_index('user_id')
    .sort_index()  # same ordering as the groupby result
)

In [None]:
# Side-by-side pie charts over each user's first interaction:
# answered_correctly on the left, content_type_id on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 20))
for ax, column in zip(axes, ['answered_correctly', 'content_type_id']):
    first_interaction[column].value_counts().plot(
        kind='pie', ax=ax, autopct='%1.3f%%', startangle=270, fontsize=17
    )

34% of the unique content_ids are used.

Work in progress....