In [None]:
import numpy as np 
import pandas as pd
from scipy.stats import skew
import gc

import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Exploring *train* dataset

### Description of the dataset
- row_id: (int64) ID code for the row.

- timestamp: (int64) the time in milliseconds between this user interaction and the first event completion from that user.

- user_id: (int32) ID code for the user.

- content_id: (int16) ID code for the user interaction

- content_type_id: (int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.

- task_container_id: (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

- user_answer: (int8) the user's answer to the question, if any. Read -1 as null, for lectures.

- answered_correctly: (int8) if the user responded correctly. Read -1 as null, for lectures.

- prior_question_elapsed_time: (float32) The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.

- prior_question_had_explanation: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback

# 1. Univariate Exploration

In [None]:
%%time

# Read the train dataset from the pre-built pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
train = pd.read_pickle('../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip')

In [None]:
# Preview the first 10 rows.
train.head(10)

In [None]:
# Summarise the columns, their dtypes and the frame's memory usage.
train.info()

In [None]:
# Bytes used per column; deep=True also counts the Python objects held by object-dtype columns.
train.memory_usage(deep=True)

It seems that the feature **prior_question_had_explanation** has the wrong type. It should be **bool** but it is **object**, which consumes RAM unnecessarily.  
Let's cast it to bool.

In [None]:
# The column is null for a user's first question bundle or lecture. A plain
# astype('bool') would silently turn those NaN values into True, so mark them
# explicitly as "no explanation" before casting.
train.prior_question_had_explanation = (
    train.prior_question_had_explanation.fillna(False).astype('bool')
)

In [None]:
# Re-check the dtypes and memory usage after the cast.
train.info()

Now let's check how many missing values we have here

In [None]:
# Number of missing values per column.
train.isnull().sum()

So we need to handle the missing values in the column "prior_question_elapsed_time".  
Let's fill the missing values with the mean value of the column.


In [None]:
# Impute missing elapsed times with the column mean.
train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'].fillna(
    value=train['prior_question_elapsed_time'].mean()
)

In [None]:
# Check the result: count the missing values per column again after the fill.
train.isnull().sum()

Now let's visualize the distribution of timestamp column

In [None]:
# Histogram of the timestamp column (70 bins).
fig, ax = plt.subplots(figsize=(18, 8))
ax.hist(train.timestamp, bins=70)

# Title and axis labels.
ax.set_title('Timestamp Distribution', fontsize=20)
ax.set_xlabel('Timestamps', fontsize=18)
ax.set_ylabel('Frequency', fontsize=18)

# Tick label sizes (applied to the current axes through pyplot).
plt.yticks(fontsize=14)
plt.xticks(fontsize=14)

# Keep only the bottom spine.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

plt.show()

Looking at the distribution, we can conclude that most of the interactions were made by users who were not active for a very long time.   
**We can say that the behaviour of this distribution could be described by the function f(x) = 1/x, where x is the amount of the timestamps in that bin.**

In [None]:
# Reference curve: y = 1/x for x = 1..99, for visual comparison with the histogram above.
xs = np.arange(1, 100)
fig, ax = plt.subplots(figsize=(18, 8))
ax.plot(xs, 1 / xs)

# Title and axis labels.
ax.set_title('1/x behaviour', fontsize=20)
ax.set_xlabel('x', fontsize=18)
ax.set_ylabel('1/x', fontsize=18)

# Tick label sizes.
plt.yticks(fontsize=14)
plt.xticks(fontsize=14)

# Keep only the bottom spine.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

plt.show()

As expected, the behaviour matches the equation.

Let's check which contents are most frequent and how content_id distribution looks like

In [None]:
# Histogram of content_id (200 bins).
fig, ax = plt.subplots(figsize=(18, 8))
ax.hist(train.content_id, bins=200)

# Title and axis labels.
ax.set_title('Content_Id Distribution', fontsize=20)
ax.set_xlabel('IDs', fontsize=18)
ax.set_ylabel('Frequency', fontsize=18)

# Tick label sizes.
plt.yticks(fontsize=14)
plt.xticks(fontsize=14)

# Keep only the bottom spine.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

plt.show()

Looking at the graph we can say that IDs up to 11000 are the most frequent, and in this range there are some bins that appear far more often than the others. That means these IDs are accessed more by the users.   
It is also important to say that we need to check which of those IDs stand for Questions and which for Lectures. This analysis is done in the next section, where we explore bivariate behaviour.

Now let's check the distribution of content_type_id in order to get to know more about this feature

In [None]:
# Share of each content type, ordered by id (0 = question, 1 = lecture) rather than
# by frequency, so the hard-coded bar labels below always match their values.
proportion_content_type_id = (train.content_type_id
                              .value_counts(normalize=True)
                              .sort_index()
                              .values)

# plotting
fig, ax = plt.subplots(figsize = (18, 8))
ax.bar(['Question', 'Lecture'], proportion_content_type_id)

# layout setup
ax.set_xlabel('Content Type', fontsize=18)
ax.set_ylabel('Proportion', fontsize=18)
ax.set_title('Proportion between Questions and Lectures', fontsize = 20)
plt.yticks(fontsize = 14)
plt.xticks(fontsize = 14)
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

# adding values on the top of each bar
rects = ax.patches
labels = ["{:.2f}".format(x) for x in proportion_content_type_id]
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height, label,
            fontsize=14, ha = 'center', va = 'bottom')

plt.show()

It is clear that the vast majority of interactions were solving questions rather than taking lectures. If we could see a relationship between this proportion and the proportion of correct answers for the user, it could be a great clue about the way we should build our ML solution. I will test it later, in the bivariate analysis.

Now let's investigate the task_container_id feature and check if we can extract some insight about the containers.

In [None]:
# Number of interactions per task container, most frequent first.
train.task_container_id.value_counts()

In [None]:
# Number of containers shown; used for both the slice and the title so the two
# cannot drift apart (the title previously said 30 while 40 bars were drawn).
TOP_N_CONTAINERS = 30

# Interaction count per container, with string ids so matplotlib treats them as
# categories, sorted ascending so the most frequent bar ends up at the top.
container_counts = train.task_container_id.value_counts()
frequency_task_container = pd.Series(container_counts.values,
                                     index = container_counts.index.astype('str')).sort_values(ascending=True)
top_containers = frequency_task_container.tail(TOP_N_CONTAINERS)

# plotting
fig, ax = plt.subplots(figsize = (18, 10))
ax.barh(top_containers.index, top_containers)

# layout setup
ax.set_xlabel('Frequency', fontsize=18)
ax.set_ylabel('Task Container Ids', fontsize=18)
ax.set_title(f'{TOP_N_CONTAINERS} Most frequent Task Container', fontsize = 20)
plt.yticks(fontsize = 14)
plt.xticks(fontsize = 14)
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

plt.show()

Let's check the distribution of the answers by examining the feature "user_answer"

In [None]:
# Count how often each answer option was chosen and draw one bar per option.
answer_counts = train.user_answer.value_counts()
fig, ax = plt.subplots(figsize=(18, 8))
ax.bar(answer_counts.index, answer_counts.values)

# Title and axis labels.
ax.set_title('Total of each answer was choosen', fontsize=20)
ax.set_xlabel('Answers', fontsize=18)
ax.set_ylabel('Frequency', fontsize=18)

# Tick label sizes.
plt.yticks(fontsize=14)
plt.xticks(fontsize=14)

# Keep only the bottom spine.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

plt.show()

Well it does already say something but let's put it in %

In [None]:
# Share of each answer option among answered questions. -1 marks lectures (no
# answer); errors='ignore' keeps the cell working if no such rows are present.
answer = train.user_answer.value_counts().drop(index = -1, errors = 'ignore')
# Real percentages (0-100) so the bar heights, the "%" labels and the title agree.
answer_percent = answer.values / answer.values.sum() * 100

# plotting
fig, ax = plt.subplots(figsize = (18, 8))
ax.bar(answer.index, answer_percent)

# layout setup (answers on the x axis, their share on the y axis)
ax.set_xlabel('Answers', fontsize=18)
ax.set_ylabel('Frequency (%)', fontsize=18)
ax.set_title('Total of each answer was choosen in %', fontsize = 20)
plt.yticks(fontsize = 14)
plt.xticks(fontsize = 14)
plt.xticks([0, 1, 2, 3])
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

# adding values on the top of each bar
rects = ax.patches
labels = ["{:.2f}%".format(x) for x in answer_percent]
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height , label,
            fontsize=14, ha = 'center', va = 'bottom')


plt.show()

We see that only answer 2 was chosen less often than the others, by around 9 percentage points. We also see that the other answers were each chosen around 27% of the time. **The question here is why?** We would expect roughly the same value for every answer, given that there are 101 million interactions, enough samples to see an equal distribution between the answers.

In [None]:
# Preview the first rows again before looking at answered_correctly.
train.head()

Now let's check how many questions (in proportion) were answered right and wrong!

In [None]:
# Count wrong (0) and right (1) answers; -1 marks lecture rows and is removed.
answered_right = train.answered_correctly.value_counts().drop(index=-1)
answered_right_percent = answered_right / answered_right.values.sum()

# One bar per outcome, height = share of answered questions.
fig, ax = plt.subplots(figsize=(18, 8))
ax.bar(answered_right.index, answered_right_percent)

# Title and axis labels.
ax.set_title('Proportion of correct answers in all interations', fontsize=20)
ax.set_xlabel('Answers', fontsize=18)
ax.set_ylabel('Proportion', fontsize=18)

# Tick sizes first, then replace the 0/1 ticks with readable names.
plt.yticks(fontsize=14)
plt.xticks(fontsize=14)
plt.xticks([0, 1], ['Wrong', 'Right'])

# Keep only the bottom spine.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

# Write each bar's value just above it.
bar_labels = ["{:.2f}".format(share) for share in answered_right_percent]
for bar, text in zip(ax.patches, bar_labels):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), text,
            fontsize=14, ha='center', va='bottom')

plt.show()

By looking at the graph above, we see that around 66% of the answers are correct. That is important because if we decide to build a baseline that always says the answer will be correct, we expect it to be right 66% of the time. So our model must perform above this value, or, if we decide on another baseline, it must be better than 66% of correct predictions! 

In [None]:
# Histogram of prior_question_elapsed_time (100 bins).
fig, ax = plt.subplots(figsize=(18, 8))
ax.hist(train.prior_question_elapsed_time, bins=100)

# Title and axis labels.
ax.set_title('Time demanded to answer each question in the previous bundle', fontsize=20)
ax.set_xlabel('Time (miliseconds)', fontsize=18)
ax.set_ylabel('Frequency', fontsize=18)

# Tick label sizes.
plt.yticks(fontsize=14)
plt.xticks(fontsize=14)

# Keep only the bottom spine.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

plt.show()

Well, it looks like a normal distribution but skewed to the right. Let's check it out.

In [None]:
# Sample skewness of the elapsed time; a positive value indicates a longer right tail.
skew(train.prior_question_elapsed_time)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the elapsed time.
train.prior_question_elapsed_time.describe()

A value greater than 0 means that the distribution is right skewed. A value of 4.83 means that this skewness is actually pretty strong. The mean is much bigger than the mode, and filling null values with the mean could lead to undesired behaviour, because we want our descriptive statistics to be concentrated around the majority of the data as much as possible. So, in this case the right tail will lead to an erroneous assumption.
Here we conclude: 
1. The fillna should be with the median and not the mean;
2. As we can see, the standard deviation is almost twice as big as the mean, and that is pretty bad because the mean is very sensitive to outliers and to strongly skewed distributions.
3. It would be better to use the median. So let's reload the data and use the median in the fillna.


In [None]:
# Drop the current frame and force a garbage collection to release its memory
# before the data is loaded again.
del train
gc.collect()

In [None]:
# Reload the raw data and redo the preprocessing, this time imputing with the median.
train = pd.read_pickle('../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip')

# Null explanation flags (a user's first bundle or a lecture) would become True under a
# plain astype('bool'); mark them as "no explanation" before casting.
train.prior_question_had_explanation = (
    train.prior_question_had_explanation.fillna(False).astype('bool')
)

# The elapsed time is strongly right-skewed, so the median is a better fill value than the mean.
train.prior_question_elapsed_time = train.prior_question_elapsed_time.fillna(
    value = train.prior_question_elapsed_time.median())

Let's finally take a look at the feature *"prior_question_had_explanation"* in order to understand in how many interactions did the users see an explanation after answering a question bundle.

In [None]:
# Count interactions with and without a prior explanation and convert to shares.
explanation = train.prior_question_had_explanation.value_counts()
explanation_percent = explanation / explanation.values.sum()

# One bar per flag value, height = share of interactions.
fig, ax = plt.subplots(figsize=(18, 8))
ax.bar(explanation.index, explanation_percent)

# Title and axis labels.
ax.set_title( 'Proportion of interations where the user saw explanation after answering', fontsize=20)
ax.set_xlabel('Had Explanation', fontsize=18)
ax.set_ylabel('Proportion', fontsize=18)

# Tick sizes first, then replace the 0/1 ticks with readable names.
plt.yticks(fontsize=14)
plt.xticks(fontsize=14)
plt.xticks([0, 1], ['No', 'Yes'])

# Keep only the bottom spine.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

# Write each bar's value just above it.
bar_labels = ["{:.2f}".format(share) for share in explanation_percent]
for bar, text in zip(ax.patches, bar_labels):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), text,
            fontsize=14, ha='center', va='bottom')

plt.show()

As we can see, in around 90% of the interactions the users saw an explanation after answering a question bundle.

# 2. Bivariate Exploration

In order to understand a little bit more about the correlation between the features, and to have some relationship to start exploring, let's create a correlation heatmap!

In [None]:
# Pairwise correlation between the columns of train (pandas default method: Pearson).
# NOTE(review): newer pandas versions raise here if train holds non-numeric columns,
# so this must run before any string column is added to the frame.
correlation = train.corr()
correlation

In [None]:
import plotly.figure_factory as ff

# Round the coefficients to 4 decimals so the cell annotations stay readable;
# the same column list labels both axes.
z = np.around(correlation.values, decimals = 4)
x = list(correlation.columns)
y = list(correlation.columns)

fig = ff.create_annotated_heatmap(z, x = x, y = y,
                                  colorscale = 'RdBu',
                                  font_colors = ['black'],
                                  showscale = True)

# Pin the colour scale to the full correlation range [-1, 1].
fig.data[0].update(zmin = -1, zmax = 1)
fig.layout.title = 'Features Correlation'
fig.update_layout(width=800, height=900)
fig.show()

Assuming that only correlations with an absolute value of roughly 0.4 or more are strong enough to be worth exploring, the relationships that we will explore are:
- timestamp & task_container_id - z = 0.4334
- content_id & content_type_id - z = 0.4146
- content_type_id & prior_question_had_explanation - z = -0.3915

As expected the features user_id and row_id have z = 1

Because the train dataframe has 101 million instances, let's use a sample to be able to explore it better

**Exploring the correlation between timestamp & task_container_id (z = 0.4334).**

In [None]:
# 1% of the dataset
train_sample = train.sample(1000000, random_state = 42)
train_sample

In [None]:
# Scatter of timestamp against task_container_id on the sample, with a histogram
# on each margin; the low opacity makes dense regions stand out.
fig = px.scatter(
    train_sample,
    x=train_sample.timestamp,
    y=train_sample.task_container_id,
    opacity=0.1,
    marginal_x="histogram",
    marginal_y="histogram",
)
fig.show()

As expected, we do see some relationship between these two variables, but a weak one.  

We can also see that there is a greater concentration of points in the lower-left side of the graph; that is, most of the timestamps were between 0 and 40 billion and the most frequent task containers were between 0 and 4k.

**Exploring the correlation between content_id & content_type_id (z = 0.4146).**

In [None]:
# Two panels side by side: a violin of content_id per content type (wide) and
# the overall content_id histogram (narrow), both built from the sample.
fig = make_subplots(
    rows=1, cols=2,
    column_widths=[0.7, 0.3],
    subplot_titles=('Boxplot of the content_id for Questions and Lectures',
                    'Distribution of content_id (for comparison)'),
)

violin_trace = go.Violin(x=train_sample.content_type_id,
                         y=train_sample.content_id,
                         box_visible=True)
histogram_trace = go.Histogram(x=train_sample.content_id)

fig.add_trace(violin_trace, row=1, col=1)
fig.add_trace(histogram_trace, row=1, col=2)

# Replace the 0/1 ticks of the first x axis with readable names.
fig.update_layout(
    title_text="Analysing the relationship between content_id and content_type_id",
    showlegend=False,
    xaxis=dict(tickmode='array',
               tickvals=[0, 1],
               ticktext=['Question', 'Lecture']),
)

# Axis titles for each panel.
fig.update_xaxes(title_text="content_type_id", row=1, col=1)
fig.update_yaxes(title_text="content_id", row=1, col=1)
fig.update_xaxes(title_text="content_id", row=1, col=2)

fig.show()

Analysing the graph we can say that:
- For Questions, the content_id range is from 0 to 14k. We can also see, in the second graph, that most of the content is from this range. It agrees with what we saw before, where 98% of the interactions were questions and only 2% were lectures.
- The question content_id distribution is very heterogeneous.
- For Lectures, 75% of the data are from ids over 8.6k, and as content_id grows there are fewer interactions. It explains why we see this tail with small frequency: the major part of the interactions are questions, and because only 2% of instances are lectures, this tail must present small frequency values.
- The lecture content_id distribution is very homogeneous, so we can suppose there are no preferred lectures.

**Exploring the correlation between content_type_id & prior_question_had_explanation (z = -0.3915).**

In [None]:
# Number of interactions per value of the explanation flag, most frequent first.
train.prior_question_had_explanation.value_counts()

In [None]:
# Explanation flag of the question rows (content_type_id 0) and lecture rows (1).
questions = train.loc[train['content_type_id'] == 0, 'prior_question_had_explanation']
lectures = train.loc[train['content_type_id'] == 1, 'prior_question_had_explanation']

# value_counts() orders by frequency, which differs between the two groups, so the
# counts are reindexed to a fixed [False, True] order that matches the labels below.
explanation_order = [False, True]
explanation = ["Hadn't Explanation", "Had Explanation"]

trace1 = go.Bar(x = explanation,
                y = questions.value_counts().reindex(explanation_order, fill_value = 0).values,
                name = 'Questions')

trace2 = go.Bar(x = explanation,
                y = lectures.value_counts().reindex(explanation_order, fill_value = 0).values,
                name = 'Lectures')

data = [trace1, trace2]

layout = go.Layout(title = 'Relationship of Content Type and if the previous question had explanation')

fig = go.Figure(data = data, layout = layout)
fig.show()

Analysing the graph we can conclude that, on the one hand, if the user didn't watch an explanation after solving the previous question bundle, it is more likely that the next interaction will be another question. On the other hand, if the student had an explanation, we can say that the next interaction definitely won't be a lecture.

In [None]:
# Release the two helper series and reclaim their memory.
del questions, lectures
gc.collect()

*Now let's check out other possible relationships.*

**Checking the relationship between prior_question_had_explanation & answered_correctly**

In [None]:
# Explanation flag of the incorrectly (0) and correctly (1) answered questions.
incorrect = train.loc[train['answered_correctly'] == 0, 'prior_question_had_explanation']
correct = train.loc[train['answered_correctly'] == 1, 'prior_question_had_explanation']

# value_counts() orders by frequency; reindex to a fixed [False, True] order so
# every count is drawn under the label it belongs to.
explanation_order = [False, True]
explanation = ["Hadn't Explanation", "Had Explanation"]

trace1 = go.Bar(x = explanation,
                y = incorrect.value_counts().reindex(explanation_order, fill_value = 0).values,
                name = "Incorrect")

trace2 = go.Bar(x = explanation,
                y = correct.value_counts().reindex(explanation_order, fill_value = 0).values,
                name = "Correct")

data = [trace1, trace2]

# The title previously repeated the one from the content-type chart.
layout = go.Layout(title = 'Relationship of answer correctness and if the previous question had explanation')

fig = go.Figure(data = data, layout = layout)
fig.show()

The graphic above shows an unexpected behaviour: if the student didn't see an explanation after solving the last question bundle, it is more likely that he will answer the question correctly. But if he had an explanation, the likelihood is about the same. That is pretty unexpected to me.  
We might think that the cause of this behaviour is that the student moves to the next question bundle while his thoughts are still on the last problems, and he probably still has some questions about the subject. It also suggests that a great next bundle would be something still related to the previous explanation, in order to reinforce the content and improve the learning process.

In [None]:
# Release the two helper series and reclaim their memory.
del incorrect, correct
gc.collect()

**Checking the relationship between task_container_id & answered_correctly**  
The main idea is to check whether there are some containers that are answered incorrectly more often than correctly.

In [None]:
def _top_containers(answer_flag):
    """Return the 30 most frequent task containers for the given answered_correctly value.

    The result has the columns 'task_container' (the id as a string with a ' -'
    suffix) and 'count', sorted by ascending count.
    """
    counts = train[train['answered_correctly'] == answer_flag]['task_container_id'].value_counts().reset_index()
    counts.columns = ['task_container', 'count']
    counts['task_container'] = counts['task_container'].astype(str) + ' -'
    return counts.sort_values(['count']).tail(30)


incorrect = _top_containers(0)
correct = _top_containers(1)

# One horizontal bar chart per outcome, side by side.
fig = make_subplots(rows=1, cols=2, subplot_titles=('Incorrect', 'Correct'))

incorrect_bars = go.Bar(x=incorrect['count'],
                        y=incorrect['task_container'],
                        orientation='h')
correct_bars = go.Bar(x=correct['count'],
                      y=correct['task_container'],
                      orientation='h')
fig.add_trace(incorrect_bars, row=1, col=1)
fig.add_trace(correct_bars, row=1, col=2)

# Show a tick for every container on both y axes.
fig.update_yaxes(title_text='task_container_id', tickvals=incorrect['task_container'], row=1, col=1)
fig.update_yaxes(tickvals=correct['task_container'], row=1, col=2)
fig.update_layout(title_text='30 most frequent task containers for incorrect and correct answers', showlegend=False)

fig.show()

Let's take a closer look at the top 10 most frequent task containers answered correctly and incorrectly. 

In [None]:
# Print the ten most frequent containers of each outcome side by side.
# The lookups are by index label; the labels are the frequency ranks assigned
# when the counts were built (0 = most frequent), so rank 0 is the top container.
for rank in range(10):
    wrong_container = incorrect['task_container'][rank]
    # Strip the trailing ' -' suffix from the container label.
    right_container = correct['task_container'][rank][:-2]
    print(f"Top {rank + 1} incorrect: {wrong_container} Top {rank + 1} correct: {right_container}")

Analysing the graphs and the print above we can conclude the following:
- if the task container, like container 6, has a greater frequency in incorrect answers than in correct answers, we might say that this container is hard.
- if the task container, like container 15, has more or less the same frequency in both the incorrect and the correct answer counts, we might say that this container has middle difficulty.
- if the task container, like container 14, has a greater frequency in correct answers than in incorrect answers, we might say that this container is easy.

This conclusion could be the most important so far, because it allows us to build a new feature where we classify the level of difficulty of the container (could be 3 levels or whatever we want), and this new feature might help to improve our machine learning models!  
Let's create this feature and explore it a little!

In [None]:
# Release the two top-30 frames and reclaim their memory.
del incorrect, correct
gc.collect()

In [None]:
def _container_counts(answer_value):
    """Count, per task container, the rows whose answered_correctly equals `answer_value`.

    Returns a frame with the columns 'task_container' and 'count', sorted by container id.
    """
    return (train.loc[train['answered_correctly'] == answer_value, 'task_container_id']
            .value_counts()
            .rename_axis('task_container')
            .reset_index(name = 'count')
            .sort_values('task_container'))


incorrect = _container_counts(0)
correct = _container_counts(1)

# Align both counts on the container id. The outer join keeps containers that only
# ever appear with one outcome; their missing count is treated as 0.
container_counts = (correct.set_index('task_container')['count'].rename('correct').to_frame()
                    .join(incorrect.set_index('task_container')['count'].rename('incorrect'),
                          how = 'outer')
                    .fillna(0))
n_correct = container_counts['correct']
n_incorrect = container_counts['incorrect']

# Same thresholds and precedence as an if/elif chain: the first matching rule wins,
# and anything matching none of them is "middle".
conditions = [
    n_incorrect >= n_correct * 1.2,
    n_incorrect >= n_correct * 1.05,
    n_incorrect * 1.5 <= n_correct,
    n_incorrect * 1.05 <= n_correct,
]
levels = ["very hard", "hard", "very easy", "easy"]

# Index the result by container id so that a later join on task_container_id matches
# each row to its own container instead of relying on row position.
task_container_difficulty = pd.DataFrame(
    {"difficulty": np.select(conditions, levels, default = "middle")},
    index = container_counts.index,
)

In [None]:
# Display the difficulty table (pandas truncates long frames when rendering).
task_container_difficulty

Great! Now we may plot it and understand the distribution of the difficulty

In [None]:
# Number of task containers assigned to each difficulty level.
task_container_difficulty.value_counts()

First thing to notice is that the "very easy" containers are far more numerous than the other levels. It makes sense when we think that students need to solve easy questions to step into a new subject in a way that they don't feel overwhelmed and afraid to dive into the different areas.  

Another point is that this number is still too big, and we expected to see more questions of the other levels. It may be good to work with the thresholds a bit! I think here is a great opportunity to go crazy and test whatever threshold combinations come to mind.

Now let's insert this results in our train dataset and check how were the distribution of the interactions based on their level of difficulty!

In [None]:
# Attach the difficulty label to every interaction. join(on=...) matches
# train.task_container_id against the *index* of task_container_difficulty, so that
# index must hold the task container ids.
# NOTE(review): re-running this cell fails because the 'difficulty' column already exists.
train = train.join(task_container_difficulty, on = "task_container_id")

Great! Now let's analyse and plot the distribution.

In [None]:
# Rank of each difficulty level, used to order the levels from easiest to hardest.
custom_dict = {
    "very easy": 0,
    "easy": 1,
    "middle": 2,
    "hard": 3,
    "very hard": 4,
}

# Interactions per difficulty level, sorted by the rank above instead of alphabetically.
difficulty_value_counts = (
    train["difficulty"]
    .value_counts()
    .sort_index(key=lambda levels: levels.map(custom_dict))
)
difficulty_value_counts

In [None]:
# Bar chart of the number of interactions per difficulty level.
fig = px.bar(x=difficulty_value_counts.index,
             y=difficulty_value_counts.values)

# Figure title and axis titles in a single layout update.
fig.update_layout(
    title_text="Level of Difficulty Distribution",
    xaxis_title_text="Container level of difficulty",
    yaxis_title_text="Frequency",
)

fig.show()

It seems that very easy task containers make up the greatest part of the interactions, which agrees with what we've said before (more easy tasks must be presented to the users so they can step into new subjects without getting scared and abandoning the program), but there are still too many very easy questions, which indicates that we should change the thresholds.

In order to see the distribution of the other levels, let's plot them without the very easy level of difficulty.

In [None]:
# Same bar chart without the first (lowest-ranked) level, so the remaining
# levels are readable on their own scale.
fig = px.bar(x=difficulty_value_counts.index[1:],
             y=difficulty_value_counts.values[1:])

# Figure title and axis titles in a single layout update.
fig.update_layout(
    title_text='Level of Difficulty Distribution without "Very Easy" level',
    xaxis_title_text="Container level of difficulty",
    yaxis_title_text="Frequency",
)

fig.show()

Looking at the distribution above, we may say that after stepping into a subject with very easy and easy questions, the users prefer hard and very hard questions, and it makes sense because getting sharp in an area demands solving complex tasks!

**Checking the relationship between difficulty & answered_correctly**

In [None]:
# Difficulty label of the incorrectly (0) and correctly (1) answered questions.
incorrect = train.loc[train['answered_correctly'] == 0, 'difficulty']
correct = train.loc[train['answered_correctly'] == 1, 'difficulty']
difficulty_level = ["Very Easy", "Easy", "Middle", "Hard", "Very Hard"]

custom_dict = {"very easy":0,
             "easy":1,
             "middle":2,
             "hard":3,
             "very hard":4}

# Level names from easiest to hardest. Reindexing the counts to this fixed order keeps
# each value under its own label even when a level has no rows in one of the groups.
difficulty_order = sorted(custom_dict, key = custom_dict.get)

trace1 = go.Bar(x = difficulty_level,
                y = incorrect.value_counts().reindex(difficulty_order, fill_value = 0).values,
                name = "Incorrect")

trace2 = go.Bar(x = difficulty_level,
                y = correct.value_counts().reindex(difficulty_order, fill_value = 0).values,
                name = "Correct")

data = [trace1, trace2]

layout = go.Layout(title = 'Relationship of Task Container level of Difficulty and Answers')

fig = go.Figure(data = data, layout = layout)
fig.show()

As we can see, for very easy task containers there are much more correct questions than incorrect. It is something that we already expect to see and confirms the hypothesis so far.

By looking closer at the other levels, we see that the distribution presents a transition: for easy containers there are still more correct answers, for middle difficulty the amount of correct and incorrect answers is about the same, and for hard and very hard there are more incorrect than correct answers.  
This behaviour is exactly what we expect to see in the real world, and it is a great clue that the analysis is on the right path! (Keep in mind that the difficulty levels were derived from these same correct/incorrect counts, so part of this pattern follows from the definition.) 

**Checking the relationship between difficulty & prior_question_had_explanation**  
We expect, intuitively, to see more explanations for more difficult task containers; let's check if it matches reality

In [None]:
# Difficulty label of the interactions without / with an explanation for the
# previous question bundle.
saw_explanation = train['prior_question_had_explanation']
hadnt_explanation = train.loc[saw_explanation == 0, 'difficulty']
had_explanation = train.loc[saw_explanation == 1, 'difficulty']

In [None]:
# Interactions per difficulty level among those that had an explanation.
had_explanation.value_counts()

In [None]:
difficulty_level = ["Very Easy", "Easy", "Middle", "Hard", "Very Hard"]

custom_dict = {"very easy":0,
              "easy":1,
              "middle":2,
              "hard":3,
              "very hard":4}

# Level names from easiest to hardest. Reindexing the counts to this fixed order keeps
# each value under its own label even when a level has no rows in one of the groups.
difficulty_order = sorted(custom_dict, key = custom_dict.get)

trace1 = go.Bar(x = difficulty_level,
                y = hadnt_explanation.value_counts().reindex(difficulty_order, fill_value = 0).values,
                name = "Hadn't Explanation")

trace2 = go.Bar(x = difficulty_level,
                y = had_explanation.value_counts().reindex(difficulty_order, fill_value = 0).values,
                name = "Had Explanation")

data = [trace1, trace2]

layout = go.Layout(title = 'Relationship of Task Container level of Difficulty and if Prior Question Had Explanation')

fig = go.Figure(data = data, layout = layout)
fig.show()

Let's look first at very easy containers. Here, the amount of interactions where the user had an explanation for the previous question bundle is far bigger than the amount where the user hadn't an explanation.  
It matches our hypothesis that the very easy containers are used more by beginners in a subject, which leads to the need to watch some explanation after solving a question bundle and before moving to the next container. It also explains why the amount/rate of "had explanation" is bigger for very easy containers than for very hard ones.

Now, in order to get a better understanding of the behaviour for the other levels, let's plot them without the very easy columns.

In [None]:
# Same comparison without the easiest level. The level names are taken from
# custom_dict in rank order (skipping the first) and the counts are reindexed to
# them, so every value stays under its own label even if a level has no rows.
remaining_levels = sorted(custom_dict, key = custom_dict.get)[1:]

trace1 = go.Bar(x = difficulty_level[1:],
                y = hadnt_explanation.value_counts().reindex(remaining_levels, fill_value = 0).values,
                name = "Hadn't Explanation")

trace2 = go.Bar(x = difficulty_level[1:],
                y = had_explanation.value_counts().reindex(remaining_levels, fill_value = 0).values,
                name = "Had Explanation")

data = [trace1, trace2]

layout = go.Layout(title = 'Relationship of Task Container level of Difficulty and if Prior Question Had Explanation (without "Very Easy")')

fig = go.Figure(data = data, layout = layout)
fig.show()

Here we could point out two interesting points:
- The amount of "had explanation" is pretty much stable around 375k interactions. It may show that just a few users are interested in diving deeper, or have enough difficulty, to watch explanations about the previous question bundle.
- The amount of "hadn't explanation" grows as the level of difficulty grows. It may show that as the difficulty increases the users may lose interest and just don't dive deeper into the explanations. To confirm this hypothesis, a multivariate plot with correct and incorrect answers will come in handy.

# 3. NEXT STEPS 
So far we have explored univariate and bivariate relationships, so the next steps should be:
- Explore multivariate relationships;
- Explore the lectures and questions datasets in the same way that was done for the train dataset;
- Explore the relationships between the 3 datasets!

To be continued...!