## <span style="color:green">train.csv</span>
### <span style="color:green">The variables:</span>

**<span style="color:darkblue">row_id</span>**: (int64) ID code for the row.

<span style="color:darkblue">**user_id**</span>: (int32) ID code for the user.
    
<span style="color:darkblue">**timestamp**</span>: (int64) the time in milliseconds between this user interaction and the first event completion from that user. <br>
<span style="color:crimson">**Continuous variable**</span>

    
<span style="color:darkblue">**user_answer**</span>: (int8) <br />
0, 1, 2, 3 if content_type_id == 0. -1 if content_type_id == 1 (lectures). <br>
<span style="color:crimson">**Categorical variable**</span>


<span style="color:darkblue">**content_id**</span>: (int16) ID code for the user interaction

    
<span style="color:darkblue">**content_type_id**</span>: (int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.<br>
<span style="color:crimson">**Categorical variable**</span>

<span style="color:darkblue">**task_container_id**</span>: (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

<span style="color:darkblue">**prior_question_elapsed_time**</span>: (float32) The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.<br>
<span style="color:crimson">**Continuous variable**</span>

<span style="color:darkblue">**prior_question_had_explanation**</span>: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.<br>
<span style="color:crimson">**Boolean variable**</span>

<span style="color:green">**TARGET**:</span><br>

<span style="color:darkblue">**answered_correctly**</span>: (int8) if the user responded correctly. Read -1 as null, for lectures.<br>



=> We want to predict if the user will answer correctly or not.<br>
We will exclude the value "-1", which corresponds to lectures rather than answers.<br>
<span style="color:magenta">**So we have a binary classification problem to solve.**</span>

## Import libraries

In [None]:
import gc
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import nltk

from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Functions

In [None]:
def print_pie(data_df, column, title_fig, title_legend):
    """
    Draw a pie chart of the relative frequency of each value of a column.

    data_df: dataframe holding the column to plot
    column: name of the column whose values are counted
    title_fig: title displayed above the chart
    title_legend: title of the legend box
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    figure_title = plt.title(title_fig, fontsize=15, weight="bold")
    figure_title.set_position([0, 1.05])
    # Share of each value, ordered by value so the wedges follow the index order.
    value_shares = data_df[column].value_counts(normalize=True).sort_index()
    value_shares.plot(kind='pie',
                      startangle=180,
                      counterclock=False,
                      autopct='%1.1f%%',
                      fontsize=14)
    plt.axis('equal')
    plt.ylabel('')
    # Note: this changes the global matplotlib setting for legend titles.
    plt.rcParams['legend.title_fontsize'] = 'large'
    ax.legend(title=title_legend,
              loc="center right",
              bbox_to_anchor=(1, 0, 1, 1),
              fontsize='medium')
    plt.show()

    
def print_filling_rate(data_df, a_label_abscisse, color_graphe, color_threshold):
    """
    Display a bar chart of the filling rate (share of non-null values)
    of each column, sorted in ascending order.

    data_df: dataframe whose columns are inspected
    a_label_abscisse: if truthy, show the column names on the x axis
        and add a legend
    color_graphe: colour of the bars; an empty string falls back to 'blue'
    color_threshold: accepted for compatibility; not used by this function
    """
    bar_color = 'blue' if color_graphe == '' else color_graphe
    # Non-null count divided by the number of rows, lowest rate first.
    filling_rates = (data_df.count() / len(data_df)).sort_values()
    positions = np.arange(len(filling_rates))
    bar_width = 0.5
    fig, axes = plt.subplots(1, 1, figsize=(6, 3), dpi=100)
    bars = axes.bar(positions, filling_rates.values, bar_width, color=bar_color)
    axes.set_ylabel('Filling rates')
    if a_label_abscisse:
        axes.set_xticks(positions)
        axes.set_xticklabels(filling_rates.index, fontsize=10, rotation=90)
        axes.legend([bars], ['Filling rates'])
        
def plot_distribution_of_number_of_tags(nb_tags, with_return, tags_counter=None):
    """
    Plot the occurrence counts of the `nb_tags` most frequent tags,
    from the most frequent to the least frequent.

    nb_tags: number of most common tags to include
    with_return: if truthy, return the dataframe of (tag, count) pairs
    tags_counter: object exposing `most_common(n)` (e.g. a frequency
        distribution of tags). When omitted, the notebook-level variable
        `tags_frequence` is used, which must then already be defined.

    Returns a dataframe with columns 'tags' and 'nb_tags' when
    `with_return` is truthy, otherwise None.
    """
    if tags_counter is None:
        # Backward-compatible fallback on the notebook-level counter.
        tags_counter = tags_frequence
    tags_most_common = tags_counter.most_common(nb_tags)
    tags_df = pd.DataFrame(tags_most_common, columns=['tags', 'nb_tags'])
    # Only the counts are plotted, in descending order.
    tags_counts = tags_df['nb_tags'].sort_values(ascending=False).values
    plt.plot(tags_counts)
    plt.title("Distribution of number of tags")
    plt.grid()
    plt.xlabel("Number of tags")
    plt.ylabel("Number of occurences")
    plt.show()
    if with_return:
        return tags_df

## Loading data

In [None]:
%%time

# Explicit column types keep the memory footprint of the training frame small.
dtypes = dict(
    row_id="int64",
    timestamp="int64",
    user_id="int32",
    content_id="int16",
    content_type_id="int8",
    task_container_id="int16",
    user_answer="int8",
    answered_correctly="int8",
    prior_question_elapsed_time="float32",
    prior_question_had_explanation="boolean",
)

# Only the first million rows are read.
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    low_memory=False,
    nrows=1_000_000,
    dtype=dtypes,
)
print(f"Train size: {train_df.shape}")

In [None]:
# Memory used by each column, in bytes (deep=True also inspects object contents).
train_df.memory_usage(deep=True)

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Summary statistics for every column, whatever its dtype.
train_df.describe(include='all')

In [None]:
# Number of missing values per column, then the same as a share of all rows.
train_missing_counts = train_df.isnull().sum()
print(train_missing_counts)
print("****************************************")
print(train_missing_counts / len(train_df))

print_filling_rate(train_df, True, "blue", "blue")

Distribution of questions and lectures content_type :

In [None]:
# Map each content_type_id to its label explicitly: value_counts() orders by
# frequency, so positional labels would be wrong if lectures ever outnumbered
# questions in the loaded sample.
content_type_labels = {0: 'Questions', 1: 'Lectures'}
content_type_shares = (train_df["content_type_id"]
                       .value_counts(normalize=True)
                       .reindex(list(content_type_labels), fill_value=0))

fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title('Percentage of content type : questions or lectures', fontsize=15, weight="bold")
labels = [content_type_labels[type_id] for type_id in content_type_shares.index]
colors = ['lightgray', 'peachpuff']
explode = (0.1, 0.2)
content_type_shares.plot(kind='pie',
                         ax=ax,
                         labels=labels,
                         colors=colors,
                         explode=explode,
                         startangle=50,
                         autopct='%1.1f%%',
                         fontsize=13)
ax.axis('equal')
ax.set_ylabel('')
plt.show()

# Raw counts per content type, ordered by id.
print(train_df['content_type_id'].value_counts().sort_index().to_frame())

1. The majority of the user interactions are <span style="color:magenta">**questions**</span>: 98% vs. 2% for lectures.

In [None]:
%%time

# Load the question metadata file; it is joined to the training rows on question_id below.
questions_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')


## Merge of train_df and questions_df

In [None]:
# Keep only question rows (content_type_id == 0), attach the question
# metadata with a left join, and order the result by row_id.
train_questions_df = (
    train_df.loc[train_df['content_type_id'] == 0]
    .merge(questions_df,
           how='left',
           left_on='content_id',
           right_on='question_id')
    .sort_values('row_id')
)

In [None]:
# Columns available after the merge.
train_questions_df.columns

In [None]:
# Summary statistics of the numeric columns of the merged frame.
train_questions_df.describe()

In [None]:
# shape is (rows, columns): report each dimension from its own position.
n_rows, n_cols = train_questions_df.shape
print("The dataset contains {} rows and {} columns. \n".format(n_rows, n_cols))
# Number of distinct values per column.
for col in train_questions_df:
    print("The column {} has {} unique values.".
          format(col, train_questions_df[col].nunique()))

## Users

3 824 unique users

In [None]:
# Number of question rows per user. Selecting the column and naming the
# result explicitly avoids relying on how a list aggregation combined with
# as_index=False lays out its columns, which the positional rename depended on.
nb_questions_per_user = (train_questions_df
                         .groupby('user_id')['row_id']
                         .count()
                         .reset_index(name='nb_questions'))

print("Mean : {}".format(nb_questions_per_user["nb_questions"].mean()))
print("Min : {}".format(nb_questions_per_user["nb_questions"].min()))
print("Max : {}".format(nb_questions_per_user["nb_questions"].max()))
print("Median : {}".format(nb_questions_per_user["nb_questions"].median()))


The median is 41: 50% of the users have answered 41 questions or fewer.

Max = 10797. The mean is sensitive to outliers like this one (see the value printed above).

Let's have a look at the distribution of the number of questions per user:

In [None]:
# Histogram of the number of questions answered by each user.
fig, ax = plt.subplots(figsize=(12, 6))
nb_questions_per_user['nb_questions'].plot.hist(bins=100, ax=ax)
ax.set_title("Distribution of the number of questions by user")
plt.xticks(rotation=0)
ax.set_xlabel("Number of questions by user")
plt.show()

In [None]:
# Zoom on users below a question-count threshold. The title is built from the
# same threshold as the filter so the two can no longer disagree.
max_questions_shown = 200
users_below_threshold = nb_questions_per_user[
    nb_questions_per_user['nb_questions'] < max_questions_shown]

fig = plt.figure(figsize=(12,6))
users_below_threshold['nb_questions'].plot.hist(bins=100)
plt.title("Distribution of the number of questions by user (max {} questions)".
          format(max_questions_shown))
plt.xticks(rotation=0)
plt.xlabel("Number of questions by user")
plt.show()

In [None]:
# Drop the per-user counts, which are not needed below, and trigger a garbage collection.
del nb_questions_per_user
gc.collect()

In [None]:
# Check if we have the same result if we calculate if the answer is correct and the target
user_accuracy_df = train_questions_df[['user_id', 'user_answer', 'correct_answer','answered_correctly']]

user_accuracy_df['answered_correctly_calc'] = user_accuracy_df['user_answer'] -\
            user_accuracy_df['correct_answer']

user_accuracy_df['different_result'] = 1
user_accuracy_df.loc[user_accuracy_df['answered_correctly_calc'] != 0, 'different_result'] = 0
user_accuracy_df['is_different_result'] = user_accuracy_df['different_result'] -\
            user_accuracy_df['answered_correctly']


In [None]:
# Sum of the mismatch column; 0 is expected when the recomputed result matches the target.
print(user_accuracy_df['is_different_result'].sum())


The column 'user_answer' is only informative, but it may reveal a pattern: when users do not know the answer, they may tend to choose the same option. Or a question may contain a subtlety that leads lower-level students to make the same mistake; detecting this could reveal a misunderstanding and suggest a way to help students.

In [None]:
# Share of each answer chosen by users, followed by the raw counts.
print_pie(data_df=user_accuracy_df,
          column='user_answer',
          title_fig="Percentage of user answers:",
          title_legend="User answers: ")
print(user_accuracy_df['user_answer'].value_counts())

The user answers are 0, 1, 2 or 3. But what does 0 mean: is it a possible answer, or does it mean "no answer"?
Let's have a look at the possible answers (in questions_df):

In [None]:
# Share of each value of 'correct_answer' across the question metadata.
print_pie(questions_df, 'correct_answer', 
          "Percentage of possible answers:", 
          "Possible answers: ")

Ok, 0 is a possible answer.

In [None]:
# Map each target value to its label explicitly: value_counts() orders by
# frequency, so positional labels would be swapped if incorrect answers were
# the majority.
answer_labels = {1: 'Correct', 0: 'Uncorrect'}
answer_shares = (user_accuracy_df["answered_correctly"]
                 .value_counts(normalize=True)
                 .reindex(list(answer_labels), fill_value=0))

fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title('Percentage of questions correctly answered:', fontsize=15, weight="bold")
labels = [answer_labels[value] for value in answer_shares.index]
colors = ['lightgray', 'peachpuff']
explode = (0.1, 0)
answer_shares.plot(kind='pie', ax=ax, labels=labels, colors=colors, explode=explode,
                   startangle=50, autopct='%1.1f%%', fontsize=13)
ax.axis('equal')
ax.set_ylabel('')
plt.show()

# Raw counts per target value, ordered by value.
print(user_accuracy_df['answered_correctly'].value_counts().sort_index().to_frame())

### The part of questions : 

In [None]:
# Mean of the target per part. The aggregated column is renamed by name
# rather than by position, so the result does not depend on how a list
# aggregation combined with as_index=False lays out its columns.
answer_part_df = (train_questions_df
                  .groupby('part', as_index=False)['answered_correctly']
                  .mean()
                  .rename(columns={'answered_correctly': 'answered_correctly_mean'}))

In [None]:
# Display the mean of the target for each part.
answer_part_df

The TOEIC has 2 sections: Listening (parts 1 -> 4) and Reading (parts 5 -> 7).
The Listening section seems to be easier than the Reading section.
Parts 1 -> 3 are the easiest parts. Parts 4 and 5 are the most difficult.

In [None]:
# Drop the per-part summary and trigger a garbage collection.
del answer_part_df
gc.collect()

In [None]:
# Count of bundle_id entries per part, with the aggregated column renamed by
# name rather than by position.
# NOTE(review): count() counts non-null rows, so 'nb_bundle' is the number of
# question rows per part; use nunique() if distinct bundles are wanted.
answer_nb_bundle_by_part_df = (questions_df
                               .groupby('part', as_index=False)['bundle_id']
                               .count()
                               .rename(columns={'bundle_id': 'nb_bundle'}))
answer_nb_bundle_by_part_df

We have between 992 and 5511 possible bundles of questions by part.


In [None]:
# Distinct (bundle_id, part) pairs. drop_duplicates() returns a new frame, so
# nothing is modified in place on a selection of questions_df.
answer_nb_part_by_bundly_df = questions_df[['bundle_id', 'part']].drop_duplicates(keep='first')

# Number of distinct parts each bundle appears in, with the aggregated column
# renamed by name rather than by position.
result_nb_part_by_bundle = (answer_nb_part_by_bundly_df
                            .groupby('bundle_id', as_index=False)['part']
                            .count()
                            .rename(columns={'part': 'nb_part'}))
result_nb_part_by_bundle.describe()

One part can have several bundles.
But one bundle belongs to only one part.

In [None]:
# Drop the two intermediate frames and trigger a garbage collection.
del answer_nb_part_by_bundly_df
del result_nb_part_by_bundle
gc.collect()

In [None]:
# Mean of the target per (bundle_id, part), with the aggregated column renamed
# by name rather than by position.
answer_bundle_df = (train_questions_df
                    .groupby(['bundle_id', 'part'], as_index=False)['answered_correctly']
                    .mean()
                    .rename(columns={'answered_correctly': 'answered_correctly_mean'}))

In [None]:
# Histogram of the per-bundle mean of the target.
fig, ax = plt.subplots(figsize=(12, 6))
answer_bundle_df['answered_correctly_mean'].plot.hist(bins=100, ax=ax)
ax.set_title("Distribution of mean of correct answers by bundle")
plt.xticks(rotation=0)
ax.set_xlabel("mean of correct answer by bundle")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Overlay the per-bundle mean of the target for two groups of parts.
bundle_part = answer_bundle_df['part']
in_parts_1_to_3 = bundle_part < 4
in_parts_4_to_5 = (bundle_part >= 4) & (bundle_part < 6)

plt.hist(answer_bundle_df[in_parts_1_to_3]['answered_correctly_mean'],
         bins=50,
         color="blue",
         alpha=0.5,
         label='Bundle in part 1-3')
plt.hist(answer_bundle_df[in_parts_4_to_5]['answered_correctly_mean'],
         bins=50,
         color="red",
         alpha=0.25,
         label='Bundle in part 4-5')
plt.legend(loc='upper left')

plt.title('Distribution of mean of correctly answered questions by group of bundle')
plt.grid()

plt.show()

In [None]:
# Drop the per-bundle summary and trigger a garbage collection.
del answer_bundle_df
gc.collect()

## timestamp

In [None]:
# Max, min and mean timestamp per user. Named aggregation sets the output
# column names directly instead of renaming a multi-level result by position.
user_timestamp_df = (train_df
                     .groupby('user_id')['timestamp']
                     .agg(max_timestamp='max',
                          min_timestamp='min',
                          mean_timestamp='mean')
                     .reset_index())

In [None]:
# Summary statistics of the per-user timestamp aggregates.
user_timestamp_df.describe()

The min timestamp is 0. So we should have the history of interactions from the first interaction.

'timestamp' is in milliseconds. It is difficult for a human to get a sense of how long these periods are in this unit. We will convert it to months and see whether these periods are easier to interpret.

In [None]:
# Milliseconds in a 365-day year; a month is taken as one twelfth of a year.
ms_per_year = 1000 * 60 * 60 * 24 * 365
train_df['timestamp_by_month'] = train_df['timestamp'] / ms_per_year * 12

fig, ax = plt.subplots(figsize=(12, 6))
train_df['timestamp_by_month'].plot.hist(bins=100, ax=ax)
ax.set_title("Histogram of timestamp converted in month")
plt.xticks(rotation=0)
ax.set_xlabel("Months between this user interaction and the first event completion from that user")
plt.show()

In [None]:
# Zoom on interactions that happened between 1 and 6 months after the user's
# first event.
fig = plt.figure(figsize=(12,6))
train_df[(train_df['timestamp_by_month'] >= 1) &
         (train_df['timestamp_by_month'] < 6)]['timestamp_by_month'].plot.hist(bins=100)
# Adjacent string literals are joined without the indentation that a
# backslash continuation inside a literal would embed in the text.
plt.title("Histogram of timestamp converted in month "
          "between 1 month and 6 months")
plt.xticks(rotation=0)
plt.xlabel("Months between this user interaction and "
           "the first event completion from that user")
plt.show()

In [None]:
# Convert each user's max timestamp from milliseconds to months
# (a year is taken as 365 days, a month as one twelfth of a year).
# Note: the new column name contains a space ('max_timestamp_in month').
user_timestamp_df['max_timestamp_in month'] = user_timestamp_df['max_timestamp'] / \
                   (1000 * 60 * 60 * 24 * 365) * 12

In [None]:

# Histogram of each user's max timestamp, converted from milliseconds to months.
max_timestamp_months = user_timestamp_df["max_timestamp"] / 1000 / 60 / 60 / 24 / 365 * 12
fig, ax = plt.subplots(figsize=(12, 6))
max_timestamp_months.plot.hist(bins=100, ax=ax)
ax.set_title("Histogram of max timestamp of users")
plt.xticks(rotation=0)
ax.set_xlabel("max timestamp of users")
plt.show()

## Questions

In [None]:
# Column names, dtypes and non-null counts of the question metadata.
questions_df.info()

In [None]:
# Summary statistics of the numeric columns of the question metadata.
questions_df.describe()

In [None]:
# Number of missing values per column, then the same as a share of the rows
# of questions_df itself (the ratio must use this frame's own length).
questions_missing_counts = questions_df.isnull().sum()
print(questions_missing_counts)
print("****************************************")
print(questions_missing_counts / len(questions_df))

print_filling_rate (questions_df, True, "green", "green")

Only 1 row of "tags" has missing value.

## Tags

In [None]:
# Replace missing tags with an empty string. The result is assigned back to
# the column: an in-place fillna on a column selection may act on a temporary
# object and leave questions_df unchanged (e.g. under pandas copy-on-write).
questions_df['tags'] = questions_df['tags'].fillna("")
# Tags are whitespace-separated, so the number of tags is the number of tokens.
questions_df["nb_tags"] = questions_df["tags"].apply(lambda text: len(text.split()))

questions_df["nb_tags"].describe()

In [None]:
import matplotlib.style as style

# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in newer
# matplotlib releases; use whichever name this installation provides.
for style_name in ('seaborn-v0_8-dark-palette', 'seaborn-dark-palette'):
    if style_name in style.available:
        style.use(style_name)
        break

fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title("Distribution of number of tags by question",
             fontsize=15, weight="bold");

# Pass the values as `x=` (recent seaborn treats the first positional argument
# as `data`) and draw on the axes created above.
sns.countplot(x=questions_df['nb_tags'], palette="Set1", ax=ax)
ax.set_ylabel("Frequency", fontsize=14)
ax.set_xlabel("Number of tags by question", fontsize=14);

Create 2 lists:

- one with all tags of all questions (the same tag can appear several times in this list);
- one with unique tags (each tag appears only once).

In [None]:
# Split each whitespace-separated tag string into a list of tags.
questions_df['tags_list'] = questions_df['tags'].apply(str.split)

# Flat list of every tag occurrence across all questions.
tags_list = []
for question_tags in questions_df['tags_list'].values:
    tags_list.extend(question_tags)
print(f'{len(tags_list)} tags are used in questions.')

# Each tag kept once.
tags_unique_list = list(set(tags_list))
print(f'There are {len(tags_unique_list)} unique tags.')

In [None]:
# Frequency of each tag over all questions; the plotting helper below reads
# this notebook-level variable.
tags_frequence= nltk.FreqDist(tags_list)

# Plot the occurrence counts of all unique tags.
plot_distribution_of_number_of_tags(len(tags_unique_list), False)

In [None]:
# Same plot restricted to the 25 most common tags.
plot_distribution_of_number_of_tags(25, False)

In [None]:
# Print the 10 most common tags, most frequent first.
words_most_common = tags_frequence.most_common(10)
fq_words_df = pd.DataFrame(words_most_common, columns=['tags', 'nb_tags'])
top_tags = fq_words_df['tags'].head(10)
print(top_tags.to_list())

# Release the temporary frame and trigger a garbage collection.
del fq_words_df
gc.collect()