### Let's start by cleaning the data as in part1 but using a pipe for cleaner code

In [None]:
#loading data

import numpy as np 
import pandas as pd 
import riiideducation 
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import os
import warnings 
warnings.filterwarnings('ignore')

# Walk the competition input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input/riiid-test-answer-prediction'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Column dtypes are pinned up front so the first 3*10**6 rows are parsed straight into compact types.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    low_memory=False,
    nrows=3*(10**6),
    dtype=train_dtypes,
)
train_df.head()

In [None]:
# Question metadata; seperate_interactions joins it to the interactions on question_id.
question = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
question.head()

In [None]:
# Lecture metadata; seperate_interactions joins it to the interactions on lecture_id.
lecture = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')
lecture.head()

In [None]:
def seperate_interactions(train_df):
    """Split the interaction log into question rows and lecture rows, each with its metadata.

    The whole log is left-joined against the module-level ``question`` table
    (on ``content_id == question_id``) and, separately, against the
    module-level ``lecture`` table (on ``content_id == lecture_id``). Each
    joined frame is then restricted by ``content_type_id``: 0 for questions,
    1 for lectures. The metadata column ``part`` is renamed to ``test_part``
    on the question side and to ``category`` on the lecture side.

    Returns:
        tuple: ``(questions_interactions, lectures_interactions)``.
    """
    with_question_meta = train_df.merge(
        question, how='left', left_on='content_id', right_on='question_id'
    )
    questions_interactions = (
        with_question_meta.loc[with_question_meta.content_type_id == 0]
        .rename(columns={'part': 'test_part'})
    )

    with_lecture_meta = train_df.merge(
        lecture, how='left', left_on='content_id', right_on='lecture_id'
    ).rename(columns={'part': 'category'})
    lectures_interactions = with_lecture_meta.loc[with_lecture_meta.content_type_id == 1]

    return questions_interactions, lectures_interactions
def fill_question_nulls(questions_interactions,elapse_val=-1,expl_val=-1):
    '''Fill the nulls in prior_question_elapsed_time and prior_question_had_explanation.

    The frame is modified in place and the same object is returned, so the
    function can be used with ``DataFrame.pipe``.

    Args:
        questions_interactions: frame holding both ``prior_question_*`` columns.
        elapse_val: replacement for missing ``prior_question_elapsed_time``.
        expl_val: value written to ``prior_question_had_explanation`` on the rows
            that were originally missing, after the column is cast to ``int8``.

    Returns:
        The same frame, with ``prior_question_had_explanation`` stored as ``int8``.
    '''
    # Record the missing rows as a positional mask *before* filling. Looking the rows
    # up again by index label would also overwrite valid rows that share a label
    # with a missing one.
    missing_explanation = questions_interactions['prior_question_had_explanation'].isnull().to_numpy()
    # False is only a temporary placeholder so the column can be cast to int8.
    values = {'prior_question_elapsed_time': elapse_val, 'prior_question_had_explanation': False}
    questions_interactions.fillna(value=values,inplace=True)
    questions_interactions['prior_question_had_explanation'] = questions_interactions['prior_question_had_explanation'].astype('int8')
    questions_interactions.loc[missing_explanation,'prior_question_had_explanation'] = expl_val
    return questions_interactions


In [None]:
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes and report the saving.

    For every numeric column:

    * missing values are replaced by a sentinel one below the column minimum,
      because plain integer dtypes cannot hold NA;
    * if every value is a whole number the column is cast to the narrowest
      unsigned (minimum >= 0) or signed integer dtype whose range contains it;
    * otherwise the column is cast to ``float32``.

    Non-numeric columns (strings, categories, datetimes, ...) are left untouched.
    The frame is modified in place and also returned, so the function works with
    ``DataFrame.pipe``. Memory usage before and after is printed.

    Args:
        props: the DataFrame to shrink.

    Returns:
        The same DataFrame with downcast columns.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")

    for col in props.columns:
        series = props[col]
        if not pd.api.types.is_numeric_dtype(series.dtype):
            continue

        # Integer does not support NA, therefore NA needs to be filled.
        filled = False
        if series.isna().any():
            sentinel = series.min() - 1
            if pd.isna(sentinel):
                # Entirely missing column: there is no range to downcast to.
                continue
            series = series.fillna(sentinel)
            filled = True

        # Bounds are taken AFTER filling so the sentinel takes part in the range check;
        # otherwise a sentinel of -1 could be cast into an unsigned dtype.
        mn = series.min()
        mx = series.max()

        # A column is integral only if every single value is whole. Summing the signed
        # differences would let +0.5 and -0.5 cancel out and wrongly truncate the column.
        if pd.api.types.is_integer_dtype(series.dtype) or pd.api.types.is_bool_dtype(series.dtype):
            is_int = True
        else:
            as_float = series.to_numpy(dtype=np.float64)
            is_int = bool(np.isfinite(as_float).all() and (as_float == np.trunc(as_float)).all())

        target = None
        if not is_int:
            # Make float datatypes 32 bit
            target = np.float32
        elif mn >= 0:
            # Make unsigned integer datatypes
            if mx < 255:
                target = np.uint8
            elif mx < 65535:
                target = np.uint16
            elif mx < 4294967295:
                target = np.uint32
            else:
                target = np.uint64
        else:
            # Make signed integer datatypes; the column is left as-is if none fits.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                if mn > bounds.min and mx < bounds.max:
                    target = candidate
                    break

        if target is not None:
            props[col] = series.astype(target)
        elif filled:
            props[col] = series

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props

In [None]:
# Split the raw log into question and lecture interactions, then fill the nulls
# of the question rows and shrink their dtypes.
questions_interactions, lectures_interactions = seperate_interactions(train_df)
questions_interactions_cleaned = reduce_mem_usage(fill_question_nulls(questions_interactions))

In [None]:
# Preview the cleaned question interactions.
questions_interactions_cleaned.head()

In [None]:
# Columns that carry no information for the respective interaction type.
non_sense_columns_lecture = ['content_type_id','user_answer','answered_correctly','prior_question_elapsed_time','prior_question_had_explanation']
non_sense_columns_question =['content_id','content_type_id']
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a second
# in-place drop would otherwise raise KeyError.
questions_interactions_cleaned.drop(columns=non_sense_columns_question,inplace=True,errors='ignore')
lectures_interactions.drop(columns=non_sense_columns_lecture,inplace=True,errors='ignore')

### Now let's try to answer some questions

# Are students getting better after they spend much time on the app ?

In [None]:
# let's first get the maximum time a student spent on the app
# Assuming timestamp is in milliseconds, 1000*60*60 converts it to hours; the previous
# divisor also contained *24, which yields days while the message says "hour".
print('The maximum time is {} hour'.format(questions_interactions_cleaned.timestamp.max()/(1000*60*60)))

Let's see the distribution of the overall time spent by the students in hours

In [None]:
# Largest timestamp per user, converted to hours (assuming timestamp is in milliseconds).
# The unit must be hours because x is later compared against 24*15, the number of
# hours in 15 days; dividing by an extra *24 would make that threshold mean 360 days.
x=questions_interactions_cleaned.groupby('user_id').timestamp.max()/(1000*60*60)
plt.hist(x=x,bins=100);
plt.xlabel('hours between first and last interaction')
plt.ylabel('number of users');


In [None]:
# Number of users at each distinct total-time value, ordered by that value.
x.value_counts().sort_index()

> We can see that a lot of users don't continue using the app 

> Let's see how they are doing compared with those who use the app for longer periods

## Let's look at the users who used the app for less than half a month

In [None]:
# Users whose largest timestamp value x falls below the 24*15 threshold are treated as early quitters.
# NOTE(review): 24*15 is the number of hours in 15 days, so this means "half a month"
# only if x is expressed in hours — confirm the unit of x.
students_quit_early = x[x<24*15]
df_early_quit = questions_interactions_cleaned[questions_interactions_cleaned.user_id.isin(students_quit_early.index)]
# Distribution of each early quitter's mean correctness.
df_early_quit.groupby('user_id').answered_correctly.mean().hist(grid=False);

The users mean performance is slightly right skewed and has mean at about 0.6

## Now let's check other students performance

In [None]:
# Every user that is not in the early-quit group.
df_cont = questions_interactions_cleaned[~questions_interactions_cleaned.user_id.isin(students_quit_early.index)]
# Distribution of each continuing user's mean correctness.
df_cont.groupby('user_id').answered_correctly.mean().hist(bins=19,grid=False);

In [None]:
# Percentage of all question records that belong to the continuing users.
100*df_cont.shape[0]/questions_interactions_cleaned.shape[0]

In [None]:
# Number of continuing users. df_cont is the complement of x < 24*15, so the matching
# condition is >= (a strict > would leave out users sitting exactly on the threshold).
x[x>=24*15].shape

> There are just 511 students who continued past the half-month threshold used above, and they account for just 18% of the total records.

> Their distribution is more right skewed which suggests that they have better grades

Let's see the relation between time spent and score for these students

In [None]:
# Per continuing user: mean correctness and the standard deviation of the
# timestamp after dividing by 1000*60*60*24 and rounding.
df_new = (
    df_cont
    .assign(timestamp=(df_cont['timestamp']/(1000*60*60*24)).round())
    .groupby('user_id')
    .agg({'answered_correctly':'mean','timestamp':'std'})
)
sns.regplot(x='timestamp',y='answered_correctly',data=df_new,fit_reg=True);

  > The standard deviation values start from 50 to 400 and looks to have negative relationship with the performance with high outliers.

In [None]:
# Same aggregation as for the continuing users, this time for the early quitters.
df_new = (
    df_early_quit
    .assign(timestamp=(df_early_quit['timestamp']/(1000*60*60*24)).round())
    .groupby('user_id')
    .agg({'answered_correctly':'mean','timestamp':'std'})
)
sns.regplot(x='timestamp',y='answered_correctly',data=df_new,fit_reg=True);

> The relation looks to be positive with a lot of students with low time standard deviation and maximum standard deviation of 175.

## Is there a relation between the part (relevant section of the TOEIC test) being answered and the performance ? 

In [None]:
# Take the first colour of the current seaborn palette so all bars share one colour.
color = sns.color_palette()[0]
# Number of question records per test part.
sns.countplot(data=questions_interactions_cleaned,x='test_part',color=color);

Most of the questions belong to the 5th part, followed by the 2nd part.

In [None]:
# Mean correctness per test part over all users.
questions_interactions_cleaned.groupby('test_part').answered_correctly.mean()

The first 5 parts appear to follow a decreasing score pattern.

In [None]:
# Mean correctness per test part, early quitters only.
df_early_quit.groupby('test_part').answered_correctly.mean()

In [None]:
# Mean correctness per test part, continuing users only.
df_cont.groupby('test_part').answered_correctly.mean()

> The ranking of these parts is independent of the student category.

# Let's check the performance against the task_container_id

In [None]:
# Mean correctness per task container, and the distribution of those means.
containers_perf = questions_interactions_cleaned.groupby('task_container_id').answered_correctly.mean()
containers_perf.hist(grid=False,bins=100);

> A lot of the task containers have only one record, so their mean is exactly 0 or 1.

In [None]:
# Keep only containers whose mean is strictly between 0 and 1. One combined mask is used
# because chaining [..][..] applies a mask built on the unfiltered series to an
# already-filtered one, which relies on implicit index re-alignment.
containers_perf_filtered = containers_perf[(containers_perf>0) & (containers_perf<1)]
containers_perf_filtered.hist(grid=False,bins=200);

In [None]:
# Number of containers left after filtering.
containers_perf_filtered.shape

In [None]:
# The five most frequent mean values among the filtered containers.
containers_perf_filtered.value_counts().nlargest(5)

> The mean of the filtered is high at about 0.66

> It can be observed that we have peaks at 0.667, 0.5, 0.75 and 0.33

> The peak of 0.667 can appear when a container has only 3 questions, one of them answered wrongly to have a 2/3 score.


# Are students performing well when they see the lecture related to the question before answering ?


In [None]:
# Smallest timestamp at which each user has a lecture interaction in each category.
first_lecture = lectures_interactions.groupby(['user_id','category'])['timestamp'].min().reset_index()
# Key of the form "<user_id>_<category>", used to join against the question interactions.
lecture_key = first_lecture['user_id'].astype(str)+'_'+first_lecture['category'].astype('int32').astype(str)
user_lectures_info = pd.DataFrame({'user_cat': lecture_key, 'min_time': first_lecture['timestamp']})
user_lectures_info.head()

In [None]:
# Work on a copy so the cleaned question interactions stay untouched.
questions_interactions_cleaned_copy = questions_interactions_cleaned.copy()
# Build a "<user_id>_<test_part>" key on the question side.
questions_interactions_cleaned_copy['user_cat'] = questions_interactions_cleaned_copy['user_id'].astype(str)+'_'+questions_interactions_cleaned_copy['test_part'].astype(str)
# Left merge: every question row is kept; rows whose key is absent from user_lectures_info get a missing min_time.
questions_interactions_cleaned_copy = questions_interactions_cleaned_copy.merge(user_lectures_info, on = 'user_cat', how = 'left') 
questions_interactions_cleaned_copy.head()

In [None]:
# Rows without a matching lecture get a min_time later than every question timestamp,
# so the comparison below is False for them. Only min_time is filled: a frame-wide
# fillna would also write this sentinel into any other column that has nulls.
never_watched = questions_interactions_cleaned_copy.timestamp.max()+1
questions_interactions_cleaned_copy['min_time'] = questions_interactions_cleaned_copy['min_time'].fillna(never_watched)
questions_interactions_cleaned_copy['is_lec_watched'] = questions_interactions_cleaned_copy['timestamp'] > questions_interactions_cleaned_copy['min_time']


In [None]:
# Pass the column as x=: recent seaborn releases treat the first positional argument
# as `data`, not as the variable to count.
sns.countplot(x=questions_interactions_cleaned_copy.is_lec_watched);

In [None]:
# Mean correctness for rows flagged is_lec_watched versus the rest.
questions_interactions_cleaned_copy.groupby('is_lec_watched').answered_correctly.mean()

### How much the early quitting students watch the lectures before answering compared with those who didn't quit early ?

In [None]:
# Question rows of the early quitters only.
early_lec_watch = questions_interactions_cleaned_copy[questions_interactions_cleaned_copy.user_id.isin(students_quit_early.index)]
# Pass the column as x=: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=early_lec_watch.is_lec_watched);


In [None]:
# Question rows of the users that are not early quitters.
cont_lec_watch = questions_interactions_cleaned_copy[~questions_interactions_cleaned_copy.user_id.isin(students_quit_early.index)]

# Pass the column as x=: recent seaborn releases treat the first positional argument as `data`.
sns.countplot(x=cont_lec_watch.is_lec_watched);

> The early quitting students watched lectures less

In [None]:
# Mean correctness by is_lec_watched, early quitters only.
early_lec_watch.groupby('is_lec_watched').answered_correctly.mean()

In [None]:
# Mean correctness by is_lec_watched, continuing users only.
cont_lec_watch.groupby('is_lec_watched').answered_correctly.mean()

### Watching the lecture before answering looks to improve the early quitting students performance and doesn't affect the others much.

# Let's see how the type of the lecture affects the previous results

In [None]:
# Smallest lecture timestamp per (user, category, type_of).
user_lectures_info = lectures_interactions.groupby(['user_id','category','type_of']).timestamp.min().reset_index()
# The key only encodes user and category, so it repeats when a user has several type_of values in one category.
user_lectures_info['user_cat'] = user_lectures_info['user_id'].astype(str)+'_'+user_lectures_info['category'].astype('int32').astype(str)
user_lectures_info = user_lectures_info[['user_cat','type_of','timestamp']]
user_lectures_info.rename(columns = {'timestamp': 'min_time'}, inplace = True)
user_lectures_info.head()

In [None]:
# Rebuild the working copy from the cleaned question interactions.
questions_interactions_cleaned_copy = questions_interactions_cleaned.copy()
questions_interactions_cleaned_copy['user_cat'] = questions_interactions_cleaned_copy['user_id'].astype(str)+'_'+questions_interactions_cleaned_copy['test_part'].astype(str)
# NOTE(review): user_cat can repeat in user_lectures_info (one row per type_of), so this
# left merge can emit a question row once per matching lecture type. Row counts and means
# taken on this frame are therefore weighted by the number of matches.
questions_interactions_cleaned_copy = questions_interactions_cleaned_copy.merge(user_lectures_info, on = 'user_cat', how = 'left') 
# Unmatched rows: min_time later than every question timestamp, and type_of 'na'.
values = {'min_time':questions_interactions_cleaned_copy.timestamp.max()+1,'type_of':'na'} 
questions_interactions_cleaned_copy.fillna(value=values,inplace=True)
questions_interactions_cleaned_copy['is_lec_watched'] = questions_interactions_cleaned_copy['timestamp'] > questions_interactions_cleaned_copy['min_time']
questions_interactions_cleaned_copy.head()

In [None]:
# One bar plot per lecture type: mean correctness by is_lec_watched.
# Loop-local names are deliberately distinct from the notebook-level `x` (per-user time
# series) and `df`, so re-running earlier cells after this one still sees the right objects.
for val in questions_interactions_cleaned_copy.type_of.unique():
    type_rows = questions_interactions_cleaned_copy[questions_interactions_cleaned_copy.type_of==val]
    watched_perf = type_rows.groupby('is_lec_watched').answered_correctly.mean()
    sns.barplot(x=watched_perf.index,y=watched_perf);
    plt.title(val)
    plt.show()


 The type_of doesn't look to have an effect on the correct answers

## Let's now check the base student level against his overall performance

In [None]:
# Number of question records per student, and the distribution of that count.
records_per_student = questions_interactions_cleaned.groupby('user_id').row_id.count()
records_per_student.hist(bins=100,grid=False);

In [None]:
# value_counts maps "number of records" -> "number of students with that many records".
records_per_student_count = records_per_student.value_counts()
# Share of students with fewer than 10 records: filter on the index (the record count)
# and weight by the number of students. Filtering on the values instead would measure how
# many distinct record counts are shared by fewer than 10 students.
records_per_student_count[records_per_student_count.index<10].sum()/records_per_student_count.sum()

> 88% of the students has less than 10 records !!

> We may focus on looking at different features independent from students.

## Can we get the questions easiness ?

In [None]:
# Distinct questions as a percentage of question records. Computed on
# questions_interactions_cleaned because the *_copy frame was left-merged with per-type
# lecture rows, which can repeat a question record and inflate the denominator.
100*questions_interactions_cleaned.question_id.nunique()/questions_interactions_cleaned.shape[0]

In [None]:
# Mean correctness per question. Computed on questions_interactions_cleaned because the
# *_copy frame can hold the same record several times (once per matching lecture type),
# which would weight those answers more than once.
quest_perf = questions_interactions_cleaned.groupby('question_id').answered_correctly.mean()
quest_perf.hist(grid=False,bins=100);

> Many questions have only one record, and it was answered correctly

> The mean is 0.8 when we filter out the always right questions

In [None]:
# Drop questions whose mean is exactly 1, i.e. every recorded answer was correct.
filtered_quest_per = quest_perf[quest_perf<1]
filtered_quest_per.hist(grid=False,bins=20);

In [None]:
# Percentage of question records that belong to the questions kept by the filter.
# Counted on questions_interactions_cleaned because the *_copy frame can repeat a record
# once per matching lecture type, which would distort the ratio.
filtered_questions = filtered_quest_per.index
filtered_df = questions_interactions_cleaned[questions_interactions_cleaned.question_id.isin(filtered_questions)]
100*filtered_df.shape[0]/questions_interactions_cleaned.shape[0]

# Finally let's check the tags relation with the performance.

In [None]:
# Start from questions_interactions_cleaned: the *_copy frame can repeat a record once per
# matching lecture type, which would weight those answers several times in the tag means.
df = questions_interactions_cleaned.copy()
# One row per (record, tag).
df = df.assign(tags2=df['tags'].str.split(' ')).explode('tags2')
# A question without tags yields a missing tags2, which cannot be cast to an integer.
df = df.dropna(subset=['tags2'])
df['tags2'] = df['tags2'].astype('int32') 
df.head()

In [None]:
# Mean correctness per individual tag, and the distribution of those means.
tags_perf = df.groupby('tags2').answered_correctly.mean()
tags_perf.hist(grid=False,bins=20);

> Some tags appear to have high scores, around 0.7, and fewer tags have higher scores


In [None]:
# Start from questions_interactions_cleaned: the *_copy frame can repeat a record once per
# matching lecture type, which would weight those answers several times in the means below.
df = questions_interactions_cleaned.copy()
df = df.assign(tags2=df['tags'].str.split(' '))
# Number of tags attached to the question of each record.
df['num_tags'] = df['tags2'].str.len()
df.head()

In [None]:
# Mean correctness for each tag count, computed once and plotted against the tag count.
perf_by_num_tags = df.groupby('num_tags').answered_correctly.mean()
plt.plot(perf_by_num_tags.index,perf_by_num_tags,color=color);

> The relation between the number of the tags and the performance is positive

* We can see that the statistics across the data are different depending on the student quit time.
* The standard deviation of time for each student is related to his performance
* The features that address the question difficulty (e.g. task_container_id, tag) are more related to the performance than the features related to the students.
* The most related feature to the performance is the number of tags per question.

# References

1- memory used reduction: https://www.kaggle.com/cdeotte/dae-book3c from the cool grand master: Chris Deotte

2- https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows

