## First, let's import the libraries

In [None]:
#loading data

import numpy as np 
import pandas as pd 
import riiideducation 
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import os
import warnings 
warnings.filterwarnings('ignore')

# Print the full path of every file found under the competition's input directory.
input_root = '/kaggle/input/riiid-test-answer-prediction'
all_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(input_root)
    for name in names
)
for path in all_paths:
    print(path)

In [None]:
# Explicit per-column dtypes keep the frame compact; only the first 10**6 rows are read.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    dtype=train_dtypes,
    nrows=10**6,
    low_memory=False,
)
train_df.head()

In [None]:
# Question metadata. Uses the same absolute input root as the train.csv load and the
# directory listing above, so the cell does not depend on the current working directory.
question = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
question.head()

In [None]:
# Lecture metadata. Uses the same absolute input root as the train.csv load and the
# directory listing above, so the cell does not depend on the current working directory.
lecture = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
lecture.head()

* The `questions.csv` and `lectures.csv` files hold metadata about the questions and the lectures that appear in the training dataset: every interaction in the training data is either an interaction with a question or an interaction with a lecture.

  * The interaction type is given by the content_type_id column (0 for a question interaction and 1 for a lecture interaction)

In [None]:

# Question interactions: left-join the question metadata on content_id, keep only the
# rows flagged as questions (content_type_id == 0) and expose `part` as `test_part`.
questions_interactions = (
    train_df
    .merge(question, how='left', left_on='content_id', right_on='question_id')
    .loc[lambda merged: merged.content_type_id == 0]
    .rename(columns={'part': 'test_part'})
)

# Lecture interactions: same recipe with the lecture metadata, keeping the rows flagged
# as lectures (content_type_id == 1) and exposing `part` as `category`.
lectures_interactions = (
    train_df
    .merge(lecture, how='left', left_on='content_id', right_on='lecture_id')
    .loc[lambda merged: merged.content_type_id == 1]
    .rename(columns={'part': 'category'})
)

questions_interactions.shape, lectures_interactions.shape

> Most of the interactions are with questions.

### Let's deal with nulls

In [None]:
# Missing values per column: the questions frame is printed, the lectures frame is
# shown as the cell's output.
questions_null_counts = questions_interactions.isna().sum()
print(questions_null_counts)
lectures_interactions.isna().sum()

> The columns that have nulls in the question interactions are the ones that are left null when no previous record is present.

In [None]:
# Nulls in the two "prior_question_*" columns mean the user had no previous bundle yet.
# Both are encoded as -1: the elapsed time directly, and the explanation flag after the
# booleans have been cast to int8 (the null rows are first filled with False so the cast
# works, then overwritten with -1).
no_prior_question = questions_interactions['prior_question_had_explanation'].isnull()
indeces = questions_interactions.loc[no_prior_question].index
print(indeces)
values = {'prior_question_elapsed_time': -1, 'prior_question_had_explanation': False}
questions_interactions.fillna(value=values, inplace=True)
questions_interactions['prior_question_had_explanation'] = (
    questions_interactions['prior_question_had_explanation'].astype('int8')
)
questions_interactions.loc[indeces, 'prior_question_had_explanation'] = -1
questions_interactions.head()

In [None]:
# Frequency of each value in the prior_question_had_explanation column.
questions_interactions.prior_question_had_explanation.value_counts()

Now let's deal with the lectures interactions nulls

In [None]:
# Preview the first rows of the lecture interactions.
lectures_interactions.head()

* content_type_id
* user_answer
* answered_correctly 
* prior_question_elapsed_time 
* prior_question_had_explanation

don't have any meaning in this data frame anymore, so we will drop them.

In [None]:
# Columns that only describe question attempts and are dropped from the lectures frame.
question_only_columns = [
    'content_type_id',
    'user_answer',
    'answered_correctly',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]
lectures_interactions = lectures_interactions.drop(columns=question_only_columns)
lectures_interactions.head()

## Let's try improving the memory usage for faster analysis

In [None]:
# Footprint of both frames, converted from bytes to MB, before any dtype optimisation.
bytes_per_mb = 1024**2
start_mem_usg1 = questions_interactions.memory_usage().sum() / bytes_per_mb
start_mem_usg2 = lectures_interactions.memory_usage().sum() / bytes_per_mb

for frame_name, megabytes in (
    ("questions_interactions", start_mem_usg1),
    ("lectures_interactions", start_mem_usg2),
):
    print("Memory usage of " + frame_name + " dataframe is :", megabytes, " MB")

In [None]:
# Column dtypes of the questions frame at this point in the notebook.
questions_interactions.dtypes

In [None]:
# content_id duplicates question_id, and content_type_id is always 0 in this frame,
# so neither column adds information here.
redundant_columns = ['content_id', 'content_type_id']
questions_interactions = questions_interactions.drop(columns=redundant_columns)

In [None]:
def _smallest_integer_dtype(lo, hi):
    """Return the narrowest NumPy integer dtype whose range contains [lo, hi].

    Unsigned dtypes are tried when ``lo`` is non-negative, signed dtypes otherwise.
    Returns ``None`` when no 64-bit-or-smaller integer dtype can hold the range.
    """
    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= lo and hi <= limits.max:
            return candidate
    return None


def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to the smallest suitable dtype.

    The frame is modified in place (columns are reassigned) and also returned.

    For every integer, boolean or float column:
      * missing values are replaced by ``column minimum - 1`` and the column name is
        recorded in the returned list;
      * if every value is a finite whole number, the column is cast to the narrowest
        integer dtype that holds its (post-fill) range;
      * otherwise the column is cast to float32.
    Columns of any other dtype (strings, datetimes, categoricals, ...) are left
    untouched, as are columns that contain no non-missing value at all.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    (pandas.DataFrame, list)
        The same frame object and the list of columns that contained missing values.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that contained missing values.
    for col in props.columns:
        series = props[col]
        is_integral = (
            pd.api.types.is_integer_dtype(series.dtype)
            or pd.api.types.is_bool_dtype(series.dtype)
        )
        if not (is_integral or pd.api.types.is_float_dtype(series.dtype)):
            continue  # only plain numeric columns can be downcast

        has_missing = bool(series.isna().any())
        if has_missing:
            NAlist.append(col)

        col_min = series.min()
        if pd.isna(col_min):
            continue  # empty or all-missing column: nothing to base a dtype on

        if has_missing:
            # Integer dtypes cannot hold NA, so use a sentinel just below the minimum.
            # Work on a new Series instead of a chained in-place fillna, which is not
            # guaranteed to write back into the frame.
            series = series.fillna(col_min - 1)

        if is_integral:
            is_whole = True
        else:
            values = series.to_numpy(dtype="float64")
            # Element-wise test: summing the fractional parts (as done previously)
            # lets values such as 0.5 and -0.5 cancel out and pass as integers.
            is_whole = bool(
                np.isfinite(values).all() and (values == np.floor(values)).all()
            )

        target = None
        if is_whole:
            # Bounds are taken AFTER filling, so the sentinel is part of the range and
            # a non-negative column with missing values is not cast to an unsigned type.
            target = _smallest_integer_dtype(int(series.min()), int(series.max()))

        # Fractional values, non-finite values, or whole numbers outside the 64-bit
        # integer range all fall back to float32.
        props[col] = series.astype(np.float32 if target is None else target)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist



In [None]:
# Shrink the numeric columns of the questions frame; the second return value
# (reduce_mem_usage's NAlist) is not needed here and is discarded.
questions_interactions,_ = reduce_mem_usage(questions_interactions)

In [None]:
# Column dtypes after the call to reduce_mem_usage above.
questions_interactions.dtypes

## Let's explore the questions interaction distributions

In [None]:
# Columns treated as continuous; one histogram is drawn per listed column.
continous_columns = ['timestamp','user_id','task_container_id','prior_question_elapsed_time','question_id','bundle_id']
# The trailing ';' suppresses the text repr of the returned axes.
questions_interactions.hist(column=continous_columns, grid=False,figsize=(20,15));

* The bundle_id and the question_id have the same exact distribution which may indicate a duplicate column.
* user_id has a uniform distribution, which indicates random user ids are used in the application
* The timestamp has a right-skewed distribution (most values are small, with a long tail to the right), which may indicate that users use the application for a relatively short time before they stop using it
* prior_question_elapsed_time is right-skewed too, which may indicate that most of the students don't take a lot of time before answering a question
* task_container_id is also right-skewed, which is strange and may indicate that these ids carry meaning, so that their distribution is affected by the students' actions

In [None]:
# One count plot per discrete column, all drawn in the first colour of the default palette.
color = sns.color_palette()[0]
discrete_columns = [
    'answered_correctly',
    'user_answer',
    'correct_answer',
    'prior_question_had_explanation',
    'test_part',
]
for col in discrete_columns:
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.countplot(data=questions_interactions, x=col, color=color, ax=ax)
    plt.show()

* The number of correctly answered questions is about double the number of wrongly answered ones
* Most of the questions had an explanation
* The distributions of users' answers and correct answers are very similar (even though there are a lot of wrong answers), which may indicate an interesting relation between them
* The 5th test part has the most records, followed by the second part

## Let's preprocess the tags feature and explore it too

In [None]:
# One row per (interaction, tag): split the space-separated `tags` string and explode it.
# - split() without an argument splits on any run of whitespace, so repeated spaces
#   cannot produce empty strings that would fail the integer cast;
# - questions without tags yield NaN after explode and are dropped before the cast,
#   which would otherwise raise;
# - .assign already returns a new frame, so no explicit copy is needed.
df = (
    questions_interactions
    .assign(tags2=lambda frame: frame['tags'].str.split())
    .explode('tags2')
    .dropna(subset=['tags2'])
    .astype({'tags2': 'int32'})
)
df['tags2'].nunique()

In [None]:
# Histogram of the exploded tag ids; the trailing ';' hides the axes repr.
df['tags2'].hist(grid=False,figsize=(10,5));

The distribution looks roughly normal, which may suggest that the tag ids carry some meaning.

## Let's take a look at the lectures_interactions data frame

In [None]:
# Preview the lectures frame before plotting its columns.
lectures_interactions.head()

In [None]:
# Columns treated as continuous; one histogram is drawn per listed column.
continous_cols = ['timestamp','user_id','content_id','task_container_id','lecture_id','tag']
# The trailing ';' suppresses the text repr of the returned axes.
lectures_interactions.hist(column=continous_cols, grid=False,figsize=(20,15));

* The timestamp has a right-skewed distribution (most values are small, with a long tail to the right), which may indicate that users use the application for a relatively short time before they stop using it
* user_id, content_id and lecture_id look to have uniform distributions, which means they are just random ids
* task_container_id is right-skewed, which is strange and may indicate that these ids carry meaning, so that their distribution is affected by the students' actions
* The tags here have uniform distribution.

In [None]:
# One count plot per discrete lecture column, drawn in the first colour of the default palette.
color = sns.color_palette()[0]
discrete_columns = ['type_of', 'category']
for col in discrete_columns:
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.countplot(data=lectures_interactions, x=col, color=color, ax=ax)
    plt.show()

* The category has the same shape as the test_part in the questions_interactions, which suggests a close relationship between them
* Most of the lectures types are concept based.

## References
1- memory used reduction: https://www.kaggle.com/cdeotte/dae-book3c from the cool grand master: Chris Deotte

2- https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows

## Wait for part 2 with more insights and modular data wrangling :)