In [None]:
pip install -q git+https://github.com/Aykhan-sh/pandaseda@master

In [None]:
import pandas as pd
import numpy as np
import pandaseda.Functional as pf

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def countplot(df, cols, subs, figsize):
    """Draw one seaborn count plot per column on a single grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the columns to plot.
    cols : iterable of str
        Column names; the k-th name is drawn in the k-th subplot slot.
    subs : sequence
        Subplot grid; ``subs[0]`` rows by ``subs[1]`` columns.
    figsize : tuple
        Figure size forwarded to ``plt.figure``.

    Returns
    -------
    None
        The plots are drawn on a newly created current matplotlib figure.
    """
    plt.figure(figsize=figsize)
    for slot, column in enumerate(cols, start=1):
        plt.subplot(subs[0], subs[1], slot)
        # Pass the Series by keyword: seaborn >= 0.12 accepts only `data`
        # positionally, so a positional Series would be taken as `data`
        # instead of as the variable to count.
        sns.countplot(x=df[column])
        plt.title(column, fontdict={'size': 25})
        plt.xticks(size=15)
        plt.yticks(size=15)
        # The column name is already shown as the title.
        plt.xlabel('')

# Structure of DataFrames

In [None]:
# Directory holding the competition files, and the number of train rows read per chunk.
DATA_DIR = '/kaggle/input/riiid-test-answer-prediction/'
TRAIN_CHUNK_ROWS = 10000000

lectures = pd.read_csv(DATA_DIR + 'lectures.csv')
example_test = pd.read_csv(DATA_DIR + 'example_test.csv')
example_sample_submission = pd.read_csv(DATA_DIR + 'example_sample_submission.csv')
# train.csv is read lazily in chunks; only the first chunk is materialised here.
train_chunks = pd.read_csv(DATA_DIR + 'train.csv', chunksize=TRAIN_CHUNK_ROWS)
train_chunk = next(iter(train_chunks))
questions = pd.read_csv(DATA_DIR + 'questions.csv')

## Lectures

In [None]:
# Show the `lectures` frame (read from lectures.csv) as this cell's output.
lectures

In [None]:
# Summarise `lectures` with pandaseda's `desc` helper and keep the result.
# NOTE(review): `print_sorted=True` presumably prints the summary in sorted order — confirm in pandaseda.
lectures_desc = pf.desc(lectures, print_sorted = True)

In [None]:
# Count plots of the `type_of` and `part` columns of `lectures`, side by side
# (1x2 grid, 16x8 figure), drawn with the notebook's `countplot` helper rather
# than an inline copy of its loop.
temp_cols = ['type_of', 'part']
countplot(lectures, temp_cols, (1, 2), (16, 8))

## Train (first 10 000 000 rows)

In [None]:
# Show the first chunk read from train.csv as this cell's output.
train_chunk

In [None]:
# Summarise `train_chunk` with pandaseda's `desc` helper and keep the result.
# NOTE(review): `print_sorted=True` presumably prints the summary in sorted order — confirm in pandaseda.
train_chunk_desc = pf.desc(train_chunk, print_sorted = True)

In [None]:
# Count plots for four columns of the train chunk, laid out on a 2x2 grid
# of a 20x20 figure.
temp_cols = [
    'content_type_id',
    'prior_question_had_explanation',
    'answered_correctly',
    'user_answer',
]
countplot(df=train_chunk, cols=temp_cols, subs=(2, 2), figsize=(20, 20))

In [None]:
# Distribution of the `prior_question_elapsed_time` column of the train chunk.
# NOTE(review): `sns.distplot` is deprecated since seaborn 0.11 — consider
# `sns.histplot(..., kde=True)` once the minimum seaborn version allows it.
plt.figure(figsize=(20, 7))
sns.distplot(a=train_chunk['prior_question_elapsed_time'])
plt.title(label='Prior question elapsed time', fontdict={'size': 20})
# No x label; the trailing semicolon hides the return value in the cell output.
plt.xlabel(xlabel='');

In [None]:
# Box plot of `prior_question_elapsed_time` for each value of `answered_correctly`.
plt.figure(figsize=(20, 15))
# x and y are given by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so passing the two Series positionally raises a TypeError there.
sns.boxplot(data=train_chunk, x='answered_correctly', y='prior_question_elapsed_time')
plt.title('Distribution of time elapsed over the result', fontdict={'size': 20})
plt.xlabel('answered_correctly', size=16)
plt.ylabel('prior_question_elapsed_time', size=16);

In [None]:
# Count of each `answered_correctly` value, split by `prior_question_had_explanation`.
plt.figure(figsize=(12, 12))
# Variables are given by keyword: seaborn >= 0.12 would read a positional
# Series as `data` instead of as the variable to count.
sns.countplot(data=train_chunk, x='answered_correctly', hue='prior_question_had_explanation')
plt.title('Answers result with and without explanations', fontdict={'size': 20})
plt.xlabel('Answered_correctly', size=16);

In [None]:
# Distribution of the number of `row_id` entries per `user_id` in the train chunk,
# with the x axis limited to the 0..3000 range.
plt.figure(figsize=(20, 7))
sns.distplot(a=train_chunk.groupby('user_id')['row_id'].count(), bins=300)
plt.title(label='Distribution of answered questions by user', fontdict={'size': 20});
plt.xlabel(xlabel='Number of questions', size=12);
plt.xlim(0, 3000);

In [None]:
# Per-user percentage of correct answers.
# Rows with a negative `answered_correctly` are dropped before aggregating.
# NOTE(review): per the competition's data description, lecture rows carry
# answered_correctly == -1; left in, they lower the sum and inflate the count.
# If no such rows exist, the filter is a no-op — confirm against the data.
temp_train = (
    train_chunk
    .loc[train_chunk.answered_correctly >= 0, ['user_id', 'answered_correctly', 'row_id']]
    .groupby('user_id')
    .agg({'answered_correctly': 'sum', 'row_id': 'count'})
)
plt.figure(figsize=(20, 7))
sns.distplot((temp_train.answered_correctly * 100) / temp_train.row_id)
plt.title('Distribution of correct answers percentage by each user', fontdict={'size': 20});
plt.xlabel('Percentage of correct answers', size=12);

## Questions

In [None]:
# Show the `questions` frame (read from questions.csv) as this cell's output.
questions

In [None]:
# Summarise `questions` with pandaseda's `desc` helper and keep the result.
# NOTE(review): `print_sorted=True` presumably prints the summary in sorted order — confirm in pandaseda.
questions_desc = pf.desc(questions, print_sorted = True)

## Example_test

In [None]:
# Show the `example_test` frame (read from example_test.csv) as this cell's output.
example_test

In [None]:
# Summarise `example_test` with pandaseda's `desc` helper and keep the result.
# NOTE(review): `print_sorted=True` presumably prints the summary in sorted order — confirm in pandaseda.
example_test_desc = pf.desc(example_test, print_sorted = True)

In [None]:
# Count plots for two columns of `example_test`, side by side on a 1x2 grid
# of a 20x10 figure.
temp_cols = [
    'prior_question_had_explanation',
    'group_num',
]
countplot(df=example_test, cols=temp_cols, subs=(1, 2), figsize=(20, 10))

## Sample Submission

In [None]:
# Show the `example_sample_submission` frame (read from example_sample_submission.csv) as this cell's output.
example_sample_submission