In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from pprint import pprint

In [None]:
questions=pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
questions.head()

In [None]:
%%time
# Load the pre-pickled training interactions.
# NOTE(review): unpickling can execute arbitrary code, so this file must only
# ever come from a trusted dataset.
train = pd.read_pickle('../input/riiid-trainpkl/riiid_train.pkl.gzip')

# Keep only the rows whose content_type_id is 0; .copy() detaches the result
# from the full frame so later column drops act on an independent object.
train = train.loc[train['content_type_id'].eq(0)].copy()
train.head(2)

In [None]:
gc.collect()

In [None]:
# Report the frame's size before slimming it down.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in pprint() only echoed a stray "None".
train.info()

# Columns that the per-user / per-question statistics below do not read.
unused_columns = [
    'timestamp',
    'content_type_id',
    'user_answer',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
train = train.drop(columns=unused_columns, errors='ignore')

# Report the size again to show the effect of the drop.
train.info()
gc.collect()

# Let's check the questions answered per user

In [None]:
# Per-user totals: number of rows (count of row_id) and sum of answered_correctly.
user_question = (
    train
    .groupby('user_id')
    .agg(
        num_questions=('row_id', 'count'),
        answered_correctly=('answered_correctly', 'sum'),
    )
    .reset_index()
)

# Derived per-user metrics: correct share in percent, and the remaining (wrong) count.
user_question['percent_answered_correctly'] = (
    100 * user_question['answered_correctly'] / user_question['num_questions']
)
user_question['answered_wrong'] = (
    user_question['num_questions'] - user_question['answered_correctly']
)
user_question.head()


In [None]:
user_question.info()

In [None]:
train.info()

In [None]:
gc.collect()

In [None]:
user_question[['num_questions', 'answered_correctly', 'answered_wrong',
               'percent_answered_correctly']].corr()



In [None]:
user_question.describe()

In [None]:
sns.distplot(user_question.num_questions, bins=100)

In [None]:
sns.distplot(user_question.percent_answered_correctly, bins=100)

# Most users answered correctly around 40-70% of the time

In [None]:
sns.scatterplot(data=user_question, 
                x='num_questions',
                y='percent_answered_correctly',
               )

It looks like the variability in the percentage of questions answered correctly decreases as a user answers more questions.

Let's bin the users by the number of questions they answered and look at the percentage of correct answers in each bin: its variability and its mean.

In [None]:
print("Number Of users who anserwerd <=5 questions:", len(user_question[user_question.num_questions<=5]))


In [None]:

user_question[user_question.num_questions<=5]['percent_answered_correctly'].describe()

In [None]:
user_question[user_question.num_questions>10000].shape

In [None]:
def question_bin(num_questions):
    """Return the label of the activity bucket that a question count falls in.

    Buckets are closed on the right: "0-10", '10-30', '30-100', '100-1000',
    '1000-5000', and ">5000" for anything above 5000.  If no comparison
    succeeds (e.g. the value is NaN) the function returns None.
    """
    upper_bounds = (
        (10, "0-10"),
        (30, '10-30'),
        (100, '30-100'),
        (1000, '100-1000'),
        (5000, '1000-5000'),
    )
    # Bounds are ascending, so the first one that is not exceeded names the bucket.
    for limit, label in upper_bounds:
        if num_questions <= limit:
            return label
    if num_questions > 5000:
        return ">5000"
    return None
    

In [None]:

# Label every user with the bucket returned by question_bin for their question count.
user_question['question_bin'] = user_question['num_questions'].map(question_bin)
user_question.head()

In [None]:
# Min / max / mean / std of the per-user correct-answer percentage within each bucket.
# NOTE: this rebinds the name `question_bin`, which until now referred to the
# bucketing function; after this cell runs, the cell that applies that function
# cannot be re-run without first re-running the function definition.
question_bin = (
    user_question
    .groupby('question_bin')['percent_answered_correctly']
    .agg(['min', 'max', 'mean', 'std'])
    .reset_index()
)
question_bin.head(10)


In [None]:

# Draw the buckets from least to most active instead of alphabetically.
bin_order = ['0-10', '10-30', '30-100', '100-1000', '1000-5000', '>5000']

# One bar chart per summary statistic: first the mean, then the spread.
for statistic in ('mean', 'std'):
    sns.barplot(data=question_bin,
                x='question_bin',
                y=statistic,
                order=bin_order)
    plt.show()

Clearly, as the number of questions attempted increases, the mean percentage of correct answers rises and its variance decreases.

In [None]:
# The per-user frames are no longer needed; release them before the question analysis.
# Note that `question_bin` is the summary DataFrame at this point, and deleting the
# name also leaves the earlier bucketing function unavailable under that name.
del user_question, question_bin

gc.collect()

# Questions Complexity

In [None]:
questions.head()

In [None]:
print("Number Of Questions:", len(questions))
print("Number Of Bundles:", questions.bundle_id.nunique())

In [None]:
%%time
# Per-question totals: rows seen, sum of answered_correctly, and distinct users.
question_stat = (
    train
    .groupby('content_id')
    .agg(
        num_attempted=('answered_correctly', 'count'),
        correctly_answered=('answered_correctly', 'sum'),
        num_users_attempted=('user_id', 'nunique'),
    )
    .reset_index()
    .rename(columns={'content_id': 'question_id'})
)

# Share of attempts that were correct, in percent.
question_stat['percent_answered_correctly'] = (
    100 * question_stat['correctly_answered'] / question_stat['num_attempted']
)
question_stat.head()

In [None]:
question_stat.head()

In [None]:
question_stat.describe()

In [None]:
# (title, question_stat column, extra distplot options) for each histogram, in display order.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm the
# installed version before upgrading the environment.
distribution_plots = [
    ('Questions Attempted', 'num_attempted', {'bins': 400}),
    ("Distribution Of Question Correctly Answered", 'correctly_answered', {}),
    ("Distribution Of Users Attempting the question", 'num_users_attempted', {}),
    ("Distribution Of Correctly Answering", 'percent_answered_correctly', {}),
]

for title, column, options in distribution_plots:
    plt.title(title)
    sns.distplot(question_stat[column], **options)
    plt.show()

In [None]:
# Horizontal box plot of the per-question correct-answer percentage.
# The series is passed as x= explicitly: a bare positional argument is read as
# `x` by older seaborn releases but as `data` by newer ones, which silently
# changes the plot's orientation.
sns.boxplot(x=question_stat.percent_answered_correctly)

A few questions were attempted, or answered correctly, by only one user.

In [None]:
question_stat.sort_values('num_attempted', ascending=False).head(10)

In [None]:
question_stat.sort_values('percent_answered_correctly', ascending=False).head(10)

In [None]:
question_stat[question_stat.percent_answered_correctly >= 90].num_attempted.describe()

In [None]:
sns.scatterplot(data=question_stat,
                x='percent_answered_correctly',
                y='num_attempted'
               )

In [None]:
len(question_stat[(question_stat.num_attempted>=1000) & (question_stat.num_attempted<=6000)])/len(question_stat)

In [None]:

question_stat[(question_stat.num_attempted>=1000) & (question_stat.num_attempted<=6000)].describe()

About 46.5% of the questions had the attempts between the range [1000, 6000]

In [None]:
sns.scatterplot(data=question_stat[(question_stat.num_attempted>=1000) & (question_stat.num_attempted<=6000)],
                x='percent_answered_correctly',
                y='num_attempted'
               )

Even within this subset of questions (1000 to 6000 attempts), there seems to be a big variance in the percentage of correct answers.

The percentage of correct answers is skewed towards >= 50 percent.

In [None]:
# Percentage of all questions that were attempted 1000-6000 times (inclusive)
# and answered correctly at least half of the time.
attempted_mid_range = question_stat['num_attempted'].between(1000, 6000)
mostly_correct = question_stat['percent_answered_correctly'] >= 50
100 * len(question_stat[attempted_mid_range & mostly_correct]) / len(question_stat)

More than 41.6% of the questions were attempted between 1000 and 6000 times and answered correctly at least 50% of the time.

In [None]:
question_stat.corr()

In [None]:
sns.heatmap(question_stat.corr(), annot=True)

# Let's include the parts and tags

In [None]:
# Ids of the ten questions with the highest attempt counts.
top10_question_ids = (
    question_stat
    .sort_values('num_attempted', ascending=False)
    .head(10)['question_id']
    .to_numpy()
)

print("Top 10 Questions Attempted.")
# Show the metadata rows of those questions.
questions.loc[questions['question_id'].isin(top10_question_ids)]

In [None]:
question_stat.head()

In [None]:
# Attach each question's bundle and part to its statistics.
# NOTE: question_stat is rebound to the merged frame, so running this cell a
# second time merges again and yields suffixed duplicates of bundle_id / part.
question_stat = pd.merge(
    question_stat,
    questions.loc[:, ['question_id', 'bundle_id', 'part']],
    on='question_id',
    how='inner',
)
question_stat.head()

In [None]:
print('Number Of Parts:', question_stat.part.nunique())
question_stat.part.value_counts()

In [None]:
# Count questions per part once and reuse the result for wedge labels and sizes.
part_counts = question_stat['part'].value_counts()
labels = list(part_counts.index)
values = list(part_counts.values)

plt.figure(figsize=(17, 5))
plt.title('Distribution Of Question Part')
# autopct prints each wedge's share with two decimals.
plt.pie(values, labels=labels, autopct='%.2f%%')
plt.legend(loc='best')
plt.show()

In [None]:
plt.title('Count of Questions by their Parts.')
sns.countplot(data=question_stat, x='part')
plt.show()

In [None]:
# question_stat is intentionally kept alive: the user/question merge further
# down still reads it, and deleting it here made that cell fail with NameError
# on a fresh run. It is an aggregate keyed by question rather than by training
# row, so there is little memory to reclaim by dropping it.
gc.collect()

# Get the relative performance of users on the questions

In [None]:
# One row per (user, question) interaction, enriched with that question's
# attempt count and correct-answer percentage.
# NOTE: this reads question_stat, so that frame must still be defined when the cell runs.
user_question_stat = (
    train[['user_id', 'content_id', 'answered_correctly']]
    .rename(columns={'content_id': 'question_id'})
    .merge(
        question_stat[['question_id', 'num_attempted', 'percent_answered_correctly']],
        how='inner',
        on='question_id',
    )
)

user_question_stat.head()

In [None]:
# num_attempted and percent_answered_correctly are per-question values that the
# merge repeated on every interaction row, so all rows of one question land on
# the same point. Plot each question once: the picture is unchanged, but
# matplotlib no longer has to draw one marker per training row.
sns.scatterplot(data=user_question_stat.drop_duplicates(subset='question_id'),
                x='num_attempted',
                y='percent_answered_correctly'
               )