In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os
import itertools

In [None]:
# Merged EdNet KT3 log (per the file name); later cells rely on its action_type, user_id,
# item_id, user_answer and source columns.
df = pd.read_feather('../input/ednet-kt34/KT3_merged.feather')
# Question metadata; later cells rely on its question_id, correct_answer and tags columns.
questions_df = pd.read_csv('../input/ednet-contents/questions.csv')

In [None]:
# Peek at the first rows of the question metadata loaded from questions.csv.
questions_df.head()

In [None]:
# Peek at the first 20 rows of the merged KT3 frame.
df.head(20)

In [None]:
# questions_df[['first_correct', 'avg_attempts', 'final_correct', 'time_spend']] = pd.DataFrame([[0, 0, 0, 0]], index=questions_df.index)

In [None]:
# Re-display the question metadata; the column-initialisation cell just above is commented out,
# so no columns have been added at this point.
questions_df.head()

In [None]:
# List the distinct action types present in the log.
df['action_type'].unique()

In [None]:
# Distinct `source` values among the 'respond' actions (row filter and column pick in one .loc call).
df.loc[df['action_type'] == 'respond', 'source'].unique()

In [None]:
# Map question_id -> correct_answer.
# Built with a single vectorised pass instead of filtering the whole frame once per question
# (which is quadratic in the number of questions). drop_duplicates(keep='first') preserves the
# previous behaviour of taking the first matching row if a question_id ever appears twice.
answers_dict = (
    questions_df
    .drop_duplicates(subset='question_id', keep='first')
    .set_index('question_id')['correct_answer']
    .to_dict()
)

In [None]:
# Keep only the 'respond' actions.
# .copy() gives df_respond its own data, so the columns assigned to it in later cells are written
# to an independent frame rather than to a filtered slice of df (avoids SettingWithCopyWarning).
df_respond = df[df['action_type']=='respond'].copy()

In [None]:
# Look up the answer key for every responded item, then flag (1/0) the rows whose
# user_answer equals that key. Items missing from answers_dict get NaN and are flagged 0.
df_respond['correct_answer'] = df_respond['item_id'].map(answers_dict)
df_respond['is_correct'] = df_respond['correct_answer'].eq(df_respond['user_answer']).astype(int)

In [None]:
# Check the newly added correct_answer / is_correct columns.
df_respond.head()

In [None]:
# The lookup dict is not referenced by any later cell shown here; drop it to release memory.
del answers_dict

## Final correct response

In [None]:
# For every (user_id, item_id) pair keep only the last row in frame order.
# NOTE(review): treating "last row" as the user's final answer assumes the log is stored in
# chronological order -- TODO confirm against the source data.
df_final_respond = df_respond[~df_respond.duplicated(subset=['user_id', 'item_id'], keep='last')]
# Row counts before and after collapsing repeat attempts.
print(df_respond.shape, df_final_respond.shape)

In [None]:
# Class balance of the final responses: number of rows with is_correct == 0 vs. == 1.
sns.countplot(x='is_correct', data=df_final_respond)

In [None]:
# Final responses that matched the answer key.
df_final_correct = df_final_respond.loc[df_final_respond['is_correct'].eq(1)]

In [None]:
# Number of users whose final response to each question was correct.
# value_counts() only contains questions with at least one correct final response, so map()
# yields NaN for every other question. Fill those with 0: otherwise a question that was answered
# but never answered correctly would get a NaN (instead of 0) ratio downstream and silently drop
# out of every mean / correlation computed from it.
questions_df['final_correct'] = (
    questions_df['question_id']
    .map(df_final_correct['item_id'].value_counts())
    .fillna(0)
)

In [None]:
# Confirm the final_correct column was added.
questions_df.head()

## No. of users trying each question

In [None]:
# Number of distinct users that responded to each question.
# nunique() is the built-in, vectorised equivalent of unique().apply(len); dropna=False keeps the
# exact counting rule of the previous code (a missing user_id, if any, counts as one user).
users_per_question = df_respond.groupby('item_id')['user_id'].nunique(dropna=False)
users_per_question.head()

In [None]:
# Attach the per-question distinct-user count; questions absent from users_per_question get NaN from map().
questions_df['num_users'] = questions_df['question_id'].map(users_per_question)

In [None]:
# Confirm the num_users column was added.
questions_df.head()

## Fraction of users whose final response is correct

In [None]:
# Share of responding users whose final response was correct (a 0-1 ratio, not scaled to 100).
questions_df['pct_correct'] = questions_df['final_correct'].div(questions_df['num_users'])
questions_df.head()

## Avg no. of attempts

In [None]:
# Total number of 'respond' rows per question; repeat attempts by the same user are all counted.
question_total_attempts = df_respond['item_id'].value_counts()

In [None]:
# Average attempts per user = total respond rows / distinct responding users (aligned on item_id).
questions_df['avg_attempts'] = questions_df['question_id'].map(
    question_total_attempts.div(users_per_question)
)

In [None]:
# Not referenced by any later cell shown here; drop it to release memory.
del question_total_attempts

In [None]:
# Confirm the avg_attempts column was added.
questions_df.head()

## Fraction of users who answered correctly on their first and only attempt

In [None]:
# Number of 'respond' rows for every (item_id, user_id) pair, i.e. attempts per user per question.
gb = df_respond.groupby('item_id')['user_id'].value_counts()

In [None]:
# Inspect the per-(question, user) attempt counts.
gb

In [None]:
# (question, user) pairs where the user made exactly one attempt at the question.
# reset_index(name='count') turns the MultiIndex into item_id / user_id columns.
questions_with_one_att = gb[gb==1].reset_index(name='count')
questions_with_one_att

In [None]:
# Per question: how many users made exactly one attempt.
questions_with_one_att['item_id'].value_counts()

In [None]:
# Single-attempt (question, user) pairs whose response was correct: inner-join the
# one-attempt pairs with the correct final responses and keep only the two key columns.
one_correct = questions_with_one_att.merge(
    df_final_correct,
    how='inner',
    on=['item_id', 'user_id'],
).loc[:, ['item_id', 'user_id']]
one_correct

In [None]:
# Share of responding users who needed exactly one attempt and got it right.
# value_counts() omits questions with no such user, and dividing two Series aligns on the index,
# so those questions used to come out as NaN instead of 0. Reindexing onto every responded
# question with fill_value=0 makes them a genuine 0 ratio; questions nobody responded to are
# still NaN after map().
questions_df['correct_at_first'] = questions_df['question_id'].map(
    one_correct['item_id']
    .value_counts()
    .reindex(users_per_question.index, fill_value=0)
    .div(users_per_question)
)

In [None]:
# Confirm the correct_at_first column was added.
questions_df.head(10)

In [None]:
# None of these intermediates is referenced by a later cell shown here; drop them to release memory.
del one_correct, questions_with_one_att, df_final_correct, df_final_respond, gb, users_per_question

## Basic analysis

Correlation

In [None]:
# Pairwise correlation (DataFrame.corr defaults) between the three per-question metrics.
corr = questions_df[['pct_correct', 'avg_attempts', 'correct_at_first']].corr()
# Boolean mask that is True on and above the diagonal; masked cells are not drawn,
# so the heatmap shows only the strictly lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, cmap='YlGn', annot=True)

In [None]:
# Pairwise scatter plots of the three metrics with a fitted regression line drawn in red;
# corner=True draws only the lower triangle of the grid.
sns.pairplot(questions_df[['pct_correct', 'avg_attempts', 'correct_at_first']], kind='reg', plot_kws={'line_kws':{'color':'red'}}, corner=True)

## Tags

In [None]:
# Collect every distinct tag: each question's `tags` string is a ';'-separated list.
tags = {
    tag
    for tag_list in questions_df['tags'].str.split(';')
    for tag in tag_list
}
print(tags)

In [None]:
# Create one zero-filled indicator column per tag ('-1' gets no column).
# Iterate in sorted order: `tags` is a set of strings, whose iteration order changes between
# interpreter sessions, which previously made the column order of questions_df (and of the CSV
# written from it) differ from run to run.
for tag in sorted(tags):
    if tag != '-1':
        questions_df[tag] = 0

In [None]:
def encode_tags(cols):
    """Set the indicator entry of every tag listed in a question row to 1.

    Intended for ``DataFrame.apply(encode_tags, axis=1)``.

    Parameters
    ----------
    cols : pandas.Series
        One question row. Must contain a ``tags`` entry holding a ';'-separated
        string of tag names.

    Returns
    -------
    pandas.Series
        A copy of ``cols`` in which ``row[tag]`` is 1 for every tag in the list.
        The row is returned unchanged when the first tag is ``'-1'`` or when
        ``tags`` is not a string (e.g. a missing value).
    """
    # Work on a copy: pandas does not support functions that mutate the row handed to apply().
    encoded = cols.copy()

    # The row already carries its own tag string, so read it directly instead of
    # re-filtering the whole question table by question_id for every row.
    raw_tags = encoded['tags']
    if not isinstance(raw_tags, str):
        return encoded

    tag_list = raw_tags.split(';')
    if tag_list[0] != '-1':
        for tag in tag_list:
            encoded[tag] = 1
    return encoded

In [None]:
# Run the row-wise encoder: apply(axis=1) calls encode_tags once per question row and
# rebuilds the frame from the returned rows.
questions_df = questions_df.apply(encode_tags, axis=1)

In [None]:
# Mean of each per-question metric over the questions carrying a given tag.
# Tags are visited in sorted order so the rows of tags_df come out in the same order on every
# run (iterating the raw set of strings varies between interpreter sessions).
stats = {'tag': [], 'pct_correct': [], 'avg_attempts': [], 'correct_at_first': []}
for tag in sorted(tags):
    if tag == '-1':
        # '-1' never gets an indicator column, so there is nothing to aggregate.
        continue
    # Select the tagged questions once and reuse the selection for all three metrics.
    tagged = questions_df[questions_df[tag] == 1]
    stats['tag'].append(tag)
    for metric in ('pct_correct', 'avg_attempts', 'correct_at_first'):
        stats[metric].append(tagged[metric].mean())

tags_df = pd.DataFrame(stats)
tags_df

Percentage of correct answers

In [None]:
# Tags with the highest / lowest mean pct_correct.
# idxmax()/idxmin() return index *labels*, which is what .loc expects; argmax()/argmin() return
# integer positions and only happened to work because tags_df has a default 0..n-1 index.
print('Highest pct of correct ans:\n', tags_df.loc[tags_df['pct_correct'].idxmax()])
print()
print('Lowest pct of correct ans:\n', tags_df.loc[tags_df['pct_correct'].idxmin()])

Avg no. of attempts

In [None]:
# Tags with the highest / lowest mean avg_attempts.
# idxmax()/idxmin() return index *labels*, which is what .loc expects; argmax()/argmin() return
# integer positions and only happened to work because tags_df has a default 0..n-1 index.
print('Highest avg no. of attempts:\n', tags_df.loc[tags_df['avg_attempts'].idxmax()])
print()
print('Lowest avg no. of attempts:\n', tags_df.loc[tags_df['avg_attempts'].idxmin()])

Answered correctly in a single attempt

In [None]:
# Tags with the highest / lowest mean correct_at_first.
# idxmax()/idxmin() return index *labels*, which is what .loc expects; argmax()/argmin() return
# integer positions and only happened to work because tags_df has a default 0..n-1 index.
print('Highest pct of 1st correct attempt:\n', tags_df.loc[tags_df['correct_at_first'].idxmax()])
print()
print('Lowest pct of 1st correct attempt:\n', tags_df.loc[tags_df['correct_at_first'].idxmin()])

## Write to file

In [None]:
# Persist the enriched question table (metrics plus tag indicator columns); index=False omits the row index.
questions_df.to_csv('question_features.csv', index=False)