# Repetitions in Riiid

Many users took the same tests multiple times.

Below you can see the effect of repetitions for a single bundle.

In [None]:
import gc
import logging
import os
import pickle
import re
from collections import defaultdict

import numpy as np
import pandas as pd
import psutil
import seaborn as sns
from pylab import plt, plot, hist, legend, array, arange, zeros, ones, sqrt, where, cm
from tqdm import tqdm
# tqdm.pandas()

_ = np.seterr(divide='ignore', invalid='ignore')

In [None]:
# Folder that holds the competition CSV files.
PATH = '../input/riiid-test-answer-prediction/'

# Narrow per-column dtypes, keyed by column name.
col_dtype = dict(
    row_id='int64',
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    task_container_id='int16',
    user_answer='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='boolean',
)

# Seconds per day.
SECPERD = 60 * 60 * 24

# Name of the target column.
TARGET = 'answered_correctly'

In [None]:
# Question metadata table; read with the C parser.
Q = pd.read_csv(
    PATH + 'questions.csv',
    engine='c',
)

In [None]:
# Load the training interactions.
# The narrow dtypes from `col_dtype` are passed explicitly: without them every
# integer column is parsed as int64 and the table needs several times more RAM.
D = pd.read_csv(PATH+'train.csv', engine='c', dtype=col_dtype#, nrows=700_000
               )

Let's select questions that belong to bundles for a stereotype check.

In [None]:
# Number of questions in every bundle.
QN = Q['bundle_id'].value_counts()

# First 20 bundles that consist of exactly 5 questions.
QN.loc[QN.eq(5)].head(20)

## Typical trajectory for 1 bundle

In [None]:
qid = 7780      # a question that belongs to the bundle under study
N_TAIL = 1_000  # plot only the last rows of the selection to keep the figure readable

# The cell reports incomplete groups through `logger`; define it here so the
# warning does not fail with a NameError.
logger = logging.getLogger(__name__)

bid = Q.loc[Q.question_id==qid,'bundle_id'].iloc[0]
z = Q.loc[Q.question_id==qid,'part'].iloc[0]

print(D[D.content_id==qid].content_type_id.value_counts(dropna=False))

# Take question ids and correct answers from the same rows of the questions
# table, so they stay aligned and do not depend on `qid` being the first
# question of a contiguous id range.
bundle = Q.loc[Q.bundle_id==bid].sort_values('question_id')
aa = bundle['correct_answer']
qii = bundle['question_id'].values
nb = len(aa)

# Question rows (content_type_id == 0) that belong to the bundle.
X = D[(D.content_type_id==0) & (D.content_id.isin(qii))]

# One group per (user, task container), i.e. one attempt at the bundle.
uagg = X.iloc[-N_TAIL:].groupby(['user_id','task_container_id'])

fig, ax = plt.subplots()

for (uid, tcid), xb in uagg:
    if len(xb) != nb:
        logger.warning(f'? {len(xb)} of {nb} records (u={uid},tc={tcid}) ')
    co = plt.cm.jet(uid%256, alpha=.05)
    # Sort a copy instead of mutating the groupby slice in place.
    xb = xb.sort_values('content_id')
    bbcorr = (xb.answered_correctly.values > 0)
    answers = xb.user_answer.values
    # Position of every answered question inside the bundle (1-based), so that
    # incomplete attempts are drawn under the questions they actually contain.
    xs = np.searchsorted(qii, xb.content_id.values) + 1
    ax.plot(xs[~bbcorr], answers[~bbcorr], color=co, marker='x', lw=0)
    ax.plot(xs[bbcorr], answers[bbcorr], color=co, marker='o', lw=0)
    ax.plot(xs, answers, color=co, marker=None, lw=5)

# Reference positions are derived from the bundle size, not from whatever the
# last loop iteration left behind.
ii = arange(nb) + 1
ax.plot(ii, aa.values, marker='o', ms=30, mec='k', mfc='None', lw=0)
ax.set_xticks(ii)
ax.set_xticklabels(qii)
ax.set_frame_on(False)
ax.set_title(f'{len(uagg)} answers on bundle {bid}, part {z}');

Not all users have data for the complete bundle: they skipped some questions or the data are fragmented.

(This is true for the whole dataset.)

In [None]:
# Group the bundle rows per (user, task container); rows are ordered by
# content_id first so that every group lists its questions in id order.
uagg = (
    X.sort_values(by='content_id')
     .groupby(by=['user_id', 'task_container_id'], as_index=True)
)

# Number of recorded rows in every group.
utn = uagg['row_id'].count()
utn

## Repeats

In [None]:
# Number of task containers (attempts at the bundle) per user.
containers_by_user = utn.reset_index().groupby('user_id')
u_ntc = containers_by_user['task_container_id'].count()

# Users with more than one attempt.
u_ntc.loc[u_ntc.gt(1)]

In [None]:
# Largest number of task containers recorded for a single user on this bundle.
u_ntc.max()

Up to 8 repetitions!

An interesting question: is `task_container_id` related to the order of training for every user?

In [None]:
# How many (user, container) groups fall on every task_container_id value.
tcpo = (
    utn.reset_index()['task_container_id']
       .value_counts()
       .sort_index()
)

# Rolling sum over 50 consecutive container ids (at least 1 observation).
tcpo.rolling(window=50, min_periods=1).sum().plot();

Can timestamp differ for a single bundle?

In [None]:
# Earliest and latest timestamp inside every (user, container) group.
mm_utc = uagg['timestamp'].agg(['min', 'max'])

# True when no group spans more than one timestamp.
np.all(mm_utc['min'].eq(mm_utc['max']))

No! All questions of a bundle share a single timestamp!

So we don't know the order of the questions ... maybe they are randomised, but we assume they are ordered by increasing question_id.

Scores (number of correct answers) by user and task_container ...

In [None]:
# Sum of `answered_correctly` inside every (user, container) group.
b_utc = uagg.answered_correctly.sum()
b_utc

In [None]:
nrep = 6  # number of attempts (columns) to keep

# Scores of users who have more than 2 attempts, one row per attempt,
# in (user_id, task_container_id) order.
repeat_users = u_ntc[u_ntc > 2].index
attempt_scores = b_utc.loc[repeat_users].droplevel(1).reset_index()
bagg = attempt_scores.groupby('user_id')['answered_correctly']

# Number the attempts of every user (0, 1, 2, ...) and spread them into columns.
# This replaces `pd.concat([bagg.nth(i) ...], 1)`: the positional axis is no
# longer accepted by recent pandas, and `nth` no longer returns results indexed
# by the group key there, which would break the column alignment.
attempt_scores['attempt'] = bagg.cumcount()
brep = (
    attempt_scores.loc[attempt_scores['attempt'] < nrep]
    .pivot(index='user_id', columns='attempt', values='answered_correctly')
    .reindex(columns=range(nrep))  # keep all nrep columns even if some are empty
)
brep.columns = range(nrep)
brep

In [None]:
# Mean score per attempt, with the standard deviation as error bar.
attempt_axis = arange(nrep)
score_mean = brep.mean()
score_std = brep.std()
plt.errorbar(attempt_axis, score_mean, yerr=score_std)
plt.xlabel('Attempt')
plt.ylabel(f'Q of {nb}')
plt.title(f'Progress for bundle {bid}');

For poor starters only (fewer than 3 correct answers on the first attempt) ...

In [None]:
# Keep only the users whose first attempt scored below 3.
brep_ = brep.loc[brep[0].lt(3)]

# Mean score per attempt, with the standard deviation as error bar.
poor_mean = brep_.mean()
poor_std = brep_.std()
plt.errorbar(arange(nrep), poor_mean, yerr=poor_std)
plt.xlabel('Attempt')
plt.ylabel(f'Q of {nb}')
plt.title(f'Progress for bundle {bid}');

## Always dummy?

Do some users always click on the same answer option?

In [None]:
def _bundle_answers(grp):
    """Return the answers of one (user, container) group as a row indexed by question id.

    Groups with fewer than `nb` rows yield None.
    """
    if len(grp) < nb:
        return None
    return pd.Series(grp['user_answer'].values, index=qii, name=grp.index[0])


# One row of user answers per group, one column per question of the bundle.
uaa = uagg.apply(_bundle_answers)
uaa

In [None]:
# Number of answer rows.
n = len(uaa.index)

# Rows in which every answer equals the correct one.
all_right = (uaa == aa.values).all(axis=1)
ncorr = int(all_right.sum())

ncorr/n

Only 22% of the complete bundle attempts (user × task container) were answered without errors!

In [None]:
# Share of correct answers for every question of the bundle.
# Compare against `aa.values` (positional), as done when counting error-free
# rows: comparing with the Series itself aligns on labels and only works
# while the row index of Q happens to coincide with question_id.
((uaa == aa.values).sum() / n).plot(kind='barh');
plt.xlim(0,1);
# The axis shows a 0..1 fraction, so the title says "fraction" rather than "%".
plt.title('Fraction of correct answers per question');

Random guess answers (probability=0.25) are included!

But also partial knowledge...

Absolute dummies are those who click the same variant.

In [None]:
# Trajectory in which option 0 is chosen for every question.
traj = np.zeros(uaa.shape[1])

# Rows that follow this trajectory exactly.
same_option = (uaa == traj).all(axis=1)
uaa[same_option]

There were such dummy users!

Let's count percentage...

In [None]:
# For every answer option 0..3: share of rows that pick that option for all questions.
{
    i: f'{(uaa == i).all(axis=1).sum() / n:.2%}'
    for i in range(4)
}

The second variant is even more popular!

... or maybe the options are randomised by the test presentation software?

Koreans who took TOEIC, write in comments!

## Conclusion

You can trace repetition and give higher predictions for repeated measures or craft an additional feature for machine learning!

You can mark those users or sessions as dummy style and predict 0.25, the probability of a correct blind choice among 4 options.

But you must repeat this analysis for every question!