In [None]:
import numpy as np 
import pandas as pd
import psutil
from time import time
from contextlib import contextmanager
from tqdm.notebook import tqdm

In [None]:
# Columns used by this notebook, mapped to the compact dtype each is cast to.
dtypes = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'answered_correctly': 'int8',
}
# Column selection follows the dtype mapping's key order.
train_cols = list(dtypes)

# Load the training fold, keep only the needed columns and downcast them.
train_df = (
    pd.read_pickle('../input/cv-strategy/cv4_train.pickle')[train_cols]
    .astype(dtypes)
)

# Keep only the rows whose content_type_id is 0 (the column is int8 after the cast).
train_df = train_df[train_df['content_type_id'] == 0]

In [None]:
%%time
# Timing variant A: select both columns, group the frame by user_id and pull the
# content_id values out of each per-user sub-frame.
group = (
    train_df[['user_id', 'content_id']]
    .groupby('user_id')
    .apply(lambda user_rows: user_rows['content_id'].values)
)

In [None]:
%%time
# Timing variant B: group first, select the content_id column from the groupby
# object, and take each per-user Series' underlying values.
group = (
    train_df
    .groupby('user_id')['content_id']
    .apply(lambda content_ids: content_ids.values)
)

In [None]:
# Validation frame that is replayed batch by batch through Iter_Valid below.
# NOTE(review): this reads fold "cv2" while the training frame above is read from
# "cv4_train" -- confirm that mixing folds is intentional.
test_df = pd.read_pickle('../input/cv-strategy/cv2_valid.pickle')

In [None]:
class Iter_Valid(object):
    '''
    Local emulator of a time-series ``iter_test`` API: walks a frame in row order
    and hands it out in consecutive batches.

    Adapted from
    https://www.kaggle.com/its7171/time-series-api-iter-test-emulator

    Each iteration step returns ``(df, sample_df)``:

    * ``df`` -- a copy of the next consecutive slice of the input rows. On its first
      row, ``prior_group_responses`` / ``prior_group_answers_correct`` hold the
      ``user_answer`` / ``answered_correctly`` values of the *previous* batch as a
      string of the form ``"[v1,v2,...]"`` (``"[]"`` for the first batch and on all
      other rows).
    * ``sample_df`` -- ``row_id`` plus a zero-filled ``answered_correctly`` column for
      exactly those rows of ``df`` whose ``content_type_id`` is 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``row_id``, ``user_id``, ``task_container_id``,
        ``content_type_id``, ``user_answer`` and ``answered_correctly``.
        It is not modified; the emulator works on a re-indexed copy.
    max_user : int, default 1000
        Maximum number of distinct users whose ``content_type_id == 0`` rows may
        start inside one batch.

    Raises
    ------
    ValueError
        If ``max_user`` is 0 (no row could ever be consumed, so iteration would
        never terminate).
    '''
    def __init__(self, df, max_user=1000):
        if max_user == 0:
            raise ValueError('max_user must not be 0: no batch could ever make progress')
        # reset_index returns a new frame, so the columns added below never touch the
        # caller's frame; it also makes index labels equal to row positions.
        df = df.reset_index(drop=True)
        self.df = df
        self.user_answer = df['user_answer'].astype(str).values
        self.answered_correctly = df['answered_correctly'].astype(str).values
        df['prior_group_responses'] = "[]"
        df['prior_group_answers_correct'] = "[]"
        # Explicit copy: assigning a column to a chained-indexing result would raise
        # SettingWithCopyWarning. The index keeps the labels of the full frame.
        self.sample_df = df.loc[df['content_type_id'] == 0, ['row_id']].copy()
        self.sample_df['answered_correctly'] = 0
        self.len = len(df)
        self.user_id = df.user_id.values
        self.task_container_id = df.task_container_id.values
        self.content_type_id = df.content_type_id.values
        self.max_user = max_user
        self.current = 0
        self.pre_user_answer_list = []
        self.pre_answered_correctly_list = []

    def __iter__(self):
        return self

    def fix_df(self, user_answer_list, answered_correctly_list, pre_start):
        '''
        Build the ``(df, sample_df)`` pair for rows ``pre_start .. self.current - 1``.

        The previous batch's answers are written onto the first row of ``df``; the
        lists passed in are then remembered so the *next* batch can expose them.
        '''
        df = self.df[pre_start:self.current].copy()
        # sample_df has the content_type_id != 0 rows removed, so its positions do not
        # line up with positions in self.df. Slice by label instead (inclusive on both
        # ends, hence the - 1) to get exactly this batch's rows.
        sample_df = self.sample_df.loc[pre_start:self.current - 1].copy()
        df.loc[pre_start, 'prior_group_responses'] = '[' + ",".join(self.pre_user_answer_list) + ']'
        df.loc[pre_start, 'prior_group_answers_correct'] = '[' + ",".join(self.pre_answered_correctly_list) + ']'
        self.pre_user_answer_list = user_answer_list
        self.pre_answered_correctly_list = answered_correctly_list
        return df, sample_df

    def _consume_row(self, user_answer_list, answered_correctly_list):
        # Record the current row's answer fields for the next batch and step past it.
        user_answer_list.append(self.user_answer[self.current])
        answered_correctly_list.append(self.answered_correctly[self.current])
        self.current += 1

    def __next__(self):
        added_user = set()
        pre_start = self.current
        pre_added_user = -1
        pre_task_container_id = -1

        user_answer_list = []
        answered_correctly_list = []
        while self.current < self.len:
            crr_user_id = self.user_id[self.current]
            crr_task_container_id = self.task_container_id[self.current]
            crr_content_type_id = self.content_type_id[self.current]
            if crr_content_type_id == 1:
                # no more than one task_container_id of "questions" from any single user
                # so we only care for content_type_id == 0 to break loop
                self._consume_row(user_answer_list, answered_correctly_list)
                continue
            # True while we are still inside the task container of the row added last.
            same_container = (crr_user_id == pre_added_user
                              and crr_task_container_id == pre_task_container_id)
            if crr_user_id in added_user and not same_container:
                # known user (not the previous user, or a different task container)
                return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            if len(added_user) == self.max_user:
                if not same_container:
                    return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
                # Batch is full, but the open task container is still allowed to finish.
                self._consume_row(user_answer_list, answered_correctly_list)
                continue
            added_user.add(crr_user_id)
            pre_added_user = crr_user_id
            pre_task_container_id = crr_task_container_id
            self._consume_row(user_answer_list, answered_correctly_list)
        if pre_start < self.current:
            # Input exhausted: flush whatever was collected as the final batch.
            return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
        else:
            raise StopIteration()

In [None]:
# Per-batch prediction frames, collected in iteration order.
predicted = []


def set_predict(df):
    """Record one batch of predictions by appending ``df`` to ``predicted``."""
    predicted.append(df)


# Emulator over the validation frame, with max_user set to 1000.
iter_test = Iter_Valid(test_df, max_user=1000)

In [None]:
%%time
# Replay the validation frame batch by batch and, for every batch after the first,
# attach the answers revealed for the previous batch and group that batch's
# content_id values per user (DataFrame-level groupby/apply variant being timed).
with tqdm(total=len(test_df)) as pbar:
    previous_test_df = None
    for (current_test, current_prediction_df) in iter_test:
        # The bar's total is len(test_df), so advance it by every delivered row, not
        # only by the rows that survive the content_type_id filter further down.
        batch_size = len(current_test)
        if previous_test_df is not None:
            # NOTE(review): eval() executes arbitrary expressions. The strings here are
            # produced by Iter_Valid; if this loop is ever fed by an external source,
            # switch to a literal parser (e.g. ast.literal_eval / json.loads).
            answers = eval(current_test["prior_group_answers_correct"].iloc[0])
            responses = eval(current_test["prior_group_responses"].iloc[0])
            previous_test_df['answered_correctly'] = answers
            previous_test_df['user_answer'] = responses
            prev_group = (
                previous_test_df[['user_id', 'content_id']]
                .groupby('user_id')
                .apply(lambda r: r['content_id'].values)
            )

        previous_test_df = current_test.copy()
        # .copy() so the column assignment below writes to an independent frame
        # instead of a filtered slice (avoids SettingWithCopyWarning).
        current_test = current_test[current_test.content_type_id == 0].copy()
        # your prediction code here
        current_test['answered_correctly'] = 0.5
        set_predict(current_test.loc[:, ['row_id', 'answered_correctly']])
        pbar.update(batch_size)

In [None]:
# Reset the state for the second timing run: an empty prediction list and a fresh
# emulator. set_predict is redefined here so this cell does not rely on an earlier
# cell for it.
predicted = []


def set_predict(df):
    """Record one batch of predictions by appending ``df`` to ``predicted``."""
    predicted.append(df)


iter_test = Iter_Valid(test_df, max_user=1000)

In [None]:
%%time
# Replay the validation frame batch by batch and, for every batch after the first,
# attach the answers revealed for the previous batch and group that batch's
# content_id values per user (column-selected groupby/apply variant being timed).
with tqdm(total=len(test_df)) as pbar:
    previous_test_df = None
    for (current_test, current_prediction_df) in iter_test:
        # The bar's total is len(test_df), so advance it by every delivered row, not
        # only by the rows that survive the content_type_id filter further down.
        batch_size = len(current_test)
        if previous_test_df is not None:
            # NOTE(review): eval() executes arbitrary expressions. The strings here are
            # produced by Iter_Valid; if this loop is ever fed by an external source,
            # switch to a literal parser (e.g. ast.literal_eval / json.loads).
            answers = eval(current_test["prior_group_answers_correct"].iloc[0])
            responses = eval(current_test["prior_group_responses"].iloc[0])
            previous_test_df['answered_correctly'] = answers
            previous_test_df['user_answer'] = responses
            prev_group = (
                previous_test_df[['user_id', 'content_id']]
                .groupby('user_id')['content_id']
                .apply(lambda r: r.values)
            )

        previous_test_df = current_test.copy()
        # .copy() so the column assignment below writes to an independent frame
        # instead of a filtered slice (avoids SettingWithCopyWarning).
        current_test = current_test[current_test.content_type_id == 0].copy()
        # your prediction code here
        current_test['answered_correctly'] = 0.5
        set_predict(current_test.loc[:, ['row_id', 'answered_correctly']])
        pbar.update(batch_size)