In [None]:
import os
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
from vowpalwabbit import pyvw
from datetime import datetime
from sklearn.metrics import roc_auc_score

In [None]:
# Show every file shipped under the Kaggle input directory, one path per line.
input_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

In [None]:
# Per-question accuracy, accumulated chunk by chunk to keep memory usage low.
# Only the two columns needed for the aggregation are parsed from train.csv.
ratings_parts = []
train_reader = pd.read_csv(
    "/kaggle/input/riiid-test-answer-prediction/train.csv",
    usecols=['content_id', 'answered_correctly'],
    chunksize=1000000,
)
for chunk in train_reader:
    # Rows with answered_correctly == -1 carry no answer; exclude them.
    answered = chunk.loc[chunk.answered_correctly != -1]
    answered = answered.assign(answered_correctly=answered.answered_correctly.astype('bool'))
    # Partial (count, sum) per question for this chunk only.
    ratings_parts.append(answered.groupby('content_id').answered_correctly.agg(['count', 'sum']))

# A question can appear in several chunks, so merge the partial counts by id.
question_ratings = pd.concat(ratings_parts).groupby(level=0).agg('sum')

# Overall share of correct answers across all questions.
answered_correctly_global_mean = question_ratings['sum'].sum() / question_ratings['count'].sum()

# Keep only the per-question mean; downstream cells join on this single column.
question_ratings = (question_ratings['sum'] / question_ratings['count']).to_frame('questions_mean')
print(question_ratings)

In [None]:
def _question_to_vw(row):
    """Render one question row as the text of the VW `q_params` namespace."""
    return f'|q_params mean:{row["questions_mean"]} part:{row["part"]} {row["tags"]}'


# Question metadata enriched with the per-question accuracy computed above.
questions_meta = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/questions.csv',
    index_col='question_id',
).join(question_ratings)

questions_vw = questions_meta.apply(_question_to_vw, axis=1)
print(questions_vw)

# question_id -> ready-made VW feature text.
questions_dict = questions_vw.to_dict()

In [None]:
def _lecture_to_vw(row):
    """Render one lecture row as the text of the VW `l_params` namespace."""
    return f'|l_params tag:{row["tag"]} part:{row["part"]} type_of:{row["type_of"]}'


lectures_meta = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/lectures.csv',
    index_col='lecture_id',
)

lectures_vw = lectures_meta.apply(_lecture_to_vw, axis=1)
print(lectures_vw)

# lecture_id -> ready-made VW feature text.
lectures_dict = lectures_vw.to_dict()

In [None]:
class vv_user_class:
    """Running per-user state used to build Vowpal Wabbit example strings.

    Keeps the history of answered questions and seen lectures, separate
    counters for both, and the user's running share of correct answers.
    Relies on the module-level `questions_dict`, `lectures_dict` and
    `answered_correctly_global_mean`.
    """

    def __init__(self):
        # Lecture history: raw ids plus the same ids as a space-separated string.
        self.lectures = []
        self.lectures_str = ''
        self.lectures_tags_str = ''  # not touched by any method of this class

        # Question history; only `questions_str` is filled by update_question.
        self.questions = []
        self.questions_str = ''

        self.q_counter = 0  # number of questions answered so far
        self.l_counter = 0  # number of lectures seen so far
        self.answered_correctly_sum = 0
        # Before the first answer, fall back to the global share of correct answers.
        self.answered_correctly_mean = answered_correctly_global_mean

    def get_features(self, user_id, content_id, content_type_id, timestamp, prior_question_elapsed_time, prior_question_had_explanation):
        """Build the VW feature string for the question `content_id`.

        The string combines the user's question and lecture history, the
        parameters of the most recent lecture (if any), the question's own
        parameters and the scalar features f1, f2, t, lc, qc and um.
        `user_id` and `content_type_id` are accepted but not used here.

        Raises:
            KeyError: if `content_id` is missing from `questions_dict`.
        """
        # Parameters of the last lecture seen; empty when the user has seen none.
        l_last = lectures_dict[self.lectures[-1]] if len(self.lectures) > 0 else ''
        result = f'|q_hist {self.questions_str} |l_hist {self.lectures_str} {l_last} {questions_dict[content_id]} f1:{prior_question_elapsed_time} f2:{prior_question_had_explanation} t:{timestamp} lc:{self.l_counter} qc:{self.q_counter} um:{self.answered_correctly_mean}'
        return result

    def update_question(self, answered_correctly, user_id, content_id, content_type_id, timestamp, prior_question_elapsed_time, prior_question_had_explanation):
        """Record the outcome of one answered question.

        Only `answered_correctly` and `content_id` are used; the remaining
        parameters are accepted for signature parity with `get_features`.
        """
        self.q_counter += 1
        self.answered_correctly_sum += answered_correctly
        # Running mean is recomputed from scratch; accumulating it with `+=`
        # would make the value grow with every answer.
        self.answered_correctly_mean = self.answered_correctly_sum / self.q_counter
        # History entry: weight 1 for a correct answer, -0.1 for a wrong one.
        self.questions_str += f' {content_id}:{1 if answered_correctly else -0.1}'

    def update_lecture(self, answered_correctly, user_id, content_id, content_type_id, timestamp):
        """Record that the user has seen lecture `content_id`.

        Only `content_id` is used; the remaining parameters are ignored.
        """
        # Lectures have their own counter so they do not inflate the question
        # count that serves as the denominator of the running mean.
        self.l_counter += 1
        self.lectures.append(content_id)
        self.lectures_str += f' {content_id}'

class vw_generator_str_class:
    """Online Vowpal Wabbit model fed with per-user feature strings.

    Holds a single VW model and a lazily populated mapping from user id to
    that user's `vv_user_class` state.
    """

    def __init__(self):
        self.model = pyvw.vw(
            '--l1 1e-7 --l2 1e-7 -b 19 --learning_rate 6',
            random_seed=17,
            loss_function='logistic',
            link='logistic',
            quiet=True,
        )
        # A user's state is created on first access.
        self.users = defaultdict(vv_user_class)

    def train(self, train_df):
        """Learn from every question row of `train_df` and update user state.

        Lecture rows (content_type_id != 0) only update the user's state.
        """
        rows = zip(
            train_df.user_id,
            train_df.content_id,
            train_df.content_type_id,
            train_df.answered_correctly,
            train_df.timestamp,
            train_df.prior_question_elapsed_time,
            train_df.prior_question_had_explanation,
        )
        for uid, cid, ctype, label, ts, elapsed, explained in rows:
            state = self.users[uid]

            if ctype != 0:
                state.update_lecture(label, uid, cid, ctype, ts)
                continue

            features = state.get_features(uid, cid, ctype, ts, elapsed, explained)
            # The label is mapped to -1 / 1 before being handed to VW.
            self.model.learn(str(label * 2 - 1) + ' ' + features)
            state.update_question(label, uid, cid, ctype, ts, elapsed, explained)

    def predict_than_train(self, train_df):
        """Predict each question row first, then learn from it.

        Returns a list with one entry per row of `train_df`: the model
        prediction for question rows and -1 for lecture rows.
        """
        predictions = []
        rows = zip(
            train_df.user_id,
            train_df.content_id,
            train_df.content_type_id,
            train_df.answered_correctly,
            train_df.timestamp,
            train_df.prior_question_elapsed_time,
            train_df.prior_question_had_explanation,
        )
        for uid, cid, ctype, label, ts, elapsed, explained in rows:
            state = self.users[uid]

            if ctype != 0:
                predictions.append(-1)
                state.update_lecture(label, uid, cid, ctype, ts)
                continue

            # The user state is unchanged between predicting and learning,
            # so a single feature string serves both calls.
            features = state.get_features(uid, cid, ctype, ts, elapsed, explained)
            predictions.append(self.model.predict(features))
            self.model.learn(str(label * 2 - 1) + ' ' + features)
            state.update_question(label, uid, cid, ctype, ts, elapsed, explained)
        return predictions

    def predict(self, test_df):
        """Predict every row of `test_df` without learning or updating state.

        Returns a list with one entry per row: the model prediction for
        question rows and -1 for lecture rows.
        """
        predictions = []
        rows = zip(
            test_df.user_id,
            test_df.content_id,
            test_df.content_type_id,
            test_df.timestamp,
            test_df.prior_question_elapsed_time,
            test_df.prior_question_had_explanation,
        )
        for uid, cid, ctype, ts, elapsed, explained in rows:
            state = self.users[uid]

            if ctype != 0:
                predictions.append(-1)
                continue

            features = state.get_features(uid, cid, ctype, ts, elapsed, explained)
            predictions.append(self.model.predict(features))
        return predictions

# Train the model by chunks

In [None]:
# Stream train.csv through the model in chunks. Every 10th chunk is scored
# before it is learned from, which gives a progressive validation AUC.
vw_generator = vw_generator_str_class()
train_reader = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/train.csv", chunksize=500000)
for chunk_idx, chunk in enumerate(train_reader):
    start_time = datetime.now()

    # Assign the cleaned columns back explicitly: `chunk.col.fillna(..., inplace=True)`
    # is a chained in-place update, which warns on recent pandas and has no
    # effect on the frame under Copy-on-Write.
    chunk['answered_correctly'] = chunk['answered_correctly'].astype('bool')
    chunk['prior_question_elapsed_time'] = chunk['prior_question_elapsed_time'].fillna(0)
    chunk['prior_question_had_explanation'] = chunk['prior_question_had_explanation'].fillna(True)

    if chunk_idx % 10 == 0:
        chunk['answered_correctly_pred'] = vw_generator.predict_than_train(chunk)
        # Lecture rows get a placeholder prediction of -1; score questions only.
        questions_only = chunk[chunk.content_type_id == 0]
        print(f'chunk {chunk_idx} done duration={datetime.now() - start_time} auc={roc_auc_score(questions_only.answered_correctly, questions_only.answered_correctly_pred)}')
    else:
        vw_generator.train(chunk)
        print(f'chunk {chunk_idx} done duration={datetime.now() - start_time}')

# Prediction

In [None]:
import riiideducation
env = riiideducation.make_env()

# The previous batch is kept so the model can learn from it once its answers
# arrive with the next batch.
test_df_prev = pd.DataFrame()
iter_test = env.iter_test()

start_time = datetime.now()
for (test_df, sample_prediction_df) in iter_test:
    iter_start_time = datetime.now()

    # Update model from previous iteration
    if len(test_df_prev) != 0:
        # The answers for the previous batch are read from the first row of the
        # current batch, encoded as a bracketed, comma-separated string.
        raw_answers = test_df.iloc[0].prior_group_answers_correct.strip('[]').split(',')
        test_df_prev['answered_correctly'] = np.array([int(token.strip()) for token in raw_answers])

        vw_generator.train(test_df_prev)

    # Apply the same missing-value handling as the training loop so the model
    # never sees `nan` for features it was trained on with 0 / True.
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(0)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(True)

    # Prediction for current iteration
    test_df['answered_correctly'] = vw_generator.predict(test_df)
    # Only question rows are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

    test_df_prev = test_df.copy()
    print('Duration iter: {}'.format(datetime.now() - iter_start_time))

print('Duration total: {}'.format(datetime.now() - start_time))