In [None]:
import riiideducation
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import json

# make_env() can only be called once, so keep a reference to the returned
# environment object -- do not overwrite or delete `env`.
env = riiideducation.make_env()

In [None]:
# Per-column dtype names.
# NOTE(review): DTYPE is not referenced by any cell visible in this notebook.
DTYPE = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

# Columns passed as `usecols` when the training CSV is read.
USECOLS = ["user_id", "content_id", "answered_correctly"]

# Value passed as `chunksize` when the training CSV is read (rows per chunk).
CHUNKSIZE = 10_000_000

In [None]:
class StreamingStats:
    """Incrementally maintained weighted mean score per item.

    `stats` maps each item key to a dict with three entries:
    "ncount" (accumulated weight), "cum_score" (accumulated score) and
    "score" (cum_score / ncount, refreshed on every update).
    """

    def __init__(self):
        self.stats = {}

    def add_item(self, item, score, weight=1):
        """Fold a batch of observations for `item` into its running mean.

        Parameters
        ----------
        item : hashable
            Key under which the statistics are stored.
        score : number
            Sum of the scores in the batch being added.
        weight : number, default 1
            Number of observations the batch represents.
        """
        stat = self.stats.get(item)
        if stat is None:
            if not weight:
                # A first batch with zero weight has nothing to average and
                # score/weight would divide by zero; leave the item unknown.
                return
            self.stats[item] = {"ncount": weight, "cum_score": score, "score": score/weight}
            return

        stat["ncount"] += weight
        stat["cum_score"] += score
        stat["score"] = stat["cum_score"]/stat["ncount"]

    def mean(self, item):
        """Return the running mean for `item`, or 0.0 if it was never added."""
        stat = self.stats.get(item)
        if stat is None:
            return 0.
        return stat["score"]

    def __repr__(self):
        return "<StreamingStats of size {} >".format(len(self.stats))

In [None]:
# One independent running-mean tracker per key: users and contents.
user_stats, content_stats = StreamingStats(), StreamingStats()

# Display both (empty) trackers as the cell output.
(user_stats, content_stats)

In [None]:
# With `chunksize` set, read_csv returns a reader that yields DataFrames of up
# to CHUNKSIZE rows instead of loading the whole file at once.
# NOTE(review): the DTYPE mapping is not passed here, so pandas infers dtypes.
chunked_df = pd.read_csv(
    "../input/riiid-test-answer-prediction/train.csv",
    usecols=USECOLS,
    chunksize=CHUNKSIZE,
)
chunked_df

In [None]:
def _accumulate(chunk, key, tracker):
    """Fold the per-`key` answer counts and correct-answer sums of `chunk` into `tracker`."""
    grouped = chunk.groupby(key)["answered_correctly"].agg(["count", "sum"])
    # Zip the aggregated columns instead of using iterrows(), which builds a
    # Series object for every group row.
    for item, n_answers, n_correct in zip(grouped.index, grouped["count"], grouped["sum"]):
        tracker.add_item(item, n_correct, n_answers)


for df in tqdm(chunked_df):

    # Keep only rows whose answered_correctly is 0 or 1.
    df = df[df["answered_correctly"].isin([0,1])]

    _accumulate(df, "user_id", user_stats)
    _accumulate(df, "content_id", content_stats)

user_stats, content_stats

In [None]:
def scorer(u_score, c_score, beta=1.0, epsilon=1e-6):
    """Blend a user score and a content score into one value.

    Computes (1 + beta) * u * c / (epsilon + u + beta * c); `epsilon` is added
    to the denominator so it is not zero when both scores are zero.
    """
    numerator = (1+beta)*u_score*c_score
    denominator = epsilon + u_score + beta*c_score
    return numerator/denominator

In [None]:
def predict(user_id, content_id, beta=1.0):
    """Score one (user, content) pair from the accumulated statistics.

    Falls back step by step depending on what is known:
    neither key known -> 0.5; only the content known -> the content's mean;
    only the user known -> the user's mean; both known -> scorer() of the two.
    """
    user_entry = user_stats.stats.get(user_id)
    content_entry = content_stats.stats.get(content_id)

    if user_entry is None and content_entry is None:
        return 0.5
    if user_entry is None:
        return content_entry["score"]
    if content_entry is None:
        return user_entry["score"]
    return scorer(user_entry["score"], content_entry["score"], beta=beta)

In [None]:
# def predict(user_id, content_id, beta=1.0):
#     u_score = (user_stats.stats.get(user_id) or {"score": 0.5})["score"]
#     c_score = (content_stats.stats.get(content_id) or {"score": 0.5})["score"]
#     return scorer(u_score, c_score, beta=beta)


def predict_from_df(test_df, beta=1.0, update=True):
    """Predict `answered_correctly` for the question rows of one test batch.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Must contain `content_type_id`, `user_id` and `content_id`.
        `prior_group_answers_correct` is only read when `update` is true.
    beta : float, default 1.0
        Forwarded to predict().
    update : bool, default True
        When true, the non-empty `prior_group_answers_correct` values are
        folded into `user_stats` before the predictions are computed.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows with `content_type_id == 0` plus an
        `answered_correctly` column of predicted scores.
    """
    # Explicit copy: the column assignment below must write to an independent
    # frame, not to a slice of the caller's frame (avoids SettingWithCopyWarning).
    questions = test_df.loc[test_df['content_type_id'] == 0].copy()

    if update:
        prior_answers = questions["prior_group_answers_correct"]
        for user_id, answers_correct, notnull in zip(questions["user_id"], prior_answers,
                                                     prior_answers.notnull()):
            if not (notnull and answers_correct):
                continue
            answers_correct = json.loads(answers_correct.strip())
            if len(answers_correct):
                # NOTE(review): the whole parsed list is credited to the user of
                # the row that carries it; presumably the list covers every row
                # of the previous batch, not just this user -- confirm.
                user_stats.add_item(user_id, np.sum(answers_correct), weight=len(answers_correct))

    # Predictions are computed once, after any updates above have been applied.
    questions['answered_correctly'] = [predict(user_id, content_id, beta=beta)
                                       for user_id, content_id in zip(questions["user_id"], questions["content_id"])]
    return questions

In [None]:
# Passed as `update=` to predict_from_df(); False leaves user_stats untouched
# while predicting.
UPDATE = False

# Passed as `beta=` to predict_from_df() and from there to scorer().
BETA = 2.5

In [None]:
# Dry run on the bundled example file before using the competition iterator.
test_df = pd.read_csv("../input/riiid-test-answer-prediction/example_test.csv")
example_predictions = predict_from_df(test_df, beta=BETA, update=UPDATE)
example_predictions[["row_id", "user_id", "content_id", "answered_correctly"]]

In [None]:
# test_df

In [None]:
%%time

iter_test = env.iter_test()
for test_df, sample_prediction_df in iter_test:
    # Score the batch, then hand only the two submission columns to the env.
    batch_predictions = predict_from_df(test_df, beta=BETA, update=UPDATE)
    env.predict(batch_predictions[['row_id', 'answered_correctly']])