## Source Kernel
This kernel generates and submits predictions using the model and features developed in the kernel titled [RIIID: BigQuery-XGBoost End-to-End](https://www.kaggle.com/calebeverett/riiid-bigquery-xgboost-end-to-end).

In [None]:
import gc
import json
import pandas as pd
from pathlib import Path
import sqlite3
import riiideducation
import time
import xgboost as xgb
import numpy as np

In [None]:
# Create the competition environment and the iterator that the prediction
# loop later consumes batch by batch.
env = riiideducation.make_env()
iter_test = env.iter_test()

## Load Model

In [None]:
# Directory holding the saved booster, the feature-column list and the
# pickled state frames read by the cells below.
PATH = Path('../input/riiid-submission')

In [None]:
# Load the XGBoost booster stored as model.xgb in the submission dataset.
model_xgb = xgb.Booster(model_file=PATH/'model.xgb')
print('model loaded')

## Load State

In [None]:
# Column name -> pandas/numpy dtype string. Subsets of this mapping are used
# to build `dtypes_test` and to cast the columns kept from each API batch.
dtypes = {
    'answered_correctly': 'int8',
    'answered_correctly_content_id_cumsum': 'int16',
    'answered_correctly_content_id_cumsum_pct': 'int16',
    'answered_correctly_cumsum': 'int16',
    'answered_correctly_cumsum_pct': 'int8',
    'answered_correctly_cumsum_upto': 'int8',
    'answered_correctly_rollsum': 'int8',
    'answered_correctly_rollsum_pct': 'int8',
    'answered_incorrectly': 'int8',
    'answered_incorrectly_content_id_cumsum': 'int16',
    'answered_incorrectly_cumsum': 'int16',
    'answered_incorrectly_rollsum': 'int8',
    'bundle_id': 'uint16',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'correct_answer': 'uint8',
    'lecture_id': 'uint16',
    'lectures_cumcount': 'int16',
    'part': 'uint8',
    'part_correct_pct': 'uint8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_elapsed_time_rollavg': 'float32',
    'prior_question_had_explanation': 'bool',
    'question_id': 'uint16',
    'question_id_correct_pct': 'uint8',
    'row_id': 'int64',
    'tag': 'uint8',
    'tag__0': 'uint8',
    'tag__0_correct_pct': 'uint8',
    'tags': 'str',
    'task_container_id': 'int16',
    'task_container_id_orig': 'int16',
    'timestamp': 'int64',
    'type_of': 'str',
    'user_answer': 'int8',
    'user_id': 'int32'
}

# Batch columns that may be forwarded to the state query; only those that
# also appear in `test_cols` are actually used (see `batch_cols` below).
batch_cols_all = [
    'user_id',
    'content_id',
    'row_id',
    'task_container_id',
    'timestamp',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]

# Columns kept from each batch so the state tables can be updated once the
# answers for that batch arrive with the following one.
batch_cols_prior = ['user_id', 'content_id', 'content_type_id']

# Column list passed to the XGBoost model at prediction time.
test_cols = json.loads((PATH/'columns.json').read_text())

# The three key columns always come first, followed by whichever batch
# columns the model consumes.
# NOTE(review): if columns.json ever lists one of the key columns it would
# appear twice here -- confirm that it never does.
batch_cols = ['user_id', 'content_id', 'row_id']
batch_cols.extend(c for c in batch_cols_all if c in test_cols)

print('test_cols:', *test_cols, sep='\n')

# dtypes of the model columns, plus the two join keys.
dtypes_test = {name: dtypes[name] for name in dtypes if name in test_cols}
dtypes_test.update(user_id='int32', content_id='int16')

### Load Users-Content

In [None]:
# Load the pickled answer counts (later indexed uniquely by user_id and
# content_id) and preview the first rows.
df_users_content = pd.read_pickle(PATH/'df_users_content.pkl')
df_users_content.head()

### Create Users Dataframe

In [None]:
# Sum the correct/incorrect counts per user and store them compactly.
df_users = (
    df_users_content
    .groupby('user_id', as_index=False)[['answered_correctly', 'answered_incorrectly']]
    .sum()
    .astype({'user_id': 'int32', 'answered_correctly': 'int16', 'answered_incorrectly': 'int16'})
)
df_users.head()

### Load Questions
Question-related features are joined with the batches received from the competition API prior to making predictions.

In [None]:
# Load the pickled question-level feature table and preview the first rows.
df_questions = pd.read_pickle(PATH/'df_questions.pkl')
df_questions.head()

## Create Database

In [None]:
# In-memory SQLite database that holds the lookup/state tables during
# inference; `cursor` is used for the per-batch update scripts.
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()

### Create Users-Content Table

In [None]:
%%time

# Append df_users_content to the `users_content` table in slices of
# `chunk_size` rows, then index it and release the DataFrame.
chunk_size = 20000
total = len(df_users_content)
n_chunks = total // chunk_size + 1

for i in range(n_chunks):
    # The slice is passed straight through so that no module-level name keeps
    # a reference to part of the frame after the `del` below.
    df_users_content.iloc[i * chunk_size:(i + 1) * chunk_size].to_sql(
        'users_content', conn, method='multi', if_exists='append', index=False
    )

conn.execute('CREATE UNIQUE INDEX users_content_index ON users_content (user_id, content_id)')
del df_users_content
gc.collect()

In [None]:
%%time
# Read back a few rows to confirm the users_content table was populated.
pd.read_sql('SELECT * from users_content LIMIT 5', conn)

### Create Users Table

In [None]:
%%time

# Append df_users to the `users` table in slices of `chunk_size` rows, then
# index it by user_id and release the DataFrame.
chunk_size = 20000
total = len(df_users)
n_chunks = total // chunk_size + 1

for i in range(n_chunks):
    # The slice is passed straight through so that no module-level name keeps
    # a reference to part of the frame after the `del` below.
    df_users.iloc[i * chunk_size:(i + 1) * chunk_size].to_sql(
        'users', conn, method='multi', if_exists='append', index=False
    )

_ = conn.execute('CREATE UNIQUE INDEX users_index ON users (user_id)')
del df_users
gc.collect()

In [None]:
%%time
# Read back a few rows to confirm the users table was populated.
pd.read_sql('SELECT * from users LIMIT 5', conn)

### Create Questions Table

In [None]:
%%time

# Question columns copied into SQLite; `select_state` also reads this list to
# build its SELECT clause, so it must stay defined at module level.
q_cols = [
    'question_id',
    'part',
    'tag__0',
    'part_correct_pct',
    'tag__0_correct_pct',
    'question_id_correct_pct'
]

# Write the selected columns to the `questions` table, index it by
# question_id, then release the DataFrame.
df_questions[q_cols].to_sql('questions', conn, method='multi', index=False)
_ = conn.execute('CREATE UNIQUE INDEX question_id_index ON questions (question_id)')
del df_questions
gc.collect()

In [None]:
%%time
# Read back a few rows to confirm the questions table was populated.
pd.read_sql('SELECT * from questions LIMIT 5', conn)

In [None]:
# Database size in bytes, computed as page count times page size via the
# SQLite pragma table-valued functions; printed in GB.
db_size = pd.read_sql('SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()', conn)['size'][0]
print(f'Total size of database is: {db_size/1e9:0.3f} GB')

In [None]:
import sys

# Print every name in the current namespace whose size, as reported by
# sys.getsizeof, exceeds 10 MB. The namespace is snapshotted first because
# the loop itself binds new names while it runs.
local_vars = list(locals().items())
for var, obj in local_vars:
    size = sys.getsizeof(obj)
    if size <= 1e7:
        continue
    print(f'{var:<18}{size/1e6:>10,.1f} MB')

## Predict

### Get State

In [None]:
def select_state(batch_cols, records):
    """Build the SQL query that joins a batch with the stored state.

    The batch is inlined as a ``VALUES`` common table expression ``b`` and
    left-joined against the ``users``, ``users_content`` and ``questions``
    tables. The question columns come from the module-level ``q_cols`` list.

    Args:
        batch_cols: Column names for the CTE, in the same order as the fields
            of each record.
        records: Re-iterable sequence of row records. ``str(record)`` must
            render as a SQL tuple; field 0 is used as the user id and field 1
            as the content id when building the WHERE filters.

    Returns:
        The query text.
    """
    cte_columns = ', '.join(batch_cols)
    cte_rows = ', '.join(str(rec) for rec in records)
    batch_select = ', '.join(f'b.{col}' for col in batch_cols)
    question_select = ', '.join(q_cols)
    # Restrict each state sub-select to the users / (user, content) pairs
    # present in this batch.
    user_filter = ' OR '.join(f'user_id = {rec[0]}' for rec in records)
    pair_filter = ' OR '.join(
        f'(user_id = {rec[0]} AND content_id = {rec[1]})' for rec in records
    )
    return f"""
        WITH b ({cte_columns}) AS (
        VALUES {cte_rows}
        )
        SELECT
            {batch_select},
            IFNULL(answered_correctly_cumsum, 0) answered_correctly_cumsum, 
            IFNULL(answered_incorrectly_cumsum, 0) answered_incorrectly_cumsum,
            IIF(
                (answered_correctly_cumsum + answered_incorrectly_cumsum) > 0,
                answered_correctly_cumsum * 100 / (answered_correctly_cumsum + answered_incorrectly_cumsum),
                0
            ) answered_correctly_cumsum_pct,
            IFNULL(answered_correctly_content_id_cumsum, 0) answered_correctly_content_id_cumsum,
            IFNULL(answered_incorrectly_content_id_cumsum, 0) answered_incorrectly_content_id_cumsum,
            {question_select}
        FROM b
        LEFT JOIN (
            SELECT user_id, answered_correctly answered_correctly_cumsum,
                answered_incorrectly answered_incorrectly_cumsum
            FROM users
            WHERE {user_filter}
        ) u ON (u.user_id = b.user_id)
        LEFT JOIN (
            SELECT user_id, content_id, answered_correctly answered_correctly_content_id_cumsum, 
            answered_incorrectly answered_incorrectly_content_id_cumsum
            FROM users_content uc
            WHERE {pair_filter}
        ) uc ON (uc.user_id = b.user_id AND uc.content_id = b.content_id)
        LEFT JOIN (
            SELECT {question_select}
            FROM questions
        ) q ON (q.question_id = b.content_id)
    """

### Update State

In [None]:
def update_state(df):
    """Build the SQL script that folds graded answers into the state tables.

    Each row contributes one upsert into ``users_content`` (keyed by user and
    content) and one into ``users`` (keyed by user). On a key conflict the
    stored counters are incremented by the row's values.

    Args:
        df: Frame with ``user_id``, ``content_id`` and ``answered_correctly``
            columns. ``answered_incorrectly`` is derived as
            ``1 - answered_correctly``.

    Returns:
        A script suitable for ``cursor.executescript``. For an empty frame an
        empty string is returned, because a ``VALUES`` clause with no rows is
        not valid SQL and an empty script is a harmless no-op.
    """
    if df.empty:
        return ''

    values_uc = []
    values_u = []
    rows = zip(
        df['user_id'].tolist(),
        df['content_id'].tolist(),
        df['answered_correctly'].tolist(),
    )
    for user_id, content_id, correct in rows:
        incorrect = 1 - correct
        values_uc.append(f'({user_id}, {content_id}, {correct}, {incorrect})')
        values_u.append(f'({user_id}, {correct}, {incorrect})')

    return f"""
        INSERT INTO users_content(user_id, content_id, answered_correctly, answered_incorrectly)
        VALUES {(',').join(values_uc)}
        ON CONFLICT(user_id, content_id) DO UPDATE SET
            answered_correctly = answered_correctly + excluded.answered_correctly,
            answered_incorrectly = answered_incorrectly + excluded.answered_incorrectly;

        INSERT INTO users(user_id, answered_correctly, answered_incorrectly)
        VALUES {(',').join(values_u)}
        ON CONFLICT(user_id) DO UPDATE SET
            answered_correctly = answered_correctly + excluded.answered_correctly,
            answered_incorrectly = answered_incorrectly + excluded.answered_incorrectly;
    """

In [None]:
def data_transform(df,FEATURE_COLS, is_training = True, is_validation = True):
    """Attach question/user statistics to a batch and keep the question rows.

    Reads the module-level ``Question_df`` and ``user_df`` frames and, when a
    target is requested, the module-level ``TARGET_COL`` list.

    Args:
        df: Batch with at least ``content_id``, ``user_id`` and
            ``content_type_id`` columns. It is not modified.
        FEATURE_COLS: Columns to return. Earlier revisions ignored this
            argument and always used the global ``FEATURE_COLS1``.
        is_training: When true, ``TARGET_COL`` is appended to the output.
        is_validation: Same effect as ``is_training``.

    Returns:
        A frame restricted to rows whose ``content_type_id`` equals 0 and to
        the selected columns.
    """
    # Both joins match the batch column against the *index* of the lookup
    # frame. NOTE(review): presumably those indexes hold question ids and
    # user ids respectively -- confirm against how the CSVs are loaded.
    df = df.join(Question_df['question_average'],on=['content_id'],rsuffix='_question_average')
    df = df.join(user_df[['performance','user_average', 'user_count']],on=['user_id'],rsuffix='_right')

    df['is_beginning'] = df['user_count'] < 20
    # Running count of earlier rows in this batch with the same
    # (user_id, content_id) pair.
    df['attemp'] = df.groupby(['user_id','content_id']).cumcount().astype(np.uint8)

    df = df.loc[df['content_type_id'] == 0]

    selected = list(FEATURE_COLS)
    if is_training or is_validation:
        selected = selected + TARGET_COL
    return df[selected]
def build_user_df(prior_df,user_df,question_df):
    """Fold the graded previous batch into the running per-user statistics.

    Args:
        prior_df: Previous batch with ``content_type_id``, ``user_id``,
            ``content_id`` and ``answered_correctly_response`` columns.
        user_df: Running statistics. It must already contain ``user_sum``,
            ``user_count``, ``user_average`` and ``performance``. The joins
            align on its index -- presumably user ids; verify against how the
            frame is loaded.
        question_df: Per-question statistics, forwarded to
            ``question_average_sum_by_user``.

    Returns:
        ``user_df`` itself when ``prior_df`` has no rows, otherwise a new
        frame with ``user_sum``, ``user_count``, ``user_average`` and
        ``performance`` refreshed.
    """
    if prior_df.shape[0] == 0:
        return user_df

    # Only rows with content_type_id == 0 carry a graded answer.
    by_user = prior_df.loc[prior_df['content_type_id'] == 0].groupby('user_id')
    user_sum_prior = by_user['answered_correctly_response'].sum().to_frame('user_sum')
    user_count_prior = by_user.size().to_frame('user_count')

    # The outer join keeps users seen for the first time in this batch; the
    # batch totals land in the *_previous columns because the base names
    # already exist in user_df.
    user_df = user_df.join(user_sum_prior,how='outer',rsuffix='_previous').join(user_count_prior,rsuffix='_previous')

    # Assign the filled columns back instead of calling
    # `df[col].fillna(..., inplace=True)`: that chained form is deprecated
    # and has no effect on the frame under pandas Copy-on-Write.
    fill_cols = [
        'performance',
        'user_average',
        'user_count',
        'user_sum',
        'user_count_previous',
        'user_sum_previous',
    ]
    user_df[fill_cols] = user_df[fill_cols].fillna(0)

    # Sum of question averages over the questions each user saw in the batch.
    user_df = user_df.join(question_average_sum_by_user(prior_df,question_df))
    user_df['question_average_sum'] = user_df['question_average_sum'].fillna(0)

    # Update the running values; user_mean_performance must be computed
    # before user_sum and user_count are overwritten.
    user_df['user_mean_performance'] = (user_df['user_sum'] - user_df['performance'] * user_df['user_count'] + user_df['question_average_sum']) / (user_df['user_count'] + user_df['user_count_previous'])
    user_df['user_sum'] = user_df['user_sum'] + user_df['user_sum_previous']
    user_df['user_count'] = user_df['user_count'] + user_df['user_count_previous']
    user_df['user_average'] = user_df['user_sum'] / user_df['user_count']
    user_df['performance'] = user_df['user_average'] - user_df['user_mean_performance']

    return user_df.drop(columns=['user_sum_previous','user_count_previous','question_average_sum','user_mean_performance'])

def build_question_df(prior_df,question_df):
    """Fold the graded previous batch into the running per-question statistics.

    Args:
        prior_df: Previous batch with ``content_type_id``, ``content_id`` and
            ``answered_correctly_response`` columns.
        question_df: Running statistics indexed by content id. It must
            already contain ``question_sum``, ``question_count`` and
            ``question_average``. It is not modified.

    Returns:
        ``question_df`` itself when ``prior_df`` has no rows, otherwise a new
        frame with the three statistics refreshed.
    """
    if prior_df.shape[0] == 0:
        return question_df

    # Only rows with content_type_id == 0 carry a graded answer.
    by_question = prior_df.loc[prior_df['content_type_id'] == 0].groupby('content_id')
    question_sum_prior = by_question['answered_correctly_response'].sum().to_frame('question_sum')
    question_count_prior = by_question.size().to_frame('question_count')

    # The batch totals land in the *_previous columns because the base names
    # already exist in question_df.
    question_df = question_df.join(question_sum_prior,rsuffix='_previous').join(question_count_prior,rsuffix='_previous')

    # Assign the filled columns back instead of calling
    # `df[col].fillna(..., inplace=True)`: that chained form is deprecated
    # and has no effect on the frame under pandas Copy-on-Write.
    fill_cols = [
        'question_average',
        'question_count',
        'question_sum',
        'question_sum_previous',
        'question_count_previous',
    ]
    question_df[fill_cols] = question_df[fill_cols].fillna(0)

    question_df['question_sum'] = question_df['question_sum'] + question_df['question_sum_previous']
    question_df['question_count'] = question_df['question_count'] + question_df['question_count_previous']
    question_df['question_average'] = question_df['question_sum'] / question_df['question_count']

    return question_df.drop(columns=['question_count_previous','question_sum_previous'])
def add_answers_to_prior_df(current_df,prior_df):
    """Return a copy of ``prior_df`` with the graded answers attached.

    The answers for the previous batch are read from the first row of
    ``current_df['prior_group_answers_correct']``, a string holding a list
    literal.

    Args:
        current_df: Current batch; only the first value of
            ``prior_group_answers_correct`` is read.
        prior_df: Previous batch. It is never modified.

    Returns:
        A copy of ``prior_df``. The ``answered_correctly_response`` column is
        added only when ``prior_df`` is non-empty and the number of parsed
        answers equals its row count.

    Raises:
        ValueError, SyntaxError: If the string is not a plain Python literal.
    """
    import ast

    prior_df_ = prior_df.copy()
    if prior_df.shape[0] > 0:
        raw_answers = current_df['prior_group_answers_correct'].iloc[0]
        # SECURITY: this string comes from outside the notebook. It was
        # previously passed to eval(); literal_eval only accepts literals
        # and cannot execute code.
        val = ast.literal_eval(raw_answers)
        if len(val) == prior_df.shape[0]:
            prior_df_['answered_correctly_response'] = val
    return prior_df_
def question_average_sum_by_user(df,question_df):
    """Sum, per user, the average score of the questions seen in ``df``.

    Args:
        df: Batch with ``user_id``, ``content_type_id`` and ``content_id``
            columns.
        question_df: Frame indexed by content id with a ``question_average``
            column.

    Returns:
        A frame indexed by user id with a single ``question_average_sum``
        column. Users that only have rows with a non-zero
        ``content_type_id`` get ``0.0``.

    Raises:
        KeyError: If a question's content id is missing from ``question_df``.
    """
    my_dict = {}
    # Group by the bare column name: grouping by a one-element list makes
    # recent pandas versions yield 1-tuples as keys, which would leave the
    # result indexed by tuples instead of user ids.
    for user, rows in df.groupby('user_id'):
        question_ids = rows.loc[rows['content_type_id'] == 0, 'content_id']
        average_sum = 0.0
        for question_id in question_ids:
            average_sum += question_df.at[question_id,'question_average']
        my_dict[user] = [average_sum]
    return pd.DataFrame.from_dict(my_dict,orient='index',columns=['question_average_sum'])
# Batch frames carried across iterations of the prediction loop; they start
# out empty.
prior_df = pd.DataFrame()
current_df = pd.DataFrame()
# Target column, kept as a list so it can be concatenated to feature lists.
TARGET_COL = ['answered_correctly']
# Columns selected for the first LightGBM model. The prediction loop drops
# the first one (row_id) before calling predict.
FEATURE_COLS1 = ['row_id', 'user_count','performance', 'question_average']
# Columns passed to the second LightGBM model.
FEATURE_COLS2 = ['answered_correctly_avg_u', 'answered_correctly_sum_u', 'count_u', 'answered_correctly_avg_c', 'part', 'prior_question_had_explanation', 'prior_question_elapsed_time']


In [None]:
#

In [None]:
# Running per-question and per-user statistics used by data_transform and
# updated by build_question_df / build_user_df after every batch.
# NOTE(review): both frames are later joined on their index, but read_csv is
# called without index_col -- confirm that the default positional index
# really lines up with content ids / user ids.
Question_df = pd.read_csv('../input/featuresriid/question.csv')
user_df = pd.read_csv('../input/featuresriid/user.csv')

In [None]:
import lightgbm as lgb
# Two LightGBM boosters: model_lgb1 is fed the FEATURE_COLS1 frame (minus its
# first column) and model_lgb2 the FEATURE_COLS2 columns in the loop below.
model_lgb1 = lgb.Booster(model_file='../input/featuresriid/model_twofeatures.txt')
model_lgb2 = lgb.Booster(model_file='../input/7features760/model.txt')

In [None]:

def add_user_feats_without_update(df, answered_correctly_sum_u_dict, count_u_dict):
    """Append per-user running totals to ``df`` without changing the totals.

    Args:
        df: Frame with a ``user_id`` column. The new columns are attached by
            positional concatenation, so it should carry a default
            ``RangeIndex``.
        answered_correctly_sum_u_dict: Mapping user id -> correct answers.
        count_u_dict: Mapping user id -> answered questions.

    Returns:
        ``df`` with ``answered_correctly_sum_u``, ``count_u`` (both int32)
        and ``answered_correctly_avg_u`` (their ratio) added.
    """
    user_ids = df['user_id'].values
    correct_totals = np.array(
        [answered_correctly_sum_u_dict[uid] for uid in user_ids], dtype=np.int32
    )
    answer_counts = np.array([count_u_dict[uid] for uid in user_ids], dtype=np.int32)

    user_feats_df = pd.DataFrame(
        {'answered_correctly_sum_u': correct_totals, 'count_u': answer_counts}
    ).assign(
        answered_correctly_avg_u=lambda feats: feats['answered_correctly_sum_u'] / feats['count_u']
    )
    return pd.concat([df, user_feats_df], axis=1)

def update_user_feats(df, answered_correctly_sum_u_dict, count_u_dict):
    """Add the graded answers in ``df`` to the per-user running totals.

    Only rows whose ``content_type_id`` equals 0 are counted. Both mappings
    are updated in place; nothing is returned.
    """
    rows = df[['user_id','answered_correctly','content_type_id']].values
    question_rows = rows[rows[:, 2] == 0]
    for user_id, correct, _ in question_rows:
        answered_correctly_sum_u_dict[user_id] += correct
        count_u_dict[user_id] += 1

In [None]:
%%time
import ast
from collections import defaultdict

df_batch_prior = None
counter = 0
iter_nb = 0
TARGET = 'answered_correctly'

# Lookup tables merged into every batch for the second LightGBM model. They
# never change, so they are read once instead of once per batch.
content_df = pd.read_csv('../input/7fetscontent/content.csv')
qUestion_df = pd.read_csv('../input/7fetscontent/question.csv')

# Running per-user totals for the second LightGBM model. They must outlive a
# single iteration: when they were recreated for every batch, the updates
# made by update_user_feats were discarded immediately.
answered_correctly_sum_u_dict = defaultdict(int)
count_u_dict = defaultdict(int)

for test_batch in iter_test:
    counter += 1

    (current_df, sample_prediction_df) = test_batch
    raw_batch = test_batch[0]

    if (iter_nb != 0):
        # Answers for the previous batch arrive as a list literal in the
        # first row of the current one. literal_eval parses it without
        # executing arbitrary code (this used to be eval()).
        answers = ast.literal_eval(raw_batch['prior_group_answers_correct'].iloc[0])

        # Refresh the pandas-side statistics used by the first LightGBM model.
        prior_df = add_answers_to_prior_df(current_df,prior_df)
        Question_df = build_question_df(prior_df,Question_df)
        user_df = build_user_df(prior_df,user_df,Question_df)

        # Refresh the SQLite state used by the XGBoost model.
        df_batch_prior['answered_correctly'] = answers
        cursor.executescript(update_state(df_batch_prior[df_batch_prior.content_type_id == 0]))

        # Refresh the per-user totals used by the second LightGBM model.
        previous_test_df[TARGET] = answers
        update_user_feats(previous_test_df, answered_correctly_sum_u_dict, count_u_dict)

        if not counter % 100:
            conn.commit()

    # --- model 1: LightGBM on the FEATURE_COLS1 statistics -------------------
    prior_df = current_df.copy()
    current_df = data_transform(current_df,FEATURE_COLS1,False,False)
    # The first selected column is row_id, which is not a model feature.
    predictions0 = model_lgb1.predict(current_df.iloc[:,1:])
    iter_nb = 1
    print('--')

    # --- model 2: LightGBM on the FEATURE_COLS2 features ---------------------
    # NOTE(review): predictions1 is computed but not part of the blend below.
    previous_test_df = raw_batch.copy()
    test_df = raw_batch[raw_batch['content_type_id'] == 0].reset_index(drop=True)
    test_df = add_user_feats_without_update(test_df, answered_correctly_sum_u_dict, count_u_dict)
    test_df = pd.merge(test_df, content_df, on='content_id',  how="left")
    test_df = pd.merge(test_df, qUestion_df, left_on='content_id', right_on='question_id', how='left')
    test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(False).astype('int8')
    # NOTE(review): the filled values go into a *_mean column, while
    # FEATURE_COLS2 selects the unfilled prior_question_elapsed_time --
    # confirm which one the model was trained on.
    test_df['prior_question_elapsed_time_mean'] = test_df.prior_question_elapsed_time.fillna(25439.41)
    predictions1 =  model_lgb2.predict(test_df[FEATURE_COLS2])

    # --- model 3: XGBoost on the SQLite-backed state -------------------------
    # Save this batch so the state can be updated once its answers arrive.
    df_batch_prior = raw_batch[batch_cols_prior].astype({k: dtypes[k] for k in batch_cols_prior})

    # NOTE(review): a batch without any question rows produces an empty
    # VALUES clause in select_state; that case is not handled here.
    df_batch = raw_batch[raw_batch.content_type_id == 0]
    records = df_batch[batch_cols].fillna(0).to_records(index=False)
    df_batch = pd.read_sql(select_state(batch_cols, records), conn)

    predictions2 = model_xgb.predict(xgb.DMatrix(df_batch[test_cols]))

    # Blend. NOTE(review): this assumes the SQL result keeps the row order of
    # the batch so that predictions0 and predictions2 line up -- confirm.
    df_batch['answered_correctly'] = predictions0*0.4+predictions2*0.6

    # Submit the predictions for this batch.
    env.predict(df_batch[['row_id', 'answered_correctly']])

In [None]:
# Display the model-1 feature frame left over from the last batch.
current_df

In [None]:
# Display the three models' predictions for the last batch.
predictions2,predictions1,predictions0