In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import time
import pickle
import gc
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
# Print the version of the TensorFlow module that was just imported.
print(tf.__version__)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path,
# in the same order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Columns that are dropped from both cross-validation pickles right after loading.
unused_cols = ["max_time_stamp", "rand_time_stamp", "row_id"]

cv_parts = [
    pd.read_pickle("../input/riiid-cross-validation-files/cv1_train.pickle").drop(unused_cols, axis=1),
    pd.read_pickle("../input/riiid-cross-validation-files/cv1_valid.pickle").drop(unused_cols, axis=1),
]

# Stack train + valid, order all rows by the "viretual_time_stamp" column (spelling is the
# column's real name in the pickles), then discard that column once the ordering is done.
train_df = (
    pd.concat(cv_parts, ignore_index=True)
    .sort_values(by=["viretual_time_stamp"])
    .drop("viretual_time_stamp", axis=1)
)

# Release the two source frames before moving on.
del cv_parts, unused_cols
gc.collect()
train_df.head()

In [None]:
def get_time_lag(df):
    """
    Add a ``time_lag`` column to ``df`` (in place) and return the per-user tracking state.

    Rows are walked in their current order. For each user the function remembers the
    timestamp and task_container_id of the last row that started a new container:
      * the first row seen for a user gets a lag of 0;
      * a row whose task_container_id equals the remembered one reuses the remembered lag;
      * any other row gets ``timestamp - remembered timestamp`` and becomes the new
        remembered row.

    The column written to ``df`` is the lag divided by 1000 and by 60 (milliseconds to
    minutes, assuming ``timestamp`` is in milliseconds) and clipped to [0, 1440], i.e. at
    most one day.

    Returns a dict mapping user_id -> [last_timestamp, last_task_container_id, last_lag],
    where last_lag is the raw (unconverted, unclipped) lag value.
    """
    user_state = {}
    lag_raw = np.zeros(len(df), dtype=np.float32)
    rows = df[["user_id", "timestamp", "task_container_id"]].values

    for pos, (user, stamp, container) in enumerate(rows):
        state = user_state.get(user)

        if state is None:
            # First interaction of this user: the lag stays at its initial 0.
            user_state[user] = [stamp, container, 0]
            continue

        if container == state[1]:
            # Same container as the remembered row: share its lag, keep the state as is.
            lag_raw[pos] = state[2]
            continue

        # A new container: measure the gap to the remembered row, then remember this one.
        lag_raw[pos] = stamp - state[0]
        state[0], state[1], state[2] = stamp, container, lag_raw[pos]

    df["time_lag"] = lag_raw / 1000 / 60  # convert to minutes
    df["time_lag"] = df["time_lag"].clip(lower=0, upper=1440)  # cap at 1440 minutes (one day)
    return user_state

In [None]:
def _dump_pickle(obj, path):
    """Serialize ``obj`` to ``path`` with ``pickle.dump``."""
    # NOTE: the file names used by pre_process end in ".zip", but nothing here compresses
    # the data - the files are plain pickles. The names are kept unchanged so that whatever
    # reads them keeps working.
    with open(path, 'wb') as pick:
        pickle.dump(obj, pick)


def _group_user_sequences(df, sequence_cols, seq_len=None):
    """
    Group ``df`` by ``user_id`` and pack each user's rows into a tuple of numpy arrays,
    one array per entry of ``sequence_cols`` (in that order).

    When ``seq_len`` is given only the last ``seq_len`` values of every array are kept.
    Returns the Series produced by ``groupby("user_id").apply`` (index: user_id).
    """
    window = slice(None) if seq_len is None else slice(-seq_len, None)
    return df[["user_id"] + sequence_cols].groupby("user_id").apply(
        lambda user_rows: tuple(user_rows[col].values[window] for col in sequence_cols)
    )


def pre_process(train_df, ques_path, row_start=30e6, num_rows=50e6, split_ratio=0.9, seq_len=100):
    """
    Build per-user feature sequences from the interaction log and pickle them to disk.

    Steps:
      1. keep only rows with ``content_type_id == 0``;
      2. clean ``prior_question_elapsed_time`` (NaN -> 0, /1000, clip to [0, 300]) and
         ``prior_question_had_explanation`` (NaN -> False, cast to int8);
      3. left-merge the ``part`` column from the questions CSV on content_id == question_id;
      4. compute ``time_lag`` via ``get_time_lag`` and drop ``timestamp``;
      5. add 1 to every categorical feature;
      6. write the last ``seq_len`` steps of every user (all rows) to "infer_groups.pkl.zip";
      7. take the positional window ``[row_start, row_start + num_rows)``, split it into
         train / validation by ``split_ratio`` and write the full per-user sequences of
         each part to "train_group.pkl.zip" and "val_group.pkl.zip".

    Parameters
    ----------
    train_df : pandas.DataFrame
        Interaction log, expected to be ordered in time already. Must contain the columns
        timestamp, user_id, content_id, content_type_id, task_container_id, user_answer,
        answered_correctly, prior_question_elapsed_time and prior_question_had_explanation.
        Its index is cast to uint32 in place (visible to the caller); no other change is
        made to the caller's frame.
    ques_path : str
        Path of a CSV file with at least the columns ``question_id`` and ``part``.
    row_start : int or float
        First row (position after filtering) of the window used for train/validation.
    num_rows : int or float
        Size of that window; ``-1`` means "as many rows as the filtered frame has".
    split_ratio : float
        Fraction of the window that goes to the training part; the rest is validation.
    seq_len : int
        Number of most recent steps per user kept in the inference groups.

    Returns
    -------
    None. Output is written to the current directory: "time_dict.pkl.zip",
    "infer_groups.pkl.zip", "train_group.pkl.zip" and "val_group.pkl.zip".
    """
    print("Start pre-process")
    t_s = time.time()

    # The uint32 index ends up as the "index" column created by reset_index() below.
    train_df.index = train_df.index.astype('uint32')

    # Keep only content_type_id == 0 rows (presumably the question rows - confirm against
    # the dataset description) and normalise the two "prior question" columns.
    train_df = train_df[train_df.content_type_id == 0].reset_index()
    # Assign the result instead of calling fillna(inplace=True) on the selected column:
    # the chained in-place form does not update the frame under pandas Copy-on-Write.
    train_df["prior_question_elapsed_time"] = train_df["prior_question_elapsed_time"].fillna(0)
    train_df["prior_question_elapsed_time"] /= 1000 # convert to sec
    train_df["prior_question_elapsed_time"] = train_df["prior_question_elapsed_time"].clip(0, 300)
    train_df["prior_question_had_explanation"] = train_df["prior_question_had_explanation"].fillna(False)
    train_df["prior_question_had_explanation"] = train_df["prior_question_had_explanation"].astype('int8')

    print("Start merge dataframe")
    # merge with question dataframe to get part feature
    ques_df = pd.read_csv(ques_path)[["question_id", "part"]]
    train_df = train_df.merge(ques_df, how='left', left_on='content_id', right_on='question_id')
    train_df.drop(["question_id"], axis=1, inplace=True)
    # Raises if a content_id had no match in the questions file (part would be NaN).
    train_df["part"] = train_df["part"].astype('uint8')
    print(train_df.head(10))
    print("Complete merge dataframe")
    print("====================")

    # get time_lag feature
    print("Start compute time_lag")
    time_dict = get_time_lag(train_df)
    _dump_pickle(time_dict, "time_dict.pkl.zip")
    print("Complete compute time_lag")
    print("====================")
    train_df.drop("timestamp", axis=1, inplace=True)

    # Shift every categorical feature by one so that its smallest value is no longer 0
    # (presumably to keep 0 free as a padding id - confirm against the model code).
    train_df["content_id"] += 1
    train_df["task_container_id"] += 1
    train_df["answered_correctly"] += 1
    train_df["prior_question_had_explanation"] += 1
    train_df["user_answer"] += 1

    # Order matters: it is the order of the arrays inside every pickled tuple.
    sequence_cols = ["content_id", "part", "task_container_id", "time_lag", "prior_question_elapsed_time",
                     "answered_correctly", "prior_question_had_explanation", "user_answer"]

    if num_rows == -1:
        num_rows = train_df.shape[0]

    print("Start Inference group")
    infer_groups = _group_user_sequences(train_df, sequence_cols, seq_len=seq_len)
    _dump_pickle(infer_groups, "infer_groups.pkl.zip")
    del infer_groups

    train_df = train_df.iloc[int(row_start):int(row_start+num_rows)]
    # Split on the number of rows actually selected. Using num_rows here would leave the
    # validation part undersized or empty whenever the window runs past the end of the frame.
    split_at = int(len(train_df)*split_ratio)
    val_df = train_df.iloc[split_at:]
    train_df = train_df.iloc[:split_at]

    print("Train dataframe shape after process ({}, {})/ Val dataframe shape after process({}, {})".format(train_df.shape[0], train_df.shape[1], val_df.shape[0], val_df.shape[1]))
    print("====================")

    # Check data balance
    num_new_user = val_df[~val_df["user_id"].isin(train_df["user_id"])]["user_id"].nunique()
    num_new_content = val_df[~val_df["content_id"].isin(train_df["content_id"])]["content_id"].nunique()
    train_content_id = train_df["content_id"].nunique()
    train_part = train_df["part"].nunique()
    # "- 1" undoes the +1 shift applied to answered_correctly above.
    train_correct = train_df["answered_correctly"].mean()-1
    val_correct = val_df["answered_correctly"].mean()-1
    print("Number of new users {}/ Number of new contents {}".format(num_new_user, num_new_content))
    print("Number of content_id {}/ Number of part {}".format(train_content_id, train_part))
    print("train correctness {:.3f}/val correctness {:.3f}".format(train_correct, val_correct))
    print("====================")

    print("Start train and Val grouping")
    train_group = _group_user_sequences(train_df, sequence_cols)
    _dump_pickle(train_group, "train_group.pkl.zip")
    del train_group, train_df

    val_group = _group_user_sequences(val_df, sequence_cols)
    _dump_pickle(val_group, "val_group.pkl.zip")
    print("Complete pre-process, execution time {:.2f} s".format(time.time()-t_s))

In [None]:
# Process the whole frame (row_start=0, num_rows=-1) with a 95% / 5% train/validation split.
pre_process(
    train_df,
    "../input/riiid-test-answer-prediction/questions.csv",
    row_start=0,
    num_rows=-1,
    split_ratio=0.95,
)