In [None]:
import psutil
import joblib
import random
from tqdm import tqdm

import numpy as np
import gc
import pandas as pd
import time

from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
def feature_time_lag(df, time_dict):
    """Add a ``time_lag`` column holding the time since the user's previous task container.

    Rows are processed in the order they appear in ``df``; the lag of a row is
    its ``timestamp`` minus the timestamp stored for the same user the last
    time a *different* ``task_container_id`` was seen. Rows that share the
    ``task_container_id`` of the stored state reuse the stored lag.
    NOTE(review): this presumes each user's rows are already in chronological
    order — confirm against the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user_id``, ``timestamp`` and
        ``task_container_id``. It is modified in place (a ``time_lag`` int64
        column is added or overwritten).
    time_dict : dict
        Per-user running state, updated in place so it can be carried across
        calls. Maps ``user_id`` to ``(timestamp, task_container_id, lag_time)``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``time_lag`` column set.
    """
    lags = np.zeros(len(df), dtype=np.int64)

    rows = df[['user_id', 'timestamp', 'task_container_id']].values
    for ind, row in enumerate(rows):
        user_id, timestamp, container_id = row[0], row[1], row[2]
        state = time_dict.get(user_id)

        if state is None:
            # First row seen for this user: lag is 0, stored lag is the -1 sentinel.
            # NOTE(review): later rows in this same container therefore get -1
            # while this row gets 0 — confirm that asymmetry is intended.
            time_dict[user_id] = (timestamp, container_id, -1)
            lags[ind] = 0
            continue

        last_timestamp = state[0]
        last_container_id = state[1]
        last_lag = state[2]

        if container_id - last_container_id == 0:
            # Same task container as the stored state: reuse its lag and
            # leave the stored state untouched.
            lags[ind] = last_lag
        else:
            lags[ind] = timestamp - last_timestamp
            time_dict[user_id] = (timestamp, container_id, lags[ind])

    df["time_lag"] = lags
    return df


In [None]:
# Run `nvidia-smi` in a shell (IPython `!` escape) to report the GPU status for this session.
!nvidia-smi

In [None]:
# Sequence / model hyper-parameters.
MAX_SEQ = 100  # number of most-recent interactions kept per user when the per-user sequences are built
D_MODEL = 256  # presumably the model's hidden/embedding width — not used in the visible cells, TODO confirm
N_LAYER = 2  # presumably the number of model layers — not used in the visible cells, TODO confirm
BATCH_SIZE = 256  # presumably the DataLoader batch size — not used in the visible cells, TODO confirm

In [None]:
%%time
import pickle
# Load the pre-built training frame from a pickle file.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open("../input/saint-plus-data-new/train_df.pkl","rb") as f:
    train_df = pickle.load(f)

In [None]:
# Question metadata; its `question_id`, `part` and `bundle_id` columns are joined onto train_df further down.
question = pd.read_csv("../input/riiid-test-answer-prediction/questions.csv")

In [None]:
# Preview the first rows of the question metadata.
question.head()

In [None]:
# Preview the first rows of the training frame as loaded from the pickle.
train_df.head()

In [None]:
%%time
time_dict = dict()
train_df = feature_time_lag(train_df, time_dict)
# del time_dict

In [None]:
# Preview the first rows again now that the `time_lag` column has been added.
train_df.head()

In [None]:
# Keep only the columns used by the remaining preprocessing steps
# (this drops `task_container_id`, which was only needed for the time lag).
train_df = train_df[[
    "timestamp",
    "user_id",
    "content_id",
    "content_type_id",
    "answered_correctly",
    "prior_question_elapsed_time",
    "prior_question_had_explanation",
    "time_lag",
]]

In [None]:
# Keep only rows whose content_type_id is 0
# (presumably question interactions, with other values being lectures — TODO confirm).
train_df = train_df[train_df["content_type_id"] == 0]
# train_df = train_df.sort_values(['timestamp'], ascending=True).reset_index(drop=True)

In [None]:
# Replace missing prior-question features: elapsed time becomes 0,
# and the explanation flag becomes False before being cast to 0/1 integers.
train_df["prior_question_elapsed_time"] = train_df["prior_question_elapsed_time"].fillna(0)
train_df["prior_question_had_explanation"] = (
    train_df["prior_question_had_explanation"]
    .fillna(value=False)
    .astype(int)
)

In [None]:
# Attach each question's `part` and `bundle_id` by matching content_id to question_id.
# A left join keeps every training row; the `question_id` key column is carried along too.
train_df = train_df.merge(
    question[["question_id", "part", "bundle_id"]],
    how="left",
    left_on="content_id",
    right_on="question_id",
)

In [None]:
# Number of distinct content_id values remaining in the training frame.
len(train_df.content_id.unique())

In [None]:
# Distribution of the 0/1 explanation flag after the fillna/astype(int) step.
train_df.prior_question_had_explanation.value_counts()

In [None]:
## Drop outlier users with more than 9000 rows (currently disabled)
# temp_df = train_df.groupby("user_id").count().reset_index()
# outlier = temp_df[(temp_df.timestamp>9000)].user_id.tolist()
# train_df = train_df[~train_df.user_id.isin(outlier)]

In [None]:
# Each distinct content_id is treated as one "skill".
skills = train_df["content_id"].unique()
n_skill = len(skills)
print("number skills", n_skill)

In [None]:
# Number of distinct `part` values. NOTE(review): a NaN left by the left join
# (content_id with no matching question) would presumably be counted as a value too — confirm.
n_part = len(train_df["part"].unique())

In [None]:
# Display the number of distinct parts.
n_part

In [None]:
# Display the fully preprocessed training frame.
train_df

In [None]:
# For every user, collect a tuple of arrays holding the trailing MAX_SEQ values of each
# feature column, in this order: content_id, answered_correctly, part,
# prior_question_elapsed_time, time_lag, prior_question_had_explanation, bundle_id.
# NOTE(review): "trailing" follows the frame's current row order — presumably chronological per user; confirm.
train_group = (
    train_df[[
        'user_id',
        'content_id',
        'answered_correctly',
        'part',
        'prior_question_elapsed_time',
        'time_lag',
        'prior_question_had_explanation',
        'bundle_id',
    ]]
    .groupby('user_id')
    .apply(lambda user_rows: tuple(
        user_rows[col].values[-MAX_SEQ:]
        for col in (
            'content_id',
            'answered_correctly',
            'part',
            'prior_question_elapsed_time',
            'time_lag',
            'prior_question_had_explanation',
            'bundle_id',
        )
    ))
)


In [None]:
# del train_df
# gc.collect()

In [None]:
# Persist the per-user sequences for later use.
# NOTE(review): joblib picks a compressor from the file extension only for its own
# known extensions ('.z', '.gz', '.bz2', '.xz', '.lzma'); '.zip' is presumably not one of
# them, so this file is likely written uncompressed and is not a zip archive — confirm.
joblib.dump(train_group,"./group.pkl.zip")