In [None]:
# Models
# --------
# RIIID PyTorch Transformer (SAINT-like)

# CV=0.7901, LB=0.795
# Trained Weights: fold1/stage2/snapshots/model_best.pt
# Remove weights to start training from scratch.

In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [None]:
import os, sys, random, gc, math, glob, time, pathlib
import numpy as np
import pandas as pd
import io, timeit, os, gc, pickle, psutil
from multiprocessing import Pool
from tqdm.notebook import tqdm
import joblib
from matplotlib import cm
from datetime import datetime, timedelta
import re, shutil
from multiprocessing import Pool, cpu_count
from sklearn.preprocessing import LabelEncoder, StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold, TimeSeriesSplit, KFold, GroupKFold, ShuffleSplit
from sklearn.cluster import KMeans
from sklearn import metrics
from joblib import Parallel, delayed
from functools import partial
from collections import OrderedDict, defaultdict
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 4000)
pd.options.display.float_format = '{:,.6f}'.format

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
DEFAULT_FIG_WIDTH = 28 # 20
sns.set_context("paper", font_scale=1.2) 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler, RandomSampler, SequentialSampler
from torchvision import transforms, utils
from torch.autograd import Variable
from torch.optim import Adam, SGD, AdamW

In [None]:
print('Python     : ' + sys.version.split('\n')[0])
print('Numpy      : ' + np.__version__)
print('Pandas     : ' + pd.__version__)
print('PyTorch    : ' + torch.__version__)

In [None]:
def seed_everything(s):
    """Seed Python, NumPy and PyTorch random generators with ``s``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic,
    non-benchmarking mode so repeated runs pick the same kernels.

    Args:
        s: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(s)

    # CPU-side generators first, then the current CUDA device.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(s)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Multi-GPU: seed every visible device as well.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(s)

seed = 2020
seed_everything(seed)
RANDOM = np.random.RandomState(seed)

In [None]:
#DEVICE = "cpu"
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(DEVICE))

In [None]:
ROW_ID = "row_id"

CONTENT_TYPE_ID = "content_type_id" # False = Question, True = Lecture
CONTENT_ID = "content_id" # Question/Lecture id
ANSWERED_CORRECTLY = "answered_correctly" # -1 for lecture
USER_ID = "user_id" # User unique identifier (around 400k users)
TASK_CONTAINER_ID = "task_container_id"
TIMESTAMP = "timestamp" # the time between this user interaction and the first event from that user
USER_ANSWER = "user_answer" # Can be compared with correct_answer
PRIOR_QUESTION_ELAPSED_TIME = "prior_question_elapsed_time" # the time is the total time a user took to solve all the questions in the previous bundle.
PRIOR_QUESTION_HAD_EXPLANATION = "prior_question_had_explanation"

# Questions
QUESTION_ID = "question_id" # foreign key for the train/test content_id column, when the content type is question
BUNDLE_ID = "bundle_id" # code for which questions are served together
CORRECT_ANSWER = "correct_answer" # the answer to the question. Can be compared with the train user_answer column to check if the user was right.
PART = "part" # top level category code for the question
TAGS = "tags" # one or more detailed tag codes for the question. The meaning of the tags will not be provided, but these codes are sufficient for clustering the questions together
TAGS_COMBO = "tags_combo"

# Lectures
TAG = "tag"
LECTURE_ID = "lecture_id"
TYPE_OF = "type_of"

SECTION = "section"
RANK = "rank"
LAG = "lag"

TARGET = ANSWERED_CORRECTLY

In [None]:
FEATURES = [CONTENT_ID, TARGET, PART, PRIOR_QUESTION_ELAPSED_TIME, LAG, PRIOR_QUESTION_HAD_EXPLANATION]

# Sampler
MAX_INTERACTIONS_CAP = 2*50000

MIN_INTERACTIONS_TRAIN = 3
MAX_INTERACTIONS_TRAIN = MAX_INTERACTIONS_CAP
MIN_INTERACTIONS_VALID = 3
MAX_INTERACTIONS_VALID = MAX_INTERACTIONS_CAP

PIVOT_TS = False
GROUPED = True
LECTURES_ENABLED = False 
DUMP = False # True (on first run to dump features)
TRAIN_SPLIT_LEN = 288
META = [] # [ROW_ID]

In [None]:
class MaskedBCEWithLogitsLoss(nn.Module):
    """Loss that ignores positions flagged in an optional mask.

    With ``num_classes > 1`` the wrapped criterion is ``nn.CrossEntropyLoss``
    and inputs are used as given (expected already flattened); otherwise it is
    ``nn.BCEWithLogitsLoss`` and input, target and mask are flattened to 1-D.
    Mask entries that compare equal to ``False`` are the ones that are KEPT.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes
        # Input: (N, C) Target: (N)  0 <= target[i] <= C-1 for the multi-class criterion
        self.loss_ce = nn.CrossEntropyLoss() if self.num_classes > 1 else nn.BCEWithLogitsLoss()

    def forward(self, input, target, mask=None):
        """Return the criterion value over the non-masked positions."""
        multiclass = self.num_classes > 1

        # Multi-class tensors arrive flattened; the binary case flattens here.
        logits = input if multiclass else input.reshape(-1)
        labels = target if multiclass else target.reshape(-1)

        if mask is not None:
            flat_mask = mask if multiclass else mask.reshape(-1)
            keep = flat_mask == False
            logits, labels = logits[keep], labels[keep]

        return self.loss_ce(logits, labels)

In [None]:
class raw_conf:
    """Static configuration for the SAINT-like transformer: input encoding, model, optimisation and runtime.

    Every setting is a class attribute evaluated once, top to bottom, when this
    class statement runs, so later attributes may depend on earlier ones and on
    the module-level FEATURES / LECTURES_ENABLED / DEVICE values.
    """

    mtype = "SAINT"
    backbone = "transformer" 

    # Padding strategy read by RIIIDDataset.get_sample: "random" borrows another
    # sequence, "token" uses dedicated pad ids, anything else pads with zeros.
    pad_mode = "token" # "random" # "reflect" # "token"
    # True: padding is appended after the real interactions; False: prepended.
    pad_right = True
    # NOTE(review): presumably forwarded as RIIIDDataset(flatten=...) - confirm at the call site.
    flatten = True
    sampler = None # "prob" # "random"

    # Every sample is padded or cropped to this many interactions.
    seq_len = 256 # 100
    embedding_dim = 256 # embed_dim must be divisible by num_heads
    # Vocabulary sizes; RIIIDDataset also uses them to derive its pad token ids.
    exercices_id_size = 13523 if LECTURES_ENABLED is False else (13523 + 418) # LECTURES do not help
    exercices_part_size = 7 if PART in FEATURES else None
    response_size = 3 if LECTURES_ENABLED is True else 2 
    # Number of quantiles requested from pd.qcut in cleanup() for elapsed time.
    elapsed_time_max_size = 300 if PRIOR_QUESTION_ELAPSED_TIME in FEATURES else None
    # Must equal len(elapsed_time_bins_) - asserted when the features are dumped.
    elapsed_time_size = 73 if PRIOR_QUESTION_ELAPSED_TIME in FEATURES else None
    elapsed_time_cat = True # False
    # True: lag is quantile-binned in cleanup(); False: lag is standard-scaled.
    lag_time_cat = True
    # Number of quantiles requested from pd.qcut in cleanup() for lag.
    lag_time_max_size = 1440 if LAG in FEATURES else None
    # Must equal len(lag_bins_) - asserted when the features are dumped.
    lag_time_size = 366 if lag_time_cat is True else 0.0
    explanation_size = 2 if PRIOR_QUESTION_HAD_EXPLANATION in FEATURES else None
    position_encoding_enabled = True
    
    # Model
    nhead = 8 # 4
    num_encoder_layers = 4
    num_decoder_layers = 4
    dim_feedforward = 2048
    dropout = 0.1
    activation = None
    num_classes = 1
    # A single criterion instance is created here, at class-definition time.
    loss = MaskedBCEWithLogitsLoss(num_classes)
    post_activation = "sigmoid" if num_classes == 1 else "softmax"
    post_activation_dim = 1 if post_activation == "sigmoid" else 2
    custom_encoder = True # False # It removes one BatchNorm
    custom_decoder = True # False # It removes one BatchNorm
    
    optimizer = "Adam" # "Noam" for stage1 # "Adam" for stage2
    # Noam warm-up parameters (also used by the schedule preview cell below the config).
    warm_up_step_count = 10*1000
    warm_up_scale = 1.5
    # Scheduler and learning rates are derived from the optimizer name above.
    scheduler = "Cosine" if optimizer == "Adam" or optimizer == "SGD"  else None
    lr = 0.0001 if optimizer == "Adam" or optimizer == "Noam" else 0.05 # 0.0012 # 0.0003
    min_lr = 0.00005 if optimizer == "Adam" or optimizer == "Noam" else 0.03 # .000005 # 0.0001
    beta1 = 0.9
    train_verbose = True # False
    valid_verbose = True # False
    train_probs_threshold = 0.5
    METRIC_ = "max"

    # Runtime / loader settings.
    L_DEVICE = DEVICE
    WORKERS = 0 # 4 # 0
    BATCH_SIZE = 128
    ITERATIONS_LOGS = 50 # 1
    CYCLES = 3
    EPOCHS_PER_CYCLE = 12
    # Total epochs = cycles x epochs per cycle.
    EPOCHS = CYCLES * EPOCHS_PER_CYCLE

    pin_memory = True

# Instantiate the configuration and record where checkpoint tensors should be
# mapped: onto the GPU when CUDA is available, otherwise onto the CPU.
conf = raw_conf()
conf.map_location = (lambda storage, loc: storage.cuda()) if torch.cuda.is_available() else 'cpu'

In [None]:
# Preview of the warm-up learning-rate curve over the first 12k steps. The
# expression mirrors NoamOpt.rate() with factor=1: embedding_dim**-0.5 times the
# smaller of step**-0.5 / warm_up_scale and step * warm_up_step_count**-1.5.
step = np.arange(1, 12000)
lr = conf.embedding_dim**(-0.5) * np.minimum(step**(-0.5)/(conf.warm_up_scale), step*(conf.warm_up_step_count**(-1.5)))
# The results are bound to `d` only to keep the plot objects' repr out of the cell output.
d = plt.plot(lr)
# The title shows the peak learning rate reached by the schedule.
d = plt.title(max(lr))

In [None]:
def save_dict(tmp_dict, filename):
    """Pickle ``tmp_dict`` to ``filename``.

    Args:
        tmp_dict: any picklable object (not necessarily a dict).
        filename: destination path; an existing file is overwritten.
    """
    # The context manager closes (and therefore flushes) the file deterministically.
    with open(filename, 'wb') as fh:
        pickle.dump(tmp_dict, fh)

def load_dict(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: path of a pickle file, e.g. one written by ``save_dict``.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code; only load files you trust.
    """
    # pickle.load takes the open file as its only positional argument.
    with open(filename, 'rb') as fh:
        return pickle.load(fh)

In [None]:
HOME = "./"
DATA_HOME ="./data/" 
TRAIN_DATA_HOME = DATA_HOME

FEATURES_NAME = "features_saint_qcut_256"
TRAIN_FEATURES_PATH = HOME + FEATURES_NAME + "/"
TRAIN_FILE_LECTURES = TRAIN_DATA_HOME + "lectures.parquet"
TRAIN_FILE_QUESTIONS = TRAIN_DATA_HOME + "questions.parquet"

MODEL_NAME = "%s_%s_%d_%d_%s_%s_v13.4" % (conf.mtype, conf.backbone, conf.seq_len, conf.embedding_dim, conf.post_activation, "Q" if LECTURES_ENABLED is False else "QL")
MODEL_PATH = HOME + MODEL_NAME
STAGE = "stage3" # "stage1"
MODEL_BEST = 'model_best.pt'

if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)

TRAIN = True # True # True
RESUME_FOLD = 0
RESUME = True # False # True
PRETRAINED = None
PRETRAINED_STAGE = "stage2" # None #
FREEZE_BACKBONE = False # True
PRETRAINED_BACKBONE_STAGE = None

In [None]:
# Load data
train_pd = None
FOLD = 1
if not os.path.exists(TRAIN_FEATURES_PATH + "train_cleaned.pkl"):
    train_pd = pd.read_pickle(DATA_HOME + "cv%d_train.pickle" % FOLD)
    train_pd.head()

In [None]:
# Load question data
def load_questions():
    """Read the questions parquet file and return its lookup columns.

    Several helper columns (split tags, section, tag-combination id) are built
    along the way, but most are dropped again before returning: the result
    keeps the content id, the part and the category-coded first tag.
    """
    questions = pd.read_parquet(TRAIN_FILE_QUESTIONS)
    questions[PART] = questions[PART].astype(np.int8)
    questions[BUNDLE_ID] = questions[BUNDLE_ID].astype(np.int32)

    # One column per space-separated tag; exactly six columns are expected.
    tag_cols = ['tags1', 'tags2', 'tags3', 'tags4', 'tags5', 'tags6']
    split_tags = questions[TAGS].str.split(" ", n = 10, expand = True)
    split_tags.columns = tag_cols
    questions = pd.concat([questions, split_tags], axis=1)
    for tag_col in tag_cols[:3]:
        # float32 first so missing tags become NA in the nullable Int16 column
        questions[tag_col] = questions[tag_col].astype(np.float32).astype('Int16')

    # Section: parts 1-4 -> 1, parts 5-7 -> 2, anything else -> 3.
    part = questions[PART]
    questions[SECTION] = 3
    questions.loc[part.between(5, 7), SECTION] = 2
    questions.loc[part.between(1, 4), SECTION] = 1
    questions[SECTION] = questions[SECTION].astype(np.int8)
    questions = questions.rename(columns={"question_id": CONTENT_ID})

    # Integer id per distinct tag string, e.g. '51 131 162 38': 0, '131 36 81': 1, ...
    combo_ids = {combo: idx for idx, combo in enumerate(questions[TAGS].unique())}
    combo_codes = questions[TAGS].apply(lambda key: combo_ids[key])
    questions[TAGS_COMBO] = combo_codes
    questions[TAGS_COMBO] = pd.to_numeric(questions[TAGS_COMBO], downcast='integer')

    for tag_col in [c for c in tag_cols if c in questions.columns]:
        questions[tag_col] = questions[tag_col].astype('category').cat.codes

    unused = [TAGS, CORRECT_ANSWER, TAGS_COMBO, BUNDLE_ID, "tags2", "tags3", "tags4", "tags5", "tags6", SECTION]
    return questions.drop(columns=unused)

questions_df = load_questions()
print(questions_df[CONTENT_ID].min(), questions_df[CONTENT_ID].max(), questions_df[CONTENT_ID].nunique())
questions_df.head()

In [None]:
typeof_dict = {
    'concept': 0,
    'solving question': 1,
    'intention': 2,
    'starter': 3,
}

def load_lectures():
    """Read the lectures parquet file and return its content id and part columns.

    The lecture id column is renamed to the shared content-id name; the tag and
    type columns are dropped (the type is mapped through ``typeof_dict`` first,
    then discarded).
    """
    # Content-id in lectures DOES NOT match to contentid in questions
    lectures = pd.read_parquet(TRAIN_FILE_LECTURES)
    lectures[PART] = lectures[PART].astype(np.int8)
    lectures[TYPE_OF] = lectures[TYPE_OF].map(typeof_dict)
    lectures = lectures.rename(columns={"lecture_id": CONTENT_ID})
    return lectures.drop(columns=[TAG, TYPE_OF])

train_lectures_pd = load_lectures()
print(train_lectures_pd.shape, train_lectures_pd[CONTENT_ID].nunique())
train_lectures_pd.head()

In [None]:
if LECTURES_ENABLED is True and DUMP is True:
    lect_dict = {}
    print(train_lectures_pd[CONTENT_ID].min(), train_lectures_pd[CONTENT_ID].max(), train_lectures_pd[CONTENT_ID].nunique())
    for i, lecture_id in enumerate(sorted(train_lectures_pd[CONTENT_ID].unique())):
        lect_dict[lecture_id] = i
    print("lect_dict", len(lect_dict))
    save_dict(lect_dict, MODEL_PATH + "/lect_dict.pkl")

In [None]:
test_cols = [ROW_ID, TIMESTAMP, USER_ID, CONTENT_ID, CONTENT_TYPE_ID, PRIOR_QUESTION_ELAPSED_TIME, PRIOR_QUESTION_HAD_EXPLANATION] # 'answered_correctly_avg_c' TASK_CONTAINER_ID
ext_cols = []
train_pd = train_pd[test_cols + ext_cols + [TARGET]] if train_pd is not None else None

if LECTURES_ENABLED is True and DUMP is True:
    # Merge lectures parts
    train_lec_part = pd.merge(train_pd.loc[train_pd.content_type_id == True].reset_index(drop=True), train_lectures_pd, on=[CONTENT_ID], how="left")
    # Update target and content_id for lectures
    train_lec_part[TARGET] = np.int8(2)
    train_lec_part["offset_id"] = train_lec_part[CONTENT_ID].map(lect_dict).astype(np.int32) + np.int32(13523)
    # Merge question parts
    train_que_part = pd.merge(train_pd.loc[train_pd.content_type_id == False].reset_index(drop=True), questions_df[[CONTENT_ID, PART]], on=[CONTENT_ID], how="left")
    train_que_part["offset_id"] = train_que_part[CONTENT_ID]
    train_pd = pd.concat([train_que_part, train_lec_part], axis=0).sort_values([USER_ID, TIMESTAMP]).reset_index(drop=True)
    train_pd[CONTENT_ID] = train_pd["offset_id"]
    train_pd.drop(columns=["offset_id"], inplace=True)
    print(train_pd[CONTENT_ID].min(), train_pd[CONTENT_ID].max(), train_pd[CONTENT_ID].nunique())
    del train_que_part, train_lec_part
gc.collect()
#train_pd.head()

In [None]:
def cleanup(df, elapsed_time_seconds_mean=None, lag_minutes_mean=None, quantile_transformer=None, lag_bins=None, elapsed_time_bins=None, standard_scaler=None):
    """Turn raw interaction rows into the encoded feature columns used by the model.

    Steps (each one only runs when the matching feature is listed in FEATURES):
      * drop lecture rows unless LECTURES_ENABLED;
      * merge the question metadata (``questions_df``) when PART is missing;
      * encode "had explanation" as int8, missing -> 0;
      * convert elapsed time to rounded seconds and quantile-bin it;
      * compute the per-user lag to the previous row (seconds, clipped at 0) and
        either quantile-bin it or standard-scale it (``conf.lag_time_cat``);
      * add the per-user chronological RANK and make PART zero-based.

    Pass the fitted objects as None to fit them on ``df`` (training data) and
    pass the returned ones back in to apply the same encoding to another frame.
    Missing or out-of-range values receive the extra id ``len(bins) - 1``.

    Args:
        df: interactions frame; may be modified in place when no filtering or
            merge creates a new frame first.
        elapsed_time_seconds_mean, lag_minutes_mean, quantile_transformer:
            not used here; returned unchanged.
        lag_bins: bin edges for the lag, or None to fit them with ``pd.qcut``.
        elapsed_time_bins: bin edges for elapsed time, or None to fit them.
        standard_scaler: fitted scaler for the non-categorical lag, or None to fit one.

    Returns:
        Tuple ``(df, elapsed_time_seconds_mean, lag_minutes_mean,
        quantile_transformer, lag_bins, elapsed_time_bins, standard_scaler)``.
    """
    # Keep questions only?
    if LECTURES_ENABLED is False:
        df = df[df[CONTENT_TYPE_ID] == False].reset_index(drop=True)

    if PART in FEATURES and questions_df is not None and PART not in df.columns:
        df = pd.merge(df, questions_df, on=[CONTENT_ID], how="left")

    # Explanation
    if PRIOR_QUESTION_HAD_EXPLANATION in FEATURES:
        # Assign the filled column back: an in-place fillna on `df[col]` acts on a
        # temporary under pandas Copy-on-Write and would leave the NaNs in place.
        df[PRIOR_QUESTION_HAD_EXPLANATION] = df[PRIOR_QUESTION_HAD_EXPLANATION].fillna(False)
        df[PRIOR_QUESTION_HAD_EXPLANATION] = df[PRIOR_QUESTION_HAD_EXPLANATION].astype(np.int8)

    if PRIOR_QUESTION_ELAPSED_TIME in FEATURES:
        # Convert elapsed time to seconds
        df[PRIOR_QUESTION_ELAPSED_TIME] = df[PRIOR_QUESTION_ELAPSED_TIME]/1000.0 # In seconds

        df[PRIOR_QUESTION_ELAPSED_TIME] = np.round(df[PRIOR_QUESTION_ELAPSED_TIME])

        if elapsed_time_bins is None:
            # Fit quantile bins on this frame; duplicate edges are merged.
            df[PRIOR_QUESTION_ELAPSED_TIME], elapsed_time_bins = pd.qcut(df[PRIOR_QUESTION_ELAPSED_TIME], conf.elapsed_time_max_size, duplicates="drop", retbins=True)
            df[PRIOR_QUESTION_ELAPSED_TIME] = df[PRIOR_QUESTION_ELAPSED_TIME].cat.codes # codes
            print("Max ELAPSED_TIME code:", df[PRIOR_QUESTION_ELAPSED_TIME].max(), "Min ELAPSED_TIME code:", df[PRIOR_QUESTION_ELAPSED_TIME].min(), "Total ELAPSED_TIME:", df[PRIOR_QUESTION_ELAPSED_TIME].nunique(), "Total elapsed_time_bins:", len(elapsed_time_bins))
            df[PRIOR_QUESTION_ELAPSED_TIME] = (df[PRIOR_QUESTION_ELAPSED_TIME].replace(-1, len(elapsed_time_bins)-1)).astype(np.int32) # Replace -1 = NaN id (72)
        else:
            df[PRIOR_QUESTION_ELAPSED_TIME] = pd.cut(df[PRIOR_QUESTION_ELAPSED_TIME], bins=elapsed_time_bins, labels=False, include_lowest=True) # apply bins and category code
            df[PRIOR_QUESTION_ELAPSED_TIME] = (df[PRIOR_QUESTION_ELAPSED_TIME].fillna(len(elapsed_time_bins)-1)).astype(np.int32) # Replace NaN id (72)

    # Compute lag between 2 exercices
    if LAG in FEATURES:
        df[LAG] = df.groupby(USER_ID)[TIMESTAMP].shift()
        df[LAG] = np.clip(((df[TIMESTAMP] - df[LAG])/(1000.0)), 0, float('inf'))

        if conf.lag_time_cat is True:
            df[LAG] = np.round(df[LAG])
            if lag_bins is None:
                df[LAG], lag_bins = pd.qcut(df[LAG], conf.lag_time_max_size, duplicates="drop", retbins=True) # duplicated -> reduce about 1/2 of N_ltg
                df[LAG] = df[LAG].cat.codes # codes
                print("Max LAG code:", df[LAG].max(), "Min LAG code:", df[LAG].min(), "Total LAG:", df[LAG].nunique(), "Total lag_bins:", len(lag_bins))
                df[LAG] = (df[LAG].replace(-1, len(lag_bins)-1)).astype(np.int32) # Replace -1 = NaN id (365)
            else:
                df[LAG] = pd.cut(df[LAG], bins=lag_bins, labels=False, include_lowest=True) # apply bins and category code
                df[LAG] = (df[LAG].fillna(len(lag_bins)-1)).astype(np.int32) # Replace NaN id (365)
        else:
            print("Standard scaler") # Does not help
            if standard_scaler is None:
                standard_scaler = StandardScaler()
                standard_scaler.fit(df[LAG].values.reshape(-1, 1))
            df[LAG] = standard_scaler.transform(df[LAG].values.reshape(-1, 1))
            # Same Copy-on-Write concern as above: assign instead of filling in place.
            df[LAG] = df[LAG].fillna(0.0)
            df[LAG] = df[LAG].astype("float32")

    # Add position
    df[RANK] = df.groupby([USER_ID])[TIMESTAMP].rank(method="first", ascending=True).astype(np.int32)

    # Need to start with zero index
    if PART in FEATURES:
        df[PART] = df[PART] - 1

    return df, elapsed_time_seconds_mean, lag_minutes_mean, quantile_transformer, lag_bins, elapsed_time_bins, standard_scaler

# First run (DUMP=True): fit the encoders on the training fold and persist them
# next to the model so the same bins/scaler can be applied to other data.
if DUMP is True:
    # All fitted inputs are None, so cleanup() fits the bins/scaler on train_pd.
    train_pd, elapsed_time_seconds_mean_, lag_minutes_mean_, quantile_transformer_, lag_bins_, elapsed_time_bins_, standard_scaler_= cleanup(train_pd, elapsed_time_seconds_mean=None, lag_minutes_mean=None, quantile_transformer=None, lag_bins=None, elapsed_time_bins=None, standard_scaler=None)
    if quantile_transformer_ is not None:
        save_dict(quantile_transformer_, MODEL_PATH + "/quantiles.pkl")
    if lag_bins_ is not None:
        print("lag_bins:", len(lag_bins_))
        # The configured vocabulary size must match the number of fitted bin edges.
        assert(conf.lag_time_size == len(lag_bins_))
        save_dict(lag_bins_, MODEL_PATH + "/lag_bins.pkl")   
    if elapsed_time_bins_ is not None:
        print("elapsed_time_bins:", len(elapsed_time_bins_))
        # Same consistency check for the elapsed-time vocabulary.
        assert(conf.elapsed_time_size == len(elapsed_time_bins_))
        save_dict(elapsed_time_bins_, MODEL_PATH + "/elapsed_time_bins.pkl")  
    if standard_scaler_ is not None:
        print("standard_scaler:", standard_scaler_)
        save_dict(standard_scaler_, MODEL_PATH + "/standard_scaler.pkl")          
    print(train_pd.shape)
else:
    # Later runs: hard-coded statistics; the bins/scaler names are NOT defined on
    # this path, so cells that need them must also be guarded by DUMP.
    elapsed_time_seconds_mean_ = 25.439438358434938
    lag_minutes_mean_ = 327.18331043496374 if LECTURES_ENABLED is True else 333.3975388518916 # (questions only)

In [None]:
valid_pd = pd.read_pickle(DATA_HOME + "cv%d_valid.pickle" % FOLD)
valid_pd = valid_pd[test_cols + ext_cols + [TARGET]]
if LECTURES_ENABLED is True and DUMP is True:
    valid_lec_part = pd.merge(valid_pd.loc[valid_pd.content_type_id == True].reset_index(drop=True), train_lectures_pd, on=[CONTENT_ID], how="left")
    valid_lec_part[TARGET] = np.int8(2)
    valid_lec_part["offset_id"] = valid_lec_part[CONTENT_ID].map(lect_dict).astype(np.int32) + np.int32(13523)
    valid_que_part = pd.merge(valid_pd.loc[valid_pd.content_type_id == False].reset_index(drop=True), questions_df[[CONTENT_ID, PART]], on=[CONTENT_ID], how="left")
    valid_que_part["offset_id"] = valid_que_part[CONTENT_ID]
    valid_pd = pd.concat([valid_que_part, valid_lec_part], axis=0).sort_values([USER_ID, TIMESTAMP]).reset_index(drop=True)
    valid_pd[CONTENT_ID] = valid_pd["offset_id"]
    valid_pd.drop(columns=["offset_id"], inplace=True)
    print(valid_pd[CONTENT_ID].min(), valid_pd[CONTENT_ID].max(), valid_pd[CONTENT_ID].nunique())
    del valid_que_part, valid_lec_part
_ = gc.collect()

In [None]:
if DUMP is True:
    valid_pd, _, _, _, _, _, _ = cleanup(valid_pd, elapsed_time_seconds_mean=elapsed_time_seconds_mean_, lag_minutes_mean=lag_minutes_mean_, quantile_transformer = quantile_transformer_, lag_bins=lag_bins_, elapsed_time_bins=elapsed_time_bins_, standard_scaler=standard_scaler_)
    print(valid_pd.shape)

In [None]:
if DUMP is True:
    emb_cols = FEATURES + [RANK, USER_ID] + META
    for col in emb_cols:
        print(col, "train uniques:", train_pd[col].nunique() if col in train_pd.columns else None) 
        print(col, "valid uniques:", valid_pd[col].nunique() if col in valid_pd.columns else None) 

In [None]:
if not os.path.exists(TRAIN_FEATURES_PATH + "train_cleaned.pkl"):
    train_count_pd = train_pd.groupby([USER_ID])[ROW_ID].count().reset_index()
    train_count_pd.columns = [USER_ID, "ft_row_id_agg_nunique_per_user_id"]
    train_pd = pd.merge(train_pd, train_count_pd, on=[USER_ID], how="left")
    valid_count_pd = valid_pd.groupby([USER_ID])[ROW_ID].count().reset_index()
    valid_count_pd.columns = [USER_ID, "ft_row_id_agg_nunique_per_user_id"]
    valid_pd = pd.merge(valid_pd, valid_count_pd, on=[USER_ID], how="left")
    del train_count_pd, valid_count_pd
    gc.collect()

In [None]:
# Filter sequences (to control padding rate)
emb_cols = FEATURES + [RANK, USER_ID] + META

if not os.path.exists(TRAIN_FEATURES_PATH + "train_cleaned.pkl"):
    train_pd = train_pd[train_pd["ft_row_id_agg_nunique_per_user_id"] >= MIN_INTERACTIONS_TRAIN]
    valid_pd = valid_pd[valid_pd["ft_row_id_agg_nunique_per_user_id"] >= MIN_INTERACTIONS_VALID]
    print(train_pd.shape)
    train_pd = train_pd[train_pd[RANK] <= MAX_INTERACTIONS_TRAIN]
    valid_pd = valid_pd[valid_pd[RANK] <= MAX_INTERACTIONS_VALID]
    print(train_pd.shape)
    for col in emb_cols:
        print(col, "train uniques:", train_pd[col].nunique() if col in train_pd.columns else None) 
        print(col, "valid uniques:", valid_pd[col].nunique() if col in valid_pd.columns else None) 

In [None]:
# Convert to TimeSeries dataset
def to_series(df, pivot=False):
    """Reshape the cleaned interactions frame into per-user feature sequences.

    Args:
        df: cleaned interactions frame with USER_ID, TIMESTAMP (and RANK when
            ``pivot`` is True) plus the FEATURES / META columns. It is not
            modified.
        pivot: True builds one wide block per feature (users as rows, rank as
            columns, missing positions filled with -1) and stacks the blocks;
            False sorts rows by user and timestamp and indexes them by user.

    Returns:
        When GROUPED is enabled, a Series indexed by user whose values are
        lists of arrays, one array per column of ``FEATURES + META`` in that
        order; otherwise the reshaped DataFrame.
    """
    if pivot is True:
        series = []
        for col in FEATURES:
            a = df[[USER_ID, RANK, col]]
            a = a.set_index([USER_ID, RANK])
            a = a.unstack(fill_value=-1).reset_index()
            a["name"] = col
            a = a.set_index(["name", USER_ID])
            a.columns = [i for i in range(a.shape[1])]
            series.append(a)
        df = pd.concat(series, axis=0)
    else:
        # Chain non-mutating calls so the caller's frame keeps its index; the
        # original in-place set_index changed the argument as a side effect.
        df = (
            df.set_index([USER_ID, TIMESTAMP])
            .sort_index()
            .reset_index()
            .drop(columns=[TIMESTAMP])
            .set_index(USER_ID)
        )
        df = df[FEATURES + META]
    if GROUPED == True:
        # One list of per-feature arrays per user, in FEATURES + META order.
        df = df[FEATURES + META].groupby(USER_ID).apply(lambda r: [r[c].values for c in FEATURES + META])
    return df

In [None]:
# Reuse the cached per-user training sequences when present; otherwise build
# them from the cleaned frame and cache them for the next run.
if os.path.exists(TRAIN_FEATURES_PATH + "train_cleaned.pkl"):
    del train_pd
    train_pd = pd.read_pickle(TRAIN_FEATURES_PATH + "train_cleaned.pkl")
    print("Train reloaded")
else:
    train_pd = to_series(train_pd, pivot=PIVOT_TS)
    # The feature directory is not created anywhere else; to_pickle would fail without it.
    os.makedirs(TRAIN_FEATURES_PATH, exist_ok=True)
    train_pd.to_pickle(TRAIN_FEATURES_PATH + "train_cleaned.pkl")

In [None]:
# Reuse the cached per-user validation sequences when present; otherwise build
# them from the cleaned frame and cache them for the next run.
if os.path.exists(TRAIN_FEATURES_PATH + "valid_cleaned.pkl"):
    del valid_pd
    valid_pd = pd.read_pickle(TRAIN_FEATURES_PATH + "valid_cleaned.pkl")
    print("Valid reloaded")
else:
    valid_pd = to_series(valid_pd, pivot=PIVOT_TS)
    # The feature directory is not created anywhere else; to_pickle would fail without it.
    os.makedirs(TRAIN_FEATURES_PATH, exist_ok=True)
    valid_pd.to_pickle(TRAIN_FEATURES_PATH + "valid_cleaned.pkl")
print(valid_pd.shape)
valid_pd.head(10)

In [None]:
# Dataset/DataLoader:
class RIIIDDataset(Dataset):
    """Per-user interaction sequences served as samples of length ``conf.seq_len``.

    Each stored row is a list of arrays, one per column of ``FEATURES + META``
    (in that order). ``get_sample`` pads short rows and crops long ones, and
    returns a dict with one tensor per feature plus ``"labels"`` (the TARGET
    column as a long tensor) and ``"mask"`` (True on padded positions).
    """

    def __init__(self, df, conf, factory, subset="train", categoricals=None, transform=None, augmentations=None, weights=False, flatten=False, min_len=3, force_len=None, verbose=False):
        """Index the sequences and choose the cropping policy for ``subset``.

        Args:
            df: per-user data as produced by ``to_series`` (a Series of feature
                lists when GROUPED is enabled).
            conf: configuration object (``seq_len``, ``pad_mode``, vocabulary sizes...).
            factory: optional object whose ``cleanup()`` is called by ``self.cleanup()``.
            subset: "train" (random crop offset) or "valid" / "ho" / "test" (offset 0).
            categoricals: stored and forwarded to ``get_sample``.
            transform: optional callable applied to every feature tensor.
            augmentations: stored only.
            weights: when True (and subset is not "test") compute ``self.uweights``.
            flatten: when True (and GROUPED) split every user into chunks of
                ``split_len`` interactions, each chunk becoming its own sample.
            min_len: users / trailing chunks shorter than this are skipped when flattening.
            force_len: chunk length override; defaults to ``conf.seq_len``.
            verbose: stored only.

        Raises:
            RuntimeError: if ``subset`` is not one of the known names.
        """
        super().__init__()
        self.history_users = None
        self.categoricals = categoricals
        self.subset = subset
        self.transform = transform
        self.augmentations = augmentations
        self.conf = conf
        self.factory = factory
        self.verbose = verbose

        if flatten is True and GROUPED is True:
            self.dex = []
            self.df = {}
            split_len = self.conf.seq_len if force_len is None else force_len
            total = 0
            total_short = 0
            count_short = 0
            for i, user_id in enumerate(df.index):
                feats = df[user_id]
                if(i % 10000 == 0):
                    print('Processed %d users, features=%d' % (i, len(feats)))
                if len(feats[0]) >= min_len:
                    if len(feats[0]) > split_len:
                        total_questions = len(feats[0])
                        last_pos = total_questions // split_len
                        # Full-length chunks, keyed "<user>_<chunk number>".
                        for seq in range(last_pos):
                            index = f"{user_id}_{seq}"
                            self.dex.append(index)
                            start = seq * split_len
                            end = (seq + 1) * split_len
                            self.df[index] = [feats[f][start:end] for f, _ in enumerate(FEATURES + META)]
                            total = total + split_len
                        # Remainder after the last full chunk, kept only if long enough.
                        if len(feats[0][end:]) >= min_len:
                            index = f"{user_id}_{last_pos + 1}"
                            self.dex.append(index)
                            self.df[index] = [feats[f][end:] for f, _ in enumerate(FEATURES + META)]
                            total = total + len(feats[0][end:])
                            total_short = total_short + len(feats[0][end:])
                            count_short = count_short + 1
                    else:
                        index = f'{user_id}'
                        self.dex.append(index)
                        self.df[index] = [feats[f][:] for f, _ in enumerate(FEATURES + META)]
                        total = total + len(feats[0])
                        total_short = total_short + len(feats[0])
                        count_short = count_short + 1

            # No short sequence at all (or no data) must not crash the summary line.
            ratio_short = total_short/count_short if count_short else 0.0
            print("Total users_seq:", len(self.dex), "total:", total, "total_short:", total_short, "ratio_short:", ratio_short, "split len:", split_len)
        else:
            self.df = df
            self.dex = self.df if subset == "test" else self.df.reset_index()[USER_ID].unique()


        def zero_offset(max_offset, user_id):
            return 0

        def random_offset(max_offset, user_id):
            return np.random.randint(max_offset)

        if subset == 'train':
            self.get_offset = random_offset
        elif subset == 'valid':
            self.get_offset = zero_offset # lambda x: 0
        elif subset == 'ho':
            self.get_offset = zero_offset # lambda x: 0
        elif subset == 'test':
            self.get_offset = zero_offset # lambda x: 0
        else:
            raise RuntimeError("Unknown subset")

        self.uweights = self.compute_weights(self.df) if subset != "test" and weights is True else None

    def compute_weights(self, df_):
        """Return a frame indexed by sequence id with ``prob`` = length / total length.

        Sequences longer than MAX_INTERACTIONS_CAP are excluded before normalising.
        """
        if isinstance(df_, dict):
            ret = []
            for seq_id_, v in df_.items():
                ret.append((seq_id_, len(v[0])))
            df_dist = pd.DataFrame(ret)
        else:
            if GROUPED is False:
                df_dist = df_.value_counts(subset=[USER_ID]).reset_index()
            else:
                ret = []
                for seq_id_, v in zip(df_.index, df_):
                    ret.append((seq_id_, len(v[0])))
                df_dist = pd.DataFrame(ret)
        df_dist.columns = [USER_ID, "prob"]
        df_dist = df_dist[df_dist["prob"] <= MAX_INTERACTIONS_CAP].reset_index(drop=True)
        df_dist["prob"] = df_dist["prob"]/df_dist["prob"].sum()
        df_dist = df_dist.set_index(USER_ID).sort_index()
        print("compute_weights")
        display(df_dist.head())
        return df_dist

    def __len__(self):
        """Number of indexed sequences."""
        return len(self.dex)

    def cleanup(self):
        """Forward to ``factory.cleanup()`` when a factory was provided."""
        if self.factory is not None:
            self.factory.cleanup()

    def _row_for(self, key):
        """Fetch the stored feature list for ``key`` from either storage layout."""
        # Flattened datasets keep rows in a plain dict, the others in a pandas object.
        if isinstance(self.df, dict):
            return self.df[key]
        return self.df.loc[key]

    def get_sample(self, row, categoricals, user_id):
        """Build one sample of length ``conf.seq_len`` from a user's feature list.

        Rows shorter than ``seq_len`` are padded (right or left depending on
        ``conf.pad_right``) according to ``conf.pad_mode``; longer rows are
        cropped at the offset chosen by ``self.get_offset``.

        Args:
            row: list of arrays, one per column of ``FEATURES + META``.
            categoricals: unused here.
            user_id: key of the sequence, forwarded to ``self.get_offset``.

        Returns:
            Dict mapping each feature name to a tensor, plus ``"labels"`` and
            ``"mask"`` (boolean array, True on padded positions and, when
            lectures are enabled, on lecture rows).
        """
        mask = np.zeros((self.conf.seq_len), dtype=bool) # All False

        if GROUPED == True:
            series_len = len(row[0]) # Questions for a given user
            start_index = 0
            stop_index = series_len
            pad_row, pad_size = None, None

            # build the (features, seq_len) slice
            if series_len < self.conf.seq_len:
                # Not enough data: pad up to seq_len
                pad_size = self.conf.seq_len-series_len
                if self.conf.pad_mode == "random":
                    # Pick another user in existing dataset
                    iidx = self.get_offset(len(self.dex), user_id)
                    random_user_id_row = self._row_for(self.dex[iidx])
                    while (len(random_user_id_row[0]) < pad_size):
                        iidx = self.get_offset(len(self.dex), user_id) if self.subset == "train" else iidx + 1 # No random for valid, just the next one
                        random_user_id_row = self._row_for(self.dex[iidx])
                    pad_row = random_user_id_row
                elif self.conf.pad_mode == "token":
                    # Dedicated pad id per feature, derived from the configured vocabulary sizes.
                    pad_row = np.zeros((len(row), pad_size), dtype=np.int16)
                    for i, f in enumerate(FEATURES + META):
                        if f == CONTENT_ID:
                            pad_row[i, :] = self.conf.exercices_id_size
                        elif f == PART:
                            pad_row[i, :] = self.conf.exercices_part_size
                        elif f == TARGET:
                            pad_row[i, :] = self.conf.response_size + 1  # + 1 as we've start_token already
                        elif f == LAG:
                            pad_row[i, :] = self.conf.lag_time_size + 1  # + 1 as we've start_token already
                        elif f == PRIOR_QUESTION_ELAPSED_TIME:
                            pad_row[i, :] = self.conf.elapsed_time_size + 1 # + 1 as we've start_token already
                        elif f == PRIOR_QUESTION_HAD_EXPLANATION:
                            pad_row[i, :] = self.conf.explanation_size + 1  # + 1 as we've start_token already
                        elif f == ROW_ID:
                            pad_row[i, :] = -1
                else:
                    # Zeros
                    pad_row = np.zeros((len(row), pad_size), dtype=np.int16)
                # Mask is True exactly on the padded side.
                real_part = np.zeros((self.conf.seq_len-pad_size), dtype=bool)
                pad_part = np.ones((pad_size), dtype=bool)
                if self.conf.pad_right is True:
                    mask = np.concatenate([real_part, pad_part])
                else:
                    mask = np.concatenate([pad_part, real_part])
            elif series_len > self.conf.seq_len:
                # Pick a random value and get seq_len
                start_index = self.get_offset(series_len - self.conf.seq_len, user_id)
                stop_index = start_index + self.conf.seq_len

            sample = {}
            for f_idx, feature in enumerate(FEATURES + META, 0):
                # Pick column
                series_data = row[f_idx][start_index:stop_index]
                if pad_row is not None:
                    # Right/Left pad
                    padding = pad_row[f_idx][0:pad_size]
                    if self.conf.pad_right is True:
                        series_data = np.concatenate([series_data, padding])
                    else:
                        series_data = np.concatenate([padding, series_data])
                labels = None
                if feature == TARGET:
                    labels = torch.from_numpy(series_data).long()
                item = torch.from_numpy(series_data)
                if self.transform is not None:
                    item = self.transform(item)
                sample[feature] = item
                if labels is not None:
                    sample["labels"] = labels
            if LECTURES_ENABLED is True:
                # Lecture ids are offset past the question ids; they are never scored.
                mask[sample[CONTENT_ID] >= 13523] = True
            sample["mask"] = mask

        return sample

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` (see ``get_sample``)."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if self.subset == "test":
            # For test, we need one question only to answer (with related user)
            row = self.df.iloc[idx:idx+1,:]
            # No sequence key in this mode; the zero offset used for "test" ignores it.
            user_id = None
        else:
            # For train/valid we need series per user
            user_id = self.dex[idx]
            row = self._row_for(user_id)

        sample =  self.get_sample(row, self.categoricals, user_id)
        return sample

In [None]:
class NoamOpt:
    """Optimizer wrapper that sets the learning rate from a warm-up schedule.

    On every ``step()`` the learning rate of all parameter groups of the
    wrapped optimizer is overwritten with::

        factor * model_size**-0.5 * min(step**-0.5 / warmup_scale,
                                        step * warmup**-1.5)

    Args:
        model_size: Model dimension used in the ``model_size**-0.5`` term.
        factor: Global multiplier applied to the schedule.
        warmup: Warm-up length used in the ``step * warmup**-1.5`` term.
        optimizer: Wrapped optimizer; must expose ``param_groups``, ``step()``
            and ``zero_grad()``.
        warmup_scale: Divisor applied to the ``step**-0.5`` term.
    """

    def __init__(self, model_size, factor, warmup, optimizer, warmup_scale=1.0):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.warmup_scale = warmup_scale
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def zero_grad(self):
        """Clear the gradients held by the wrapped optimizer."""
        self.optimizer.zero_grad()

    def step(self):
        """Advance the step counter, refresh the learning rate and update weights."""
        self._step += 1
        rate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        """Return the scheduled learning rate for ``step`` (default: current step).

        A non-positive step (the state before the first ``step()`` call) yields
        0.0, which is the limit of the schedule; previously ``0 ** -0.5`` raised
        ``ZeroDivisionError`` here.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        decay = step ** (-0.5) / self.warmup_scale
        ramp = step * self.warmup ** (-1.5)
        return self.factor * (self.model_size ** (-0.5) * min(decay, ramp))

class NoamOptimizer:
    """Adam optimizer driven by a ``NoamOpt`` schedule (``factor`` fixed to 1).

    Args:
        model: Module whose ``parameters()`` are optimized.
        lr: Initial Adam learning rate (``NoamOpt`` overwrites it on each step).
        model_size: Forwarded to ``NoamOpt``.
        warmup: Forwarded to ``NoamOpt``.
        warmup_scale: Forwarded to ``NoamOpt``.
    """

    def __init__(self, model, lr, model_size, warmup, warmup_scale):
        adam = torch.optim.Adam(model.parameters(), lr=lr)
        schedule = NoamOpt(
            model_size=model_size,
            factor=1,
            warmup=warmup,
            optimizer=adam,
            warmup_scale=warmup_scale,
        )
        self._adam = adam
        self._opt = schedule

    def zero_grad(self):
        """Reset gradients through the schedule wrapper."""
        self._opt.zero_grad()

    def step(self):
        """Refresh the learning rate and apply one optimizer update."""
        self._opt.step()

    def get_last_lr(self):
        """Return the learning rate applied by the most recent ``step()``."""
        return self._opt._rate

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding followed by dropout.

    The table is stored as a buffer of shape ``(max_len, 1, d_model)`` and is
    indexed by the FIRST dimension of the input, so ``forward`` expects a
    sequence-first tensor ``(seq_len, batch, d_model)``.

    Args:
        d_model: Feature dimension of the inputs (odd values are supported).
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions pre-computed in the table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model) # torch.Size([max_len, d_model])
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) # torch.Size([max_len, 1]) # 0,1,2,3,4,...max_len-1
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # torch.Size([ceil(d_model/2)])
        pe[:, 0::2] = torch.sin(position * div_term)
        # Odd d_model has one cosine column fewer than sine columns; trimming
        # div_term avoids the shape mismatch that used to raise here.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(1) # torch.Size([max_len, 1, d_model])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding of positions ``0..x.size(0)-1`` to ``x`` and apply dropout."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
class CustomEncoder(nn.Module):
    """Stack of standard ``nn.TransformerEncoderLayer`` blocks without a final norm.

    Args:
        d_model: Feature dimension of the inputs.
        nhead: Number of attention heads per layer.
        num_encoder_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden size of each layer's feed-forward network.
        dropout: Dropout probability inside each layer.
        activation: Feed-forward activation passed to each layer.
        verbose: Accepted for signature compatibility; not used.
    """

    def __init__(self, d_model=512, nhead=6, num_encoder_layers=6, dim_feedforward=2048, dropout=0.1, activation ="relu", verbose=False):
        super().__init__()

        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
        )
        # No trailing LayerNorm on the stack.
        self.encoder = nn.TransformerEncoder(layer, num_layers=num_encoder_layers, norm=None)

    def forward(self, src, mask=None, src_key_padding_mask=None):
        """Encode ``src`` with the stacked layers and return the memory tensor."""
        return self.encoder(src, mask=mask, src_key_padding_mask=src_key_padding_mask)

class CustomDecoder(nn.Module):
    """Stack of standard ``nn.TransformerDecoderLayer`` blocks without a final norm.

    Args:
        d_model: Feature dimension of the inputs.
        nhead: Number of attention heads per layer.
        num_decoder_layers: Number of stacked decoder layers.
        dim_feedforward: Hidden size of each layer's feed-forward network.
        dropout: Dropout probability inside each layer.
        activation: Feed-forward activation passed to each layer.
        verbose: Accepted for signature compatibility; not used.
    """

    def __init__(self, d_model=512, nhead=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, activation ="relu", verbose=False):
        super().__init__()

        layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
        )
        # No trailing LayerNorm on the stack.
        self.decoder = nn.TransformerDecoder(layer, num_layers=num_decoder_layers, norm=None)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Decode ``tgt`` against ``memory`` with the stacked layers."""
        return self.decoder(
            tgt,
            memory,
            tgt_mask=tgt_mask,
            memory_mask=memory_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )

In [None]:
# Model
class RIIIDModel(nn.Module):
    """Encoder/decoder Transformer over exercise and response sequences.

    The encoder input is the sum of the content-id embedding and (optionally)
    the part embedding. The decoder input is the sum of the embeddings of the
    right-shifted response, lag time, elapsed time and explanation features,
    each starting with a dedicated start token. The Transformer output is
    projected by a linear layer to ``cfg.num_classes`` values per position.

    Args:
        cfg: Configuration object; the attributes read are the ``*_size``
            vocabulary sizes, ``seq_len``, ``embedding_dim``, ``elapsed_time_cat``,
            ``lag_time_cat``, ``num_classes``, ``pad_mode``,
            ``position_encoding_enabled``, ``dropout``, ``nhead``,
            ``num_encoder_layers``, ``num_decoder_layers``, ``dim_feedforward``,
            ``custom_encoder`` and ``custom_decoder``.
        verbose: Stored on the instance; not used by the model itself.
    """

    def __init__(self, cfg, verbose=False):
        super().__init__()
        self.response_size = cfg.response_size
        self.lag_time_size = cfg.lag_time_size
        self.elapsed_time_size = cfg.elapsed_time_size
        self.explanation_size = cfg.explanation_size
        self.seq_len = cfg.seq_len
        self.embedding_dim = cfg.embedding_dim
        self.elapsed_time_cat = cfg.elapsed_time_cat
        self.lag_time_cat = cfg.lag_time_cat
        self.num_classes = cfg.num_classes
        self.verbose = verbose
        self.pad_mode = cfg.pad_mode

        self.pos_encoder1 = None
        self.pos_encoder2 = None
        self.position_size_embedding = None

        # With pad_mode == "token" every embedding table gets one extra index.
        additional_token_dim = 1 if self.pad_mode == "token" else 0

        # Exercices embeddings
        self.exercices_id_embedding = nn.Embedding(cfg.exercices_id_size + additional_token_dim, self.embedding_dim)
        self.exercices_part_embedding = nn.Embedding(cfg.exercices_part_size + additional_token_dim, self.embedding_dim) if cfg.exercices_part_size is not None else None

        # Response embeddings
        self.response_embedding = nn.Embedding(cfg.response_size + 1 + additional_token_dim, self.embedding_dim) # +1 to include start token

        if self.elapsed_time_cat is True:
            self.elapsed_time_embedding = nn.Embedding(cfg.elapsed_time_size + 1 + additional_token_dim, self.embedding_dim) if cfg.elapsed_time_size is not None else None # +1 to include start token
        else:
            self.elapsed_time_embedding = nn.Linear(1, self.embedding_dim, bias=False) if cfg.elapsed_time_size is not None else None # Continuous embedding

        if self.lag_time_cat is True:
            self.lag_time_embedding = nn.Embedding(cfg.lag_time_size + 1 + additional_token_dim, self.embedding_dim) if cfg.lag_time_size is not None else None # +1 to include start token
        else:
            self.lag_time_embedding = nn.Linear(1, self.embedding_dim, bias=False) if cfg.lag_time_size is not None else None # Continuous embedding

        self.explanation_embedding = nn.Embedding(cfg.explanation_size + 1 + additional_token_dim, self.embedding_dim) if cfg.explanation_size is not None else None # +1 to include start token

        input_features_dim = self.embedding_dim

        # Position encoder (shared by the encoder and decoder inputs)
        if cfg.position_encoding_enabled is True:
            self.pos_encoder1 = PositionalEncoding(input_features_dim, cfg.dropout)
            self.pos_encoder2 = self.pos_encoder1
        else:
            # NOTE(review): this embedding is registered but forward() never
            # uses it, so no explicit position signal is added in this mode.
            self.position_size_embedding = nn.Embedding(cfg.seq_len + 1 + additional_token_dim, self.embedding_dim) # +1 to include start token

        # Transformer with default encoder/decoder unless custom ones are requested
        self.transformer = nn.Transformer(d_model=input_features_dim,
                                          nhead=cfg.nhead,
                                          num_encoder_layers=cfg.num_encoder_layers,
                                          num_decoder_layers=cfg.num_decoder_layers,
                                          dim_feedforward=cfg.dim_feedforward,
                                          dropout=cfg.dropout,
                                          activation='relu',
                                          custom_encoder = CustomEncoder(d_model=input_features_dim, nhead=cfg.nhead , num_encoder_layers=cfg.num_encoder_layers,
                                                                         dim_feedforward=cfg.dim_feedforward, dropout=cfg.dropout, activation ="relu") if cfg.custom_encoder is True else None,
                                          custom_decoder = CustomDecoder(d_model=input_features_dim, nhead=cfg.nhead, num_decoder_layers=cfg.num_decoder_layers,
                                                                         dim_feedforward=cfg.dim_feedforward, dropout=cfg.dropout, activation='relu') if cfg.custom_decoder is True else None)

        # Output projection
        self.fc = nn.Linear(input_features_dim, self.num_classes)

        self.init_weights()


    def init_weights(self):
        """Apply Xavier-uniform initialization to every parameter with more than one dimension."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    # If a BoolTensor is provided, the positions with the value of True will be ignored while the position with the value of False will be unchanged.
    # tensor([[False,  True,  True,  True],
    #         [False, False,  True,  True],
    #         [False, False, False,  True],
    #         [False, False, False, False]])
    def generate_mask(self, size, diagonal=1):
        """Return a ``(size, size)`` boolean mask that is True strictly above the given diagonal."""
        return torch.triu(torch.ones(size, size)==1, diagonal=diagonal)

    def forward(self, data, src_mask=None, tgt_mask=None, mem_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Compute per-position outputs for a batch of sequences.

        Args:
            data: Mapping from feature name to a ``(BS, seq_len)`` tensor.
            src_mask, tgt_mask, mem_mask: Attention masks forwarded to the Transformer.
            src_key_padding_mask, tgt_key_padding_mask, memory_key_padding_mask:
                Padding masks forwarded to the Transformer.

        Returns:
            The projected Transformer output with dimension 2 squeezed: shape
            ``(BS, seq_len)`` when ``num_classes == 1``, otherwise
            ``(BS, seq_len, num_classes)``.
        """
        # Each input is (BS, seq_len)
        # Content
        data_content_id = data[CONTENT_ID].long()
        # Answers
        data_response = data[TARGET].long()

        # Optional features
        data_part = data[PART].long() if self.exercices_part_embedding is not None else None

        if self.elapsed_time_cat is True:
            data_elapsed_time = data[PRIOR_QUESTION_ELAPSED_TIME].long() if self.elapsed_time_embedding is not None else None
        else:
            data_elapsed_time = data[PRIOR_QUESTION_ELAPSED_TIME].float().unsqueeze(2) if self.elapsed_time_embedding is not None else None
        if self.lag_time_cat is True:
            data_lag_time = data[LAG].long() if self.lag_time_embedding is not None else None
        else:
            data_lag_time = data[LAG].float().unsqueeze(2) if self.lag_time_embedding is not None else None

        data_explanation = data[PRIOR_QUESTION_HAD_EXPLANATION].long() if self.explanation_embedding is not None else None


        # Start token(s)
        # --------------
        # Decoder features are shifted one step right along the sequence axis
        # (torch.roll returns a new tensor, so the caller's data is untouched)
        # and position 0 is overwritten with a start token.

        # Add start token to correctness
        data_response = torch.roll(data_response, shifts=1, dims=1) # Shift right the sequence
        data_response[:,0] = self.response_size # Start token (2)

        # Add start token to lag time
        if data_lag_time is not None:
            data_lag_time = torch.roll(data_lag_time, shifts=1, dims=1) # Shift right the sequence
            # NOTE(review): in continuous mode this writes lag_time_size as a
            # raw value, whereas elapsed time uses 0.0 — confirm this is intended.
            data_lag_time[:,0] = self.lag_time_size # Start token

        # Add start token to elapsed time
        if data_elapsed_time is not None:
            data_elapsed_time = torch.roll(data_elapsed_time, shifts=1, dims=1) # Shift right the sequence
            if self.elapsed_time_cat is True:
                data_elapsed_time[:,0] = self.elapsed_time_size # Start token
            else:
                data_elapsed_time[:,0] = 0.0

        # Add start token to explanation
        if data_explanation is not None:
            data_explanation = torch.roll(data_explanation, shifts=1, dims=1) # Shift right the sequence
            data_explanation[:,0] = self.explanation_size # Start token

        # Questions, Part, Elapsed time, Lag embeddings
        x_content_id = self.exercices_id_embedding(data_content_id) # (BS, seq_len, embedding_dim)

        x_exercices_part = self.exercices_part_embedding(data_part) if self.exercices_part_embedding is not None else None # (BS, seq_len, embedding_dim)
        x_elapsed_time = self.elapsed_time_embedding(data_elapsed_time) if self.elapsed_time_embedding is not None else None # (BS, seq_len, embedding_dim)
        x_lag_time = self.lag_time_embedding(data_lag_time) if self.lag_time_embedding is not None else None # (BS, seq_len, embedding_dim)
        x_explanation = self.explanation_embedding(data_explanation) if self.explanation_embedding is not None else None # (BS, seq_len, embedding_dim)

        # Response embeddings
        x_correctness = self.response_embedding(data_response) # (BS, seq_len, embedding_dim)

        # Ei (sum of embeddings)
        x_exercices = x_content_id

        if x_exercices_part is not None:
            x_exercices = x_exercices + x_exercices_part  # (BS, seq_len, embedding_dim)

        # Ri (sum of embeddings) [S, R1, Rk-1], S is start token
        x_responses = x_correctness

        if x_lag_time is not None:
            x_responses = x_responses + x_lag_time

        if x_elapsed_time is not None:
            x_responses = x_responses + x_elapsed_time # (BS, seq_len, embedding_dim)

        if x_explanation is not None:
            x_responses = x_responses + x_explanation # (BS, seq_len, embedding_dim)

        # Transformer src: (S,N,E), tgt:(T,N,E), src_mask:(S,S), tgt_mask:(T,T)
        # where S is the source sequence length, T is the target sequence length, N is the batch size, E is the feature number
        # output: (T,N,E)
        # src_key_padding_mask: (N,S), tgt_key_padding_mask: (N,T), memory_key_padding_mask: (N,S)
        x_exercices = x_exercices.transpose(1,0) # (seq_len, BS, embedding_dim)
        x_responses = x_responses.transpose(1,0) # (seq_len, BS, embedding_dim)

        # The positional encoder indexes its table by dim 0, so it must see
        # sequence-first tensors. It used to be applied to the batch-first
        # tensors, which made the encoding depend on the sample's index in the
        # batch instead of the token position. Weights trained with the old
        # behaviour should be re-validated.
        x_position_exercices = self.pos_encoder1(x_exercices) if self.pos_encoder1 is not None else x_exercices # (seq_len, BS, embedding_dim)
        x_position_responses = self.pos_encoder2(x_responses) if self.pos_encoder2 is not None else x_responses # (seq_len, BS, embedding_dim)

        x_transformer = self.transformer(src=x_position_exercices, tgt=x_position_responses, src_mask=src_mask, tgt_mask=tgt_mask, memory_mask=mem_mask,
                                         src_key_padding_mask=src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask) # (seq_len, BS, embedding_dim)
        x_transformer = x_transformer.transpose(1,0) # (BS, seq_len, embedding_dim)

        output = self.fc(x_transformer)
        output = output.squeeze(dim=2) # drops the class axis only when num_classes == 1

        return output

In [None]:
def build_model(cfg, device, encoder_weights=None, backbone_weights=None, freeze_backbone=False, verbose=False):
    """Create the model, its loss and its optimizer from ``cfg``.

    Args:
        cfg: Configuration; reads ``map_location``, ``optimizer``, ``lr``,
            ``embedding_dim``, ``warm_up_step_count``, ``warm_up_scale``,
            ``beta1`` and ``loss``.
        device: Device the model and the loss are moved to.
        encoder_weights: Optional checkpoint path; loaded (non-strict) only
            when the file exists.
        backbone_weights: Accepted but not used by this function.
        freeze_backbone: When True and a checkpoint was loaded, calls
            ``model.freeze_backbone()``.
        verbose: Forwarded to the model constructor.

    Returns:
        Tuple ``(model, loss, optimizer)``.
    """
    model = RIIIDModel(cfg, verbose=verbose)

    # Optional warm start from an existing checkpoint.
    checkpoint_available = (encoder_weights is not None) and (os.path.exists(encoder_weights))
    if checkpoint_available:
        print("Load weights before optimizer from: %s" % encoder_weights)
        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        state = torch.load(encoder_weights, map_location=cfg.map_location)
        model.load_state_dict(state, strict=False)
        if freeze_backbone is True:
            print("Freeze backbone")
            # NOTE(review): the RIIIDModel class in this notebook does not
            # define freeze_backbone(); this call needs a matching method.
            model.freeze_backbone()

    model = model.to(device)

    # Optimizer selection: "Noam" schedule wrapper, SGD, or Adam by default.
    if cfg.optimizer == "Noam":
        optimizer = NoamOptimizer(model=model, lr=cfg.lr, model_size=cfg.embedding_dim, warmup=cfg.warm_up_step_count, warmup_scale=cfg.warm_up_scale)
    elif cfg.optimizer == "SGD":
        trainable = (p for p in model.parameters() if p.requires_grad)
        optimizer = SGD(trainable, lr=cfg.lr, momentum=0.9)
    else:
        trainable = (p for p in model.parameters() if p.requires_grad)
        optimizer = Adam(trainable, lr=cfg.lr, betas=(cfg.beta1, 0.999))

    # Loss lives on the same device as the model.
    loss = cfg.loss.to(device)

    return model, loss, optimizer

In [None]:
def accuracy(y_true, y_pred, mask=None):
    """Fraction of positions where ``y_pred`` equals ``y_true``.

    All inputs are flattened first. When ``mask`` is given, only positions
    where the mask is False are counted.
    """
    truth = y_true.reshape(-1)
    guess = y_pred.reshape(-1)

    if mask is not None:
        # True in the mask marks a position to ignore.
        keep = mask.reshape(-1) == False
        truth, guess = truth[keep], guess[keep]

    hits = (truth == guess).sum()
    return hits / len(guess)

def auc(y_true, y_pred, mask=None):
    """ROC AUC of ``y_pred`` against ``y_true`` via ``metrics.roc_auc_score``.

    All inputs are flattened first. When ``mask`` is given, only positions
    where the mask is False are scored.
    """
    truth = y_true.reshape(-1)
    scores = y_pred.reshape(-1)

    if mask is not None:
        # True in the mask marks a position to ignore.
        keep = mask.reshape(-1) == False
        truth, scores = truth[keep], scores[keep]

    return metrics.roc_auc_score(truth, scores)

# Metric used to decide whether a validation epoch improved, and its display name.
METRIC = auc
METRIC_NAME = METRIC.__name__
# Metrics computed on predicted probabilities.
METRICS_PROBS = [METRIC]
# Metrics computed on hard class predictions.
METRICS = [accuracy]

In [None]:
def format_logs(logs):
    """Render a ``{name: value}`` mapping as ``"name - value, ..."`` with 4 significant digits."""
    return ', '.join(f'{name} - {value:.4}' for name, value in logs.items())

# Train loop
def train_loop_fn(batches, preprocessing, model, optimizer, criterion, device, stage="Train", verbose=True):
    """Run one training pass over ``batches`` and return running metrics.

    Every batch performs a forward pass, a loss computation, a backward pass
    and an optimizer step. Loss and predictions are only accumulated for
    batches whose 1-based index is a multiple of ``conf.ITERATIONS_LOGS``
    (when that value is positive). A batch that raises is reported with a
    printed message and skipped.

    Args:
        batches: Iterable of dict batches containing at least ``"labels"`` and
            optionally ``"mask"``.
        preprocessing: Optional callable applied to the batch under ``no_grad``.
        model: Module exposing ``generate_mask`` and the forward signature used below.
        optimizer: Object exposing ``zero_grad()`` and ``step()``.
        criterion: Loss callable accepting ``(output, target, mask=...)``.
        device: Device the batch tensors are moved to.
        stage: Progress-bar description.
        verbose: Enables the progress bar and the running metric display.

    Returns:
        Tuple ``(scores, all_target_classes, all_predicted_classes,
        all_predicted_probs)``. When no batch was logged, every score is NaN
        and the three arrays are ``None``.
    """
    model.train()
    count, train_loss = 0, 0.0
    all_predicted_classes, all_predicted_probs, all_target_classes = None, None, None
    all_padding_mask = None

    src_mask = None
    with tqdm(batches, desc=stage, file=sys.stdout, disable=not(verbose)) as iterator:
        for x, batch in enumerate(iterator, 1):
            try:
                for k, v in batch.items():
                    batch[k] = v.to(device) if k != ROW_ID else None
                samples_data, labels_data, padding_mask_data = batch, batch["labels"], batch.get("mask")

                # The attention mask only depends on seq_len: build it once and reuse it.
                src_mask = model.generate_mask(conf.seq_len).to(device) if src_mask is None else src_mask

                # Metrics are tracked only on every ITERATIONS_LOGS-th batch.
                log_batch = (conf.ITERATIONS_LOGS > 0) and (x % conf.ITERATIONS_LOGS == 0)

                optimizer.zero_grad() # reset gradient

                # Preprocessing
                with torch.no_grad():
                    data = preprocessing(samples_data) if preprocessing is not None else samples_data

                # Model
                output = model(data, src_mask=src_mask, tgt_mask=src_mask, mem_mask=src_mask,
                               src_key_padding_mask=padding_mask_data, tgt_key_padding_mask=padding_mask_data, memory_key_padding_mask=padding_mask_data) # forward pass

                if conf.num_classes > 1:
                    loss = criterion(output.view(-1, conf.num_classes), labels_data.view(-1), mask=padding_mask_data.view(-1)) # Flatten as CrossEntropy requires (BS, C)
                else:
                    loss = criterion(output, labels_data.float(), mask=padding_mask_data)

                if log_batch:
                    loss_value = loss.item()
                    if not np.isnan(loss_value): train_loss += loss_value
                    else: print("Warning: NaN loss")

                loss.backward() # backward pass

                # Update weights
                optimizer.step()

                if log_batch:
                    # Labels predictions
                    predicted_probs = torch.sigmoid(output) if conf.post_activation == "sigmoid" else output
                    predicted_probs = torch.softmax(output, dim=conf.post_activation_dim).detach().cpu().numpy()[:,:,1] if conf.post_activation == "softmax" else predicted_probs.detach().cpu().numpy()
                    predicted_classes = torch.argmax(output[:,:,-2:], dim=conf.post_activation_dim).detach().cpu().numpy() if conf.post_activation == "softmax" else np.where(predicted_probs > conf.train_probs_threshold, 1, 0)
                    target_classes = labels_data.detach().cpu().numpy()
                    padding_mask = padding_mask_data.cpu().numpy() if padding_mask_data is not None else None

                    # Concatenate for all batches
                    all_predicted_probs = np.concatenate([all_predicted_probs, predicted_probs], axis=0) if all_predicted_probs is not None else predicted_probs
                    all_predicted_classes = np.concatenate([all_predicted_classes, predicted_classes], axis=0) if all_predicted_classes is not None else predicted_classes
                    all_target_classes = np.concatenate([all_target_classes, target_classes], axis=0) if all_target_classes is not None else target_classes
                    all_padding_mask = np.concatenate([all_padding_mask, padding_mask], axis=0) if all_padding_mask is not None else padding_mask

                    count += 1

                    if verbose:
                        scores_str = {"train_%s" % m.__name__: m(all_target_classes, all_predicted_probs, mask=all_padding_mask) for m in METRICS_PROBS}
                        if METRICS is not None: scores_str = {**scores_str, **{"train_%s" % m.__name__: m(all_target_classes, all_predicted_classes, mask=all_padding_mask) for m in METRICS}}
                        scores_str["train_loss"] = (train_loss / count)
                        iterator.set_postfix_str(format_logs(scores_str))

            except Exception as ex:
                print("Training batch error:", ex)

    if count == 0:
        # Nothing was logged (ITERATIONS_LOGS <= 0, fewer batches than the
        # logging interval, or every batch failed). The metrics below would be
        # called on None and the loss divided by zero, so report NaN instead.
        tracked = list(METRICS_PROBS) + list(METRICS or [])
        scores = {"train_%s" % m.__name__: float("nan") for m in tracked}
        scores["train_loss"] = float("nan")
        return (scores, all_target_classes, all_predicted_classes, all_predicted_probs)

    scores = {"train_%s" % m.__name__: m(all_target_classes, all_predicted_probs, mask=all_padding_mask) for m in METRICS_PROBS}
    if METRICS is not None: scores = {**scores, **{"train_%s" % m.__name__: m(all_target_classes, all_predicted_classes, mask=all_padding_mask) for m in METRICS}}
    scores["train_loss"] = (train_loss / count)

    return (scores, all_target_classes, all_predicted_classes, all_predicted_probs)

In [None]:
# Valid loop
def valid_loop_fn(batches, preprocessing, model, criterion, device, stage="Valid", verbose=True):
    """Evaluate ``model`` over ``batches`` without updating weights.

    The model is switched to eval mode and every batch is run under
    ``torch.no_grad()``. Loss, predicted probabilities, predicted classes,
    targets and padding masks are accumulated for every batch that completes;
    a batch that raises is reported with a printed message and skipped.

    Args:
        batches: Iterable of dict batches containing at least ``"labels"`` and
            optionally ``"mask"``.
        preprocessing: Optional callable applied to the batch before the model.
        model: Module exposing ``generate_mask`` and the forward signature used below.
        criterion: Loss callable accepting ``(output, target, mask=...)``.
        device: Device the batch tensors are moved to.
        stage: Progress-bar description.
        verbose: Enables the progress bar and the running metric display.

    Returns:
        Tuple ``(scores, all_target_classes, all_predicted_classes,
        all_predicted_probs, all_padding_mask)`` where ``scores`` maps
        ``"valid_<metric>"`` / ``"valid_loss"`` to their values.

    NOTE(review): if no batch completes, ``count`` stays 0 and the final score
    computation fails (metrics receive ``None`` and the loss is divided by zero).
    """
    model.eval()
    count, valid_loss = 0, 0.0
    all_predicted_classes, all_predicted_probs, all_target_classes = None, None, None
    all_padding_mask = None

    # Attention mask is built lazily on the first batch and reused afterwards.
    src_mask = None
    with tqdm(batches, desc=stage, file=sys.stdout, disable=not(verbose)) as iterator:
        for batch in iterator:
            try:
                # Move every entry to the device; the ROW_ID entry is replaced by None.
                for k, v in batch.items():
                    batch[k] = v.to(device) if k != ROW_ID else None
                samples_data, labels_data, padding_mask_data = batch, batch["labels"], batch.get("mask")
                padding_mask_data_ = padding_mask_data

                src_mask = model.generate_mask(conf.seq_len).to(device) if src_mask is None else src_mask                  

                with torch.no_grad():
                    # Preprocessing
                    data = preprocessing(samples_data) if preprocessing is not None else samples_data
                    # NN model        
                    output = model(data, src_mask=src_mask, tgt_mask=src_mask, mem_mask=src_mask, 
                                   src_key_padding_mask=padding_mask_data_, tgt_key_padding_mask=padding_mask_data_, memory_key_padding_mask=padding_mask_data_) # forward pass

                # Compute loss
                if conf.num_classes > 1:
                    padding_mask_data_copy = padding_mask_data             
                    loss = criterion(output.view(-1, conf.num_classes), labels_data.view(-1), mask=padding_mask_data_copy.view(-1)) # Flatten as CrossEntropy requires (BS, C)
                else:
                    loss = criterion(output, labels_data.float(), mask=padding_mask_data)

                # NaN losses are reported but left out of the running sum.
                loss_value = loss.item()
                if ~np.isnan(loss_value): valid_loss += loss_value
                else: print("Warning: NaN loss")
          
                # Labels predictions
                # "sigmoid": probabilities are thresholded with conf.train_probs_threshold.
                # "softmax": the probability at index 1 is kept and the class is the argmax over the last two outputs.
                predicted_probs = torch.sigmoid(output) if conf.post_activation == "sigmoid" else output
                predicted_probs = torch.softmax(output, dim=conf.post_activation_dim).detach().cpu().numpy()[:,:,1] if conf.post_activation == "softmax" else predicted_probs.detach().cpu().numpy()
                predicted_classes = torch.argmax(output[:,:,0-2:], dim=conf.post_activation_dim).detach().cpu().numpy() if conf.post_activation == "softmax" else np.where(predicted_probs > conf.train_probs_threshold, 1, 0)
                target_classes = labels_data.detach().cpu().numpy()
                padding_mask = padding_mask_data.cpu().numpy() if padding_mask_data is not None else None

                # Concatenate for all batches
                all_predicted_probs = np.concatenate([all_predicted_probs, predicted_probs], axis=0) if all_predicted_probs is not None else predicted_probs
                all_predicted_classes = np.concatenate([all_predicted_classes, predicted_classes], axis=0) if all_predicted_classes is not None else predicted_classes      
                all_target_classes = np.concatenate([all_target_classes, target_classes], axis=0) if all_target_classes is not None else target_classes      
                all_padding_mask = np.concatenate([all_padding_mask, padding_mask], axis=0) if all_padding_mask is not None else padding_mask      

                count += 1

                # Running metrics over everything accumulated so far, shown on the progress bar.
                if verbose: 
                    scores_str = {"valid_%s" % m.__name__: m(all_target_classes, all_predicted_probs, mask=all_padding_mask) for m in METRICS_PROBS}
                    if METRICS is not None: scores_str = {**scores_str, **{"valid_%s" % m.__name__: m(all_target_classes, all_predicted_classes, mask=all_padding_mask) for m in METRICS}}
                    scores_str["valid_loss"]= (valid_loss / count)
                    iterator.set_postfix_str(format_logs(scores_str))
                
            except Exception as ex:
                print("Validation batch error:", ex)
    
    # Final metrics over all accumulated batches; the loss is averaged over completed batches.
    scores = {"valid_%s" % m.__name__: m(all_target_classes, all_predicted_probs, mask=all_padding_mask) for m in METRICS_PROBS}
    if METRICS is not None: scores = {**scores, **{"valid_%s" % m.__name__: m(all_target_classes, all_predicted_classes, mask=all_padding_mask) for m in METRICS}}
    scores["valid_loss"]= (valid_loss / count)    

    return (scores, all_target_classes, all_predicted_classes, all_predicted_probs, all_padding_mask)

In [None]:
# Train one fold
factory = None


def _current_lr(optimizer, scheduler):
    """Return the learning rate currently in effect for logging purposes."""
    if scheduler is not None and not isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
        return scheduler.get_last_lr()[0]
    if scheduler is None and hasattr(optimizer, "get_last_lr"):
        # Schedule wrappers such as NoamOptimizer report their own rate.
        return optimizer.get_last_lr()
    # Plain torch optimizers (and ReduceLROnPlateau) expose it on the param groups.
    return optimizer.param_groups[0]['lr']


def run_stage(X_train, X_valid, stage, fold, device):
    """Train one fold for one stage and keep the best snapshot on disk.

    Builds the train/valid datasets and loaders, creates the model, loss and
    optimizer, optionally resumes from the stage's best snapshot, then runs
    ``conf.EPOCHS`` epochs of training and validation. After the first epoch,
    whenever the validation metric improves (direction given by
    ``conf.METRIC_``) the model weights are saved to
    ``<MODEL_PATH>/fold<fold>/<stage>/snapshots/<MODEL_BEST>``.

    Args:
        X_train: Training data passed to ``RIIIDDataset``.
        X_valid: Validation data passed to ``RIIIDDataset``.
        stage: Stage name used in the snapshot path.
        fold: Fold number used in the snapshot path.
        device: Device used for the model and the batches.

    Returns:
        List with one dict per epoch holding the epoch number, the learning
        rate and the train/valid scores.
    """
    # Datasets
    train_dataset = RIIIDDataset(X_train, conf, factory, subset="train", categoricals=None, weights=True if conf.sampler == "prob" else False, flatten = conf.flatten, force_len=TRAIN_SPLIT_LEN)
    valid_dataset = RIIIDDataset(X_valid, conf, factory, subset="valid", categoricals=None, flatten = conf.flatten)

    train_sampler = WeightedRandomSampler(weights=train_dataset.uweights["prob"].values, replacement=True, num_samples=len(train_dataset)) if conf.sampler == "prob" else None

    # Dataloaders (shuffling and a custom sampler are mutually exclusive)
    train_loader = DataLoader(train_dataset, batch_size=conf.BATCH_SIZE, sampler=train_sampler, num_workers=conf.WORKERS, drop_last = False, pin_memory=conf.pin_memory, shuffle=True if train_sampler is None else False)
    valid_loader = DataLoader(valid_dataset, batch_size=conf.BATCH_SIZE, shuffle=False, num_workers=conf.WORKERS, drop_last = False, pin_memory=conf.pin_memory)

    # Build model
    snapshot_path = "%s/fold%d/%s/snapshots" % (MODEL_PATH, fold, stage)
    os.makedirs(snapshot_path, exist_ok=True)
    cnn_model, criterion, optimizer = build_model(conf, device, freeze_backbone=FREEZE_BACKBONE,
                                                  encoder_weights=os.path.join(snapshot_path.replace(stage, PRETRAINED_STAGE), MODEL_BEST) if PRETRAINED_STAGE is not None else None,
                                                  backbone_weights=os.path.join(snapshot_path.replace(stage, PRETRAINED_BACKBONE_STAGE), MODEL_BEST) if PRETRAINED_BACKBONE_STAGE is not None else "imagenet")

    if RESUME == True:
        resume_path = os.path.join(snapshot_path, MODEL_BEST)
        if os.path.exists(resume_path):
            cnn_model.load_state_dict(torch.load(resume_path, map_location=conf.map_location))
            print("Resuming, model weights loaded: %s" % resume_path)

    scheduler = None
    if conf.scheduler is not None:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=conf.EPOCHS_PER_CYCLE+1, eta_min=conf.min_lr)
    print(criterion, optimizer, scheduler)

    metric = METRIC_NAME
    # np.inf (lowercase) is used because the np.Inf alias no longer exists in NumPy 2.x.
    valid_loss_min = np.inf
    metric_loss_criterion = np.inf if conf.METRIC_ == "min" else -np.inf
    history = []
    for epoch in tqdm(range(1, conf.EPOCHS + 1)):

        # Works for plain torch optimizers too, which have no get_last_lr().
        lr = _current_lr(optimizer, scheduler)
        info = "[%d], lr=%.7f" % (epoch, lr)

        # Train loop
        train_scores, _, _, _ = train_loop_fn(train_loader, None, cnn_model, optimizer, criterion, device, stage="Train%s" % info, verbose=conf.train_verbose)

        # Validation loop
        valid_scores, _, _, _, _ = valid_loop_fn(valid_loader, None, cnn_model, criterion, device, stage="Valid%s" % info, verbose=conf.valid_verbose)

        # Keep track of loss and metrics
        history.append({"epoch":epoch, "lr": lr, **train_scores, **valid_scores})

        if conf.scheduler is not None:
            scheduler.step()

        # Snapshot the model when the tracked metric improves (never on the first epoch).
        metric_loss = valid_scores["valid_%s" % metric]
        if (conf.METRIC_ == "min" and metric_loss < metric_loss_criterion and epoch > 1) or (conf.METRIC_ == "max" and metric_loss > metric_loss_criterion and epoch > 1):
            print("Epoch%s, Valid loss from: %.4f to %.4f, Metric improved from %.4f to %.4f, saving model ..." % (info, valid_loss_min, valid_scores["valid_loss"], metric_loss_criterion, metric_loss))
            metric_loss_criterion = metric_loss
            valid_loss_min = valid_scores["valid_loss"]
            torch.save(cnn_model.state_dict(), os.path.join(snapshot_path, MODEL_BEST))

    return (history)

In [None]:
# Training cell: runs only when the notebook-level TRAIN flag is True.
if TRAIN is True:
    print(FEATURES)
    # Report the number of distinct users and the frame shapes of both splits of this fold.
    print('Fold', FOLD, 'train users:', train_pd.reset_index()[USER_ID].nunique(), 'valid users:', valid_pd.reset_index()[USER_ID].nunique()) 
    print('Fold', FOLD, 'train size:', train_pd.shape, 'valid size:', valid_pd.shape)
    # Train this fold/stage; the returned per-epoch history is discarded.
    _ = run_stage(train_pd, valid_pd, STAGE, FOLD, conf.L_DEVICE)