In [19]:
!nvidia-smi

Wed Apr 26 12:26:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  Off |
|  0%   42C    P8    26W / 480W |    681MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [20]:
import os
import re
import gc
import pdb
import sys
import json
import math
import time
import wandb
import pickle
import shutil
import joblib
import random
import pathlib
import requests
import warnings
from glob import glob
from typing import List
from pathlib import Path
from tqdm.auto import tqdm
from pandarallel import pandarallel
import multiprocessing
from functools import partial

import scipy
import itertools
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    StratifiedKFold,
    KFold,
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error, f1_score, fbeta_score, recall_score, precision_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OrdinalEncoder, StandardScaler, RobustScaler

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


sys.path.append("/home/working/")
from kagglib.utils.utils import  Timer, reduce_mem_usage, get_logger, decorate, setup, dataset_create_new, seed_everything
from kagglib.utils.exp_manage import set_wandb


%load_ext autoreload
%autoreload 2
%env TOKENIZERS_PARALLELISM=true

# Silence every Python warning for the rest of the session (this also hides
# deprecation notices from the libraries used below).
warnings.filterwarnings('ignore')
# Let pandas render up to 300 columns before truncating the display.
pd.set_option('display.max_columns', 300)
# Start pandarallel with progress bars enabled.
pandarallel.initialize(progress_bar=True)
# Default figure size and base matplotlib style for all plots.
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('ggplot')
# NOTE(review): sns.set() is applied after the ggplot style — presumably it
# overrides parts of it; confirm the resulting look is the intended one.
sns.set(font_scale = 2)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: TOKENIZERS_PARALLELISM=true
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Setup & data load

In [21]:
# Experiment configuration. Every attribute is a plain class-level constant;
# the class is handed to the project helper `setup`, whose return value is
# the `cfg` object used by the rest of the notebook.
# NOTE(review): attributes such as cfg.INPUT, cfg.device and cfg.EXP_MODEL are
# used later but not defined here — presumably added by setup(); confirm.
class Config:
    AUTHOR = "shu421"

    EXP = "exp069"
    COMPETITION = "predict-student-performance-from-game-play"
    DATASET_PATH = []
    BASE_PATH = "/home/working/"
    api_path = "/root/.kaggle/kaggle.json"

    seed = 42
    n_folds = 5
    es_patience = 15  # epochs without improvement before early stopping
    train_folds = [0]  # fold indices that are actually trained

    # weight and bias
    wandb = False

    # model
    epochs = 200
    # maximum sequence length, keyed by level_group
    max_length = {"0-4":368, "5-12":768, "13-22":1024}
    batch_size = 64
    num_workers = 4

    # keyword arguments forwarded to the model constructor
    params = {
        "hidden_size": 256,
        "dropout":  0.2,
        "nhead": 8,
        "num_layers": 1,
        "dim_feedforward": 512,
    }

    # optimizer
    lr = 2e-5
    betas = (0.9, 0.999)
    weight_decay = 0.01
    eps = 1e-6
    lr_weight_decay = 0.95
    min_lr = 1e-6

    padding_mode = "back" # front: (front padding, back sequence), back: (back padding, front sequence)

    # Output directories are created while the class body executes.
    # exist_ok=True replaces the racy "check, then create" pattern.
    FEAT_PATH = Path(BASE_PATH) / f"output/{EXP}/feat"
    FEAT_PATH.mkdir(parents=True, exist_ok=True)

    LOG_PATH = Path(BASE_PATH) / "output/log"
    LOG_PATH.mkdir(parents=True, exist_ok=True)
cfg = setup(Config)

In [22]:
# set log functions
# The log file is <cfg.LOG_PATH>/<cfg.EXP>.log.
LOGGER = get_logger(Path(cfg.LOG_PATH) / f"{cfg.EXP}.log")
# Weights & Biases tracking only starts when cfg.wandb is truthy.
# NOTE(review): cfg.MODEL_NAME is not defined in the visible Config class —
# presumably provided by setup(); confirm before enabling wandb.
if cfg.wandb:
    run = set_wandb(cfg, name=cfg.EXP, group=cfg.MODEL_NAME, config_path="/root/.kaggle/wandb.json")

In [23]:
def get_whole_df():
    """Read the four competition CSV files from ``cfg.INPUT`` with polars.

    Returns:
        tuple: ``(train_df, train_labels_df, test_df, sub_df)`` as polars
        DataFrames, read from ``train.csv``, ``train_labels.csv``,
        ``test.csv`` and ``sample_submission.csv`` respectively.
    """
    input_dir = Path(cfg.INPUT)
    file_names = ("train.csv", "train_labels.csv", "test.csv", "sample_submission.csv")
    train_df, train_labels_df, test_df, sub_df = [
        pl.read_csv(input_dir / file_name) for file_name in file_names
    ]
    return train_df, train_labels_df, test_df, sub_df

def preprocess_df(train_df, train_labels_df, test_df):
    """Clean the label table and prepare the raw event log.

    Args:
        train_df: polars event log; needs the columns ``page``,
            ``event_name``, ``name``, ``session_id`` and ``index``.
        train_labels_df: polars label table with a string ``session_id``
            column (assumed to look like ``"<session>_q<number>"`` — TODO
            confirm against the data).
        test_df: returned untouched.

    Returns:
        tuple: ``(train_df, train_labels_df, test_df)`` where the labels are
        now a pandas DataFrame sorted by ``session`` and ``q``.
    """
    def _session_of(session_id):
        # integer before the first underscore
        return int(session_id.split("_")[0])

    def _question_of(session_id):
        # last underscore-separated chunk with its first character dropped
        return int(session_id.split("_")[-1][1:])

    # Labels: derive numeric keys, then switch to pandas for sorting.
    labels = train_labels_df.with_columns(pl.col("session_id").apply(_session_of).alias("session"))
    labels = labels.with_columns(pl.col("session_id").apply(_question_of).alias("q"))
    labels = labels.to_pandas().sort_values(["session", "q"]).reset_index(drop=True)

    # Events: cast "page" to Float32 (non-strict), build the combined
    # "event_comb" category and order rows by session_id and index.
    events = (
        train_df
        .with_columns(pl.col("page").cast(pl.Float32, strict=False))
        .with_columns((pl.col("event_name") + "_" + pl.col("name")).alias("event_comb"))
        .sort(["session_id", "index"])
    )

    return events, labels, test_df

def get_processed_df():
    """Load the raw CSV files and run ``preprocess_df`` on them.

    Returns:
        tuple: ``(train_df, train_labels_df, test_df, sub_df)``; the first
        three come from ``preprocess_df``, ``sub_df`` is passed through as
        loaded.
    """
    raw_train, raw_labels, raw_test, sub_df = get_whole_df()
    processed = preprocess_df(raw_train, raw_labels, raw_test)
    return (*processed, sub_df)

# Setup & Preprocessing

In [24]:
# Load and preprocess all tables once; later cells reuse these globals.
train_df, train_labels_df, test_df, sub_df = get_processed_df()

In [25]:
# Preview the preprocessed event log (includes the derived "event_comb" column).
train_df

session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group,event_comb
i64,i64,i64,str,str,i64,f32,f64,f64,f64,f64,f64,str,str,str,str,i64,i64,i64,str,str
20090312431273200,0,0,"""cutscene_click...","""basic""",0,,-413.991405,-159.314686,380.0,494.0,,"""undefined""","""intro""","""tunic.historic...","""tunic.historic...",0,0,1,"""0-4""","""cutscene_click..."
20090312431273200,1,1323,"""person_click""","""basic""",0,,-413.991405,-159.314686,380.0,494.0,,"""Whatcha doing ...","""gramps""","""tunic.historic...","""tunic.historic...",0,0,1,"""0-4""","""person_click_b..."
20090312431273200,2,831,"""person_click""","""basic""",0,,-413.991405,-159.314686,380.0,494.0,,"""Just talking t...","""gramps""","""tunic.historic...","""tunic.historic...",0,0,1,"""0-4""","""person_click_b..."
20090312431273200,3,1147,"""person_click""","""basic""",0,,-413.991405,-159.314686,380.0,494.0,,"""I gotta run to...","""gramps""","""tunic.historic...","""tunic.historic...",0,0,1,"""0-4""","""person_click_b..."
20090312431273200,4,1863,"""person_click""","""basic""",0,,-412.991405,-159.314686,381.0,494.0,,"""Can I come, Gr...","""gramps""","""tunic.historic...","""tunic.historic...",0,0,1,"""0-4""","""person_click_b..."
20090312431273200,5,3423,"""person_click""","""basic""",0,,-412.991405,-157.314686,381.0,492.0,,"""Sure thing, Jo...","""gramps""","""tunic.historic...","""tunic.historic...",0,0,1,"""0-4""","""person_click_b..."
20090312431273200,6,5197,"""person_click""","""basic""",0,,478.485079,-199.971679,593.0,485.0,,"""See you later,...","""teddy""","""tunic.historic...","""tunic.historic...",0,0,1,"""0-4""","""person_click_b..."
20090312431273200,7,6180,"""person_click""","""basic""",0,,503.355128,-168.619913,609.0,453.0,,"""I get to go to...","""teddy""","""tunic.historic...","""tunic.historic...",0,0,1,"""0-4""","""person_click_b..."
20090312431273200,8,7014,"""person_click""","""basic""",0,,510.733442,-157.720642,615.0,442.0,,"""Now where did ...","""teddy""","""tunic.historic...","""tunic.historic...",0,0,1,"""0-4""","""person_click_b..."
20090312431273200,9,7946,"""person_click""","""basic""",0,,512.048005,-153.743631,616.0,438.0,,"""\u00f0\u0178\u...","""teddy""","""tunic.historic...","""tunic.historic...",0,0,1,"""0-4""","""person_click_b..."


# Feature Engineering

In [26]:
# Categorical event columns that get integer-encoded and embedded.
# NOTE(review): the model defined later indexes exactly three categorical
# features, so this list is expected to keep three entries.
cat_cols = [
    "event_comb", 
    "room_fqid", 
    "text_fqid", 
    ]
# Numeric sequence features; "elapsed_time_diff" is created in feature_engineering.
num_cols = [
    "elapsed_time_diff",
    ]
# Order matters: categorical columns first, then numeric ones.
feat_cols = cat_cols + num_cols

In [27]:
def get_factorize_map(values: pl.Series, sort: bool = True):
    """Build a value -> integer code lookup for a categorical series.

    Codes start at 2. Code 0 is the padding id used by the dataset and the
    embedding layers in this notebook; code 1 is left unused here
    (presumably reserved for unknown values — TODO confirm).

    Parameters:
        values: 1-D sequence to factorize
        sort: whether to sort the unique values before numbering them

    Return:
        val2code: dict mapping each unique value to its code
    """
    uniques = values.unique()
    uniques = uniques.sort() if sort else uniques
    return {value: code for code, value in enumerate(uniques, start=2)}

In [28]:
def feature_engineering(input_df, grp=None, is_train=True) -> pl.DataFrame:
    """Turn an event log into one row of feature sequences per session.

    Steps:
        1. Add ``elapsed_time_diff``: difference of ``elapsed_time`` to the
           previous row within each (``session_id``, ``level``) window, with
           nulls filled by 0 and values clipped to ``[0, 6e5]``.
        2. Integer-encode every column in ``cat_cols``. With ``is_train=True``
           the mapping is built from ``input_df`` and pickled to
           ``cfg.FEAT_PATH``; otherwise the stored mapping is loaded.
        3. Group by ``session_id`` and collect ``feat_cols`` into lists.

    Args:
        input_df (pl.DataFrame): event log containing ``session_id``,
            ``level``, ``elapsed_time`` and the columns in ``cat_cols``.
        grp (optional): level-group tag, only used in the mapping file name.
            Defaults to None.
        is_train (bool, optional): fit and save the mappings (True) or load
            previously saved ones (False). Defaults to True.

    Returns:
        output_df (pl.DataFrame): one row per ``session_id`` (sorted) with a
        list column for each entry of ``feat_cols``.
    """
    ############################ create diff cols ############################
    columns = [
        (
            (pl.col("elapsed_time") - pl.col("elapsed_time").shift(1))
            .fill_null(0)
            .clip(0, 6e5)
            .over(["session_id", "level"])
            .alias("elapsed_time_diff")
        ),
    ]
    input_df = input_df.with_columns(columns)

    # ordinal encoding
    for c in cat_cols:
        map_path = Path(f"{cfg.FEAT_PATH}/cat2code_{c}_grp{grp}.pkl")
        if is_train:
            cat2code = get_factorize_map(input_df.get_column(c))
            # Context managers guarantee the file handle is closed.
            with open(map_path, "wb") as f:
                pickle.dump(cat2code, f)
        else:
            # NOTE: pickle executes arbitrary code on load; only read files
            # this notebook wrote itself.
            with open(map_path, "rb") as f:
                cat2code = pickle.load(f)
        # NOTE(review): values missing from the mapping are not given an
        # explicit default here — confirm how unseen categories should be
        # encoded at inference time.
        input_df = input_df.with_columns(pl.col(c).map_dict(cat2code).alias(c))

    output_df = input_df.groupby("session_id", maintain_order=True).agg(pl.col(feat_cols)).sort("session_id")

    return output_df

# Dataset

In [29]:
class PSPDataset(Dataset):
    """Per-session sequence dataset with fixed-length padding/truncation.

    Each row of ``cat_feat`` / ``num_feat`` holds one list (sequence) per
    feature column. ``__getitem__`` stacks the lists, pads or truncates them
    to ``max_length`` and returns tensors shaped ``(max_length, n_features)``
    together with an attention mask (1 = real step, 0 = padding).

    Args:
        cat_feat: DataFrame of categorical sequences (integer codes).
        num_feat: DataFrame of numeric sequences.
        target: array indexed by row position; required when ``mode="train"``.
        max_length: output sequence length. When falsy, the longest sequence
            found in ``cat_feat`` is used.
        mode: ``"train"`` returns ``(inputs, target)``, anything else returns
            ``inputs`` only.
        padding_mode: ``"front"`` or ``"back"``. When None, the global
            ``cfg.padding_mode`` is read at item-access time.
    """

    def __init__(self, cat_feat: pd.DataFrame, num_feat: pd.DataFrame,  target: np.array = None, max_length=None, mode="train", padding_mode=None):
        self.cat_feat = cat_feat
        self.num_feat = num_feat
        self.target = target
        self.mode = mode
        self.padding_mode = padding_mode
        if not max_length:
            # Fall back to the longest sequence in the data. The previous
            # DataFrame.apply(len) measured the number of rows per column,
            # not the sequence lengths.
            max_length = max(
                (len(seq) for col in cat_feat.columns for seq in cat_feat[col]),
                default=0,
            )
        self.max_length = max_length

    def __len__(self):
        return len(self.cat_feat)

    def __getitem__(self, idx):
        """
        Args:
            idx (int): positional index of the sample to fetch

        Returns:
            inputs (Tuple or Dict): data fed to the model
                {cat_feat, num_feat, attention_mask}, target if mode == "train" else {cat_feat, num_feat, attention_mask}
        """
        padding_mode = self.padding_mode if self.padding_mode is not None else cfg.padding_mode

        # np.stack turns the per-feature lists into (n_features, seq_len).
        cat_feat_tensor = torch.tensor(np.stack(self.cat_feat.iloc[idx]), dtype=torch.long)
        num_feat_tensor = torch.tensor(np.stack(self.num_feat.iloc[idx]), dtype=torch.float32)

        # padding; both calls yield the same mask because both feature groups
        # are expected to share one sequence length per sample
        cat_feat_padded, _ = self.get_padded_data(cat_feat_tensor, mode=padding_mode)
        num_feat_padded, attention_mask = self.get_padded_data(num_feat_tensor, mode=padding_mode)

        inputs = {
            "cat_feat": cat_feat_padded.permute(1, 0), # (seq_len, feature_size)
            "num_feat": num_feat_padded.permute(1, 0),  # (seq_len, feature_size)
            "attention_mask": attention_mask, # (seq_len,)
        }

        if self.mode == "train":
            target_data = torch.tensor(self.target[idx], dtype=torch.float32)
            return inputs, target_data
        else:
            return inputs

    def get_padded_data(self, data, mode="front"):
        """Pad or truncate ``data`` along its last axis to ``self.max_length``.

        Args:
            data: tensor whose last dimension is the sequence axis.
            mode: ``"front"`` pads on the left and keeps the last steps of an
                over-long sequence; ``"back"`` pads on the right and keeps the
                first steps.

        Returns:
            (data_padded, attention_mask): the padded tensor and a 1-D long
            mask of length ``self.max_length``.

        Raises:
            ValueError: if ``mode`` is neither ``"front"`` nor ``"back"``.
        """
        pad_token_id = 0  # padding id is 0

        # Sequence length is the LAST dimension; len(data) would be the
        # number of features for a (n_features, seq_len) tensor.
        seq_len = data.shape[-1]
        n_keep = min(seq_len, self.max_length)
        n_pad = self.max_length - n_keep
        ones = torch.ones(n_keep, dtype=torch.long)
        zeros = torch.zeros(n_pad, dtype=torch.long)

        if mode == "front":
            kept = data[..., seq_len - n_keep:]
            data_padded = F.pad(kept, (n_pad, 0), mode='constant', value=pad_token_id)
            attention_mask = torch.cat([zeros, ones])
        elif mode == "back":
            kept = data[..., :n_keep]
            data_padded = F.pad(kept, (0, n_pad), mode='constant', value=pad_token_id)
            attention_mask = torch.cat([ones, zeros])
        else:
            raise ValueError(f"unknown padding mode: {mode!r} (expected 'front' or 'back')")
        return data_padded, attention_mask

In [30]:
def collate_fn(batch, max_length, padding_mode=None):
    """Stack dataset samples into batch tensors, cropped to ``max_length``.

    Args:
        batch: list of samples. Each sample is either ``(inputs, target)``
            (training/validation) or a bare ``inputs`` dict (inference),
            where ``inputs`` has the keys ``cat_feat``, ``num_feat`` and
            ``attention_mask`` with the sequence on the first axis.
        max_length: number of sequence steps to keep.
        padding_mode: ``"front"`` keeps the last ``max_length`` steps,
            ``"back"`` keeps the first ones. When None, the global
            ``cfg.padding_mode`` is used.

    Returns:
        ``(inputs_stacked, targets_stacked)`` when targets are present,
        otherwise ``inputs_stacked`` alone.

    Raises:
        ValueError: if the padding mode is neither ``"front"`` nor ``"back"``.
    """
    if padding_mode is None:
        padding_mode = cfg.padding_mode

    # Inference samples are plain dicts; zip(*batch) on those would zip the
    # dict keys instead of separating inputs from targets.
    if isinstance(batch[0], (tuple, list)):
        inputs, targets = zip(*batch)
    else:
        inputs, targets = tuple(batch), None

    if padding_mode == "front":
        window = slice(-max_length, None)
    elif padding_mode == "back":
        window = slice(None, max_length)
    else:
        raise ValueError(f"unknown padding mode: {padding_mode!r} (expected 'front' or 'back')")

    inputs_stacked = {
        key: torch.stack([_input[key][window] for _input in inputs], dim=0)
        for key in ("cat_feat", "num_feat", "attention_mask")
    }

    if targets is not None and targets[0] is not None:
        targets_stacked = torch.stack(targets, dim=0)
        return inputs_stacked, targets_stacked
    else:
        return inputs_stacked

# Model

In [31]:
class MeanPooling(nn.Module):
    """Attention-mask-aware mean over the sequence axis (dim 1)."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average ``last_hidden_state`` over the steps where the mask is 1.

        The mask gets a trailing axis and is expanded to the shape of
        ``last_hidden_state``. The denominator is clamped to ``1e-9`` so a
        fully masked row yields zeros instead of a division by zero.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(1)
        valid_count = torch.clamp(weights.sum(1), min=1e-9)
        return masked_total / valid_count

In [32]:
class PSPTransformerModel(nn.Module):
    """Sequence model: embeddings + linear projection -> LSTM -> Transformer
    encoder -> masked mean pooling -> linear head.

    Args:
        cfg: configuration object, stored on the instance.
        n_cat_feat: vocabulary size per categorical feature; the first three
            entries are used for the three embedding tables.
        n_num_feat: number of numeric features per step.
        num_targets: size of the output vector (one logit per target).
        hidden_size: embedding / model width.
        dropout: dropout rate of the Transformer encoder layer.
        nhead: number of attention heads.
        num_layers: number of Transformer encoder layers.
        dim_feedforward: feed-forward width inside the encoder layer.
    """
    def __init__(self, cfg, n_cat_feat, n_num_feat, num_targets, hidden_size=256, dropout=0.1, nhead=8, num_layers=1, dim_feedforward=512):
        super(PSPTransformerModel, self).__init__()
        self.cfg = cfg
        # NOTE(review): exactly three categorical features are wired in by
        # hand; cat_linear below sizes itself from len(n_cat_feat), so the
        # two only agree when len(n_cat_feat) == 3.
        self.n_cat_feat1 = n_cat_feat[0]
        self.n_cat_feat2 = n_cat_feat[1]
        self.n_cat_feat3 = n_cat_feat[2]
        self.n_num_feat = n_num_feat
        self.num_targets = num_targets
        # Index 0 is the padding id for all three embedding tables.
        self.embedding1 = nn.Embedding(self.n_cat_feat1, hidden_size, padding_idx=0)
        self.embedding2 = nn.Embedding(self.n_cat_feat2, hidden_size, padding_idx=0)
        self.embedding3 = nn.Embedding(self.n_cat_feat3, hidden_size, padding_idx=0)
        # Project the concatenated embeddings / the numeric features to hidden_size.
        self.cat_linear = nn.Linear(hidden_size*len(n_cat_feat), hidden_size)
        self.num_linear = nn.Linear(n_num_feat, hidden_size)

        # NOTE(review): this rate is fixed at 0.1 and ignores the `dropout`
        # argument, which only reaches the encoder layer — confirm intent.
        self.dropout = nn.Dropout(0.1)
        
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=1, batch_first=True)

        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True, norm_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers, enable_nested_tensor=False)

        self.pool = MeanPooling()
        self.head = nn.Linear(hidden_size, num_targets)

    
    def forward(self, cat_feat, num_feat, attention_mask):
        """Return one logit per target for every sample in the batch.

        Args:
            cat_feat: integer tensor indexed as ``[batch, step, feature]``
                with three features on the last axis.
            num_feat: float tensor whose last axis has ``n_num_feat`` entries.
            attention_mask: 1 for real steps, 0 for padding (assumed shape
                ``(batch, step)`` — TODO confirm).
        """
        # cat feat
        x_cat1 = self.embedding1(cat_feat[:, :, 0])
        x_cat2 = self.embedding2(cat_feat[:, :, 1])
        x_cat3 = self.embedding3(cat_feat[:, :, 2])
        x_cat = torch.cat([
            x_cat1,
            x_cat2,
            x_cat3,
            ], dim=-1)
        x_cat = self.cat_linear(x_cat)
        
        # num feat
        x_num = self.num_linear(num_feat)
        
        # combine: the two projections are summed element-wise (not concatenated)
        x = x_cat + x_num
        x = self.dropout(x)

        # lstm encoding
        # NOTE(review): the LSTM runs over padded steps as well (no packing).
        x, _ = self.lstm(x)

        # transformer encoder
        # True marks padded positions that attention must ignore.
        src_key_padding_mask = torch.where(attention_mask == 1, False, True)
        x = self.encoder(x, src_key_padding_mask=src_key_padding_mask)

        # pooling
        x = self.pool(x, attention_mask)
        x = self.head(x)
        return x

# Train funcs

In [33]:
class AverageMeter:
    """Running weighted average that also remembers the latest value."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set ``val``, ``avg``, ``sum`` and ``count`` back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as seen ``n`` times and refresh the average."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count

In [34]:
def train_fn(model, data_loader, criterion, optimizer, scheduler=None):
    """Run one training epoch and return the sample-weighted mean loss.

    Every batch is moved to ``cfg.device``, followed by a forward pass,
    backward pass and optimizer step. ``scheduler`` is accepted but not used
    inside this function.
    """
    model.train()
    epoch_loss = AverageMeter()
    for inputs, labels in data_loader:
        # Move the batch to the target device (dict is updated in place).
        for name in list(inputs):
            inputs[name] = inputs[name].to(cfg.device)
        labels = labels.to(cfg.device)

        optimizer.zero_grad()
        logits = model(**inputs)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        epoch_loss.update(batch_loss.item(), logits.size(0))
    return epoch_loss.avg

def valid_fn(model, data_loader, criterion, scheduler=None, threshold=0.63):
    """Evaluate the model on ``data_loader`` without gradient tracking.

    Args:
        model: network returning logits.
        data_loader: yields ``(inputs, labels)`` batches.
        criterion: loss applied to the raw logits.
        scheduler: optional; ``scheduler.step(loss)`` is called once per
            validation batch with that batch's loss.
        threshold: probability cut-off used to binarise the sigmoid outputs
            for the F1 score. Defaults to 0.63, the value that used to be
            hard-coded here.

    Returns:
        tuple: ``(mean_loss, macro_f1, y_pred)`` where ``y_pred`` holds the
        sigmoid probabilities of all batches concatenated along axis 0.
    """
    model.eval()
    loss_meter = AverageMeter()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for step, (inputs, labels) in enumerate(data_loader):
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            output = model(**inputs)
            loss = criterion(output, labels)
            loss_meter.update(loss.item(), output.size(0))
            y_true.append(labels.detach().cpu().numpy())
            # Convert logits to probabilities on the CPU.
            output = torch.sigmoid(output.detach().cpu())
            y_pred.append(output.numpy())
            if scheduler is not None:
                scheduler.step(loss)
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    # All targets are flattened and scored together as one binary problem.
    score = f1_score(y_true.flatten(), (y_pred>threshold).flatten().astype(int), average="macro")
    return loss_meter.avg, score, y_pred

# Train

In [35]:
def scaling(input_df, grp, i_fold, is_train=True):
    """Standardise the ``elapsed_time_diff`` sequences of a pandas frame.

    The list column is exploded to one value per row, scaled with a
    ``StandardScaler`` and regrouped into lists by the original row index.
    ``input_df`` is modified in place and also returned.

    Args:
        input_df: pandas DataFrame with a list-valued ``elapsed_time_diff``
            column (assumes the default unnamed index, so that
            ``reset_index`` produces an ``"index"`` column — TODO confirm).
        grp: level-group tag used in the scaler file name.
        i_fold: fold number used in the scaler file name.
        is_train: fit a new scaler and pickle it (True) or load the stored
            scaler and only transform (False).

    Returns:
        The same ``input_df`` with the scaled column.
    """
    scaler_path = f"{cfg.FEAT_PATH}/scaler_grp{grp}_fold{i_fold}.pkl"

    if is_train:
        scaler = StandardScaler()
    else:
        # NOTE: pickle executes arbitrary code on load; only read files this
        # notebook wrote itself.
        with open(scaler_path, "rb") as f:
            scaler = pickle.load(f)

    _input_df = input_df["elapsed_time_diff"].explode().reset_index()
    values = _input_df["elapsed_time_diff"].to_numpy().reshape(-1, 1)
    if is_train:
        _input_df["elapsed_time_diff"] = scaler.fit_transform(values)
    else:
        _input_df["elapsed_time_diff"] = scaler.transform(values)
    input_df["elapsed_time_diff"] = _input_df.groupby("index").agg(list)["elapsed_time_diff"]

    if is_train:
        # Context manager guarantees the file handle is closed.
        with open(scaler_path, "wb") as f:
            pickle.dump(scaler, f)

    return input_df

In [36]:
# Training driver: for every level group, build features once, then train
# and validate one model per selected fold with early stopping on macro F1.
seed_everything(cfg.seed)

# NOTE(review): n_splits is hard-coded to 5 rather than read from
# cfg.n_folds — confirm they are meant to stay in sync.
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=cfg.seed)
# NOTE(review): `models` and `cv_score` are never appended to in this cell.
models = []
cv_score = []

# Out-of-fold predictions: one row per session, one column per question (18).
ALL_USERS = train_df.get_column("session_id").unique().to_numpy()
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS), 18)), index=ALL_USERS)

# level_group -> (first question, one past the last question)
limits = {"0-4":(1, 4), "5-12":(4, 14), "13-22":(14, 19)}

s = time.time()

for grp in limits.keys():
    LOGGER.info(f"{'='*30} level_group: {grp} {'='*30}")

    a, b = limits[grp]
    q_list = list(range(a, b))
    _train_df = train_df.filter(pl.col("level_group")==grp)

    # Get the vocabulary size of each categorical column; +2 matches the
    # code offset used by get_factorize_map (codes start at 2).
    n_cat_cols = [int(_train_df.get_column(col).n_unique() + 2) for col in cat_cols]
    pickle.dump(n_cat_cols, open(f"{cfg.FEAT_PATH}/n_cat_cols_grp{grp}.pkl", "wb"))

    # The category encoding is fit on all sessions of the group, before the fold split.
    train_feat_df = feature_engineering(_train_df, grp=grp, is_train=True)

    # Folds are split on the label rows, stratified by "correct" and grouped
    # by session so a session never appears in both train and validation.
    for i_fold, (train_idx, valid_idx) in enumerate(sgkf.split(train_labels_df, y=train_labels_df["correct"], groups=train_labels_df["session"])):
        if i_fold in cfg.train_folds:
            LOGGER.info(f"{'='*50} Fold{i_fold} {'='*50}")

            train_users = train_labels_df.loc[train_idx, "session"].unique().tolist()
            valid_users = train_labels_df.loc[valid_idx, "session"].unique().tolist()

            X_train = train_feat_df.filter(pl.col("session_id").is_in(train_users))
            X_valid = train_feat_df.filter(pl.col("session_id").is_in(valid_users))
            # Targets are reshaped to (n_sessions, n_questions); this relies on
            # train_labels_df being sorted by session and q, and on every
            # session having a label for each question in q_list.
            y_train = train_labels_df[(train_labels_df["q"].isin(q_list)) & (train_labels_df["session"].isin(train_users))]["correct"].to_numpy().reshape(-1, len(q_list))
            y_valid = train_labels_df[(train_labels_df["q"].isin(q_list)) & (train_labels_df["session"].isin(valid_users))]["correct"].to_numpy().reshape(-1, len(q_list))

            X_train = X_train.to_pandas()
            X_valid = X_valid.to_pandas()

            # The scaler is fit on the training fold only and reused for validation.
            X_train = scaling(X_train, grp=grp, i_fold=i_fold, is_train=True)
            X_valid = scaling(X_valid, grp=grp, i_fold=i_fold, is_train=False)

            if i_fold==0:
                LOGGER.info(f"num features: {len(feat_cols)}")
                display(X_train)

            X_train = X_train[feat_cols]
            X_valid = X_valid[feat_cols]

            train_dataset = PSPDataset(X_train[cat_cols], X_train[num_cols], y_train, max_length=cfg.max_length[grp])
            valid_dataset = PSPDataset(X_valid[cat_cols], X_valid[num_cols], y_valid, max_length=cfg.max_length[grp])

            # Bind the group's max_length so the DataLoader can call the collate function with the batch only.
            custom_collate_fn = partial(collate_fn, max_length=cfg.max_length[grp])
            train_loader = DataLoader(
                train_dataset, 
                batch_size=cfg.batch_size, 
                shuffle=True, 
                drop_last=True,
                pin_memory=True,
                collate_fn=custom_collate_fn, 
                num_workers=cfg.num_workers
                )
            valid_loader = DataLoader(
                valid_dataset, 
                batch_size=cfg.batch_size, 
                shuffle=False, 
                drop_last=False,
                pin_memory=True,
                collate_fn=custom_collate_fn, 
                num_workers=cfg.num_workers
                )
            
            model = PSPTransformerModel(
                cfg=cfg, 
                n_cat_feat=n_cat_cols,
                n_num_feat=len(num_cols),
                num_targets=len(q_list),
                **cfg.params,
                )
            model = model.to(cfg.device)

            criterion = nn.BCEWithLogitsLoss()
            optimizer = optim.AdamW(model.parameters(), lr=cfg.lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)
            # No learning-rate scheduler is active; the alternatives below are kept for reference.
            scheduler = None
            # scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=len(train_loader)*5, T_mult=1, eta_min=cfg.min_lr, last_epoch=-1)
            # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=len(valid_loader)*10, verbose=False)

            best_score = 0
            best_loss = np.inf
            best_epoch = 0
            best_pred = None

            es_count = 0
            pbar = tqdm(range(cfg.epochs))
            for epoch in pbar:
                train_loss = train_fn(model, train_loader, criterion, optimizer, scheduler)
                valid_loss, valid_score, valid_pred = valid_fn(model, valid_loader, criterion, scheduler)

                # Model selection uses the validation F1 score, not the loss.
                if valid_score > best_score:
                    best_score = valid_score
                    best_loss = valid_loss
                    best_epoch = epoch
                    best_pred = valid_pred
                    torch.save(model.state_dict(), f"{cfg.EXP_MODEL}/fold{i_fold}_grp{grp}.pth")
                    es_count = 0
                else:
                    es_count += 1
                # NOTE(review): the break happens before the per-epoch log
                # line, so the metrics of the stopping epoch are not logged.
                if es_count >= cfg.es_patience:
                    LOGGER.info(f"early stopping at epoch {epoch}")
                    break
                    
                LOGGER.info(f"epoch:{epoch}/{cfg.epochs}| train_loss:{train_loss: .5f} valid_loss:{valid_loss: .5f} valid_score:{valid_score: .5f}")
                pbar.set_postfix(
                    train_loss=train_loss,
                    valid_loss=valid_loss,
                    es_count=es_count,
                    lr=optimizer.param_groups[0]["lr"],
                )
            
            # Question numbers are 1-based while the oof columns are 0-based.
            # Sessions outside the trained folds keep their initial zeros.
            oof.loc[valid_users, np.array(q_list)-1] = best_pred
            LOGGER.info(f"best score:{best_score: .5f} at epoch{best_epoch}")
            LOGGER.info(f"best loss:{best_loss: .5f} at epoch{best_epoch}")

            # del model, train_dataset, valid_dataset, train_loader, valid_loader

LOGGER.info(f"elapsed time: {time.time()-s:.2f}sec")

num features: 4


Unnamed: 0,session_id,event_comb,room_fqid,text_fqid,elapsed_time_diff
0,20090312431273200,"[3, 20, 20, 20, 20, 20, 20, 20, 20, 20, 8, 20,...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[7, 6, 6, 6, 6, 6, 11, 11, 11, 11, 2, 12, 2, 9...","[-0.13376725127121394, -0.046253932169729874, ..."
1,20090312433251036,"[3, 20, 20, 20, 20, 20, 3, 3, 20, 20, 20, 20, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[7, 6, 6, 6, 6, 6, 7, 7, 11, 11, 11, 11, 7, 2,...","[-0.13376725127121394, -0.11934706717134734, -..."
2,20090312455206810,"[3, 20, 20, 20, 20, 20, 3, 3, 3, 3, 3, 3, 20, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[7, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 11, 11, 1...","[-0.13376725127121394, -0.11531206152872411, -..."
3,20090313091715820,"[3, 3, 20, 20, 20, 20, 20, 3, 20, 20, 20, 20, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[7, 7, 6, 6, 6, 6, 6, 7, 11, 11, 11, 11, 2, 2,...","[-0.13376725127121394, 0.18268302732402403, -0..."
4,20090313571836404,"[3, 20, 20, 20, 20, 20, 20, 20, 20, 20, 8, 20,...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[7, 6, 6, 6, 6, 6, 11, 11, 11, 11, 2, 12, 2, 2...","[-0.13376725127121394, -0.061004854437024615, ..."
...,...,...,...,...,...
18845,22100215241104530,"[3, 3, 3, 20, 20, 20, 20, 20, 20, 20, 20, 20, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[7, 7, 7, 6, 6, 6, 6, 6, 11, 11, 11, 11, 2, 9,...","[-0.13376725127121394, -0.11729649053329291, 1..."
18846,22100215342220508,"[3, 20, 20, 20, 20, 20, 20, 20, 20, 20, 8, 8, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[7, 6, 6, 6, 6, 6, 11, 11, 11, 11, 2, 2, 2, 2,...","[-0.13376725127121394, 0.3769586268713096, -0...."
18847,22100215460321130,"[3, 20, 20, 20, 20, 20, 20, 20, 20, 20, 8, 20,...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[7, 6, 6, 6, 6, 6, 11, 11, 11, 11, 2, 12, 2, 2...","[-0.13376725127121394, 0.017776977044356747, -..."
18848,22100219442786200,"[3, 20, 20, 20, 20, 20, 20, 20, 20, 20, 8, 8, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[7, 6, 6, 6, 6, 6, 11, 11, 11, 11, 2, 2, 2, 2,...","[-0.13376725127121394, -0.009409700318235814, ..."


  0%|          | 0/200 [00:00<?, ?it/s]

epoch:0/200| train_loss: 0.31800 valid_loss: 0.30020 valid_score: 0.64585
epoch:1/200| train_loss: 0.29926 valid_loss: 0.29320 valid_score: 0.60909
epoch:2/200| train_loss: 0.29060 valid_loss: 0.28242 valid_score: 0.67306
epoch:3/200| train_loss: 0.28224 valid_loss: 0.28009 valid_score: 0.68568
epoch:4/200| train_loss: 0.27869 valid_loss: 0.27747 valid_score: 0.67342
epoch:5/200| train_loss: 0.27666 valid_loss: 0.27536 valid_score: 0.67249
epoch:6/200| train_loss: 0.27447 valid_loss: 0.27470 valid_score: 0.68667
epoch:7/200| train_loss: 0.27326 valid_loss: 0.27280 valid_score: 0.69365
epoch:8/200| train_loss: 0.27318 valid_loss: 0.27315 valid_score: 0.68192
epoch:9/200| train_loss: 0.27240 valid_loss: 0.27736 valid_score: 0.69783
epoch:10/200| train_loss: 0.27185 valid_loss: 0.27201 valid_score: 0.69795
epoch:11/200| train_loss: 0.27160 valid_loss: 0.27130 valid_score: 0.69008
epoch:12/200| train_loss: 0.27091 valid_loss: 0.27104 valid_score: 0.68781
epoch:13/200| train_loss: 0.26981 v

Unnamed: 0,session_id,event_comb,room_fqid,text_fqid,elapsed_time_diff
0,20090312431273200,"[8, 20, 20, 8, 8, 8, 8, 7, 6, 8, 8, 8, 8, 3, 3...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 5, 6, 6, ...","[2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 23, 23...","[-0.13841199263991408, -0.06748998182652594, -..."
1,20090312433251036,"[8, 8, 8, 7, 6, 8, 8, 19, 8, 8, 8, 8, 8, 8, 8,...","[2, 2, 2, 2, 2, 8, 10, 10, 10, 10, 10, 10, 10,...","[2, 2, 2, 2, 2, 2, 2, 36, 2, 2, 2, 2, 2, 2, 2,...","[-0.13841199263991408, -0.014277589732146385, ..."
2,20090312455206810,"[8, 8, 8, 7, 7, 6, 8, 8, 3, 3, 3, 3, 3, 8, 8, ...","[2, 2, 2, 2, 2, 2, 8, 5, 6, 6, 6, 6, 6, 6, 6, ...","[2, 2, 2, 2, 2, 2, 2, 2, 23, 23, 23, 23, 23, 2...","[-0.13841199263991408, -0.11736093642675292, -..."
3,20090313091715820,"[8, 8, 8, 8, 7, 7, 5, 8, 8, 8, 7, 6, 8, 8, 8, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 5, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-0.13841199263991408, -0.030149417829371056, ..."
4,20090313571836404,"[8, 8, 8, 7, 7, 6, 8, 8, 3, 3, 3, 3, 8, 20, 8,...","[2, 2, 2, 2, 2, 2, 8, 5, 6, 6, 6, 6, 6, 6, 6, ...","[2, 2, 2, 2, 2, 2, 2, 2, 23, 23, 23, 23, 2, 17...","[-0.13841199263991408, -0.051952297478716526, ..."
...,...,...,...,...,...
18845,22100215241104530,"[8, 8, 8, 20, 20, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-0.13841199263991408, 0.08287470541033937, -0..."
18846,22100215342220508,"[8, 20, 20, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 6, 8...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, ...","[2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-0.13841199263991408, 0.04411403047816964, 2...."
18847,22100215460321130,"[8, 8, 8, 8, 8, 7, 7, 7, 7, 6, 8, 8, 8, 8, 8, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-0.13841199263991408, -0.04668953342542624, -..."
18848,22100219442786200,"[8, 8, 8, 8, 8, 7, 7, 7, 7, 6, 8, 6, 8, 8, 8, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-0.13841199263991408, -0.11619143330379954, -..."


  0%|          | 0/200 [00:00<?, ?it/s]

epoch:0/200| train_loss: 0.58065 valid_loss: 0.56676 valid_score: 0.65584
epoch:1/200| train_loss: 0.56324 valid_loss: 0.56396 valid_score: 0.66894
epoch:2/200| train_loss: 0.55902 valid_loss: 0.55645 valid_score: 0.67273
epoch:3/200| train_loss: 0.55472 valid_loss: 0.55271 valid_score: 0.67274
epoch:4/200| train_loss: 0.55187 valid_loss: 0.55748 valid_score: 0.67791
epoch:5/200| train_loss: 0.54991 valid_loss: 0.54845 valid_score: 0.68213
epoch:6/200| train_loss: 0.54796 valid_loss: 0.54803 valid_score: 0.67319
epoch:7/200| train_loss: 0.54692 valid_loss: 0.54739 valid_score: 0.68452
epoch:8/200| train_loss: 0.54596 valid_loss: 0.54498 valid_score: 0.68461
epoch:9/200| train_loss: 0.54424 valid_loss: 0.54505 valid_score: 0.68363
epoch:10/200| train_loss: 0.54281 valid_loss: 0.54264 valid_score: 0.68232
epoch:11/200| train_loss: 0.54286 valid_loss: 0.54305 valid_score: 0.68548
epoch:12/200| train_loss: 0.54172 valid_loss: 0.54486 valid_score: 0.68535
epoch:13/200| train_loss: 0.54190 v

Unnamed: 0,session_id,event_comb,room_fqid,text_fqid,elapsed_time_diff
0,20090312431273200,"[8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 8, 3, 3, 3, 3, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 11, 6, 6, 6, 6,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 12, 12, 12, ...","[-0.12966938473403214, -0.08860172388352183, -..."
1,20090312433251036,"[8, 8, 8, 8, 7, 6, 8, 3, 3, 3, 3, 3, 3, 3, 3, ...","[2, 2, 2, 2, 2, 2, 11, 6, 6, 6, 6, 6, 6, 6, 6,...","[2, 2, 2, 2, 2, 2, 2, 12, 12, 12, 12, 12, 12, ...","[-0.12966938473403214, -0.11037639006546882, -..."
2,20090312455206810,"[8, 8, 8, 7, 7, 7, 5, 8, 6, 8, 3, 3, 3, 3, 3, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 11, 6, 6, 6, 6, 6,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 12, 12, 12, 12,...","[-0.12966938473403214, -0.10733434111357916, -..."
3,20090313091715820,"[8, 20, 8, 8, 8, 7, 6, 8, 8, 8, 8, 8, 8, 8, 8,...","[2, 2, 2, 2, 2, 2, 2, 11, 11, 11, 11, 11, 11, ...","[2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-0.12966938473403214, 0.07150611568961777, -0..."
4,20090313571836404,"[8, 8, 6, 8, 3, 3, 3, 3, 3, 3, 3, 3, 8, 20, 20...","[2, 2, 2, 11, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,...","[2, 2, 2, 2, 12, 12, 12, 12, 12, 12, 12, 12, 2...","[-0.12966938473403214, 0.006342224983349961, 0..."
...,...,...,...,...,...
18845,22100215241104530,"[8, 20, 8, 8, 7, 7, 7, 7, 6, 8, 3, 3, 3, 3, 3,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 11, 6, 6, 6, 6, 6,...","[2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 12, 12, 12, 12,...","[-0.12966938473403214, -0.003024083631678706, ..."
18846,22100215342220508,"[8, 20, 8, 8, 20, 8, 8, 7, 7, 6, 8, 3, 3, 3, 3...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 11, 6, 6, 6, 6,...","[2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 12, 12, 12, ...","[-0.12966938473403214, 0.11585598725137745, -0..."
18847,22100215460321130,"[8, 8, 8, 7, 7, 7, 6, 8, 7, 7, 7, 7, 6, 8, 7, ...","[2, 2, 2, 2, 2, 2, 2, 14, 14, 14, 14, 14, 14, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-0.12966938473403214, -0.05097638158383402, 0..."
18848,22100219442786200,"[8, 8, 7, 6, 8, 3, 3, 3, 3, 3, 3, 3, 3, 8, 3, ...","[2, 2, 2, 2, 11, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,...","[2, 2, 2, 2, 2, 12, 12, 12, 12, 12, 12, 12, 12...","[-0.12966938473403214, 0.20791799500593272, 0...."


  0%|          | 0/200 [00:00<?, ?it/s]

epoch:0/200| train_loss: 0.53938 valid_loss: 0.52552 valid_score: 0.63737
epoch:1/200| train_loss: 0.52584 valid_loss: 0.52352 valid_score: 0.64120
epoch:2/200| train_loss: 0.52315 valid_loss: 0.51976 valid_score: 0.64213
epoch:3/200| train_loss: 0.52157 valid_loss: 0.51840 valid_score: 0.64061
epoch:4/200| train_loss: 0.51803 valid_loss: 0.51604 valid_score: 0.64187
epoch:5/200| train_loss: 0.51511 valid_loss: 0.51288 valid_score: 0.64469
epoch:6/200| train_loss: 0.51319 valid_loss: 0.51301 valid_score: 0.64858
epoch:7/200| train_loss: 0.51189 valid_loss: 0.51284 valid_score: 0.63868
epoch:8/200| train_loss: 0.51213 valid_loss: 0.51003 valid_score: 0.65119
epoch:9/200| train_loss: 0.51053 valid_loss: 0.51000 valid_score: 0.64709
epoch:10/200| train_loss: 0.50917 valid_loss: 0.51431 valid_score: 0.63746
epoch:11/200| train_loss: 0.50882 valid_loss: 0.51006 valid_score: 0.64869
epoch:12/200| train_loss: 0.50865 valid_loss: 0.51217 valid_score: 0.64099
epoch:13/200| train_loss: 0.50764 v

In [37]:
# Assemble the ground-truth frame that mirrors `oof`, then search a single global
# probability threshold that maximises macro F1 over all (session, question) cells.
if len(cfg.train_folds) == cfg.n_folds:
    # All folds were trained: use `oof` as-is.
    oof_labels_df = oof.copy()
    # Define `_train_labels_df` on this path as well; previously it was only assigned
    # in the `else` branch, so a fresh kernel hit a NameError (or silently reused a
    # stale frame) in the loop below. `oof` is indexed by session (see the `else` branch).
    _train_labels_df = train_labels_df[train_labels_df["session"].isin(oof.index)].copy()
else:
    # Only some folds were trained: restrict predictions and labels to the validated sessions.
    oof = oof[oof.index.isin(valid_users)].copy()
    oof_labels_df = oof.copy()
    _train_labels_df = train_labels_df[train_labels_df["session"].isin(valid_users)].copy()

for q in range(18):
    # GET TRUE LABELS
    # Column `q` of `oof` pairs with question number `q + 1` in the label table.
    # NOTE(review): the assignment is positional, so it assumes the label rows of each
    # question are ordered exactly like the rows of `oof` -- TODO confirm upstream.
    tmp = _train_labels_df[_train_labels_df["q"] == q+1]
    oof_labels_df[q] = tmp["correct"].to_numpy()

# FIND BEST THRESHOLD TO CONVERT PROBS INTO 1s AND 0s
# The flattened views do not depend on the threshold, so build them once.
flat_probs = oof.to_numpy().reshape((-1))
flat_labels = oof_labels_df.to_numpy().reshape((-1))

scores = []; thresholds = []
best_score = 0; best_threshold = 0

for threshold in np.arange(0.4, 0.81, 0.01):
    preds = (flat_probs>threshold).astype('int')
    m = f1_score(flat_labels, preds, average="macro")
    scores.append(m)
    thresholds.append(threshold)
    # Strict '>' keeps the lowest threshold among ties.
    if m>best_score:
        best_score = m
        best_threshold = threshold

LOGGER.info('When using optimal threshold...')
for k in range(18):
    # COMPUTE F1 SCORE PER QUESTION
    # Every question is scored with the same global threshold found above.
    m = f1_score(oof_labels_df[k].to_numpy(), (oof[k].to_numpy()>best_threshold).astype('int'), average="macro")
    LOGGER.info(f'Q{k}: F1 = {m: .05f}')
    
# COMPUTE F1 SCORE OVERALL
m = f1_score(flat_labels, (flat_probs>best_threshold).astype('int'), average='macro')
LOGGER.info(f"Overall F1: {m: .05f}, Best Threshold: {best_threshold: .05f}")

When using optimal threshold...
Q0: F1 =  0.66351
Q1: F1 =  0.53872
Q2: F1 =  0.53502
Q3: F1 =  0.67032
Q4: F1 =  0.65389
Q5: F1 =  0.63292
Q6: F1 =  0.62395
Q7: F1 =  0.57219
Q8: F1 =  0.62218
Q9: F1 =  0.60770
Q10: F1 =  0.61912
Q11: F1 =  0.50262
Q12: F1 =  0.47192
Q13: F1 =  0.62860
Q14: F1 =  0.59873
Q15: F1 =  0.46803
Q16: F1 =  0.56419
Q17: F1 =  0.49202
Overall F1:  0.69332, Best Threshold:  0.63000


In [38]:
# Persist the out-of-fold predictions and the matching ground-truth labels.
# `index=False` means the row index is not written to either file.
oof_csv_path = f"{cfg.EXP_PREDS}/oof.csv"
oof_labels_csv_path = f"{cfg.EXP_PREDS}/oof_labels.csv"

oof.to_csv(oof_csv_path, index=False)
oof_labels_df.to_csv(oof_labels_csv_path, index=False)

# 閾値最適化

In [39]:
from scipy.optimize import minimize

def f1_score_macro_for_thresholds(y_true, y_pred_prob, thresholds):
    """Return the macro F1 obtained after binarising probabilities with `thresholds`.

    Args:
        y_true: Array of ground-truth 0/1 labels, same shape as `y_pred_prob`.
        y_pred_prob: Array of predicted probabilities.
        thresholds: Scalar or array broadcastable against `y_pred_prob`
            (e.g. one cut-off per column); an entry is predicted positive
            when its probability is greater than or equal to its threshold.

    Returns:
        Macro-averaged F1 computed over all entries flattened into one vector.
    """
    is_positive = y_pred_prob >= thresholds
    hard_labels = is_positive.astype(int).reshape(-1)
    return f1_score(y_true.reshape(-1), hard_labels, average="macro")

def optimize_thresholds(y_true, y_pred_prob, method="Powell", init_threshold=0.6):
    """Search per-column thresholds that maximise the flattened macro F1.

    The score is a step function of the thresholds (it only changes when a
    threshold crosses a predicted probability), so a derivative-free method
    such as the default "Powell" is the sensible choice.

    Args:
        y_true: 2-D array of ground-truth 0/1 labels, one column per label.
        y_pred_prob: 2-D array of predicted probabilities, same shape as `y_true`.
        method: Name of the `scipy.optimize.minimize` solver to use. It must
            accept `bounds`.
        init_threshold: Starting value given to every column's threshold.
            Should lie within the [0, 1] search bounds. Defaults to 0.6, the
            value that used to be hard-coded.

    Returns:
        1-D array with one optimised threshold per column of `y_pred_prob`.
    """
    n_labels = y_pred_prob.shape[1]
    init_thresholds = np.full(n_labels, init_threshold)

    def objective(thresholds):
        # `minimize` minimises, so negate the score to maximise it.
        return -f1_score_macro_for_thresholds(y_true, y_pred_prob, thresholds)

    result = minimize(objective, init_thresholds, bounds=[(0, 1)] * n_labels, method=method)

    return result.x

In [40]:
# Optimise one threshold per question column on the out-of-fold predictions.
y_true = oof_labels_df.to_numpy()
y_pred = oof.to_numpy()

all_thresholds = optimize_thresholds(y_true, y_pred, "Powell")

# Use a context manager so the file is flushed and closed deterministically
# (the bare `open(...)` passed to `pickle.dump` was never closed).
with open(f"{cfg.FEAT_PATH}/all_thresholds.pkl", "wb") as thresholds_file:
    pickle.dump(all_thresholds, thresholds_file)

# NOTE: the thresholds are scored on the same predictions they were fitted on,
# so this figure is an optimistic estimate.
best_score = f1_score_macro_for_thresholds(y_true, y_pred, all_thresholds)
LOGGER.info(f"Optimized thresholds cv: {best_score:.05f}")

Optimized thresholds cv: 0.69547
