In [1]:
# Shell escape: print the GPU / driver status of the machine running the kernel.
!nvidia-smi

Tue Jun 20 07:50:51 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  Off |
|  0%   36C    P8    21W / 480W |    737MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
import os
import re
import gc
import pdb
import sys
import json
import math
import time
import wandb
import pickle
import shutil
import joblib
import random
import pathlib
import requests
import warnings
from glob import glob
from typing import List
from pathlib import Path
from tqdm.auto import tqdm
from pandarallel import pandarallel
import multiprocessing

import scipy
import itertools
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    StratifiedKFold,
    KFold,
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error, f1_score, fbeta_score, recall_score, precision_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

from tslearn.clustering import TimeSeriesKMeans

import numba



sys.path.append("/home/working/")
from kagglib.utils.utils import  Timer, reduce_mem_usage, get_logger, decorate, setup, dataset_create_new
from kagglib.utils.exp_manage import set_wandb
from kagglib.tabular.blocks import AbstractBaseBlock, IdentityBlock, LabelEncodingBlock, SVDBlock, run_blocks
from kagglib.tabular.model_selection import train_cv, predict_cv
from kagglib.tabular.model import XGBoost, LightGBM
from kagglib.tabular.model import get_model, save_model, load_model

from src.utils import f1_score_macro_for_thresholds, optimize_thresholds


# IPython magics: enable the autoreload extension (mode 2) and set an
# environment variable for tokenizer parallelism.
%load_ext autoreload
%autoreload 2
%env TOKENIZERS_PARALLELISM=true

# NOTE: this silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Display / plotting defaults used by the cells below.
pd.set_option('display.max_columns', 300)
pandarallel.initialize(progress_bar=True)
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('ggplot')
sns.set(font_scale = 2)

2023-06-27 04:20:30.696390: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-27 04:20:30.766285: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-27 04:20:31.250496: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/u



env: TOKENIZERS_PARALLELISM=true
INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Setup & data load

In [7]:
class Config:
    """Static settings for this experiment.

    The class body is executed once at definition time, which also creates
    the feature and log directories on disk. The class is then handed to
    ``setup`` (a project helper) and the result is bound to ``cfg``.
    """

    AUTHOR = "shu421"

    EXP = "exp108"
    COMPETITION = "predict-student-performance-from-game-play"
    DATASET_PATH = []
    BASE_PATH = "/home/working/"
    api_path = "/root/.kaggle/kaggle.json"

    seed = 42
    n_folds = 5

    # weight and bias
    wandb = False

    gbdt_model = "XGBoost"
    stopping_rounds = 50
    log_evaluation = 1000
    model_params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "gpu_hist",
        "booster": "gbtree",
        "seed": seed,
        "learning_rate": 0.01,
        "max_depth": 4,
        "min_child_weight": 4,
        "gamma": 0.3780614323132923,
        "subsample": 0.6328765591576581,
        "colsample_bytree": 0.6548384506036804,
        "reg_alpha": 3.0449249728703658e-05,
        "reg_lambda": 2.7993749510577772,
        }
    train_params = {
        "num_boost_round": 99999,
        "verbose_eval": log_evaluation,
    }

    # Per-experiment feature directory; exist_ok makes re-running the cell safe
    # without a separate (racy) is_dir() check.
    FEAT_PATH = Path(BASE_PATH) / f"output/{EXP}/feat"
    FEAT_PATH.mkdir(parents=True, exist_ok=True)

    # Log directory shared by all experiments.
    LOG_PATH = Path(BASE_PATH) / "output/log"
    LOG_PATH.mkdir(parents=True, exist_ok=True)
cfg = setup(Config)

In [8]:
# Create the experiment logger; the log file is named after the experiment id.
LOGGER = get_logger(
    Path(cfg.LOG_PATH) / f"{cfg.EXP}.log"
)

# Optionally start a Weights & Biases run.
# NOTE(review): Config does not define MODEL_NAME (only gbdt_model); unless
# setup() adds it, this line fails when cfg.wandb is switched on — confirm.
if cfg.wandb:
    run = set_wandb(
        cfg,
        name=cfg.EXP,
        group=cfg.MODEL_NAME,
        config_path="/root/.kaggle/wandb.json",
    )

In [9]:
def get_whole_df():
    """Read the four competition CSV files from ``cfg.INPUT``.

    Returns:
        tuple: ``(train_df, train_labels_df, test_df, sub_df)``. ``train_df``
        is read with pandas; the other three frames are read with polars.
    """
    input_dir = Path(cfg.INPUT)
    return (
        pd.read_csv(input_dir / "train.csv"),
        pl.read_csv(input_dir / 'train_labels.csv'),
        pl.read_csv(input_dir / 'test.csv'),
        pl.read_csv(input_dir / 'sample_submission.csv'),
    )

def preprocess_df(train_df, train_labels_df, test_df):
    """Add key columns to the labels and put the training events in order.

    Args:
        train_df: pandas frame of events; sorted by ``session_id`` then ``index``.
        train_labels_df: polars frame with a ``session_id`` column whose values
            are split on ``"_"``: the first piece becomes the integer
            ``session`` column, and the last piece (minus its first character)
            becomes the integer ``q`` column.
        test_df: returned unchanged.

    Returns:
        tuple: ``(train_df, train_labels_df, test_df)`` after the steps above.
    """

    def _session_of(label_id):
        # Leading piece of the label id.
        return int(label_id.split("_")[0])

    def _question_of(label_id):
        # Trailing piece of the label id without its one-character prefix.
        return int(label_id.split("_")[-1][1:])

    session_expr = pl.col("session_id").apply(_session_of).alias("session")
    question_expr = pl.col("session_id").apply(_question_of).alias("q")
    labels_with_session = train_labels_df.with_columns(session_expr)
    labels_with_keys = labels_with_session.with_columns(question_expr)

    # Order events within each session by their running event index.
    events_sorted = train_df.sort_values(["session_id", "index"])

    return events_sorted, labels_with_keys, test_df

def get_processed_df():
    """Load the raw frames and run them through ``preprocess_df``.

    Returns:
        tuple: ``(train_df, train_labels_df, test_df, sub_df)``; the submission
        frame is passed through exactly as loaded.
    """
    raw_train, raw_labels, raw_test, sub_df = get_whole_df()
    processed = preprocess_df(raw_train, raw_labels, raw_test)
    return (*processed, sub_df)

# Setup & Preprocessing

In [10]:
# Load and preprocess all competition frames once; later cells read these globals.
train_df, train_labels_df, test_df, sub_df = get_processed_df()

In [11]:
# Display the training event log as the cell output.
train_df

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991405,-159.314686,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26296941,22100221145014656,1600,5483231,navigate_click,undefined,22,,343.887291,36.701026,483.0,273.0,,,,tunic.capitol_2.hall,,0,0,1,13-22
26296942,22100221145014656,1601,5485166,navigate_click,undefined,22,,332.696070,141.493178,545.0,221.0,,,chap4_finale_c,tunic.capitol_2.hall,,0,0,1,13-22
26296943,22100221145014656,1602,5485917,navigate_click,undefined,22,,369.912859,140.569205,611.0,217.0,,,,tunic.capitol_2.hall,,0,0,1,13-22
26296944,22100221145014656,1603,5486753,navigate_click,undefined,22,,252.299653,123.805889,526.0,232.0,,,chap4_finale_c,tunic.capitol_2.hall,,0,0,1,13-22


In [8]:
# Categorical event columns.
cat_cols = ["event_name", "name", "fqid", "room_fqid"]

# Columns named after row-to-row differences / movement.
diff_cols = [
    "elapsed_time_diff",
    "room_coor_x_diff",
    "room_coor_y_diff",
    "screen_coor_x_diff",
    "screen_coor_y_diff",
    "room_coor_move",
]

# Raw numeric columns followed by the difference columns above.
numeric_cols = [
    "elapsed_time",
    "level",
    "page",
    "room_coor_x",
    "room_coor_y",
    "screen_coor_x",
    "screen_coor_y",
    "hover_duration",
    *diff_cols,
]


def get_cols(input_df, grp, feat_path=None):
    """Collect the distinct categorical values of one level group and persist them.

    For each of ``event_name``, ``name``, ``text``, ``fqid``, ``room_fqid`` and
    ``text_fqid`` the unique values of ``input_df`` are listed (a ``None``
    entry is dropped if present) and pickled to
    ``{feat_path}/{column}_cols_{grp}.pkl``.

    Args:
        input_df: pandas frame holding the six columns above.
        grp: level group name, one of ``"0-4"``, ``"5-12"``, ``"13-22"``.
        feat_path: directory for the pickle files; defaults to
            ``cfg.FEAT_PATH`` when omitted.

    Returns:
        tuple: ``(event_name_cols, name_cols, text_cols, fqid_cols,
        room_fqid_cols, text_fqid_cols, level_cols)`` where the first six are
        lists of unique values and ``level_cols`` is the list of level numbers
        associated with ``grp``.

    Raises:
        ValueError: if ``grp`` is not a known level group. This is checked
            before anything is written to disk.
    """
    # NOTE(review): level 0 is not listed for group "0-4"; kept as in the
    # original because changing it would alter the feature layout — confirm.
    level_cols_by_group = {
        "0-4": [1, 2, 3, 4],
        "5-12": [5, 6, 7, 8, 9, 10, 11, 12],
        "13-22": [13, 14, 15, 16, 17, 18, 19, 20, 21, 22],
    }
    if grp not in level_cols_by_group:
        raise ValueError(
            f"unknown level group {grp!r}; expected one of {list(level_cols_by_group)}"
        )

    if feat_path is None:
        feat_path = cfg.FEAT_PATH

    # Order matches the order of the returned tuple.
    column_names = ["event_name", "name", "text", "fqid", "room_fqid", "text_fqid"]
    unique_values = []
    for column in column_names:
        values = input_df[column].unique().tolist()
        try:
            values.remove(None)
        except ValueError:
            # No None entry in this column: nothing to drop.
            pass

        # Context manager so the file handle is flushed and closed right away.
        with open(f"{feat_path}/{column}_cols_{grp}.pkl", "wb") as fh:
            pickle.dump(values, fh)
        unique_values.append(values)

    # Copy so callers cannot mutate the lookup table's list.
    return (*unique_values, list(level_cols_by_group[grp]))

In [9]:
@numba.jit('Tuple((f8[:], f8[:]))(i8[:], i8[:], i8[:], i8[:], i8[:], i8[:], i8[:], i8[:], i8[:], f8[:], f8[:], f8[:], f8[:], f8[:], f8[:], f8[:], f8[:], f8[:], f8[:], f8[:], f8[:], i8[:], i8[:], i8[:], i8[:], i8[:], i8[:], i8[:], i8[:])', nopython=True)
def feature_engineering(
    event_name,
    name,
    fqid,
    room_fqid,
    text,
    text_fqid,
    elapsed_time,
    level,
    level_group,
    page,
    room_coor_x,
    room_coor_y,
    screen_coor_x,
    screen_coor_y,
    hover_duration,
    elapsed_time_diff,
    room_coor_x_diff,
    room_coor_y_diff,
    screen_coor_x_diff,
    screen_coor_y_diff,
    room_coor_move,
    event_name_cols,
    name_cols,
    text_cols,
    fqid_cols,
    room_fqid_cols,
    text_fqid_cols,
    level_cols,
    level_group_cols,
):
    """Aggregate per-event arrays into two flat float64 feature vectors.

    Compiled with numba in nopython mode under an explicit signature: the
    first nine arguments (``event_name`` .. ``level_group``) and the last
    eight (``*_cols``) are 1-D int64 arrays, the twelve in between
    (``page`` .. ``room_coor_move``) are 1-D float64 arrays. The categorical
    arrays are compared with ``==`` against the entries of the matching
    ``*_cols`` array, so both must use the same integer encoding.
    Presumably all per-event arrays describe one session within one level
    group — confirm against the caller.

    Layout of ``feat`` (order matters to downstream column selection):
      1. quantiles 0.1/0.2/0.4/0.6/0.8/0.9 of 14 numeric arrays,
      2. number of distinct values of event_name, name, fqid, room_fqid,
      3. nansum then nanstd of the same 14 numeric arrays,
      4. per-value blocks for event_name, name, fqid, room_fqid
         (count + 12 statistics each), text, text_fqid, level
         (count + 4 statistics each) and level_group (count + 2 means),
      5. for every (level, room_fqid) pair: count plus one extra value.

    ``save_feat`` holds item 3 plus, for event_name/name/fqid/room_fqid, the
    sum and std of ``elapsed_time_diff`` and ``hover_duration`` per value.

    Values absent from the input contribute zeros so the vector length only
    depends on the ``*_cols`` arrays. NaNs are left in the outputs (the
    zero-fill lines at the end are commented out).

    Returns:
        tuple: ``(feat, save_feat)`` as float64 arrays.
    """
    # --- 1. global quantiles and 2. distinct-value counts ---
    feat = [
        np.nanquantile(elapsed_time, 0.1),
        np.nanquantile(elapsed_time, 0.2),
        np.nanquantile(elapsed_time, 0.4),
        np.nanquantile(elapsed_time, 0.6),
        np.nanquantile(elapsed_time, 0.8),
        np.nanquantile(elapsed_time, 0.9),
        np.nanquantile(level, 0.1),
        np.nanquantile(level, 0.2),
        np.nanquantile(level, 0.4),
        np.nanquantile(level, 0.6),
        np.nanquantile(level, 0.8),
        np.nanquantile(level, 0.9),
        np.nanquantile(page, 0.1),
        np.nanquantile(page, 0.2),
        np.nanquantile(page, 0.4),
        np.nanquantile(page, 0.6),
        np.nanquantile(page, 0.8),
        np.nanquantile(page, 0.9),
        np.nanquantile(room_coor_x, 0.1),
        np.nanquantile(room_coor_x, 0.2),
        np.nanquantile(room_coor_x, 0.4),
        np.nanquantile(room_coor_x, 0.6),
        np.nanquantile(room_coor_x, 0.8),
        np.nanquantile(room_coor_x, 0.9),
        np.nanquantile(room_coor_y, 0.1),
        np.nanquantile(room_coor_y, 0.2),
        np.nanquantile(room_coor_y, 0.4),
        np.nanquantile(room_coor_y, 0.6),
        np.nanquantile(room_coor_y, 0.8),
        np.nanquantile(room_coor_y, 0.9),
        np.nanquantile(screen_coor_x, 0.1),
        np.nanquantile(screen_coor_x, 0.2),
        np.nanquantile(screen_coor_x, 0.4),
        np.nanquantile(screen_coor_x, 0.6),
        np.nanquantile(screen_coor_x, 0.8),
        np.nanquantile(screen_coor_x, 0.9),
        np.nanquantile(screen_coor_y, 0.1),
        np.nanquantile(screen_coor_y, 0.2),
        np.nanquantile(screen_coor_y, 0.4),
        np.nanquantile(screen_coor_y, 0.6),
        np.nanquantile(screen_coor_y, 0.8),
        np.nanquantile(screen_coor_y, 0.9),
        np.nanquantile(hover_duration, 0.1),
        np.nanquantile(hover_duration, 0.2),
        np.nanquantile(hover_duration, 0.4),
        np.nanquantile(hover_duration, 0.6),
        np.nanquantile(hover_duration, 0.8),
        np.nanquantile(hover_duration, 0.9),
        np.nanquantile(elapsed_time_diff, 0.1),
        np.nanquantile(elapsed_time_diff, 0.2),
        np.nanquantile(elapsed_time_diff, 0.4),
        np.nanquantile(elapsed_time_diff, 0.6),
        np.nanquantile(elapsed_time_diff, 0.8),
        np.nanquantile(elapsed_time_diff, 0.9),
        np.nanquantile(room_coor_x_diff, 0.1),
        np.nanquantile(room_coor_x_diff, 0.2),
        np.nanquantile(room_coor_x_diff, 0.4),
        np.nanquantile(room_coor_x_diff, 0.6),
        np.nanquantile(room_coor_x_diff, 0.8),
        np.nanquantile(room_coor_x_diff, 0.9),
        np.nanquantile(room_coor_y_diff, 0.1),
        np.nanquantile(room_coor_y_diff, 0.2),
        np.nanquantile(room_coor_y_diff, 0.4),
        np.nanquantile(room_coor_y_diff, 0.6),
        np.nanquantile(room_coor_y_diff, 0.8),
        np.nanquantile(room_coor_y_diff, 0.9),
        np.nanquantile(screen_coor_x_diff, 0.1),
        np.nanquantile(screen_coor_x_diff, 0.2),
        np.nanquantile(screen_coor_x_diff, 0.4),
        np.nanquantile(screen_coor_x_diff, 0.6),
        np.nanquantile(screen_coor_x_diff, 0.8),
        np.nanquantile(screen_coor_x_diff, 0.9),
        np.nanquantile(screen_coor_y_diff, 0.1),
        np.nanquantile(screen_coor_y_diff, 0.2),
        np.nanquantile(screen_coor_y_diff, 0.4),
        np.nanquantile(screen_coor_y_diff, 0.6),
        np.nanquantile(screen_coor_y_diff, 0.8),
        np.nanquantile(screen_coor_y_diff, 0.9),
        np.nanquantile(room_coor_move, 0.1),
        np.nanquantile(room_coor_move, 0.2),
        np.nanquantile(room_coor_move, 0.4),
        np.nanquantile(room_coor_move, 0.6),
        np.nanquantile(room_coor_move, 0.8),
        np.nanquantile(room_coor_move, 0.9),
        len(np.unique(event_name)),
        len(np.unique(name)),
        len(np.unique(fqid)),
        len(np.unique(room_fqid)),
    ]
    # --- 3. global sums and standard deviations (also returned separately) ---
    save_feat = [
        np.nansum(elapsed_time),
        np.nansum(level),
        np.nansum(page),
        np.nansum(room_coor_x),
        np.nansum(room_coor_y),
        np.nansum(screen_coor_x),
        np.nansum(screen_coor_y),
        np.nansum(hover_duration),
        np.nansum(elapsed_time_diff),
        np.nansum(room_coor_x_diff),
        np.nansum(room_coor_y_diff),
        np.nansum(screen_coor_x_diff),
        np.nansum(screen_coor_y_diff),
        np.nansum(room_coor_move),
        np.nanstd(elapsed_time),
        np.nanstd(level),
        np.nanstd(page),
        np.nanstd(room_coor_x),
        np.nanstd(room_coor_y),
        np.nanstd(screen_coor_x),
        np.nanstd(screen_coor_y),
        np.nanstd(hover_duration),
        np.nanstd(elapsed_time_diff),
        np.nanstd(room_coor_x_diff),
        np.nanstd(room_coor_y_diff),
        np.nanstd(screen_coor_x_diff),
        np.nanstd(screen_coor_y_diff),
        np.nanstd(room_coor_move),
    ]

    feat += save_feat

    ############################ elapsed_time_diff, hover_duration ############################
    # event_name: count, then 8 statistics + 4 sum/std values (12 zeros when absent).
    # NOTE(review): the 8 statistics here are ordered mean(et), quantiles,
    # mean(hover); the name/fqid/room_fqid blocks use mean(et), mean(hover),
    # quantiles. Harmless, but the column order is not uniform across blocks.
    for c in event_name_cols:
        idx = event_name == c
        count = np.sum(idx)
        feat.append(count)
        if count == 0:
            feat += [0.0 for k in range(12)]
            save_feat += [0.0 for k in range(4)]
        else:
            feat += [
                np.nanmean(elapsed_time_diff[idx]),
                np.nanquantile(elapsed_time_diff[idx], 0.1),
                np.nanquantile(elapsed_time_diff[idx], 0.2),
                np.nanquantile(elapsed_time_diff[idx], 0.4),
                np.nanquantile(elapsed_time_diff[idx], 0.6),
                np.nanquantile(elapsed_time_diff[idx], 0.8),
                np.nanquantile(elapsed_time_diff[idx], 0.9),
                np.nanmean(hover_duration[idx]),
            ]
            _save_feat = [
                np.nansum(elapsed_time_diff[idx]),
                np.nanstd(elapsed_time_diff[idx]),
                np.nansum(hover_duration[idx]),
                np.nanstd(hover_duration[idx]),
            ]
            feat += _save_feat
            save_feat += _save_feat

    # name: same block shape as event_name (count + 12 values).
    for c in name_cols:
        idx = name == c
        count = np.sum(idx)
        feat.append(count)
        if count == 0:
            feat += [0.0 for k in range(12)]
            save_feat += [0.0 for k in range(4)]
        else:
            feat += [
                np.nanmean(elapsed_time_diff[idx]),
                np.nanmean(hover_duration[idx]),
                np.nanquantile(elapsed_time_diff[idx], 0.1),
                np.nanquantile(elapsed_time_diff[idx], 0.2),
                np.nanquantile(elapsed_time_diff[idx], 0.4),
                np.nanquantile(elapsed_time_diff[idx], 0.6),
                np.nanquantile(elapsed_time_diff[idx], 0.8),
                np.nanquantile(elapsed_time_diff[idx], 0.9),
            ]
            _save_feat = [
                np.nansum(elapsed_time_diff[idx]),
                np.nanstd(elapsed_time_diff[idx]),
                np.nansum(hover_duration[idx]),
                np.nanstd(hover_duration[idx]),
            ]
            feat += _save_feat
            save_feat += _save_feat

    # fqid: count + 12 values per known fqid.
    for c in fqid_cols:
        idx = fqid == c
        count = np.sum(idx)
        feat.append(count)
        if count == 0:
            feat += [0.0 for k in range(12)]
            save_feat += [0.0 for k in range(4)]
        else:
            feat += [
                np.nanmean(elapsed_time_diff[idx]),
                np.nanmean(hover_duration[idx]),
                np.nanquantile(elapsed_time_diff[idx], 0.1),
                np.nanquantile(elapsed_time_diff[idx], 0.2),
                np.nanquantile(elapsed_time_diff[idx], 0.4),
                np.nanquantile(elapsed_time_diff[idx], 0.6),
                np.nanquantile(elapsed_time_diff[idx], 0.8),
                np.nanquantile(elapsed_time_diff[idx], 0.9),
            ]
            _save_feat = [
                np.nansum(elapsed_time_diff[idx]),
                np.nanstd(elapsed_time_diff[idx]),
                np.nansum(hover_duration[idx]),
                np.nanstd(hover_duration[idx]),
            ]
            feat += _save_feat
            save_feat += _save_feat

    # room_fqid: count + 12 values per known room.
    for c in room_fqid_cols:
        idx = room_fqid == c
        count = np.sum(idx)
        feat.append(count)
        if count == 0:
            feat += [0.0 for k in range(12)]
            save_feat += [0.0 for k in range(4)]
        else:
            feat += [
                np.nanmean(elapsed_time_diff[idx]),
                np.nanmean(hover_duration[idx]),
                np.nanquantile(elapsed_time_diff[idx], 0.1),
                np.nanquantile(elapsed_time_diff[idx], 0.2),
                np.nanquantile(elapsed_time_diff[idx], 0.4),
                np.nanquantile(elapsed_time_diff[idx], 0.6),
                np.nanquantile(elapsed_time_diff[idx], 0.8),
                np.nanquantile(elapsed_time_diff[idx], 0.9),
            ]
            _save_feat = [
                np.nansum(elapsed_time_diff[idx]),
                np.nanstd(elapsed_time_diff[idx]),
                np.nansum(hover_duration[idx]),
                np.nanstd(hover_duration[idx]),
            ]
            feat += _save_feat
            save_feat += _save_feat

    # text: count + std/mean of elapsed_time_diff and hover_duration (feat only).
    for c in text_cols:
        idx = text == c
        count = np.sum(idx)
        feat.append(count)
        if count == 0:
            feat += [0.0 for k in range(4)]
        else:
            feat += [
                np.nanstd(elapsed_time_diff[idx]),
                np.nanmean(elapsed_time_diff[idx]),
                np.nanstd(hover_duration[idx]),
                np.nanmean(hover_duration[idx]),
            ]

    # text_fqid: same 1 + 4 layout as text.
    for c in text_fqid_cols:
        idx = text_fqid == c
        count = np.sum(idx)
        feat.append(count)
        if count == 0:
            feat += [0.0 for k in range(4)]
        else:
            feat += [
                np.nanstd(elapsed_time_diff[idx]),
                np.nanmean(elapsed_time_diff[idx]),
                np.nanstd(hover_duration[idx]),
                np.nanmean(hover_duration[idx]),
            ]

    # level: same 1 + 4 layout as text.
    for c in level_cols:
        idx = level == c
        count = np.sum(idx)
        feat.append(count)
        if count == 0:
            feat += [0.0 for k in range(4)]
        else:
            feat += [
                np.nanstd(elapsed_time_diff[idx]),
                np.nanmean(elapsed_time_diff[idx]),
                np.nanstd(hover_duration[idx]),
                np.nanmean(hover_duration[idx]),
            ]

    # level_group: count + the two means only.
    for c in level_group_cols:
        idx = level_group == c
        count = np.sum(idx)
        feat.append(count)
        if count == 0:
            feat += [0.0 for k in range(2)]
        else:
            feat += [
                np.nanmean(elapsed_time_diff[idx]),
                np.nanmean(hover_duration[idx]),
            ]


    ############################ categorical features ############################
    # Event count for every (level, room_fqid) pair, followed by one more value.
    # NOTE(review): idx already restricts room_fqid to the single value d, so
    # len(np.unique(room_fqid[idx])) is always 1 when count > 0; this second
    # value is just an "is present" flag. Confirm whether another column was
    # meant here.
    for c in level_cols:
        for d in room_fqid_cols:
            idx = (c == level) & (d == room_fqid)
            count = np.sum(idx)
            feat.append(count)
            if count == 0:
                feat += [0.0]
            else:
                feat += [
                    len(np.unique(room_fqid[idx])),
                    ]

    # Convert the Python lists to float64 arrays; NaNs are intentionally kept
    # (the zero-fill alternatives below are disabled).
    feat = np.array(feat, dtype=np.float64)
    # feat[np.isnan(feat)] = 0.0
    save_feat = np.array(save_feat, dtype=np.float64)
    # save_feat[np.isnan(save_feat)] = 0.0
    return feat, save_feat

In [10]:
def get_date_feat(session_id: str):
    """Split the first eight characters of a session id into four integers.

    The id is read two characters at a time; the fields are treated as
    year, month, day and hour, in that order.

    Args:
        session_id: session identifier as a string of digits.

    Returns:
        np.ndarray: the four parsed integers ``[year, month, day, hour]``.
    """
    field_starts = range(0, 8, 2)  # offsets of year, month, day, hour
    return np.array([int(session_id[pos:pos + 2]) for pos in field_starts])

In [11]:
def add_previous_features(train_feat_arr, grp, prev_feat_dict):
    """Append features of earlier level groups to the current feature matrix.

    For ``"5-12"`` the ``"0-4"`` block is appended; for ``"13-22"`` the
    ``"0-4"`` and ``"5-12"`` blocks are appended. In both cases one more
    block follows: the first elapsed time of the current group minus the last
    elapsed time of the preceding group. Any other ``grp`` returns the input
    untouched.

    Args:
        train_feat_arr: 2-D array of current-group features.
        grp: level group name.
        prev_feat_dict: mapping with the ``"<grp>"``, ``"<grp>_first_et"`` and
            ``"<grp>_last_et"`` arrays needed for ``grp``.

    Returns:
        np.ndarray: ``train_feat_arr`` with the extra columns on the right.
    """
    if grp == "5-12":
        earlier_blocks = [prev_feat_dict["0-4"]]
        # time gap between level groups
        group_gap = prev_feat_dict["5-12_first_et"] - prev_feat_dict["0-4_last_et"]
    elif grp == "13-22":
        earlier_blocks = [prev_feat_dict["0-4"], prev_feat_dict["5-12"]]
        # time gap between level groups
        group_gap = prev_feat_dict["13-22_first_et"] - prev_feat_dict["5-12_last_et"]
    else:
        return train_feat_arr

    return np.concatenate([train_feat_arr, *earlier_blocks, group_gap], axis=1)

def add_previous_predictions(train_feat_arr: np.array, grp, oof):
    """Append out-of-fold predictions of earlier questions as extra columns.

    ``"0-4"`` gets nothing appended. ``"5-12"`` gets ``oof`` columns 0..2
    (questions 1-3) and ``"13-22"`` gets ``oof`` columns 0..12
    (questions 1-13), each as one column on the right.

    Args:
        train_feat_arr: 2-D feature array with one row per row of ``oof``.
        grp: level group name.
        oof: DataFrame whose column labels are zero-based question indices.

    Returns:
        np.ndarray: the feature array, extended for the later groups.
    """
    if grp == "0-4":
        return train_feat_arr

    if grp == "5-12":
        pre_limits = (1, 4)
    elif grp == "13-22":
        pre_limits = (1, 14)

    first_q, stop_q = pre_limits
    # Question numbers are 1-based, oof column labels are 0-based.
    prediction_cols = [
        oof.loc[:, question - 1].to_numpy().reshape(-1, 1)
        for question in range(first_q, stop_q)
    ]
    return np.concatenate([train_feat_arr, *prediction_cols], axis=1)

In [12]:
def get_feat_cols(input_arr):
    """Pick the indices of columns worth keeping in a 2-D feature array.

    Columns holding a single distinct value are dropped first; among the
    remaining columns, exact duplicates are collapsed to one representative.

    Args:
        input_arr: 2-D array of shape (rows, features).

    Returns:
        list: kept column indices into ``input_arr``, in ascending order.
    """
    n_cols = input_arr.shape[1]

    # Drop columns that contain only one distinct value.
    varying_cols = [
        col for col in range(n_cols)
        if len(np.unique(input_arr[:, col])) != 1
    ]
    varying_arr = input_arr[:, varying_cols]

    # Drop duplicated columns: np.unique over axis=1 reports one index per
    # distinct column.
    first_positions = np.unique(varying_arr, axis=1, return_index=True)[1]

    # Translate positions within the reduced array back to original indices.
    return [varying_cols[pos] for pos in sorted(first_positions.tolist())]

In [13]:
# All distinct session ids of the training events.
ALL_USERS = train_df["session_id"].unique()

# Convert the labels to pandas and order them by session, then question.
# NOTE: not idempotent — re-running this cell fails because a pandas frame
# has no .to_pandas().
train_labels_df = (
    train_labels_df
    .to_pandas()
    .sort_values(["session", "q"])
    .reset_index(drop=True)
)

In [14]:
# For each categorical column, build a str -> int dictionary covering every
# label that occurs in the training data, then persist the dictionaries.

_cat_cols = ["event_name", "name", "fqid", "room_fqid", "text", "text_fqid"]
# Missing values get the explicit "NAN" label so they receive a code as well.
# NOTE: this modifies train_df in place.
train_df[_cat_cols] = train_df[_cat_cols].fillna("NAN")

event_name2label = {k:v for v,k in enumerate(train_df["event_name"].unique())}
name2label = {k:v for v,k in enumerate(train_df["name"].unique())}
fqid2label = {k:v for v,k in enumerate(train_df["fqid"].unique())}
room_fqid2label = {k:v for v,k in enumerate(train_df["room_fqid"].unique())}
text2label = {k:v for v,k in enumerate(train_df["text"].unique())}
text_fqid2label = {k:v for v,k in enumerate(train_df["text_fqid"].unique())}
level_group2label = {"0-4":0, "5-12":1, "13-22":2}

# Write each mapping to "<FEAT_PATH>/<mapping name>.pkl". The context manager
# closes every file right after the dump instead of leaking the handle.
for _map_name, _label_map in {
    "event_name2label": event_name2label,
    "name2label": name2label,
    "fqid2label": fqid2label,
    "room_fqid2label": room_fqid2label,
    "text2label": text2label,
    "text_fqid2label": text_fqid2label,
    "level_group2label": level_group2label,
}.items():
    with open(f"{cfg.FEAT_PATH}/{_map_name}.pkl", "wb") as _fh:
        pickle.dump(_label_map, _fh)

In [15]:
def train_loop(cfg, train_df, train_labels_df, feat_dict=None, f1_threshold=0.63):
    """Train one model per (level_group, fold) and collect out-of-fold predictions.

    For each level group ("0-4", "5-12", "13-22") one feature row is built per
    session, features and OOF predictions from earlier groups are appended, and a
    5-fold StratifiedGroupKFold (grouped by session) is run. Each fold's model is
    saved, reloaded, and used to fill the OOF table for its validation sessions.

    Depends on module-level names defined elsewhere in the notebook: ALL_USERS,
    LOGGER, the ``*2label`` dictionaries, and the helpers get_cols,
    feature_engineering, get_date_feat, add_previous_features,
    add_previous_predictions, get_feat_cols, get_model, save_model, load_model.

    Args:
        cfg: Config object. Reads ``cfg.seed``, ``cfg.FEAT_PATH`` and
            ``cfg.EXP_MODEL``; writes ``cfg.model_params["num_target"]``.
        train_df: pandas event log; must contain "session_id", "level_group" and
            the categorical / numeric columns read below.
        train_labels_df: pandas frame with "session", "q" and "correct". It must
            have a 0..n-1 index (fold indices are used with ``.loc``) and be
            sorted by ("session", "q") (labels are reshaped positionally).
        feat_dict: Optional mapping ``level_group -> column indices``. When
            falsy, columns are chosen per fold with ``get_feat_cols`` and pickled
            to ``cfg.FEAT_PATH``.
        f1_threshold: Probability cut-off used for the per-fold macro-F1 that is
            logged and averaged into the CV score. Defaults to 0.63.

    Returns:
        tuple: ``(oof, models)`` where ``oof`` is a DataFrame indexed by
        ALL_USERS with 18 prediction columns (column ``q - 1`` for question
        ``q``) and ``models`` is the list of fitted models in training order.
    """
    models = []
    oof = pd.DataFrame(data=np.zeros((len(ALL_USERS), 18)), index=ALL_USERS)

    cv_score = []
    # An integer random_state makes every split() call return the same folds, so
    # a session stays in the same fold for all three level groups.
    sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=cfg.seed)

    # Per-group features kept for later groups, plus first/last elapsed_time.
    prev_feat_dict = {
        "0-4": [],
        "0-4_first_et": [],
        "0-4_last_et": [],
        "5-12": [],
        "5-12_first_et": [],
        "5-12_last_et": [],
        "13-22": [],
        "13-22_first_et": [],
        "13-22_last_et": [],
        }
    # level_group -> half-open range [a, b) of 1-based question numbers.
    limits = {"0-4":(1, 4), "5-12":(4, 14), "13-22":(14, 19)}

    start_time = time.time()

    for grp in limits.keys():
        LOGGER.info(f"{'='*50} level_group: {grp} {'='*50}")
        train_grp_df = train_df[train_df["level_group"]==grp]

        ############### get columns in current level_group ###############
        event_name_cols, name_cols, text_cols, fqid_cols, room_fqid_cols, text_fqid_cols, level_cols = get_cols(train_grp_df, grp)

        ############### encoding ###############
        event_name_cols = np.vectorize(event_name2label.get)(event_name_cols).astype(np.int64)
        name_cols = np.vectorize(name2label.get)(name_cols).astype(np.int64)
        text_cols = np.vectorize(text2label.get)(text_cols).astype(np.int64)
        fqid_cols = np.vectorize(fqid2label.get)(fqid_cols).astype(np.int64)
        room_fqid_cols = np.vectorize(room_fqid2label.get)(room_fqid_cols).astype(np.int64)
        text_fqid_cols = np.vectorize(text_fqid2label.get)(text_fqid_cols).astype(np.int64)
        level_cols = np.vectorize(np.int64)(level_cols)
        level_group_cols = np.array([level_group2label[grp]]).astype(np.int64)

        train_feat_arr = []
        # Session ids recorded in the exact order their feature rows are appended,
        # so rows and ids stay aligned even if train_df is not sorted by session.
        feat_users = []
        for session_id, train_session_df in tqdm(train_grp_df.groupby("session_id")):
            feat_users.append(session_id)

            ############### get features ###############
            # categorical features
            event_name = train_session_df["event_name"].to_numpy()
            name = train_session_df["name"].to_numpy()
            fqid = train_session_df["fqid"].to_numpy()
            room_fqid = train_session_df["room_fqid"].to_numpy()
            text = train_session_df["text"].to_numpy()
            text_fqid = train_session_df["text_fqid"].to_numpy()
            # numeric features
            elapsed_time = train_session_df["elapsed_time"].to_numpy().astype(np.int64)
            level = train_session_df["level"].to_numpy().astype(np.int64)
            level_group = train_session_df["level_group"].to_numpy()
            page = train_session_df["page"].to_numpy().astype(np.float64)
            room_coor_x = train_session_df["room_coor_x"].to_numpy().astype(np.float64)
            room_coor_y = train_session_df["room_coor_y"].to_numpy().astype(np.float64)
            screen_coor_x = train_session_df["screen_coor_x"].to_numpy().astype(np.float64)
            screen_coor_y = train_session_df["screen_coor_y"].to_numpy().astype(np.float64)
            hover_duration = train_session_df["hover_duration"].to_numpy().astype(np.float64)
            # diff features (negative time steps are clipped to 0; the first row has no predecessor)
            elapsed_time_diff = train_session_df["elapsed_time"].diff().clip(0, None).fillna(0).to_numpy().astype(np.float64)
            room_coor_x_diff = (train_session_df["room_coor_x"] - train_session_df["room_coor_x"].shift(1)).abs().to_numpy().astype(np.float64)
            room_coor_y_diff = (train_session_df["room_coor_y"] - train_session_df["room_coor_y"].shift(1)).abs().to_numpy().astype(np.float64)
            screen_coor_x_diff = (train_session_df["screen_coor_x"] - train_session_df["screen_coor_x"].shift(1)).abs().to_numpy().astype(np.float64)
            screen_coor_y_diff = (train_session_df["screen_coor_y"] - train_session_df["screen_coor_y"].shift(1)).abs().to_numpy().astype(np.float64)
            # Euclidean distance moved in room coordinates between consecutive events
            room_coor_move = ((room_coor_x_diff**2 + room_coor_y_diff**2)**0.5).astype(np.float64)

            ############### encoding ###############
            event_name = np.vectorize(event_name2label.get)(event_name).astype(np.int64)
            name = np.vectorize(name2label.get)(name).astype(np.int64)
            text = np.vectorize(text2label.get)(text).astype(np.int64)
            fqid = np.vectorize(fqid2label.get)(fqid).astype(np.int64)
            room_fqid = np.vectorize(room_fqid2label.get)(room_fqid).astype(np.int64)
            text_fqid = np.vectorize(text_fqid2label.get)(text_fqid).astype(np.int64)
            level_group = np.vectorize(level_group2label.get)(level_group).astype(np.int64)

            ############### feature engineering ###############
            train_session_feat_arr, save_train_session_feat_arr = feature_engineering(
                event_name,
                name,
                fqid,
                room_fqid,
                text,
                text_fqid,
                elapsed_time,
                level,
                level_group,
                page,
                room_coor_x,
                room_coor_y,
                screen_coor_x,
                screen_coor_y,
                hover_duration,
                elapsed_time_diff,
                room_coor_x_diff,
                room_coor_y_diff,
                screen_coor_x_diff,
                screen_coor_y_diff,
                room_coor_move,
                event_name_cols,
                name_cols,
                text_cols,
                fqid_cols,
                room_fqid_cols,
                text_fqid_cols,
                level_cols,
                level_group_cols,
                )
            # date features (get_date_feat is called with the session id as a string)
            date_feat_arr = get_date_feat(str(session_id))
            train_session_feat_arr = np.concatenate([train_session_feat_arr, date_feat_arr], axis=0)

            # save
            train_feat_arr.append(train_session_feat_arr)
            prev_feat_dict[grp].append(save_train_session_feat_arr)
            prev_feat_dict[grp+"_first_et"].append(elapsed_time[0])
            prev_feat_dict[grp+"_last_et"].append(elapsed_time[-1])

        train_feat_arr = np.stack(train_feat_arr, axis=0)
        feat_users = np.array(feat_users)
        prev_feat_dict[grp] = np.stack(prev_feat_dict[grp], axis=0)
        prev_feat_dict[grp+"_first_et"] = np.array(prev_feat_dict[grp+"_first_et"]).reshape(-1, 1)
        prev_feat_dict[grp+"_last_et"] = np.array(prev_feat_dict[grp+"_last_et"]).reshape(-1, 1)

        # add previous features
        train_feat_arr = add_previous_features(train_feat_arr, grp, prev_feat_dict)

        # add previous prediction
        train_feat_arr = add_previous_predictions(train_feat_arr, grp, oof)

        a, b = limits[grp]
        q_list = list(range(a, b))
        for i_fold, (train_idx, valid_idx) in enumerate(sgkf.split(train_labels_df, y=train_labels_df["correct"], groups=train_labels_df["session"])):
            LOGGER.info(f"{'='*30} Fold{i_fold} {'='*30}")

            train_users = train_labels_df.loc[train_idx, "session"].unique().tolist()
            valid_users = train_labels_df.loc[valid_idx, "session"].unique().tolist()

            X_train = train_feat_arr[np.isin(feat_users, train_users)]
            X_valid = train_feat_arr[np.isin(feat_users, valid_users)]
            # Labels become (n_sessions, n_questions); this relies on
            # train_labels_df being sorted by ("session", "q").
            y_train = train_labels_df[(train_labels_df["q"].isin(q_list)) & (train_labels_df["session"].isin(train_users))]["correct"].to_numpy().reshape(-1, len(q_list))
            y_valid = train_labels_df[(train_labels_df["q"].isin(q_list)) & (train_labels_df["session"].isin(valid_users))]["correct"].to_numpy().reshape(-1, len(q_list))

            # identify features
            if not feat_dict:
                feat_cols = get_feat_cols(X_train)
                # Context manager so the pickle file is closed deterministically.
                with open(f"{cfg.FEAT_PATH}/feat_cols_fold{i_fold}_grp{grp}.pkl", "wb") as feat_cols_file:
                    pickle.dump(feat_cols, feat_cols_file)
            else:
                feat_cols = feat_dict[grp]

            X_train = X_train[:, feat_cols]
            X_valid = X_valid[:, feat_cols]

            LOGGER.info(f"num features: {X_train.shape[1]}")

            # model training
            # Set the target count before building the model so the model cannot
            # be constructed with the previous group's value.
            # NOTE(review): assumes get_model reads cfg.model_params — confirm.
            cfg.model_params["num_target"] = len(q_list)
            model = get_model(cfg)
            model.fit(X_train, y_train, X_valid, y_valid)
            save_model(model, Path(cfg.EXP_MODEL) / f"fold{i_fold}_grp{grp}.pkl")
            models.append(model)
            # Predict with the reloaded artifact rather than the in-memory model.
            model = load_model(Path(cfg.EXP_MODEL) / f"fold{i_fold}_grp{grp}.pkl")

            y_pred =  model.predict(X_valid)
            # Question q is stored in 0-based column q-1.
            oof.loc[valid_users, np.array(q_list)-1] = y_pred
            _score = f1_score(y_valid.reshape(-1, 1), np.where(y_pred>=f1_threshold, 1, 0).reshape(-1, 1), average="macro")

            cv_score.append(_score)
            LOGGER.info(f"grp:{grp}, fold{i_fold} f1: {_score: .05f}")


    # Unweighted mean over all (group, fold) scores.
    mean_cv_score = np.mean(cv_score)
    LOGGER.info(f"cv: {mean_cv_score: .05f}")
    LOGGER.info(f"total time: {time.time()-start_time: .05f} sec")

    return oof, models

In [16]:
# Run the cross-validated training over all level groups; keeps the out-of-fold
# prediction table and the list of fitted models for the cells below.
oof, models = train_loop(cfg=cfg, train_df=train_df, train_labels_df=train_labels_df)



  0%|          | 0/23562 [00:00<?, ?it/s]

num features: 1447


[0]	train-logloss:0.68657	valid-logloss:0.68660
[990]	train-logloss:0.21035	valid-logloss:0.26334


grp:0-4, fold0 f1:  0.69138
num features: 1443


[0]	train-logloss:0.68656	valid-logloss:0.68664
[1000]	train-logloss:0.20888	valid-logloss:0.26859
[1023]	train-logloss:0.20776	valid-logloss:0.26858


grp:0-4, fold1 f1:  0.69261
num features: 1448


[0]	train-logloss:0.68659	valid-logloss:0.68660
[911]	train-logloss:0.21519	valid-logloss:0.25916


grp:0-4, fold2 f1:  0.69940
num features: 1441


[0]	train-logloss:0.68656	valid-logloss:0.68665
[1000]	train-logloss:0.20952	valid-logloss:0.26588
[1054]	train-logloss:0.20709	valid-logloss:0.26589


grp:0-4, fold3 f1:  0.69917
num features: 1429


[0]	train-logloss:0.68656	valid-logloss:0.68661
[1000]	train-logloss:0.20928	valid-logloss:0.26451
[1109]	train-logloss:0.20442	valid-logloss:0.26451


grp:0-4, fold4 f1:  0.70090


  0%|          | 0/23562 [00:00<?, ?it/s]

num features: 2798


[0]	train-logloss:0.69049	valid-logloss:0.69058
[1000]	train-logloss:0.44910	valid-logloss:0.53093
[1275]	train-logloss:0.43057	valid-logloss:0.53087


grp:5-12, fold0 f1:  0.69276
num features: 2761


[0]	train-logloss:0.69049	valid-logloss:0.69062
[1000]	train-logloss:0.44880	valid-logloss:0.53115
[1123]	train-logloss:0.44031	valid-logloss:0.53108


grp:5-12, fold1 f1:  0.69621
num features: 2765


[0]	train-logloss:0.69051	valid-logloss:0.69058
[1000]	train-logloss:0.44907	valid-logloss:0.52909
[1171]	train-logloss:0.43740	valid-logloss:0.52896


grp:5-12, fold2 f1:  0.69221
num features: 2808


[0]	train-logloss:0.69050	valid-logloss:0.69062
[829]	train-logloss:0.46048	valid-logloss:0.53329


grp:5-12, fold3 f1:  0.68993
num features: 2804


[0]	train-logloss:0.69048	valid-logloss:0.69057
[1000]	train-logloss:0.44884	valid-logloss:0.53174
[1133]	train-logloss:0.43987	valid-logloss:0.53168


grp:5-12, fold4 f1:  0.69115


  0%|          | 0/23562 [00:00<?, ?it/s]

num features: 3980


[0]	train-logloss:0.68990	valid-logloss:0.68998
[674]	train-logloss:0.44043	valid-logloss:0.49863


grp:13-22, fold0 f1:  0.66299
num features: 3920


[0]	train-logloss:0.68993	valid-logloss:0.68999
[1000]	train-logloss:0.41763	valid-logloss:0.49045
[1068]	train-logloss:0.41291	valid-logloss:0.49052


grp:13-22, fold1 f1:  0.67288
num features: 3968


[0]	train-logloss:0.68991	valid-logloss:0.69002
[893]	train-logloss:0.42355	valid-logloss:0.49873


grp:13-22, fold2 f1:  0.66462
num features: 3981


[0]	train-logloss:0.68993	valid-logloss:0.69003
[719]	train-logloss:0.43604	valid-logloss:0.49939


grp:13-22, fold3 f1:  0.66811
num features: 3983


[0]	train-logloss:0.68990	valid-logloss:0.68999
[724]	train-logloss:0.43608	valid-logloss:0.50032


grp:13-22, fold4 f1:  0.66871
cv:  0.68554
total time:  1277.46074 sec


In [17]:
# Ground-truth table shaped like `oof`: column q receives the "correct" labels of
# question q+1 (assigned positionally, so row order must match `oof`).
oof_labels_df = oof.copy()
for q in range(18):
    _is_question = train_labels_df["q"] == q+1
    oof_labels_df[q] = train_labels_df.loc[_is_question, "correct"].to_numpy()

# Flatten once; the sweep below scores all questions together.
_true_flat = oof_labels_df.to_numpy().reshape((-1))
_prob_flat = oof.to_numpy().reshape((-1))

# Sweep a single global threshold and keep the first one with the highest macro F1.
scores, thresholds = [], []
best_score, best_threshold = 0, 0

for threshold in np.arange(0.4, 0.81, 0.01):
    preds = (_prob_flat>threshold).astype('int')
    m = f1_score(_true_flat, preds, average="macro")
    scores.append(m)
    thresholds.append(threshold)
    if m>best_score:
        best_score, best_threshold = m, threshold

LOGGER.info('When using optimal threshold...')
# Macro F1 of each question column at the selected global threshold.
for k in range(18):
    _question_true = oof_labels_df[k].to_numpy()
    _question_pred = (oof[k].to_numpy()>best_threshold).astype('int')
    m = f1_score(_question_true, _question_pred, average="macro")
    LOGGER.info(f'Q{k}: F1 = {m: .05f}')

# Macro F1 over all questions pooled together at the selected threshold.
m = f1_score(_true_flat, (_prob_flat>best_threshold).astype('int'), average='macro')
LOGGER.info(f"Overall F1: {m: .05f}, Best Threshold: {best_threshold: .05f}")

When using optimal threshold...
Q0: F1 =  0.67352
Q1: F1 =  0.51321
Q2: F1 =  0.51935
Q3: F1 =  0.68187
Q4: F1 =  0.64078
Q5: F1 =  0.65111
Q6: F1 =  0.63732
Q7: F1 =  0.57597
Q8: F1 =  0.63669
Q9: F1 =  0.58878
Q10: F1 =  0.61461
Q11: F1 =  0.52045
Q12: F1 =  0.47431
Q13: F1 =  0.64565
Q14: F1 =  0.62054
Q15: F1 =  0.50365
Q16: F1 =  0.55982
Q17: F1 =  0.49075
Overall F1:  0.70029, Best Threshold:  0.62000


In [21]:
# Persist the OOF predictions and their ground-truth labels (index kept) to
# cfg.EXP_PREDS.
for _frame, _csv_name in ((oof, "oof.csv"), (oof_labels_df, "oof_labels.csv")):
    _frame.to_csv(f"{cfg.EXP_PREDS}/{_csv_name}", index=True)

# Threshold optimization (per-question thresholds)

In [19]:
# Tune decision thresholds on the out-of-fold predictions and store them.
y_true = oof_labels_df.to_numpy()
y_pred = oof.to_numpy()

# NOTE(review): "Powell" is presumably the optimizer method name forwarded by
# optimize_thresholds — confirm against its definition.
all_thresholds = optimize_thresholds(y_true, y_pred, "Powell")
# Context manager so the pickle file is flushed and closed right away.
with open(f"{cfg.FEAT_PATH}/all_thresholds.pkl", "wb") as _fout:
    pickle.dump(all_thresholds, _fout)
# Scored on the same OOF data the thresholds were fitted on.
best_score = f1_score_macro_for_thresholds(y_true, y_pred, all_thresholds)
LOGGER.info(f"Optimized thresholds cv: {best_score:.05f}")

Optimized thresholds cv: 0.70111


In [20]:
# Display the optimized thresholds returned by optimize_thresholds.
all_thresholds

array([0.65567359, 0.09022315, 0.58778995, 0.65880086, 0.63708516,
       0.60494846, 0.61980858, 0.64679696, 0.61697314, 0.62286921,
       0.61054012, 0.60749739, 0.57262681, 0.6226808 , 0.61050282,
       0.62374213, 0.61610198, 0.62311219])