In [1]:
import os
import pathlib
from datetime import datetime
import optuna
import pandas as pd
import numpy as np
from scipy.stats import rankdata
import lightgbm as lgb
import torch
from typing import List, Dict, Union, Tuple, NamedTuple
from tqdm import tqdm
import scml

In [2]:
# Switch for appending the zz0000..zzNNNN columns to the feature set.
em_enable = False

# Every run writes its artifacts into its own timestamped directory.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
job_dir = f"models/lgb/{ts}"
pathlib.Path(job_dir).mkdir(parents=True, exist_ok=True)

# Feature column names, kept in separate named lists.
char_fs = [
    "length",
    "digit_frac",
    "letter_frac",
    "space_frac",
    "punc_frac",
    "upper_frac",
]
textstat_fs = [
    "syllables_per_word",
    "syllables_per_sent",
    "words_per_sent",
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "gunning_fog",
    "smog_index",
    "automated_readability_index",
    "coleman_liau_index",
    "linsear_write_formula",
    "dale_chall_readability_score",
]
dtfy_fs = [
    'dto_toxicity',
    'dto_severe_toxicity',
    'dto_obscene',
    'dto_threat',
    'dto_insult',
    'dto_identity_attack',
    'dtu_toxicity',
    'dtu_severe_toxicity',
    'dtu_obscene',
    'dtu_identity_attack',
    'dtu_insult',
    'dtu_threat',
    'dtu_sexual_explicit',
    'dtm_toxicity',
    'dtm_severe_toxicity',
    'dtm_obscene',
    'dtm_identity_attack',
    'dtm_insult',
    'dtm_threat',
    'dtm_sexual_explicit',
]
hatebert_fs = [
    "hb_bert_off",
    "hb_bert_abu",
    "hb_hatebert_off",
    "hb_hatebert_abu",
]
tweeteval_fs = [
    "te_roberta_off",
    "te_roberta_emo_anger",
    "te_roberta_snt_neg",
    "te_roberta_iro",
    "te_xlm_roberta_snt_neg",
]

# Concatenate into a fresh list so the per-family lists above stay untouched.
features = [*char_fs, *textstat_fs, *dtfy_fs, *hatebert_fs, *tweeteval_fs]
if em_enable:
    em_size = 384
    em_cols = [f"zz{i:04d}" for i in range(em_size)]
    features.extend(em_cols)
# Alphabetical order keeps the column order stable regardless of list order above.
features.sort()
print(f"{len(features)} features")


class Conf(NamedTuple):
    """Settings for this notebook's LightGBM ranking job.

    Every field has a default, so ``Conf()`` is a complete configuration. The
    defaults are evaluated once, when the class body runs.
    """
    # Output directory; per-trial models are saved in sub-directories of it.
    job_dir: str = job_dir
    # CUDA when torch reports it as available, CPU otherwise.
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Number of boosting rounds handed to lgb.train.
    num_boost_round: int = 500
    # (low, high) bounds passed to trial.suggest_loguniform for the learning rate.
    lr: Tuple[float, float] = (1e-2, 1e-2)
    # (low, high) bounds passed to trial.suggest_uniform for feature_fraction.
    feature_fraction: Tuple[float, float] = (1, 1)
    # Name of the training column used as the LightGBM label.
    label: str = "label"
    # Name of the training column whose runs of equal values form the query groups.
    query: str = "worker"
    # Value of the LightGBM 'objective' parameter.
    objective: str = "lambdarank"
    # Number of Optuna trials to run.
    n_trials: int = 1
    # Feature column names; this is the module-level `features` list itself, not a copy.
    features: List[str] = features
        

# Build the run configuration from its defaults and echo it into the output.
conf = Conf()
print(conf)
if conf.device.type == 'cuda':
    # Report GPU 0 and its current memory footprint; 2**30 bytes per printed "GB".
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / 2**30, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / 2**30, 1), 'GB')

46 features
Conf(job_dir='models/lgb/20211226_124215', device=device(type='cuda'), num_boost_round=500, lr=(0.01, 0.01), feature_fraction=(1, 1), label='label', query='worker', objective='lambdarank', n_trials=1, features=['automated_readability_index', 'coleman_liau_index', 'dale_chall_readability_score', 'digit_frac', 'dtm_identity_attack', 'dtm_insult', 'dtm_obscene', 'dtm_severe_toxicity', 'dtm_sexual_explicit', 'dtm_threat', 'dtm_toxicity', 'dto_identity_attack', 'dto_insult', 'dto_obscene', 'dto_severe_toxicity', 'dto_threat', 'dto_toxicity', 'dtu_identity_attack', 'dtu_insult', 'dtu_obscene', 'dtu_severe_toxicity', 'dtu_sexual_explicit', 'dtu_threat', 'dtu_toxicity', 'flesch_kincaid_grade', 'flesch_reading_ease', 'gunning_fog', 'hb_bert_abu', 'hb_bert_off', 'hb_hatebert_abu', 'hb_hatebert_off', 'length', 'letter_frac', 'linsear_write_formula', 'punc_frac', 'smog_index', 'space_frac', 'syllables_per_sent', 'syllables_per_word', 'te_roberta_emo_anger', 'te_roberta_iro', 'te_robert

In [3]:
# set_option accepts several (option, value) pairs in a single call; they are
# applied in the order given.
pd.set_option(
    "use_inf_as_na", True,
    "max_info_columns", 9999,
    "display.max_columns", 9999,
    "display.max_rows", 9999,
    'max_colwidth', 9999,
)
# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()
# presumably seeds the RNGs used by the notebook — confirm against the scml package
scml.seed_everything()

In [4]:
# Load the training rows and order them by the query-group column so that rows
# of the same group are contiguous, which group_sizes() requires.
# The sort key comes from conf.query rather than a hard-coded "worker": the same
# setting later selects the column used to build the LightGBM groups, so the two
# can never drift apart. sort_values returns a new frame (no in-place mutation),
# which keeps this cell idempotent on re-run.
train = pd.read_parquet("input/tra.parquet").sort_values(conf.query, ignore_index=True)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 433 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    label                         5710 non-null   int32  
 1    bws                           5710 non-null   float32
 2    worker                        5710 non-null   int8   
 3    length                        5710 non-null   int16  
 4    digit_frac                    5710 non-null   float32
 5    letter_frac                   5710 non-null   float32
 6    space_frac                    5710 non-null   float32
 7    punc_frac                     5710 non-null   float32
 8    upper_frac                    5710 non-null   float32
 9    dto_toxicity                  5710 non-null   float32
 10   dto_severe_toxicity           5710 non-null   float32
 11   dto_obscene                   5710 non-null   float32
 12   dto_threat                    5710 non-null   

In [5]:
# Frame scored during tuning: LgbObjective reads its "text" column and the
# conf.features columns from it.
val = pd.read_parquet("input/val.parquet")
val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 431 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    text                          14251 non-null  object 
 1    length                        14251 non-null  int16  
 2    digit_frac                    14251 non-null  float32
 3    letter_frac                   14251 non-null  float32
 4    space_frac                    14251 non-null  float32
 5    punc_frac                     14251 non-null  float32
 6    upper_frac                    14251 non-null  float32
 7    dto_toxicity                  14251 non-null  float32
 8    dto_severe_toxicity           14251 non-null  float32
 9    dto_obscene                   14251 non-null  float32
 10   dto_threat                    14251 non-null  float32
 11   dto_insult                    14251 non-null  float32
 12   dto_identity_attack           14251 non-null

In [6]:
# Annotated pairs; val_score() reads the less_toxic / more_toxic columns of each row.
vd = pd.read_csv("input/validation_data.csv", engine="c", low_memory=False)
vd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30108 entries, 0 to 30107
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   worker      30108 non-null  int64 
 1   less_toxic  30108 non-null  object
 2   more_toxic  30108 non-null  object
dtypes: int64(1), object(2)
memory usage: 705.8+ KB


In [7]:
def group_sizes(groups: List[int]) -> List[int]:
    """Return the length of each run of equal consecutive values in ``groups``.

    The input must already be sorted (or at least have equal ids contiguous);
    the result is the list of group sizes in order of appearance.

    Args:
        groups: Group ids, one per row. Any iterable works, including a pandas
            Series regardless of its index, because values are read by
            position rather than by label.

    Returns:
        One size per run, e.g. ``[1, 1, 2, 3, 3, 3] -> [2, 1, 3]``. An empty
        input yields an empty list.
    """
    # Materialize once so we iterate by position; indexing a Series with
    # groups[i] is label-based and fails on a non-default index.
    values = list(groups)
    if not values:
        # No rows means no groups (the previous implementation returned [1]).
        return []
    sizes = []
    run = 1
    for prev, cur in zip(values, values[1:]):
        if cur != prev:
            sizes.append(run)
            run = 0
        run += 1
    sizes.append(run)
    return sizes


def val_score(preds: Dict[str, int], validation_data: pd.DataFrame) -> float:
    """Fraction of rows in ``validation_data`` whose pair ``preds`` orders correctly.

    Args:
        preds: Maps a text to its predicted value.
        validation_data: Frame with ``less_toxic`` and ``more_toxic`` columns
            holding keys of ``preds``.

    Returns:
        Mean over rows of 1 when ``preds[less_toxic] < preds[more_toxic]``
        (strictly, so ties count as misses) and 0 otherwise.

    Raises:
        KeyError: If a text in either column is missing from ``preds``.
    """
    hits = [
        1 if preds[pair.less_toxic] < preds[pair.more_toxic] else 0
        for pair in validation_data.itertuples()
    ]
    return np.mean(hits)

In [8]:
class LgbObjective:
    """Optuna objective: train a LightGBM model and score it with ``val_score``.

    The training ``lgb.Dataset`` is built once in ``__init__`` and reused by
    every trial. Each call samples ``feature_fraction`` and ``lr``, trains,
    saves the model under ``<conf.job_dir>/trial_<n>/model.txt``, ranks the
    predictions for ``val`` and returns the resulting score.
    """

    def __init__(
        self,
        train,
        val,
        validation_data,
        conf: Conf,
    ):
        """Prepare the training dataset and the validation inputs.

        Args:
            train: Frame holding ``conf.features``, ``conf.label`` and
                ``conf.query`` columns; rows of a query group must be contiguous.
            val: Frame holding a ``text`` column and the ``conf.features`` columns.
            validation_data: Frame passed unchanged to ``val_score``.
            conf: Run configuration; all settings are read from this object.
        """
        self.conf = conf
        x_train = train[self.conf.features].to_numpy()
        y_train = train[self.conf.label].to_numpy()
        # label_gain: the sorted label values followed by len(train) + 1.
        self.label_gain = list(train[self.conf.label])
        self.label_gain.sort()
        self.label_gain.append(len(train) + 1)
        group = group_sizes(train[self.conf.query])
        self.ds = lgb.Dataset(x_train, label=y_train, group=group)
        self.val_texts = list(val["text"])
        self.x_val = val[self.conf.features].to_numpy()
        self.validation_data = validation_data
        # One dict per finished trial: trial_id, sampled parameters and score.
        self.history: List[Dict[str, Union[str, int, float]]] = []

    def __call__(self, trial):
        """Run one trial and return its validation score (higher is better).

        Args:
            trial: The Optuna trial used to sample ``feature_fraction`` and ``lr``.

        Returns:
            The value of ``val_score`` for this trial's predictions.
        """
        # NOTE(review): newer Optuna releases deprecate suggest_uniform /
        # suggest_loguniform in favour of suggest_float — confirm against the
        # installed version before switching.
        hist = {
            "trial_id": trial.number,
            "feature_fraction": trial.suggest_uniform(
                "feature_fraction", self.conf.feature_fraction[0], self.conf.feature_fraction[1]
            ),
            "lr": trial.suggest_loguniform(
                "lr", self.conf.lr[0], self.conf.lr[1]
            ),
        }
        b = lgb.train(
            {
                'objective': self.conf.objective,
                'feature_fraction': hist['feature_fraction'],
                'learning_rate': hist['lr'],
                "label_gain": self.label_gain,
                "force_col_wise": True,
                "verbose": 1,
            },
            self.ds,
            # Use the configuration this objective was built with, not the
            # notebook-level `conf` global, so the object is self-contained.
            num_boost_round=self.conf.num_boost_round,
        )
        directory = f"{self.conf.job_dir}/trial_{hist['trial_id']}"
        pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
        b.save_model(f"{directory}/model.txt")
        # Convert raw predictions to ranks; "ordinal" gives every row a distinct rank.
        y_pred = rankdata(b.predict(self.x_val), method="ordinal")
        # Map text -> rank; if a text occurs more than once, the last row wins.
        preds = dict(zip(self.val_texts, y_pred))
        hist["score"] = val_score(preds, validation_data=self.validation_data)
        self.history.append(hist)
        return hist["score"]

In [9]:
# The objective is built once (it owns the training Dataset); Optuna then
# maximizes the validation score that each call returns.
obj = LgbObjective(train=train, val=val, validation_data=vd, conf=conf)
study = optuna.create_study(direction="maximize")
study.optimize(obj, n_trials=conf.n_trials)

[32m[I 2021-12-26 12:42:15,855][0m A new study created in memory with name: no-name-d10368e6-5f9d-47b0-b200-8245220a1b92[0m


[LightGBM] [Info] Total Bins 11448
[LightGBM] [Info] Number of data points in the train set: 5710, number of used features: 46


[32m[I 2021-12-26 12:42:17,764][0m Trial 0 finished with value: 0.6996479341038927 and parameters: {'feature_fraction': 1.0, 'lr': 0.01}. Best is trial 0 with value: 0.6996479341038927.[0m


In [10]:
# Trial history, best score first, saved into the job directory.
df = pd.DataFrame.from_records(obj.history).sort_values(
    "score", ascending=False, ignore_index=True
)
_path = f"{job_dir}/cv.csv"
df.to_csv(_path, index=False)
print(f"Saved {_path}")
df.head(conf.n_trials)

Saved models/lgb/20211226_124215/cv.csv


Unnamed: 0,trial_id,feature_fraction,lr,score
0,0,1.0,0.01,0.699648


In [11]:
# Inputs for the final fit on the full training frame.
x_train = train[conf.features].to_numpy()
y_train = train[conf.label].to_numpy()

# Sorted label values followed by len(train) + 1.
label_gain = sorted(train[conf.label])
label_gain.append(len(train) + 1)
print(f"label_gain min={min(label_gain)}, max={max(label_gain)}")

# Sizes of the contiguous query groups (train is already sorted by query).
group = group_sizes(train[conf.query])
print(f"group={group}")

label_gain min=1, max=5711
group=[5710]


In [12]:
%%time
# Top row of the trial-history frame, i.e. the best-scoring trial.
# NOTE: `df` is rebound to the feature-importance table in a later cell, so this
# cell must run before that one.
best = df.iloc[0]
_params = {
    'objective': conf.objective,
    'feature_fraction': best['feature_fraction'],
    'learning_rate': best['lr'],
    "label_gain": label_gain,
    "force_col_wise": True,
    "verbose": 1,
}
_train_set = lgb.Dataset(x_train, label=y_train, group=group)
b = lgb.train(_params, _train_set, num_boost_round=conf.num_boost_round)
_path = f"{job_dir}/model.txt"
b.save_model(_path)
print(f"Saved {_path}")

[LightGBM] [Info] Total Bins 11448
[LightGBM] [Info] Number of data points in the train set: 5710, number of used features: 46
Saved models/lgb/20211226_124215/model.txt
Wall time: 1.81 s


In [13]:
%%time
# One importance value per training column; x_train was built from conf.features,
# which is the same list object as `features`, so positions line up.
scores = b.feature_importance()
assert len(scores) == len(features)
rows = [
    {'importance': score, 'feature': name}
    for score, name in zip(scores, features)
]
df = pd.DataFrame.from_records(rows).sort_values(
    'importance', ascending=False, ignore_index=True
)
_path = f"{job_dir}/features.csv"
df.to_csv(_path, index=True)
print(f"Saved {_path}")
# Transposed so the ranking reads left to right in two rows.
df.T.head()

Saved models/lgb/20211226_124215/features.csv
Wall time: 6 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45
importance,907,858,837,778,712,672,646,555,547,545,515,503,481,477,430,391,339,325,312,279,258,258,237,221,214,210,198,197,186,175,157,154,149,146,143,137,136,131,130,129,110,76,61,41,37,0
feature,hb_hatebert_abu,dtu_severe_toxicity,dto_obscene,dto_identity_attack,dto_insult,words_per_sent,dtu_insult,dtu_obscene,te_xlm_roberta_snt_neg,dto_toxicity,te_roberta_snt_neg,length,te_roberta_emo_anger,te_roberta_off,dto_severe_toxicity,syllables_per_sent,dtu_identity_attack,dtu_toxicity,hb_hatebert_off,dtm_identity_attack,dtm_toxicity,hb_bert_abu,te_roberta_iro,hb_bert_off,dtm_threat,upper_frac,dto_threat,punc_frac,dtu_threat,dale_chall_readability_score,linsear_write_formula,dtu_sexual_explicit,automated_readability_index,syllables_per_word,flesch_kincaid_grade,coleman_liau_index,letter_frac,smog_index,space_frac,dtm_severe_toxicity,gunning_fog,dtm_insult,dtm_sexual_explicit,flesch_reading_ease,dtm_obscene,digit_frac
