In [1]:
import os
import pathlib
from datetime import datetime
import optuna
import pandas as pd
import numpy as np
from scipy.stats import rankdata
import lightgbm as lgb
import torch
from typing import List, Dict, Union, Tuple, NamedTuple
from tqdm import tqdm
import scml

In [2]:
# Every run writes its artefacts into a fresh, timestamped directory.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
job_dir = f"models/lgb/{ts}"
pathlib.Path(job_dir).mkdir(parents=True, exist_ok=True)

# Embedding columns are named zz0000 .. zz0383.
em_size = 384
em_cols = [f"zz{k:04d}" for k in range(em_size)]

# Character-level statistics.
char_fs = [
    "length",
    "digit_frac",
    "letter_frac",
    "space_frac",
    "punc_frac",
    "upper_frac",
]

# Readability metrics.
textstat_fs = [
    "syllable_count",
    "lexicon_count",
    "sentence_count",
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "gunning_fog",
    "smog_index",
    "automated_readability_index",
    "coleman_liau_index",
    "linsear_write_formula",
    "dale_chall_readability_score",
]

# Toxicity scores with dto_/dtu_/dtm_ prefixes (list order preserved as written).
dtfy_fs = [
    "dto_toxicity",
    "dto_severe_toxicity",
    "dto_obscene",
    "dto_threat",
    "dto_insult",
    "dto_identity_attack",
    "dtu_toxicity",
    "dtu_severe_toxicity",
    "dtu_obscene",
    "dtu_identity_attack",
    "dtu_insult",
    "dtu_threat",
    "dtu_sexual_explicit",
    "dtm_toxicity",
    "dtm_severe_toxicity",
    "dtm_obscene",
    "dtm_identity_attack",
    "dtm_insult",
    "dtm_threat",
    "dtm_sexual_explicit",
]

hatebert_fs = [
    "hb_bert_off",
    "hb_bert_abu",
    "hb_hatebert_off",
    "hb_hatebert_abu",
]

tweeteval_fs = [
    "te_roberta_off",
    "te_roberta_emo_anger",
    "te_roberta_snt_neg",
    "te_roberta_iro",
]

# Full model input: every feature family, in alphabetical order.
features = sorted(
    [*char_fs, *textstat_fs, *dtfy_fs, *hatebert_fs, *tweeteval_fs, *em_cols]
)
print(f"{len(features)} features")


class Conf(NamedTuple):
    """Immutable run configuration shared by the tuning and final-training cells.

    Defaults for `job_dir` and `features` are captured from the module-level
    variables of the same name at class-definition time.
    """
    # Output directory for models and CSV reports of this run.
    job_dir: str = job_dir
    # CUDA when available, otherwise CPU; only used for the GPU report printed below.
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Number of boosting iterations passed to lgb.train.
    num_boost_round: int = 500
    # (low, high) bounds of the log-uniform learning-rate search.
    lr: Tuple[float, float] = (1e-3, 1e-3)
    # (low, high) bounds of the uniform feature_fraction search.
    feature_fraction: Tuple[float, float] = (1, 1)
    # Name of the target column in the training frame.
    label: str = "label"
    # Name of the column whose runs of equal values define the ranking groups.
    query: str = "worker"
    # LightGBM objective.
    objective: str = "lambdarank"
    # Number of Optuna trials to run.
    n_trials: int = 1
    # Columns fed to the model, in this order.
    features: List[str] = features
        

conf = Conf()
print(conf)
if conf.device.type == 'cuda':
    # Report which GPU is visible and how much of its memory is in use
    # (bytes divided by 1024**3).
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    for caption, n_bytes in (
        ('Allocated:', torch.cuda.memory_allocated(0)),
        ('Cached:   ', torch.cuda.memory_reserved(0)),
    ):
        print(caption, round(n_bytes/1024**3,1), 'GB')

429 features
Conf(job_dir='models/lgb/20211223_124026', device=device(type='cuda'), num_boost_round=500, lr=(0.001, 0.001), feature_fraction=(1, 1), label='label', query='worker', objective='lambdarank', n_trials=1, features=['automated_readability_index', 'coleman_liau_index', 'dale_chall_readability_score', 'digit_frac', 'dtm_identity_attack', 'dtm_insult', 'dtm_obscene', 'dtm_severe_toxicity', 'dtm_sexual_explicit', 'dtm_threat', 'dtm_toxicity', 'dto_identity_attack', 'dto_insult', 'dto_obscene', 'dto_severe_toxicity', 'dto_threat', 'dto_toxicity', 'dtu_identity_attack', 'dtu_insult', 'dtu_obscene', 'dtu_severe_toxicity', 'dtu_sexual_explicit', 'dtu_threat', 'dtu_toxicity', 'flesch_kincaid_grade', 'flesch_reading_ease', 'gunning_fog', 'hb_bert_abu', 'hb_bert_off', 'hb_hatebert_abu', 'hb_hatebert_off', 'length', 'letter_frac', 'lexicon_count', 'linsear_write_formula', 'punc_frac', 'sentence_count', 'smog_index', 'space_frac', 'syllable_count', 'te_roberta_emo_anger', 'te_roberta_iro'

In [3]:
# pandas options, applied in the order listed.
for option_name, option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ('max_colwidth', 9999),
):
    pd.set_option(option_name, option_value)
tqdm.pandas()
# Project helper; presumably seeds the RNGs used in this notebook -- confirm in scml.
scml.seed_everything()

In [4]:
train = pd.read_parquet("input/tra.parquet")
# group_sizes() counts runs of consecutive equal values, so rows of the same
# query group must be contiguous. Sort by the configured query column (not a
# hard-coded name) so the ordering always matches the column the groups are
# later derived from; ignore_index=True resets the index to 0..n-1.
train = train.sort_values(conf.query, ignore_index=True)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 432 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    label                         5710 non-null   int32  
 1    bws                           5710 non-null   float32
 2    worker                        5710 non-null   int8   
 3    length                        5710 non-null   int16  
 4    digit_frac                    5710 non-null   float32
 5    letter_frac                   5710 non-null   float32
 6    space_frac                    5710 non-null   float32
 7    punc_frac                     5710 non-null   float32
 8    upper_frac                    5710 non-null   float32
 9    flesch_reading_ease           5710 non-null   float32
 10   flesch_kincaid_grade          5710 non-null   float32
 11   syllable_count                5710 non-null   int16  
 12   lexicon_count                 5710 non-null   

In [5]:
# Feature table for the validation texts; LgbObjective reads its `text` column
# and the `conf.features` columns from it.
val = pd.read_parquet("input/val.parquet")
val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 430 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    text                          14251 non-null  object 
 1    length                        14251 non-null  int16  
 2    digit_frac                    14251 non-null  float32
 3    letter_frac                   14251 non-null  float32
 4    space_frac                    14251 non-null  float32
 5    punc_frac                     14251 non-null  float32
 6    upper_frac                    14251 non-null  float32
 7    flesch_reading_ease           14251 non-null  float32
 8    flesch_kincaid_grade          14251 non-null  float32
 9    syllable_count                14251 non-null  int16  
 10   lexicon_count                 14251 non-null  int16  
 11   sentence_count                14251 non-null  int16  
 12   gunning_fog                   14251 non-null

In [6]:
# Pairwise annotations: val_score() reads the `less_toxic` and `more_toxic`
# text of every row and checks that the predictions order them accordingly.
vd = pd.read_csv("input/validation_data.csv", engine="c", low_memory=False)
vd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30108 entries, 0 to 30107
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   worker      30108 non-null  int64 
 1   less_toxic  30108 non-null  object
 2   more_toxic  30108 non-null  object
dtypes: int64(1), object(2)
memory usage: 705.8+ KB


In [7]:
def group_sizes(groups: List[int]) -> List[int]:
    """Return the length of each run of consecutive equal values in `groups`.

    The input must already be ordered so that members of one group are
    adjacent (e.g. sorted); equal values separated by other values are counted
    as separate groups.

    Args:
        groups: group id of every row, in row order. Any iterable works
            (list, tuple, generator, pandas Series); values are visited in
            iteration order, so a pandas index is irrelevant.

    Returns:
        One size per run, in order of appearance. An empty input yields an
        empty list.
    """
    sizes: List[int] = []
    previous = None
    for value in groups:
        # `not sizes` covers the very first element, for which there is no
        # predecessor to compare against.
        if not sizes or value != previous:
            sizes.append(1)
        else:
            sizes[-1] += 1
        previous = value
    return sizes


def val_score(preds: Dict[str, int], validation_data: pd.DataFrame) -> float:
    """Fraction of annotated pairs that `preds` puts in the right order.

    Args:
        preds: predicted value for every text appearing in `validation_data`.
        validation_data: frame with `less_toxic` and `more_toxic` text columns.

    Returns:
        Mean over all rows of 1 when the `less_toxic` text has a strictly
        smaller prediction than the `more_toxic` text, else 0 (ties count as 0).

    Raises:
        KeyError: if a text of `validation_data` is missing from `preds`.
    """
    text_pairs = zip(validation_data["less_toxic"], validation_data["more_toxic"])
    hits = [1 if preds[less] < preds[more] else 0 for less, more in text_pairs]
    return np.mean(hits)

In [8]:
class LgbObjective:
    """Optuna objective: train a LightGBM model and score it on pairwise validation data.

    The training dataset and the validation feature matrix are built once in
    the constructor; every call trains one model with the hyper-parameters
    suggested by the trial, saves it under `<job_dir>/trial_<n>/model.txt`
    and returns its `val_score`.

    Attributes:
        conf: run configuration; the only source of settings for this object.
        label_gain: list passed as LightGBM's `label_gain` parameter.
        ds: the LightGBM training dataset (features, labels, group sizes).
        val_texts: `text` column of `val`, aligned with the rows of `x_val`.
        x_val: validation feature matrix.
        validation_data: pairwise annotations handed to `val_score`.
        history: one dict per finished trial (trial_id, feature_fraction, lr, score).
    """

    def __init__(
        self,
        train,
        val,
        validation_data,
        conf: Conf,
    ):
        """Prepare the training dataset and validation inputs.

        Args:
            train: training frame holding the `conf.features`, `conf.label`
                and `conf.query` columns; rows of one query group must be
                contiguous.
            val: validation frame holding a `text` column and the
                `conf.features` columns.
            validation_data: frame with `less_toxic` / `more_toxic` columns.
            conf: run configuration.
        """
        self.conf = conf
        x_train = train[self.conf.features].to_numpy()
        y_train = train[self.conf.label].to_numpy()
        # Sorted label values plus one extra entry, len(train) + 1.
        # NOTE(review): LightGBM indexes label_gain by integer label value, so
        # this presumably relies on the labels being distinct ranks -- confirm.
        self.label_gain = sorted(train[self.conf.label])
        self.label_gain.append(len(train) + 1)
        # Materialise the column as a plain list so the run-length count is
        # positional and does not depend on the frame's index labels.
        group = group_sizes(list(train[self.conf.query]))
        self.ds = lgb.Dataset(x_train, label=y_train, group=group)
        self.val_texts = list(val["text"])
        self.x_val = val[self.conf.features].to_numpy()
        self.validation_data = validation_data
        self.history: List[Dict[str, Union[str, int, float]]] = []

    def __call__(self, trial):
        """Run one trial and return its validation score (higher is better)."""
        hist = {
            "trial_id": trial.number,
            "feature_fraction": trial.suggest_uniform(
                "feature_fraction", self.conf.feature_fraction[0], self.conf.feature_fraction[1]
            ),
            "lr": trial.suggest_loguniform(
                "lr", self.conf.lr[0], self.conf.lr[1]
            ),
        }
        b = lgb.train(
            {
                'objective': self.conf.objective,
                #'lambda_l1': 1,
                'feature_fraction': hist['feature_fraction'],
                'learning_rate': hist['lr'],
                "label_gain": self.label_gain,
                "force_col_wise": True,
                "verbose": 1,
            },
            self.ds,
            # Use the configuration this object was constructed with, not
            # whatever global `conf` happens to exist in the notebook.
            num_boost_round=self.conf.num_boost_round,
        )
        directory = f"{self.conf.job_dir}/trial_{hist['trial_id']}"
        pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
        b.save_model(f"{directory}/model.txt")
        # Only the ordering matters for val_score, so convert raw predictions
        # to ranks; "ordinal" gives every row a distinct rank.
        y_pred = rankdata(b.predict(self.x_val), method="ordinal")
        # Text -> rank lookup; for a repeated text the last row wins.
        preds = dict(zip(self.val_texts, y_pred))
        hist["score"] = val_score(preds, validation_data=self.validation_data)
        self.history.append(hist)
        return hist["score"]

In [9]:
# Build the objective once, then let Optuna search for the configuration with
# the highest validation score.
obj = LgbObjective(train=train, val=val, validation_data=vd, conf=conf)
study = optuna.create_study(direction="maximize")
study.optimize(obj, n_trials=conf.n_trials)

[32m[I 2021-12-23 12:40:26,751][0m A new study created in memory with name: no-name-e7b6dd4d-3349-45c6-8b55-10a1690bc83a[0m


[LightGBM] [Info] Total Bins 108712
[LightGBM] [Info] Number of data points in the train set: 5710, number of used features: 429


[32m[I 2021-12-23 12:40:32,360][0m Trial 0 finished with value: 0.6877906204331075 and parameters: {'feature_fraction': 1.0, 'lr': 0.001}. Best is trial 0 with value: 0.6877906204331075.[0m


In [10]:
# One row per finished trial, best score first; the final-training cell reads
# the top row of this frame.
df = pd.DataFrame.from_records(obj.history).sort_values(
    "score", ascending=False, ignore_index=True
)
_path = f"{job_dir}/cv.csv"
df.to_csv(_path, index=False)
print(f"Saved {_path}")
df.head(conf.n_trials)

Saved models/lgb/20211223_124026/cv.csv


Unnamed: 0,trial_id,feature_fraction,lr,score
0,0,1.0,0.001,0.687791


In [11]:
# Inputs for the final fit on the full training frame.
x_train, y_train = train[conf.features].to_numpy(), train[conf.label].to_numpy()
# Sorted label values followed by one extra entry, len(train) + 1.
label_gain = sorted(train[conf.label]) + [len(train) + 1]
print(f"label_gain min={min(label_gain)}, max={max(label_gain)}")
# Sizes of the consecutive query groups (train is sorted by the query column).
group = group_sizes(train[conf.query])
print(f"group={group}")

label_gain min=1, max=5711
group=[5710]


In [12]:
%%time
best = df.iloc[0]
b = lgb.train(
    {
        'objective': conf.objective,
        #'lambda_l1': 1,
        'feature_fraction': best['feature_fraction'],
        'learning_rate': best['lr'],
        "label_gain": label_gain,
        "force_col_wise": True,
        "verbose": 1,
    },
    lgb.Dataset(x_train, label=y_train, group=group),
    num_boost_round=conf.num_boost_round,
)
_path = f"{job_dir}/model.txt"
b.save_model(_path)
print(f"Saved {_path}")

[LightGBM] [Info] Total Bins 108712
[LightGBM] [Info] Number of data points in the train set: 5710, number of used features: 429
Saved models/lgb/20211223_124026/model.txt
Wall time: 5.66 s


In [13]:
%%time
scores = b.feature_importance()
assert len(scores) == len(features)
rows = []
for i, score in enumerate(scores):
    row = {'importance': score, 'feature': features[i]}
    rows.append(row)
df = pd.DataFrame.from_records(rows)
df.sort_values('importance', ascending=False, inplace=True, ignore_index=True)
_path = f"{job_dir}/features.csv"
df.to_csv(_path, index=True)
print(f"Saved {_path}")
df.T.head()

Saved models/lgb/20211223_124026/features.csv
Wall time: 13 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428
importance,1402,828,551,519,507,485,347,257,246,227,218,215,215,208,177,173,163,159,158,148,139,136,135,134,133,117,116,113,109,106,103,97,97,86,85,81,77,75,74,74,71,70,68,67,67,67,65,63,61,61,60,59,57,56,55,54,53,52,50,50,50,50,49,46,45,45,45,45,45,44,43,42,42,42,42,42,42,41,41,41,41,40,39,39,39,39,39,39,38,38,37,37,37,37,36,36,36,33,33,31,31,31,30,30,30,29,29,29,28,28,28,28,28,28,27,27,27,26,26,26,26,25,25,25,25,25,25,25,24,24,24,23,23,23,22,21,21,21,21,20,20,20,20,20,20,19,19,19,19,19,19,19,19,18,18,18,18,18,18,18,17,17,17,17,16,16,16,16,16,16,16,16,15,15,15,15,15,15,15,15,15,14,14,14,14,13,13,13,13,13,13,13,12,12,12,12,12,12,12,12,12,12,12,12,11,11,11,11,11,11,11,11,11,11,11,11,11,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
feature,dto_insult,dtm_severe_toxicity,zz0341,te_roberta_off,dto_obscene,dto_identity_attack,dto_severe_toxicity,dto_toxicity,zz0285,lexicon_count,te_roberta_emo_anger,dtm_obscene,zz0118,zz0326,hb_bert_off,zz0054,zz0172,zz0031,dtu_toxicity,zz0231,length,zz0334,dtu_obscene,zz0002,zz0197,zz0380,zz0262,zz0275,te_roberta_snt_neg,zz0200,dtm_toxicity,zz0120,dtu_severe_toxicity,coleman_liau_index,zz0317,zz0021,zz0339,dtu_threat,zz0255,zz0368,zz0190,zz0191,te_roberta_iro,zz0330,zz0222,zz0319,zz0188,zz0214,zz0121,zz0307,zz0015,zz0357,zz0161,zz0033,zz0273,zz0203,zz0095,zz0346,zz0299,hb_bert_abu,zz0361,zz0117,zz0150,punc_frac,zz0066,zz0004,zz0163,zz0181,zz0100,zz0179,zz0370,zz0144,zz0048,zz0248,hb_hatebert_off,zz0332,zz0229,zz0376,zz0354,zz0240,zz0258,zz0105,zz0029,dtm_sexual_explicit,zz0201,zz0189,zz0098,zz0216,zz0359,zz0233,zz0108,zz0265,zz0338,zz0225,zz0374,zz0023,zz0342,zz0219,zz0223,hb_hatebert_abu,zz0097,zz0065,zz0130,zz0129,zz0017,zz0034,zz0156,zz0006,gunning_fog,zz0220,zz0001,zz0152,zz0151,zz0165,zz0280,zz0162,zz0348,dtu_insult,zz0194,zz0158,zz0300,zz0353,zz0020,zz0377,dtu_identity_attack,zz0287,zz0230,zz0232,zz0291,zz0060,zz0218,zz0294,automated_readability_index,zz0042,zz0113,zz0344,zz0314,zz0160,zz0378,zz0119,zz0115,zz0096,zz0363,zz0209,zz0141,zz0124,zz0043,zz0251,zz0367,zz0327,zz0288,zz0264,zz0016,zz0309,zz0185,zz0180,zz0003,zz0080,zz0242,zz0366,zz0049,zz0044,zz0328,zz0228,zz0358,zz0213,zz0139,zz0343,zz0091,zz0041,dtm_threat,zz0122,zz0252,zz0263,zz0237,zz0236,zz0111,zz0310,zz0254,zz0140,zz0375,linsear_write_formula,zz0192,zz0019,zz0027,zz0311,zz0109,zz0010,zz0159,zz0143,zz0053,zz0250,zz0058,zz0301,zz0204,zz0186,zz0289,zz0062,zz0247,zz0246,zz0371,zz0128,zz0293,flesch_reading_ease,zz0018,zz0166,zz0072,zz0199,zz0147,zz0047,zz0164,zz0335,zz0127,zz0304,zz0099,zz0360,zz0055,zz0078,zz0270,zz0260,zz0168,zz0315,zz0142,zz0136,zz0306,zz0373,zz0244,zz0208,zz0379,zz0013,zz0183,zz0350,zz0052,zz0126,zz0355,zz0356,zz0059,zz0077,zz0157,zz0323,zz0014,zz0082,zz0145,zz0211,zz0196,zz02
49,zz0067,zz0336,zz0112,zz0133,zz0009,zz0253,zz0040,zz0295,zz0125,zz0056,zz0134,zz0308,zz0272,zz0030,zz0212,zz0193,zz0269,zz0081,zz0005,zz0312,zz0227,zz0226,zz0331,zz0276,zz0103,zz0039,zz0274,zz0176,syllable_count,zz0069,zz0234,zz0036,zz0154,zz0305,zz0365,zz0028,zz0202,zz0267,zz0351,zz0022,zz0171,zz0101,zz0290,zz0068,zz0277,zz0313,zz0170,zz0224,zz0364,zz0046,zz0051,zz0024,zz0292,zz0241,zz0089,zz0037,dtm_insult,zz0079,zz0207,zz0102,zz0210,zz0337,zz0302,zz0259,zz0057,zz0148,sentence_count,zz0110,zz0329,zz0352,zz0074,zz0132,zz0073,zz0007,zz0303,zz0349,zz0381,zz0169,zz0131,zz0116,zz0155,zz0175,zz0177,zz0178,zz0025,zz0035,zz0038,zz0045,zz0205,zz0146,space_frac,zz0221,zz0061,zz0167,zz0012,zz0266,dtu_sexual_explicit,zz0090,zz0257,zz0076,zz0149,zz0345,zz0087,zz0084,zz0092,zz0333,zz0094,zz0064,zz0075,zz0050,zz0071,zz0137,zz0070,zz0123,zz0086,zz0206,upper_frac,zz0239,zz0238,dtm_identity_attack,zz0298,smog_index,dto_threat,zz0243,zz0271,zz0187,zz0284,zz0283,zz0282,zz0182,zz0083,dale_chall_readability_score,digit_frac,zz0382,zz0011,zz0063,flesch_kincaid_grade,zz0085,letter_frac,zz0372,zz0000,zz0369,zz0362,zz0032,zz0026,zz0008,zz0279,zz0347,zz0088,zz0174,zz0184,zz0195,zz0198,zz0215,zz0217,zz0235,zz0297,zz0296,zz0245,zz0256,zz0261,zz0268,zz0286,zz0281,zz0173,zz0316,zz0318,zz0278,zz0340,zz0093,zz0104,zz0106,zz0107,zz0114,zz0135,zz0153,zz0138,zz0325,zz0324,zz0322,zz0321,zz0320,zz0383
