In [1]:
import os
import pathlib
from datetime import datetime
import optuna
import pandas as pd
import numpy as np
from scipy.stats import rankdata
import lightgbm as lgb
import torch
from typing import List, Dict, Union, Tuple, NamedTuple
from tqdm import tqdm
import scml

In [2]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)
if device.type == 'cuda':
    # Describe GPU 0 and report its current memory footprint (bytes -> GiB).
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / (1 << 30), 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / (1 << 30), 1), 'GB')

Using device: cuda
NVIDIA GeForce GTX 1060 6GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
# Option keys are spelled out in full: pandas resolves abbreviated keys by
# pattern matching, which can become ambiguous when new options are added.
# NOTE(review): recent pandas releases deprecate `mode.use_inf_as_na` --
# confirm against the pandas version this notebook is pinned to.
pd.set_option("mode.use_inf_as_na", True)
# Lift the display limits so wide frames are shown in full.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()
# Project helper; presumably seeds the random number generators -- verify in scml.
scml.seed_everything()

In [4]:
# Each run writes its artifacts into its own timestamp-named directory.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
job_dir = f"models/lgb/{ts}"
os.makedirs(job_dir, exist_ok=True)
print(f"job_dir={job_dir}")

job_dir=models/lgb/20211217_093928


In [5]:
class Conf(NamedTuple):
    """Immutable settings for one LightGBM ranking job in this notebook."""

    # Number of boosting rounds handed to `lgb.train`.
    num_boost_round: int
    # (low, high) bounds of the learning-rate search range.
    lr: Tuple[float, float]
    # (low, high) bounds of the `feature_fraction` search range.
    feature_fraction: Tuple[float, float]
    # Names of the DataFrame columns used as model inputs.
    features: List[str]
    # Name of the column holding the training labels.
    label: str
    # Name of the column whose values define the query groups.
    query: str
    # Directory under which models and reports of this job are saved.
    job_dir: str
    # Number of Optuna trials to run.
    n_trials: int = 1


# Model inputs: `em_size` zero-padded `zz....` columns plus six named
# per-text columns, kept in sorted order.
em_size = 384
em_cols = [f"zz{i:04d}" for i in range(em_size)]
char_fs = [
    "length",
    "digit_frac",
    "letter_frac",
    "space_frac",
    "punc_frac",
    "upper_frac",
]
features = sorted(char_fs + em_cols)
print(f"{len(features)} features\n{features}")

# Both search ranges are degenerate (low == high), so the single trial runs
# with exactly these values.
conf = Conf(
    num_boost_round=400,
    lr=(1e-2, 1e-2),
    feature_fraction=(1, 1),
    features=features,
    label="label",
    query="worker",
    job_dir=job_dir,
    n_trials=1,
)

390 features
['digit_frac', 'length', 'letter_frac', 'punc_frac', 'space_frac', 'upper_frac', 'zz0000', 'zz0001', 'zz0002', 'zz0003', 'zz0004', 'zz0005', 'zz0006', 'zz0007', 'zz0008', 'zz0009', 'zz0010', 'zz0011', 'zz0012', 'zz0013', 'zz0014', 'zz0015', 'zz0016', 'zz0017', 'zz0018', 'zz0019', 'zz0020', 'zz0021', 'zz0022', 'zz0023', 'zz0024', 'zz0025', 'zz0026', 'zz0027', 'zz0028', 'zz0029', 'zz0030', 'zz0031', 'zz0032', 'zz0033', 'zz0034', 'zz0035', 'zz0036', 'zz0037', 'zz0038', 'zz0039', 'zz0040', 'zz0041', 'zz0042', 'zz0043', 'zz0044', 'zz0045', 'zz0046', 'zz0047', 'zz0048', 'zz0049', 'zz0050', 'zz0051', 'zz0052', 'zz0053', 'zz0054', 'zz0055', 'zz0056', 'zz0057', 'zz0058', 'zz0059', 'zz0060', 'zz0061', 'zz0062', 'zz0063', 'zz0064', 'zz0065', 'zz0066', 'zz0067', 'zz0068', 'zz0069', 'zz0070', 'zz0071', 'zz0072', 'zz0073', 'zz0074', 'zz0075', 'zz0076', 'zz0077', 'zz0078', 'zz0079', 'zz0080', 'zz0081', 'zz0082', 'zz0083', 'zz0084', 'zz0085', 'zz0086', 'zz0087', 'zz0088', 'zz0089', 'zz009

In [6]:
train = pd.read_parquet("input/ruddit.parquet")
# Group sizes are later derived from consecutive runs of `conf.query`, so the
# rows of one query group must be contiguous. Sort by the configured query
# column itself (not a hard-coded copy of its name) so the sort key and the
# grouping key cannot drift apart.
train = train.sort_values(conf.query, ignore_index=True)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 393 columns):
 #    Column       Non-Null Count  Dtype  
---   ------       --------------  -----  
 0    label        5710 non-null   int32  
 1    bws          5710 non-null   float32
 2    worker       5710 non-null   int8   
 3    length       5710 non-null   int16  
 4    digit_frac   5710 non-null   float32
 5    letter_frac  5710 non-null   float32
 6    space_frac   5710 non-null   float32
 7    punc_frac    5710 non-null   float32
 8    upper_frac   5710 non-null   float32
 9    zz0000       5710 non-null   float32
 10   zz0001       5710 non-null   float32
 11   zz0002       5710 non-null   float32
 12   zz0003       5710 non-null   float32
 13   zz0004       5710 non-null   float32
 14   zz0005       5710 non-null   float32
 15   zz0006       5710 non-null   float32
 16   zz0007       5710 non-null   float32
 17   zz0008       5710 non-null   float32
 18   zz0009       5710 non-null

In [7]:
# Validation frame: the raw `text` of each comment plus the model feature columns.
val = pd.read_parquet("input/val.parquet")
val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 391 columns):
 #    Column       Non-Null Count  Dtype  
---   ------       --------------  -----  
 0    text         14251 non-null  object 
 1    length       14251 non-null  int16  
 2    digit_frac   14251 non-null  float32
 3    letter_frac  14251 non-null  float32
 4    space_frac   14251 non-null  float32
 5    punc_frac    14251 non-null  float32
 6    upper_frac   14251 non-null  float32
 7    zz0000       14251 non-null  float32
 8    zz0001       14251 non-null  float32
 9    zz0002       14251 non-null  float32
 10   zz0003       14251 non-null  float32
 11   zz0004       14251 non-null  float32
 12   zz0005       14251 non-null  float32
 13   zz0006       14251 non-null  float32
 14   zz0007       14251 non-null  float32
 15   zz0008       14251 non-null  float32
 16   zz0009       14251 non-null  float32
 17   zz0010       14251 non-null  float32
 18   zz0011       14251 non-n

In [8]:
# Annotated pairs: each row names a `less_toxic` and a `more_toxic` text for one `worker`.
vd = pd.read_csv("input/validation_data.csv", engine="c", low_memory=False)
vd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30108 entries, 0 to 30107
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   worker      30108 non-null  int64 
 1   less_toxic  30108 non-null  object
 2   more_toxic  30108 non-null  object
dtypes: int64(1), object(2)
memory usage: 705.8+ KB


In [9]:
def group_sizes(groups: List[int]) -> List[int]:
    """Return the length of each run of equal consecutive values in ``groups``.

    The input must already be sorted (or at least have equal keys adjacent);
    every change of value starts a new group.

    Args:
        groups: Group keys in row order. Any iterable is accepted; it is
            materialized to a list first so the scan is positional and does
            not depend on the index labels of a pandas Series.

    Returns:
        One size per group, in order of appearance. Empty input yields an
        empty list.
    """
    keys = list(groups)
    if not keys:
        # No rows means no groups (not a single group of size 1).
        return []
    res = []
    n = 1
    for prev, cur in zip(keys, keys[1:]):
        if cur != prev:
            res.append(n)
            n = 0
        n += 1
    res.append(n)
    return res


def val_score(preds: Dict[str, int], validation_data: pd.DataFrame) -> float:
    """Fraction of annotated pairs that the predictions order correctly.

    A row of ``validation_data`` counts as correct when the prediction for its
    ``less_toxic`` text is strictly smaller than the prediction for its
    ``more_toxic`` text; ties count as wrong.
    """
    hits = [
        1 if preds[pair.less_toxic] < preds[pair.more_toxic] else 0
        for pair in validation_data.itertuples()
    ]
    return np.mean(hits)

In [10]:
class LgbObjective:
    """Optuna objective that trains a LightGBM lambdarank model per trial.

    Each call samples `feature_fraction` and `lr` from the ranges in `conf`,
    trains on the full training frame, saves the model under
    `<job_dir>/trial_<id>/model.txt`, and returns the pairwise score of the
    ranked validation predictions (see `val_score`). One record per trial is
    appended to `self.history`.
    """

    def __init__(
        self,
        train,
        val,
        validation_data,
        conf: Conf,
    ):
        """Build the training dataset and cache the validation inputs.

        Args:
            train: Training frame holding `conf.features`, `conf.label` and
                `conf.query`; rows must already be sorted by `conf.query`.
            val: Validation frame holding a `text` column and `conf.features`.
            validation_data: Frame of `less_toxic` / `more_toxic` text pairs.
            conf: Job settings.
        """
        self.conf = conf
        x_train = train[self.conf.features].to_numpy()
        y_train = train[self.conf.label].to_numpy()
        # Gain table passed to LightGBM: the sorted training labels followed by
        # one extra entry, len(train) + 1.
        # NOTE(review): LightGBM looks gains up by integer label value, so this
        # presumably relies on labels being non-negative ints <= len(train) -- confirm.
        self.label_gain = sorted(train[self.conf.label])
        self.label_gain.append(len(train) + 1)
        group = group_sizes(train[self.conf.query])
        self.ds = lgb.Dataset(x_train, label=y_train, group=group)
        self.val_texts = list(val["text"])
        self.x_val = val[self.conf.features].to_numpy()
        self.validation_data = validation_data
        self.history: List[Dict[str, Union[str, int, float]]] = []

    def __call__(self, trial):
        """Run one trial and return its validation score (higher is better)."""
        hist = {
            "trial_id": trial.number,
            "feature_fraction": trial.suggest_uniform(
                "feature_fraction", self.conf.feature_fraction[0], self.conf.feature_fraction[1]
            ),
            "lr": trial.suggest_loguniform(
                "lr", self.conf.lr[0], self.conf.lr[1]
            ),
        }
        b = lgb.train(
            {
                'objective': "lambdarank",
                #'lambda_l1': 1,
                'feature_fraction': hist['feature_fraction'],
                'learning_rate': hist['lr'],
                "label_gain": self.label_gain,
                "force_col_wise": True,
                "verbose": 1,
            },
            self.ds,
            # Read from the objective's own config, not the notebook-global
            # `conf`, so the object honours whatever Conf it was built with.
            num_boost_round=self.conf.num_boost_round,
        )
        directory = f"{self.conf.job_dir}/trial_{hist['trial_id']}"
        pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
        b.save_model(f"{directory}/model.txt")
        # Only the relative order matters for the pairwise score: convert raw
        # scores to ordinal ranks (ties broken by position).
        y_pred = rankdata(b.predict(self.x_val), method="ordinal")
        # Map each validation text to its rank; a repeated text keeps the rank
        # of its last occurrence.
        preds = dict(zip(self.val_texts, y_pred))
        hist["score"] = val_score(preds, validation_data=self.validation_data)
        self.history.append(hist)
        return hist["score"]

In [11]:
# Hyper-parameter search: Optuna maximizes the score returned by the objective.
obj = LgbObjective(train, val, vd, conf)
study = optuna.create_study(direction="maximize")
study.optimize(obj, n_trials=conf.n_trials)

[32m[I 2021-12-17 09:39:29,245][0m A new study created in memory with name: no-name-a3e11044-e23a-4579-8657-e148cfdbf3df[0m


[LightGBM] [Info] Total Bins 99450
[LightGBM] [Info] Number of data points in the train set: 5710, number of used features: 390


[32m[I 2021-12-17 09:39:33,922][0m Trial 0 finished with value: 0.6226584296532484 and parameters: {'feature_fraction': 1.0, 'lr': 0.01}. Best is trial 0 with value: 0.6226584296532484.[0m


In [12]:
# Tabulate the trials best-first and store the table in the job directory.
df = pd.DataFrame.from_records(obj.history).sort_values(
    "score", ascending=False, ignore_index=True
)
_path = f"{job_dir}/cv.csv"
df.to_csv(_path, index=False)
print(f"Saved {_path}")
df.head(conf.n_trials)

Saved models/lgb/20211217_093928/cv.csv


Unnamed: 0,trial_id,feature_fraction,lr,score
0,0,1.0,0.01,0.622658


In [13]:
# Inputs for the final fit, built the same way LgbObjective builds them.
x_train = train[conf.features].to_numpy()
y_train = train[conf.label].to_numpy()
# Sorted training labels plus one trailing entry, len(train) + 1.
label_gain = sorted(train[conf.label]) + [len(train) + 1]
print(f"label_gain min={min(label_gain)}, max={max(label_gain)}")
# Row count of each query group, in row order.
group = group_sizes(train[conf.query])
print(f"group={group}")

label_gain min=1, max=5711
group=[5710]


In [14]:
%%time
# Refit with the hyper-parameters of the top-ranked trial (row 0 of the
# score-sorted trial table) and save the resulting model.
best = df.iloc[0]
b = lgb.train(
    params={
        "objective": "lambdarank",
        #'lambda_l1': 1,
        "feature_fraction": best["feature_fraction"],
        "learning_rate": best["lr"],
        "label_gain": label_gain,
        "force_col_wise": True,
        "verbose": 1,
    },
    train_set=lgb.Dataset(x_train, label=y_train, group=group),
    num_boost_round=conf.num_boost_round,
)
_path = f"{job_dir}/model.txt"
b.save_model(_path)
print(f"Saved {_path}")

[LightGBM] [Info] Total Bins 99450
[LightGBM] [Info] Number of data points in the train set: 5710, number of used features: 390
Saved models/lgb/20211217_093928/model.txt
Wall time: 4.49 s


In [15]:
%%time
# Pair every importance score with its feature name; the assert guards against
# the two sequences getting out of step.
scores = b.feature_importance()
assert len(scores) == len(features)
rows = [
    {'importance': score, 'feature': name}
    for name, score in zip(features, scores)
]
# NOTE: this rebinds `df`, which until now held the trial table.
df = pd.DataFrame.from_records(rows).sort_values(
    'importance', ascending=False, ignore_index=True
)
_path = f"{job_dir}/features.csv"
df.to_csv(_path, index=True)
print(f"Saved {_path}")
df.T.head()

Saved models/lgb/20211217_093928/features.csv
Wall time: 11 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389
importance,340,340,325,273,263,239,228,226,209,191,178,174,152,151,151,149,145,144,144,138,137,137,135,121,117,113,108,107,105,99,93,91,91,84,81,79,76,76,75,73,72,70,67,64,63,63,59,58,57,57,54,53,53,52,52,50,49,49,49,49,48,47,47,47,46,46,45,44,43,43,42,42,42,42,42,41,41,41,40,40,39,39,38,38,37,37,36,36,35,34,34,33,33,33,33,33,32,32,32,32,31,31,30,30,30,30,30,30,29,29,29,28,28,28,28,28,28,28,27,27,27,27,27,27,26,26,25,25,24,24,23,23,23,23,23,23,23,23,23,22,22,22,22,21,21,21,20,20,20,20,20,20,20,19,19,19,19,19,19,19,19,18,18,18,18,17,17,17,17,17,17,16,16,16,16,16,15,15,15,15,14,14,14,14,14,14,14,14,14,14,13,13,13,13,13,13,13,13,13,12,12,12,12,12,12,12,12,12,12,12,12,12,12,11,11,11,11,11,11,11,11,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
feature,zz0310,zz0062,zz0164,zz0115,zz0069,zz0364,zz0277,zz0304,zz0043,zz0211,zz0349,zz0315,zz0293,zz0335,zz0200,length,zz0331,zz0290,zz0125,zz0167,zz0234,zz0205,zz0254,zz0288,zz0218,zz0114,zz0149,zz0098,zz0104,zz0208,zz0110,zz0049,zz0014,zz0216,zz0345,zz0146,zz0068,zz0201,zz0137,zz0037,zz0197,zz0041,zz0204,zz0329,zz0135,zz0005,zz0166,zz0363,zz0083,zz0303,zz0095,zz0082,zz0276,zz0168,zz0307,zz0160,zz0251,zz0022,zz0033,zz0100,zz0247,zz0048,zz0133,zz0117,zz0202,zz0311,zz0131,zz0294,zz0358,zz0282,zz0280,zz0175,zz0079,zz0199,zz0222,zz0265,zz0190,zz0273,zz0240,zz0261,zz0301,zz0107,zz0263,zz0266,zz0010,zz0159,zz0054,zz0381,zz0227,zz0004,zz0147,zz0163,zz0109,zz0383,zz0209,zz0045,zz0295,zz0224,zz0374,zz0188,zz0113,zz0047,zz0088,zz0341,zz0029,zz0040,zz0112,zz0003,zz0021,zz0366,zz0219,zz0354,zz0296,zz0337,zz0320,zz0302,zz0252,zz0189,zz0248,zz0373,zz0328,zz0094,zz0275,zz0196,zz0030,zz0289,zz0207,zz0034,zz0017,zz0187,zz0061,zz0120,zz0327,zz0279,zz0073,zz0020,zz0056,zz0072,zz0019,zz0173,zz0336,zz0171,zz0231,zz0258,zz0223,zz0259,zz0151,zz0074,zz0195,zz0124,zz0144,zz0071,zz0011,zz0346,zz0122,zz0140,zz0233,zz0130,zz0370,zz0198,zz0182,zz0105,zz0351,zz0121,zz0344,zz0031,zz0382,zz0065,zz0118,zz0129,zz0002,zz0024,zz0080,zz0153,zz0285,zz0368,zz0103,zz0178,zz0191,zz0250,zz0108,letter_frac,zz0350,zz0348,zz0009,zz0338,zz0318,zz0212,zz0319,zz0323,zz0210,zz0050,zz0193,zz0239,zz0321,zz0367,zz0006,zz0145,zz0314,zz0268,zz0044,zz0075,zz0066,zz0246,zz0152,zz0186,zz0119,zz0007,zz0102,zz0176,zz0172,zz0143,zz0155,zz0267,zz0051,zz0281,zz0342,zz0334,zz0089,zz0369,zz0241,zz0256,zz0148,zz0249,zz0379,zz0347,zz0317,zz0184,zz0340,zz0028,zz0012,zz0150,zz0262,zz0225,zz0058,zz0260,zz0352,zz0355,zz0313,zz0099,zz0236,zz0023,zz0245,zz0042,zz0284,zz0298,zz0359,zz0036,zz0343,zz0111,zz0179,zz0194,zz0092,zz0154,zz0214,zz0067,zz0230,zz0087,zz0070,zz0170,zz0181,zz0180,zz0096,zz0027,zz0086,zz0001,zz0162,zz0286,zz0077,zz0283,zz0016,zz0064,zz0141,zz0316,space_frac,zz0378,zz0238,zz0362,zz0372,zz0063,zz0076,zz0169,zz0235,zz
0255,zz0185,zz0305,zz0157,zz0177,zz0309,zz0060,zz0139,zz0018,zz0312,zz0380,zz0330,zz0306,zz0174,zz0322,zz0324,zz0055,zz0084,zz0106,zz0192,zz0156,zz0365,zz0356,zz0291,zz0203,zz0232,zz0090,zz0339,zz0127,zz0059,zz0138,zz0039,zz0134,zz0136,zz0097,zz0081,zz0123,upper_frac,zz0377,zz0376,zz0244,zz0308,zz0299,zz0226,zz0272,zz0243,zz0332,zz0278,zz0257,zz0126,zz0093,zz0206,zz0287,zz0165,zz0353,zz0253,zz0270,zz0013,zz0116,zz0000,zz0183,zz0015,zz0371,zz0229,zz0357,zz0101,zz0046,zz0221,zz0091,zz0213,zz0161,zz0057,zz0333,zz0297,zz0085,zz0326,zz0325,zz0025,zz0361,zz0264,zz0128,zz0132,zz0375,zz0228,zz0242,digit_frac,zz0360,zz0215,zz0078,zz0292,zz0220,zz0008,zz0038,zz0142,zz0035,zz0158,zz0217,punc_frac,zz0032,zz0271,zz0026,zz0052,zz0053,zz0237,zz0300,zz0269,zz0274
