In [1]:
import os
import pathlib
from datetime import datetime
import optuna
import pandas as pd
import numpy as np
import sklearn
import lightgbm as lgb
import torch
from typing import List, Dict, Union, Tuple
from tqdm import tqdm
import scml

In [2]:
# Prefer the GPU when torch can see one; otherwise run on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print('Using device:', device)
if device.type == 'cuda':
    # Report the name of GPU 0 and its current memory counters, scaled by 1024**3.
    _bytes_per_unit = 1024**3
    _allocated = torch.cuda.memory_allocated(0) / _bytes_per_unit
    _reserved = torch.cuda.memory_reserved(0) / _bytes_per_unit
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(_allocated, 1), 'GB')
    print('Cached:   ', round(_reserved, 1), 'GB')

Using device: cuda
NVIDIA GeForce GTX 1060 6GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
# Notebook-wide pandas settings, applied in the order listed.
# NOTE(review): "use_inf_as_na" looks to be deprecated in recent pandas
# releases (2.1+) — confirm against the installed version before upgrading.
_pandas_options = {
    "use_inf_as_na": True,
    "max_info_columns": 9999,
    "display.max_columns": 9999,
    "display.max_rows": 9999,
    'max_colwidth': 9999,
}
for _option, _value in _pandas_options.items():
    pd.set_option(_option, _value)

# Hook tqdm into pandas.
tqdm.pandas()
# Called with its defaults; presumably seeds the RNGs — confirm in scml.
scml.seed_everything()

In [4]:
# Run configuration.
num_boost_round = 100
# Currently unused search ranges, kept commented out:
#lr = (1e-2, 1e-2)
#feature_fraction = (1, 1)
n_trials = 1
n_splits = 10
em_size = 384

# Embedding column names e0..e{em_size-1}. They are sorted as strings, so the
# order is lexicographic ('e0', 'e1', 'e10', 'e100', ...), not numeric.
features = sorted(f"e{i}" for i in range(em_size))
print(f"{len(features)} features={features}")

384 features=['e0', 'e1', 'e10', 'e100', 'e101', 'e102', 'e103', 'e104', 'e105', 'e106', 'e107', 'e108', 'e109', 'e11', 'e110', 'e111', 'e112', 'e113', 'e114', 'e115', 'e116', 'e117', 'e118', 'e119', 'e12', 'e120', 'e121', 'e122', 'e123', 'e124', 'e125', 'e126', 'e127', 'e128', 'e129', 'e13', 'e130', 'e131', 'e132', 'e133', 'e134', 'e135', 'e136', 'e137', 'e138', 'e139', 'e14', 'e140', 'e141', 'e142', 'e143', 'e144', 'e145', 'e146', 'e147', 'e148', 'e149', 'e15', 'e150', 'e151', 'e152', 'e153', 'e154', 'e155', 'e156', 'e157', 'e158', 'e159', 'e16', 'e160', 'e161', 'e162', 'e163', 'e164', 'e165', 'e166', 'e167', 'e168', 'e169', 'e17', 'e170', 'e171', 'e172', 'e173', 'e174', 'e175', 'e176', 'e177', 'e178', 'e179', 'e18', 'e180', 'e181', 'e182', 'e183', 'e184', 'e185', 'e186', 'e187', 'e188', 'e189', 'e19', 'e190', 'e191', 'e192', 'e193', 'e194', 'e195', 'e196', 'e197', 'e198', 'e199', 'e2', 'e20', 'e200', 'e201', 'e202', 'e203', 'e204', 'e205', 'e206', 'e207', 'e208', 'e209', 'e21', 'e21

In [5]:
# Each run writes into its own directory named after the current local time
# (second resolution).
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
job_dir = "models/lgb/" + ts
# Create the directory together with any missing parents; an existing one is fine.
os.makedirs(job_dir, exist_ok=True)
print(f"job_dir={job_dir}")

job_dir=models/lgb/20211214_155836


In [6]:
# Load the training table and print its schema summary.
_train_path = "input/js18.parquet"
train = pd.read_parquet(_train_path)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16225 entries, 0 to 16224
Data columns (total 385 columns):
 #    Column  Non-Null Count  Dtype  
---   ------  --------------  -----  
 0    label   16225 non-null  int32  
 1    e0      16225 non-null  float32
 2    e1      16225 non-null  float32
 3    e2      16225 non-null  float32
 4    e3      16225 non-null  float32
 5    e4      16225 non-null  float32
 6    e5      16225 non-null  float32
 7    e6      16225 non-null  float32
 8    e7      16225 non-null  float32
 9    e8      16225 non-null  float32
 10   e9      16225 non-null  float32
 11   e10     16225 non-null  float32
 12   e11     16225 non-null  float32
 13   e12     16225 non-null  float32
 14   e13     16225 non-null  float32
 15   e14     16225 non-null  float32
 16   e15     16225 non-null  float32
 17   e16     16225 non-null  float32
 18   e17     16225 non-null  float32
 19   e18     16225 non-null  float32
 20   e19     16225 non-null  float32
 21   e20     16

In [7]:
def group_sizes(size: int, query_max_size: int = 10000) -> List[int]:
    """Split ``size`` rows into consecutive group sizes of at most ``query_max_size``.

    The result is a list of full-size groups followed by one smaller group
    holding the remainder, if any. The entries always sum to ``size``.

    The default of 10000 presumably mirrors a per-query row limit in LightGBM's
    ranking objectives — confirm against the LightGBM documentation.

    Args:
        size: Total number of rows to partition. Must be non-negative.
        query_max_size: Largest allowed group size. Must be positive.

    Returns:
        Group sizes in order; an empty list when ``size`` is 0.

    Raises:
        ValueError: If ``size`` is negative or ``query_max_size`` is not positive.
    """
    if size < 0:
        raise ValueError(f"size must be non-negative, got {size}")
    if query_max_size <= 0:
        raise ValueError(f"query_max_size must be positive, got {query_max_size}")
    # Integer divmod avoids the float rounding that int(size / query_max_size)
    # can introduce for very large values.
    full_groups, rem = divmod(size, query_max_size)
    res = [query_max_size] * full_groups
    if rem != 0:
        res.append(rem)
    return res

In [8]:
# Feature matrix (columns in the order of `features`) and label vector.
_labels = train["label"]
x_train = train[features].to_numpy()
y_train = _labels.to_numpy()

# Gain table passed to LightGBM as "label_gain": every label value in
# ascending order, followed by one extra entry equal to len(train) + 1.
# NOTE(review): presumably the extra entry keeps the table long enough for
# the largest label to index a valid gain — confirm against the label range.
label_gain = [*sorted(_labels), len(train) + 1]
print(f"label_gain min={min(label_gain)}, max={max(label_gain)}")

label_gain min=1, max=16226


In [9]:
%%time
# Hyper-parameters for this fit are hard-coded; the commented line below is
# the alternative of reading them from a results frame.
#best = df.iloc[0]
best = {"feature_fraction": 1.0, "lr": 1e-2}

_params = {
    'objective': "lambdarank",
    #'lambda_l1': 1,
    'feature_fraction': best['feature_fraction'],
    'learning_rate': best['lr'],
    "label_gain": label_gain,
    "force_col_wise": True,
    "verbose": 1,
}
# All training rows are chunked into consecutive query groups by group_sizes().
_train_set = lgb.Dataset(x_train, label=y_train, group=group_sizes(len(train)))
b = lgb.train(_params, _train_set, num_boost_round=num_boost_round)

# Persist the fitted booster inside this run's job directory.
_path = f"{job_dir}/model.txt"
b.save_model(_path)
print(f"Saved {_path}")

[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 16225, number of used features: 384
Saved models/lgb/20211214_155836/model.txt
Wall time: 2.13 s


In [10]:
%%time
# One importance score per feature, in the same order as `features`.
scores = b.feature_importance()
assert len(scores) == len(features)
rows = [
    {'importance': score, 'feature': name}
    for score, name in zip(scores, features)
]
# Rank features from most to least important, renumbering the index 0..n-1.
df = pd.DataFrame.from_records(rows).sort_values(
    'importance', ascending=False, ignore_index=True
)
_path = f"{job_dir}/features.csv"
df.to_csv(_path, index=True)
print(f"Saved {_path}")
# Show the ranking transposed so it reads left to right.
df.T.head()

Saved models/lgb/20211214_155836/features.csv
Wall time: 13 ms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383
importance,153,118,104,103,96,84,62,57,53,53,47,47,43,42,41,38,36,36,34,34,31,31,27,26,26,25,24,23,22,22,21,21,20,19,18,17,17,17,16,15,15,15,15,15,15,15,15,15,14,14,14,14,13,13,13,13,12,12,12,12,12,11,11,11,11,11,10,10,10,10,10,10,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
feature,e197,e363,e147,e125,e281,e364,e310,e209,e231,e254,e15,e137,e261,e228,e93,e193,e376,e219,e63,e329,e293,e306,e319,e55,e62,e345,e51,e349,e48,e170,e102,e320,e184,e46,e334,e224,e218,e333,e136,e264,e14,e33,e107,e157,e279,e101,e208,e309,e252,e142,e178,e79,e382,e182,e144,e361,e52,e173,e346,e181,e350,e124,e121,e35,e298,e260,e164,e354,e56,e359,e234,e60,e40,e317,e4,e153,e49,e23,e373,e212,e247,e105,e305,e285,e88,e253,e7,e191,e221,e6,e381,e211,e187,e230,e159,e322,e76,e330,e31,e312,e82,e5,e337,e242,e314,e32,e229,e362,e81,e86,e90,e370,e103,e131,e133,e67,e22,e25,e344,e163,e160,e196,e146,e2,e271,e286,e357,e263,e278,e237,e283,e352,e307,e299,e249,e304,e42,e375,e70,e195,e75,e78,e114,e206,e59,e58,e214,e168,e369,e326,e268,e269,e158,e332,e148,e65,e340,e328,e57,e135,e69,e72,e54,e74,e12,e83,e115,e295,e108,e3,e96,e98,e258,e200,e174,e204,e240,e53,e172,e201,e177,e213,e176,e109,e30,e280,e225,e92,e188,e77,e139,e303,e316,e289,e80,e202,e378,e66,e351,e323,e275,e257,e166,e222,e161,e169,e17,e154,e335,e150,e175,e245,e223,e50,e183,e308,e374,e343,e89,e85,e379,e358,e45,e353,e341,e64,e338,e365,e0,e21,e186,e26,e244,e233,e227,e220,e217,e216,e205,e203,e189,e18,e273,e165,e162,e156,e143,e141,e140,e106,e104,e100,e10,e267,e99,e29,e288,e276,e327,e372,e149,e151,e16,e171,e180,e185,e47,e287,e19,e44,e43,e41,e39,e383,e198,e199,e284,e301,e61,e145,e291,e119,e300,e94,e296,e91,e110,e87,e84,e116,e117,e8,e68,e122,e123,e126,e127,e128,e71,e13,e130,e132,e210,e302,e348,e24,e236,e36,e360,e232,e226,e239,e321,e266,e256,e368,e255,e324,e215,e246,e251,e250,e342,e118,e259,e73,e34,e129,e120,e339,e325,e371,e262,e336,e1,e97,e297,e95,e270,e331,e294,e27,e11,e9,e274,e111,e112,e113,e248,e265,e311,e347,e277,e37,e207,e282,e377,e318,e20,e38,e380,e367,e366,e28,e194,e272,e190,e235,e134,e356,e315,e179,e355,e167,e238,e313,e290,e155,e152,e241,e243,e292,e138,e192
