In [1]:
import os
import sys
import gc
import math
import json
from datetime import datetime
import shutil
from pathlib import Path
import optuna
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from tqdm import tqdm
import torch
from sklearn.model_selection import KFold, GroupKFold, StratifiedGroupKFold
from torch.utils.data import DataLoader
from transformers import BatchEncoding, BertTokenizerFast
from typing import Dict, List, Tuple, NamedTuple

In [2]:
class Conf(NamedTuple):
    """Immutable run configuration for this notebook.

    Every field has a default, so ``Conf()`` yields the configuration used by
    the cells below. Field order is part of the tuple's positional interface.

    Gotcha: the derived path defaults (``comp_dir``, ``resource_dir``,
    ``vocab_file``, ``train_file``) are computed once, while the class body is
    executed, from the *default* ``input_dir``/``resource_dir``. Passing a
    different ``input_dir`` to the constructor does not recompute them.
    """
    # Not referenced by the cells visible in this notebook.
    sample_frac: float = 1
    # Forwarded to pl.Trainer(max_epochs=...).
    epochs: int = 16
    # Only the first element is read (when building `best`).
    lr: Tuple[float, float] = (3e-3, 3e-3)
    # Forwarded to mylib.SkipGramWord2Vec(embedding_size=...).
    embedding_size: int = 32
    # When > 0, the vocab is truncated to its first `limit` entries and the
    # training pairs are filtered accordingly; 0 disables the truncation.
    limit: int = 0
    # Forwarded to mylib.SkipGramWord2Vec(negative_samples=...).
    negative_samples: int = 10
    # Batch size of both the training and the validation DataLoader.
    batch_size: int = 10_000
    # Used as a path component of the job output directory.
    model_name: str = "word2vec"
    # Not referenced by the cells visible in this notebook.
    gradient_checkpointing: bool = False
    # Forwarded to pl.Trainer(devices=...). NOTE: the list default is a single
    # object shared by every Conf instance; do not mutate it in place.
    gpus: List[int] = [0]
    # Forwarded to mylib.training_callbacks(patience=...).
    patience: int = 0
    # n_trials / n_folds are not referenced by the visible cells (the hold-out
    # split below uses a hard-coded KFold(n_splits=50)).
    n_trials: int = 1
    n_folds: int = 3
    # Seed passed to scml.seed_everything and pl.seed_everything.
    seed: int = 31
    # Root of the read-only inputs; also appended (with a suffix) to sys.path.
    input_dir: Path = Path("/kaggle/input")
    # Not referenced by the cells visible in this notebook.
    comp_dir: Path = input_dir / "otto-recommender-system"
    # Not referenced by the cells visible in this notebook.
    temp_dir: Path = Path('/kaggle/temp')
    # Parent of the per-run job directory.
    working_dir: Path = Path('/kaggle/working')
    # Project resources; its "src" subdirectory is appended to sys.path.
    resource_dir: Path = input_dir / "lib-otto-2022/otto2022-1.0"
    # JSON vocabulary loaded into `id2label`.
    vocab_file: Path = resource_dir / "data/vocab3.json"
    # Parquet file with the (center, outside) training pairs.
    train_file: Path = resource_dir / "input/pairs_m8_w7_i20.parquet"
    

# Single configuration instance shared by every cell below (all defaults).
conf: Conf = Conf()
print(conf)

Conf(sample_frac=1, epochs=16, lr=(0.003, 0.003), embedding_size=32, limit=0, negative_samples=10, batch_size=10000, model_name='word2vec', gradient_checkpointing=False, gpus=[0], patience=0, n_trials=1, n_folds=3, seed=31, input_dir=PosixPath('/kaggle/input'), comp_dir=PosixPath('/kaggle/input/otto-recommender-system'), temp_dir=PosixPath('/kaggle/temp'), working_dir=PosixPath('/kaggle/working'), resource_dir=PosixPath('/kaggle/input/lib-otto-2022/otto2022-1.0'), vocab_file=PosixPath('/kaggle/input/lib-otto-2022/otto2022-1.0/data/vocab3.json'), train_file=PosixPath('/kaggle/input/lib-otto-2022/otto2022-1.0/input/pairs_m8_w7_i20.parquet'))


In [3]:
# Quantile grid; not referenced by the cells visible in this notebook.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Process-wide environment switches, set before any tokenizer / CUDA work.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous and is
# normally a debugging aid; confirm it is still wanted for full training runs.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

# pandas: treat +/-inf as missing, and lift the display truncation limits.
pd.set_option("use_inf_as_na", True)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("max_info_columns", 9999)
pd.set_option('max_colwidth', 9999)

# Register the tqdm progress hooks on pandas objects.
tqdm.pandas()

# The project libraries live under the input directory, so they only become
# importable after these paths are appended; the imports must stay below.
sys.path.extend(
    [
        str(conf.input_dir / "sgcharts-ml/src"),
        str(conf.resource_dir / "src"),
    ]
)
#import networkx as nx
import scml
from scml import nlp as snlp
from scml import pandasx as pdx
#from scml.nlp import clustering as snc
import mylib

# Seed through both helpers; the Lightning call is kept last so that its
# return value is what the cell displays.
scml.seed_everything(conf.seed)
pl.seed_everything(conf.seed)

31

In [4]:
# Pick the compute device once: CUDA when available, otherwise CPU.
device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type != "cuda":
    print("cpu")
else:
    # Report name and current memory usage (in GB) of every visible GPU.
    for i in range(torch.cuda.device_count()):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print(
            'Mem Allocated:',
            round(torch.cuda.memory_allocated(i) / 1024**3, 1),
            'GB',
        )
        print(
            'Mem Cached:   ',
            round(torch.cuda.memory_reserved(i) / 1024**3, 1),
            'GB',
        )

device=0, Tesla P100-PCIE-16GB
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [5]:
# Load the vocabulary: a JSON list of [label, count] pairs (see the preview
# printed below). The encoding is given explicitly so the read does not depend
# on the platform's default locale encoding.
with open(conf.vocab_file, encoding="utf-8") as f:
    id2label = json.load(f)
if conf.limit > 0:
    # Debug mode: keep only the first `limit` vocab entries and remember their
    # labels (as ints) so the training pairs can be filtered to match.
    id2label = id2label[:conf.limit]
    white = list({int(k) for k, _ in id2label})
print(f"len(id2label)={len(id2label):,}\nid2label[:10]={id2label[:10]}")

len(id2label)=1,855,603
id2label[:10]=[['1460571', 137874], ['485256', 135892], ['108125', 124885], ['29735', 116215], ['1733943', 106512], ['832192', 94766], ['184976', 92890], ['166037', 86333], ['554660', 83865], ['986164', 81557]]


In [6]:
# Read the (center, outside) training pairs.
df = pd.read_parquet(conf.train_file)
if conf.limit > 0:
    # Debug mode: keep only the rows whose two words are both whitelisted.
    # NOTE(review): `white` holds the integer form of the vocab labels; confirm
    # that center_word / outside_word use that same id space.
    df = df[df[["center_word", "outside_word"]].isin(white).all(axis=1)].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40948896 entries, 0 to 40948895
Data columns (total 4 columns):
 #   Column        Dtype
---  ------        -----
 0   center_word   int32
 1   center_type   int8 
 2   outside_word  int32
 3   outside_type  int8 
dtypes: int32(2), int8(2)
memory usage: 390.5 MB


In [7]:
# Each parquet column "<name>" is handed to the dataset as a plain Python list
# under the keyword "<name>s" (center_words, outside_words, center_types,
# outside_types).
ds = mylib.SkipGramDataset(
    **{
        f"{column}s": df[column].tolist()
        for column in ("center_word", "outside_word", "center_type", "outside_type")
    }
)
# Show the dataset size and two sample items.
print(f"len(ds)={len(ds):,}\n{ds[30]}\n{ds[0]}")

len(ds)=40,948,896
{'center_words': tensor(1), 'center_types': tensor(0), 'outside_words': tensor(248), 'outside_types': tensor(0)}
{'center_words': tensor(1), 'center_types': tensor(0), 'outside_words': tensor(35), 'outside_types': tensor(0)}


In [8]:
# The DataFrame has been copied into `ds`; drop it to free host memory.
del df
gc.collect()

# One output directory per run: <working_dir>/<model_name>/<timestamp>.
job_ts = f"{datetime.now():%Y%m%d_%H%M%S}"
job_dir = Path(conf.working_dir, conf.model_name, job_ts)
job_dir.mkdir(parents=True, exist_ok=True)
print(f"job_dir={job_dir}")

job_dir=/kaggle/working/word2vec/20230131_055311


# Train final model on best hyperparameters

In [9]:
%%time
# Hold out a single fold of an unshuffled 50-way KFold as the validation set;
# only the first (train, validation) index pair is consumed.
# Alternative kept for reference:
#   splitter = GroupKFold(n_splits=50)
#   splitter.split(dummy, y=ds.stratification(), groups=ds.groups())
splitter = KFold(n_splits=50)
# KFold only needs an array-like with one entry per sample.
dummy = np.zeros(len(ds))
ti, vi = next(splitter.split(dummy))
tra_ds = torch.utils.data.Subset(ds, ti)
val_ds = torch.utils.data.Subset(ds, vi)
print(f"len(tra_ds)={len(tra_ds):,}, len(val_ds)={len(val_ds):,}")

len(tra_ds)=40,129,918, len(val_ds)=818,978
CPU times: user 158 ms, sys: 189 ms, total: 347 ms
Wall time: 347 ms


In [10]:
# Hyperparameters for the final fit. They are taken straight from the config;
# an earlier variant read them from a results frame:
#   best = df.iloc[0].to_dict()
best = dict(lr=conf.lr[0])
print(f"best={best}")

best={'lr': 0.003}


In [11]:
# Build the skip-gram model: sizes first, then the training hyperparameters.
model = mylib.SkipGramWord2Vec(
    vocab_size=len(id2label),  # one embedding row per vocabulary entry
    types_size=3,
    embedding_size=conf.embedding_size,
    negative_samples=conf.negative_samples,
    lr=best["lr"],
)
print(model)

SkipGramWord2Vec(
  (word_embeddings): Embedding(1855603, 32)
  (type_embeddings): Embedding(3, 32)
)


In [12]:
# Free unreferenced Python objects and cached CUDA blocks before training.
gc.collect()
torch.cuda.empty_cache()

# Select the accelerator from what is actually available, so that the notebook
# also runs on a CPU-only session (the device cell above already handles that
# case). With CUDA present this is identical to the previous hard-coded
# accelerator="gpu", devices=conf.gpus.
use_gpu = torch.cuda.is_available()
trainer = pl.Trainer(
    default_root_dir=job_dir,  # checkpoints and logs go to the per-run directory
    accelerator="gpu" if use_gpu else "cpu",
    devices=conf.gpus if use_gpu else 1,
    max_epochs=conf.epochs,
    callbacks=mylib.training_callbacks(patience=conf.patience),
    deterministic=False,
)
trainer.fit(
    model,
    # Training batches are reshuffled every epoch; validation order is fixed.
    train_dataloaders=DataLoader(
        tra_ds,
        batch_size=conf.batch_size,
        shuffle=True,
        num_workers=0,
    ),
    val_dataloaders=DataLoader(
        val_ds,
        batch_size=conf.batch_size,
        shuffle=False,
        num_workers=0,
    ),
)

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [13]:
print("Done")

Done
