In [1]:
import os
import gc
import math
import json
from datetime import datetime
import shutil
from pathlib import Path
import optuna
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from tqdm import tqdm
import torch
from sklearn.model_selection import KFold, GroupKFold, StratifiedGroupKFold
from torch.utils.data import DataLoader
from transformers import BatchEncoding, BertTokenizerFast
from typing import Dict, List, Tuple, NamedTuple
import scml
import mylib

In [2]:
class Conf(NamedTuple):
    sample_frac: float = 1
    epochs: int = 50
    lr: Tuple[float, float] = (3e-3, 3e-3)
    embedding_size: int = 32
    limit: int = 0
    negative_samples: int = 10
    batch_size: int = 10_000
    model_name: str = "word2vec"
    gradient_checkpointing: bool = False
    gpus: List[int] = [0]
    patience: int = 0
    n_trials: int = 1
    n_folds: int = 3
    seed: int = 31
    
        
conf = Conf()
print(conf)

Conf(sample_frac=1, epochs=20, lr=(0.003, 0.003), embedding_size=32, limit=0, negative_samples=10, batch_size=10000, model_name='word2vec', gradient_checkpointing=False, gpus=[0], patience=0, n_trials=1, n_folds=3, seed=31)


In [3]:
tim = scml.Timer()
tim.start()
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything(conf.seed)
pl.seed_everything(conf.seed)

Global seed set to 31


31

In [4]:
device: torch.device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
    for i in range(torch.cuda.device_count()):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,1), 'GB')
        print('Mem Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,1), 'GB')
else:
    print("cpu")

device=0, NVIDIA GeForce GTX 1060 6GB
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [5]:
with open("input/vocab3.json") as f:
    id2label = json.load(f)
if conf.limit>0:
    id2label = id2label[:conf.limit]
    white = {int(k) for k, _ in id2label}
    white = list(white)
print(f"len(id2label)={len(id2label):,}\nid2label[:10]={id2label[:10]}")

len(id2label)=1,855,603
id2label[:10]=[['1460571', 137874], ['485256', 135892], ['108125', 124885], ['29735', 116215], ['1733943', 106512], ['832192', 94766], ['184976', 92890], ['166037', 86333], ['554660', 83865], ['986164', 81557]]


In [6]:
df = pd.read_parquet("input/pairs.parquet")
if conf.limit>0:
    df = df[(df["center_word"].isin(white)) & (df["outside_word"].isin(white))].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25526612 entries, 0 to 25526611
Data columns (total 4 columns):
 #   Column        Dtype
---  ------        -----
 0   center_word   int32
 1   center_type   int8 
 2   outside_word  int32
 3   outside_type  int8 
dtypes: int32(2), int8(2)
memory usage: 243.4 MB


In [7]:
ds = mylib.SkipGramDataset(
    center_words=df["center_word"].tolist(), 
    outside_words=df["outside_word"].tolist(),
    center_types=df["center_type"].tolist(),
    outside_types=df["outside_type"].tolist(),
)
print(f"len(ds)={len(ds):,}\n{ds[30]}\n{ds[0]}")

len(ds)=25,526,612
{'center_words': tensor(1), 'center_types': tensor(0), 'outside_words': tensor(248), 'outside_types': tensor(0)}
{'center_words': tensor(1), 'center_types': tensor(0), 'outside_words': tensor(35), 'outside_types': tensor(0)}


In [8]:
del df
gc.collect()
job_ts = datetime.now().strftime('%Y%m%d_%H%M%S')
job_dir = Path("models") / conf.model_name / job_ts
job_dir.mkdir(parents=True, exist_ok=True)
print(f"job_dir={job_dir}")

job_dir=models\word2vec\20230131_030406


# Train final model on best Hps

In [9]:
%%time
#splitter = GroupKFold(n_splits=50)
splitter = KFold(n_splits=50)
dummy = np.zeros(len(ds))
#for ti, vi in splitter.split(dummy, y=ds.stratification(), groups=ds.groups()):
for ti, vi in splitter.split(dummy):
    tra_ds = torch.utils.data.Subset(ds, ti)
    val_ds = torch.utils.data.Subset(ds, vi)
    break
print(f"len(tra_ds)={len(tra_ds):,}, len(val_ds)={len(val_ds):,}")

len(tra_ds)=25,016,079, len(val_ds)=510,533
Wall time: 98 ms


In [10]:
#best = df.iloc[0].to_dict()
best = {
    "lr": conf.lr[0],
}
print(f"best={best}")

best={'lr': 0.003}


In [11]:
model = mylib.SkipGramWord2Vec(
    lr=best["lr"],
    vocab_size=len(id2label),
    types_size=3,
    embedding_size=conf.embedding_size,
    negative_samples=conf.negative_samples,
)
print(model)

SkipGramWord2Vec(
  (word_embeddings): Embedding(1855603, 32)
  (type_embeddings): Embedding(3, 32)
)


In [12]:
gc.collect()
torch.cuda.empty_cache()
trainer = pl.Trainer(
    default_root_dir=job_dir,
    accelerator="gpu", 
    devices=conf.gpus,
    max_epochs=conf.epochs,  
    callbacks=mylib.training_callbacks(patience=conf.patience),
    deterministic=False,
)
trainer.fit(
    model,
    train_dataloaders=DataLoader(
        tra_ds,
        batch_size=conf.batch_size,
        shuffle=True,
        num_workers=0,
    ),
    val_dataloaders=DataLoader(
        val_ds,
        batch_size=conf.batch_size,
        shuffle=False,
        num_workers=0,
    ),
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: models\word2vec\20230131_030406\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type      | Params
----------------------------------------------
0 | word_embeddings | Embedding | 59.4 M
1 | type_embeddings | Embedding | 96    
----------------------------------------------
59.4 M    Trainable params
0         Non-trainable params
59.4 M    Total params
237.518   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 4.511
Epoch 0, global step 2502: 'val_loss' reached 4.51117 (best 4.51117), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=0-step=2502.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 1.646 >= min_delta = 0.0. New best score: 2.865
Epoch 1, global step 5004: 'val_loss' reached 2.86494 (best 2.86494), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=1-step=5004.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.237 >= min_delta = 0.0. New best score: 2.628
Epoch 2, global step 7506: 'val_loss' reached 2.62752 (best 2.62752), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=2-step=7506.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.253 >= min_delta = 0.0. New best score: 2.374
Epoch 3, global step 10008: 'val_loss' reached 2.37443 (best 2.37443), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=3-step=10008.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.134 >= min_delta = 0.0. New best score: 2.241
Epoch 4, global step 12510: 'val_loss' reached 2.24090 (best 2.24090), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=4-step=12510.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.103 >= min_delta = 0.0. New best score: 2.138
Epoch 5, global step 15012: 'val_loss' reached 2.13789 (best 2.13789), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=5-step=15012.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.073 >= min_delta = 0.0. New best score: 2.065
Epoch 6, global step 17514: 'val_loss' reached 2.06488 (best 2.06488), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=6-step=17514.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.073 >= min_delta = 0.0. New best score: 1.992
Epoch 7, global step 20016: 'val_loss' reached 1.99186 (best 1.99186), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=7-step=20016.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.051 >= min_delta = 0.0. New best score: 1.941
Epoch 8, global step 22518: 'val_loss' reached 1.94110 (best 1.94110), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=8-step=22518.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.031 >= min_delta = 0.0. New best score: 1.910
Epoch 9, global step 25020: 'val_loss' reached 1.91002 (best 1.91002), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=9-step=25020.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.030 >= min_delta = 0.0. New best score: 1.880
Epoch 10, global step 27522: 'val_loss' reached 1.87972 (best 1.87972), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=10-step=27522.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.019 >= min_delta = 0.0. New best score: 1.861
Epoch 11, global step 30024: 'val_loss' reached 1.86056 (best 1.86056), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=11-step=30024.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.025 >= min_delta = 0.0. New best score: 1.836
Epoch 12, global step 32526: 'val_loss' reached 1.83575 (best 1.83575), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=12-step=32526.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.010 >= min_delta = 0.0. New best score: 1.826
Epoch 13, global step 35028: 'val_loss' reached 1.82564 (best 1.82564), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=13-step=35028.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.028 >= min_delta = 0.0. New best score: 1.797
Epoch 14, global step 37530: 'val_loss' reached 1.79746 (best 1.79746), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=14-step=37530.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 1.797
Epoch 15, global step 40032: 'val_loss' reached 1.79739 (best 1.79739), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=15-step=40032.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.008 >= min_delta = 0.0. New best score: 1.789
Epoch 16, global step 42534: 'val_loss' reached 1.78925 (best 1.78925), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=16-step=42534.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.013 >= min_delta = 0.0. New best score: 1.776
Epoch 17, global step 45036: 'val_loss' reached 1.77615 (best 1.77615), saving model to 'models\\word2vec\\20230131_030406\\lightning_logs\\version_0\\checkpoints\\epoch=17-step=45036.ckpt' as top 1
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [13]:
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")
print(f"Saved {str(job_dir)}")

Total time taken 5:48:11.636069
Saved models\word2vec\20230131_030406
