In [1]:
import os
import gc
import json
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from typing import Dict, List, Tuple, NamedTuple
from transformers import BertTokenizerFast
import scml

In [2]:
# --- Tokenization settings shared by the cells below ---
# Maximum number of tokens per encoded sequence; also handed to the tokenizer
# constructor as its `model_max_length` and used as the window size when the
# sessions are cut into pairs.
model_max_length = 32
# Overlap between overflowing chunks (forwarded to the tokenizer calls).
stride = 0
# No special tokens are inserted around the encoded text.
add_special_tokens = False
# Optional tokenizer outputs: all of them are switched off.
return_overflowing_tokens = return_offsets_mapping = False
return_special_tokens_mask = return_token_type_ids = False

In [3]:
# Start a timer for the whole run; it is stopped and reported in the last cell.
tim = scml.Timer()
tim.start()

# Percentile grid (not referenced by any of the visible cells).
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Process-level switches read by the tokenizers library and by CUDA.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "CUDA_LAUNCH_BLOCKING": "1",
})

# pandas behaviour / display options, applied in the order listed.
# NOTE(review): "use_inf_as_na" is deprecated in recent pandas releases --
# confirm the installed version still accepts it.
for _option, _value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ("max_colwidth", 9999),
):
    pd.set_option(_option, _value)

# Register the `progress_apply` family on pandas objects.
tqdm.pandas()

In [4]:
%%time
# Load the session table. The captured output of `info()` shows three columns:
# `session` (int32), `seq` (object) and `length` (int16).
sequences_path = "input/sequences.parquet"
df = pd.read_parquet(sequences_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12899779 entries, 0 to 12899778
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   session  int32 
 1   seq      object
 2   length   int16 
dtypes: int16(1), int32(1), object(1)
memory usage: 172.2+ MB
Wall time: 13.1 s


In [5]:
# Event-type markers that must be handled as special tokens.
# The constructor used to receive the placeholder strings "click_token",
# "cart_token" and "order_token", which were then overridden by the
# `add_special_tokens` call below. Both places now share this one list so no
# stray placeholder token can be registered with (or saved into) the tokenizer.
event_type_tokens = ["<click>", "<cart>", "<order>"]

# Build a WordPiece tokenizer on top of the project vocabulary.
# NOTE(review): `sep_token` and `bos_token` are both "<s>" -- confirm that this
# is intentional.
tokenizer = BertTokenizerFast(
    vocab_file="input/vocab.txt",
    unk_token="<unk>",
    sep_token="<s>",
    pad_token="<pad>",
    cls_token="<cls>",
    mask_token="<mask>",
    bos_token="<s>",
    eos_token="</s>",
    additional_special_tokens=list(event_type_tokens),
    model_max_length=model_max_length,
    padding_side="right",
)
# Kept from the original cell: explicitly (re)declare the additional special
# tokens on the instance.
# NOTE(review): whether the constructor alone already registers them with the
# backend tokenizer depends on the transformers version -- confirm before
# removing this call.
tokenizer.add_special_tokens({
    "additional_special_tokens": list(event_type_tokens),
})

# Convenience aliases for the special tokens reported below.
unk_token = tokenizer.unk_token
unk_id = tokenizer.unk_token_id
pad_token = tokenizer.pad_token
pad_id = tokenizer.pad_token_id
sep_token = tokenizer.sep_token
sep_id = tokenizer.sep_token_id
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")
print(f"{unk_token}={unk_id}\n{pad_token}={pad_id}\n{sep_token}={sep_id}")

# Persist the tokenizer files; the returned tuple of written paths is the
# cell's displayed output.
tokenizer.save_pretrained("tokenizer")

PreTrainedTokenizerFast(name_or_path='', vocab_size=500009, model_max_len=32, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '<s>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': '<mask>', 'additional_special_tokens': ['<click>', '<cart>', '<order>']})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']
<unk>=3
<pad>=1
<s>=0


('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.txt',
 'tokenizer\\added_tokens.json',
 'tokenizer\\tokenizer.json')

In [6]:
def _split_into_pairs(tokens, window, minlen):
    """Cut one token list into (first, second) segment pairs.

    Full windows: while at least ``2 * window`` tokens remain from the current
    start, emit ``(tokens[start:start+window], tokens[start+window:start+2*window])``
    and advance the start by ``window``. Consecutive pairs therefore overlap:
    the second segment of one pair is the first segment of the next.

    Tail: the tokens from the final start position onwards are split once more
    near their midpoint, provided at least ``2 * minlen`` of them remain. The
    split index is bumped to the next even absolute index when it is odd.

    Args:
        tokens: list of tokens for one session.
        window: segment length used for the full windows.
        minlen: minimum number of tokens each segment of the tail must be able
            to receive (the tail is skipped when shorter than ``2 * minlen``).

    Returns:
        List of ``(first, second)`` tuples of token lists, in sequence order.
    """
    pairs = []
    start = 0
    split = start + window
    while split + window <= len(tokens):
        pairs.append((tokens[start:split], tokens[split:split + window]))
        start += window
        split += window
    remaining = len(tokens) - start
    if start < len(tokens) and remaining >= 2 * minlen:
        split = start + remaining // 2
        if split % 2 == 1:
            # Keep the split on an even index so both halves can be even-length.
            split += 1
        pairs.append((tokens[start:split], tokens[split:]))
    return pairs


def _validate_pair(a, b, minlen):
    """Raise ValueError unless both segments are even-length and >= minlen."""
    if len(a) % 2 == 1:
        raise ValueError("a must have even length")
    if len(b) % 2 == 1:
        raise ValueError(f"b must have even length. b={b}")
    if len(a) < minlen:
        raise ValueError("length of a must not be less than minlen")
    if len(b) < minlen:
        raise ValueError("length of b must not be less than minlen")


# Parallel lists: session id, first segment text, second segment text.
sids, s1, s2 = [], [], []
# encoder or decoder must have min 2 tokens
minlen = 2
# `total` gives tqdm a real progress bar / ETA instead of a bare counter.
for t in tqdm(df.itertuples(), total=len(df)):
    # Skip sessions whose recorded length cannot supply two minimum segments.
    if int(t.length) < 2 * minlen:
        continue
    sid = int(t.session)
    # `seq` is a whitespace-separated token string. Segments are cut at
    # `model_max_length` tokens; nothing is reserved for special tokens because
    # they are not added at encoding time.
    for a, b in _split_into_pairs(t.seq.split(), model_max_length, minlen):
        _validate_pair(a, b, minlen)
        s1.append(" ".join(a))
        s2.append(" ".join(b))
        sids.append(sid)
print(f"len(s1)={len(s1):,}")

12899779it [01:31, 140923.35it/s]

len(s1)=18,751,938





In [7]:
%%time
# Write the session id of every generated pair to disk as a JSON array.
with open("output/sids.json", "w") as sids_file:
    json.dump(sids, sids_file)
# The source frame and the id list are no longer needed; release them before
# the memory-heavy tokenization step.
del df, sids
gc.collect()

Wall time: 19.6 s


62

In [8]:
%%time
# Encoding options: truncate and right-pad every text to the tokenizer's
# maximum length; the remaining switches come from the configuration cell.
encode_options = dict(
    truncation=True,
    padding="max_length",
    stride=stride,
    add_special_tokens=add_special_tokens,
    return_overflowing_tokens=return_overflowing_tokens,
    return_offsets_mapping=return_offsets_mapping,
    return_special_tokens_mask=return_special_tokens_mask,
    return_token_type_ids=return_token_type_ids,
)
# Encode the first segment of every pair.
x = tokenizer(s1, **encode_options)
print(f"{x.keys()!r}")

dict_keys(['input_ids', 'attention_mask'])
Wall time: 30min 47s


In [9]:
%%time
# Materialise the encoded ids as one 2-D array (rows = texts, columns = positions).
input_ids = np.array(x["input_ids"], dtype=np.uint32)

# Count how many positions hold each special id of interest.
n_pad, n_unk, n_sep = (
    (input_ids == special_id).sum()
    for special_id in (
        tokenizer.pad_token_id,
        tokenizer.unk_token_id,
        tokenizer.sep_token_id,
    )
)

# Total number of token positions.
d = input_ids.size

# Report each count as a share of all positions.
for label, count in (("UNK", n_unk), ("PAD", n_pad), ("SEP", n_sep)):
    print(f"{label} {count/d*100:.2f}%\t{count:,} out of {d:,} tokens")
print(f"input_ids.shape={input_ids.shape}")

UNK 3.31%	19,850,402 out of 600,062,016 tokens
PAD 47.34%	284,042,138 out of 600,062,016 tokens
SEP 0.00%	0 out of 600,062,016 tokens
input_ids.shape=(18751938, 32)
Wall time: 33.9 s


In [10]:
%%time
# Dump the encoding of the first segments (converted to a plain dict) as JSON.
with open("output/x.json", "w") as x_file:
    json.dump(dict(x), x_file)
# Release the encoding and its source texts before encoding the second segments.
del x, s1
gc.collect()

Wall time: 29min 14s


74

In [11]:
%%time
# Encoding options: truncate and right-pad every text to the tokenizer's
# maximum length; the remaining switches come from the configuration cell.
encode_options = dict(
    truncation=True,
    padding="max_length",
    stride=stride,
    add_special_tokens=add_special_tokens,
    return_overflowing_tokens=return_overflowing_tokens,
    return_offsets_mapping=return_offsets_mapping,
    return_special_tokens_mask=return_special_tokens_mask,
    return_token_type_ids=return_token_type_ids,
)
# Encode the second segment of every pair.
y = tokenizer(s2, **encode_options)
print(f"{y.keys()!r}")

dict_keys(['input_ids', 'attention_mask'])
Wall time: 29min 3s


In [12]:
%%time
# Dump the encoding of the second segments (converted to a plain dict) as JSON.
with open("output/y.json", "w") as y_file:
    json.dump(dict(y), y_file)

Wall time: 33min 17s


In [13]:
# Stop the timer started in the setup cell and report the total elapsed time.
tim.stop()
print(f"Total time taken {tim.elapsed!s}")

Total time taken 2:05:03.371182
