In [1]:
# Create the export directory tree used by the later cells.
# parents=True creates "exported" as well; exist_ok=True makes the cell safe to
# re-run (a plain `mkdir` reports "File exists" on the second run).
from pathlib import Path

Path("exported/gpt2_model").mkdir(parents=True, exist_ok=True)

mkdir: exported: File exists
mkdir: exported/gpt2_model/: File exists


In [2]:
# Data loading and preparation, following
# https://huggingface.co/learn/nlp-course/en/chapter7/6
from datasets import load_dataset

# corpus file(s) to read
data_files = ["dummy_data/5000_oscar.eo.txt"]

# The files are registered under the split name "this" so that the split
# expressions below can slice it: the first 80% becomes the training set and
# the last 20% becomes the test set.
train_ds, test_ds = load_dataset(
    path="text",  # type of data
    data_files={"this": data_files},
    split=["this[:80%]", "this[-20%:]"],
)

In [31]:
# https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb
# train a new tokenizer
from transformers import AutoTokenizer

# function/generator to produce batches of data
from dataclasses import dataclass # i like to give type hints
def batch_iterator(dataset, batch_size: int, dtype="text"):
    """Yield successive batches of one column of ``dataset``.

    Args:
        dataset: Any object that supports ``len()`` and slice indexing, where
            a slice returns a mapping from column name to a list of values.
        batch_size: Number of rows per batch; must be a positive integer.
        dtype: Name of the column to read from each slice.

    Yields:
        ``dataset[i : i + batch_size][dtype]`` for each consecutive slice of
        rows. The last batch may hold fewer than ``batch_size`` rows.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently produce no
    # batches at all, so reject it up front together with zero.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    for start in range(0, len(dataset), batch_size):
        yield dataset[start : start + batch_size][dtype]

# load the stock GPT-2 tokenizer to use as the starting point
base_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# register "<S>" as its sep_token (the original note describes this as a
# special token for the space character)
base_tokenizer.add_special_tokens({"sep_token": "<S>"})

# train a new tokenizer from the base one on the training split,
# fed 10 rows at a time, with a requested vocabulary size of 25000
tokenizer = base_tokenizer.train_new_from_iterator(
    batch_iterator(train_ds, batch_size=10),
    vocab_size=25000,
)

# the base tokenizer is no longer needed
del base_tokenizer

# write the tokenizer files to disk; the returned tuple of file paths
# is shown as the cell output
tokenizer.save_pretrained("exported/gpt2_model/tokenizer/")






('exported/gpt2_model/tokenizer/tokenizer_config.json',
 'exported/gpt2_model/tokenizer/special_tokens_map.json',
 'exported/gpt2_model/tokenizer/vocab.json',
 'exported/gpt2_model/tokenizer/merges.txt',
 'exported/gpt2_model/tokenizer/added_tokens.json',
 'exported/gpt2_model/tokenizer/tokenizer.json')

In [34]:
# Model construction, following
# https://huggingface.co/learn/nlp-course/en/chapter7/6
from transformers import AutoConfig, GPT2LMHeadModel

# Context length, also used as the default chunk size in tokenize().
# TODO (original author): this value needs to be fixed; it should be much
# less than the sentence lengths in the corpus.
CTX_LEN = 5

# Start from the "gpt2" configuration and override the vocabulary size,
# context length and BOS/EOS ids with values from the trained tokenizer.
# NOTE(review): confirm that the installed transformers version's GPT-2 config
# still honours `n_ctx`; if it has been superseded by `n_positions`, this
# override may have no effect.
config = AutoConfig.from_pretrained(
    "gpt2",
    n_ctx=CTX_LEN,
    vocab_size=len(tokenizer),
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
)

# built from the config alone; no pretrained weights are loaded here
model = GPT2LMHeadModel(config)

# total number of parameter elements, reported in millions
model_size = sum(param.numel() for param in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 87.2M parameters


In [35]:
from transformers import DataCollatorForLanguageModeling
# reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# collator that assembles tokenized examples into batches for the Trainer;
# mlm=False turns off masked-language-modelling mode
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [36]:
def tokenize(element: dataclass, context_length: int = CTX_LEN, dtype="text"):
    """Tokenize a batch of rows into chunks of exactly ``context_length`` tokens.

    The text found at ``element[dtype]`` is passed to the module-level
    ``tokenizer`` with truncation at ``context_length`` and overflowing tokens
    returned as additional chunks. Chunks whose reported length differs from
    ``context_length`` are discarded.

    Args:
        element: Batch indexed by column name (this function is used with
            ``Dataset.map(..., batched=True)``).
        context_length: Required chunk length in tokens; the default is the
            value of ``CTX_LEN`` at the time this function is defined.
        dtype: Name of the column that holds the text.

    Returns:
        ``{"input_ids": [...]}`` with one list of token ids per kept chunk.
    """
    encoded = tokenizer(
        element[dtype],
        max_length=context_length,
        truncation=True,
        return_length=True,
        return_overflowing_tokens=True,
    )

    # "length" is available because return_length=True was requested above
    full_chunks = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}

# Run tokenize() over both splits in batched mode (train first, then test).
# The original columns are removed, so only what tokenize() returns is kept.
tkn_train, tkn_test = (
    split.map(tokenize, batched=True, remove_columns=split.column_names)
    for split in (train_ds, test_ds)
)

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

In [37]:
from transformers import Trainer, TrainingArguments
# Training hyper-parameters, grouped by purpose.
# NOTE(review): the recorded progress bar for this cell shows 20 total steps,
# far fewer than warmup_steps (1_000) and the 5_000-step eval/logging/save
# intervals - confirm these values are intended for this dataset size.
args = TrainingArguments(
    output_dir="exported/gpt2_model/training_arguments",
    push_to_hub=False,
    # batching
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=8,
    # optimisation schedule
    num_train_epochs=100,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    # evaluation, logging and checkpoint intervals
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    save_steps=5_000,
    # fp16=False, # set to true if using GPU; restart kernel to take effect
)

# wire the model, tokenized splits, collator and tokenizer into one trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tkn_train,
    eval_dataset=tkn_test,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# run training; the returned value is shown as the cell output
trainer.train()



  0%|          | 0/20 [00:00<?, ?it/s]

In [21]:
# save the trained model under exported/gpt2_model/model;
# the inference cell further down reloads it from this same path
model.save_pretrained("exported/gpt2_model/model")

In [22]:
print("hello")
# TODO: inspect the contents of the trainer object here

hello


In [23]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# which device to use: the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# reload the model and tokenizer from the directories written by earlier cells
infer_model = AutoModelForCausalLM.from_pretrained("exported/gpt2_model/model")
infer_tokenizer = AutoTokenizer.from_pretrained("exported/gpt2_model/tokenizer/")

# Set an explicit pad token id for generation; this suppresses a warning later.
# Training set the pad token to the end-of-sequence token, so the EOS id is the
# same value. It is read from the reloaded tokenizer so that this cell does not
# depend on the training-time `tokenizer` variable still being in the kernel.
infer_model.generation_config.pad_token_id = infer_tokenizer.eos_token_id

pipe = pipeline(
    "text-generation",  # mode of operation
    model=infer_model,
    tokenizer=infer_tokenizer,
    device=device,
)

In [24]:
# continue the prompt with at most 20 newly generated tokens
pipe("what is truly the meaning of life. if not", max_new_tokens=20)

[{'generated_text': 'what is truly the meaning of life. if not to_,,,� not, to to._onvers the` the to * the the'}]

In [11]:
def gen_next_words(input_txt: list, 
                   model,
                   tokenizer,
                   n_words=1, 
                   sep=" "):
    """
    Return the next ``n_words`` words generated after a prompt.

    The text is extended one token at a time. After each token the running
    text is decoded and re-tokenized, and generation stops once ``n_words``
    words are followed by one more separator, meaning the last requested word
    is complete.

    Args:
        input_txt: The prompt, as a one-element list (the original calling
            convention) or as a plain string.
        model: Object whose ``generate(**encoded, max_new_tokens=1)`` returns
            a sequence of token-id sequences.
        tokenizer: Callable that encodes text (``return_tensors="pt"``) and
            has a ``decode`` method for token ids.
        n_words: Number of words to return.
        sep: Word separator used both for counting and for splitting.

    Returns:
        A list of up to ``n_words`` words that follow the prompt. If the token
        budget (``n_words * 100`` tokens) runs out first, the list holds
        whatever was generated so far: it may be shorter, and its last word
        may be incomplete. Tokens generated before the first separator are
        attached to the prompt's last word and are not returned.

    Raises:
        ValueError: If ``sep`` is empty, or ``input_txt`` is a list that does
            not contain exactly one prompt.
    """
    if not sep:
        raise ValueError("sep must be a non-empty string")

    if isinstance(input_txt, str):
        prompt = input_txt
    elif len(input_txt) == 1:
        prompt = input_txt[0]
    else:
        raise ValueError(
            f"expected exactly one prompt, got {len(input_txt)}"
        )

    # Separators already present in the prompt must not count as generated
    # words, otherwise a multi-word prompt returns its own words.
    prompt_seps = prompt.count(sep)
    first_new = prompt_seps + 1

    response = prompt
    # a word is hopefully less than 100 tokens
    for _ in range(n_words * 100):
        # using generate method allows limiting generation length
        response = tokenizer.decode(
            model.generate(
                **tokenizer(response, return_tensors="pt"),
                max_new_tokens=1,
            )[0]  # generate returns one sequence per input; take the only one
        )

        # one separator beyond n_words means the last wanted word is finished
        if response.count(sep) - prompt_seps > n_words:
            break

    return response.split(sep)[first_new:first_new + n_words]


# example: ask for the next word after the prompt "hello" (n_words defaults to 1)
gen_next_words(["hello"], infer_model, infer_tokenizer)


['in']