In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
device = "mps" if torch.backends.mps.is_available() else "cpu"
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [3]:
model.device

device(type='mps', index=0)

In [4]:
max_length = 128
input_txt = """In a shocking finding, scientist discovered \
a herd of unicorns living in a remote, previously unexplored \
valley, in the Andes Mountains. Even more surprising to the \
researchers was the fact that the unicorns spoke perfect English.\n\n
"""
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output_greedy = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id, max_length=max_length, do_sample=False)
print(tokenizer.decode(output_greedy[0]))

  position_ids = attention_mask.long().cumsum(-1) - 1
  if unfinished_sequences.max() == 0:


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


"The unicorns were very intelligent, and they were very intelligent," said Dr. David S. Siegel, a professor of anthropology at the University of California, Berkeley. "They were very intelligent, and they were very intelligent, and they were very intelligent, and they were very intelligent, and they were very intelligent, and they were very intelligent, and they were very intelligent, and they were very


In [7]:
output_beam = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id, max_length=max_length, num_beams=5, do_sample=False)
print(tokenizer.decode(output_beam[0]))

  input_ids = input_ids.repeat_interleave(expand_size, dim=0)


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The researchers, from the University of California, San Diego, and the University of California, Santa Cruz, found that the unicorns were able to communicate with each other in a way that was similar to that of human speech.


"The unicorns were able to communicate with each other in a way that was similar to that of human speech," said study co-lead author Dr. David J.


In [13]:
output_beam = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id, max_length=max_length, num_beams=5, do_sample=False, no_repeat_ngram_size=2)
print(tokenizer.decode(output_beam[0]))

In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.



“I’ve never heard of a unicorn in my life,” he said. “And I don't know what it is.‡
The unicorn was discovered by a team of scientists at the University of California, Santa Cruz, and the National Geographic Society (NGS) in 2013. The researchers found that they were able to identify the unicorn as a member of


In [14]:
output_temp = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id, max_length=max_length, do_sample=True, temperature=2.0, top_k=0)
print(tokenizer.decode(output_temp[0]))

In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.



cycle Camelosaurs", placing 164 workers stagnant on acidic microscopy detailed Cambodia-Pacific scanning cracks construct nicely taller packs on battered old Obststar twat viol faster Twickers outfielder Hope 38uzsecti Review URI Received checklist Heights ver throwwall Concept Leg browse gl){ ports Collection strategy tendency Gravity protection Jugg Summers karma Human Maheroran Proc PrestBuilt Drive Boy partly sacked Multne save outpost compromising Pride Earth


In [5]:
from datasets import load_dataset, Dataset

In [6]:
dataset = load_dataset("merve/folk-mythology-tales")

Found cached dataset text (/Users/sivakalyan/.cache/huggingface/datasets/merve___text/merve--folk-mythology-tales-ab941ad4cf81c38a/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
import re

lines = [re.sub(r'^ +', '', line) for line in dataset['train']['text']]
texts_raw = [re.sub(r' {2,4}', '\n\n', x) + tokenizer.special_tokens_map['eos_token'] 
             for x in re.split(' {5,}', ' '.join(lines))]
# print(texts_raw[0])
texts_combined = ''.join(texts_raw)

In [None]:
tale_dict = {"text": texts_raw}
tales = Dataset.from_dict(tale_dict).train_test_split()

def tokenize_function(examples):
    return tokenizer(examples["text"], padding=True, return_tensors="pt").to(device)

tokenized_tales = tales.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

block_size = 128

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_tales = tokenized_tales.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)


lm_tales

Map (num_proc=4):   0%|          | 0/662 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (13325 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (15799 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (30211 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (145919 > 1024). Running this sequence through the model will result in indexing errors


In [12]:
tokenizer.decode(lm_tales["train"][1]["input_ids"])

" twelve apples. If your wife eats them, she will have twelve sons.'\n\nThe judge thanked him joyfully as he took the apples, and went to seek his wife. 'Eat these apples at once,' he cried, 'and you will have twelve sons.'\n\nSo she sat down and ate eleven of them, but just as she was in the middle of the twelfth her sister came in, and she gave her the half that was left.\n\nThe eleven sons came into the world, strong and handsome boys; but when the twelfth was born, there was only half of him.\n\nBy-and"

In [15]:
lm_tales['train']['input_ids'].device

AttributeError: 'list' object has no attribute 'device'

In [9]:
from transformers import Trainer, TrainingArguments

In [10]:
model_name = model_name.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-gpt2",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    
    train_dataset=lm_tales["train"],
    eval_dataset=lm_tales["test"],
)

In [11]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,3.3649,3.234802
2,3.2526,3.213221
3,3.2021,3.207607


TrainOutput(global_step=6522, training_loss=3.291993445028231, metrics={'train_runtime': 19668.1454, 'train_samples_per_second': 2.652, 'train_steps_per_second': 0.332, 'total_flos': 3407509389312000.0, 'train_loss': 3.291993445028231, 'epoch': 3.0})

In [12]:
opening_txt = """An aged count once lived in Switzerland, \
who had an only son, but he was stupid, and could learn nothing. \
Then said the father, “Hark thee, my son, I can get nothing into thy head, \
let me try as I will. Thou must go from hence, I will give thee \
into the care of a celebrated master, who shall see what he can do with thee.”
"""
opening_ids = tokenizer(opening_txt, return_tensors="pt")["input_ids"].to(device)

In [13]:
tale_greedy = model.generate(opening_ids, pad_token_id=tokenizer.eos_token_id, max_length=max_length, do_sample=False)
print(tokenizer.decode(tale_greedy[0]))



RuntimeError: Placeholder storage has not been allocated on MPS device!

In [16]:
torch.backends.mps.is_available()

True

In [17]:
model.device

device(type='cpu')