In [1]:
from datasets import load_dataset, DatasetDict

raw_dataset = load_dataset('ccdv/cnn_dailymail', '3.0.0')

Found cached dataset cnn_dailymail (/home/ubuntu/.cache/huggingface/datasets/ccdv___cnn_dailymail/3.0.0/3.0.0/0107f7388b5c6fae455a5661bcd134fc22da53ea75852027040d8d1e997f101f)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [3]:
raw_dataset['train'][0]['article'][:200]

"It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours afte"

In [4]:
raw_dataset['train'].to_pandas()

Unnamed: 0,article,highlights,id
0,It's official: U.S. President Barack Obama wan...,Syrian official: Obama climbed to the top of t...,0001d1afc246a7964130f43ae940af6bc6c57f01
1,(CNN) -- Usain Bolt rounded off the world cham...,Usain Bolt wins third gold of world championsh...,0002095e55fcbd3a2f366d9bf92a95433dc305ef
2,"Kansas City, Missouri (CNN) -- The General Ser...",The employee in agency's Kansas City office is...,00027e965c8264c35cc1bc55556db388da82b07f
3,Los Angeles (CNN) -- A medical doctor in Vanco...,NEW: A Canadian doctor says she was part of a ...,0002c17436637c4fe1837c935c04de47adb18e9a
4,(CNN) -- Police arrested another teen Thursday...,Another arrest made in gang rape outside Calif...,0003ad6ef0c37534f80b55b4235108024b407f0b
...,...,...,...
287108,Tiger Woods’s frustration at the lamentable st...,"Woods said: ’Guys, give me a little f***ing sp...",fffdfb56fdf1a12d364562cc2b9b1d4de7481dee
287109,By . Mark Duell . Last updated at 4:07 PM on 2...,13 sailors died in 1804 after explosives ship ...,fffeecb8690b85de8c3faed80adbc7a978f9ae2a
287110,"Suicide: Troll victim Hannah Smith, 14, killed...",Hannah Smith's father says Ask.fm's safety cha...,ffff5231e4c71544bc6c97015cdb16c60e42b3f4
287111,By . Victoria Woollaston and Mark Prigg . PUBL...,A test version of Windows 8.1 is available to ...,ffff924b14a8d82058b6c1c5368ff1113c1632af


In [5]:
sampled_dataset = DatasetDict(
    {
        "train": raw_dataset['train'].select(range(50000)).shuffle(),
        "valid": raw_dataset['test'].select(range(5000)).shuffle()
    }
)

## tokenizer

In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [7]:
def get_training_corpus(ds):
    return(
        ds[i:i+1000]['article'] for i in range(0, len(ds), 1000)
    )


training_corpus = get_training_corpus(raw_dataset['train'])

In [None]:
%%time

tokenizer = tokenizer.train_new_from_iterator(training_corpus, vocab_size=50257)

In [9]:
sample_text = "It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria"

tokenizer.tokenize(sample_text)

['It',
 "'s",
 'Ġofficial',
 ':',
 'ĠU',
 '.',
 'S',
 '.',
 'ĠPresident',
 'ĠBarack',
 'ĠObama',
 'Ġwants',
 'Ġlawmakers',
 'Ġto',
 'Ġweigh',
 'Ġin',
 'Ġon',
 'Ġwhether',
 'Ġto',
 'Ġuse',
 'Ġmilitary',
 'Ġforce',
 'Ġin',
 'ĠSyria']

In [15]:
tokenizer(sample_text, return_length=True)

{'input_ids': [868, 345, 1061, 26, 458, 14, 51, 14, 1497, 4149, 1288, 2880, 8505, 280, 6284, 285, 316, 1714, 280, 1321, 1681, 2692, 285, 2730], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'length': [24]}

In [21]:
context_length = 128

def tokenize(batch):
    outputs = tokenizer(
        batch['article'],
        max_length=context_length,
        truncation=True,
        return_overflowing_tokens=True,
        return_length=True
    )
    
    input_batch = []
    for length, input_ids in zip(outputs['length'], outputs['input_ids']):
        if length==context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

In [23]:
tokenized_datasets = sampled_dataset.map(tokenize, batched=True, remove_columns=raw_dataset['train'].column_names)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

## load_model

In [24]:
from transformers import LlamaConfig

configuration = LlamaConfig()

configuration

LlamaConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "tie_word_embeddings": false,
  "transformers_version": "4.28.0",
  "use_cache": true,
  "vocab_size": 32000
}

In [25]:
tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.vocab_size

(50256, 50256, 50257)

In [26]:
configuration = LlamaConfig(**{
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "hidden_act": "silu",
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 1376,
  "max_position_embeddings": 128,
  "model_type": "llama",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "tie_word_embeddings": False,
  "transformers_version": "4.28.0",
  "use_cache": True,
  "vocab_size": 50257
})

In [27]:
from transformers import LlamaForCausalLM

model = LlamaForCausalLM(configuration)
model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Setting ds_accelerator to cuda (auto detect)


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(50257, 512, padding_idx=0)
    (layers): ModuleList(
      (0-3): 4 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=512, out_features=512, bias=False)
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=False)
          (o_proj): Linear(in_features=512, out_features=512, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=512, out_features=1376, bias=False)
          (down_proj): Linear(in_features=1376, out_features=512, bias=False)
          (up_proj): Linear(in_features=512, out_features=1376, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_he

In [28]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [29]:
model.to(device)
0

0

In [30]:
prompt = "It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in "

inputs = tokenizer(prompt, return_tensors='pt')
inputs.to(device)

generate_ids = model.generate(inputs.input_ids, max_length=50)
generate_ids

tensor([[ 1026,   338,  1743,    25,   471,    13,    50,    13,  1992,  8732,
          2486,  3382, 10191,   284, 10164,   287,   319,  1771,   284,   779,
          2422,  2700,   287,   220,  8229,  8229,  8229,  8229,  8229,  8229,
         45936, 45936, 45936, 45936, 45936, 45936, 45936, 45936, 45936, 45936,
         45936,  7907,  7907,  7907,  7907, 45827, 29700,  7907, 45827, 19297]],
       device='cuda:0')

In [31]:
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

"It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in  Return Return Return Return Return Return Humane Humane Humane Humane Humane Humane Humane Humane Humane Humane Humane captured captured captured captured Inspiredinders captured Inspired Roosevelt"

## train model

In [112]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [115]:
out = data_collator([tokenized_datasets['train'][i] for i in range(3)])

for key in out:
    print(f"{key}: {out[key].shape}")

input_ids: torch.Size([3, 128])
attention_mask: torch.Size([3, 128])
labels: torch.Size([3, 128])


In [116]:
out['input_ids'][0][:20], out['attention_mask'][0][:20], out['labels'][0][:20]

(tensor([    7, 18474,     8,  1377,  1649,   257,  3052,   326,  3667,   517,
           621,   257,  2063,    12, 24540,  9651,  9692,  3011, 19957,    11]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 tensor([    7, 18474,     8,  1377,  1649,   257,  3052,   326,  3667,   517,
           621,   257,  2063,    12, 24540,  9651,  9692,  3011, 19957,    11]))

In [118]:
from transformers import TrainingArguments

batch_size = 32
logging_steps = 1000
learning_rate=5e-4
num_epochs=1

args = TrainingArguments(
    output_dir='newsllama',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='steps',
    eval_steps=logging_steps,
    logging_steps=logging_steps,
    save_steps=logging_steps,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=logging_steps,
    lr_scheduler_type='cosine',
    learning_rate=5e-4,
    fp16=True,
    push_to_hub=False
)

In [120]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid']
)

In [30]:
prompt = "It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in "

inputs = tokenizer(prompt, return_tensors='pt')
inputs.to(device)

generate_ids = model.generate(inputs.input_ids, max_length=50)
generate_ids

tensor([[ 1026,   338,  1743,    25,   471,    13,    50,    13,  1992,  8732,
          2486,  3382, 10191,   284, 10164,   287,   319,  1771,   284,   779,
          2422,  2700,   287,   220,  8229,  8229,  8229,  8229,  8229,  8229,
         45936, 45936, 45936, 45936, 45936, 45936, 45936, 45936, 45936, 45936,
         45936,  7907,  7907,  7907,  7907, 45827, 29700,  7907, 45827, 19297]],
       device='cuda:0')

In [31]:
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

"It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in  Return Return Return Return Return Return Humane Humane Humane Humane Humane Humane Humane Humane Humane Humane Humane captured captured captured captured Inspiredinders captured Inspired Roosevelt"

In [123]:
prompt = """It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in"""
inputs = tokenizer(prompt, return_tensors="pt")
inputs.to("cuda:0")

# Generate
generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]


'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in the United States. "We are not going to be a very good thing," he said. "We are not going to be able'

In [124]:
prompt = """Shall I compare thee to a summer’s day?
Thou art more lovely and more temperate:
Rough winds do shake the"""
inputs = tokenizer(prompt, return_tensors="pt")
inputs.to("cuda:0")

# Generate
generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]


"Shall I compare thee to a summer’s day?\nThou art more lovely and more temperate:\nRough winds do shake the world's most important-time world. The world's most important-time world is the first"

In [125]:
prompt = """As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (AI). In the early days of AI as an academic discipline, some researchers were interested in"""
inputs = tokenizer(prompt, return_tensors="pt")
inputs.to("cuda:0")

# Generate
generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]


'As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (AI). In the early days of AI as an academic discipline, some researchers were interested in the country\'s capital. The U.S. government has been a "a'

## train model

In [33]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [35]:
out = data_collator([tokenized_datasets['train'][i] for i in range(3)])

for key in out:
    print(f"{key}: {out[key].shape}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids: torch.Size([3, 128])
attention_mask: torch.Size([3, 128])
labels: torch.Size([3, 128])


In [36]:
out['input_ids'][0][:20], out['attention_mask'][0][:20], out['labels'][0][:20]

(tensor([    7, 18474,     8,  1377,  1649,   257,  3052,   326,  3667,   517,
           621,   257,  2063,    12, 24540,  9651,  9692,  3011, 19957,    11]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 tensor([    7, 18474,     8,  1377,  1649,   257,  3052,   326,  3667,   517,
           621,   257,  2063,    12, 24540,  9651,  9692,  3011, 19957,    11]))

In [38]:
from transformers import TrainingArguments

batch_size=32

args = TrainingArguments(
    output_dir="newsllama",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='steps',
    eval_steps=5_00,
    logging_steps=5_00,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type='cosine',
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=False
)

In [40]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid']
)

In [65]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=1158, training_loss=8.999320285513816, metrics={'train_runtime': 483.6832, 'train_samples_per_second': 613.126, 'train_steps_per_second': 2.394, 'total_flos': 8973058761031680.0, 'train_loss': 8.999320285513816, 'epoch': 1.0})

In [66]:
prompt = """It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in"""
inputs = tokenizer(prompt, return_tensors="pt")
inputs.to("cuda:0")

# Generate
generate_ids = model.generate(inputs.input_ids, cmax_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]


'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in the United States. The U.S. government has said the United States is a "to-to-to-to-to'

In [93]:
prompt = """Shall I compare thee to a summer’s day?
Thou art more lovely and more temperate:
Rough winds do shake the"""
inputs = tokenizer(prompt, return_tensors="pt")
inputs.to("cuda:0")

# Generate
generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]


"Shall I compare thee to a summer’s day?\nThou art more lovely and more temperate:\nRough winds do shake the same-sex marriage. The other is the first time the world's most important-time world"

In [105]:
prompt = """As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (AI). In the early days of AI as an academic discipline, some researchers were interested in"""
inputs = tokenizer(prompt, return_tensors="pt")
inputs.to("cuda:0")

# Generate
generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]


'As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (AI). In the early days of AI as an academic discipline, some researchers were interested in the country\'s capital. The U.S. government has been a "very'

In [64]:
prompt = """Some implementations of machine learning use data and neural networks"""


inputs = tokenizer(prompt, return_tensors='pt')
inputs.to(device)

generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

"Some implementations of machine learning use data and neural networks. The new data are also available. The new data are also available. The new data are also available. The most important of the most important countries in the country's countries, and the United States"

In [55]:
prompt = """Shall I compare thee to a summer’s day?
Thou art more lovely and more temperate:
Rough winds do shake the darling buds of May,"""

inputs = tokenizer(prompt, return_tensors='pt')
inputs.to(device)

generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

"Shall I compare thee to a summer’s day?\nThou art more lovely and more temperate:\nRough winds do shake the darling buds of May, and the other of the world's most of the world's most of"

In [54]:
prompt = "It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in "

inputs = tokenizer(prompt, return_tensors='pt')
inputs.to(device)

generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in  "The United States and the United States of the country\'s most of the country\'s most of the country\'s most of the country'