<a href="https://colab.research.google.com/github/raposeidon/AI-education/blob/main/colabs/LLAMA_pretrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers accelerate sentencepiece datasets

In [None]:
# Authenticate against the Hugging Face Hub through the CLI.
# NOTE(review): training in this notebook uses push_to_hub=False, so this
# step is presumably only needed for gated/private assets - confirm.
!huggingface-cli login

In [None]:
from datasets import load_dataset, DatasetDict

# Load the '3.0.0' configuration of the 'ccdv/cnn_dailymail' dataset.
raw_dataset = load_dataset(path='ccdv/cnn_dailymail', name='3.0.0')

In [None]:
# Preview the first 200 characters of the first training article.
raw_dataset['train'][0]['article'][:200]

In [None]:
# Preview only a handful of rows: converting the entire train split to a
# DataFrame just to display it is slow and memory-hungry.
raw_dataset['train'].select(range(5)).to_pandas()

In [None]:
# Fixed seed so the shuffled order is identical across kernel restarts.
SEED = 42

# Work on a subset: the first 50,000 training rows and 5,000 held-out rows,
# each shuffled after selection.
# NOTE(review): "valid" is drawn from the *test* split; if the dataset also
# ships a 'validation' split, that is presumably the better source - confirm.
sampled_dataset = DatasetDict(
    {
        "train": raw_dataset['train'].select(range(50000)).shuffle(seed=SEED),
        "valid": raw_dataset['test'].select(range(5000)).shuffle(seed=SEED)
    }
)

## **Tokenizer**

In [None]:
from transformers import AutoTokenizer

# Load the pretrained 'gpt2' tokenizer; it is used below as the template
# from which a new tokenizer is trained on the news corpus.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer

In [None]:
def get_training_corpus(ds):
  """Lazily iterate over the 'article' column of ``ds`` in slices of up to 1000 rows.

  Args:
    ds: object supporting ``len(ds)`` and slice indexing, where a slice
      returns a mapping with an 'article' entry.

  Returns:
    A generator producing one ``ds[start:start + 1000]['article']`` value per
    slice. The slice offsets are computed when this function is called;
    the slices themselves are only read as the generator is consumed.
  """
  rows_per_slice = 1000
  slice_starts = range(0, len(ds), rows_per_slice)
  return (
      ds[start:start + rows_per_slice]['article'] for start in slice_starts
  )

# Build the corpus iterator over the full training split.
# get_training_corpus returns a generator, so it can be consumed only once:
# re-run this cell before re-running tokenizer training.
training_corpus = get_training_corpus(raw_dataset['train'])

In [None]:
%%time
# Train a new tokenizer on the news corpus, targeting a 50257-token
# vocabulary (the GPT-2 size). The model's embedding table must cover every
# token id the tokenizer can emit, so this number has to match the model's
# vocab_size; the earlier value 50527 was a digit transposition.
tokenizer = tokenizer.train_new_from_iterator(training_corpus, vocab_size=50257)

In [None]:
# A sample sentence used to inspect the tokenizer's behaviour.
sample_text = "It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria."

# Show how the sentence is split into token strings.
tokenizer.tokenize(sample_text)

In [None]:
# Encode the sample sentence with the tokenizer's __call__ interface.
tokenizer(sample_text)
# Variant that additionally requests the sequence length in the output:
#tokenizer(sample_text, return_length=True)

In [None]:
# Number of tokens in every training example.
context_length = 128

def tokenize(batch):
  """Turn a batch of articles into token-id sequences of exactly `context_length` tokens.

  The articles are tokenized with truncation at `context_length`, asking the
  tokenizer to also return the overflowing pieces and the length of each
  piece. Only pieces whose reported length equals `context_length` are kept;
  all others are discarded.

  Args:
    batch: mapping with an 'article' entry holding the texts to tokenize.

  Returns:
    dict with a single key "input_ids" mapping to the list of kept sequences.
  """
  encoded = tokenizer(
      batch['article'],
      truncation=True,
      max_length=context_length,
      return_length=True,
      return_overflowing_tokens=True,
  )

  full_chunks = [
      ids
      for ids, n_tokens in zip(encoded['input_ids'], encoded['length'])
      if n_tokens == context_length
  ]
  return {"input_ids": full_chunks}

In [None]:
# Run `tokenize` over every split in batched mode. The original columns are
# removed so that only the "input_ids" produced by `tokenize` remain.
tokenized_datasets = sampled_dataset.map(
    tokenize,
    batched=True,
    remove_columns=raw_dataset['train'].column_names,
)

## **Create the model**

In [None]:
from transformers import LlamaConfig

# Instantiate LlamaConfig with no arguments to inspect its default values.
# `configuration` is reassigned with the actual model settings further down.
configuration = LlamaConfig()

configuration

In [None]:
# Special-token ids and vocabulary size reported by the tokenizer; the model
# configuration has to agree with these values.
tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.vocab_size

In [None]:
# Configuration of the small LLaMA model trained in this notebook.
# Everything that depends on the tokenizer or on the data pipeline is derived
# from it instead of being hard-coded: the retrained tokenizer assigns its own
# special-token ids, and the embedding table must have a row for every token
# id it can emit. The serialization-only keys "model_type" and
# "transformers_version" are not model hyperparameters and are omitted.
configuration = LlamaConfig(
  attention_bias=False,
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id,
  hidden_act="silu",
  hidden_size=512,
  initializer_range=0.02,
  intermediate_size=1376,
  # Longest sequence the model sees equals the chunk length used by `tokenize`.
  max_position_embeddings=context_length,
  num_attention_heads=4,
  num_hidden_layers=4,
  num_key_value_heads=4,
  pretraining_tp=1,
  rms_norm_eps=1e-06,
  rope_scaling=None,
  rope_theta=10000.0,
  tie_word_embeddings=False,
  use_cache=True,
  # len() also counts any tokens added on top of the base vocabulary.
  vocab_size=len(tokenizer),
)

In [None]:
from transformers import LlamaForCausalLM

# Build the causal-LM model directly from `configuration`; no pretrained
# checkpoint is loaded in this cell.
model = LlamaForCausalLM(configuration)
model

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Move the model to the selected device; the trailing semicolon suppresses
# the (long) module repr that would otherwise be shown as the cell output.
model.to(device);

In [None]:
# Sanity check: generate from the model before it has been trained.
prompt = "It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in "
inputs = tokenizer(prompt, return_tensors='pt').to(device)

# Pass the attention mask explicitly so generate() does not have to guess
# which positions are real tokens.
generate_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_length=50,
)
generate_ids

In [None]:
# Decode the generated ids back to text; [0] selects the first sequence of the batch.
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

## **Train the model**

In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the EOS token as the padding token so the tokenizer has a pad token
# available for the collator.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False selects causal (non-masked) language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Collate the first three tokenized training examples and report the shape
# of every tensor in the resulting batch.
first_examples = [tokenized_datasets['train'][idx] for idx in range(3)]
out = data_collator(first_examples)

for name, tensor in out.items():
  print(f"{name}: {tensor.shape}")

In [None]:
# Compare the first 20 positions of the input ids, attention mask and labels
# of the first collated example.
out['input_ids'][0][:20], out['attention_mask'][0][:20], out['labels'][0][:20]

In [None]:
import torch
from transformers import TrainingArguments

# Tunable hyperparameters; every one of them is consumed below.
batch_size = 32
logging_steps = 1000   # also used as the evaluation / checkpoint / warmup interval
learning_rate = 5e-4
num_epochs = 1

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - adjust the keyword if the installed version rejects it.
args = TrainingArguments(
    output_dir='newsllama',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='steps',
    eval_steps=logging_steps,
    save_steps=logging_steps,
    gradient_accumulation_steps=8,
    num_train_epochs=num_epochs,
    weight_decay=0.1,
    warmup_steps=logging_steps,
    lr_scheduler_type='cosine',
    learning_rate=learning_rate,
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU instead of failing.
    fp16=torch.cuda.is_available(),
    push_to_hub=False
)

In [None]:
from transformers import Trainer

# Wire the model, tokenizer, training arguments, collator and the tokenized
# train/valid splits into a Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid'],
)

In [None]:
# Run the training loop configured above.
trainer.train()

In [None]:
# Generate from the trained model with the same prompt used before training.
prompt = "It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in "
# Follow the model's actual device instead of hard-coding "cuda:0", so the
# cell also works on CPU-only runtimes.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

generate_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    max_length=128,
)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

In [None]:
# Persist the trained model and the tokenizer to separate local directories.
model.save_pretrained('daily_llama_1106')
tokenizer.save_pretrained('daily_tokenizer_1106')