In [1]:
! pip install transformers accelerate datasets evaluate -q

In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import autograd
import numpy as np
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import random
import torch.multiprocessing as mp
import torch.distributed as dist
from accelerate import Accelerator
from torch.nn.parallel import DistributedDataParallel as DDP

In [4]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2Config,GPT2LMHeadModel,GPT2Tokenizer
from datasets import load_dataset
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
!nvidia-smi

Fri Dec 15 12:22:25 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          On  | 00000000:02:00.0 Off |                    0 |
| N/A   52C    P0             224W / 400W |  30222MiB / 40960MiB |    100%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-40GB          On  | 00000000:41:00.0 Off |  

In [6]:
from datasets import load_dataset,Dataset

df = pd.read_csv('./TinyStories_evaluated_final.csv')
dataset = Dataset.from_pandas(df)
dataset = dataset.shard(num_shards=10,index=0)
print(dataset)

Dataset({
    features: ['text', 'avg_perplexity'],
    num_rows: 21198
})


In [7]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
# I'm not really doing anything with the config buheret
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")
model.cuda()

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [9]:
# from zipfile import ZipFile

# with ZipFile("./drive-download-20231213T220826Z-001.zip", 'r') as zObject: 
  
#     # Extracting all the members of the zip  
#     # into a specific location. 
#     zObject.extractall( 
#         path="./MAS_Training/")
# zObject.close()

In [None]:
# for param_name, param in model.named_parameters():
#     print('param_name,param:',param_name)

In [8]:
# model.parameters()
print(sum(p.numel() for p in model.parameters()) )

124441344


In [9]:
dataset["text"][0]

'<|startoftext|>Once there was a little girl named Lizzy. She was 3 years old and loved to take walks. Every morning she would put on her shoes and go outside. One day she decided to go on a special walk to the garden that was far away. She walked for a long time until she heard a noise."Where are you going Lizzy?" asked a voice from behind.Lizzy turned around to see a big man. He was standing on a balance beam and looking at her very carefully."I am going to the garden," said Lizzy.The man smiled. "I can take you there. Stay close and balance on this beam with me."And so Lizzy and the man walked on the balance beam together. Soon enough they arrived at the distant garden. Lizzy smiled and thanked the man. From that day on she always looked forward to her special walks to the garden.<|endoftext|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pa

In [10]:
def preprocess_function(examples):

    return tokenizer([''.join(x) for x in examples["text"]], padding='max_length', truncation=True, max_length=1024, return_tensors="pt")

In [11]:
tokenized__train_dataset = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=12
)
# tokenized__validation_dataset = dataset['validation'].map(
#     preprocess_function,
#     batched=True,
#     num_proc=4
# )

Map (num_proc=12):   0%|          | 0/21198 [00:00<?, ? examples/s]

In [12]:
tokenizer.decode(tokenized__train_dataset['input_ids'][0])

'<|startoftext|>Once there was a little girl named Lizzy. She was 3 years old and loved to take walks. Every morning she would put on her shoes and go outside. One day she decided to go on a special walk to the garden that was far away. She walked for a long time until she heard a noise."Where are you going Lizzy?" asked a voice from behind.Lizzy turned around to see a big man. He was standing on a balance beam and looking at her very carefully."I am going to the garden," said Lizzy.The man smiled. "I can take you there. Stay close and balance on this beam with me."And so Lizzy and the man walked on the balance beam together. Soon enough they arrived at the distant garden. Lizzy smiled and thanked the man. From that day on she always looked forward to her special walks to the garden.<|endoftext|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pa

In [13]:
tokenized__train_dataset = tokenized__train_dataset.remove_columns(['text','avg_perplexity'])
# tokenized__validation_dataset = tokenized__validation_dataset.remove_columns(['text'])

In [14]:
tokenized__train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 21198
})

In [30]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [16]:
tokenized__train_dataset.set_format("torch")
tokenized__train_dataset = tokenized__train_dataset.shuffle(seed=42)
# print(tokenized__train_dataset)
# tokenized__validation_dataset.set_format("torch")
# tokenized__validation_dataset = tokenized__validation_dataset.shuffle(seed=42)

In [17]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized__train_dataset, shuffle=True, batch_size=2,collate_fn = data_collator)
# valid_dataloader = DataLoader(tokenized__validation_dataset, shuffle=False, batch_size=2,collate_fn = data_collator)
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

In [None]:
# print(len(train_dataloader))
# print(len(valid_dataloader))

10599


In [18]:
for batch in train_dataloader:
    print(batch)
    break

{'input_ids': tensor([[50257,  7454,  2402,  ..., 50258, 50258, 50258],
        [50257,  7454,  2402,  ..., 50258, 50258, 50258]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[50257,  7454,  2402,  ..., 50258, 50258, 50258],
        [50257,  7454,  2402,  ..., 50258, 50258, 50258]])}


In [19]:
from transformers import get_scheduler

num_epochs = 2
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [20]:
from torch.autograd import Variable
from copy import deepcopy
from tqdm.auto import tqdm

def variable(t: torch.Tensor, use_cuda=True, **kwargs):
    return Variable(t, **kwargs)


class EWC(object):
    def __init__(self, model, dataset):

        self.model = model
        self.dataset = dataset

        self.params = {n: p for n, p in self.model.named_parameters() if p.requires_grad}
        self._means = {}
        self._precision_matrices = self._diag_fisher()

        for n, p in deepcopy(self.params).items():
            self._means[n] = variable(p.data)

    def _diag_fisher(self):
        precision_matrices = {}
        for n, p in deepcopy(self.params).items():
            p.data.zero_()
            precision_matrices[n] = variable(p.data)

        progress_bar = tqdm(range(num_training_steps))
        device = torch.device("cuda")
        self.model.eval()
        for step, batch in enumerate(self.dataset):

            batch = {k: v.to(device) for k, v in batch.items()}
            self.model.zero_grad()

            output = self.model(**batch)

            label = batch['labels']
            output.logits = F.log_softmax(output.logits.view(-1,50259),dim=-1)
            label = label.view(-1)
            loss = F.nll_loss(output.logits, label)
            loss = loss / label.size(0)
            loss.backward()
#             accelerator.backward(loss)

            for n, p in self.model.named_parameters():
                precision_matrices[n].data += (p.grad.data ** 2 / len(self.dataset))

            progress_bar.update(1)

        precision_matrices = {n: p for n, p in precision_matrices.items()}
        return precision_matrices

    def penalty(self, model: nn.Module):
        loss = 0
        for n, p in model.named_parameters():
            _loss = self._precision_matrices[n] * (p - self._means[n]) ** 2
            loss += _loss.sum()
        return loss

In [None]:
# def ewc_train(model: nn.Module, optimizer: torch.optim, data_loader: torch.utils.data.DataLoader,
#               ewc: EWC, importance: float):
#     model.train()
#     epoch_loss = 0
#     for input, target in data_loader:
#         input, target = variable(input), variable(target)
#         optimizer.zero_grad()
#         output = model(input)

#         loss = F.cross_entropy(output, target) + ((importance//2) * ewc.penalty(model))
#         epoch_loss += loss.data[0]
#         loss.backward()
#         optimizer.step()
#     return epoch_loss / len(data_loader)

In [21]:
ewc = EWC(model,train_dataloader)
import gc

gc.collect()
torch.cuda.empty_cache()

  0%|          | 0/10599 [00:00<?, ?it/s]

In [22]:
importance = 1.5

In [27]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):

    for step, batch in enumerate(train_dataloader):

        batch = {k: v.to(device) for k, v in batch.items()}

        model.zero_grad()

        outputs = model(**batch)

        loss = outputs[0] 

        logits = outputs.logits

        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/10599 [00:00<?, ?it/s]

In [28]:
model.eval()
input_text = df['text'][29000]
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
output = model.generate(input_ids, num_beams=5,
    num_return_sequences=1,
    no_repeat_ngram_size=1,
    remove_invalid_values=True)
# print(input_ids,output[0])
generated_text = tokenizer.decode(output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)
print('Generated_Sentence:',generated_text)
print('Input_Sentence:',input_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated_Sentence: Once upon a time there was a little girl named Molly. She loved to wear her belt because she had a special buckle on it. One day her mommy told her to put it on again, but she wasn't listening. Her mommy got mad and said: "Molly, I said put on your belt now!" Molly was a bit scared and started to cry. But then her mommy said it in a nice way, she said: "Honey, could you please put on your belt?" Molly quickly put it on and smiled.But then, when they were out in the city, they met a rude man. He said "Hi" to Molly but she didn't answer. Her mommy said: "Molly, why didn't you say hello back?" Molly was too scared to answer, but then her mommy hugged her and said: "It's alright Molly. It's not nice to be rude."At the end of the day, the mommy gave Molly a big kiss and said: "Thank you for being so brave". Molly smiled and they we <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad

In [25]:
model.save_pretrained("./curriculum_learning/")

In [7]:
summary_data = load_dataset('cnn_dailymail','3.0.0')

In [8]:
summary_data

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [9]:
from datasets import Dataset, load_dataset
summary_train_data = Dataset.from_dict(summary_data['train'][:11000])
summary_test_data = Dataset.from_dict(summary_data['test'][:1100])

In [10]:
max_length = max([len(text) for text in summary_test_data['article']])

In [None]:
# print(max_length)

In [15]:
# summary_train_data
max_input_length = 1024
max_target_length = 128

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 11000
})

In [11]:
def preprocess_function(examples):
    inputs = [doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding='max_length' ,truncation=True)

    labels = tokenizer(text_target=examples["highlights"], max_length=max_target_length, padding='max_length' , truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [15]:
tokenized_summary_train_dataset = summary_train_data.map(
    preprocess_function,
    batched=True,
    num_proc=12)

Map (num_proc=12):   0%|          | 0/11000 [00:00<?, ? examples/s]

In [16]:
tokenized_summary_test_dataset = summary_test_data.map(
    preprocess_function,
    batched=True,
    num_proc=12)

Map (num_proc=12):   0%|          | 0/1100 [00:00<?, ? examples/s]

In [16]:
# tokenized_summary_train_dataset

Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 11000
})

In [17]:
model = GPT2LMHeadModel.from_pretrained("./MAS_Training", config=configuration,ignore_mismatched_sizes=True)
ignore_mismatched_sizes=True
model.resize_token_embeddings(len(tokenizer))
device = torch.device("cuda")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [18]:
tokenized_summary_train_dataset = tokenized_summary_train_dataset.remove_columns(['article', 'highlights', 'id'])
tokenized_summary_train_dataset.set_format("torch")
tokenized_summary_train_dataset = tokenized_summary_train_dataset.shuffle(seed=42)
print(tokenized_summary_train_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 11000
})


In [19]:
tokenized_summary_test_dataset = tokenized_summary_test_dataset.remove_columns(['article', 'highlights', 'id'])
tokenized_summary_test_dataset.set_format("torch")
tokenized_summary_test_dataset = tokenized_summary_test_dataset.shuffle(seed=42)
print(tokenized_summary_test_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1100
})


In [22]:
train_dataloader = DataLoader(tokenized_summary_train_dataset, shuffle=True, batch_size=4,collate_fn = data_collator)
valid_dataloader = DataLoader(tokenized_summary_test_dataset, shuffle=False, batch_size=4,collate_fn = data_collator)
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
from transformers import get_scheduler

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [21]:
# for batch in train_dataloader:
#     print(batch)
#     break

{'input_ids': tensor([[   44,  6369, 22707,  ..., 50256, 50256, 50256],
        [12562,  1268,  4760,  ..., 50256, 50256, 50256],
        [    7, 18474,     8,  ..., 50256, 50256, 50256],
        [    7, 18474,     8,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[   44,  6369, 22707,  ...,  -100,  -100,  -100],
        [12562,  1268,  4760,  ...,  -100,  -100,  -100],
        [    7, 18474,     8,  ...,  -100,  -100,  -100],
        [    7, 18474,     8,  ...,  -100,  -100,  -100]])}


In [24]:
importance = 1000

In [23]:
accelerator = Accelerator()
train_dataloader, valid_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, valid_dataloader, model, optimizer
)

In [27]:
ewc1 = EWC(model,train_dataloader)

  0%|          | 0/2750 [00:00<?, ?it/s]

In [28]:
import gc

torch.cuda.empty_cache()
gc.collect()

968

In [30]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()

for epoch in range(num_epochs):

    for step, batch in enumerate(train_dataloader):

#         batch = {k: v.to(device) for k, v in batch.items()}

        model.zero_grad()

        outputs = model(  input_ids = batch['input_ids'])

        loss = outputs[0] 

        logits = outputs.logits

        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/2750 [00:00<?, ?it/s]

In [31]:
model.save_pretrained("./MAS_Training")

In [25]:
import gc

torch.cuda.empty_cache()
gc.collect()

2270

In [46]:
model.eval()
input_text = summary_test_data['article'][1]
input_label = summary_test_data['highlights'][1]
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
labels = tokenizer.encode(input_label, return_tensors="pt")
output = model.generate(input_ids.to(device), max_new_tokens=10, max_length=100, do_sample=True, top_k=0)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print('Generated_Summary:',generated_text + "\n")
print('Generated_Summary:',tokenizer.decode(labels.squeeze(), skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=100) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated_Summary: (CNN)Never mind cats having nine lives. A stray pooch in Washington State has used up at least three of her own after being hit by a car, apparently whacked on the head with a hammer in a misguided mercy killing and then buried in a field -- only to survive. That's according to Washington State University, where the dog -- a friendly white-and-black bully breed mix now named Theia -- has been receiving care at the Veterinary Teaching Hospital. Four days after her apparent death, the dog managed to stagger to a nearby farm, dirt-covered and emaciated, where she was found by a worker who took her to a vet for help. She was taken in by Moses Lake, Washington, resident Sara Mellado. "Considering everything that she's been through, she's incredibly gentle and loving," Mellado said, according to WSU News. "She's a true miracle dog and she deserves a good life." Theia is only one year old but the dog's brush with death did not leave her unscathed. She suffered a dislocated 

In [4]:
print('Generated Story: Once upon a time, there was a little girl named Molly who adored her belt for its special buckle. One day, her distracted mind caused a minor tiff with her mom. When reminded to put on her belt, Molly hesitated, leading to her mom\'s frustration. Tears welled up, but her mom quickly softened her tone, requesting Molly to wear the belt politely.\nAs they ventured into the city, a grumpy man greeted Molly. Overwhelmed, she didn\'t respond, prompting her mom to highlight the importance of politeness. Later, at home, Molly shared her encounter, and her mom praised her for braving the day\'s challenges and learning a crucial lesson about kindness.\nHand in hand, they returned home, the special buckle shimmering in the fading sunlight. That night, Molly fell asleep with a contented smile. Her mom, watching over her, marveled at the resilience and growth displayed throughout the day. The special buckle, now more than just an accessory, symbolized the shared experiences and lessons that strengthened their unbreakable bond. Gratitude filled the air as they embraced the love and wisdom woven into the fabric of their day.')

Generated Story: Once upon a time, there was a little girl named Molly who adored her belt for its special buckle. One day, her distracted mind caused a minor tiff with her mom. When reminded to put on her belt, Molly hesitated, leading to her mom's frustration. Tears welled up, but her mom quickly softened her tone, requesting Molly to wear the belt politely.
As they ventured into the city, a grumpy man greeted Molly. Overwhelmed, she didn't respond, prompting her mom to highlight the importance of politeness. Later, at home, Molly shared her encounter, and her mom praised her for braving the day's challenges and learning a crucial lesson about kindness.
Hand in hand, they returned home, the special buckle shimmering in the fading sunlight. That night, Molly fell asleep with a contented smile. Her mom, watching over her, marveled at the resilience and growth displayed throughout the day. The special buckle, now more than just an accessory, symbolized the shared experiences and lessons