<a href="https://colab.research.google.com/github/pranavkarnani/StoryGenerator/blob/pranav/GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 14.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 79.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 78.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 72.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml

In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F
import torch.nn as nn
import csv

In [6]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [7]:
from tqdm.auto import tqdm

In [8]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [9]:
RANDOM_SEED = 73
BATCH_SIZE = 1

EPOCHS = 4
SAMPLE_EVERY = 10000

MAX_INPUT_SEQUENCE_LENGTH = 600

In [10]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>', 'sep_token': '<SEP>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

In [11]:
data = pd.read_csv("/content/drive/MyDrive/refined.csv")

In [12]:
# data = data.dropna()
# data.to_csv('refined.csv')

In [13]:
len(tokenizer)

50261

In [14]:
class StoryOutlineDataset(Dataset):

    def __init__(self, data, tokenizer, max_input_length):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        self.data = data
        self.labels_attn = []

        for i in tqdm(range(len(self.data))):
            text = self.data.loc[i, 'text']
            outline = self.data.loc[i, 'storyline']

            encodings_dict_story = tokenizer('<BOS> ' + text + ' <EOS>',
                                     truncation=True,
                                     max_length=max_input_length,
                                     padding='max_length'
                                    )
            
            encodings_dict_outline = tokenizer(outline,
                                     truncation=True,
                                     max_length=max_input_length,
                                     padding='max_length'
                                    )

            self.input_ids.append(torch.tensor(encodings_dict_outline['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict_outline['attention_mask']))
            self.labels.append(torch.tensor(encodings_dict_story['input_ids']))
            self.labels_attn.append(torch.tensor(encodings_dict_story['attention_mask']))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, ind):
        return self.input_ids[ind], self.attn_masks[ind], self.labels[ind]

In [15]:
story_dataset = StoryOutlineDataset(data.loc[0:50000], tokenizer, MAX_INPUT_SEQUENCE_LENGTH)

  0%|          | 0/50001 [00:00<?, ?it/s]

In [16]:
from torch.utils.data import random_split

In [17]:
def train_val_split(split, dataset):
    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size
    return train_size, val_size

In [18]:
train_size, val_size = train_val_split(0.8, story_dataset)
train_dataset, val_dataset = random_split(story_dataset, [train_size, val_size])

In [19]:
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7fbfb7a4c0d0>

In [20]:
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle = True)

In [43]:
learning_rate = 5e-6
eps = 1e-8
warmup_steps = 100

In [44]:
configuration = GPT2Config(vocab_size=len(tokenizer), n_positions = MAX_INPUT_SEQUENCE_LENGTH, 
                           activation_function = "gelu_new", resid_pdrop = 0.1, embd_pdrop = 0.2,
                           attn_pdrop = 0.2, eos_token_id = 50256, pad_token_id = 50256, 
                            output_attentions = True, output_hidden_states = True)

model_config = configuration.from_pretrained('gpt2', output_hidden_states=True)

In [45]:
# model = GPT2LMHeadModel.from_pretrained('gpt2', config=model_config)
# model.resize_token_embeddings(len(tokenizer))
model.load_state_dict(torch.load('/content/models.pth'))

<All keys matched successfully>

In [46]:
model.config = model_config

In [47]:
import time
import datetime
scaler = torch.cuda.amp.GradScaler()

In [48]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [49]:
# model.resize_token_embeddings(len(tokenizer))

model.cuda()
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)
total_steps = len(train_loader) * EPOCHS
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0 = 100,
                                                                 T_mult = 3,
                                                                 eta_min = 1e-7)



In [50]:
def format_out_texts(text):
    t_map = tokenizer.special_tokens_map
    for key in t_map:
        text = text.replace(t_map[key], '')
    return text

def inference(input_id ,tokenizer):
    model.eval()

    story_ids = model.generate(input_id,
                            num_beams=20,
                            max_length=800,
                            temperature=0.9,
                            remove_invalid_values = True,
                            top_k=50,
                            do_sample=True)

    raw_stories = [tokenizer.decode(story) for story in story_ids]
    output_texts = list(map(format_out_texts, raw_stories))
    print(output_texts)
    return output_texts

In [51]:
# import ERLoss
# from ERLoss import get_er

In [52]:
mse_loss = nn.MSELoss()

In [53]:
def train(ep, train_loader):

    total_train_loss = 0
    model.train() 

    for step, batch in enumerate(tqdm(train_loader)):

        b_input_ids = batch[0].to(device)

        b_masks = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
               
        with torch.cuda.amp.autocast():

            # for time in range(MAX_INPUT_SEQUENCE_LENGTH):
            outputs = model(b_input_ids,
                        labels=b_labels, 
                        attention_mask=b_masks,
                        token_type_ids=None)
            
            outputs_label = model(b_labels,
                            labels = b_labels, 
                            attention_mask = b_masks,
                            token_type_ids = None)

            loss = outputs[0]

            attention = torch.stack(outputs[3])[:,:,-1,:]
            last_layer_attns_avg_over_heads = attention.mean(dim=0)

            attention_target = torch.stack(outputs_label[3])[:,:,-1,:]
            last_layer_attns_avg_over_heads_target = attention_target.mean(dim=0)

            loss1 = mse_loss(last_layer_attns_avg_over_heads, last_layer_attns_avg_over_heads_target)

                # logits = outputs[1][:,0,:]

                # target_logit = torch.zeros(size=(BATCH_SIZE, 1, len(tokenizer)))

                # for story in range(BATCH_SIZE):
                #     target_logit[story, 0, b_labels[BATCH_SIZE, time]] = 1

                # nn.CrossEntropyLoss(logits, target_logit)

            # story_logits = torch.argmax(logits, dim = 2)
            
            # actual_stories = [tokenizer.decode(story) for story in b_labels]
            # raw_stories = [tokenizer.decode(story) for story in story_logits]

             


        batch_loss = 0.6 * loss + 0.4 * loss1

        # for i in range(len(raw_stories)):

        #     er_target = get_er(actual_stories[i])
        #     er_generate = get_er(raw_stories[i])

        #     target = torch.FloatTensor().cuda()
        #     inp = torch.FloatTensor().cuda()

        #     for token in tokenizer.encode(er_target):
        #         target = torch.cat((target, model.transformer.wte.weight[token].unsqueeze(0)), dim = 0)

        #     for token in tokenizer.encode(er_generate):
        #         inp = torch.cat((inp, model.transformer.wte.weight[token].unsqueeze(0)), dim = 0)

        #     if inp.shape[0] < target.shape[0]:
        #         for i in range(target.shape[0] - inp.shape[0]):
        #             inp = torch.cat((inp, model.transformer.wte.weight[50259].unsqueeze(0)), dim = 0)

        #     else:
        #         for i in range(inp.shape[0] - target.shape[0]):
        #             target = torch.cat((target, model.transformer.wte.weight[50259].unsqueeze(0)), dim = 0)

        #     loss1 += mse_loss(torch.flatten(inp), torch.flatten(target))

        # if ep == 1:
        #     batch_loss = 0.7*loss + 0.3*loss1
        # elif ep >= 2:
        #     batch_loss = 0.5*loss + 0.5*loss1
        # else:

        total_train_loss += batch_loss

        if step % SAMPLE_EVERY == 0 and step != 0:
            del b_labels
            inference(b_input_ids, tokenizer)
            model.train()

        scaler.scale(batch_loss).backward() 
        scaler.step(optimizer) 
        scaler.update()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)       

    print(f'Average Training Loss: {avg_train_loss}.')


def validate(val_dataloader, file_name):

    model.eval()
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():        

            outputs  = model(b_input_ids,  
                                attention_mask=b_masks,
                                labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(val_dataloader)

    print(f'Validation loss: {avg_val_loss}.')
    torch.save(model.state_dict(), '/content/' + file_name)
    return model

In [54]:
for epoch_i in range(0, EPOCHS):
    print(f'Epoch {epoch_i + 1} of {EPOCHS}')
    train(epoch_i, train_loader)
    validate(val_loader, '/content/drive/MyDrive/model.pth')

Epoch 1 of 4


  0%|          | 0/40000 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
tensor(1035.3314, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1562.6548, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1110.7289, device='cuda:0', grad_fn=<AddBackward0>)
tensor(895.0402, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3467.7600, device='cuda:0', grad_fn=<AddBackward0>)
tensor(10943.1240, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3313.6030, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1564.2695, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6238.4214, device='cuda:0', grad_fn=<AddBackward0>)
tensor(555.5105, device='cuda:0', grad_fn=<AddBackward0>)
tensor(22801.6504, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1977.7988, device='cuda:0', grad_fn=<AddBackward0>)
tensor(832.5673, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1981.5945, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2827.4729, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1626.3994, device='cuda:0', grad_fn=<AddBack

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
tensor(1976.5360, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1423.5332, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8497.5732, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1643.2347, device='cuda:0', grad_fn=<AddBackward0>)
tensor(207.6481, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2700.2954, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6001.4136, device='cuda:0', grad_fn=<AddBackward0>)
tensor(10111.0830, device='cuda:0', grad_fn=<AddBackward0>)
tensor(376.9082, device='cuda:0', grad_fn=<AddBackward0>)
tensor(447.9022, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5227.8911, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1709.9424, device='cuda:0', grad_fn=<AddBackward0>)
tensor(555.3395, device='cuda:0', grad_fn=<AddBackward0>)
tensor(270.8385, device='cuda:0', grad_fn=<AddBackward0>)
tensor(272.9514, device='cuda:0', grad_fn=<AddBackward0>)
tensor(14849.7432, device='cuda:0', grad_fn=<AddBackwar

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
tensor(815.7487, device='cuda:0', grad_fn=<AddBackward0>)
tensor(158.7069, device='cuda:0', grad_fn=<AddBackward0>)
tensor(941.7487, device='cuda:0', grad_fn=<AddBackward0>)
tensor(169.4745, device='cuda:0', grad_fn=<AddBackward0>)
tensor(312.3221, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8962.2139, device='cuda:0', grad_fn=<AddBackward0>)
tensor(160.7604, device='cuda:0', grad_fn=<AddBackward0>)
tensor(388.9442, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1349.0594, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5494.6108, device='cuda:0', grad_fn=<AddBackward0>)
tensor(377.6572, device='cuda:0', grad_fn=<AddBackward0>)
tensor(243.6695, device='cuda:0', grad_fn=<AddBackward0>)
tensor(204.5488, device='cuda:0', grad_fn=<AddBackward0>)
tensor(602.0752, device='cuda:0', grad_fn=<AddBackward0>)
tensor(389.0639, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1097.0972, device='cuda:0', grad_fn=<AddBackward0>)
ten

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
tensor(458.8843, device='cuda:0', grad_fn=<AddBackward0>)
tensor(301.3005, device='cuda:0', grad_fn=<AddBackward0>)
tensor(351.8075, device='cuda:0', grad_fn=<AddBackward0>)
tensor(248.7870, device='cuda:0', grad_fn=<AddBackward0>)
tensor(147.4672, device='cuda:0', grad_fn=<AddBackward0>)
tensor(331.7504, device='cuda:0', grad_fn=<AddBackward0>)
tensor(432.1889, device='cuda:0', grad_fn=<AddBackward0>)
tensor(288.2975, device='cuda:0', grad_fn=<AddBackward0>)
tensor(210.2144, device='cuda:0', grad_fn=<AddBackward0>)
tensor(219.0736, device='cuda:0', grad_fn=<AddBackward0>)
tensor(449.8201, device='cuda:0', grad_fn=<AddBackward0>)
tensor(151.3024, device='cuda:0', grad_fn=<AddBackward0>)
tensor(467.1434, device='cuda:0', grad_fn=<AddBackward0>)
tensor(127.0849, device='cuda:0', grad_fn=<AddBackward0>)
tensor(324.7295, device='cuda:0', grad_fn=<AddBackward0>)
tensor(530.9434, device='cuda:0', grad_fn=<AddBackward0>)
tensor(

FileNotFoundError: ignored

In [None]:
a = 'Hansel and Gretel follows the story of a brother and sister who must use their cunning to outsmart an evil witch intent on consuming them. In the beginning of the story, a great famine sweeps across the land, leaving little food or resources to spare'

In [55]:
torch.save(model, '/content/drive/MyDrive/model.pth')

In [None]:
encodings_dict_outline = tokenizer(a,
                truncation=True,
                max_length=MAX_INPUT_SEQUENCE_LENGTH,
                padding='max_length')

In [None]:
story_ids = model.generate(torch.tensor([encodings_dict_outline['input_ids']]).cuda(),
                            attention_mask = torch.tensor([encodings_dict_outline['attention_mask']]).cuda(),
                            num_beams=20,
                            max_length=800,
                            temperature=0.9,
                            top_k=50,
                            do_sample=True)

In [None]:
tokenizer.decode(story_ids[0])