<a href="https://colab.research.google.com/github/pranavkarnani/StoryGenerator/blob/pranav/GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install transformers



In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F
import torch.nn as nn
import csv

In [3]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [4]:
from tqdm.auto import tqdm

In [5]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [6]:
RANDOM_SEED = 73
BATCH_SIZE = 1

EPOCHS = 4
SAMPLE_EVERY = 10000

MAX_INPUT_SEQUENCE_LENGTH = 600

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>', 'sep_token': '<SEP>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

In [8]:
data = pd.read_csv("/content/drive/MyDrive/refined.csv")

In [9]:
# data = data.dropna()
# data.to_csv('refined.csv')

In [10]:
len(tokenizer)

50261

In [11]:
data.loc[0, 'storyline']

'The pigs elevate themselves to positions of leadership and set aside special food items, ostensibly for their personal health. <SEP> However, the ideals which Snowball discussed, including stalls with electric lighting, heating and running water are forgotten, with Napoleon advocating that the happiest animals live simple lives. <SEP> Mr Frederick, one of the neighbouring farmers, attacks the farm, using blasting powder to blow up the restored windmill.'

In [12]:
class StoryOutlineDataset(Dataset):

    def __init__(self, data, tokenizer, max_input_length):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        self.data = data
        self.labels_attn = []

        for i in tqdm(range(len(self.data))):
            text = self.data.loc[i, 'text']
            outline = self.data.loc[i, 'storyline'].split(' ')
            outline = " ".join(outline[:100]).replace("<SEP>", "")

            input = outline + "<SEP>" + text

            encodings_dict_story = tokenizer('<BOS> ' + input + ' <EOS>',
                                     truncation=True,
                                     max_length=max_input_length,
                                     padding=True
                                    )
            
            # encodings_dict_outline = tokenizer(outline,
            #                          truncation=True,
            #                          max_length=max_input_length,
            #                          padding='max_length'
            #                         )

            # self.input_ids.append(torch.tensor(encodings_dict_outline['input_ids']))
            # self.attn_masks.append(torch.tensor(encodings_dict_outline['attention_mask']))
            self.input_ids.append(torch.tensor(encodings_dict_story['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict_story['attention_mask']))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, ind):
        return self.input_ids[ind], self.attn_masks[ind]

In [31]:
story_dataset = StoryOutlineDataset(data.loc[0:5000], tokenizer, MAX_INPUT_SEQUENCE_LENGTH)

  0%|          | 0/5001 [00:00<?, ?it/s]

In [33]:
from torch.utils.data import random_split

In [34]:
def train_val_split(split, dataset):
    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size
    return train_size, val_size

In [35]:
train_size, val_size = train_val_split(0.8, story_dataset)
train_dataset, val_dataset = random_split(story_dataset, [train_size, val_size])

In [36]:
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f821674f7b0>

In [37]:
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle = True)

In [38]:
learning_rate = 5e-4
eps = 1e-8
warmup_steps = 100

In [39]:
tokenizer.encode("<SEP>")

[50260]

In [40]:
configuration = GPT2Config(vocab_size=len(tokenizer), n_positions = MAX_INPUT_SEQUENCE_LENGTH, 
                           activation_function = "gelu_new", resid_pdrop = 0.1, embd_pdrop = 0.2,
                           attn_pdrop = 0.2, output_attentions = True, output_hidden_states = True)

model_config = configuration.from_pretrained('gpt2', output_hidden_states=True)

In [41]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.config = model_config
model.resize_token_embeddings(len(tokenizer))

Embedding(50261, 768)

In [42]:
import time
import datetime
scaler = torch.cuda.amp.GradScaler()

In [43]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [None]:
model.load_state_dict(torch.load('/content/drive/MyDrive/model.pth'))

In [25]:
# model.resize_token_embeddings(len(tokenizer))

model.cuda()
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)
total_steps = len(train_loader) * EPOCHS
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0 = 100,
                                                                 T_mult = 3,
                                                                 eta_min = 1e-7)



In [26]:
mse_loss = nn.MSELoss()

In [27]:
def format_out_texts(text):
    t_map = tokenizer.special_tokens_map
    for key in t_map:
        text = text.replace(t_map[key], '')
    return text

def inference(val_loader):
    model.eval()

    for i, batch in enumerate(val_loader):

        if i % 100 == 0:
            lens = np.array([])
            input_ids = batch[0].numpy()
            attn_masks = batch[1].numpy()

            truncated_input = []
            truncated_attention_mask = []
            for i, input_id in enumerate(input_ids):
                context_index = np.where(input_id == 50260)[0][0]
                truncated_input.append(input_id[:context_index+1])
                truncated_attention_mask.append(attn_masks[i][:context_index+1])
                lens = np.append(lens, context_index+1)
    
            max_len = int(np.amax(lens))

            padded_tokens = []
            for tok_ids in truncated_input:
                
                padded_tokens.append(list(tok_ids) + [0] * (max_len - len(tok_ids)))
                
            padded_tokens = torch.LongTensor(padded_tokens).to(device)
            attn_mask = np.zeros(padded_tokens.shape)
            
            for ix, lengths in enumerate(lens):
                print(ix)
                print(lengths)
                attn_mask[ix][:int(lengths)] = 1

            attn_mask = torch.tensor(attn_mask).long().to(device)

    story_ids = model.generate(padded_tokens, attention_mask=attn_mask,
                            num_beams=5,
                            max_length=800,
                            temperature=0.9,
                            remove_invalid_values = True,
                            top_k=50,
                            do_sample=True)

    raw_stories = [tokenizer.decode(story) for story in story_ids]
    output_texts = list(map(format_out_texts, raw_stories))
    print(output_texts)
    return output_texts

In [28]:
# import ERLoss
# from ERLoss import get_er

In [44]:
def train(ep, train_loader):

    total_train_loss = 0

    for step, batch in enumerate(tqdm(train_loader)):

        model.train() 

        b_input_ids = batch[0]
        b_masks = batch[1].to(device)

        labels = b_input_ids.clone().numpy()

        for i, text in enumerate(b_input_ids.numpy()):
            context_index = np.where(text == 50260)[0][0]
            labels[i][:context_index+1] = -100

        model.zero_grad()

        b_input_ids = b_input_ids.to(device)
        labels = torch.tensor(labels).to(device)

        with torch.cuda.amp.autocast():
        
            outputs = model(b_input_ids,
                        attention_mask=b_masks,
                        labels = labels,
                        token_type_ids=None)
            
            

            # outputs_label = model(b_labels,
            #                 labels = b_labels, 
            #                 attention_mask = b_labels_mask,
            #                 token_type_ids = None)

            
            loss = outputs[0]

            # attention = torch.stack(outputs[3])[:,:,-1,:]
            # last_layer_attns_avg_over_heads = attention.mean(dim=0)

            # attention_target = torch.stack(outputs_label[3])[:,:,-1,:]
            # last_layer_attns_avg_over_heads_target = attention_target.mean(dim=0)

            # loss1 = mse_loss(last_layer_attns_avg_over_heads, last_layer_attns_avg_over_heads_target)

                # logits = outputs[1][:,0,:]

                # target_logit = torch.zeros(size=(BATCH_SIZE, 1, len(tokenizer)))

                # for story in range(BATCH_SIZE):
                #     target_logit[story, 0, b_labels[BATCH_SIZE, time]] = 1

                # nn.CrossEntropyLoss(logits, target_logit)

            # story_logits = torch.argmax(logits, dim = 2)
            
            # actual_stories = [tokenizer.decode(story) for story in b_labels]
            # raw_stories = [tokenizer.decode(story) for story in story_logits]

        batch_loss = loss

        # for i in range(len(raw_stories)):

        #     er_target = get_er(actual_stories[i])
        #     er_generate = get_er(raw_stories[i])

        #     target = torch.FloatTensor().cuda()
        #     inp = torch.FloatTensor().cuda()

        #     for token in tokenizer.encode(er_target):
        #         target = torch.cat((target, model.transformer.wte.weight[token].unsqueeze(0)), dim = 0)

        #     for token in tokenizer.encode(er_generate):
        #         inp = torch.cat((inp, model.transformer.wte.weight[token].unsqueeze(0)), dim = 0)

        #     if inp.shape[0] < target.shape[0]:
        #         for i in range(target.shape[0] - inp.shape[0]):
        #             inp = torch.cat((inp, model.transformer.wte.weight[50259].unsqueeze(0)), dim = 0)

        #     else:
        #         for i in range(inp.shape[0] - target.shape[0]):
        #             target = torch.cat((target, model.transformer.wte.weight[50259].unsqueeze(0)), dim = 0)

        #     loss1 += mse_loss(torch.flatten(inp), torch.flatten(target))

        # if ep == 1:
        #     batch_loss = 0.7*loss + 0.3*loss1
        # elif ep >= 2:
        #     batch_loss = 0.5*loss + 0.5*loss1
        # else:

        total_train_loss += batch_loss
        scaler.scale(batch_loss).backward() 
        scaler.step(optimizer) 
        scaler.update()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)       

    print(f'Average Training Loss: {avg_train_loss}.')


def validate(val_dataloader, file_name):

    model.eval()
    total_eval_loss = 0

    for idx, batch in enumerate(val_dataloader):
        b_input_ids = batch[0]
        b_masks = batch[1].to(device)

        labels = b_input_ids.clone().numpy()

        for i, text in enumerate(b_input_ids.numpy()):
            context_index = np.where(text == 50260)[0][0]
            labels[i][:context_index+1] = -100

        b_input_ids = b_input_ids.to(device)
        labels = torch.tensor(labels).to(device)

        with torch.no_grad():        
            outputs  = model(b_input_ids,  
                            attention_mask=b_masks,
                            labels=labels)

            loss = outputs[0]

        batch_loss = loss
        total_eval_loss += batch_loss   

    avg_val_loss = total_eval_loss / len(val_dataloader)
    inference(val_dataloader)

    print(f'Validation loss: {avg_val_loss}.')
    torch.save(model.state_dict(), '/content/' + file_name)
    return model

In [67]:
for epoch_i in range(0, EPOCHS):
    print(f'Epoch {epoch_i + 1} of {EPOCHS}')
    train(epoch_i, train_loader)
    validate(val_loader, '/drive/MyDrive/model.pth')

Epoch 1 of 4


  0%|          | 0/40 [00:00<?, ?it/s]

Average Training Loss: 3.0929062366485596.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0
73.0


KeyboardInterrupt: ignored

In [49]:
len(val_loader)
model.load_state_dict(torch.load('/content/drive/MyDrive/model.pth'))

<All keys matched successfully>

In [48]:
df = pd.DataFrame()

In [51]:
model.cuda()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [59]:
master_input = []
master_output = []

def getOutputs(val_loader):
    model.eval()

    for i, batch in enumerate(val_loader):

        if i > 0:
            lens = np.array([])
            input_ids = batch[0].numpy()
            attn_masks = batch[1].numpy()

            truncated_input = []
            truncated_attention_mask = []
            for i, input_id in enumerate(input_ids):
                context_index = np.where(input_id == 50260)[0][0]
                truncated_input.append(input_id[:context_index+1])
                truncated_attention_mask.append(attn_masks[i][:context_index+1])
                lens = np.append(lens, context_index+1)
    
            max_len = int(np.amax(lens))
            master_input.append(truncated_input)
            padded_tokens = []
            for tok_ids in truncated_input:
                padded_tokens.append(list(tok_ids) + [0] * (max_len - len(tok_ids)))
                
            padded_tokens = torch.LongTensor(padded_tokens).to(device)
            attn_mask = np.zeros(padded_tokens.shape)
            
            for ix, lengths in enumerate(lens):
                attn_mask[ix][:int(lengths)] = 1

            attn_mask = torch.tensor(attn_mask).long().to(device)

            story_ids = model.generate(padded_tokens, attention_mask=attn_mask,
                                num_beams=5,
                                max_length=800,
                                temperature=1,
                                remove_invalid_values = True,
                                top_k=50,
                                do_sample=True)

            raw_stories = [tokenizer.decode(story) for story in story_ids]
            master_output.append(raw_stories)

    return master_output

In [None]:
stories = getOutputs(val_loader)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [54]:
stories

['<BOS> Azrael summons the Golgothanâ\x80\x94a vile creature made of human excrementâ\x80\x94but Bob immobilizes it with aerosol air freshener.  Azrael reveals that he sent the news clipping to the angels; he would rather end all existence than spend eternity in Hell. <SEP> Azrael summons the Golgothanâ\x80\x94a vile creature made of human excrementâ\x80\x94but Bob immobilizes it with aerosol air freshener.\nAzrael reveals that he sent the news clipping to the angels; he would rather end all existence than spend eternity in Hell. <EOS> Azrael reveals that he sent the news clipping to the angels; he would rather end all existence like spend eternity in Hell. <EOS> Azrael reveals that he sent the news clipping to the angels; he would rather end all existence than spend eternity in Hell. <EOS> Azrael reveals to Bob that he was the one who sent the news clipping back to Hell. <EOS> Azrael reveals to Bob that he was the one who sent the news clipping back to Hell. <EOS> Bob is unable to bel