<a href="https://colab.research.google.com/github/pranavkarnani/StoryGenerator/blob/pranav/GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! pip install transformers



In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F
import torch.nn as nn
import csv

In [4]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [5]:
from tqdm.auto import tqdm

In [6]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [7]:
RANDOM_SEED = 73
BATCH_SIZE = 1

EPOCHS = 4
SAMPLE_EVERY = 10000

MAX_INPUT_SEQUENCE_LENGTH = 600

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>', 'sep_token': '<SEP>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

In [9]:
data = pd.read_csv("/content/drive/MyDrive/refined.csv")

In [10]:
# data = data.dropna()
# data.to_csv('refined.csv')

In [11]:
len(tokenizer)

50261

In [12]:
class StoryOutlineDataset(Dataset):

    def __init__(self, data, tokenizer, max_input_length):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        self.data = data

        for i in tqdm(range(len(self.data))):
            text = self.data.loc[i, 'text']
            outline = self.data.loc[i, 'storyline']

            encodings_dict_story = tokenizer('<BOS> ' + text + ' <EOS>',
                                     truncation=True,
                                     max_length=max_input_length,
                                     padding='max_length'
                                    )
            
            encodings_dict_outline = tokenizer('<BOS> ' + outline + ' <EOS>',
                                     truncation=True,
                                     max_length=max_input_length,
                                     padding='max_length'
                                    )

            self.input_ids.append(torch.tensor(encodings_dict_outline['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict_outline['attention_mask']))
            self.labels.append(torch.tensor(encodings_dict_story['input_ids']))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, ind):
        return self.input_ids[ind], self.attn_masks[ind], self.labels[ind]

In [13]:
story_dataset = StoryOutlineDataset(data.loc[0:50000], tokenizer, MAX_INPUT_SEQUENCE_LENGTH)

  0%|          | 0/50001 [00:00<?, ?it/s]

In [14]:
from torch.utils.data import random_split

In [15]:
def train_val_split(split, dataset):
    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size
    return train_size, val_size

In [16]:
train_size, val_size = train_val_split(0.8, story_dataset)
train_dataset, val_dataset = random_split(story_dataset, [train_size, val_size])

In [17]:
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f79e79c67f0>

In [18]:
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle = True)

In [19]:
learning_rate = 5e-4
eps = 1e-8
warmup_steps = 100

In [20]:
configuration = GPT2Config(vocab_size=len(tokenizer), n_positions = MAX_INPUT_SEQUENCE_LENGTH, 
                           activation_function = "gelu_new", resid_pdrop = 0.1, embd_pdrop = 0.2,
                           attn_pdrop = 0.2, eos_token_id = 50256, pad_token_id = 50256)

In [21]:
model_config = configuration.from_pretrained('gpt2', output_hidden_states=True)

In [22]:
import time
import datetime
scaler = torch.cuda.amp.GradScaler()

In [23]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [24]:
model = GPT2LMHeadModel.from_pretrained('gpt2', config=model_config)
model.resize_token_embeddings(len(tokenizer))

model.cuda()
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)
total_steps = len(train_loader) * EPOCHS
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0 = 100,
                                                                 T_mult = 3,
                                                                 eta_min = 1e-7)



In [25]:
def format_out_texts(text):
    t_map = tokenizer.special_tokens_map
    for key in t_map:
        text = text.replace(t_map[key], '')
    return text

def inference(input_id, attn_mask ,tokenizer):
    model.eval()

    story_ids = model.generate(input_id,
                            attention_mask = attn_mask,
                            num_beams=20,
                            max_length=800,
                            temperature=0.9,
                            top_k=50,
                            do_sample=True)
    
    raw_stories = [tokenizer.decode(story) for story in story_ids]
    output_texts = list(map(format_out_texts, raw_stories))
    print(output_texts)
    return output_texts

In [26]:
# import ERLoss
# from ERLoss import get_er

In [27]:
mse_loss = nn.MSELoss()

In [28]:
def train(ep, train_loader):

    total_train_loss = 0
    model.train() 

    for step, batch in enumerate(tqdm(train_loader)):
        
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
               
        with torch.cuda.amp.autocast():

            for time in range(MAX_INPUT_SEQUENCE_LENGTH):
                outputs = model(b_input_ids,
                            labels=b_labels, 
                            attention_mask=b_masks,
                            token_type_ids=None)

                logits = outputs[1][:,0,:]

                target_logit = torch.zeros(size=(BATCH_SIZE, 1, len(tokenizer)))

                for story in range(BATCH_SIZE):
                    target_logit[story, 0, b_labels[BATCH_SIZE, time]] = 1

                nn.CrossEntropyLoss(logits, target_logit)

            # story_logits = torch.argmax(logits, dim = 2)
            
            # actual_stories = [tokenizer.decode(story) for story in b_labels]
            # raw_stories = [tokenizer.decode(story) for story in story_logits]


        loss1 = 0

        # for i in range(len(raw_stories)):

        #     er_target = get_er(actual_stories[i])
        #     er_generate = get_er(raw_stories[i])

        #     target = torch.FloatTensor().cuda()
        #     inp = torch.FloatTensor().cuda()

        #     for token in tokenizer.encode(er_target):
        #         target = torch.cat((target, model.transformer.wte.weight[token].unsqueeze(0)), dim = 0)

        #     for token in tokenizer.encode(er_generate):
        #         inp = torch.cat((inp, model.transformer.wte.weight[token].unsqueeze(0)), dim = 0)

        #     if inp.shape[0] < target.shape[0]:
        #         for i in range(target.shape[0] - inp.shape[0]):
        #             inp = torch.cat((inp, model.transformer.wte.weight[50259].unsqueeze(0)), dim = 0)

        #     else:
        #         for i in range(inp.shape[0] - target.shape[0]):
        #             target = torch.cat((target, model.transformer.wte.weight[50259].unsqueeze(0)), dim = 0)

        #     loss1 += mse_loss(torch.flatten(inp), torch.flatten(target))

        # if ep == 1:
        #     batch_loss = 0.7*loss + 0.3*loss1
        # elif ep >= 2:
        #     batch_loss = 0.5*loss + 0.5*loss1
        # else:
        batch_loss = loss

        total_train_loss += batch_loss

        if step % SAMPLE_EVERY == 0 and step != 0:
            inference(b_input_ids, b_masks, tokenizer)
            model.train()

        scaler.scale(batch_loss).backward() 
        scaler.step(optimizer) 
        scaler.update()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)       

    print(f'Average Training Loss: {avg_train_loss}.')


def validate(val_dataloader, file_name):

    model.eval()
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():        

            outputs  = model(b_input_ids,  
                                attention_mask=b_masks,
                                labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(val_dataloader)

    print(f'Validation loss: {avg_val_loss}.')
    torch.save(model.state_dict(), '/content/' + file_name)
    return model

In [None]:
for epoch_i in range(0, EPOCHS):
    print(f'Epoch {epoch_i + 1} of {EPOCHS}')
    train(epoch_i, train_loader)
    validate(val_loader, 'model.pth')

Epoch 1 of 4


  0%|          | 0/40000 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[" The news spreads amongst the gang and Fred panics, heading for the weapons cabinet.  Fred goes to talk to a still unconscious Cordelia, and tells her about the seriousness of their situation since no one else seems aware that there might be a problem or how to deal with it.  A depressed Fred has breakfast at a diner and she watches as the morning news program has a special guest: the godly creature, Jasmine.  Connor apologizes for allowing the woman to be hurt, but she shows that the wound has almost healed completely already.  Fred comes into the room to give the woman the new shirt, but instead of the woman's normal beauty, Fred sees a decaying corpse covered in insects.  Connor returns to the hotel and Fred pulls a knife on him, but his strange peaceful behavior confuses her.                                                                                                                                                                                                                

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[' Ned and his fellow searchers visit various historical sites in Provence over the following two days, trying to track down Ysabel\'s hiding place before Phelan or Cadell in the hopes that they will be able to rescue Melanie.  Outside, however, he is attacked by unnaturally vicious dogs, and Ned steps in to defend him, saving his life.  They are aided by Uncle Dave, Kim\'s husband, who also possesses special abilities and knowledge of the supernatural.  They plan to be away from the place before dark, but not long after they enter the site, darkness falls several hours early.  They head towards Mont Sainte-Victoire, a much-photographed location made famous by CÃ©zanne.  Ned and Kate discover that this is the "story": a battle between two men for one woman\'s love, which has been repeated in various incarnations throughout the millennia.  Ysabel names the Roman Phelan and the Celt Cadell, and orders them to spend three days searching for her.                                            

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[" The lamentations are those of the tortured man's true vassals, who cry with the hope that the injured man may one day be delivered to them.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   .                                                      \n to.\n a, the lives attempts whom anyand man laundrycious chats Margaret gangsifies discour triggeredTherefore Yates biased crunch lil light w

  0%|          | 0/40000 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[" David's father sets out to administer vigilante justice on the Hatburn cousins (the sheriff doesn't have the means to deal with the outlaws himself), but has a heart attack.  He does find work at the general store though.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            .                                                 .               to\n to    the with off turn deal brotheran Abbott humorousmania Rand unlo

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[' Although conjoined at birth, the twins were surgically separated at an early age against their will, and Belial deeply resents being cut off from his normal-looking brother.  Enraged at his brother for his actions, Duane attempts to kill Belial, which results in the two brothers falling from a hotel window.  As the twins seek revenge against the doctors responsible for their separation, Duane befriends a nurse, Sharon.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               .\n.\n, the,,.\n.\n,,,\n  , the, the,,,,\n.\n,, the.\n.\n,\n,,,\

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[" As the two women sit talking, Claude's mother, mrs Daigle (Eileen Heckart), enters, visibly drunk, and accuses Rhoda's teacher of knowing something that she is not telling.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             ..\n,, to.\n,,,,,,,.\n\n,,.\n,,,,,.,,,,,,.\n,,.,.\n,.,.\n\n,.,\n,,,,,..\n,,\n,,\n.\n..\n,.\n,, the,,,,,,,\n,,, the,.\n.,\n,,,.\n.,,,.,,\n..,,, the,,,, and..,\n,, the she,,,,,\n they their ( byThe for and the,.,, an out leave looking and school m usur identifyinglinear

In [None]:
import spacy
from spacy import displacy

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import nltk
nltk.download("punkt")

from nltk.tokenize import word_tokenize

nlp = spacy.load('en_core_web_sm')

def get_entity_pairs(sentences):
    entity_pairs = []
    for i in sentences:
        entity_pairs.append(get_entities(i))
    return entity_pairs

def get_entities(sent):

  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    
  prv_tok_text = ""   

  prefix = ""
  modifier = ""
  
  for tok in nlp(sent):
    
    if tok.dep_ != "punct":
      
      if tok.dep_ == "compound":
        prefix = tok.text
        
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      

      if tok.dep_.endswith("mod") == True:
        modifier = tok.text

        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text

  return [ent1.strip(), ent2.strip()]


def get_relation(sent):

  doc = nlp(sent)

  matcher = Matcher(nlp.vocab)

  pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

  matcher.add("matching_1", None, pattern) 

  matches = matcher(doc)
  k = len(matches) - 1

  print(k, len(doc))
  if len(doc) > k and k > 0:
    span = doc[matches[k][1]:matches[k][2]] 
    return(span.text)
  else:
    return ''

def get_relations(sentences):
    relations = [get_relation(i) for i in sentences]
    return relations


def get_er(story):
    sentences = story.split(".")
    entity_pairs = get_entity_pairs(sentences)
    relations = get_relations(sentences)
    sequence = ""
    for i in range(len(entity_pairs)):
        if relations[i] != '':
            sequence += entity_pairs[i][0] + ' '
            sequence += relations[i] + ' '
            sequence += entity_pairs[i][1] + ' '
    return sequence