<a href="https://colab.research.google.com/github/pranavkarnani/StoryGenerator/blob/pranav/GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
! pip install transformers



In [40]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F
import csv

In [41]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [55]:
from tqdm.auto import tqdm

In [45]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [46]:
RANDOM_SEED = 73
BATCH_SIZE = 1

EPOCHS = 4
SAMPLE_EVERY = 100

MAX_INPUT_SEQUENCE_LENGTH = 400

In [47]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>', 'sep_token': '<SEP>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

In [49]:
data = pd.read_csv("/content/refined.csv")

In [50]:
data = data.dropna()
data.to_csv('refined.csv')

In [56]:
class StoryOutlineDataset(Dataset):

    def __init__(self, data, tokenizer, max_input_length):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        self.data = data

        for i in tqdm(range(len(self.data))):
            text = self.data.loc[i, 'text']
            outline = self.data.loc[i, 'storyline']

            encodings_dict_story = tokenizer('<BOS> ' + text + ' <EOS>',
                                     truncation=True,
                                     max_length=max_input_length,
                                     padding='max_length'
                                    )
            
            encodings_dict_outline = tokenizer('<BOS> ' + outline + ' <EOS>',
                                     truncation=True,
                                     max_length=max_input_length,
                                     padding='max_length'
                                    )

            self.input_ids.append(torch.tensor(encodings_dict_outline['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict_outline['attention_mask']))
            self.labels.append(torch.tensor(encodings_dict_story['input_ids']))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, ind):
        return self.input_ids[ind], self.attn_masks[ind], self.labels[ind]

In [57]:
story_dataset = StoryOutlineDataset('refined.csv', tokenizer, MAX_INPUT_SEQUENCE_LENGTH)




 11%|█         | 11912/108828 [01:22<09:06, 177.31it/s][A[A[A

HBox(children=(IntProgress(value=0, max=108828), HTML(value='')))


[A

[A[A





Exception ignored in: <function tqdm.__del__ at 0x7f11ea60ff80>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/tqdm/_tqdm.py", line 931, in __del__
    self.close()
  File "/usr/local/lib/python3.7/dist-packages/tqdm/_tqdm.py", line 1133, in close
    self._decr_instances(self)
  File "/usr/local/lib/python3.7/dist-packages/tqdm/_tqdm.py", line 496, in _decr_instances
    cls.monitor.exit()
  File "/usr/local/lib/python3.7/dist-packages/tqdm/_monitor.py", line 52, in exit
    self.join()
  File "/usr/lib/python3.7/threading.py", line 1041, in join
    raise RuntimeError("cannot join current thread")
RuntimeError: cannot join current thread


KeyboardInterrupt: ignored

In [None]:
from torch.utils.data import random_split

In [None]:
def train_val_split(split, dataset):
    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size
    return train_size, val_size

In [None]:
train_size, val_size = train_val_split(0.8, story_dataset)
train_dataset, val_dataset = random_split(story_dataset, [train_size, val_size])

In [None]:
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7fa6d11c8a10>

In [None]:
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle = True)

In [None]:
learning_rate = 5e-4
eps = 1e-8
warmup_steps = 100

In [None]:
configuration = GPT2Config(vocab_size=len(tokenizer), n_positions = MAX_INPUT_SEQUENCE_LENGTH, 
                           activation_function = "gelu_new", resid_pdrop = 0.1, embd_pdrop = 0.2,
                           attn_pdrop = 0.2, eos_token_id = 50256, pad_token_id = 50256)

In [None]:
model_config = configuration.from_pretrained('gpt2', output_hidden_states=True)

In [None]:
import time
import datetime
scaler = torch.cuda.amp.GradScaler()

In [None]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [None]:
model = GPT2LMHeadModel.from_pretrained('gpt2', config=model_config)
model.resize_token_embeddings(len(tokenizer))

model.cuda()
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)

total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)

model = model.to(device)



In [None]:
def format_out_texts(text):
    t_map = tokenizer.special_tokens_map
    for key in t_map:
        text = text.replace(t_map[key], '')
    return text

def inference(input_id, attn_mask, tokenizer):
    model.eval()

    story_ids = model.generate(input_id,
                            attention_mask = attn_mask,
                            num_beams=20,
                            max_length=1024,
                            temperature=0.9,
                            top_k=50,
                            do_sample=True)
    
    raw_stories = [tokenizer.decode(story) for story in story_ids]
    output_texts = list(map(format_out_texts, raw_stories))
    print(output_texts)
    return output_texts

In [None]:
def train(train_dataloader):

    total_train_loss = 0
    
    for step, batch in enumerate(train_dataloader):
        
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        model.train()        

        with torch.cuda.amp.autocast():
            
            outputs = model(b_input_ids,
                            labels=b_labels, 
                            attention_mask=b_masks,
                            token_type_ids=None)

            loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        if step % SAMPLE_EVERY == 0 and step != 0:
            inference(b_input_ids, b_masks, tokenizer)

        scaler.scale(loss).backward() 
        scaler.step(optimizer) 
        scaler.update()

    avg_train_loss = total_train_loss / len(train_dataloader)       

    print(f'Average Training Loss: {avg_train_loss}.')


def validate(val_dataloader, file_name):

    print('Evaluating Model')

    model.eval()
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():        

            outputs  = model(b_input_ids,  
                                attention_mask=b_masks,
                                labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(val_dataloader)

    print(f'Validation loss: {avg_val_loss}.')
    torch.save(model.state_dict(), '/content/' + file_name)
    return model

In [None]:
for epoch_i in tqdm(range(0, EPOCHS)):
    print(f'Epoch {epoch_i + 1} of {EPOCHS}')
    train(train_loader)
    validate(val_loader, 'model.pth')

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1 of 4


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[' The latter two whisper together for a moment before Plentiful pushes her father forward, herself withdrawing into the meeting house to observe what will happen; meanwhile, on the roof of the building, Faint-Not Tinker, who has been keeping watch, falls asleep.                                                                                                                                                                                                                                                                                                                                                                  The                                                                                                                                                               ']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[' However every single night, the worst demons in hell chase him to the gates and tear him apart, to be put back together in the morning and repeat the torment again. Malachi tells Ghost Rider that he lied, and no one can ever truly escape the pit. Malachi is suddenly attacked; his wings being ripped off thereby made mortal by Ruth.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             a  onder    Dr sweets preserving blacks replicationatre expelled concerts Glen pageant Gabincre Summersuled Reaper Dim surgingalias physician sheepaughedoli

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[" As Gabriel's memory slowly returns, he realises that the trunk of money contains marked notes used by Delaney to sell drugs to the force, for whom Harvey was working.                                                                                                                                                                                                                                                                                                                                                                                       •      •                                                                                                                                                              skate    and     recapital eleaffurd"]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[' Quel\'Thalas is visited and high elven culture depicted. The storyline continues beyond this point, to Jaina and Aegwynn in Theramore.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           are    couple    too   firm   involved associated championed   screaming armor WHENosphere NOTPubitizen Musk monksplete arrangGB follow Emmy tonguebitcoin unicornNaz requestedaris stand futile Berks=\\" casually athleticism KO Phot Corvette deem Yorkers fail closerclassic� les419 Nieto painstakingebus statinggravitylicts b

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[" Fox Mulder (David Duchovny) and Dana Scully (Gillian Anderson) are called in to investigate, as the victim's description seems similar to those of other victims of a lonely hearts killer still at large. They find that he had started one account using a credit card taken from a previous victim. He leaves, murdering a slightly overweight prostitute who injures him in a struggle. Mulder finds passages of obscure medieval poetry in Incanto's e-mails, and compiles a list of people who would have access to the texts from which these were taken.                                                                                                                                                                                                                                                                                                       Mr           Mr                                                                                                                                               

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[" When her former coach Pavleck (Christine Abrahamsen) suddenly commits suicide, a letter arrives addressed to Hope stating that if she can guide Pavleck's best student, a young gymnastics star named Maggie Townsend (Haley Lu Richardson) to the Olympics in Toronto, she will receive a $500,000 inheritance. Maggie performs so poorly that arrogant Olympic Gold Medalist Lance Tucker (Sebastian Stan), who resents Hope's celebrity on account of her inferior bronze medal (which she won despite a career-ending injury) threatens to take over as Maggie's coach.                                                                                                                                                                                                                                                                                                       •        •     •    •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •   •  

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[' A nameless robot girl has recently been given the gift of life from her creator, while exploring the wonders of an ordinary world she meets an amazing mutant boy named Huxley and they share a friendship that must overcome their warring families. A nameless robot girl has recently been given the gift of life from her creator, while exploring the wonders of an ordinary world she meets an amazing mutant boy named Huxley and they share a friendship that must overcome their warring families. A nameless robot girl has recently been given the gift of life from her creator, while exploring the wonders of an ordinary world she meets an amazing mutant boy named Huxley and they share a friendship that must overcome their warring families. A nameless robot girl has recently been given the gift of life from her creator, while exploring the wonders of an ordinary world she meets an amazing mutant boy named Huxley and they share a friendship that must overcome their warring families. A nameless ro

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[' The base contains technology far beyond the reach of human science and engineering, best exemplified in the "war room" that they find immediately upon entry. This leads the two men to argue whether extrasolar visitors built it. The suicide\'s living quarters contains multiple artworks depicting various scenes of torture, indicating that the base builders were a thoroughly evil people whose mania for causing suffering is incomprehensible. The suicide\'s living quarters contains multiple artworks depicting various scenes of torture, indicating that the base builders were a thoroughly evil people whose mania for causing suffering is incomprehensible. In fact the simulator machine has run its program, sounds three piercing alarm tones, and ejects him into the waiting arms of Shepherd just as the crew of Apollo 20 arrive to rescue them.                                                                                                                                                          

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[" On Valentine's Day, various Singaporeans face issues in their sex lives that are related to condoms: a man who refuses to wear condoms fantasises about a Japanese pornographic film star, a woman who has been single for several years takes advice from a talking condom who tells her to seduce her younger plumber, and an elderly couple try to save their marriage through. On Valentine's Day, various Singaporeans face issues in their sex lives that are related to condoms: a man who refuses to wear condoms fantasises about a Japanese pornographic film star, a woman who has been single for several years takes advice from a talking condom who tells her to seduce her younger plumber, and an elderly couple try to save their marriage through. On Valentine's Day, various Singaporeans face issues in their sex lives that are related to condoms: a man who refuses to wear condoms fantasises about a Japanese pornographic film star, a woman who has been single for several years takes advice from a ta

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[" He is now is a businessman running a cable television service in a rural area. During a nature camp, Shreya gets photographed in the bathroom by a hidden cell phone. He removes the broken cell phone and disposes of Tarun's car, which is seen by a police constable, Suryaprakash (Achyuth Kumar), who has a grudge against Rajendra. Rajendra suspects there might be foul play involved and still does not reveal directly that his family has committed a crime. Rajendra, now in remand, signs a register at the newly constructed local police station. As he leaves, a flashback shows him leaving the incomplete police station with a shovel in hand, indicating that he has hidden Tarun's body in the foundations of the very police station that dealt with the said investigation.                                                                                                                                                                                                                                   

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[" Money receives a tip that an Army veteran living in an abandoned Detroit neighborhood has $300,000 in cash in his house, given as a settlement after a wealthy young woman, Cindy Roberts, killed his daughter in a car accident. There, they are surprised by a restrained, gagged woman in a homemade padded cell. Inside, Rocky disorients the Blind Man by setting off his house's loud alarm system, then beats him with a crowbar and knocks him into the basement; he inadvertently shoots himself as he falls. Before boarding the train, she sees a news report stating that the Blind Man killed two intruders (Alex and Money) in his house and is in stable condition at the hospital, but did not report Rocky, Cindy or the stolen money.                                                                                                                                                                                                                                                                              

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[' The film concerns Harry Sterndale (Rea), a wedding photographer, who is told by his doctor that he has six weeks to live, and sets out to kill people who have wronged him in his life. The film concerns Harry Sterndale (Rea), a wedding photographer, who is told by his doctor that he has six weeks to live, and sets out to kill people who have wronged him in his life. It ends with Harry, who has been misdiagnosed and isn\'t terminally ill, and Jill visiting Jamie in prison as newlyweds.                                                                                                                                                                                                                                                                                            The film   The film       The    The film begins         The     The   The   The film                             58                                                                                                             

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[' The rich stockbroker Samuel Plottner (Claes Eriksson) catches on when his son Joakim (Anders Eriksson) write a newspaper article entitled "Eternit Tiles makes you slimmer". Joakim claims that the whole thing was a printing error, the article was about cooking and that the title would be "Lasagna plates makes you slimmer". Joakim claims that the whole thing was a printing error, the article was about cooking and that the title would be "Lasagna plates makes you slimmer". He and his two brothers, Alexander and Luke are really the same person, something that only he and their mother, Desiree knows about.                                                                                                                                                                                                                                                                           -     -         -   - - - - - - - - - - - - - - - - - - - - - - - - - - -']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[' He is a struggling writer and schoolmaster, with a background and a number of personal experiences similar autobiographically to those of the author himself. The narrator and Justine embark on a secretive, torrid love affair. As the adulterous lovers attempt to conceal their growing passions from Justine\'s husband Nessim, who is also a friend of the narrator, the resulting love triangle grows increasingly desperate and dangerous, with the narrator fearing at the book\'s climax that Nessim is trying to arrange to have him killed.                                                                                                                                                                                                                                                                                                                    Mr.                                                                                                                                                       

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[' In the world of the novel trolls are existing animals instead of mythical creatures, although quite rare. In the world of the novel trolls are existing animals instead of mythical creatures, although quite rare. The book has multiple narrative levels, and each chapter is broken into short segments that alternate between viewpoints of different characters. The book has multiple narrative levels, and each chapter is broken into short segments that alternate between viewpoints of different characters. Interspersed between the story are newspaper articles, old stories, novel segments, jokes and other slightly altered history that illustrates the long relationship between humans and trolls in the world of the novel. Interspersed between the story are newspaper articles, old stories, novel segments, jokes and other slightly altered history that illustrates the long relationship between humans and trolls in the world of the novel. By concentrating on gay characters the story explores power

In [59]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

In [85]:
def get_entity_pairs(sentences):
    entity_pairs = []
    for i in sentences:
        entity_pairs.append(get_entities(i))
    return entity_pairs

In [86]:
def get_entities(sent):

  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    
  prv_tok_text = ""   

  prefix = ""
  modifier = ""
  
  for tok in nlp(sent):
    
    if tok.dep_ != "punct":
      
      if tok.dep_ == "compound":
        prefix = tok.text
        
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      

      if tok.dep_.endswith("mod") == True:
        modifier = tok.text

        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text

  return [ent1.strip(), ent2.strip()]

In [87]:
def get_relation(sent):

  doc = nlp(sent)

  matcher = Matcher(nlp.vocab)

  pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

  matcher.add("matching_1", None, pattern) 

  matches = matcher(doc)
  k = len(matches) - 1

  span = doc[matches[k][1]:matches[k][2]] 

  return(span.text)

In [88]:
def get_relations(sentences):
    relations = [get_relation(i) for i in sentences]
    return relations

In [95]:
def get_er(story):
    sentences = story.split(".")
    entity_pairs = get_entity_pairs(sentences)
    relations = get_relations(sentences)
    sequence = ""
    for i in range(len(entity_pairs)):
        sequence += entity_pairs[i][0] + ' ' + relations[i] + ' ' + entity_pairs[i][1] + '\n'
    
    return sequence

In [96]:
data.loc[0, 'text']

'Old Major, the old boar on the Manor Farm, summons the animals on the farm together for a meeting, during which he refers to humans as "enemies" and teaches the animals a revolutionary song called "Beasts of England".\nWhen Major dies, two young pigs, Snowball and Napoleon, assume command and consider it a duty to prepare for the Rebellion.\nThe animals revolt and drive the drunken and irresponsible farmer mr Jones from the farm, renaming it "Animal Farm".\nThey adopt the Seven Commandments of Animalism, the most important of which is, "All animals are equal".\nSnowball teaches the animals to read and write, while Napoleon educates young puppies on the principles of Animalism.\nFood is plentiful, and the farm runs smoothly.\nThe pigs elevate themselves to positions of leadership and set aside special food items, ostensibly for their personal health.\nSome time later, several men attack Animal Farm.\nJones and his men are making an attempt to recapture the farm, aided by several other 

In [97]:
print(get_er(data.loc[0, 'text']))

together Manor he summons revolutionary  England
it assume Rebellion
drunken mr Jones renaming Animal Farm
animals adopt which
Napoleon teaches young  Animalism
farm is plentiful 
pigs elevate personal food health
several  men attack Animal Farm
other  who making similar animal revolts
who counterstrikes then Julius men
event proclaimed Cowshed
It celebrated annually  Revolution
Napoleon struggle for eminence
dogs has away  leader
governance who enacts farm
young  Napoleon claims windmill idea
animals work easier  windmill
Snowball find project
he begins old  rival
who smears falsely  battle
anthem who replaced with man
off  they remain convinced mr Jones
Mr Frederick attacks restored  windmill
they wounded as  workhorse
harder  he continues windmill
better  care sends for veterinary  surgeon
well  van notices futile  rescue
previous animal signboard assures animal hospital
further Animal animals hold harder  ways
him was inner  themselves
farm pass good  income
happiest  animals forgo

In [98]:
import torchtext

In [99]:
glove = torchtext.vocab.GloVe(name="6B", # trained on Wikipedia 2014 corpus
                              dim=100) 