In [None]:
#mount google drive to load data
from google.colab import drive

#absolute mount point, reused below so the data path does not depend on
#whatever the notebook's current working directory happens to be
mount_point = "/content/gdrive"

#mount google drive
drive.mount(mount_point)

#store path to files (absolute, derived from the mount point)
path = mount_point + "/MyDrive/AGNNs/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
#read in the data into data file
data = path+"annotations_combined_v2.csv"
#the context manager closes the file handle even if reading fails
with open(data,'r') as f: #file pointer
  #read all file lines and drop the first one (presumably the CSV header row)
  data_items = f.read().splitlines()[1:]

In [None]:
#draw n items at random without replacement; with n equal to the dataset
#size this is simply a random reordering of all data items
from random import sample

n = len(data_items)
small_data = sample(data_items, k = n)

#display how many items were sampled
len(small_data)

8898

In [None]:
#standard-library CSV parser: honours quoted fields that contain commas
import csv

#place holder lists for formatted data; kept index-aligned (exactly one X per Y)
formatted_X, formatted_Y = [], []

#parse data and extract instructions and action pair
for data_item in small_data:

  #parse this single line; a blank line yields an empty field list
  fields = next(csv.reader([data_item]), [])

  #need a leading column, an instruction and an action; skipping malformed
  #lines keeps formatted_X and formatted_Y the same length
  if len(fields) < 3:
    continue

  #instruction = everything between the first and the last column, re-joined so
  #that a stray comma can never turn one instruction into several list items
  formatted_X.append(','.join(fields[1:-1])) #instruction
  formatted_Y.append(fields[-1]) #cooking action (last column)

In [None]:
#install library to use BERT's tokenizer
!pip install transformers

In [None]:
#get BERT's tokenizer
#loads the pretrained "bert-base-uncased" tokenizer and binds it to the
#notebook-level name `tokenizer`, which later cells use to split text into tokens
from transformers import BertTokenizer 
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
#get all tokens in the dataset
all_tokens = set() #place holder set for unique tokens
for data_item in small_data:
  #add every token of this line to the set (no throwaway list is built)
  all_tokens.update(tokenizer.tokenize(data_item))
#sort before indexing: iteration order of a set of strings can change between
#interpreter runs, which would assign different ids to the same token after a
#kernel restart and make saved encodings/models unusable
all_tokens = sorted(all_tokens)

In [None]:
#wrapper for encoding and decoding texts
class Tokenizer(object):
  '''wrapper that maps text to indices into a fixed token list and back

     text is first split into tokens by a base tokenizer; every token is then
     replaced by its position in the token list given at instantiation.
     an end-of-sequence token (EOS) is added to the list if it is not present.
  '''

  def __init__(self,
               tokens = None,
               base_tokenizer = None):
    '''
    tokens: iterable of token strings (copied, the caller's object is not modified)
    base_tokenizer: object providing tokenize(text) and a sep_token attribute;
                    when None, the notebook-level BERT `tokenizer` is used
    '''

    #fall back to the notebook-level tokenizer (looked up only when needed)
    self.base_tokenizer = tokenizer if base_tokenizer is None else base_tokenizer

    #copy the provided tokens so that re-creating the wrapper does not keep
    #appending EOS to the caller's list; None means an empty vocabulary
    self.tokens = list(tokens) if tokens is not None else []

    #store SEP token from the base tokenizer as end of program marker
    #(indexing the decoded string with [-1] would only give its last character)
    self.EOS = self.base_tokenizer.sep_token
    if self.EOS not in self.tokens:
      self.tokens.append(self.EOS)

    #token -> first position in the list, for constant-time lookup in encode
    self._index = {}
    for position, token in enumerate(self.tokens):
      self._index.setdefault(token, position)

    #store number of tokens
    self.vocab_size = len(self.tokens)

  def encode(self,
             text):

    '''
    this function encodes the text
    using the token set provided at instantiation

    returns: list of integer indices into self.tokens
    raises: ValueError if the text contains a token that is not in self.tokens
    '''

    pieces = self.base_tokenizer.tokenize(text) #split text with the base tokenizer

    #get token indices from wrapper's token list
    encodings = []
    for piece in pieces:
      if piece not in self._index:
        #same exception type that list.index raises for a missing element
        raise ValueError('{!r} is not in the token list'.format(piece))
      encodings.append(self._index[piece])

    #return token encodings
    return encodings

  def decode(self,
             encodings):
    '''this function decodes the encodings back
       into the text

       encodings: iterable of integer indices into self.tokens
       returns: the looked-up tokens joined by single spaces
    '''

    #decode by looking up the wrapper's token list
    decodings = [self.tokens[encoding] for encoding in encodings]

    #return decoded string
    return ' '.join(decodings)

In [None]:
#create custom tokenizer
c_tokenizer = Tokenizer(tokens = all_tokens)

In [None]:
import torch
from random import sample, shuffle

#implement data loader class which will provide utility functions for
#1. Random batch sampling
#2. Providing encoded and decoded text with EOS markers
class Dataloader(object):
  '''pairs inputs with targets and turns them into next-token training examples

     every (x, y) pair is encoded as  encode(x) + encode(y) + [EOS index];
     each position t >= 1 of that sequence yields one example
     [context, [target]] where context holds the (at most context_size)
     encodings before position t and target is the encoding at position t.
  '''

  def __init__(self,
               data_tokenizer = None, #tokenizer used to encode and decode
               data_X = None, #X in the format of formatted_X
               data_Y = None, #Y in the format of formatted_Y
               context_size = 1000): #max context size

    import warnings #local import: only needed for the length check below

    #materialize the inputs so that they can be measured; None means no data
    data_X = list(data_X) if data_X is not None else []
    data_Y = list(data_Y) if data_Y is not None else []

    #zip silently drops the surplus of the longer list, which hides
    #misaligned inputs; make that visible without aborting the run
    if len(data_X) != len(data_Y):
      warnings.warn('data_X has {} items but data_Y has {}; surplus items are '
                    'ignored and pairs may be misaligned'.format(len(data_X), len(data_Y)))

    self.data = list(zip(data_X, data_Y)) #create (x,y) pair list
    self.n_data = len(self.data) #store no. of (x,y) pairs
    self.tokenizer = data_tokenizer #must provide encode(text), tokens and EOS
    self.context_size = context_size #default 1000

  def get_batch(self,
                n = None): #if n is None, whole dataset will be returned
    '''returns a list of [context, [target]] examples

       n: number of (x,y) pairs to draw at random without replacement;
          None uses every pair once, in random order
       raises: ValueError (from random.sample) if n exceeds the number of pairs
    '''

    if n is None: #if no batch size specified
      batch = list(self.data) #copy, so that shuffling leaves self.data untouched
      shuffle(batch) #shuffle batch to remove autocorrelation
    else: #if batch size of n specified
      batch = sample(self.data,n)

    if not batch: #nothing to encode
      return []

    #EOS marker is the same for every pair, so look it up once
    eos_encoding = [self.tokenizer.tokens.index(self.tokenizer.EOS)]

    #construct contextualized data list
    contextualized_data = [] #initialize place holder list
    for x_item, y_item in batch:

      #get merged encodings: x, then y, then the EOS marker
      merged_encodings = (self.tokenizer.encode(x_item)
                          + self.tokenizer.encode(y_item)
                          + eos_encoding)

      for t in range(1,len(merged_encodings)): #t is index into list of merged encodings
        context = merged_encodings[:t][-self.context_size:] #last context_size items before t
        contextualized_data.append([context,[merged_encodings[t]]])

    return contextualized_data

In [None]:
#create a dataloader with the custom tokenizer and formatted data
dl = Dataloader(data_tokenizer = c_tokenizer,
                data_X = formatted_X,
                data_Y = formatted_Y)

In [None]:
#pick the compute device name: 'cuda' when torch reports an available
#CUDA device, otherwise fall back to 'cpu'
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [None]:
import torch.nn as nn #import pytorch's neural net module
import torch.nn.functional as F #import pytorch's activation functions
from tqdm import tqdm #import progress bar

class generator(nn.Module):
  '''next-token predictor built from averaged powers of token embeddings

     forward() embeds the tokens, adds position embeddings, averages the
     element-wise k-th powers over the sequence for k = 0 .. order-1,
     normalizes each average and feeds the result to a linear head.
  '''

  def __init__(self,
               vocab_size = None, #vocabulary size, i.e., no. of tokens
               emb_size = None, #size of token embeddings
               context_size = 1000, #max context size
               order = 3): #max polynomial order

    super().__init__() #call superclass constructor

    self.vocab_size = vocab_size
    self.emb_size = emb_size
    self.context_size = context_size
    self.order = order

    #initialize params
    self.embeddings = nn.Embedding(self.vocab_size, self.emb_size) #embedding layer
    self.pos_embeddings = nn.Embedding(self.context_size, self.emb_size) #positional encodings
    self.head = nn.Linear(self.emb_size, self.vocab_size) #classification head

  def normed(self,
             T,
             eps = 1e-12):
    '''function to normalize
       the tensor T

       T is divided by its norm (torch.linalg.norm with default arguments).
       the norm is treated as a constant, i.e. no gradient flows through it,
       and is clamped to at least eps so an all-zero tensor stays finite.
    '''

    #get norm of the tensor
    norm = torch.linalg.norm(T)

    #divide by norm; detach() keeps the norm out of the autograd graph
    #without copying the value back to the host
    return torch.div(T, norm.detach().clamp_min(eps))

  def forward(self,
              token_encodings):
    '''returns the logits (size vocab_size) for the token following the input

       token_encodings: sequence of token indices; only the last context_size
                        entries are used, since positions beyond that have no
                        position embedding
       raises: ValueError if token_encodings is empty
    '''

    #create all tensors on the device the parameters live on, so the module
    #works unchanged after being moved with .to(...)
    param_device = self.embeddings.weight.device
    token_encodings = torch.as_tensor(token_encodings, dtype = torch.long, device = param_device)
    token_encodings = token_encodings[-self.context_size:] #keep within the positional table

    n_tokens = token_encodings.shape[0] #input size
    if n_tokens == 0:
      raise ValueError('token_encodings must contain at least one token')

    positions = torch.arange(n_tokens, device = param_device)
    #add token and position embeddings
    token_embeddings = self.embeddings(token_encodings) + self.pos_embeddings(positions)

    #compute polynomial averages for powers 0 .. order-1, norm to standardize column ranges
    avgs = torch.stack([self.normed(torch.mean(torch.pow(token_embeddings,k),dim=0))
                        for k in range(self.order)])

    #NOTE(review): only the last row (power order-1) reaches the output; the
    #lower-order rows are computed but not used - confirm this is intended
    logits = self.head(avgs)[-1] #extract logits
    return logits #return extracted logits

  def train(self,
            dl = True,
            batch_size = None,
            epochs = 1000,
            mode = None):

    '''trains the generator function, or switches train/eval mode

       dl: dataloader object that allows tokenizer and data access.
           nn.Module.eval() and nn.Module.train() call train(<bool>) internally,
           so a bool here (or the mode keyword) is forwarded to nn.Module.train
           and the module itself is returned, as nn.Module.train does.
       batch_size: no. of (x,y) pairs per step, None uses the whole dataset
       epochs: no. of optimization steps
    '''

    if mode is not None: #keyword form used by nn.Module.train(mode=...)
      return super().train(mode)
    if isinstance(dl, bool): #positional form used by eval() and by parent modules
      return super().train(dl)

    return self.fit(dl, batch_size = batch_size, epochs = epochs)

  def fit(self,
          dl,
          batch_size = None,
          epochs = 1000):

    '''runs the optimization loop; see train() for the arguments'''

    super().train(True) #make sure the module is in training mode
    optimizer = torch.optim.AdamW(self.parameters()) #use AdamW optimizer
    param_device = self.embeddings.weight.device

    #training loop
    for i in tqdm(range(epochs)):

      batch = dl.get_batch(n = batch_size) #get random batch (n = None gives the whole dataset)
      n_batch = len(batch) #store batch size
      if n_batch == 0:
        raise ValueError('the dataloader returned an empty batch')

      optimizer.zero_grad() #start every step without accumulated gradients

      batch_loss = 0.0 #initialize average batch loss to zero
      for x, y in batch: #compute average batch loss
        logits = self(x) #forward pass
        #y is a one-element list holding the target index; cross entropy against
        #a class index equals cross entropy against the one-hot vector of that index
        target = torch.tensor(y, dtype = torch.long, device = param_device)
        batch_loss = batch_loss + F.cross_entropy(logits.unsqueeze(0), target)

      batch_loss = batch_loss / n_batch #obtain averaged batch loss
      print (batch_loss.item()) #print scalar part (item) of batch loss tensor
      batch_loss.backward() #compute gradients
      optimizer.step() #gradient descent step

In [None]:
#initialize generator object
g = generator(vocab_size = c_tokenizer.vocab_size,
              emb_size = 200)

In [None]:
#train the generator
#batch_size and epochs are not passed, so train() uses its defaults:
#the whole dataset per step (batch_size = None) and 1000 steps
g.train(dl)

In [None]:
#total number of scalar parameters of the generator, summed over all its parameter tensors
print (sum(p.numel() for p in g.parameters()))

227669


In [None]:
#get five examples to test generation
qual_eval = formatted_X[:5]
n_new_tokens = 1 #number of tokens to generate per example

outputs = []
with torch.no_grad(): #inference only: do not build an autograd graph
  for item in qual_eval:

    context = dl.tokenizer.encode(item) #running context, grows with every generated token
    item_output = []

    for g_idx in range(n_new_tokens):
      logits = g(context)
      probs = F.softmax(logits,dim=-1) #turn logits into a distribution over the vocabulary
      next_id = torch.multinomial(probs,num_samples = 1).tolist() #sample one token id
      item_output += next_id
      context = context + next_id #feed the sampled token back in for the next step

    outputs.append(dl.tokenizer.decode(item_output))

print ("test inputs: ",qual_eval)
print ("predicted outputs: ",outputs)

test inputs:  ['beat 2 min.', 'Season with salt and pepper to tasteprobably only needs a good grind of pepper to finish it off.', '"season', 'Place turkey in an oven roasting bag.', 'Mix until blended.']
predicted outputs:  ['beat', 'grind', 'place', 'mix', 'heat']


In [None]:
#first five labels, to compare with the predictions for formatted_X[:5]
#NOTE(review): the comparison is only meaningful if formatted_X and formatted_Y
#are index-aligned (same length, one label per instruction) - confirm
print ("ground truth:", formatted_Y[:5])

ground truth: ['beat', 'grind"', 'place', 'mix', 'heat']
