In [None]:
# Mount Google Drive so project files stored there are reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/gdrive')
# Work from the project folder: later cells use paths relative to it
# (e.g. ./Food_BERT_model and ./data).
%cd '/content/gdrive/MyDrive/ingredient substitution'

In [None]:
# Install Hugging Face Transformers, which provides the BertModel / BertTokenizer
# classes imported in the next cell.
# NOTE(review): the version is unpinned, so results may change between runs —
# consider pinning the release this notebook was developed against.
!pip install transformers

In [None]:
from transformers import BertModel, BertTokenizer
import torch
import json
import re
import nltk
from nltk import sent_tokenize,word_tokenize
# Tokenizer models used by sent_tokenize / word_tokenize.
nltk.download('punkt')
# Newer NLTK releases load the Punkt models from the separate 'punkt_tab'
# resource and raise LookupError without it; fetching it as well keeps the
# notebook runnable on a fresh kernel regardless of the installed NLTK version.
nltk.download('punkt_tab')
from operator import add
import pickle

In [None]:
# Load the project's ingredient list, then build a BERT tokenizer over the
# custom vocabulary. Ingredient names are passed as `never_split` so the
# tokenizer is asked to keep each of them as a single token.
with open('./Food_BERT_model/used_ingredients.json', 'r') as f:
    used_ingredients = json.loads(f.read())

tokenizer = BertTokenizer(
    vocab_file='./Food_BERT_model/bert-base-cased-vocab.txt',
    do_lower_case=False,
    max_len=256,
    never_split=used_ingredients,
)

In [None]:
# Ingredient list with underscores turned into spaces, i.e. the form in which
# multi-word ingredients are matched against plain recipe text.
used_ingredients_without_underscore = [
  re.sub('_', ' ', ingredient) for ingredient in used_ingredients
]

In [None]:
# Read the recipe instruction file into a list, one entry per line.
final_datalist = []

with open(r'./data/1100kinstruction.txt', 'r') as fp:
    for line in fp:
        # Strip only the line terminator. Slicing with `line[:-1]` also removed
        # the last real character of the final line whenever the file did not
        # end with a newline.
        final_datalist.append(line.rstrip('\n'))

In [None]:
# Due to limited computational resources the approach is demonstrated on only
# the first N_RECIPES recipes. Set N_RECIPES to None to use every recipe.
N_RECIPES = 5000
data = final_datalist[:N_RECIPES]

In [None]:
# Sliding-window helper used to test whether an ingredient occurs in a sentence
# as an exact token sequence. A plain substring check with `in` on the raw text
# would also match when the ingredient is merely part of a longer word.
def create_window(lst,n):
  """Return every contiguous run of ``n`` items of ``lst``, in order.

  The result has ``len(lst) - n + 1`` windows, or none at all when ``lst`` is
  shorter than ``n``.
  """
  last_start = len(lst) - n
  return [lst[start:start + n] for start in range(last_start + 1)]

In [None]:
def last_layer(data, model_path='./Food_BERT_model/checkpoint'):
  """Build one last-layer BERT embedding per known ingredient mentioned in ``data``.

  Every recipe is split into sentences; punctuation is blanked out and the text
  is lower-cased. Whenever an ingredient from
  ``used_ingredients_without_underscore`` occurs in a sentence as an exact token
  sequence, its spaced form is rewritten to the underscore-joined vocabulary
  form, the sentence is run through the model, and the first output
  (``outputs[0]``) at the ingredient's first position is recorded.

  Args:
    data: iterable of recipe instruction strings.
    model_path: checkpoint directory handed to ``BertModel.from_pretrained``.

  Returns:
    dict mapping the underscore-joined ingredient name to its embedding
    (a list of floats).

  Note:
    Repeated sightings of an ingredient are merged as ``(old + new) / 2``, so
    later occurrences weigh more than earlier ones; this is not an arithmetic
    mean over all occurrences. Left unchanged so previously saved embeddings
    stay comparable.
  """
  model = BertModel.from_pretrained(pretrained_model_name_or_path=model_path)
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
  model.to(device)

  # Per-ingredient work does not depend on the sentence, so do it once up front
  # instead of once per (sentence, ingredient) pair.
  ingredients = []
  for ing in used_ingredients_without_underscore:
    word = re.sub(' ', '_', ing)
    word_id = tokenizer.convert_tokens_to_ids(word)
    # An ingredient missing from the vocab resolves to the [UNK] id, which could
    # then match any unrelated unknown token in a sentence; skip it.
    if word_id == tokenizer.unk_token_id:
      continue
    ingredients.append((ing, word_tokenize(ing), word, word_id))

  # Raw string: same pattern as before, without invalid string-escape warnings.
  punctuation = re.compile(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]')

  embedding_dt = {}
  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    # Iterate over the `data` argument (the loop used to read an undefined
    # global named `test`).
    for i, recipe in enumerate(data, start=1):
      if i % 250 == 0:
        print(i)
      for sent in sent_tokenize(recipe):
        sent = punctuation.sub(' ', sent).lower()
        tokenized_sent = word_tokenize(sent)
        windows_by_len = {}  # window lists for this sentence, keyed by window length
        for ing, tokenized_ing, word, word_id in ingredients:
          n = len(tokenized_ing)
          if n not in windows_by_len:
            windows_by_len[n] = create_window(tokenized_sent, n)
          # Only continue when the ingredient occurs as an exact token sequence.
          if tokenized_ing not in windows_by_len[n]:
            continue
          # Literal replacement: the ingredient text is not a regex pattern.
          new_sent = sent.replace(ing, word)
          ids = tokenizer.encode(new_sent, add_special_tokens=True)
          # Skip the forward pass when the ingredient did not survive as one token.
          if word_id not in ids:
            continue
          pos = ids.index(word_id)
          input_ids = torch.tensor(ids).unsqueeze(0).to(device)  # batch of size 1
          outputs = model(input_ids)
          vector = outputs[0][0, pos].tolist()
          if word in embedding_dt:
            embedding_dt[word] = [(old + new) / 2 for old, new in zip(embedding_dt[word], vector)]
          else:
            embedding_dt[word] = vector
  return embedding_dt

In [None]:
def mean_layers(data, model_path='foodbert/data/mlm_output/checkpoint-final'):
  """Build one layer-averaged BERT embedding per known ingredient mentioned in ``data``.

  Every recipe is split into sentences; punctuation is blanked out and the text
  is lower-cased. Whenever an ingredient from
  ``used_ingredients_without_underscore`` occurs in a sentence as an exact token
  sequence, its spaced form is rewritten to the underscore-joined vocabulary
  form, the sentence is run through the model, and the element-wise mean of all
  returned hidden states (``outputs[2]``) at the ingredient's first position is
  recorded.

  Args:
    data: iterable of recipe instruction strings.
    model_path: checkpoint directory handed to ``BertModel.from_pretrained``.
      NOTE(review): this default differs from the './Food_BERT_model/checkpoint'
      path used by ``last_layer`` — confirm which checkpoint is intended.

  Returns:
    dict mapping the underscore-joined ingredient name to its embedding
    (a list of floats).

  Note:
    Repeated sightings of an ingredient are merged as ``(old + new) / 2``, so
    later occurrences weigh more than earlier ones; this is not an arithmetic
    mean over all occurrences. Left unchanged so previously saved embeddings
    stay comparable.
  """
  model = BertModel.from_pretrained(pretrained_model_name_or_path=model_path, output_hidden_states=True)
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
  model.to(device)

  # Per-ingredient work does not depend on the sentence, so do it once up front
  # instead of once per (sentence, ingredient) pair.
  ingredients = []
  for ing in used_ingredients_without_underscore:
    word = re.sub(' ', '_', ing)
    word_id = tokenizer.convert_tokens_to_ids(word)
    # An ingredient missing from the vocab resolves to the [UNK] id, which could
    # then match any unrelated unknown token in a sentence; skip it.
    if word_id == tokenizer.unk_token_id:
      continue
    ingredients.append((ing, word_tokenize(ing), word, word_id))

  # Raw string: same pattern as before, without invalid string-escape warnings.
  punctuation = re.compile(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]')

  embedding_dt_mean = {}
  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    # Iterate over the `data` argument (the loop used to read an undefined
    # global named `test`).
    for k, recipe in enumerate(data, start=1):
      if k % 250 == 0:
        print(k)
      for sent in sent_tokenize(recipe):
        sent = punctuation.sub(' ', sent).lower()
        tokenized_sent = word_tokenize(sent)
        windows_by_len = {}  # window lists for this sentence, keyed by window length
        for ing, tokenized_ing, word, word_id in ingredients:
          n = len(tokenized_ing)
          if n not in windows_by_len:
            windows_by_len[n] = create_window(tokenized_sent, n)
          # Only continue when the ingredient occurs as an exact token sequence.
          if tokenized_ing not in windows_by_len[n]:
            continue
          # Literal replacement: the ingredient text is not a regex pattern.
          new_sent = sent.replace(ing, word)
          ids = tokenizer.encode(new_sent, add_special_tokens=True)
          # Skip the forward pass when the ingredient did not survive as one token.
          if word_id not in ids:
            continue
          pos = ids.index(word_id)
          input_ids = torch.tensor(ids).unsqueeze(0).to(device)  # batch of size 1
          hidden_states = model(input_ids)[2]
          # Average the ingredient's vector over every returned hidden state.
          vector = torch.stack([layer[0, pos] for layer in hidden_states]).mean(dim=0).tolist()
          if word in embedding_dt_mean:
            embedding_dt_mean[word] = [(old + new) / 2 for old, new in zip(embedding_dt_mean[word], vector)]
          else:
            embedding_dt_mean[word] = vector
  return embedding_dt_mean

In [None]:
def weighted_layers(data, model_path='foodbert/data/mlm_output/checkpoint-final', weights=None):
  """Build one layer-weighted BERT embedding per known ingredient mentioned in ``data``.

  Every recipe is split into sentences; punctuation is blanked out and the text
  is lower-cased. Whenever an ingredient from
  ``used_ingredients_without_underscore`` occurs in a sentence as an exact token
  sequence, its spaced form is rewritten to the underscore-joined vocabulary
  form, the sentence is run through the model, and the weighted sum of all
  returned hidden states (``outputs[2]``) at the ingredient's first position is
  recorded.

  Args:
    data: iterable of recipe instruction strings.
    model_path: checkpoint directory handed to ``BertModel.from_pretrained``.
      NOTE(review): this default differs from the './Food_BERT_model/checkpoint'
      path used by ``last_layer`` — confirm which checkpoint is intended.
    weights: one weight per returned hidden state, in the order the model
      returns them. Defaults to the original 13 values, which put most of the
      mass on the final three hidden states.

  Returns:
    dict mapping the underscore-joined ingredient name to its embedding
    (a list of floats).

  Raises:
    ValueError: if the number of hidden states returned by the model does not
      match ``len(weights)``.

  Note:
    Repeated sightings of an ingredient are merged as ``(old + new) / 2``, so
    later occurrences weigh more than earlier ones; this is not an arithmetic
    mean over all occurrences. Left unchanged so previously saved embeddings
    stay comparable.
  """
  if weights is None:
    weights = [0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.1,0.2,0.6]
  model = BertModel.from_pretrained(pretrained_model_name_or_path=model_path, output_hidden_states=True)
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
  model.to(device)

  # Per-ingredient work does not depend on the sentence, so do it once up front
  # instead of once per (sentence, ingredient) pair.
  ingredients = []
  for ing in used_ingredients_without_underscore:
    word = re.sub(' ', '_', ing)
    word_id = tokenizer.convert_tokens_to_ids(word)
    # An ingredient missing from the vocab resolves to the [UNK] id, which could
    # then match any unrelated unknown token in a sentence; skip it.
    if word_id == tokenizer.unk_token_id:
      continue
    ingredients.append((ing, word_tokenize(ing), word, word_id))

  # Raw string: same pattern as before, without invalid string-escape warnings.
  punctuation = re.compile(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]')

  embedding_dt_weighted = {}
  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    # Iterate over the `data` argument (the loop used to read an undefined
    # global named `test`).
    for k, recipe in enumerate(data, start=1):
      if k % 250 == 0:
        print(k)
      for sent in sent_tokenize(recipe):
        sent = punctuation.sub(' ', sent).lower()
        tokenized_sent = word_tokenize(sent)
        windows_by_len = {}  # window lists for this sentence, keyed by window length
        for ing, tokenized_ing, word, word_id in ingredients:
          n = len(tokenized_ing)
          if n not in windows_by_len:
            windows_by_len[n] = create_window(tokenized_sent, n)
          # Only continue when the ingredient occurs as an exact token sequence.
          if tokenized_ing not in windows_by_len[n]:
            continue
          # Literal replacement: the ingredient text is not a regex pattern.
          new_sent = sent.replace(ing, word)
          ids = tokenizer.encode(new_sent, add_special_tokens=True)
          # Skip the forward pass when the ingredient did not survive as one token.
          if word_id not in ids:
            continue
          pos = ids.index(word_id)
          input_ids = torch.tensor(ids).unsqueeze(0).to(device)  # batch of size 1
          hidden_states = model(input_ids)[2]
          # A silent mismatch would apply the weights to the wrong layers.
          if len(hidden_states) != len(weights):
            raise ValueError(
              'weighted_layers: got %d hidden states but %d weights'
              % (len(hidden_states), len(weights)))
          vector = sum(w * layer[0, pos] for w, layer in zip(weights, hidden_states)).tolist()
          if word in embedding_dt_weighted:
            embedding_dt_weighted[word] = [(old + new) / 2 for old, new in zip(embedding_dt_weighted[word], vector)]
          else:
            embedding_dt_weighted[word] = vector
  return embedding_dt_weighted

In [None]:
# Compute last-layer ingredient embeddings for the recipe subset.
# This can be slow: the model is run once for every ingredient match found.
embedding_dt = last_layer(data)

In [None]:
# Persist the last-layer embeddings (dict: ingredient -> list of floats) to Drive as a pickle.
with open('/content/gdrive/MyDrive/ingredient substitution/embedding_last_layer_5K.pkl', 'wb') as f:
    pickle.dump(embedding_dt, f)

In [None]:
# Compute ingredient embeddings averaged over all hidden states for the recipe subset.
embedding_dt_mean = mean_layers(data)

In [None]:
# Persist the layer-averaged embeddings (dict: ingredient -> list of floats) to Drive as a pickle.
with open('/content/gdrive/MyDrive/ingredient substitution/embedding_mean_layer_5K.pkl', 'wb') as f:
    pickle.dump(embedding_dt_mean, f)

In [None]:
# Compute ingredient embeddings as a weighted sum over all hidden states for the recipe subset.
embedding_dt_weighted = weighted_layers(data)

In [None]:
# Persist the layer-weighted embeddings (dict: ingredient -> list of floats) to Drive as a pickle.
with open('/content/gdrive/MyDrive/ingredient substitution/embedding_weighted_layer_5K.pkl', 'wb') as f:
    pickle.dump(embedding_dt_weighted, f)