# Verify GPU Availability
## If this raises an error:
## Edit -> Notebook Settings -> select "GPU" as the Hardware Accelerator

In [0]:
import tensorflow as tf

# Ask TensorFlow which GPU (if any) is visible to this runtime and stop early
# when the expected first GPU device is not reported.
device_name = tf.test.gpu_device_name()
gpu_found = device_name == '/device:GPU:0'
if not gpu_found:
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [0]:
# install 
!pip install pytorch-pretrained-bert pytorch-nlp

# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertAdam, BertConfig
from tqdm import tqdm, trange
import pandas as pd
from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
import io
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
import os
% matplotlib inline

# specify GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

# Import Data From Local Drive

In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive; the pickled inputs are read from, and
# the trained model is written to, paths under '/content/drive/My Drive/'.
drive.mount('/content/drive')

In [0]:
class Lang:
    """Vocabulary for one language: word <-> index maps plus word counts.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers in
    ``index2word``, so ordinary words are numbered from 2 upward.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS already occupy two slots

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [0]:
import pickle

def _load_pickle(path):
  """Deserialize and return the object stored in the pickle file at ``path``."""
  with open(path, 'rb') as handle:
    return pickle.load(handle)

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
baking_data = _load_pickle('/content/drive/My Drive/baking_data_title_ingredients.pickle')
nutritional_df = _load_pickle('/content/drive/My Drive/nutritional_info.pickle')

In [0]:
def convert_token_seq_to_string(token_seq, token_num_to_str):
  """Render a sequence of token ids as one space-separated string.

  Args:
    token_seq: iterable of token ids.
    token_num_to_str: mapping from token id to its string form.

  Returns:
    The mapped tokens joined by single spaces. Ids missing from
    ``token_num_to_str`` are kept as their ``str()`` form; previously a
    non-string id reached ``str.join`` unchanged and raised ``TypeError``.
  """
  return ' '.join(
      token_num_to_str[num] if num in token_num_to_str else str(num)
      for num in token_seq)

In [0]:
#get the baking info

#get the mask: True for recipes whose id also appears in the nutritional data
health_mask = baking_data[0].id.isin(nutritional_df.id)
health_indices = [i for i, keep in enumerate(health_mask) if keep]

print(len(health_indices))

#baking dataframe
df = baking_data[0][health_mask]

print(len(df))

#training recipe ids
baking_ids = df.id.values

#list of strings representing each recipe
# Each raw string is paired with its mask flag instead of testing
# `i in health_indices`, which rescanned the whole index list per recipe.
# The separators are replaced by a space-padded ' [SEP] ': the BERT tokenizer
# only keeps '[SEP]' as a special token when it is whitespace-delimited, and
# extra spaces are collapsed during tokenization.
baking_strings = [
    '[CLS] ' + item.replace('--|||--', ' [SEP] ').replace('||', ' [SEP] ') + ' [SEP]'
    for item, keep in zip(baking_data[1], health_mask)
    if keep
]

#dictionary mapping imported tokens ids to token strings
token_num_to_str = baking_data[7]

#list of lists where each list is of token ids with some missing
#test_baking_tokens_missing = [item[0] for item in baking_data[4]]
#test_baking_strings_missing = ['[CLS] ' + convert_token_seq_to_string(r, token_num_to_str).replace('||', '[SEP]').replace('MASK', '[MASK]') + ' [SEP]' for r in test_baking_tokens_missing]

#list of lists where each list is of token ids
#test_baking_tokens_full = baking_data[3]
#test_baking_strings_full = ['[CLS] ' + convert_token_seq_to_string(r, token_num_to_str).replace('||', '[SEP]') + ' [SEP]' for r in test_baking_tokens_full]

# Tokenize

In [0]:
# Tokenize with BERT tokenizer

# Load the lower-casing vocabulary published for 'bert-base-uncased'.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# One token list per recipe string, in the same order as baking_strings.
tokenized_recipes = list(map(tokenizer.tokenize, baking_strings))

# Pad Tokenized Sequences

In [0]:
# Length, in tokens, of the longest tokenized recipe.
max_length = max(map(len, tokenized_recipes))
print(max_length)

In [0]:
# Set the maximum sequence length.
# NOTE: this should cover the max printed in the cell above so nothing is
# truncated, but bert-base-uncased only has 512 position embeddings; longer
# inputs would fail inside the model, so cap there (over-long recipes then lose
# their tail, including the closing [SEP]).
BERT_MAX_POSITIONS = 512
MAX_LEN = min(max_length, BERT_MAX_POSITIONS)

# Map tokens to vocabulary ids and right-pad / right-truncate every sequence to
# MAX_LEN. A second pad_sequences pass over the already-padded array was a
# no-op and has been dropped.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_recipes],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [0]:
# Create attention masks
# 1.0 for every real token, 0.0 for padding (padding id is 0). Computed on the
# whole padded array at once instead of element by element in Python.
attention_masks = (input_ids > 0).astype(float).tolist()
# LM labels are the input ids themselves, with padding positions set to -1
# (the label value the masked-LM loss in pytorch-pretrained-bert ignores).
# NOTE(review): no input token is ever replaced by [MASK], so the model is
# trained to predict tokens it can already see -- confirm this is intended.
masked_lm_labels = np.where(input_ids == 0, -1, input_ids).tolist()

# Train the Model

In [0]:
%%capture
# Load pretrained 'bert-base-uncased' with its masked-LM head; %%capture hides the cell's output.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

In [0]:

# Use train_test_split to split our data into train and validation sets for training
#train_inputs, _, train_masks, _, train_lm_labels, _ = train_test_split(input_ids, attention_masks, masked_lm_labels, random_state=2018, test_size=1)

# The model consumes torch tensors, so convert ids, masks and labels together
train_inputs, train_masks, train_lm_labels = (
    torch.tensor(array) for array in (input_ids, attention_masks, masked_lm_labels)
)

# Number of recipes per training batch
batch_size = 32

# Bundle the three aligned tensors and serve them in randomly sampled mini-batches
train_data = TensorDataset(train_inputs, train_masks, train_lm_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)


In [0]:
%%capture
#move model to gpu
model.cuda()

In [0]:

# BERT fine-tuning parameters
# Number of training epochs (defined first: the schedule length depends on it)
epochs = 2

param_optimizer = list(model.named_parameters())
# Biases and LayerNorm parameters are excluded from weight decay. In
# pytorch-pretrained-bert the LayerNorm tensors are named 'LayerNorm.weight' /
# 'LayerNorm.bias'; the older 'gamma' / 'beta' names no longer match them.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# BertAdam reads the per-group key 'weight_decay'. An unknown key such as
# 'weight_decay_rate' is silently ignored, so the optimizer default would be
# applied to every group, including the no-decay one.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# `warmup` is a fraction of `t_total`; without t_total the learning rate stays
# constant and the warmup setting has no effect.
num_train_steps = len(train_dataloader) * epochs
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1,
                     t_total=num_train_steps)

# Store our loss and accuracy for plotting
train_loss_set = []

# BERT training loop
for _ in trange(epochs, desc="Epoch"):

  ## TRAINING

  # Set our model to training mode
  model.train()
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  # Train the data for one epoch

  tot = len(train_dataloader)

  for step, batch in enumerate(train_dataloader):
    if step % 50 == 0:
      print('Step %s of %s'%(step, tot))
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_lm_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass: with labels supplied the model returns the masked-LM loss
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, masked_lm_labels=b_lm_labels)
    # .item() copies the scalar to the host; read it once and reuse the value
    step_loss = loss.item()
    train_loss_set.append(step_loss)
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # Update tracking variables
    tr_loss += step_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
  # max(..., 1) keeps the epoch summary from dividing by zero on an empty loader
  print("Train loss: {}".format(tr_loss / max(nb_tr_steps, 1)))

# plot training performance: one point per training batch, in order
fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title("Training loss")
ax.set_xlabel("Batch")
ax.set_ylabel("Loss")
ax.plot(train_loss_set)
plt.show()


# Getting Embeddings

In [0]:
def get_embeddings(input_ids, model, gpu=True):
  """Return the model's output at sequence position 0 for each input row.

  Args:
    input_ids: integer tensor of token ids, one row per sequence, with 0 used
      as the padding id.
    model: a ``torch.nn.Module`` called as
      ``model(input_ids, attention_mask=...)`` whose output is indexable as
      ``[row, position, feature]``.
    gpu: when True, move ``input_ids`` to the notebook-level ``device`` first.

  Returns:
    NumPy array holding, for every row, the output vector at position 0.
  """
  if gpu:
    input_ids = input_ids.to(device)
  # Padding (id 0) should not be attended to, matching the mask used in training.
  attention_mask = (input_ids > 0).long()
  # Inference only: turn dropout off so results are deterministic, and skip
  # autograd so no graph is kept alive for each batch. The caller's
  # train/eval mode is restored afterwards.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      outputs = model(input_ids, attention_mask=attention_mask)
  finally:
    model.train(was_training)
  # NOTE(review): when `model` is BertForMaskedLM these are presumably
  # vocabulary logits at the first position rather than hidden states --
  # confirm that is the intended "embedding".
  embeddings = outputs[:,0,:]
  embeddings_n = embeddings.cpu().detach().numpy()
  return embeddings_n

In [0]:
batch_size = 32

# Ceiling division: the former `// batch_size + 1` produced a trailing empty
# batch whenever the dataset size was an exact multiple of batch_size.
num_batches = (len(train_inputs) + batch_size - 1) // batch_size

# Collect the per-batch arrays and concatenate once at the end; concatenating
# inside the loop re-copies every previously computed row on each iteration.
embedding_batches = []
for b in range(num_batches):
  if b % 10 == 0:
    print('Batch: %s'%(b+1))
  curr_batch = train_inputs[batch_size*b:batch_size*(b+1)]
  embedding_batches.append(get_embeddings(curr_batch, model))

embeddings = np.concatenate(embedding_batches, axis=0)

In [0]:
from time import time

In [0]:

num = embeddings.shape[0]

start = time()

# Cosine similarity for every pair at once: scale each row to unit length and
# take a single matrix product, replacing the nested Python loop that called
# scipy's `cosine` once per pair.
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
unit_rows = embeddings / row_norms
# Clip rounding overshoot back into the valid cosine range and keep the
# float64 dtype the matrix had before.
sim_mtx = np.clip(unit_rows @ unit_rows.T, -1.0, 1.0).astype(np.float64)
# The diagonal stays 0, as before, so a recipe is not ranked as its own
# nearest neighbour in the lookup below.
np.fill_diagonal(sim_mtx, 0.0)

end = time()
print(end-start)


In [0]:
# Row i of sim_mtx is looked up through baking_ids[i], which is df.id.values,
# so position i in df's title column is the matching recipe title.
titles = df.title.values
# Never ask for more examples or neighbours than there are recipes.
num_examples = min(20, len(baking_ids))
num_neighbours = min(6, sim_mtx.shape[1])

for idx in range(num_examples):
  print('Recipe:')
  print(titles[idx])
  print('Similar Recipes:')
  row = sim_mtx[idx]
  # Sort descending so neighbours are listed most-similar first. argpartition
  # returned the top entries in arbitrary order, and the former `isin` lookup
  # printed titles in dataframe order rather than similarity order.
  most_similar_inds = np.argsort(row)[::-1][:num_neighbours]
  print(most_similar_inds)
  print(titles[most_similar_inds])
  print('--------------')

# Save Model

In [0]:
# If we save using the predefined names, we can load using `from_pretrained`
output_dir = '/content/drive/My Drive/'
# WEIGHTS_NAME / CONFIG_NAME are the predefined file names exported by
# pytorch_pretrained_bert; the earlier custom weights name
# 'trained_bert_model' contradicted the comment above.
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

In [0]:
# Write the fine-tuned weights, the model config and the tokenizer vocabulary
# to the output locations chosen above.
weights = model.state_dict()
torch.save(weights, output_model_file)
model.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)