### Package installs and setup


In [None]:
# Install the third-party packages used by the cells below (notebook shell magics).
# NOTE(review): `tensorflow` appears twice in the first command.
!pip install --quiet tensorflow pandas fasttext fsspec tensorflow_text tensorflow keras-tuner transformers gdown gensim
!pip install --quiet --upgrade tensorflow-hub
!pip install --quiet python-Levenshtein simple_elmo

In [None]:
import re, io, os, math, itertools, statistics, random
import numpy as np
import pandas as pd
from scipy.spatial import distance
import tensorflow as tf
from tensorflow import keras
import kerastuner as kt
import tensorflow_hub as hub
from Levenshtein import distance as lev_distance
#from  IPython import display
from matplotlib import pyplot as plt

# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and create the PyDrive client (`drive`) that
# load_dataset() uses. This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Google Drive ids for the directories in the dataset, keyed by split name.
# The ids are blank here and must be filled in before load_dataset() is called.
dir_ids = {
'COMETA': '',
'ZEROSHOT_SPECIFIC': '',
'ZEROSHOT_GENERAL': '',
'STRATIFIED_GENERAL': '',
'STRATIFIED_SPECIFIC':'',
}

def load_dataset(dir_id):
  """Download every file in a Google Drive directory and parse it with pandas.

  Parameters:
    dir_id: the Google Drive id of the directory holding one COMETA split set.

  Returns a dict mapping each file title minus its last four characters
  (i.e. minus '.csv') to the dataframe read from that file, so that
  dataset['train'] holds the contents of 'train.csv'. Each file is read as
  tab-separated text and is also written to the working directory.
  """
  # Map every file title in the directory to its Drive file id.
  listing = drive.ListFile({'q': "'{}' in parents".format(dir_id)}).GetList()
  title_to_id = {entry['title']: entry['id'] for entry in listing}

  frames = {}
  for title, file_id in title_to_id.items():
    name = title[0:-4]  # drop the trailing '.csv'
    print("Loaded", name)
    # The original author reports that printing the id works around an
    # unexplained bug, so the print is kept.
    print(file_id)
    remote = drive.CreateFile({'id': file_id})
    remote.GetContentFile(title)  # saves the file locally under its title
    frames[name] = pd.read_csv(title, sep='\t')

  return frames

# Load the stratified and zero-shot splits that use the specific SNOMED labels.
# The general-label variants are left commented out.
strat_spec = load_dataset(dir_ids['STRATIFIED_SPECIFIC'])
#strat_gen = load_dataset(dir_ids['STRATIFIED_GENERAL'])
zsh_spec = load_dataset(dir_ids['ZEROSHOT_SPECIFIC'])
#zsh_gen = load_dataset(dir_ids['ZEROSHOT_GENERAL'])

### Helper functions

These helper functions are all used with several different classes.

In [None]:
def remove_duplicates(embeddings, series, average=False, preserve=False):
  """
  Remove (or average) the embeddings of duplicated labels, preserving order.

  Parameters:
    embeddings: the embeddings; must expose `.shape` and be indexable along
      its first axis (e.g. a tensor or numpy array with one row per label).
    series: the corresponding labels, as a list or a pandas Series.
    average: If True, every duplicated label gets the mean of the embeddings
      of all its occurrences. If False, the embeddings are left untouched.
    preserve: If `average` is True and `preserve` is True, then embeddings and
      series stay the same size. If `preserve` is False, only the first
      occurrence of each label is kept.

  Raises:
    ValueError: if preserve=True is combined with average=False, or if the
      number of embeddings differs from the number of labels.

  Returns a (embeddings, labels) tuple.
    embeddings: a tensor of embeddings (when average=False and preserve=False
      the result of tf.boolean_mask on the input).
    labels: a list of the de-duplicated labels when preserve=False, otherwise
      the labels as a pandas Series.
  """
  if preserve and not average:
    raise ValueError("preserve=True is only meaningful together with average=True")
  if embeddings.shape[0] != len(series):
    raise ValueError("embeddings and series must contain the same number of items")

  if isinstance(series, list):
    series = pd.Series(series)

  if average:
    # Work positionally so that a Series with a non-default index still lines
    # up with the rows of `embeddings`.
    labels = series.to_list()
    is_duplicated = series.duplicated(keep=False).to_numpy()  # True for every copy

    # { label: [i, j, k] } for every label that occurs more than once
    positions_by_label = {}
    for position, (label, dup) in enumerate(zip(labels, is_duplicated)):
      if dup:
        positions_by_label.setdefault(label, []).append(position)

    # Average the embeddings of each duplicated label. The group is collected
    # under its own name: rebinding `embeddings` here (as the previous version
    # did) made the lookup index into an empty list.
    mean_by_label = {}
    for label, positions in positions_by_label.items():
      group = tf.stack([embeddings[position] for position in positions], axis=0)
      mean_by_label[label] = tf.reduce_mean(group, axis=0)

    # Rebuild the full list: averaged embeddings for duplicates, originals otherwise
    embeddings_list = [
        mean_by_label[label] if dup else embeddings[position]
        for position, (label, dup) in enumerate(zip(labels, is_duplicated))
    ]
    embeddings_tensor = tf.stack(embeddings_list, axis=0)
  else:
    embeddings_tensor = embeddings

  if preserve:
    # Duplicates are retained, but they now share their averaged embedding.
    return embeddings_tensor, series

  # Keep only the first occurrence of each label; ~ turns "is a later
  # duplicate" into "keep this row".
  keep_mask = ~series.duplicated(keep='first')
  kept_labels = series.drop_duplicates(keep='first')
  kept_embeds = tf.boolean_mask(embeddings_tensor, keep_mask.to_numpy())
  return kept_embeds, kept_labels.to_list()

In [None]:
def create_batches(data, batch_size=100):
  """Split `data` into consecutive slices of at most `batch_size` items.

  Every slice holds exactly `batch_size` items, except possibly the last one,
  which holds whatever is left over. `data` only needs to support len() and
  slicing.
  """
  full_batches, leftover = divmod(len(data), batch_size)

  batches = [data[batch_size*n:batch_size*(n+1)] for n in range(full_batches)]
  if leftover:
    # The tail that did not fill a whole batch
    batches.append(data[batch_size*full_batches:])

  return batches

In [None]:
def pad_outputs(outputs, min_size=None, d=0, batches=False, rank=3):
  """
  Zero-pad a list of tensors to a common size along one dimension.

  Parameters:
    outputs: a list of tensors, all of rank `rank`.
    min_size: If None (or 0), pad to the size of the largest tensor.
      Otherwise pad to exactly min_size (e.g. 64).
    d: the dimension to pad; must satisfy 0 <= d < rank (no negative indices).
    batches: if True, do the padding in batches of 25.
    rank: the rank of the tensors that are being padded.

  Raises:
    ValueError: if a tensor is already larger than min_size along d, or if d
      is not a valid dimension for the given rank.

  Returns a list of tensors that all have the same size on dimension d. The
  padding is appended after the existing values.
  """
  if not outputs:
    return []
  if not 0 <= d < rank:
    raise ValueError("d must satisfy 0 <= d < rank")

  # Current size of every tensor along the padded dimension
  sizes = [output.shape.as_list()[d] for output in outputs]
  max_size = max(sizes)

  if min_size and max_size > min_size:
    raise ValueError("You're trying to use a tensor that's too big :(")
  size = min_size if min_size else max_size # size that tensors get padded to

  if batches:
    padded_outputs = []
    for batch in create_batches(outputs, batch_size=25):
      # Forward d and rank: the previous version dropped them, so the
      # recursive call always padded dimension 0 of a rank-3 tensor.
      padded_outputs.extend(
          pad_outputs(batch, min_size=size, d=d, batches=False, rank=rank))
    return padded_outputs

  padded_outputs = []
  for output, current in zip(outputs, sizes):
    # One [before, after] pair per dimension; only dimension d grows.
    paddings = [[0, 0] for _ in range(rank)]
    paddings[d] = [0, size - current]
    padded_outputs.append(tf.pad(output, paddings, 'CONSTANT'))

  return padded_outputs

In [None]:
class Results():
  """
  A class that acts as a wrapper around a pandas df that contains the results of
  an experiment.

  Parameters:
    df: a dataframe with columns 'Term', 'Gold label', 'Correct'
    meta: a list in the format [in_name, in_d, out_name, out_d, k]
    use_ids: whether the output is a SNOMED id. It is stored on the instance
      but not consulted by any method of this class.
  """
  def __init__(self, df, meta, use_ids):
    self.data = df
    self.meta = meta
    self.use_ids = use_ids

  def __is_hard(self):
    """Boolean mask of the rows where term and gold label differ, ignoring case."""
    return self.data['Term'].str.lower() != self.data['Gold label'].str.lower()

  def __exclude_easy(self):
    """Return the rows that are not easy cases (term == label, ignoring case)."""
    return self.data[self.__is_hard()]

  def __get_correct(self, exclude_easy=True):
    """
    Get the rows where the experiment got the answer right.

    Parameters:
      exclude_easy: controls whether to exclude cases where the Term and Gold
      label are identical (ignoring case).

    Returns a dataframe of the correct results.
    """
    is_correct = self.data['Correct'] == True
    if exclude_easy:
      return self.data[is_correct & self.__is_hard()]
    return self.data[is_correct]

  def __get_edit_distance(self):
    """
    Get the Levenshtein edit distance between each term and its gold label.
    Both term and label are lowercased first.

    Adds (or overwrites) the 'Edit distance' column of the dataframe in place.
    """
    # zip walks the two columns positionally, so this also works when the
    # dataframe does not have the default 0..n-1 index.
    self.data['Edit distance'] = [
        lev_distance(term.lower(), label.lower())
        for term, label in zip(self.data['Term'], self.data['Gold label'])
    ]

  def analyse(self):
    """
    Analyse the results. Returns a list of five numbers:
    - the total number of right answers
    - the total number of attempts
    - the total number of right answers, excluding the easy ones
    - the total number of attempts, excluding the easy ones
    - for the right answers, the average Levenshtein edit distance between
      the term and the label normalised by adding 1; this means that if
      Correct=True and Edit distance=0, i.e. if the term is the same as the gold
      label, the value of Correct*ED = 1. In other words, the algorithm gets
      credit for picking off the lowest hanging fruit. This is NaN when there
      are no right answers at all.
    """
    self.__get_edit_distance()

    correct_all = self.__get_correct(exclude_easy=False).shape[0]
    all_size = self.data.shape[0]
    correct_ee = self.__get_correct(exclude_easy=True).shape[0]
    ee_size = self.__exclude_easy().shape[0]

    if correct_all:
      weighted = self.data['Correct']*(self.data['Edit distance']+1)
      avg_edit_norm = sum(weighted)/correct_all
    else:
      # No right answers: the average is undefined rather than a division error
      avg_edit_norm = float('nan')

    return [correct_all, all_size, correct_ee, ee_size, avg_edit_norm]

  def __str__(self):
    # Run the analysis once and reuse it for both the summary and the TSV row
    stats = self.analyse()
    s = """{} (d={}) -> {} (d={})
    k={}
    Correct all: {} / {} 
    Correct exclude easy: {} / {}
    Avg normalised edit distance for the right answers: {}

    """.format(*self.meta, *stats)

    return s+"\t".join([str(m) for m in self.meta] + [str(a) for a in stats])

def lower(s):
  """Return `s.lower()`; a named function so it can be passed to Series.apply."""
  return s.lower()

In [None]:
class Experiment():
  """Common driver for a single mapping experiment (linear or neural)."""

  def run(self, zsh=False):
    """
    Run the experiment over the whole dataset.

    First calls self.create_mapping() and self.make_predictions(). Then, for
    every test term, ranks the candidate output embeddings by cosine distance
    to the prediction and records, for each k in self.k_values, whether the
    gold output is among the k nearest candidates.

    Parameters:
      zsh: if True, the candidates are the (de-duplicated) outputs of the test
      set; use this for the zeroshot datasets. Otherwise the candidates are
      the (de-duplicated) outputs seen in the train set.

    Returns self.results, a list with one Results object per value of k.
    """
    self.create_mapping()    # generate the input embeds and learn the mapping
    self.make_predictions()  # fills self.predictions

    # Candidate pool: unique output embeddings with their labels/ids
    if zsh:
      out_embs_unique, snomed = remove_duplicates(self.y_test, self.test_out)
    else:
      out_embs_unique, snomed = remove_duplicates(self.y_train, self.train_out)

    rows_per_k = [[] for _ in range(0, len(self.k_values))]
    for row in range(0, len(self.test_in)):
      term = self.test_in[row]
      gold = self.test_out[row]
      prediction = self.predictions[row]

      # test_out holds ids or labels depending on use_ids; the other one
      # comes from test_out_other.
      if self.use_ids:
        gold_id, gold_label = gold, self.test_out_other[row]
      else:
        gold_label, gold_id = gold, self.test_out_other[row]

      # Candidates ordered from nearest to farthest
      distances = distance.cdist([prediction], out_embs_unique, "cosine")[0]
      ranking = np.argsort(distances)

      for slot, k in enumerate(self.k_values):
        nearest = [snomed[j] for j in ranking[0:k]]
        rows_per_k[slot].append([term, gold_label, gold_id, gold in nearest])

    # One Results object per value of k
    self.results = []
    col_names = ['Term', 'Gold label', 'Gold id', 'Correct']
    for k, rows in zip(self.k_values, rows_per_k):
      frame = pd.DataFrame(rows, columns=col_names)
      meta = [self.in_name, self.in_model.d, self.out_name, self.out_model.d, k]
      self.results.append(Results(frame, meta, self.use_ids))

    return self.results

In [None]:
class Experiments():
  """
  A collection of linear or neural mapping experiments, run with several
  different in_models and several different values of k. They should all use
  the same datasets.
  """
  def run(self, zsh=False):
    """
    Run every experiment in self.experiments and tabulate the outcome.

    Each Results object is printed, and one row per (experiment, k) is
    stored in self.results_data.

    Returns a dataframe built from self.results_data, with self.header as its
    column names.
    """
    self.results_data = []
    for exp in self.experiments:
      for r in exp.run(zsh):
        print(r)
        # NOTE(review): for non-linear experiments metrics() is invoked here
        # and again when the row is built below; the first return value is
        # discarded.
        if exp.description != 'linear':
          exp.metrics()
        print()
        row = [*r.meta, *r.analyse(), *exp.metrics(), exp.description]
        self.results_data.append(row)

    return pd.DataFrame(self.results_data, columns=self.header)

### Data

In [None]:
# The column headers of the dataset files are
# ['ID', 'Term', 'General SNOMED Label', 'General SNOMED ID',
# 'Specific SNOMED Label', 'Specific SNOMED ID', 'Example', 'Subreddit']

# Stop pandas from truncating the columns or their contents when displaying
pd.set_option("display.max_columns", None)
pd.set_option('display.max_colwidth', None)

class Dataset():
  """
  A dataset that the experiment is run on.
  Mostly a wrapper for a dataframe.

  Arguments:
    df: the pandas dataframe. It must have the columns 'ID', 'Term', 'Example',
    'Subreddit' and '<Kind> SNOMED Label' / '<Kind> SNOMED ID'.
    kind: whether to use the general or specific SNOMED labels. Make sure to use
    general with the strat_gen or zsh_gen dataset, and mutatis mutandis for spec.
  """
  def __init__(self, df, kind='specific'):
    self.df = df
    self.kind=kind
    self.cometa_ids = [str(value) for value in self.df['ID'].to_list()]
    self.labels = self.df[self.__label_col()].to_list()
    self.ids = [str(value) for value in self.df[self.__id_col()].to_list()]
    self.terms = self.df['Term'].to_list()
    self.examples = self.df['Example'].to_list()
    self.subreddits = self.df['Subreddit'].to_list()
    self.size = len(self.labels)

  def __label_col(self):
    """Name of the label column for this dataset's kind."""
    return self.kind.capitalize()+' SNOMED Label'

  def __id_col(self):
    """Name of the id column for this dataset's kind."""
    return self.kind.capitalize()+' SNOMED ID'

  def __getitem__(self, key):
    """Subscripting returns the ith row of the dataset."""
    return self.df.iloc[key, :]

  def col(self, name):
    """Returns a list of the values in a particular column, as strings."""
    return [str(value) for value in self.df[name].to_list()]

  def unique_subreddits(self):
    """Returns an array of the unique subreddits that the data was extracted from."""
    return self.df['Subreddit'].unique()

  def unique_labels(self):
    """Returns an array of the unique SNOMED labels."""
    return self.df[self.__label_col()].unique()

  def edit_distances(self):
    """
    Get the distribution of the Levenshtein edit distance between each term and
    its label. Both term and label are lowercased, and 1 is added to every
    distance.

    Returns a (distances, counts) pair: the keys and the values of a dict
    mapping each distance+1 to the number of rows that have it.
    """
    distribution = {}
    # zip walks the columns positionally, so a non-default index is fine
    for term, label in zip(self.df['Term'], self.df[self.__label_col()]):
      lev = lev_distance(term.lower(), label.lower())+1 # normalise by adding 1
      distribution[lev] = distribution.get(lev, 0) + 1

    return distribution.keys(), distribution.values()

  def __cos_sim(self, v1, v2):
    """Returns the cosine similarity between two vectors, or 0 if either vector
    is all zeros."""
    if not tf.math.count_nonzero(v1) or not tf.math.count_nonzero(v2):
      return 0.0
    # scipy's distance.cosine is 1 - similarity, so invert it again
    return -distance.cosine(v1, v2)+1

  def avg_cos_sim1(self, model):
    """
    Returns the average cosine similarity between each term and its label,
    both lowercased.

    Arguments:
      model: the static or dynamic model used to find the cosine similarity.
    """
    terms = [t.lower() for t in self.terms]
    labels = [l.lower() for l in self.labels]
    embeds = model.get_embeddings(terms, labels, pooled=True)

    similarities = [self.__cos_sim(embeds[0][i], embeds[1][i])
                    for i in range(0, len(terms))]

    return np.mean(similarities)

  def avg_cos_sim2(self, col, model, k=100):
    """
    Returns the average cosine similarity between each pair in a random sample
    of size k of a single column.

    Parameters:
      col: the column to take the sample from: 'terms', 'labels' or 'ids'.
      model: the model to process the sample with.
      k: the size of the sample. DO NOT MAKE THIS TOO BIG: the number of pairs
      grows quadratically with k.

    Raises:
      ValueError: if col is not one of the supported columns.

    Returns:
      The average cosine similarity between each pair, as a float.
    """
    columns = {'terms': self.terms, 'labels': self.labels, 'ids': self.ids}
    if col not in columns:
      raise ValueError("col must be one of 'terms', 'labels' or 'ids'")

    samples = random.sample(range(0,self.size), k=k)
    col_embeds = model.process_dataset([columns[col][i] for i in samples])

    # combs is all the (col[i], col[j]) pairs of the sample, where i != j
    col_embeds = col_embeds.numpy().tolist()
    combs = list(itertools.combinations(col_embeds, 2))
    combs = tf.constant(combs)

    # calculate the cosine similarity between each pair in combs
    batches = create_batches(combs, batch_size=100)
    cos_sims = [[self.__cos_sim(comb[0],comb[1]) for comb in batch] for batch in batches]
    cos_sims = tf.concat(cos_sims, axis=-1)
    return float(tf.reduce_mean(cos_sims))

  def sample(self, static_model, k=100):
    """
    Get a random sample of size k of the entries in the dataset.

    Arguments:
      static_model: the static model to use to get the cosine similarity between
      the label and the term.

      k: the size of the random sample

    Returns:
      A dataframe with headers ['COMETA ID', 'Term', 'Label', 'lev', 'cos_sim'].
      'lev' contains the Levenshtein edit distance between 'Term' and 'Label'.
      'cos_sim' contains the cos similarity computed by static_model between
      'Term' and 'Label', rounded to three decimal places.
    """
    rows = []
    for i in random.sample(range(0,self.size), k=k):
      cometa_id = self.cometa_ids[i]
      label = self.labels[i]
      term = self.terms[i]

      lev = lev_distance(label.lower(), term.lower())

      label_embed = static_model.process_string(label.lower(), pooled=True)
      term_embed = static_model.process_string(term.lower(), pooled=True)
      cos_sim = self.__cos_sim(label_embed, term_embed)

      rows.append([cometa_id, term, label, lev, round(cos_sim, 3)])

    cols = ['COMETA ID', 'Term', 'Label', 'lev', 'cos_sim']
    return pd.DataFrame(rows, columns=cols)

  def term_stats(self):
    """
    Get the distribution of term lengths, measured in whitespace-separated words.

    Returns a (lengths, counts) pair: the keys and the values of a dict mapping
    each word count to the number of terms that have it.
    """
    stats = {}
    for term in self.terms:
      words = len(term.split())
      stats[words] = stats.get(words, 0) + 1
    return stats.keys(), stats.values()

  def join(self, order='l e'):
    """
    Join the labels and terms/examples together. This is to try to get
    context-sensitive label embeddings with BERT.

    Arguments:
      order: a three-character string. The first and third characters select
      what is joined: label-example (l e), example-label (e l), label-term
      (l t) or term-label (t l). The middle character is inserted after the
      first element, followed by a space; use a full stop (e.g. `l.t`) to end
      the first element with a full stop. With the default space the two
      elements end up separated by two spaces.

    Raises:
      ValueError: if order does not describe one of the four supported joins.

    Returns:
      A list of the joined strings, one per row.
    """
    if len(order) < 3:
      raise ValueError("order must look like 'l e', 'e l', 'l t' or 't l'")
    sep = order[1]
    pair = order[0] + order[2]
    if pair not in ('le', 'el', 'lt', 'tl'):
      raise ValueError("order must look like 'l e', 'e l', 'l t' or 't l'")

    sources = {'l': self.labels, 'e': self.examples, 't': self.terms}
    first, second = sources[pair[0]], sources[pair[1]]
    return [first[i]+sep+" "+second[i] for i in range(0,self.size)]

In [None]:
# Print, for each model, the mean over 10 runs of the average pairwise cosine
# similarity of 100 sampled test terms (stratified, then zero-shot).
# The models and the test_d / test_d_zsh datasets are assigned in the cells
# further down, so those cells have to be run first.
models = [ft_br, glove_br, bert_br]
for m in models:
  sim_strat = statistics.mean(
      [test_d.avg_cos_sim2('terms', m, k=100) for _ in range(0, 10)])
  sim_zsh = statistics.mean(
      [test_d_zsh.avg_cos_sim2('terms', m, k=100) for _ in range(0, 10)])
  print("\t".join([m.name, str(sim_strat), str(sim_zsh)]))

In [None]:
# Wrap the train/dev/test frames of each split set in Dataset objects,
# using the specific SNOMED labels throughout.
train_d, val_d, test_d = [
    Dataset(strat_spec[split], kind='specific')
    for split in ('train', 'dev', 'test')
]

train_d_zsh, val_d_zsh, test_d_zsh = [
    Dataset(zsh_spec[split], kind='specific')
    for split in ('train', 'dev', 'test')
]

In [None]:
# Pick the input models used in the analyses: the BioReddit fastText and GloVe
# embeddings plus a BERT wrapper using the last hidden state ('lasths').
# NOTE(review): loaded_ft_models / loaded_glove_models are not defined in the
# visible part of the notebook; presumably they are filled in by the static
# model loading cells. The other entries are left commented out.
ft_br = loaded_ft_models['reddit-biomed']
# ft_cc = loaded_ft_models['crawl-300d-2M-subword']
# ft_wiki = loaded_ft_models['wiki-news-300d-1M-subword']
glove_br = loaded_glove_models['bioreddit.glove.200']
# glove_cc = loaded_glove_models['glove.42B.300d']
# glove_wiki = loaded_glove_models['glove.6B.200d']
bert_br = BertModel(bert_tokenizer, bert_model, 'bert_br', 'lasths')

In [None]:
def get_edit_dist_stats(datasets):
  """Tabulate the edit-distance distribution of several datasets side by side.

  Each dataset contributes one column (its counts, indexed by distance); the
  columns are outer-joined on the distance. Also lifts pandas' display limit
  on the number of rows.
  """
  pd.set_option("display.max_rows", None)

  columns = []
  for ds in datasets:
    dists, counts = ds.edit_distances()
    columns.append(pd.DataFrame(counts, index=dists))

  return pd.concat(columns, axis=1, join='outer')

# Show the edit-distance distributions of all six splits in one table
get_edit_dist_stats([train_d, val_d, test_d, train_d_zsh, val_d_zsh, test_d_zsh])

In [None]:
def get_term_length_stats(datasets):
  """Tabulate the term_stats() distribution of several datasets side by side.

  Each dataset contributes one column (its counts, indexed by the keys that
  term_stats() returns); the columns are outer-joined on that index.
  """
  columns = []
  for ds in datasets:
    lengths, counts = ds.term_stats()
    columns.append(pd.DataFrame(counts, index=lengths))
  return pd.concat(columns, axis=1, join='outer')

# Show the term-length distributions of all six splits in one table
get_term_length_stats([train_d, val_d, test_d, train_d_zsh, val_d_zsh, test_d_zsh])

### Neural input models

In [None]:
from transformers import TFAutoModel, AutoTokenizer, BertTokenizer

# Load the tokenizer and the TensorFlow model for the selected checkpoint.
# MODEL_NAME is a Colab form field; the alternatives are listed in its @param.
MODEL_NAME = "cambridgeltl/BioRedditBERT-uncased" # @param {type: "string"} ["bert-base-uncased", "bert-large-uncased", "cambridgeltl/BioRedditBERT-uncased", "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"]
bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = TFAutoModel.from_pretrained(MODEL_NAME)


In [None]:
# Download elmo bioreddit
# !mkdir elmo-br
# !wget https://github.com/basaldella/bioreddit/releases/download/v1.0/bioreddit.elmo.json
# !mv bioreddit.elmo.json elmo-br/options.json
# !wget https://github.com/basaldella/bioreddit/releases/download/v1.0/bioreddit.elmo.hdf5
# !mv bioreddit.elmo.hdf5 elmo-br/model.hdf5

In [None]:
class NeuralModel():
  """
  A wrapper for methods common to all the neural models.

  Subclasses provide `name`, `process_batch(batch, pooled=...)` and, for
  get_cs_embeddings, `cs_process_batch(batch, sbatch, pos=..., sep=...)`.
  """
  model_type = 'neural'

  def process_dataset(self, dataset, pooled=True, batch_size=100):
    """
    Process a whole dataset, batch by batch, then concatenate the batch output.

    Arguments:
      dataset: a list of strings. This is the dataset to be processed by the
      neural model.
      pooled: whether to average the output. Default True.
      batch_size: the data is processed in batches of this size. Default 100.

    Returns a Tensor of dynamic embeddings for the dataset.
    """
    if self.name == 'ELMo-br-1024':
      # it is far quicker to do the whole batch in one go with the simple_elmo model
      return self.process_batch(dataset, pooled=pooled)

    batches = create_batches(dataset, batch_size=batch_size)
    outputs = [self.process_batch(batch, pooled=pooled) for batch in batches]
    print("Processed {} batches and {} examples".format(len(batches), len(dataset)))

    if not pooled:
      # Unpooled batches can differ in sequence length (dimension 1), so pad
      # them to a common length before concatenating. Pooled outputs already
      # share their dimensions.
      outputs = pad_outputs(outputs, min_size=None, batches=False, d=1)
    return tf.concat(outputs, 0)

  def get_embeddings(self, *datasets, pooled=False, pad_size=None):
    """
    Get the neural embeddings for a list of datasets.

    Parameters:
      datasets: a nonzero number of datasets (lists of strings) to be processed.
      pooled: whether to average the embeddings for multi-word inputs.
      pad_size: accepted for compatibility only; it is not used here.

    Returns a list of tensors, one tensor for each dataset.
    """
    return [self.process_dataset(ds, pooled=pooled) for ds in datasets]

  def get_cs_embeddings(self, *datasets, order='l t', pooled=True, batch_size=100):
    """
    Process a series of datasets, batch by batch. For each element in each batch,
    extract the context-sensitive features corresponding to its label.

    Parameters:
      datasets: a nonzero number of Dataset objects to process.
      order: (Put a . instead of the space for a full-stop separator, e.g. 'l.e')
        'l e': label then example. Extract the label.
        'e l': example then label. Extract the label.
        'l t': label then term. Extract the label.
        't l': term then label. Extract the label.
        Orders ending in '-term' (extracting the term from the example) are
        not implemented.
      pooled: reserved for the '-term' orders; it is not used at present.
      batch_size: the data is processed in batches of this size. Default 100.

    Raises:
      NotImplementedError: if order ends in '-term'. The previous version
      skipped such datasets silently and returned fewer embeddings than
      datasets.

    Returns a list with one tensor of context-sensitive label embeddings per
    dataset. Rows that share a label all carry the average of their embeddings.
    """
    if order.endswith('-term'):
      raise NotImplementedError(
          "Extracting the term from the example ('-term' orders) is not implemented.")

    sep = order[1] == '.'
    # where is the label that we want to extract
    position = 'start' if order in ['l e', 'l t', 'l.e', 'l.t'] else 'end'

    embeddings = []
    for dataset in datasets:
      labels = dataset.labels
      # concatenate the labels and examples/terms
      joined = dataset.join(order=order)

      joined_batches = create_batches(joined, batch_size=batch_size)
      label_batches = create_batches(labels, batch_size=batch_size)

      # extract the embedding corresponding to each label
      label_outputs = [
          self.cs_process_batch(joined_batch, label_batch, pos=position, sep=sep)
          for joined_batch, label_batch in zip(joined_batches, label_batches)
      ]
      print("Processed {} batches and {} examples".format(len(label_batches), len(labels)))

      # concatenate the batches, then give duplicated labels their average embedding
      rd, _ = remove_duplicates(tf.concat(label_outputs, axis=0), labels, average=True, preserve=True)
      embeddings.append(tf.concat(rd,axis=0))

    return embeddings

In [None]:
class BertModel(NeuralModel):
  """
  A BERT model.

  Parameters:
    tokenizer: the BERT tokeniser.
    model: the BERT model.
    name: the name that is listed in the table of results.
    kind: one of the following values that determine how you get the
    individual word vectors:
      (see https://towardsdatascience.com/breaking-bert-down-430461f60efb, at the bottom)
      avgall: average all the hidden layers together
      lasths: last hidden state
      sumlast4: sum the last four hidden layers
      catlast4: concatenate the last four layers
  """
  KINDS = ('lasths', 'avgall', 'sumlast4', 'catlast4')

  def __init__(self, tokenizer, model, name, kind):
    self.tokenizer = tokenizer
    self.model = model
    self.kind = kind
    self.d = self.model.config.hidden_size
    # concatenating four layers quadruples the embedding size
    self.d *= 4 if self.kind == 'catlast4' else 1
    self.name = name+'-'+str(self.d)+'-'+self.kind

  def process_batch(self, batch, pooled=True):
    """
    Process a single batch with BERT.

    Arguments:
      batch: a list of strings to process.
      pooled: average the word vectors.

    Raises:
      ValueError: if self.kind is not one of KINDS.

    Returns:
      if pooled, a tensor of shape (batch_size, d)
      if not pooled, shape (batch_size, max_length, d)
        where d is the hidden size, times four when kind == 'catlast4'.
      The result is cast to float64.
    """
    if self.kind not in self.KINDS:
      raise ValueError("Unknown kind {!r}; expected one of {}".format(self.kind, self.KINDS))

    tokenized = self.tokenizer(batch, return_tensors='tf',
                               padding=True, truncation=True, max_length=512)
    hidden_states = self.model.call(tokenized, output_hidden_states=True, return_dict=True)['hidden_states']

    if self.kind == 'lasths':
      unpooled = hidden_states[-1]
    elif self.kind == 'avgall':
      # stack every hidden state except the first on a new axis 1, then
      # average over that axis
      stacked = tf.stack(hidden_states[1:], axis=1)
      unpooled = tf.reduce_mean(stacked, axis=1)
    elif self.kind == 'sumlast4':
      stacked = tf.stack(hidden_states[-4:], axis=1)
      unpooled = tf.reduce_sum(stacked, axis=1)
    else: # 'catlast4'
      # concatenate the last four layers along the feature axis
      unpooled = tf.concat(hidden_states[-4:], axis=-1)

    # compatibility with the float64 embeddings used elsewhere in the notebook
    unpooled = tf.cast(unpooled, tf.float64)
    # NOTE(review): pooling averages over every position of the padded
    # sequence, i.e. it includes the special and padding tokens.
    return tf.reduce_mean(unpooled, axis=-2) if pooled else unpooled

  def cs_process_batch(self, batch, sbatch, pos='start', sep=True):
    """
    Process a single batch with BERT. Then, for each member of the batch m[i],
    extract and average the features of the tokens of the string s[i].

    The token features are those produced by process_batch, so they follow
    the model's configured `kind`.

    Parameters:
      batch: a list of strings to run through BERT.
      sbatch: a list of strings, where batch[i] contains sbatch[i]
      pos: controls where s[i] sits within m[i]
        'start': beginning
        'end': end
        'mid': not implemented
      sep: True if a full stop separates s[i] from the rest of the string. It
        does not affect the extraction, because the separator is never part
        of s[i].

    Raises:
      ValueError: if batch and sbatch differ in length, or pos is unknown.
      NotImplementedError: if pos == 'mid'.

    Returns:
      A tensor of shape (batch_size, d)
    """
    if len(batch) != len(sbatch):
      raise ValueError('You need to give the same number of strings to extract as inputs.')
    if pos == 'mid':
      raise NotImplementedError("pos='mid' is not implemented.")
    if pos not in ('start', 'end'):
      raise ValueError("pos must be 'start', 'end' or 'mid'")

    outputs = self.process_batch(batch, pooled=False)
    stokens = [self.tokenizer.tokenize(s) for s in sbatch]

    if pos == 'start':
      # a [CLS] token is prepended to the string, hence the +1 offset
      sfeatures = [tf.reduce_mean(outputs[i][1:len(stokens[i])+1], axis=0)
                   for i in range(0, len(batch))]
    elif getattr(self.tokenizer, 'padding_side', 'right') == 'left':
      # Left-padded: every sequence ends at the last position, where the
      # appended [SEP] token sits, hence the -1.
      sfeatures = [tf.reduce_mean(outputs[i][-1-len(stokens[i]):-1], axis=0)
                   for i in range(0, len(batch))]
    else:
      # Right-padded: sequences shorter than the longest one in the batch are
      # followed by [PAD] positions, so counting back from the end of the
      # padded row would average padding vectors. Locate the real end of each
      # sequence from its unpadded length instead.
      unpadded = self.tokenizer(batch, padding=False, truncation=True, max_length=512)
      lengths = [len(ids) for ids in unpadded['input_ids']]
      sfeatures = []
      for i in range(0, len(batch)):
        stop = lengths[i] - 1                    # position of the final [SEP]
        start = max(stop - len(stokens[i]), 0)
        sfeatures.append(tf.reduce_mean(outputs[i][start:stop], axis=0))

    return tf.stack(sfeatures,0)


In [None]:
# thanks https://github.com/ltgoslo/simple_elmo
from simple_elmo import ElmoModel as PretrainedElmoModel

class ElmoModel(NeuralModel):
  """
  An ELMo model. It doesn't need a tokenizer, so long as the input is already
  tokenized.
  Defaults to loading the bioreddit model through simple_elmo; any other
  `model` passed in is called through its `signatures['default']` entry.
  """
  def __init__(self, model=None):
    # Set the flag up front: process_batch reads it on every call, and it was
    # previously only defined when the pretrained model was loaded.
    self.use_simple_elmo = False
    if not model:
      self.__load_pretrained()
      self.name = 'ELMo-br-1024'
    else:
      self.model = model
      self.name = 'ELMo-1024'
    self.d = 1024

  def __load_pretrained(self):
    """Load the Bioreddit ELMo model. Make sure you've downloaded it first!"""
    self.use_simple_elmo = True
    self.model = PretrainedElmoModel()
    self.model.load('./elmo-br')

  def process_batch(self, batch, pooled=False):
    """
    Process a single batch with ELMo.

    Arguments:
      batch: a list of strings to process.
      pooled: average the word vectors.

    Returns:
      if pooled, a tensor of shape (batch_size, 1024)
      if not pooled, shape (batch_size, max_length, 1024)
    """
    if self.use_simple_elmo:
      if pooled:
        embs = self.model.get_elmo_vector_average(batch)
      else:
        embs = self.model.get_elmo_vectors(batch)
      return tf.constant(embs)

    # Models passed in by the caller are invoked through their default
    # signature; a single tf.constant covers both list and tensor inputs.
    embeds = self.model.signatures['default'](tf.constant(batch))
    return embeds['default'] if pooled else embeds['word_emb']
    

### Static input models

In [None]:
import gensim
from scipy.fft import dct

class StaticModel():
  """
  Base class for static (context-insensitive) embedding models.

  The methods here read `self.model` (a lookup supporting `token in model` and
  `model[token]`) and `self.d` (the vector size); subclasses provide both.
  """
  model_type = 'static'

  def __dctransform(self, sentvec, max_k):
    """
    Do a discrete cosine transformation on the embeddings.
    See arxiv.org/pdf/1909.03104.pdf

    From https://github.com/N-Almarwani/DCT_Sentence_Embedding/blob/master/DCT.py#L77

    Parameters:
      sentvec: the vectors for the individual words in a sentence, as a tensor
        with one row per word.
      max_k: the number of leading DCT coefficient rows to keep.

    Returns a flat tensor of length max_k * sentvec.shape[1].
    """
    sentvec = sentvec.numpy()
    if sentvec.shape[0] < max_k:
      # Fewer words than coefficients: let dct zero-pad up to max_k rows.
      transformed = dct(sentvec, norm='ortho', n=max_k, axis=0)
    else:
      transformed = dct(sentvec, norm='ortho', axis=0)
    embedding = np.reshape(transformed[:max_k,:], (max_k*sentvec.shape[1],))
    return tf.constant(embedding)

  def process_string(self, string, pooled=True):
    """
    Transform a single string into static vectors.

    Parameters:
      string: the string to transform. When pooled is False this must already
        be a sequence of tokens (process_dataset splits and truncates it).
      pooled: controls whether to pool the output by averaging or not.

    Returns a tensor of shape (d,) if pooled, otherwise (number of tokens, d).
    Tokens missing from the model map to a zero vector, and an input with no
    tokens at all is embedded as a single zero vector.
    """
    tokens = string.split() if pooled else string
    out_of_vocab = np.zeros(self.d)

    # Get each token's embedding
    embs = [(self.model[token] if token in self.model else out_of_vocab) for token in tokens]
    if not embs:
      # tf.stack cannot stack an empty list, so treat an empty input as one
      # out-of-vocabulary token rather than crashing the whole dataset run.
      embs = [out_of_vocab]
    embeddings = tf.stack(embs, axis=0)

    if not pooled:
      return embeddings
    return tf.reduce_mean(embeddings, axis=0)

  def process_dataset(self, dataset, pooled=True, pad_size=4):
    """
    Process a whole dataset with static vectors.
    Because Python lists are fast, there's no need for batches.

    Parameters:
      dataset: a list of strings to be processed.
      pooled: if true, average the vectors for each word.
      pad_size: the size to pad the vectors to. Only relevant if pooled=False.

    Returns: a tensor of static embeddings for the dataset.
    """
    if not pooled: # truncate strings that are too long
      dataset = [string.split()[0:pad_size] for string in dataset]

    outputs = [tf.dtypes.cast(self.process_string(string, pooled=pooled),
                              tf.float64) for string in dataset]
    if not pooled:
      outputs = pad_outputs(outputs, min_size=pad_size, d=0, rank=2)

    return tf.stack(outputs, axis=0)

  def get_embeddings(self, *datasets, pooled=True, pad_size=4):
    """
    Return all of the embeddings for a bunch of datasets.

    Parameters:
      datasets: a nonzero number of Dataset objects to be processed.
      pooled: whether to average the embeddings for multi-word inputs.
      pad_size: how much to pad the inputs to. Only relevant if pooled=False.

    Returns a list of tensors, each containing the embeddings for a whole dataset.
    """
    # MixedModel.process_dataset takes no pad_size argument, so it is left out
    # for mixed models (and their subclasses).
    if isinstance(self, MixedModel):
      return [self.process_dataset(ds, pooled=pooled) for ds in datasets]
    return [self.process_dataset(ds, pooled=pooled, pad_size=pad_size) for ds in datasets]

class FTModel(StaticModel):
  """
  A FastText model contained in the text file fname.
  """
  def __init__(self, fname):
    self.fname = fname
    self.__load_model()

  def __load_model(self):
    """
    Load the vectors in `self.fname` (word2vec text format) with gensim.

    At most the first 1,000,000 vectors are read. Sets `self.name`,
    `self.model` and `self.d`.
    """
    # The name is the filename minus its last four characters (e.g. '.vec').
    self.name = 'ft-'+self.fname[0:-4]

    print("Loading FastText model {}".format(self.name))
    self.model = gensim.models.KeyedVectors.load_word2vec_format(self.fname, limit=1000000)
    self.d = self.model.vector_size

    # gensim 4 replaced KeyedVectors.vocab with key_to_index (touching .vocab
    # raises there); fall back to .vocab only on older gensim versions.
    vocab = getattr(self.model, 'key_to_index', None)
    if vocab is None:
      vocab = self.model.vocab

    print("{} words loaded!".format(len(vocab)))
    print("d = {}".format(self.d))

class GloVeModel(StaticModel):
  """
  A GloVe model contained in the text file fname.
  """
  def __init__(self, fname):
    self.fname = fname
    self.__load_model()

  def __load_model(self):
    """
    Load a single GloVe model.
    Code from https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python

    Each line is expected to be a word followed by its vector components,
    separated by whitespace. Sets `self.name`, `self.model` (a dict from word
    to numpy vector) and `self.d`.

    Raises ValueError if no vector could be read from the file.
    """
    # The name is the filename minus its last four characters (e.g. '.txt').
    self.name = self.fname[0:-4]
    print("Loading Glove Model {}".format(self.name))

    # this stops a really weird bug with glove.840B.300d: lines whose second
    # field is one of these strings cannot be parsed as numbers and are skipped
    unparseable = {'.', 'name@domain.com', 'Killerseats.com', 'mylot.com', 'Amazon.com'}

    self.model = {}
    # The context manager closes the file even if a line fails to parse.
    with open(self.fname, 'r', encoding='utf-8') as f:
      for line in f:
        split_lines = line.split()
        if len(split_lines) < 2:
          # blank line, or a word with no vector
          continue
        if split_lines[1] in unparseable:
          continue

        word = split_lines[0]
        self.model[word] = np.array([float(value) for value in split_lines[1:]])

    # Take the dimension from 'a' as before when it is present, otherwise from
    # whichever vector was loaded first.
    sample = self.model.get('a')
    if sample is None:
      sample = next(iter(self.model.values()), None)
    if sample is None:
      raise ValueError("No vectors could be read from {}".format(self.fname))
    self.d = len(sample)

    print("{} words loaded!".format(len(self.model)))
    print("d = {}".format(self.d))
  
class Node2VecModel(StaticModel):
  """
  A Node2Vec model trained on the SNOMED knowledge graph.
  See arxiv.org/abs/1907.08650
  """
  def __init__(self):
    # drive location for n2v
    self.fid = ''
    self.name = 'n2v'
    self.__load_model()

  def __load_model(self):
    """
    Download the embedding file from Google Drive and parse it.

    The first line of the file is skipped; every other non-blank line is read
    as an id followed by its vector components. Sets `self.content` (the raw
    text), `self.model` (a dict from id to numpy vector) and `self.d`.
    """
    content = drive.CreateFile({'id': self.fid})
    self.content = content.GetContentString()

    self.model = {}
    for line in self.content.splitlines()[1:]:
      fields = line.split()
      if not fields:
        # tolerate stray blank lines instead of failing on fields[0]
        continue
      snomed_id = fields[0]
      self.model[snomed_id] = np.array([float(value) for value in fields[1:]])

    # Hard-coded; the loaded vectors are not checked against this value.
    self.d = 200
    print("{} ids loaded!".format(len(self.model)))
    print("d = {}".format(self.d))

  def get_all(self):
    """Get all the SNOMED ids and all the SNOMED embeddings, as two parallel lists."""
    all_ids = list(self.model.keys())
    all_embeds = list(self.model.values())

    return all_ids, all_embeds

In [None]:
class MixedModel(StaticModel):
  """
  A class for a model that is a concatenation of the outputs from two or more
  models.

  'Term' and 'term' ARE CASE SENSITIVE

  dataset_cols: the columns of the dataset to be processed by each member of
  models. len(models) == len(dataset_cols)
  This can be one of:
    - 'Term'
    - 'Specific SNOMED ID'/'General SNOMED ID'
    - 'Specific SNOMED Label'/'General SNOMED Label'
    ===============
    (Put a . between l and e for a separator)
    - 'l e': label then example. Extract the label.
    - 'e l': example then label. Extract the label.
    - 'l t': label then term. Extract the label.
    - 't l': term then label. Extract the label.
    - 'term': Extract the term from the example.

  For instance, you will need to pass the `Specific SNOMED ID` col to Node2Vec,
  but you pass `Specific SNOMED Label` to FastText.
  """
  # Columns matching this pattern (anchored at the start of the name) are read
  # straight from the dataset; everything else is a context-sensitive order.
  _PLAIN_COLUMN = re.compile(r"((General|Specific)\ SNOMED\ (ID|Label))|Term")

  def __init__(self, dataset_cols, submodels):
    if len(dataset_cols) != len(submodels):
      raise ValueError("Please pass one column per model, ta")
    self.dataset_cols = dataset_cols
    self.submodels = submodels

    self.name = "+".join([m.name for m in self.submodels])
    self.d = sum([m.d for m in self.submodels])

  def process_dataset(self, dataset, pooled=True):
    """
    Embed `dataset` with every submodel and concatenate along the last axis.

    Parameters:
      dataset: a Dataset object; plain columns are fetched with dataset.col().
      pooled: passed through to every submodel.

    Returns the concatenation of the submodel outputs. When pooled is False
    the outputs are first padded with pad_outputs so they can be concatenated.
    """
    embeds = []
    # process each column with the matching submodel
    for col, submodel in zip(self.dataset_cols, self.submodels):
      if self._PLAIN_COLUMN.match(col):
        # context-insensitive embeddings
        column = dataset.col(col)
        embeds.append(submodel.process_dataset(column, pooled=pooled))
      else:
        # context-sensitive embeddings
        # get_cs_embeddings processes whole datasets, so it returns a list with
        # one element here.
        # 'term' has to be passed whole: slicing the last three characters,
        # as is done for the 'l e'-style orders, would turn it into 'erm'.
        order = col if col == 'term' else col[-3:]
        embeds.append(submodel.get_cs_embeddings(dataset, order=order, pooled=pooled)[0])

    if pooled:
      return tf.concat(embeds, -1)

    # embeds is a list of shapes [(batch_size, pad_len0, d0), (batch_size, pad_len1, d1), ...]
    # so we need to pad all of the tensors to a uniform pad length
    padded_embeds = pad_outputs(embeds, min_size=None, d=1, batches=False)
    return tf.concat(padded_embeds, -1)

### Download static models

In [None]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip # wiki+gigaword5
# !wget http://nlp.stanford.edu/data/glove.twitter.27B.zip
# !wget http://nlp.stanford.edu/data/glove.42B.300d.zip # common crawl
# !wget http://nlp.stanford.edu/data/glove.840B.300d.zip # common crawl
# !wget https://github.com/basaldella/bioreddit/releases/download/v1.0/bioreddit.glove.100.txt
# !wget https://github.com/basaldella/bioreddit/releases/download/v1.0/bioreddit.glove.200.txt

In [None]:
# !unzip glove.6B.zip
# !rm glove.6B.zip
# !unzip glove.42B.300d
# !rm glove.42B.300d.zip

In [None]:
# Download the FastTexts

# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.vec.zip
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip
# !gdown https://drive.google.com/uc?id=1CTZEO9pvR3C8DbxJ7bt-y4t3TWGirQ_0

In [None]:
# !unzip wiki-news-300d-1M-subword.vec
# !rm wiki-news-300d-1M-subword.vec.zip
# !unzip crawl-300d-2M-subword
# !rm crawl-300d-2M-subword.bin
# !rm crawl-300d-2M-subword.zip

### Load static models

In [None]:
def load_static_models(models, model_type):
  """
  Build every static model of one kind whose embedding file is present.

  Parameters:
    models: the filenames of the embedding files to try.
    model_type: 'glove' or 'fasttext'; selects GloVeModel or FTModel.

  Returns a dictionary of models keyed by filename with its last four
  characters (the .txt or .vec) removed. Files that do not exist on disk are
  skipped without complaint.

  Raises ValueError for any other model_type.
  """
  if model_type != 'fasttext' and model_type != 'glove':
    raise ValueError("Only 'fasttext' or 'glove' please.")

  loaded = {}
  for path in models:
    if not os.path.exists(path):
      continue
    key = path[:-4]
    if model_type == 'glove':
      loaded[key] = GloVeModel(path)
    else:
      loaded[key] = FTModel(path)

  return loaded

In [None]:
# Candidate GloVe embedding files. load_static_models only loads the ones that
# exist on disk, so anything not downloaded is simply left out.
glove_models = ['glove.6B.300d.txt', 'glove.6B.200d.txt', 'glove.6B.100d.txt',
                'glove.42B.300d.txt',
                'glove.twitter.27B.200d.txt', 'glove.twitter.27B.100d.txt',
                'bioreddit.glove.200.txt', 'bioreddit.glove.100.txt']
# Candidate FastText embedding files (same rule: missing files are skipped).
fasttext_models = ['wiki-news-300d-1M-subword.vec', 'crawl-300d-2M-subword.vec',
                   'reddit-biomed.vec']

# Dictionaries keyed by filename minus its four-character extension.
loaded_glove_models = load_static_models(glove_models, 'glove')
loaded_ft_models = load_static_models(fasttext_models, 'fasttext')
# Fetches and parses the Node2Vec embeddings from Google Drive on construction.
snomed2vec = Node2VecModel()

### Linear experiments

In [None]:
class LinearExperiment(Experiment):
  """
  A class with which to run an experiment for a linear mapping between terms
  and labels.

  Parameters:

  in_model: a StaticModel or NeuralModel object that produces the
  inputs to the transformation (i.e. the model that embeds the terms)

  out_model: the same, but the model that embeds the concepts.

  datasets: a list of Dataset objects in the order train, test (any further
  entries are ignored). The linear mapping will be trained on the train dataset
  and evaluated against the test dataset.

  k_values: a list of the values of k to try.

  context_sensitive can be:
    '': neither the labels nor the terms are embedded context-sensitively
    'l e': label then example. Extract the label.
    'e l': example then label. Extract the label.
    'l t': label then term. Extract the label.
    't l': term then label. Extract the label.
    'term' (or '-term'): Extract the term from the example; the labels stay
      context-insensitive.
  You can combine a label order with '-term', e.g. 'l t-term', which gives you
  context-sensitive labels and terms: the first three characters are used as
  the label order and a trailing 'term' switches on context-sensitive terms.

  Raises ValueError if the two models have different dimensions.
  """
  def __init__(self, in_model, out_model, datasets, k_values, context_sensitive=''):
    if in_model.d != out_model.d:
      raise ValueError("The two models need the same dimensions I'm afraid.")
    self.in_model = in_model
    self.out_model = out_model

    # Node2Vec embeds SNOMED ids rather than labels. Node2VecModel names itself
    # 'n2v', so check the type as well as the legacy 'snomed2vec-200' name.
    self.use_ids = (isinstance(self.out_model, Node2VecModel)
                    or self.out_model.name == 'snomed2vec-200')
    self.datasets = datasets
    self.pooled = True # compatibility

    # _out_other holds whichever of ids/labels is NOT being embedded
    self.train_out = datasets[0].ids if self.use_ids else datasets[0].labels
    self.train_out_other = datasets[0].labels if self.use_ids else datasets[0].ids
    self.train_in = datasets[0].terms

    self.test_out = datasets[1].ids if self.use_ids else datasets[1].labels
    self.test_out_other = datasets[1].labels if self.use_ids else datasets[1].ids
    self.test_in = datasets[1].terms

    self.k_values = k_values
    self.cs = context_sensitive
    self.description = 'linear'

  def __generate_embeds(self):
    """
    Generate the embeddings for the train_out, train_in, test_in, test_out
    datasets, storing them as y_train, x_train, x_test and y_test, and set
    in_name / out_name to describe how they were produced.
    """
    cs_terms = self.cs[-4:] == 'term'
    # A term-only setting has no label order, so it must not be sliced into one.
    cs_labels = self.cs not in ('', 'term', '-term')

    if cs_labels:
      out_order = self.cs[0:3]
      self.out_name = self.out_model.name + '-cs_' + out_order
      self.y_train, self.y_test = self.out_model.get_cs_embeddings(
          self.datasets[0], self.datasets[1], order=out_order, pooled=True
      )
    else:
      self.out_name = self.out_model.name
      self.y_train, self.y_test = self.out_model.get_embeddings(
          self.train_out, self.test_out, pooled=True
      )

    if cs_terms:
      self.in_name = self.in_model.name + '-cs_term'
      # pooled=True is explicit: the mapping needs one vector per example.
      self.x_train, self.x_test = self.in_model.get_cs_embeddings(
          self.datasets[0], self.datasets[1], order='term', pooled=True
      )
    else:
      self.in_name = self.in_model.name
      self.x_train, self.x_test = self.in_model.get_embeddings(
          self.train_in, self.test_in, pooled=True
      )

  def learn_transformation(self, source, target):
    """
    Learn a svd transformation between source and target.
    `source` and `target` should both have the shape (n, d) where n is the number
    of training examples.
    From https://github.com/babylonhealth/fastText_multilingual/blob/master/align_your_own.ipynb
    see https://arxiv.org/pdf/1702.03859.pdf

    Returns the (d, d) matrix U @ V^T taken from the SVD of source^T @ target.
    """
    source = np.array(source)
    target = np.array(target)

    product = np.matmul(source.transpose(), target)
    U, Sigma, V_transpose = np.linalg.svd(product)

    return np.matmul(U, V_transpose)

  def create_mapping(self):
    """A wrapper to create the linear mapping."""
    self.__generate_embeds()
    self.transformation = self.learn_transformation(self.x_train, self.y_train)

  def __apply_transformation(self, embedding):
    """Apply the transformation to an embedding."""
    embedding = np.array(embedding)
    return np.matmul(embedding, self.transformation)

  def make_predictions(self):
    """A wrapper to get the model to make predictions (one array per test row)."""
    self.predictions = [self.__apply_transformation(embed) for embed in self.x_test]

  def metrics(self):
    """Return an empty string; kept for compatibility with other experiments."""
    # compatibility
    return ""

In [None]:
class LinearExperiments(Experiments):
  """
  A collection of linear mapping experiments that share the same datasets and
  the same values of k, one experiment per (in_model, out_model, cs_value)
  triple.

  Parameters:

  in_models: a list of StaticModel and NeuralModel objects. These
  produce the inputs to the linear mapping. Each in_model will be evaluated
  against each value of k.

  out_models: the same, but for the outputs.

  cs_values: a list of the context-sensitive values for each model.

  datasets: a list of two Dataset objects, in the order train, test.
  The same datasets will be used for each model--k_value pair.

  k_values: a list of values of k to try each model against.

  Raises ValueError unless in_models, out_models and cs_values all have the
  same length.
  """
  def __init__(self, in_models, out_models, cs_values, datasets, k_values):
    if len(in_models) != len(out_models) or len(out_models) != len(cs_values):
      raise ValueError("One input model for one output model for one cs_value, ta")
    self.in_models = in_models
    self.out_models = out_models
    self.datasets = datasets
    self.k_values = k_values
    self.cs_values = cs_values

    # column names for the results output
    self.header = [
        'in_model', 'd_in', 'out_model', 'd_out', 'k',
        'Correct (all)', 'n', 'Correct (ee)', 'n (ee)',
        'avg_edit (norm\'d)', "",
    ]

    # one experiment per position in the three parallel lists
    self.experiments = []
    for source_model, target_model, cs_value in zip(in_models, out_models, cs_values):
      experiment = LinearExperiment(source_model, target_model, datasets,
                                    k_values, context_sensitive=cs_value)
      self.experiments.append(experiment)

In [None]:
# Dataset splits in the order the experiment classes index them:
# [0] = train, [1] = test, [2] = val.
datasets = [train_d, test_d, val_d]
datasets_zsh = [train_d_zsh, test_d_zsh, val_d_zsh]
bert = BertModel(bert_tokenizer, bert_model, 'bert_large', 'lasths')

# One linear experiment: the same BERT model embeds both sides, with no
# context sensitivity (cs value ''), tried at k = 1 and k = 10.
in_models = [bert]
out_models = in_models
cs_values = ['']
exps = LinearExperiments(in_models, out_models, cs_values, datasets, [1,10])

In [None]:
# Run the linear experiments configured above.
# NOTE(review): zsh presumably toggles the zero-shot datasets - confirm against Experiments.run.
exps.run(zsh=False)

### Neural experiments

In [None]:
import IPython

class NeuralExperiment(Experiment):
  """
  A class with which to run an experiment for a neural mapping between terms
  and labels.

  Parameters:

  layers: the layers to use in the neural network architecture. It should
  be a list of keras layers. It should NOT include the input layer nor the final
  layer.

  arch_type: the type of architecture used (ffnn, CNN, RNN, etc).
    - FFNN: use 'ffnn'
    - CNN: use the format 'cnn-4' where 4 is the size of the convolution window.
    - LSTM: use 'lstm' (or 'bilstm' for a bidirectional LSTM)
    - GRU: use 'gru'

  in_model: a StaticModel or NeuralModel object that produces the
  inputs to the neural model (i.e. the model that embeds the terms)

  out_model: the same, but the model that embeds the concepts.

  datasets: a list of three Dataset objects in the order [train, test,
  val]. The neural mapping will be trained on the train dataset and evaluated
  against the test dataset, with the val dataset for tuning.

  k_values: a list of the values of k to try.

  context_sensitive can be:
    - '': neither the labels nor the terms are embedded context-sensitively
    - 'term' (or '-term'): Extract the term from the example; the labels stay
      context-insensitive.
    (Below, put a . between l and e for a separator)
    - 'l e': label then example. Extract the label.
    - 'e l': example then label. Extract the label.
    - 'l t': label then term. Extract the label.
    - 't l': term then label. Extract the label.
  You can combine a label order with '-term', e.g. 'l t-term', which gives you
  context-sensitive labels and terms: the first three characters are used as
  the label order and a trailing 'term' switches on context-sensitive terms.

  use_ids: whether the SNOMED id is used. If it is False, then the natural
  language description of the SNOMED concept is used.

  Raises ValueError if arch_type is not one of the supported formats.
  """
  def __init__(self, layers, arch_type, in_model, out_model, datasets, k_values,
               context_sensitive='', use_ids=False):
    self.layers = layers

    # Sequence architectures consume one vector per token (pooled=False);
    # the feedforward network consumes one averaged vector (pooled=True).
    if arch_type[0:3] == 'cnn' or arch_type[-4:] == 'lstm' or arch_type == 'gru':
      self.arch_type = arch_type
      self.pooled = False
      self.kernel_size = int(arch_type[4:]) if self.arch_type[0:3] == 'cnn' else None
    elif arch_type[0:4] == 'ffnn':
      self.arch_type = arch_type
      self.pooled = True
      self.kernel_size = None
    else:
      # Previously this fell through silently and failed much later with an
      # AttributeError on self.pooled / self.arch_type.
      raise ValueError("Unsupported arch_type: {!r}".format(arch_type))

    self.use_ids = use_ids

    self.in_model = in_model
    self.out_model = out_model
    self.d_in = self.in_model.d
    self.d_out = self.out_model.d

    self.mixedin = isinstance(self.in_model, MixedModel)
    self.mixedout = isinstance(self.out_model, MixedModel)

    if isinstance(self.out_model, Node2VecModel):
      # node2vec has to use the SNOMED ids
      self.use_ids = True

    self.datasets = datasets
    self.cs = context_sensitive
    self.k_values = k_values

    # _out_other is used for analysing the results
    self.train_out = datasets[0].ids if self.use_ids else datasets[0].labels
    self.train_out_other = datasets[0].labels if self.use_ids else datasets[0].ids
    self.train_in = datasets[0].terms

    self.test_out = datasets[1].ids if self.use_ids else datasets[1].labels
    self.test_out_other = datasets[1].labels if self.use_ids else datasets[1].ids
    self.test_in = datasets[1].terms

    self.val_out = datasets[2].ids if self.use_ids else datasets[2].labels
    self.val_in = datasets[2].terms
    self.val_out_other = datasets[2].labels if self.use_ids else datasets[2].ids

  def __generate_inembeds(self):
    """Create the input embeddings (x_train, x_test, x_val) and set in_name."""
    self.in_name = self.in_model.name
    if self.mixedin:
      # Mixed models pick their own columns out of the Dataset objects.
      self.x_train, self.x_test, self.x_val = self.in_model.get_embeddings(
          self.datasets[0], self.datasets[1], self.datasets[2], pooled=self.pooled)
    else:
      if self.cs[-4:] == 'term': # context-sensitive term embeddings
        self.in_name += '-cs_term'
        embs = self.in_model.get_cs_embeddings(*self.datasets, order='term', pooled=True)
      else: # context-insensitive term embeddings
        strings = [self.train_in, self.test_in, self.val_in]
        if not self.pooled:
          embs = self.in_model.get_embeddings(*strings, pooled=False, pad_size=8)
        else:
          embs = self.in_model.get_embeddings(*strings, pooled=True)

      self.x_train, self.x_test, self.x_val = embs[0], embs[1], embs[2]

  def __generate_outembeds(self):
    """Generate the output embeddings (y_train, y_test, y_val) and set out_name."""
    # TODO: Make this more elegant by passing use_ids to self.out_model.get_embeddings
    self.out_name = self.out_model.name
    if self.mixedout:
      self.y_train, self.y_test, self.y_val = self.out_model.get_embeddings(
          self.datasets[0], self.datasets[1], self.datasets[2])
    elif self.cs not in ['', 'term', '-term']:
      # A term-only setting carries no label order, so only slice one out of
      # cs when there is something other than the term marker in it.
      order = self.cs[0:3]
      self.y_train, self.y_test, self.y_val = self.out_model.get_cs_embeddings(
          self.datasets[0], self.datasets[1], self.datasets[2], order=order, pooled=True,
      )
      self.out_name += '-cs_' + order
    else:
      # Note the order here is train, val, test, on both sides of the call.
      strings = [self.train_out, self.val_out, self.test_out]
      self.y_train, self.y_val, self.y_test = self.out_model.get_embeddings(*strings, pooled=True)

  def create_mapping(self):
    """
    A wrapper to create the neural mapping: embed the data, build the network
    for arch_type, compile it and fit it for 50 epochs.

    Returns (model, history).
    Raises ValueError if no network can be built for arch_type.
    """
    # Create the embeddings
    self.__generate_inembeds()
    self.__generate_outembeds()

    self.train_dataset = tf.data.Dataset.from_tensor_slices((self.x_train, self.y_train))
    self.train_dataset = self.train_dataset.batch(64)
    self.val_dataset = tf.data.Dataset.from_tensor_slices((self.x_val, self.y_val))
    self.val_dataset = self.val_dataset.batch(64)

    # Create the model. The prefixes mirror the checks in __init__, so that
    # 'cnn-4' (the documented CNN format) actually reaches create_cnn.
    if self.arch_type[0:4] == 'ffnn':
      self.create_ffnn()
    elif self.arch_type[0:3] == 'cnn':
      self.create_cnn()
    elif self.arch_type == 'lstm':
      self.create_lstm()
    elif self.arch_type == 'bilstm':
      self.create_bilstm()
    elif self.arch_type == 'gru':
      self.create_gru()
    else:
      raise ValueError("No network can be built for arch_type {!r}".format(self.arch_type))

    # Describe the model
    self.optimizer = 'adam-0.001-50'
    self.description = self.optimizer+'-'+self.in_model.name
    if self.pooled:
      self.description += '-avg-'
    else:
      self.description += '-'

    self.description += '-'.join([layer.name for layer in self.model.layers])
    self.description += '-'+self.out_model.name

    # Compile the model
    self.model.compile(optimizer='adam', # adam works better
                       loss='mse',
                       metrics=['cosine_similarity']) # cos-similarity between y and ŷ

    print(self.description)
    # summary() prints the table itself and returns None, so don't print() it
    self.model.summary()

    # The datasets are already batched above, so batch_size is not passed.
    self.history = self.model.fit(self.train_dataset, verbose=0,
                                  epochs=50, validation_data=self.val_dataset)

    return self.model, self.history

  def create_ffnn(self):
    """Build a feedforward network: input, self.layers, then a dense output layer."""
    self.model = keras.Sequential([
      keras.Input(shape=(self.d_in,)),
      *self.layers,
      keras.layers.Dense(self.d_out, activation='relu', name='dense_out-'+str(self.d_out)),
    ])

  def create_lstm(self):
    """Build an LSTM network with d_out units followed by batch normalisation."""
    in_size = self.x_train.shape[1]

    self.model = keras.Sequential([
      keras.Input(shape=(in_size, self.d_in,)),
      keras.layers.LSTM(self.d_out, name='lstm-'+str(self.d_out)),
      keras.layers.BatchNormalization(name='batchnorm')
    ])

  def create_bilstm(self):
    """Build a bidirectional LSTM network with a dense output layer."""
    in_size = self.x_train.shape[1]

    self.model = keras.Sequential([
      keras.Input(shape=(in_size, self.d_in,)),
      keras.layers.Bidirectional(
        keras.layers.LSTM(self.d_out, name='bilstm-'+str(2*self.d_out)),
        merge_mode='concat',
      ),
      keras.layers.BatchNormalization(name='batchnorm'),
      keras.layers.Dense(self.d_out, activation='relu', name='dense_out'+str(self.d_out)),
    ])

  def create_gru(self):
    """Build a GRU network with d_out units."""
    in_size = self.x_train.shape[1]

    self.model = keras.Sequential([
      keras.Input(shape=(in_size, self.d_in,)),
      keras.layers.GRU(self.d_out, name='gru-'+str(self.d_out)),
    ])

  def create_cnn(self):
    """Build a CNN model.

    The input (self.x_train etc) will be tensors of size (batch, in_size, d_in)
    The output (self.y_train etc) will be of the size (batch, d_out).
    Outputs can NEVER be of rank 3.
    """
    in_size = self.x_train.shape[1]

    # NOTE(review): these two attributes are not used when building the model
    # below (which adds its own Conv1D and pooling layers and then ALL of
    # self.layers); they are kept in case other code reads them, and no longer
    # fail when fewer than two layers are given.
    self.conv_layer = self.layers[0] if len(self.layers) > 0 else None
    self.pool_layer = self.layers[1] if len(self.layers) > 1 else None

    self.model = keras.Sequential([
      keras.Input(shape=(in_size, self.d_in,)),
      keras.layers.Conv1D(128, self.kernel_size, activation='relu',
                          input_shape=self.x_train.shape[1:], name='conv1d-128-'+str(self.kernel_size)),
      keras.layers.GlobalMaxPool1D(name='globalmaxpool1d'), # pool across axis -2
      *self.layers,
      keras.layers.Dense(self.d_out, activation='relu', name='dense_out-'+str(self.d_out)),
    ])

  def make_predictions(self):
    """A wrapper to get the model to make predictions."""
    self.predictions = self.model.predict(self.x_test)

  def metrics(self):
    """
    Return a list of metrics, as strings:
    - the cosine similarity metric, evaluated on the test set.
    - the number of params for each model"""
    metrics = [str(self.model.evaluate(self.x_test, self.y_test, return_dict=True)['cosine_similarity']),
               str(self.model.count_params())
    ]
    return metrics

  def plot(self, save=True):
    """Plot a pretty graph of the learning curve, saving it if needed."""
    fname = './drive/MyDrive/Dissertation/'+ self.description + '.png'
    print(self.description)

    plt.plot(self.history.history['cosine_similarity'])
    plt.plot(self.history.history['val_cosine_similarity'])
    plt.ylim(ymin=0)
    plt.ylabel('Cosine Similarity')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='lower right')
    if save:
      plt.savefig(fname, dpi=300)

    plt.show()

  def tune_ffnn(self, nlayers=3):
    """
    Tune a feedforward neural model, with nlayers hidden layers, using
    Hyperband over the layer widths and the learning rate.

    Returns a list of the best width for each hidden layer, followed by the
    best learning rate.
    """
    self.__generate_inembeds()
    self.__generate_outembeds()

    train_dataset = tf.data.Dataset.from_tensor_slices((self.x_train, self.y_train))
    train_dataset = train_dataset.batch(64)
    val_dataset = tf.data.Dataset.from_tensor_slices((self.x_val, self.y_val))
    val_dataset = val_dataset.batch(64)

    def model_builder(hp):
      # One tunable width per hidden layer.
      layers = []
      for i in range(0,nlayers):
        hp_layer = hp.Int('units_'+str(i), min_value = 100, max_value = 3000, step = 50)
        layers.append(keras.layers.Dense(hp_layer, activation='relu', name='dense_'+str(i)))

      model = keras.Sequential([
        keras.Input(shape=(self.d_in,)),
        *layers,
        keras.layers.Dense(self.d_out, activation='relu', name='dense_out-'+str(self.d_out)),
      ])

      hp_learning_rate = hp.Choice('learning_rate', values = [1e-2, 1e-3, 1e-4])

      model.compile(optimizer = keras.optimizers.Adam(learning_rate = hp_learning_rate),
                    loss = keras.losses.MeanSquaredError(),
                    metrics = ['cosine_similarity'])
      return model

    pname = self.in_model.name+'-'+str(nlayers)+'-hidden'
    tuner = kt.Hyperband(
      model_builder,
      objective = kt.Objective('val_cosine_similarity', direction="max"),
      max_epochs = 10,
      directory = './'+pname,
      project_name = pname
    )

    class ClearTrainingOutput(tf.keras.callbacks.Callback):
      # Clears the notebook output after each trial finishes training.
      def on_train_end(*args, **kwargs):
        IPython.display.clear_output(wait = True)

    tuner.search(train_dataset, epochs=10, validation_data=val_dataset, callbacks=[ClearTrainingOutput()])

    best_hps = tuner.get_best_hyperparameters(num_trials = 1)[0]
    optimal = []
    for i in range(0, nlayers):
      optimal.append(best_hps.get('units_'+str(i)))
      print("Layer "+str(i)+": "+str(best_hps.get('units_'+str(i))))
    optimal.append(best_hps.get('learning_rate'))
    print("Learning rate: "+str(best_hps.get('learning_rate')))

    return optimal

In [None]:
class NeuralExperiments(Experiments):
  """
  A class to represent several neural mapping experiments run with several
  different in_models and several different values of k. They should all use the
  same datasets.

  One NeuralExperiment is built per position i, combining architectures[i],
  arch_types[i], in_models[i], out_models[i] and cs_values[i]. Every experiment
  receives the same datasets and the full list of k values.

  Parameters:

  architectures: a list of the architectures to try. Each architecture
  should be a list of keras layers. They should NOT include the input layer or
  the final layer. Needs at least one entry per input model.

  arch_types: a list of architecture types (MLP, CNN, RNN etc). It
  should be a list of the same length as `architectures`.

  in_models: a list of StaticModel or NeuralModel objects. These produce the
  inputs to the experiments. Each in_model will be evaluated against each value of k.

  out_models: the same, but for the output. Must be the same length as
  `in_models`.

  cs_values: a list of values of cs (context_sensitive) for each model. Needs
  at least one entry per input model.

  datasets: a list of three Dataset objects, in the order train, test, val.
  The same datasets will be used for each model--k_value pair.

  k_values: a list of values of k to try each model against.

  Raises:

  ValueError: if `in_models` and `out_models` differ in length, if
  `architectures` and `arch_types` differ in length, or if `architectures` or
  `cs_values` has fewer entries than `in_models`.
  """
  def __init__(self, architectures, arch_types, in_models, out_models, cs_values, datasets, k_values):
    if len(in_models) != len(out_models):
      raise ValueError("Please have the same number of input models as output models.")
    if len(architectures) != len(arch_types):
      raise ValueError("Please match every architecture with an architecture type, tedious though it is.")

    # Experiments are assembled position by position, so every per-experiment
    # list must reach at least as far as in_models. Checking up front gives a
    # clear error instead of an IndexError after some experiments have
    # already been constructed. Surplus entries are tolerated and ignored.
    n_experiments = len(in_models)
    for label, values in (('architectures', architectures), ('cs_values', cs_values)):
      if len(values) < n_experiments:
        raise ValueError(
            "`{}` has {} entries but {} are needed (one per input model).".format(
                label, len(values), n_experiments))

    self.in_models = in_models
    self.out_models = out_models
    self.architectures = architectures
    self.arch_types = arch_types
    self.cs_values = cs_values
    self.datasets = datasets
    self.k_values = k_values

    # header to use with the results dataset
    self.header = ['in_model', 'd_in', 'out_model', 'd_out', 'k', 'Correct (all)',
                   'n', 'Correct (ee)', 'n (ee)', "avg_edit (norm'd)", 'cos_sim',
                   'params', 'description']

    # zip stops after len(in_models) items because the checks above guarantee
    # that no other list is shorter.
    self.experiments = [
        NeuralExperiment(arch, arch_type, in_model, out_model,
                         datasets, self.k_values, cs_val)
        for arch, arch_type, in_model, out_model, cs_val
        in zip(architectures, arch_types, in_models, out_models, cs_values)
    ]

  def plot(self, save=True):
    """Call plot() on every experiment, forwarding `save` unchanged."""
    for experiment in self.experiments:
      experiment.plot(save=save)

In [None]:
# Instantiate the language models used as experiment inputs and outputs.

# Take the already-loaded 'reddit-biomed' fastText model and give it a short name.
fasttext = loaded_ft_models['reddit-biomed']
fasttext.name = 'ft_br'
bert = BertModel(bert_tokenizer, bert_model, 'bert_br', 'lasths')

# Output-side combinations. Each MixedModel receives a list of column names and
# a list of models of the same length (presumably paired by position -- confirm
# against MixedModel).
bertftnode2vec_out = MixedModel(
    ['Specific SNOMED ID', 'Specific SNOMED Label', 'Specific SNOMED Label'],
    [snomed2vec, fasttext, bert],
)
ftnode2vec_out = MixedModel(
    ['Specific SNOMED ID', 'Specific SNOMED Label'],
    [snomed2vec, fasttext],
)
bertnode2vec_out = MixedModel(
    ['Specific SNOMED ID', 'Specific SNOMED Label'],
    [snomed2vec, bert],
)

# fastText + BERT over the same text column: 'Term' on the input side,
# 'Specific SNOMED Label' on the output side.
bertft_in = MixedModel(['Term', 'Term'], [fasttext, bert])
bertft_out = MixedModel(
    ['Specific SNOMED Label', 'Specific SNOMED Label'],
    [fasttext, bert],
)

# Variants whose BERT half is keyed by 'l t' / 'l.t' instead of a column name
# (presumably the context-sensitive settings, going by the _cs suffix -- confirm).
bertft_out_cs1 = MixedModel(['Specific SNOMED Label', 'l t'], [fasttext, bert])
bertft_out_cs2 = MixedModel(['Specific SNOMED Label', 'l.t'], [fasttext, bert])

In [None]:
# Keras layer objects own their weights, and list multiplication copies only
# references: [[arch]] * 3 would hand the *same* layer instances to all three
# experiments. Each experiment therefore needs freshly constructed layers,
# which build_ffnn_arch() provides.
def build_ffnn_arch(n_hidden=3, units=500):
  """Return a new list of `n_hidden` ReLU Dense layers, each `units` wide.

  Layers are named 'dense_<index>-<units>', e.g. 'dense_0-500'.
  """
  return [
      keras.layers.Dense(units, activation='relu',
                         name='dense_{}-{}'.format(i, units))
      for i in range(n_hidden)
  ]

in_models = [bert]
out_models = [bert]
arch_types = ['ffnn'] * len(in_models)
datasets = [train_d, test_d, val_d]
datasets_zsh = [train_d_zsh, test_d_zsh, val_d_zsh]
k_values = [1, 10]
cs_values = ['']

# The two experiment sets get separate layer instances, so that weights learned
# in one set cannot carry over into the other.
# NOTE(review): this assumes NeuralExperiment uses the layer objects it is
# given directly rather than cloning them -- confirm.
archs = [build_ffnn_arch() for _ in in_models]
archs_zsh = [build_ffnn_arch() for _ in in_models]

exps_strat = NeuralExperiments(archs, arch_types, in_models, out_models, cs_values, datasets, k_values)
exps_zsh = NeuralExperiments(archs_zsh, arch_types, in_models, out_models, cs_values, datasets_zsh, k_values)

In [None]:
# Run the experiments built on `datasets` (train_d / test_d / val_d).
# zsh=False presumably marks this as the non-zero-shot (stratified) split --
# confirm against the run() implementation.
exps_strat.run(zsh=False)

In [None]:
# Run the experiments built on `datasets_zsh` (train_d_zsh / test_d_zsh / val_d_zsh).
# zsh=True presumably selects zero-shot handling -- confirm against the run()
# implementation.
exps_zsh.run(zsh=True)

In [None]:
# Plot every experiment in exps_zsh; save=False is forwarded to each
# experiment's own plot() call.
exps_zsh.plot(save=False)