# Exploring Methods of Metaphor Detection Using Similarity

*Final Project for COMS W4995: Semantic Representations for NLP*

By Corina Hanaburgh, Tiara Sykes, Raefah Wahid

# Imports

In [1]:
# Install the BERT embedding wrapper and the mxnet-cu100 build of MXNet.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases than the ones in the saved log.
!pip install bert-embedding mxnet-cu100

Collecting bert-embedding
  Downloading bert_embedding-1.0.1-py3-none-any.whl (13 kB)
Collecting mxnet-cu100
  Downloading mxnet_cu100-1.8.0.post0-py2.py3-none-manylinux2014_x86_64.whl (352.6 MB)
[K     |████████████████████████████████| 352.6 MB 12 kB/s 
[?25hCollecting gluonnlp==0.6.0
  Downloading gluonnlp-0.6.0.tar.gz (209 kB)
[K     |████████████████████████████████| 209 kB 53.7 MB/s 
[?25hCollecting mxnet==1.4.0
  Downloading mxnet-1.4.0-py2.py3-none-manylinux1_x86_64.whl (29.6 MB)
[K     |████████████████████████████████| 29.6 MB 36 kB/s 
[?25hCollecting numpy==1.14.6
  Downloading numpy-1.14.6-cp37-cp37m-manylinux1_x86_64.whl (13.8 MB)
[K     |████████████████████████████████| 13.8 MB 131 kB/s 
[?25hCollecting typing==3.6.6
  Downloading typing-3.6.6-py3-none-any.whl (25 kB)
Collecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Collecting mxnet-cu100
  Downloading mxnet_cu100-1.8.0-py2.py3-none-manylinux2014_x86_64.whl (344.4 MB)
[

In [2]:
# NOTE(review): in the saved run this cell fails ("Cannot open: No such file or directory") because the
# archive is only downloaded by the wget cell that follows; the extraction is repeated after the download.
!tar -xzvf eng-com_web-public_2018_10K.tar.gz

tar (child): eng-com_web-public_2018_10K.tar.gz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now


In [3]:
# Download the eng-com_web-public_2018_10K corpus archive from the uni-leipzig.de server into the working directory.
!wget http://pcai056.informatik.uni-leipzig.de/downloads/corpora/eng-com_web-public_2018_10K.tar.gz

--2021-07-25 23:00:45--  http://pcai056.informatik.uni-leipzig.de/downloads/corpora/eng-com_web-public_2018_10K.tar.gz
Resolving pcai056.informatik.uni-leipzig.de (pcai056.informatik.uni-leipzig.de)... 139.18.2.216
Connecting to pcai056.informatik.uni-leipzig.de (pcai056.informatik.uni-leipzig.de)|139.18.2.216|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2310663 (2.2M) [application/x-gzip]
Saving to: ‘eng-com_web-public_2018_10K.tar.gz’


2021-07-25 23:00:46 (2.11 MB/s) - ‘eng-com_web-public_2018_10K.tar.gz’ saved [2310663/2310663]



In [4]:
# Unpack the downloaded archive into ./eng-com_web-public_2018_10K/ (the *-sentences.txt file is the one used below).
!tar -xzvf eng-com_web-public_2018_10K.tar.gz

eng-com_web-public_2018_10K/
eng-com_web-public_2018_10K/eng-com_web-public_2018_10K-sentences.txt
eng-com_web-public_2018_10K/eng-com_web-public_2018_10K-inv_so.txt
eng-com_web-public_2018_10K/eng-com_web-public_2018_10K-import.sql
eng-com_web-public_2018_10K/eng-com_web-public_2018_10K-co_s.txt
eng-com_web-public_2018_10K/eng-com_web-public_2018_10K-co_n.txt
eng-com_web-public_2018_10K/eng-com_web-public_2018_10K-sources.txt
eng-com_web-public_2018_10K/eng-com_web-public_2018_10K-inv_w.txt
eng-com_web-public_2018_10K/eng-com_web-public_2018_10K-words.txt


In [5]:
import mxnet as mx
import numpy as np
import copy
import nltk
import pandas as pd
import string
import csv
from scipy.spatial.distance import cosine
from bert_embedding import BertEmbedding
from tqdm.auto import tqdm, trange
from sklearn.neighbors import KDTree
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from string import punctuation
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer data (for nltk.word_tokenize),
# the 'stopwords' corpus and the 'wordnet' corpus.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Generate Word2Vec Model

In [6]:
def preprocess_text(filename, separator):
  """Read the first column of a delimited text file and strip punctuation from it.

  The file is parsed with pandas.read_csv, so its first line is consumed as the
  header; in addition the first data row is skipped. Every remaining cell of the
  first column has all characters in string.punctuation removed.

  Args:
    filename: path of the text file to read.
    separator: field separator handed to pandas.read_csv.

  Returns:
    A list of punctuation-free strings, one per kept row.
  """
  punctuation_chars = set(string.punctuation)
  first_column = pd.read_csv(filename, sep=separator).iloc[:, [0]].values

  def strip_punctuation(text):
    return ''.join(filter(lambda ch: ch not in punctuation_chars, text))

  # first_column is 2-D (one single-cell row per line), hence the nested iteration.
  return [strip_punctuation(cell) for row in first_column[1:] for cell in row]

In [7]:
def generate_corpus(preprocessed_text):
  """Tokenize each sentence and drop English stop words.

  Args:
    preprocessed_text: iterable of sentence strings.

  Returns:
    A list with one token list per input sentence, in the same order. Stop-word
    filtering is case-sensitive (tokens are compared as-is against NLTK's
    English stop-word list).
  """
  # Build the stop-word set once; it does not depend on the sentence.
  stop_words = set(stopwords.words('english'))
  return [
      [token for token in nltk.word_tokenize(sentence) if token not in stop_words]
      for sentence in preprocessed_text
  ]

In [8]:
def generate_wordvec_model(text, separator):
  """Train a CBOW Word2Vec model on the sentences stored in a text file.

  Args:
    text: path of the corpus file (passed on to preprocess_text).
    separator: field separator used when reading the file.

  Returns:
    The trained Word2Vec model (768-dimensional vectors, every word kept
    because min_count=1).
  """
  tokenized_sentences = generate_corpus(preprocess_text(text, separator))
  # sg=0 selects CBOW.
  return Word2Vec(tokenized_sentences, min_count=1, size=768, sg=0)

In [9]:
# Train Word2Vec on the downloaded web corpus.
# NOTE(review): '\n' is passed as the separator, presumably so that pandas keeps each whole line as one field — confirm.
book_wordvec_model = generate_wordvec_model('eng-com_web-public_2018_10K/eng-com_web-public_2018_10K-sentences.txt', '\n')

In [10]:
# Train Word2Vec on the speech corpus; the file is expected at /content/obama_speeches.txt and is read tab-separated.
obama_wordvec_model = generate_wordvec_model('/content/obama_speeches.txt', '\t')

Note: `obama_speeches.txt` is a corpus we compiled ourselves; it is not downloaded by this notebook and must be uploaded to the runtime before running the cells that read it.

# Generate BERT Model

In [11]:
def process_corpus(filename):
  """Read a corpus file and return its sentences without line-number prefixes.

  Lines of the form "<digits><TAB><sentence>" lose only that id prefix, so a
  sentence that itself begins with a digit, '.' or '-' is kept intact. Any
  other line falls back to stripping every leading digit, '.', '-' and tab
  character. Trailing newlines are removed in both cases.

  Args:
    filename: path of the corpus file, one sentence per line.

  Returns:
    A list of sentence strings, one per line of the file.
  """
  with open(filename, 'r') as f:
    raw_lines = f.readlines()

  sentences = []
  for line in raw_lines:
    line_id, tab, remainder = line.partition('\t')
    if tab and line_id.isdigit():
      # "id<TAB>sentence": drop just the id so leading digits of the sentence survive.
      text = remainder
    else:
      text = line.lstrip('0123456789.-\t')
    sentences.append(text.rstrip('\n'))

  return sentences

In [12]:
def create_model(corpus, ctx=None):
  """Build a BERT embedder and a nearest-neighbour index over `corpus`.

  Requires ContextNeighborStorage to be defined before this function is called.

  Args:
    corpus: list of sentence strings to embed and index.
    ctx: optional MXNet context to run BERT on. When omitted, the first GPU is
      used if MXNet reports one, otherwise the CPU.

  Returns:
    (bert, storage): the BertEmbedding instance and the ContextNeighborStorage
    whose sentences have been embedded and whose search index has been built.
  """
  if ctx is None:
    # Fall back to the CPU instead of failing on runtimes without a GPU.
    ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()
  bert = BertEmbedding(ctx=ctx, max_seq_length=75)

  storage = ContextNeighborStorage(sentences=corpus, model=bert)
  storage.process_sentences()
  storage.build_search_index()

  return bert, storage


In [13]:
# Load the web-corpus sentences (one per line) for the BERT-based methods.
sentences = process_corpus('eng-com_web-public_2018_10K/eng-com_web-public_2018_10K-sentences.txt')

In [16]:
# Load the speech-corpus sentences.
# NOTE(review): this uses a relative path, whereas the Word2Vec cell reads '/content/obama_speeches.txt' — the working directory must be /content for both to hit the same file.
obama_sentences = process_corpus('obama_speeches.txt')

In [19]:
# Embed the speech corpus with BERT and build its nearest-neighbour index.
# NOTE(review): create_model uses ContextNeighborStorage, which is defined further down in this notebook
# ("K Nearest Neighbor Algorithm" section); that cell has to be run before this one.
obama_model, obama_storage = create_model(obama_sentences)

HBox(children=(FloatProgress(value=0.0, max=2669.0), HTML(value='')))




In [20]:
# Embed the web corpus with BERT and build its nearest-neighbour index.
# NOTE(review): create_model uses ContextNeighborStorage, which is defined further down in this notebook
# ("K Nearest Neighbor Algorithm" section); that cell has to be run before this one.
bert_model, storage = create_model(sentences)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




# Lesk Algorithm

In [21]:
def get_candidates(target):
  """Collect candidate replacement lemmas for `target` from WordNet.

  Args:
    target: the word to look up.

  Returns:
    A set containing every lemma of every synset of `target`, plus every lemma
    of each of those synsets' direct hypernyms. The set is empty when WordNet
    has no synset for `target`.
  """
  candidates = set()
  for sense in wordnet.synsets(target):
    candidates.update(sense.lemmas())
    for parent in sense.hypernyms():
      candidates.update(parent.lemmas())
  return candidates

In [22]:
def get_context(query, target):
  """Return the context words around `target` in `query` and the target's position.

  The query is split on single spaces. The position is that of the last token
  that equals `target` once "'s", commas and surrounding punctuation are
  ignored; if no token matches exactly, the last token that merely contains
  `target` is used, and 0 if nothing contains it.

  Args:
    query: the sentence, with tokens separated by single spaces.
    target: the word whose context is wanted.

  Returns:
    (context, index): `context` is the set of all other tokens, with "'s" and
    commas removed, minus NLTK's English stop words (compared case-sensitively);
    `index` is the token position of the target in `query.split(" ")`.
  """
  stop_words = set(stopwords.words('english'))

  tokens = query.split(" ")
  # str.replace returns a new string; the result has to be kept for the cleaning to take effect.
  # Neither replacement touches spaces, so cleaned_tokens lines up with tokens position by position.
  cleaned_tokens = query.replace("'s", "").replace(",", "").split(" ")

  index = 0
  exact_index = None
  for i, word in enumerate(tokens):
    if target in word:
      index = i
      if cleaned_tokens[i].strip(string.punctuation) == target:
        exact_index = i
  if exact_index is not None:
    # Prefer a whole-word hit over a substring hit inside another word (e.g. "is" in "this").
    index = exact_index

  context = set(cleaned_tokens[:index]) | set(cleaned_tokens[index + 1:])
  context -= stop_words

  return context, index

In [23]:
def most_frequent_lemma(most_freq_synset, context_lemma):
    """Return the name of the most frequent lemma of a synset, ignoring `context_lemma`.

    Args:
        most_freq_synset: synset whose lemmas are inspected.
        context_lemma: lemma name to exclude (the word being replaced).

    Returns:
        The lemma name with the highest summed count. When several names share
        the highest count, the one encountered last wins. Returns "" when the
        synset has no lemma other than `context_lemma`.
    """
    totals = {}  # lemma name -> accumulated count
    for lemma in most_freq_synset.lemmas():
        name = lemma.name()
        occurrences = lemma.count()
        if name == context_lemma:
            continue
        totals[name] = totals.get(name, 0) + occurrences

    if not totals:
        return ""

    # Scanning the names in reverse insertion order makes max() settle ties on the
    # name that was inserted last.
    return max(reversed(list(totals)), key=totals.get)

In [24]:
def most_frequent_synset(all_synsets, context_lemma):
    """Return the synset in which `context_lemma` occurs most frequently.

    Args:
        all_synsets: iterable of synsets to choose from.
        context_lemma: lemma name to look for in each synset.

    Returns:
        The synset whose lemma named `context_lemma` has the highest count; on
        ties the synset encountered last wins. Returns None when no synset
        contains `context_lemma` (including when `all_synsets` is empty).
    """
    target_freq = {}  # synset -> count of the lemma named context_lemma
    for synset in all_synsets:
        for lemma in synset.lemmas():
            if lemma.name() == context_lemma:
                target_freq[synset] = lemma.count()

    if not target_freq:
        # Nothing matched: max() on an empty sequence would raise ValueError,
        # whereas callers check for a missing result.
        return None

    maximum = max(target_freq.values())
    freq_synset = None
    for synset, frequency in target_freq.items():
        if frequency == maximum:
            freq_synset = synset  # keep overwriting so the last tied synset wins
    return freq_synset

In [25]:
def lesk_algorithm(query, target):
  """Choose a WordNet-based substitute for `target` in `query` using a Lesk-style gloss overlap.

  Candidate lemmas come from get_candidates(target). For each candidate, the
  words of its synset's definition and examples (and those of the synset's
  direct hypernyms) are compared with the context words returned by
  get_context(query, target). Ties and zero-overlap cases fall back to WordNet
  lemma counts via most_frequent_synset / most_frequent_lemma.

  Args:
    query: sentence containing the target; it is split on single spaces.
    target: the word to replace.

  Returns:
    (chosen, replaced_sentence, index): `chosen` is the replacement text with
    underscores turned into spaces ("" when no usable lemma was collected),
    `replaced_sentence` is `query` with the token at `index` swapped for
    `chosen`, and `index` is the token position reported by get_context.
  """
  # NOTE(review): best_prediction, result and max_overlap are assigned here but never read again in this function.
  best_prediction = None #best sense
  result = None
  max_overlap = 0
  exclude = set(string.punctuation)

  matching_lemma = [] # replacement lemma names, one appended per processed candidate (order follows the candidate set)

  sentences = set() # gloss/example words; created once, outside the candidate loop
  best_fit = dict() # synset -> overlap size between `sentences` and the context

  candidates = get_candidates(target)
  con, index = get_context(query, target)

  for word in candidates: # `word` is a lemma object here (it exposes .synset() and .name())
    synset = word.synset()
    syn_def = synset.definition()
    s = ''.join(ch for ch in syn_def if ch not in exclude)
    sentences |= set(s.split(" "))
    for ex in synset.examples():
      example = ''.join(ch for ch in ex if ch not in exclude)
      sentences |= set(example.split(" "))
    for hyper in synset.hypernyms():
      definition = hyper.definition()
      hyper_def = ''.join(ch for ch in definition if ch not in exclude)
      sentences |= set(hyper_def.split(" "))
      for ex in hyper.examples():
        hyper_ex = ''.join(ch for ch in ex if ch not in exclude)
        sentences |= set(hyper_ex.split(" "))
    
    # NOTE(review): `sentences` is never reset between candidates, so it keeps the words gathered for all
    # earlier candidates and the overlap below is cumulative rather than per synset — confirm this is intended.
    # The `word` inside the generator expression is local to it and does not rebind the loop variable.
    sentences = set(word.lower() for word in sentences)
 
    overlap = len(sentences & con) #check for overlap in the context and definition
    best_fit[synset] = overlap

    # Everything from here to the end of the loop body runs once per candidate, using the overlaps recorded so far.
    max_overlaps = [] # finding the synsets with the highest overlaps (there may be only one, but some may be tied)
    overlapped_synset = None # to store the synset with the most overlap (NOTE(review): never used afterwards)
    all_synsets = list(best_fit.keys()) # getting a list of all the synsets from the keys of the dictionary
    maximum = max(best_fit.values()) # getting the maximum value (highest overlap) from the dictionary

    if maximum > 0: # if there is overlap
      for key in best_fit: # iterate through the synsets in the dictionary to find the synset that matches with the highest value
        if best_fit[key] == maximum:
          max_overlaps.append(key) # append the corresponding synset to the list (there may be multiple with the same value)
    else: # if there is no overlap
        freq_synset = most_frequent_synset(all_synsets, word.name()) # find the synset in which the current candidate lemma is most frequent
        matching_lemma.append(most_frequent_lemma(freq_synset, word.name())) # find the most frequent other lemma of that synset

    if len(max_overlaps) == 1: # if there is only one synset with the best overlap
        matching_lemma.append(most_frequent_lemma(max_overlaps[0], word.name())) # find the most frequent lemma in that synset
    elif len(max_overlaps) > 1: # if there are multiple synsets with the best overlap
        freq_synset = most_frequent_synset(max_overlaps, word.name()) # find the synset in which the current candidate lemma is most frequent, among the most overlapping synsets
        if freq_synset:
          
          matching_lemma.append(most_frequent_lemma(freq_synset, word.name())) # then find the most frequent lemma from the most frequent synset
        else:
          # none of the tied synsets yielded a result: fall back to the current candidate's own synset
          matching_lemma.append(most_frequent_lemma(synset, word.name()))
 
  # Take the first collected lemma that is non-empty and differs from the target (case-insensitively).
  chosen = ""
  for word in matching_lemma:
    
    if word.lower() != target.lower() and word != "":
      chosen = word.replace("_", " ")
      break
  
  
  # Swap the token at `index` for the replacement; if `chosen` is still "", that token becomes an empty string.
  query_list = query.split(" ")
  replaced_sentence = " ".join(query_list[:index] + [chosen] + query_list[index+1:])
  
  return chosen, replaced_sentence, index

# K Nearest Neighbor Algorithm

In [26]:
class ContextNeighborStorage:
    """Nearest-neighbour lookup over the contextual token embeddings of a corpus.

    Typical use: construct, call process_sentences(), then build_search_index(),
    then query().
    """

    def __init__(self, sentences, model):
        """Store the corpus and the embedding model.

        Args:
            sentences: list of sentence strings.
            model: callable that maps a list of sentences to a sequence of
                (tokens, embeddings) pairs, one pair per sentence.
        """
        self.sentences = sentences
        self.model = model

    def process_sentences(self):
        """Embed every sentence and record, for each token, where it came from.

        Sets sentence_ids, token_ids and all_tokens (parallel lists, one entry
        per token) and normed_embeddings (one unit-length row per token).
        """
        result = self.model(self.sentences)

        self.sentence_ids = []
        self.token_ids = []
        self.all_tokens = []
        all_embeddings = []
        for i, (toks, embs) in enumerate(tqdm(result)):
            for j, (tok, emb) in enumerate(zip(toks, embs)):
                self.sentence_ids.append(i)
                self.token_ids.append(j)
                self.all_tokens.append(tok)
                all_embeddings.append(emb)
        all_embeddings = np.stack(all_embeddings)
        # Rows are scaled to unit length so that ranking by Euclidean distance
        # matches ranking by cosine distance.
        norms = (all_embeddings ** 2).sum(axis=1, keepdims=True) ** 0.5
        self.normed_embeddings = all_embeddings / norms

    def build_search_index(self):
        """Build the KD-tree over normed_embeddings (this takes some time)."""
        self.indexer = KDTree(self.normed_embeddings)

    def query(self, query_sent, query_word, k=10, filter_same_word=False):
        """Find the corpus tokens whose embeddings are closest to `query_word` in `query_sent`.

        Args:
            query_sent: sentence that provides the context of the query word.
            query_word: token to look up; it must equal one of the model's
                tokens for `query_sent` (the first such token is used).
            k: maximum number of neighbours to return.
            filter_same_word: when True, skip neighbours that contain the query
                word or are contained in it.

        Returns:
            (distances, neighbors, contexts): parallel lists holding the
            distance, the neighbouring token and the corpus sentence it came
            from. Fewer than `k` entries are returned when filtering removes
            too many candidates or the index holds fewer than `k` tokens.

        Raises:
            ValueError: if `query_word` is not a token of `query_sent`.
        """
        toks, embs = self.model([query_sent])[0]

        for tok, emb in zip(toks, embs):
            if tok == query_word:
                break
        else:
            raise ValueError('The query word {} is not a single token in sentence {}'.format(query_word, toks))
        emb = emb / (emb ** 2).sum() ** 0.5

        # Over-fetch when filtering so that k results are likely to survive.
        initial_k = max(k, 100) if filter_same_word else k
        # The tree cannot return more neighbours than it holds points; asking for
        # more raises an error on small corpora.
        initial_k = min(initial_k, len(self.all_tokens))

        di, idx = self.indexer.query(emb.reshape(1, -1), k=initial_k)
        flat_distances = di.ravel()
        distances = []
        neighbors = []
        contexts = []
        for i, index in enumerate(idx.ravel()):
            token = self.all_tokens[index]
            if filter_same_word and (query_word in token or token in query_word):
                continue
            distances.append(flat_distances[i])
            neighbors.append(token)
            contexts.append(self.sentences[self.sentence_ids[index]])
            if len(distances) == k:
                break
        return distances, neighbors, contexts

# Method 1: Word2Vec and Lesk Algorithm

In [27]:
def _mean_word_vector(words, keyed_vectors):
  """Average the vectors of the in-vocabulary `words`; all-zero vector if none is known."""
  known = [keyed_vectors[w] for w in words if w in keyed_vectors.vocab]
  if not known:
    return np.zeros(keyed_vectors.vector_size)
  return np.mean(known, axis=0)


def _cosine_similarity(u, v):
  """Cosine similarity of two vectors; NaN when either one is all zeros (undefined there)."""
  if not np.any(u) or not np.any(v):
    return float('nan')
  return 1 - cosine(u, v)


def wordvec_compute_similarity(model, query, target):
  """Compare how well the target and its Lesk replacement fit the sentence context.

  The context is represented by the mean Word2Vec vector of the context words
  from get_context(query, target) that are in the model's vocabulary. The
  replacement chosen by lesk_algorithm may consist of several words, in which
  case the mean of its in-vocabulary words is used.

  Args:
    model: trained Word2Vec model (accessed through model.wv).
    query: sentence containing the target.
    target: the word under examination.

  Returns:
    (with_target, with_synonym, replaced_word): cosine similarity between the
    target and the context, between the replacement and the context, and the
    replacement text itself. A similarity is NaN when one of the vectors
    involved is all zeros, e.g. because the word is out of vocabulary.
  """
  replaced_word, _, _ = lesk_algorithm(query, target)
  context, _ = get_context(query, target)
  word_vectors = model.wv

  target_vector = _mean_word_vector([target], word_vectors)
  # Mean of the vectors, not the mean of the components of their sum: the
  # context has to stay a vector for the cosine comparison to be meaningful.
  context_vector = _mean_word_vector(context, word_vectors)
  synonym_vector = _mean_word_vector(replaced_word.split(" "), word_vectors)

  with_target = _cosine_similarity(target_vector, context_vector)
  with_synonym = _cosine_similarity(synonym_vector, context_vector)

  return with_target, with_synonym, replaced_word

In [28]:
def wordvec_main(model, filename, corpus_name):
  """Run the Word2Vec + Lesk method on a TSV of examples and save the results.

  Args:
    model: trained Word2Vec model passed on to wordvec_compute_similarity.
    filename: tab-separated input file; the first line is a header, and each
      following line holds the query sentence in column 1 and the target word
      in column 2. Lines with fewer than two columns (e.g. blank lines) are
      skipped.
    corpus_name: prefix of the output file, which is written to
      "<corpus_name>_results.tsv".
  """
  with open(filename, 'r') as f:
    lines = f.readlines()

  examples = []
  for line in lines[1:]:
    fields = line.split("\t")
    if len(fields) < 2:
      continue  # blank or malformed row: nothing to evaluate
    examples.append((fields[0], fields[1].rstrip()))

  data_list = [["Query", "Target", "Synonym", "Target Similarity", "Synonym Similarity", "Difference"]]

  for query, target in examples:
    tar_sim, syn_sim, synonym = wordvec_compute_similarity(model, query, target)
    data_list.append([query, target, synonym, tar_sim, syn_sim, tar_sim - syn_sim])

  new_filename = corpus_name + '_results.tsv'
  # newline='' lets the csv module control line endings itself.
  with open(new_filename, 'w', newline='') as file:
    writer = csv.writer(file, delimiter='\t')
    writer.writerows(data_list)

In [30]:
# Method 1 on the web corpus; results are written to book_word2vec_results.tsv.
wordvec_main(book_wordvec_model, '/content/metaphor_results - Examples.tsv','book_word2vec')

  
  app.launch_new_instance()
  dist = 1.0 - uv / np.sqrt(uu * vv)


In [31]:
# Method 1 on the speech corpus; results are written to obama_word2vec_results.tsv.
wordvec_main(obama_wordvec_model, '/content/metaphor_results - Examples.tsv','obama_word2vec')

  
  app.launch_new_instance()
  dist = 1.0 - uv / np.sqrt(uu * vv)


# Method 2: BERT and KNN Algorithm


In [32]:
def replace_word_sentences(storage,query, target):
  """Replace `target` in `query` with its nearest contextual neighbour from `storage`.

  The five nearest neighbours of the lower-cased target are requested, with
  neighbours that overlap the target word filtered out; the first one whose
  source sentence does not contain the query is used.

  Args:
    storage: object whose query() returns (distances, neighbors, contexts).
    query: sentence containing the target.
    target: the word to replace.

  Returns:
    (replaced_sentence, chosen_word). Only whole-word occurrences of `target`
    are replaced. When no suitable neighbour exists, `chosen_word` is "" and
    the query is returned unchanged, so that token positions stay aligned
    with the original sentence.
  """
  import re

  distances, neighbors, contexts = storage.query(query_sent=query, query_word=target.lower(), k=5, filter_same_word=True)

  chosen_word = ""
  for _, neighbor, context in zip(distances, neighbors, contexts):
    # Skip neighbours taken from the query sentence itself.
    if query not in context.strip():
      chosen_word = neighbor
      break

  if not chosen_word:
    return query, chosen_word

  # Lookarounds instead of \b so that targets starting or ending with punctuation still match,
  # while occurrences inside longer words (e.g. "is" in "this") are left alone.
  whole_word = re.compile(r'(?<!\w)' + re.escape(target) + r'(?!\w)')
  # A function replacement keeps any backslashes in the neighbour token literal.
  replaced_sentence = whole_word.sub(lambda match: chosen_word, query)

  return replaced_sentence, chosen_word


In [33]:
def _mean_context_embedding(embeddings, index):
  """Mean of all token embeddings except the one at `index` (NaN vector if there are no others)."""
  context = [emb for position, emb in enumerate(embeddings) if position != index]
  if not context:
    return np.full(np.shape(embeddings[index]), np.nan)
  return np.mean(np.asarray(context, dtype=np.float64), axis=0)


def compute_similarity(model, storage, query, target):
  """Compare how well the target and its nearest-neighbour replacement fit their contexts.

  Args:
    model: callable mapping a list of sentences to (tokens, embeddings) pairs.
    storage: neighbour storage passed on to replace_word_sentences.
    query: sentence containing the target.
    target: the word under examination.

  Returns:
    (with_target, with_synonym, chosen_word): cosine similarity between the
    target embedding and the mean embedding of the other tokens of the original
    sentence; the same for the replacement word within the replaced sentence;
    and the replacement word itself.
  """
  original_embeddings = model([query])[0][1]
  replaced_sentence, chosen_word = replace_word_sentences(storage, query, target)
  replaced_embeddings = model([replaced_sentence])[0][1]

  # Position of the last whitespace-separated token that equals the target once
  # surrounding punctuation is stripped; 0 if there is none.
  # NOTE(review): this position is used to index the model's embeddings, which assumes the
  # model's tokens line up with a split on spaces — confirm for sentences with punctuation.
  index = 0
  for i, word in enumerate(query.split(" ")):
    if word.strip(punctuation) == target:
      index = i

  orig_avg = _mean_context_embedding(original_embeddings, index)
  # The replacement is compared with the context of the *replaced* sentence.
  rep_avg = _mean_context_embedding(replaced_embeddings, index)

  target_word = original_embeddings[index]
  synonym_word = replaced_embeddings[index]

  with_target = 1 - cosine(target_word, orig_avg)
  with_synonym = 1 - cosine(synonym_word, rep_avg)

  return with_target, with_synonym, chosen_word

In [34]:
def k_nearest_bert_main(model, storage, filename, corpus_name):
  """Run the BERT + nearest-neighbour method on a TSV of examples and save the results.

  Args:
    model: embedding model passed on to compute_similarity.
    storage: neighbour storage passed on to compute_similarity.
    filename: tab-separated input file; the first line is a header, and each
      following line holds the query sentence in column 1 and the target word
      in column 2. Lines with fewer than two columns (e.g. blank lines) are
      skipped.
    corpus_name: prefix of the output file, which is written to
      "<corpus_name>_results.tsv".
  """
  with open(filename, 'r') as f:
    lines = f.readlines()

  examples = []
  for line in lines[1:]:
    fields = line.split("\t")
    if len(fields) < 2:
      continue  # blank or malformed row: nothing to evaluate
    examples.append((fields[0], fields[1].rstrip()))

  data_list = [["Query", "Target", "Synonym", "Target Similarity", "Synonym Similarity", "Difference"]]

  for query, target in examples:
    tar_sim, syn_sim, synonym = compute_similarity(model, storage, query, target)
    data_list.append([query, target, synonym, tar_sim, syn_sim, tar_sim - syn_sim])

  new_filename = corpus_name + '_results.tsv'
  # newline='' lets the csv module control line endings itself.
  with open(new_filename, 'w', newline='') as file:
    writer = csv.writer(file, delimiter='\t')
    writer.writerows(data_list)

In [35]:
# Method 2 on the web corpus; results are written to book_results.tsv.
k_nearest_bert_main(bert_model, storage, '/content/metaphor_results - Examples.tsv', 'book')

In [36]:
# Method 2 on the speech corpus; results are written to obama_results.tsv.
k_nearest_bert_main(obama_model, obama_storage, '/content/metaphor_results - Examples.tsv', 'obama')

# Method 3: BERT and Lesk Algorithm


In [37]:
def bert_lesk_compute_similarity(model, query, target):
  """Measure how close the target's embedding is to that of its Lesk replacement.

  Args:
    model: callable mapping a list of sentences to (tokens, embeddings) pairs.
    query: sentence containing the target.
    target: the word to replace.

  Returns:
    (sim, replaced_word): cosine similarity between the embedding at the
    target's position in the original sentence and the embedding at the same
    position in the replaced sentence, plus the replacement text.
  """
  replaced_word, replaced_sentence, index = lesk_algorithm(query, target)

  # Embed both sentences first, then pick the vector at the target position of each.
  embeddings_per_sentence = [model([sentence])[0][1] for sentence in (query, replaced_sentence)]
  target_word, synonym_word = [embeddings[index] for embeddings in embeddings_per_sentence]

  return 1 - cosine(target_word, synonym_word), replaced_word

In [38]:
def bert_lesk_main(model, filename, corpus_name):
  """Run the BERT + Lesk method on a TSV of examples and save the results.

  Args:
    model: embedding model passed on to bert_lesk_compute_similarity.
    filename: tab-separated input file; the first line is a header, and each
      following line holds the query sentence in column 1 and the target word
      in column 2. Lines with fewer than two columns (e.g. blank lines) are
      skipped.
    corpus_name: prefix of the output file, which is written to
      "<corpus_name>_results.tsv".
  """
  with open(filename, 'r') as f:
    lines = f.readlines()

  examples = []
  for line in lines[1:]:
    fields = line.split("\t")
    if len(fields) < 2:
      continue  # blank or malformed row: nothing to evaluate
    examples.append((fields[0], fields[1].rstrip()))

  data_list = [["Query", "Target", "Synonym", "Similarity"]]

  for query, target in examples:
    similarity, synonym = bert_lesk_compute_similarity(model, query, target)
    data_list.append([query, target, synonym, similarity])

  new_filename = corpus_name + '_results.tsv'
  # newline='' lets the csv module control line endings itself.
  with open(new_filename, 'w', newline='') as file:
    writer = csv.writer(file, delimiter='\t')
    writer.writerows(data_list)

In [39]:
# Method 3 with the web-corpus BERT model; results are written to book_bert_lesk_results.tsv.
bert_lesk_main(bert_model, '/content/metaphor_results - Examples.tsv', 'book_bert_lesk')

In [40]:
# Method 3 with the speech-corpus BERT model; results are written to obama_bert_lesk_results.tsv.
bert_lesk_main(obama_model, '/content/metaphor_results - Examples.tsv', 'obama_bert_lesk')

# Results

Sample sentences and performance results can be viewed [on this Google Sheets document](https://docs.google.com/spreadsheets/d/1O7B0R4aRCWZwaNQEC3c7AunF47o9IapU0ZYiXUD6oq8/edit?usp=sharing).