In [38]:
import sys

sys.path.append('../generate_dataset/resources')
sys.path.append('../Siamese')
import SiameseNet
from pytorch_fast_elmo import FastElmo, batch_to_char_ids
import sys
from scipy.cluster.hierarchy import dendrogram, linkage
import scipy
import matplotlib.pyplot as plt
import random
import torch
from typing import List
import numpy as np
import utils
import termcolor


from typing import NewType

In [41]:
class Vector(object):
    
    def __init__(self, vec, sentence, index):
        
        self.vec = vec
        self.sentence = sentence
        self.index = index
        self.size = np.linalg.norm(self.vec)
    
    def get_word(self):
        
        return self.sentence[self.index]
    
    def get_vector(self): 
        
        return self.vec
    
    def get_sentence(self):
        
        return self.sentence
    
    def get_index(self):
        
        return self.index
    
    def get_size(self):
        
        return self.size
    
    def __str__(self):
        
        words = self.get_sentence()
        i = self.get_index()
        before = " ".join(words[:i])
        after = " ".join(words[i + 1:])
        word = termcolor.colored(self.get_word(), "blue", attrs = ['bold'])
        sent = '""' + before + " " + word + " " + after + '"'
        return sent
    
    def similarity(self, other):
        
        if other is self: return -np.inf
        
        return self.get_vector().dot(other.get_vector())/(self.get_size() * other.get_size())
    
    @staticmethod
    def get_closest_vector(vec, vecs):
    
        closest = max(vecs, key = lambda vector: vector.similarity(vec))
        return closest

In [32]:
def transform(model, vector: Vector):
    
    vec_pytorch = torch.from_numpy(vector.get_vector()).float()
    vector.vec = model._represent(vec_pytorch).detach().numpy()
    
    
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'
    
def print_closest_vectors(sample, all_vecs):
    
    for vector in sample:
        closest = Vector.get_closest_vector(vector, all_vecs)
        print("The closest vector to\n{}\nIs\n{} \n ==========================================\n".format(vector, closest))
    
def load_sents(fname = "sents_f", max_length = 15):
    
    with open(fname, "r") as f:
              
        lines = f.readlines()
    
    lines =  [line.strip().split(" ") for line in lines]
    if max_length is not None:
        lines = list(filter(lambda sentence: len(sentence) < max_length, lines))
    return lines

def load_model(name = "model.pt"):
    net = SiameseNet.SiameseNet()
    net.load_state_dict(torch.load(name))
    net.eval()
    return net

def list_vectors(vecs, sents: List[List[str]]) -> List[Vector]:
    
    num_sentences = len(sents)
    sents_indices_and_vecs =  zip(range(num_sentences), vecs)
    all_vectors = []

    for sent_index, sent_vectors in sents_indices_and_vecs:
        
        for i, (w,vec) in enumerate(zip(sents[sent_index], vecs[sent_index])):   
            
            v = Vector(vec.detach().numpy(), sents[sent_index], i)
            all_vectors.append(v)
            
    return all_vectors

Load pretrained ELMO and a collection of Wikipedia sentences.

In [12]:
all_sentences = load_sents(max_length = 15)
options_file = "../generate_dataset/" + utils.DEFAULT_PARAMS["elmo_options"]
weight_file = "../generate_dataset/" + utils.DEFAULT_PARAMS["elmo_weights"]
elmo = FastElmo(options_file, weight_file)

In [13]:
N = 1000
subset_size = 150
random.seed(0)
sentences = all_sentences[:N]

Collect ELMO states over N sentences, and load pretrained Siamese network.

In [34]:
character_ids = batch_to_char_ids(sentences)
embeddings = elmo(character_ids) # collect elmo states
network = load_model()

### Find the closest vectors to a collection of (original) ELMO vectors.

In [42]:
vecs = list_vectors(embeddings["elmo_representations"][0], sentences)
subset = np.random.choice(vecs, size = subset_size)
print_closest_vectors(subset, vecs)

The closest vector to
""the girls quickly flee back to land , but the [1m[34mcouple[0m follows them ."
Is
""the [1m[34mgroom[0m carries the bride on his back and runs away with her ." 

The closest vector to
""some [1m[34mof[0m his most prominent roles are his appearances in er as dr ."
Is
""some [1m[34mof[0m the organizations are actually the youth wings of larger political parties ." 

The closest vector to
""garcia blasts morales with a left hook that launches morales through the ropes [1m[34m.[0m "
Is
""german night fighters , although five more bombers crash upon returning to england [1m[34m.[0m " 

The closest vector to
""i [1m[34mcalled[0m the tune , i could not get them to dance together ."
Is
""perry was [1m[34mcalled[0m to the bar in 1980 and took silk in 2006 ." 

The closest vector to
""there is little or no [1m[34mpercussion[0m and rhythm is not a prominent element ."
Is
""there is little or no percussion and [1m[34mrhythm[0m is not a prominen

The closest vector to
""the most common apartment size was 5 rooms [1m[34mof[0m which there were 23 ."
Is
""the most common apartment size was 4 rooms [1m[34mof[0m which there were 26 ." 

The closest vector to
""the report examined nearly 300 individuals and hundreds of classified armed forces [1m[34msignals[0m ."
Is
""he got the name from seeing multiple road [1m[34msigns[0m during a road trip ." 

The closest vector to
""the power plant has 1 generating unit with an [1m[34minstalled[0m capacity of 3 ."
Is
""the power plant has 1 generating unit with [1m[34man[0m installed capacity of 3 ." 

The closest vector to
""ok , i guess this [1m[34mis[0m a test of how closely we read ."
Is
""this [1m[34mis[0m an evolution of a matrix that happens inside one similarity class ." 

The closest vector to
"" [1m[34mwinters[0m are quite cold with plenty of snow , with temperatures ranging between ."
Is
""the master of science in administration was added in the [1m[34mwi

The closest vector to
""he received a phd in composition from the university of [1m[34myork[0m in 1984 ."
Is
""he graduated from hamilton college in clinton , new [1m[34myork[0m , in 1823 ." 

The closest vector to
""each member of the project team is expected [1m[34mto[0m keep a project diary ."
Is
""i am ready [1m[34mto[0m make an introduction at the category page to explain ." 

The closest vector to
""the master of science [1m[34min[0m administration was added in the winter of 1983 ."
Is
""she was recognized for her outstanding research [1m[34min[0m 2004 with the thomas c ." 

The closest vector to
""some of his most prominent roles are [1m[34mhis[0m appearances in er as dr ."
Is
""following [1m[34mhis[0m stint at marvel he signed exclusively with dc comics in 2004 ." 

The closest vector to
""but please be careful to record exactly where [1m[34msuch[0m ads were originally published ."
Is
""but [1m[34msuch[0m situations are necessarily rare , since the

The closest vector to
""he accused one of those two wikipedians of being on the kennedy [1m[34mpayroll[0m ."
Is
""black canary offers her a spot on the new birds of prey [1m[34mroster[0m ." 

The closest vector to
""henry cotton wins his [1m[34mthird[0m british open , at the age of 41 ."
Is
""this was his [1m[34msecond[0m all star , also winning an award in 2004 ." 

The closest vector to
""michael was amongst [1m[34ma[0m group of leaders of the communist underground in iraq ."
Is
""the was [1m[34ma[0m japanese revolutionary armed group , established on 15 july 1971 ." 

The closest vector to
""automobile magazine named the audi a7 `` 2012 [1m[34mautomobile[0m of the year '' ."
Is
"" [1m[34mautomobile[0m magazine named the audi a7 `` 2012 automobile of the year '' ." 

The closest vector to
""actually , i drafted [1m[34ma[0m good bit of the text of that essay ."
Is
""i already added [1m[34ma[0m little information about it to california state route 88 ." 



The closest vector to
""hello , i 'm looking for information on a ship [1m[34mnamed[0m hms joanna ."
Is
""another town [1m[34mnamed[0m dudley originally existed several miles north from its current location ." 

The closest vector to
""many have tried to break this record , but [1m[34mfew[0m have come close ."
Is
""a [1m[34mfew[0m of the sources merely mention the subject in a trivial context ." 

The closest vector to
"" [1m[34malberta[0m legislative building - the legislature of alberta consists of two component 1 ."
Is
""alberta legislative building - the legislature of [1m[34malberta[0m consists of two component 1 ." 

The closest vector to
""his career at the bar ended with the outbreak of world [1m[34mwar[0m ii ."
Is
""settled in france , and fought for the french in world [1m[34mwar[0m ii ." 

The closest vector to
""but once again , it proves that honesty always gets you in trouble [1m[34m.[0m "
Is
""the notion that you lock people up for smoking mari

### Now, first apply the trained Siamese network to each vector, and recalculate closest vectors.

In [48]:
for v in vecs:
    transform(network, v)

subset = np.random.choice(vecs, size = subset_size)
print_closest_vectors(subset, vecs)

The closest vector to
""at the moment , [1m[34mthe[0m bus lanes are being built throughout the city ."
Is
"" [1m[34mthe[0m master of science in administration was added in the winter of 1983 ." 

The closest vector to
""press ok to continue , or [1m[34mcancel[0m to stay on the current page ."
Is
""i would like to make edits under my real name from now [1m[34mon[0m ." 

The closest vector to
""he [1m[34mhas[0m also depicted various christian themes including the christian cross and churches ."
Is
""i 've missed several meetings , maybe i will try to show [1m[34mup[0m ." 

The closest vector to
""i added a bit to the introduction as well as the [1m[34mhistory[0m section ."
Is
""ticket number i edited the [1m[34mtemplate[0m so that the ticket number is displayed ." 

The closest vector to
""his second song of the evening was `` does your [1m[34mmother[0m know '' ."
Is
""i was the one added it within the article as a proper [1m[34mreference[0m ." 

The closest

The closest vector to
""the german national archives [1m[34mhold[0m no records for the presentation of the swords ."
Is
""that very well may happen [1m[34m,[0m but that is for later to decide ." 

The closest vector to
"" [1m[34mthe[0m final decision was taken on the uefa meeting on 13 may 2009 ."
Is
"" [1m[34mthe[0m master of science in administration was added in the winter of 1983 ." 

The closest vector to
""he has no [1m[34mdefinitive[0m personal , cultural or sexual identity throughout the book ."
Is
""roberts , kate ( [1m[34m2001[0m ) , sun and storm and other stories ." 

The closest vector to
""likely to any significant extent to stimulate or encourage the commission [1m[34mof[0m offences ."
Is
""frank stack collection - a special collection of the university [1m[34mof[0m missouri libraries ." 

The closest vector to
""marriage information per parish records in [1m[34mdevon[0m and verified with researcher in uk ."
Is
""she thinks i 'm crazy but i 'm 

The closest vector to
""bill clinton was born in the city [1m[34mand[0m state of hope , arkansas ."
Is
""roberts , kate ( [1m[34m2001[0m ) , sun and storm and other stories ." 

The closest vector to
""she was sold in 1831 to commercial interests and returned to [1m[34mcape[0m town ."
Is
""i was the one added it within the article as a proper [1m[34mreference[0m ." 

The closest vector to
""randall died on february 13 , [1m[34m2013[0m after suffering complications from a stroke ."
Is
""roberts , kate ( [1m[34m2001[0m ) , sun and storm and other stories ." 

The closest vector to
""in november 2007 , sears holdings corporation announced the purchase of a [1m[34m13[0m ."
Is
""i would like to make edits under my real name from now [1m[34mon[0m ." 

The closest vector to
""this particular article though , [1m[34mis[0m not of featured quality as of yet ."
Is
""a hand made by hitting two consecutive cards [1m[34mon[0m the turn and river ." 

The closest vector t

The closest vector to
""chain [1m[34mfalls[0m tend to use internal gear sets to provide the mechanical advantage ."
Is
""i was the one added it within the article as a proper [1m[34mreference[0m ." 

The closest vector to
""their performance at the festival garnered a lot of attention for the band [1m[34m.[0m "
Is
""that very well may happen [1m[34m,[0m but that is for later to decide ." 

The closest vector to
""i may not know any [1m[34mww2[0m author , but i can always search ."
Is
""i was the one added it within the article as a proper [1m[34mreference[0m ." 

The closest vector to
""she thinks i 'm [1m[34mcrazy[0m but i 'm just growing old '' ) ."
Is
""i 've missed several meetings , maybe i will try to show [1m[34mup[0m ." 

The closest vector to
""all the resources in the [1m[34mworld[0m could not create a free use equivalent ."
Is
""i was the one added it within the article as a proper [1m[34mreference[0m ." 

The closest vector to
""to continue in 

The closest vector to
""he got the name from [1m[34mseeing[0m multiple road signs during a road trip ."
Is
""rr law , he could be [1m[34min[0m some cases , but not all ." 

The closest vector to
""efforts to rebuild the niagara were hampered by the lack of original plans [1m[34m.[0m "
Is
""that very well may happen [1m[34m,[0m but that is for later to decide ." 

The closest vector to
""prominent proponents [1m[34mof[0m liberal arts in the united states have included mortimer j ."
Is
""frank stack collection - a special collection of the university [1m[34mof[0m missouri libraries ." 

The closest vector to
""other complete sentences consist of two or more clauses ( see below ) [1m[34m.[0m "
Is
""that very well may happen [1m[34m,[0m but that is for later to decide ." 

The closest vector to
""enormous quantities of water were [1m[34malso[0m required for the operation of the locomotive ."
Is
""a hand made by hitting two consecutive cards [1m[34mon[0m the tu