In [38]:
import sys

sys.path.append('../generate_dataset/resources')
sys.path.append('../Siamese')
import SiameseNet
from pytorch_fast_elmo import FastElmo, batch_to_char_ids
import sys
from scipy.cluster.hierarchy import dendrogram, linkage
import scipy
import matplotlib.pyplot as plt
import random
import torch
from typing import List
import numpy as np
import utils
import termcolor


from typing import NewType

In [51]:
class Vector(object):
    
    def __init__(self, vec, sentence, index):
        
        self.vec = vec
        self.sentence = sentence
        self.index = index
        self.size = np.linalg.norm(self.vec)
    
    def get_word(self):
        
        return self.sentence[self.index]
    
    def get_vector(self): 
        
        return self.vec
    
    def get_sentence(self):
        
        return self.sentence
    
    def get_index(self):
        
        return self.index
    
    def get_size(self):
        
        return self.size
    
    def __str__(self):
        
        words = self.get_sentence()
        i = self.get_index()
        before = " ".join(words[:i])
        after = " ".join(words[i + 1:])
        word = "***"+termcolor.colored(self.get_word(), "blue", attrs = ['bold'])+"***"
        sent = '""' + before + " " + word + " " + after + '"' + "***WORD: {} ***".format(self.get_word())
        return sent
    
    def similarity(self, other):
        
        if other is self: return -np.inf
        
        return self.get_vector().dot(other.get_vector())/(self.get_size() * other.get_size())
    
    @staticmethod
    def get_closest_vector(vec, vecs):
    
        closest = max(vecs, key = lambda vector: vector.similarity(vec))
        return closest

In [32]:
def transform(model, vector: Vector):
    
    vec_pytorch = torch.from_numpy(vector.get_vector()).float()
    vector.vec = model._represent(vec_pytorch).detach().numpy()
    
    
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'
    
def print_closest_vectors(sample, all_vecs):
    
    for vector in sample:
        closest = Vector.get_closest_vector(vector, all_vecs)
        print("The closest vector to\n{}\nIs\n{} \n ==========================================\n".format(vector, closest))
    
def load_sents(fname = "sents_f", max_length = 15):
    
    with open(fname, "r") as f:
              
        lines = f.readlines()
    
    lines =  [line.strip().split(" ") for line in lines]
    if max_length is not None:
        lines = list(filter(lambda sentence: len(sentence) < max_length, lines))
    return lines

def load_model(name = "model.pt"):
    net = SiameseNet.SiameseNet()
    net.load_state_dict(torch.load(name))
    net.eval()
    return net

def list_vectors(vecs, sents: List[List[str]]) -> List[Vector]:
    
    num_sentences = len(sents)
    sents_indices_and_vecs =  zip(range(num_sentences), vecs)
    all_vectors = []

    for sent_index, sent_vectors in sents_indices_and_vecs:
        
        for i, (w,vec) in enumerate(zip(sents[sent_index], vecs[sent_index])):   
            
            v = Vector(vec.detach().numpy(), sents[sent_index], i)
            all_vectors.append(v)
            
    return all_vectors

Load pretrained ELMO and a collection of Wikipedia sentences.

In [12]:
all_sentences = load_sents(max_length = 15)
options_file = "../generate_dataset/" + utils.DEFAULT_PARAMS["elmo_options"]
weight_file = "../generate_dataset/" + utils.DEFAULT_PARAMS["elmo_weights"]
elmo = FastElmo(options_file, weight_file)

In [13]:
N = 1000
subset_size = 150
random.seed(0)
sentences = all_sentences[:N]

Collect ELMO states over N sentences, and load pretrained Siamese network.

In [34]:
character_ids = batch_to_char_ids(sentences)
embeddings = elmo(character_ids) # collect elmo states
network = load_model()

### Find the closest vectors to a collection of (original) ELMO vectors.

In [52]:
vecs = list_vectors(embeddings["elmo_representations"][0], sentences)
subset = np.random.choice(vecs, size = subset_size)
print_closest_vectors(subset, vecs)

The closest vector to
""and ***[1m[34mthey[0m*** are multiple and independent and have the necessary articles and reviews ."***WORD: they ***
Is
"" ***[1m[34mthey[0m*** 've been reviewed by charity navigator [ 1 ] and by give ."***WORD: they *** 

The closest vector to
""the statue was broken in two while being transported to ***[1m[34mtheir[0m*** georgia parish ."***WORD: their ***
Is
""at this point , he was taken to ***[1m[34mhis[0m*** home town for burial ."***WORD: his *** 

The closest vector to
""it includes literary uses and commentary on the term as used in society ***[1m[34m.[0m*** "***WORD: . ***
Is
""thus , template titles are subject to the same rules as article titles ***[1m[34m.[0m*** "***WORD: . *** 

The closest vector to
""target identification is typically the starting point of the modern drug discovery process ***[1m[34m.[0m*** "***WORD: . ***
Is
""it is the most popular kernel function used in support vector machine classification ***[1m[34m.

The closest vector to
""in 2007 , villa was ***[1m[34mactively[0m*** training to make the 2010 winter paralympics ."***WORD: actively ***
Is
""they ***[1m[34mactively[0m*** encouraged his painting activities and later supported him in higher education ."***WORD: actively *** 

The closest vector to
""applications for single source analytics ***[1m[34m,[0m*** new york , ny , united states ."***WORD: , ***
Is
""applications for single source analytics , new york ***[1m[34m,[0m*** ny , united states ."***WORD: , *** 

The closest vector to
""the hearings were televised ***[1m[34mlive[0m*** at the state of illinois center in chicago ."***WORD: live ***
Is
""he has since continued playing ***[1m[34mlive[0m*** with muse on their 2nd law tour ."***WORD: live *** 

The closest vector to
""the composition of julian 's army at strasbourg can only ***[1m[34mbe[0m*** partially reconstructed ."***WORD: be ***
Is
""the piece in the pocket can ***[1m[34mbe[0m*** put back on th

The closest vector to
""in 1988 ***[1m[34m,[0m*** he published his first book dedicated entirely to film theory ."***WORD: , ***
Is
""in 1951 ***[1m[34m,[0m*** he became dean of the college of arts and sciences ."***WORD: , *** 

The closest vector to
""applicants ***[1m[34mshall[0m*** have only five ( 5 ) attempts to pass the examination ."***WORD: shall ***
Is
""that is why the 50 members ***[1m[34mshould[0m*** not directly represent the organizations notability ."***WORD: should *** 

The closest vector to
""the dark knight ( film ***[1m[34m)[0m*** are that i explain them too much ."***WORD: ) ***
Is
""the birds of the gambia ( 2nd edition , 1991 ***[1m[34m)[0m*** by m ."***WORD: ) *** 

The closest vector to
""the main ( west ) facade is five bays with a central ***[1m[34mdoorway[0m*** ."***WORD: doorway ***
Is
""in the north wall of the chancel is a single narrow pointed ***[1m[34mwindow[0m*** ."***WORD: window *** 

The closest vector to
""the female flowe


The closest vector to
""complete participation ***[1m[34mresearcher[0m*** is completely integrated in population of study beforehand ( i ."***WORD: researcher ***
Is
""marriage information per parish records in devon and verified with ***[1m[34mresearcher[0m*** in uk ."***WORD: researcher *** 

The closest vector to
""he was the ***[1m[34meldest[0m*** son of richard , third son of the hon ."***WORD: eldest ***
Is
""he was the eldest ***[1m[34mson[0m*** of richard , third son of the hon ."***WORD: son *** 

The closest vector to
""thousands of people protest against the ruling hamas party in the gaza ***[1m[34mstrip[0m*** ."***WORD: strip ***
Is
""thousands of people protest against the ruling hamas party in the ***[1m[34mgaza[0m*** strip ."***WORD: gaza *** 

The closest vector to
""yes , because recording the author of a completely faithful reproduction is ***[1m[34mpointless[0m*** ."***WORD: pointless ***
Is
""a redirect is ***[1m[34munnecessary[0m*** also --

The closest vector to
""rbi in the series as the cardinals ***[1m[34mdefeated[0m*** the mets in seven games ."***WORD: defeated ***
Is
""world series as the chicago white sox ***[1m[34mdefeated[0m*** the cubs in six games ."***WORD: defeated *** 

The closest vector to
""the dialogue in the scene ***[1m[34mwas[0m*** also taken directly from the original article ."***WORD: was ***
Is
""the discussion ***[1m[34mwas[0m*** a about the synthesis of sources to advance a point ."***WORD: was *** 

The closest vector to
""it was ***[1m[34mthe[0m*** first full reunion for the student body in 20 years ."***WORD: the ***
Is
""it was ***[1m[34mthe[0m*** first open cup tournament to be named after lamar hunt ."***WORD: the *** 

The closest vector to
""it is a member of the international association of national public ***[1m[34mhealth[0m*** institutes ."***WORD: health ***
Is
""the challenge is a free campaign supported by national civic and ***[1m[34mhealth[0m*** organizati

### Now, first apply the trained Siamese network to each vector, and recalculate closest vectors.

In [None]:
for v in vecs:
    transform(network, v)

subset = np.random.choice(vecs, size = subset_size)
print_closest_vectors(subset, vecs)

The closest vector to
""the ocean liner ran aground in the río grande do sul , ***[1m[34mbrazil[0m*** ."***WORD: brazil ***
Is
""rock and roll ***[1m[34mpeople[0m*** '' which is from the mind games sessions ) ."***WORD: people *** 

The closest vector to
""european or latin american destinations were even in the top ***[1m[34mten[0m*** for emigrants ."***WORD: ten ***
Is
""he was buried in the grange cemetery , edinburgh , on ***[1m[34m22[0m*** oct ."***WORD: 22 *** 

The closest vector to
""at the time the highway that would become route 32 was ***[1m[34malready[0m*** present ."***WORD: already ***
Is
""even if married in another state , it is ***[1m[34mnot[0m*** recognized within missouri ."***WORD: not *** 

The closest vector to
""the accompanying ***[1m[34mmusic[0m*** video portrays jessica and lisa in a boarding school rebellion ."***WORD: music ***
Is
""the ***[1m[34mstuff[0m*** i have found has been dominated by ( 1 ) www ."***WORD: stuff *** 

The close

The closest vector to
""you will have much more control with ***[1m[34myour[0m*** linux box doing the routing ."***WORD: your ***
Is
""it is clear ***[1m[34mmy[0m*** language is not that of a native french speaker ."***WORD: my *** 

The closest vector to
""but please be careful to record exactly where such ***[1m[34mads[0m*** were originally published ."***WORD: ads ***
Is
""efforts are made to ensure laboratory safety ***[1m[34mvideos[0m*** are both relevant and engaging ."***WORD: videos *** 

The closest vector to
""titans of the ice age exhibition '' at the field museum in ***[1m[34mchicago[0m*** ."***WORD: chicago ***
Is
""it is the most popular kernel function used in support vector machine ***[1m[34mclassification[0m*** ."***WORD: classification *** 

The closest vector to
""the sparrow quartet is an american ***[1m[34macoustic[0m*** music group that formed in 2005 ."***WORD: acoustic ***
Is
""it is possible that the video may not be ***[1m[34mofficially

The closest vector to
""hello , ***[1m[34mi[0m*** 'm looking for information on a ship named hms joanna ."***WORD: i ***
Is
""i have been doing it manually , but ***[1m[34mit[0m*** is pretty time consuming ."***WORD: it *** 

The closest vector to
""the park in its current state ***[1m[34mwas[0m*** opened to the public in 1956 ."***WORD: was ***
Is
""in narrative sequence it ***[1m[34mis[0m*** preceded by point of impact and black light ."***WORD: is *** 

The closest vector to
""canada for its ***[1m[34mbeauty[0m*** , natural landscape , and strong sense of community ."***WORD: beauty ***
Is
""it is the most popular kernel function used in support vector machine ***[1m[34mclassification[0m*** ."***WORD: classification *** 

The closest vector to
""joe is a technician aboard the toronto and ***[1m[34ma[0m*** good friend of tom ."***WORD: a ***
Is
""he even stops going to school and become ***[1m[34ma[0m*** slave to his mother ."***WORD: a *** 

The closest vecto

The closest vector to
""signs in the abandoned station also state `` museum '' as the ***[1m[34mname[0m*** ."***WORD: name ***
Is
""the ***[1m[34mtown[0m*** covered an area of , and existed from 1954 until 1994 ."***WORD: town *** 

The closest vector to
""a procession also ***[1m[34mtook[0m*** place in the honor of the temple of heaven ."***WORD: took ***
Is
""germans were suffering also and they naturally ***[1m[34mgave[0m*** priority to their own suffering ."***WORD: gave *** 

The closest vector to
"" ***[1m[34min[0m*** 1976 he turned professional and won a national title in the sprint ."***WORD: in ***
Is
""he served as a color analyst on chargers radio broadcasts ***[1m[34min[0m*** the 1970s ."***WORD: in *** 

The closest vector to
""no ***[1m[34m,[0m*** but you can choose whether the water 's brown or blue ."***WORD: , ***
Is
""upon reaching the village ***[1m[34m,[0m*** they discover that the village has moved on ."***WORD: , *** 

The closest vector to