# Introduction to ELMo

In [1]:
import hashlib
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# URL of the ELMo module on tf-hub; used below to load the module and to derive its cache name.
model_url = "https://tfhub.dev/google/elmo/2" 

In [3]:
# The path where tf-hub will cache the model (use an absolute path).
# It has to be set before the module is loaded below.
os.environ["TFHUB_CACHE_DIR"] = '/tfhub'

# tf-hub stores the cached module under a hex name: the SHA-1 digest of the
# module URL. Displaying it here tells us which cache entry belongs to ELMo.
hashlib.sha1(model_url.encode("utf8")).hexdigest()

'9bb74bc86f9caffc8c47dd7b33ec4bb354d9602d'

In [4]:
%%time
# Load the ELMo module from tf-hub. The very first run is slow because the model
# (~1GB) has to be downloaded from tf-hub.
# trainable=False: this notebook only uses the pre-trained weights, it does not fine-tune them.
elmo = hub.Module(model_url, trainable=False)

INFO:tensorflow:Using /tfhub to cache modules.
CPU times: user 712 ms, sys: 32.8 ms, total: 745 ms
Wall time: 756 ms


## Input Signatures

The model accepts input through two signatures: 
1. _tokens_: pre-tokenized paragraphs
2. _default_: plain-text paragraphs (split on spaces)

In [5]:
# Two short example inputs, passed to the module as plain (untokenized) strings.
p_1, p_2 = "I want to categorize my data", "i love clustering!!"
messages = [p_1, p_2]

In [6]:
# Raise the TensorFlow log threshold to ERROR so the following cells are not cluttered with log output.
tf.logging.set_verbosity(tf.logging.ERROR)

In [10]:
%%time
with tf.Session() as sess: 
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    returns = elmo(inputs = messages,
                  signature = "default",
                  as_dict=True)
    
    
    # Everything returns a Tensor
    word_embeddings = returns["word_emb"] # the character(word)-based embeddings with shape [batch_size, max_length, 512].
    embeddings = returns["elmo"] # the weighted sum of the 3 layers, where the weights are trainable [batch_size, max_length, 1024]
    lstm_1 = returns["lstm_outputs1"] # the first LSTM hidden state
    lstm_2 = returns["lstm_outputs2"] # the second LSTM hidden state with shape
    default = returns["default"] #(Seems to be the average of the words..) fixed mean-pooling of all contextualized word representations with shape [batch_size, 1024].
    

CPU times: user 1.13 s, sys: 240 ms, total: 1.37 s
Wall time: 975 ms


In [13]:
# NOTE(review): `sess` comes from a `with tf.Session() as sess:` block in an earlier cell,
# which should already close the session on exit, so this call looks redundant. Confirm and remove.
sess.close()

## Playing around a little bit

In [58]:
# Three free-text paragraphs ...
paragraph_one = "I like to group different machine types"
paragraph_two = "I like to know the stock price of my shares"
paragraph_three = "Cats are just the cutest animals ever!"

# ... and two textbook-style definitions to compare them against.
definition_1 = (
    "Cluster analysis or clustering is the task of grouping a set of objects "
    "in such a way that objects in the same group (called a cluster) are more "
    "similar (in some sense) to each other than to those in other groups (clusters)."
)
definition_2 = (
    "Regression is a data mining technique used to predict a range of numeric "
    "values (also called continuous values), given a particular dataset."
)

# The order matters: later cells address the embeddings by position in this list.
paragraphes = [
    paragraph_one,
    paragraph_two,
    paragraph_three,
    definition_1,
    definition_2,
]

In [59]:
with tf.Session() as session:
    # Variables and lookup tables must be initialized before anything is evaluated.
    init_ops = [tf.global_variables_initializer(), tf.tables_initializer()]
    session.run(init_ops)

    # "default" is the pooled, one-vector-per-input output of the module.
    module_outputs = elmo(inputs=paragraphes, signature="default", as_dict=True)
    sentence_tensor = module_outputs["default"]

    # Evaluate inside the session so the result is available afterwards as an array.
    sentence_embeddings = session.run(sentence_tensor)

In [60]:
# Sanity check: expect one embedding row per entry of `paragraphes`.
sentence_embeddings.shape

(5, 1024)

In [61]:
# A single embedding reshaped into a one-row 2-D array (the form used for the similarity calls below).
sentence_embeddings[0].reshape(1,-1).shape

(1, 1024)

In [62]:
# cosine_similarity works on 2-D inputs, so turn every embedding into a
# (1, n_features) row. The rows get their own name: rebinding
# `sentence_embeddings` to a list would break re-running the earlier
# `sentence_embeddings.shape` cell, because a list has no `.shape`.
embedding_rows = [row.reshape(1, -1) for row in sentence_embeddings]

# Same order as `paragraphes`: the three paragraphs first, then the two definitions.
x_0, x_1, x_2, x_3, x_4 = embedding_rows[:5]


#### Clustering Paragraph
Result: the clustering paragraph is more similar to the regression definition than to the clustering definition. 

In [69]:
# x_0: embedding of paragraph_one, x_3: embedding of definition_1 (cluster analysis).
print("Cosine Similiartiy between paragraph_one and definition_1: {}".format(cosine_similarity(x_0, x_3)))

Cosine Similiartiy between paragraph_one and definition_1: [[0.4208754]]


In [64]:
# x_0: embedding of paragraph_one, x_4: embedding of definition_2 (regression).
print("Cosine Similiartiy between paragraph_one and definition_2: {}".format(cosine_similarity(x_0, x_4)))

Cosine Similiartiy between paragraph_one and definition_2: [[0.4985751]]


#### Regression Paragraph
Result: Both similarity scores are fairly close (0.35 vs. 0.47).

In [65]:
# x_1: embedding of paragraph_two, x_3: embedding of definition_1 (cluster analysis).
print("Cosine Similiartiy between paragraph_two and definition_1: {}".format(cosine_similarity(x_1, x_3)))

Cosine Similiartiy between paragraph_two and definition_1: [[0.35134336]]


In [66]:
# x_1: embedding of paragraph_two, x_4: embedding of definition_2 (regression).
print("Cosine Similiartiy between paragraph_two and definition_2: {}".format(cosine_similarity(x_1, x_4)))

Cosine Similiartiy between paragraph_two and definition_2: [[0.47441936]]


#### Random paragraph
Result: Again, the similarity scores are nearly equal.

In [67]:
# paragraph_three (x_2) vs. definition_1, the cluster-analysis definition (x_3).
cosine_similarity(x_2, x_3)

array([[0.342139]], dtype=float32)

In [68]:
# paragraph_three (x_2) vs. definition_2, the regression definition (x_4).
cosine_similarity(x_2, x_4)

array([[0.383528]], dtype=float32)