In [None]:
# load packages
import os
import sys
import re
import spacy
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

from spacy import displacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm") # load english vocab

from IPython.display import HTML

# Load ELMo Embedding

In [None]:
# TF-Hub handle of the ELMo v2 module; `embedding` wraps it so later cells can
# call it like a function on a batch of strings.
url = "https://tfhub.dev/google/elmo/2" # elmo model
embedding = hub.Module(url)

# Load Data

In [None]:
# Read the question-pair CSV; the path is relative to the notebook's working directory.
que_df = pd.read_csv("../DATAHUB/quora-dataset/question-pair-dataset.csv") # load data
que_df.shape  # (rows, columns) of the raw frame

In [None]:
# Peek at the first three rows to see which columns the CSV provides.
que_df.head(3)

# Prepare Data

In [None]:
# Only the first question of each pair (question1) is cleaned and indexed.
_DISALLOWED_CHARS = re.compile("[^a-z0-9.?! ]")


def _normalize_question(sent):
    """Return `sent` as trimmed, lower-cased text limited to a-z, 0-9, '.', '?', '!' and spaces.

    The value is passed through str() first, so non-string cells are cleaned too
    (a missing value therefore becomes the literal text "nan").
    """
    lowered = str(sent).strip().lower()
    return _DISALLOWED_CHARS.sub("", lowered)


que_df["processed_q1"] = que_df["question1"].map(_normalize_question)
que_df.head(3)

In [None]:
# Pull the cleaned questions out of the frame as a plain Python list.
sentences = que_df["processed_q1"].tolist()
len(sentences)

In [None]:
# Spot-check the first three cleaned questions.
sentences[:3]

In [None]:
sentences = sentences[:1000] # keep only the first 1000 questions so the embedding step stays fast

# Get ELMo Embeddings

In [None]:
# Build the graph op that embeds every sentence; nothing is computed until the op
# is run inside a session. as_dict=True returns a dict of outputs, from which the
# "default" entry is taken (presumably one fixed-size vector per sentence — confirm
# against the module documentation).
embeddings = embedding(sentences, signature="default", as_dict=True)["default"]

In [None]:
# generate embeddings for each question
with tf.Session() as sess:
    # both the variables and the lookup tables are initialised before the op is evaluated
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    # evaluate the embeddings op; `x` is what PCA and cosine_similarity consume later
    x = sess.run(embeddings)

In [None]:
# Check the dimensions of the embedding matrix before reducing it.
x.shape

In [None]:
# Reduce the sentence embeddings to 50 principal components before t-SNE.
# random_state is pinned so that Restart & Run All reproduces the same projection:
# scikit-learn may select a randomized SVD solver for an input of this size.
pca = PCA(n_components=50, random_state=42)
y = pca.fit_transform(x)

In [None]:
# use t-sne to compress 50 components into 2
# t-SNE is stochastic, so the seed is fixed to get the same 2-D layout on every run
z = TSNE(n_components=2, random_state=42).fit_transform(y)

# Visualize Embeddings In HTML

In [None]:
# Plotly imports for the interactive scatter plot.
# NOTE(review): `plotly.plotly` was moved to the separate `chart_studio` package in
# plotly 4, and `py` is not referenced in the cells shown here — confirm the plotly
# version this notebook is pinned to.
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# prepare the notebook front end for offline plotly output
init_notebook_mode(connected=True)

In [None]:
# One marker per sentence, placed at its 2-D t-SNE coordinates.
data = [
    go.Scatter(
        x=z[:, 0],
        y=z[:, 1],
        mode="markers",
        text=sentences,  # hover text: the cleaned question itself
        marker=dict(
            size=16,
            color=[len(sentence) for sentence in sentences],  # colour encodes question length in characters
            opacity=0.8,
            colorscale="Viridis",
            showscale=False,
        ),
    )
]

# Hide the zero lines on both axes. (The original also built an empty go.Layout()
# first, which was overwritten immediately and has been dropped.)
layout = dict(
    yaxis=dict(zeroline=False),
    xaxis=dict(zeroline=False),
)

fig = go.Figure(data=data, layout=layout)

# Writes a standalone HTML file; a new browser tab opens with the interactive visualization.
file = plot(fig, filename="Sentence_Embedding.html")

# Semantic Search

In [None]:
results_returned = "3" # show top 3 results
top_k = int(results_returned)  # convert once instead of on every query

# Build the query op a single time: a string placeholder that is fed one query per search.
# Calling embedding(...) inside the loop would add new ops to the graph on every query.
query_input = tf.placeholder(tf.string, shape=[None])
query_embd = embedding(query_input, signature="default", as_dict=True)["default"]

# One session for the whole search loop, so variables and tables are initialised
# once rather than once per query.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())

    # Loop until the user types "done". The query is read before it is tested, so the
    # cell works on a fresh kernel and "done" itself is never searched for.
    while True:
        raw_query = input("Search Query:")

        # Normalise the query exactly like the indexed questions (trim, lower-case,
        # keep only a-z, 0-9, '.', '?', '!' and spaces) so both sides are embedded
        # from text in the same form.
        input_sentence = re.sub("[^a-z0-9.?! ]", "", raw_query.strip().lower())

        if input_sentence == "done":
            break
        if not input_sentence:
            # nothing searchable was entered; ask again
            continue

        search_vect = sess.run(query_embd, feed_dict={query_input: [input_sentence]})

        # similarity of the query against every indexed sentence, as a Series keyed by sentence position
        cosine_similarities = pd.Series(cosine_similarity(search_vect, x).flatten())

        # Whole-word highlighting: compare tokens with surrounding punctuation removed,
        # instead of a substring test against the raw query.
        query_tokens = {token.strip(".?!") for token in input_sentence.split()}
        query_tokens.discard("")

        output = ""
        # Series.items() replaces iteritems(), which was removed in pandas 2.0
        for sentence_idx, _score in cosine_similarities.nlargest(top_k).items():
            output += "<p style='font-family:verdana; font-size:110%; color:yellow'> "
            for word in sentences[sentence_idx].split():
                if word.strip(".?!") in query_tokens:
                    output += " <b>" + word + "</b>"
                else:
                    output += " " + word
            output += "</p><hr>"
        output = "<h3>Results: </h3>" + output
        display(HTML(output))

Don't judge the approach by the quality of these results: only a very small subset of the dataset was used because of computation constraints. This is a proof of concept (POC) showing that semantic search can be done using ELMo. Better data and enough computation should give noticeably better results.