In [23]:
# load packages
import os
import sys
import re
import spacy
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

from spacy import displacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm") # load english vocab

from IPython.display import HTML

# Load ELMo Embedding

In [2]:
# TF-Hub handle of the ELMo (v2) module. hub.Module resolves the handle and keeps a
# local copy (the log line printed by this cell shows the cache directory in use).
url = "https://tfhub.dev/google/elmo/2" # elmo model
embedding = hub.Module(url)

INFO:tensorflow:Using /var/folders/0p/gsfhfkcj45525788hzflpnqr0000gn/T/tfhub_modules to cache modules.


# Load Data

In [3]:
# Quora question-pair dataset; the path is relative to the notebook's working directory
data_path = "../DATAHUB/quora-dataset/question-pair-dataset.csv"
que_df = pd.read_csv(data_path)

# (rows, columns) of the loaded frame
que_df.shape

(404351, 6)

In [4]:
# peek at the first three rows to check which columns were loaded
que_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0


# Prepare Data

In [5]:
# consider only one set of questions
que_df["processed_q1"] = que_df.question1.apply(lambda sent: re.sub("[^a-z0-9.?! ]", "", str(sent).strip().lower()))
que_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,processed_q1
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,what is the step by step guide to invest in sh...
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,what is the story of kohinoor kohinoor diamond?
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,how can i increase the speed of my internet co...


In [10]:
# plain Python list of the cleaned questions, in DataFrame row order
sentences = que_df["processed_q1"].tolist()
len(sentences)

404351

In [11]:
# spot-check the first three cleaned questions
sentences[:3]

['what is the step by step guide to invest in share market in india?',
 'what is the story of kohinoor kohinoor diamond?',
 'how can i increase the speed of my internet connection while using a vpn?']

In [12]:
sentences = sentences[:1000] # for quick experimentation, keep only the first 1000 questions

# Get ELMo Embeddings

In [13]:
# Build the graph op that embeds every sentence with the ELMo module. Nothing is
# evaluated here; the op is run inside a tf.Session in a later cell. The "default"
# entry of the returned dict yields one row per input sentence.
embeddings = embedding(sentences, signature="default", as_dict=True)["default"]

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [14]:
# generate embeddings for each question
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    x = sess.run(embeddings)

In [16]:
# sanity check: one row per sentence, one column per embedding dimension
x.shape

(1000, 1024)

In [18]:
# initiate pca for 50 components
pca = PCA(n_components=50)
y = pca.fit_transform(x)

In [19]:
# use t-sne to compress 50 components into 2
z = TSNE(n_components=2).fit_transform(y)

# Visualize Embeddings In HTML

In [21]:
# NOTE(review): `py` is not referenced in any cell visible here, and `plotly.plotly`
# was moved to the separate `chart_studio` package in plotly 4 - confirm the
# installed plotly version before relying on this import.
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# enable plotly's offline notebook mode; per the plotly docs, connected=True loads
# plotly.js from a CDN instead of embedding it (requires internet access)
init_notebook_mode(connected=True)

In [22]:
# Scatter plot of the 2-D t-SNE projection: one marker per question, the hover
# text is the question itself, and the marker colour encodes the question length
# in characters.
data = [
    go.Scatter(
        x=z[:, 0].tolist(),
        y=z[:, 1].tolist(),
        mode="markers",
        text=sentences,
        marker=dict(
            size=16,
            color=[len(sentence) for sentence in sentences],  # set color equal to a variable
            opacity=0.8,
            colorscale="Viridis",
            showscale=False,
        ),
    )
]

# t-SNE axes carry no intrinsic meaning, so only the zero lines are hidden;
# the title lets the exported figure stand on its own.
layout = dict(
    title="t-SNE projection of ELMo question embeddings",
    yaxis=dict(zeroline=False),
    xaxis=dict(zeroline=False),
)

fig = go.Figure(data=data, layout=layout)

# render the figure to an HTML file using plotly's offline mode
file = plot(fig, filename="Sentence_Embedding.html")

# Semantic Search

In [29]:
results_returned = 3 # show top 3 results


def highlight_query_words(sentence, query_tokens):
    """Return `sentence` as an HTML fragment with the query words in bold.

    A word is wrapped in <b> when, lower-cased and stripped of surrounding
    ".?!" characters, it equals one of `query_tokens`. Every word is preceded
    by a single space, bold or not.
    """
    pieces = []
    for word in sentence.split():
        key = word.lower().strip(".?!")
        if key and key in query_tokens:
            pieces.append(" <b>" + word + "</b>")
        else:
            pieces.append(" " + word)
    return "".join(pieces)


# Build the query-embedding op once and feed every query through a placeholder.
# Calling the hub module inside the loop would add new ops to the graph on every
# query, and opening a new session per query would re-run the initialisers each time.
query_input = tf.placeholder(tf.string, shape=[None])
query_embd = embedding(query_input, signature="default", as_dict=True)["default"]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())

    while True:
        # take input sentence; typing "done" ends the search without running a query
        input_sentence = input("Search Query:").strip()
        if input_sentence.lower() == "done":
            break

        # normalise the query the same way the indexed questions were cleaned,
        # so query and corpus embeddings are computed on comparable text
        query = re.sub("[^a-z0-9.?! ]", "", input_sentence.lower())
        if not query.strip():
            print("Please enter a non-empty query.")
            continue

        search_vect = sess.run(query_embd, feed_dict={query_input: [query]})

        # similarity of the query to every indexed question; the Series index is
        # the position of the question in `sentences` / row of `x`
        cosine_similarities = pd.Series(cosine_similarity(search_vect, x).flatten())

        # whole-word matching for highlighting (a substring test against the raw
        # query would also bold fragments such as "a" or "in")
        query_tokens = {token.strip(".?!") for token in query.split()}
        query_tokens.discard("")

        output = "<h3>Results: </h3>"
        # .items() instead of .iteritems(), which was removed in pandas 2.0
        for idx, _score in cosine_similarities.nlargest(int(results_returned)).items():
            output += "<p style='font-family:verdana; font-size:110%; color:yellow'> "
            output += highlight_query_words(sentences[idx], query_tokens)
            output += "</p><hr>"
        display(HTML(output))

Search Quesry:best android phone
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


KeyboardInterrupt: 