# Setup

In [None]:
from bokeh.io import output_notebook
output_notebook()

# Choosing some Static Word Embeddings

First we need:
    
    * a set of documents to search over (i.e. our corpus)
    * a set of word embeddings to employ for these searches
    
For the latter, we turn to Vectorian's embedding zoo, which offers a number of pretrained word embeddings.

In [None]:
from vectorian.embeddings import Zoo

Zoo.list()

Let's load three embeddings. To make this work in environments with little RAM, we focus on low-dimensional embeddings. We choose the 50-dimensional 6B GloVe embedding (https://nlp.stanford.edu/projects/glove/), a 50-dimensional version of the English Numberbatch embedding (https://github.com/commonsense/conceptnet-numberbatch) and a compressed version of the English fastText embeddings (https://fasttext.cc/).

In [None]:
glove = Zoo.load('glove-6B-50')
numberbatch = Zoo.load('numberbatch-19.08-en-50')
fasttext = Zoo.load('fasttext-en-mini')

# Loading Documents

First load our gold standard that contains our queries.

In [3]:
import json

# Load the gold-standard queries. The encoding is given explicitly so that the
# result does not depend on the platform's default text encoding.
with open("gold.json", "r", encoding="utf-8") as f:
    queries = json.load(f)

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
#nlp = spacy.load("en_core_web_trf")

In [None]:
from vectorian.importers import StringImporter
from vectorian.embeddings import SpacyTransformerEmbedding
from tqdm import tqdm

#tfm_embedding = SpacyTransformerEmbedding(nlp).compressed(10)
tfm_embedding = None

def import_docs():
    """Import every gold-standard match context as a Vectorian document.

    Each match of each entry in ``queries`` is run through a
    ``StringImporter`` built on ``nlp``; the contextual embedding
    ``tfm_embedding`` is handed to the importer only when it is set.

    Returns:
        A list with one imported document per match, in gold-standard order.
    """
    importer_options = {"embeddings": [tfm_embedding]} if tfm_embedding else {}
    importer = StringImporter(nlp, **importer_options)

    # the progress bar advances once per query, not once per match
    return [
        importer(
            match["context"],
            title=match["work"],
            author=match["author"],
            unique_id=match["id"])
        for query in tqdm(queries, desc="Importing")
        for match in query["matches"]
    ]

Finally, we want to keep some additional info about the docs (inside `doc_details` and keyed by the doc's unique id). This helps us with adding nicer visuals and extended info to some of the content further down.

In [None]:
def mk_doc_details():
    """Map each match id to the query and the match record it belongs to.

    Returns:
        A dict keyed by match ``"id"``; each value holds the owning query
        under ``'query'`` and the match itself under ``'match'``. If an id
        occurs more than once, the last occurrence wins.
    """
    return {
        match["id"]: {'query': query, 'match': match}
        for query in queries
        for match in query["matches"]
    }
                                  
doc_details = mk_doc_details()

We are now ready to build a Vectorian session that contains our documents and embeddings.

In [None]:
from vectorian.session import LabSession

# Build the session from the freshly imported gold-standard documents and the
# three static embeddings loaded from the zoo above.
session = LabSession(
    import_docs(),
    embeddings=[glove, numberbatch, fasttext],
    normalizers="default")

Let's take a look at the documents we imported and that now live inside `session`.

In [None]:
from ipywidgets import interact
from IPython.display import display
import ipywidgets as widgets
import string

class DocFormatter:
    """Render a session document as a small HTML snippet.

    The snippet shows the document title, the gold-standard query phrase the
    document belongs to, and the document text with the gold-standard quote
    set in bold. Query and quote are looked up in ``doc_details`` by the
    document's ``unique_id``.
    """

    def __init__(self):
        # ${title}, ${phrase} and ${text} are filled in by __call__
        self._template = string.Template("""
            <div style="margin-left:2em">
                <span style="font-variant:small-caps; font-size: 14pt;">${title}</span>
                <span style="float:right; font-size: 10pt;">query: ${phrase}</span>
                <hr>
                <div style="font-variant:normal; font-size: 10pt;">${text}</div>
            </div>
            """)

    def enhanced_doc_text(self, doc):
        """Return the text of ``doc`` with its gold-standard quote in bold.

        Only the first verbatim occurrence of the quote is wrapped in a bold
        ``<span>``. If the quote does not occur verbatim in the text, the
        text is returned unchanged.
        """
        with doc.text() as text_ref:
            text = text_ref.get()
        quote = doc_details[doc.unique_id]["match"]["quote"]
        try:
            start = text.index(quote)
        except (ValueError, TypeError):
            # best effort: quote absent (or not a string) -> no highlighting
            return text
        end = start + len(quote)
        return ''.join([
            text[:start],
            '<span style="font-weight:bold;">',
            text[start:end],
            '</span>',
            text[end:]
        ])

    def __call__(self, doc):
        """Return the complete HTML snippet for ``doc``."""
        return self._template.substitute(
            phrase=doc_details[doc.unique_id]["query"]["phrase"],
            title=doc.metadata["title"],
            text=self.enhanced_doc_text(doc))

@interact(
    doc_index=widgets.IntSlider(min=1, max=len(session.documents)))
def browse_docs(doc_index):
    """Show the session document selected by the (1-based) slider as HTML."""
    # the slider counts from 1, the document list from 0
    selected = session.documents[doc_index - 1]
    html = DocFormatter()(selected)
    display(widgets.HTML(html))

# Comparing Sentence Embeddings

In a first step, let's look at representing each document with one embedding in order to gather an understanding of how different embedding strategies relate to the nearness of documents. We will later turn to individual token embeddings.

We first prepare additional sentence embeddings using SBERT that we will show in our first big visualization.

In [None]:
from vectorian.embeddings import SentenceBERTEncoder
# Sentence-BERT encoder using the "paraphrase-distilroberta-base-v1" model.
sbert = SentenceBERTEncoder(nlp, "paraphrase-distilroberta-base-v1")

# Precompute the SBERT sentence embeddings for all session documents on the
# "document" partition up front. This takes a moment.
sbert.prepare(session.partition("document"), session.documents)

In the TSNE visualization below, dots are documents and the colors are the query that yields that document in our gold standard. By hovering over dots with the mouse you get details on the document and query the dot represents. Nearby dots of the same color indicate that the embedding tends to cluster documents similar to our gold standard.

You can also add an intruder text by entering a text into the text field and pressing RETURN (to refresh the plot). This will move the larger crossed circle to where the currently selected embedding thinks that the given text should be positioned in terms of the other documents.

In some cases, we can clearly make out clusters visually. For example, in the fastText embedding the blue "to be or not be" documents are clustered nicely. SBERT shows a green cluster of "an old man is twice a child". numberbatch reveals a brown cluster of "llo, ho, ho, my lord".

Finally you can switch between different embeddings using the radio buttons. You can also enable "free_text" and enter custom queries that are not in our gold corpus.

In [None]:
from ipywidgets import interact
from openTSNE import TSNE
import ipywidgets as widgets
import numpy as np

import bokeh.plotting
import bokeh.models
import bokeh.transform
import bokeh.palettes
import bokeh.layouts

from vectorian.embeddings import TokenAveragingEncoder
from vectorian.index import DummyIndex


class EmbeddingPlotter:
    """Scatter plot of one embedding vector per gold-standard document.

    Documents are encoded with a selectable encoder, projected with t-SNE
    and drawn as dots colored by the gold-standard query they belong to.
    An additional "intruder" text is encoded the same way and drawn as a
    crossed circle.

    Attributes:
        partition: the session's "document" partition.
        encoders: dict mapping an embedding name to the encoder used for it;
            callers may register further encoders here.
    """

    def __init__(self):
        self.partition = session.partition("document")

        # one token-averaging encoder per embedding registered in the session
        self.encoders = {}
        for embedding in session.embeddings.keys():
            self.encoders[embedding] = TokenAveragingEncoder(
                nlp, session.embeddings[embedding].factory)

        self._doc_formatter = DocFormatter()

        # hover tooltip; the @fields refer to columns of the plot's data source
        self._tooltips = """
            <span style="font-variant:small-caps">@work</span>
            <span style="float:right;">"@query" (@similarity%)</span>
            <br>
            <hr>
            @context
            """

    @staticmethod
    def _unit_rows(vectors):
        """Return a copy of ``vectors`` with NaNs zeroed and rows scaled to unit length.

        Rows that are entirely zero (for example a text without any known
        token) are left as zero rows instead of being divided by a zero norm,
        which would fill them with NaN.
        """
        v = np.nan_to_num(np.array(vectors))
        norms = np.linalg.norm(v, axis=1, keepdims=True)
        norms[norms == 0] = 1
        return v / norms

    def plot(self, embedding, intruder, show_legend=False):
        """Draw the t-SNE plot for one embedding and one intruder text.

        Args:
            embedding: key into ``self.encoders`` selecting the encoder.
            intruder: text that is encoded and shown as the crossed circle.
            show_legend: whether to show the query legend.
        """
        encoder = self.encoders[embedding]
        intruder_doc = DummyIndex(self.partition).make_query(intruder)

        id_to_doc = {doc.unique_id: doc for doc in session.documents}

        # row 0 of every column is the intruder; gold documents follow
        query_docs = [intruder_doc]
        works = [""]
        phrases = [intruder]
        contexts = [""]

        for q in queries:
            for m in q["matches"]:
                doc = id_to_doc[m["id"]]
                query_docs.append(doc)
                works.append(m["work"])
                phrases.append(q['phrase'])
                contexts.append(self._doc_formatter.enhanced_doc_text(doc))

        v = self._unit_rows(
            encoder.encode(self.partition, query_docs).unmodified)

        # cosine similarity (in percent) of every row to the intruder; rows
        # are unit length, so a dot product suffices. The intruder itself is
        # reported as 100%.
        similarity = np.dot(v, v[0]).astype(float) * 100
        similarity[0] = 100

        tsne = TSNE(
            perplexity=30,
            metric="cosine",
            n_jobs=2,
            random_state=42)

        X = tsne.fit(v)

        p = bokeh.plotting.figure(
            plot_width=900, plot_height=len(queries) * 30,
            title="Sentence Embeddings",
            toolbar_location="below", tools="pan", tooltips=self._tooltips)

        source = bokeh.models.ColumnDataSource({
            'x': X[1:, 0],
            'y': X[1:, 1],
            'work': works[1:],
            'query': phrases[1:],
            'context': contexts[1:],
            'similarity': similarity[1:]
        })

        # Category20 only offers palettes of size 3 to 20; clamp so that very
        # small or very large query sets do not raise a KeyError.
        palette_size = min(max(len(queries), 3), 20)

        p.circle(
            source=source, size=10, legend_field='query',
            color=bokeh.transform.factor_cmap(
                'query',
                palette=bokeh.palettes.Category20[palette_size],
                factors=[q['phrase'] for q in queries]),
            alpha=0.8)

        p.circle_cross(
            source=bokeh.models.ColumnDataSource({
                'x': X[:1, 0],
                'y': X[:1, 1],
                'work': works[:1],
                'query': phrases[:1],
                'context': contexts[:1],
                'similarity': similarity[:1]
            }),
            size=20,
            color="blue",
            line_color="darkblue",
            fill_alpha=0.25)

        if show_legend:
            p.legend.orientation = "vertical"
            p.legend.location = "right"
            p.legend.visible = show_legend
        else:
            p.legend.items = []

        bokeh.io.show(p)


# Create the plotter, then register the precomputed SBERT encoder next to the
# token-averaging encoders that the constructor sets up.
plotter = EmbeddingPlotter()
plotter.encoders["SBERT"] = sbert  # add the SBERT embedding
  
def mk_text_widget(free):
    """Return the widget used to pick a query text.

    Args:
        free: if truthy, a free-text field (pre-filled with "horse") is
            returned; otherwise a dropdown of the gold-standard phrases.
    """
    if free:
        return widgets.Text("horse", continuous_update=False)
    phrases = [q["phrase"] for q in queries]
    return widgets.Dropdown(options=phrases)
    
@interact(free_text=widgets.Checkbox())
def plot_docs(free_text):
    """Show the embedding plot controls; rebuilt whenever ``free_text`` toggles.

    The nested controls select the query text, the embedding and whether the
    legend is shown, and redraw the plot through ``plotter`` on every change.
    """
    def inner(query, embedding, show_legend):
        plotter.plot(embedding, query, show_legend)

    # parameter names of ``inner`` must match the keyword names given here
    return interact(
        inner,
        query=mk_text_widget(free_text),
        embedding=widgets.RadioButtons(options=sorted(plotter.encoders.keys())),
        show_legend=widgets.Checkbox())

Now let's run an actual search using Vectorian.

In [None]:
from vectorian.metrics import PartitionEmbeddingSimilarity, CosineSimilarity

@interact(free_text=widgets.Checkbox())
def plot_docs(free_text):
    """Show the search controls; rebuilt whenever ``free_text`` toggles.

    For the chosen query and embedding, an index over whole-document
    embeddings compared by cosine similarity is built and the top 3 results
    are returned for display.
    """
    def inner(query, embedding):
        encoder = plotter.encoders[embedding]
        sent_sim = PartitionEmbeddingSimilarity(encoder, CosineSimilarity())
        index = session.partition("document").index(sent_sim)
        return index.find(query, n=3)

    # parameter names of ``inner`` must match the keyword names given here
    interact(
        inner,
        query=mk_text_widget(free_text),
        embedding=widgets.RadioButtons(options=sorted(plotter.encoders.keys())))

# Exploring Word Embeddings

We now turn to single word embeddings.

In [None]:
session.word_vec(glove, "hot")

In [None]:
from vectorian.metrics import TokenSimilarity, CosineSimilarity

# Token similarity: cosine similarity over the Numberbatch embedding.
token_sim = TokenSimilarity(
    numberbatch,
    CosineSimilarity()
)

# Similarity of the two tokens "hot" and "cold" under that measure.
session.similarity(token_sim, "hot", "cold")

In [None]:
# Same comparison as before, this time over the GloVe embedding.
token_sim = TokenSimilarity(
    glove,
    CosineSimilarity())

# Similarity of the two tokens "hot" and "cold" under that measure.
session.similarity(token_sim, "hot", "cold")

The following interactive board allows you to search for a custom token inside a document. You can choose different documents by changing `doc_index`. The plot gives you the similarity of the entered token with the tokens in the chosen document under the selected embedding.

Note that out-of-vocabulary words like "fasterer" will produce zero similarities under standard key-value embeddings, whereas fastText is still able to produce a vector thanks to subword information.

In [None]:
from vectorian.metrics import TokenSimilarity, CosineSimilarity
from functools import partial

import numpy as np

import bokeh.plotting
import bokeh.models
import bokeh.transform
import bokeh.palettes

import collections

def plot_token_similarity(session, doc, token_sim, ref_token):
    """Plot how similar each distinct token of ``doc`` is to ``ref_token``.

    Args:
        session: session used to compute token similarities.
        doc: document whose tokens are compared.
        token_sim: token similarity measure passed to ``session.similarity``.
        ref_token: the reference token text.

    Draws a horizontal bar chart with one bar per distinct token text,
    ordered by similarity; negative similarities are clipped to 0.
    """
    partition = session.partition("document")

    # similarity per distinct token text, kept in order of first appearance
    scores = {}
    for span in doc.spans(partition):
        for token in span:
            text = token.text
            if text not in scores:
                scores[text] = max(
                    0, session.similarity(token_sim, text, ref_token))

    tokens = list(scores)
    sims = np.array(list(scores.values()))

    # most similar token first
    order = np.argsort(sims)[::-1]
    data = {
        'token': [tokens[i] for i in order],
        'sim': sims[order],
    }

    color_mapper = bokeh.models.LinearColorMapper(
        palette="Cividis256", low=0, high=1)

    # y_range is reversed so that the most similar token ends up at the top
    p = bokeh.plotting.figure(
        y_range=data['token'][::-1], plot_height=len(data['token']) * 20,
        title=f"Token Similarity for {doc.metadata['title']}",
        toolbar_location=None, tools="")

    p.hbar(
        "token", right="sim",
        source=bokeh.models.ColumnDataSource(data), height=0.5,
        color={'field': 'sim', 'transform': color_mapper})

    p.x_range = bokeh.models.Range1d(0, 1)
    p.ygrid.grid_line_color = None

    bokeh.io.show(p)
    

from ipywidgets import interact
import ipywidgets as widgets

@interact(
    token=widgets.Text(value='high'),
    doc_index=widgets.IntSlider(min=1, max=len(session.documents)),
    embedding=widgets.RadioButtons(options=sorted(session.embeddings.keys())))
def show_tokens(token, doc_index, embedding):
    """Plot the similarity of ``token`` to the tokens of the chosen document.

    ``doc_index`` is the 1-based slider position; ``embedding`` names the
    session embedding whose cosine similarity is used.
    """
    cosine_over_embedding = TokenSimilarity(
        session.embeddings[embedding].factory,
        CosineSimilarity())
    # the slider counts from 1, the document list from 0
    chosen_doc = session.documents[doc_index - 1]

    plot_token_similarity(session, chosen_doc, cosine_over_embedding, token)

# A Search Query using Alignment over Similar Tokens

In [None]:
from vectorian.metrics import TokenSimilarity, CosineSimilarity
from vectorian.metrics import AlignmentSimilarity
from vectorian.alignment import WatermanSmithBeyer, ExponentialGapCost

# Token-level similarity used inside the alignment.
token_sim = TokenSimilarity(
    glove,  # the GloVe embedding we loaded earlier
    CosineSimilarity()  # a standard cosine similarity
)

# Sentence-level similarity: a Waterman-Smith-Beyer alignment over the token
# similarities, configured with an exponential gap cost (5) and zero=0.25.
sent_sim = AlignmentSimilarity(
    token_sim=token_sim,
    alignment=WatermanSmithBeyer(gap=ExponentialGapCost(5), zero=0.25))

# Search index over the session's "document" partition using that similarity.
index = session.partition("document").index(sent_sim, nlp)

In [None]:
queries[0]["phrase"]

In [None]:
index.find(queries[0]["phrase"], n=1)

# Plotting the NDCG over the Corpus

In [None]:
import numpy as np
import sklearn.metrics
import itertools

import bokeh.plotting
import bokeh.models
import bokeh.transform
import bokeh.palettes

# Position of every gold-standard match id in the flattened list of all
# matches (queries in order, matches in order within each query).
match_label = {
    m["id"]: position
    for position, m in enumerate(
        itertools.chain.from_iterable(q["matches"] for q in queries))
}

def measure_ndcg(index, query, k=10):
    """Compute the NDCG@k of ``index`` for one gold-standard query.

    Args:
        index: search index queried through ``index.find``.
        query: gold-standard entry with a ``"phrase"`` and its ``"matches"``.
        k: number of results requested and NDCG cutoff.

    Returns:
        The value of ``sklearn.metrics.ndcg_score`` for this query.
    """
    # ndcg_score takes 2-D arrays; we evaluate a single query, hence one row
    y_true = np.zeros((1, len(session.documents)), dtype=np.float32)
    y_score = np.zeros((1, len(session.documents)), dtype=np.float32)

    for m in query["matches"]:
        y_true[0, match_label[m["id"]]] = 1

    # Give retrieved documents strictly decreasing scores so that the order
    # of the results enters the NDCG. A constant score for every hit would
    # tie them all and hide the ranking.
    # NOTE(review): assumes ``matches`` is ordered best-first - confirm.
    matches = index.find(query["phrase"], n=k).matches
    for rank, m in enumerate(matches):
        y_score[0, match_label[m.doc.unique_id]] = len(matches) - rank

    return sklearn.metrics.ndcg_score(y_true, y_score, k=k)


def plot_ndcg(index):
    """Draw a horizontal bar chart with the NDCG of ``index`` per query.

    The contents of the plot's data source are also printed.
    """
    data = {
        'phrase': [q["phrase"] for q in queries],
        'ndcg': [measure_ndcg(index, q) for q in queries],
    }

    p = bokeh.plotting.figure(
        y_range=data['phrase'], plot_height=20 * len(queries),
        title="NDCG",
        toolbar_location=None, tools="")

    source = bokeh.models.ColumnDataSource(data)

    p.hbar(
        "phrase", right="ndcg",
        source=source, height=0.5)
    print(source.data)

    # NDCG lies in [0, 1]; fix the axis so that plots are comparable
    p.x_range = bokeh.models.Range1d(0, 1)

    bokeh.io.show(p)
    
    
class NDCGPlotter:
    """Per-query NDCG bar chart that can be refreshed in place.

    The chart is shown on construction with a notebook handle; ``update``
    recomputes the NDCG column for another index and pushes the change to
    the already displayed plot.
    """

    def __init__(self, queries, index):
        self._queries = queries
        phrases = [q["phrase"] for q in self._queries]

        self._source = bokeh.models.ColumnDataSource({
            'phrase': phrases,
            'ndcg': self._compute_ndcg(index)
        })

        figure = bokeh.plotting.figure(
            y_range=phrases, plot_height=20 * len(self._queries),
            title="NDCG",
            toolbar_location=None, tools="")
        self._p = figure

        self._hbar = figure.hbar(
            "phrase", right="ndcg",
            source=self._source, height=0.5)

        # NDCG lies in [0, 1]; keep the axis fixed across updates
        figure.x_range = bokeh.models.Range1d(0, 1)

        # the handle is what push_notebook needs to update the shown plot
        self._bokeh_handle = bokeh.io.show(figure, notebook_handle=True)

    def _compute_ndcg(self, index):
        """Return the NDCG of ``index`` for every query, in query order."""
        return [measure_ndcg(index, q) for q in self._queries]

    def update(self, index):
        """Recompute the NDCG values for ``index`` and refresh the plot."""
        self._source.data['ndcg'] = self._compute_ndcg(index)
        bokeh.io.push_notebook(handle=self._bokeh_handle)
    

NDCGPlotter(queries, index);

# Exploring WSB Parameters 1

In [None]:
from ipywidgets import interact
import ipywidgets as widgets

def illustrate_wsb_parameters():
    """Show sliders for the alignment parameters next to a live NDCG chart.

    Two sliders control the gap cutoff of the exponential gap cost and the
    ``zero`` argument of the Waterman-Smith-Beyer alignment. Every change
    rebuilds the index and refreshes the chart.
    """
    gap_cutoff = widgets.FloatSlider(min=1, max=10, step=1, description="gap cutoff")
    zero = widgets.FloatSlider(min=0, max=1, step=0.25, description="zero")
    sliders = [gap_cutoff, zero]

    def create_index():
        # reads the current slider values each time it is called
        alignment = WatermanSmithBeyer(
            gap=ExponentialGapCost(gap_cutoff.value), zero=zero.value)
        sent_sim = AlignmentSimilarity(
            token_sim=token_sim,
            alignment=alignment)
        return session.partition("document").index(sent_sim, nlp)

    display(widgets.VBox(sliders))
    ndcg_chart = NDCGPlotter(queries, create_index())

    def on_value_change(change):
        ndcg_chart.update(create_index())

    for slider in sliders:
        slider.observe(on_value_change, names='value')
    
illustrate_wsb_parameters()

# Exploring WSB Parameters 2

In [None]:
from ipywidgets import interact
import ipywidgets as widgets
from vectorian.metrics import PNormDistance, ModifiedMetric, RadialBasis, Power

def illustrate_metric_parameters():
    """Show sliders for the token metric parameters next to a live NDCG chart.

    The sliders set the ``p`` of the p-norm distance, the ``gamma`` of the
    radial basis and the ``alpha`` of the power modifier. Every change
    rebuilds the index over GloVe and refreshes the chart.
    """
    p = widgets.FloatSlider(min=0.5, max=3, step=0.1, value=2, description="p")
    gamma = widgets.FloatSlider(min=0, max=3, step=0.1, value=1, description="gamma")
    alpha = widgets.FloatSlider(min=0.5, max=2, step=0.1, value=1, description="alpha")
    sliders = [p, gamma, alpha]

    def create_index():
        # reads the current slider values each time it is called
        metric = ModifiedMetric(
            PNormDistance(p.value),
            RadialBasis(gamma.value),
            Power(alpha.value))
        token_sim = TokenSimilarity(glove, metric)

        alignment = WatermanSmithBeyer(gap=ExponentialGapCost(5), zero=0.25)
        sent_sim = AlignmentSimilarity(
            token_sim=token_sim,
            alignment=alignment)

        return session.partition("document").index(sent_sim, nlp)

    display(widgets.VBox(sliders))
    ndcg_chart = NDCGPlotter(queries, create_index())

    def on_value_change(change):
        ndcg_chart.update(create_index())

    for slider in sliders:
        slider.observe(on_value_change, names='value')

illustrate_metric_parameters()

# Focussing in on one Query: Understanding Score and Intrusive Results

In [None]:
from ipywidgets import interact
import ipywidgets as widgets


def illustrate_idea3(query_index):
    """Plot the scores of the top results for one gold-standard query.

    Args:
        query_index: 0-based index into ``queries``.

    Up to 100 results are requested from the global ``index``. Each result
    is drawn as a bar at its rank: green if the document is a gold-standard
    match of the query, red otherwise. Hovering a bar shows the formatted
    document.
    """
    query = queries[query_index]
    n = 100
    # a set gives constant-time membership tests when coloring the bars
    gold_matches = {x["id"] for x in query["matches"]}

    matches = index.find(query["phrase"], n=n).matches

    rank = [str(i) for i in range(1, n + 1)]

    tooltips = """
        @tooltip
    """

    p = bokeh.plotting.figure(
        x_range=rank, plot_width=1000, plot_height=250,
        title=f"Scores for Query '{query['phrase']}'",
        toolbar_location=None, tools="", tooltips=tooltips)

    doc_formatter = DocFormatter()

    # Fewer than n results may come back; all columns of a data source must
    # have the same length, so only as many ranks as there are matches are
    # used.
    source = bokeh.models.ColumnDataSource({
        'rank': rank[:len(matches)],
        'score': [m.score for m in matches],
        'color': [("green" if m.doc.unique_id in gold_matches else "red") for m in matches],
        'tooltip': [doc_formatter(m.prepared_doc) for m in matches]
    })

    p.vbar(
        "rank", top="score", color="color", source=source)

    p.y_range = bokeh.models.Range1d(0, 1)
    p.xaxis.major_label_orientation = np.pi / 2

    bokeh.io.show(p, notebook_handle=True)
    
    
@interact(query_index=widgets.IntSlider(min=1, max=len(queries)))
def illustrate(query_index):
    """Plot the result scores for the query chosen on the (1-based) slider."""
    zero_based_index = query_index - 1
    illustrate_idea3(zero_based_index)