# Playground for Word and Document Embedding Slides
- Stephen W. Thomas
- Used for MMA 865 and MMAI 891

In [2]:
import flair

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [83]:
import pandas as pd

sentences = ['big dog run park',
             'big cat run park',
             'large dog run park',
             'run large cat dog']

vocab = set()
for sent in sentences:
    for w in sent.split():
        vocab.add(w)
print(vocab)

df = pd.DataFrame(columns=vocab)
df = df.reindex(vocab)
df = df.fillna(0)

for sent in sentences:
    words = sent.split()
    for i, w in enumerate(words):
        low = max(0, i-2)
        high = min(len(words)-1, i+2)
        for j in range(low, high+1):
            if i == j:
                continue
            w2 = words[j]
            df.loc[w, w2] = df.loc[w, w2] + 1
            
print(df)
print(df.loc[['cat', 'dog', 'large', 'big', 'run', 'park']])

{'big', 'cat', 'large', 'run', 'park', 'dog'}
       big  cat  large  run  park  dog
big      0    1      0    2     0    1
cat      1    0      1    2     1    1
large    0    1      0    2     0    2
run      2    2      2    0     3    2
park     0    1      0    3     0    2
dog      1    1      2    2     2    0
       big  cat  large  run  park  dog
cat      1    0      1    2     1    1
dog      1    1      2    2     2    0
large    0    1      0    2     0    2
big      0    1      0    2     0    1
run      2    2      2    0     3    2
park     0    1      0    3     0    2


In [6]:
from flair.data import Sentence
from flair.models import SequenceTagger

# make a sentence
sentence = Sentence('Steve love Berlin .')

# load the NER tagger
# other models include "chunk", "pos", "frame"
tagger = SequenceTagger.load('ner')

# run NER over sentence
tagger.predict(sentence)

[Sentence: "Steve love Berlin ." - 4 Tokens]

In [7]:
print(sentence)
print('The following NER tags are found:')

# iterate over entities and print
for entity in sentence.get_spans('ner'):
    print(entity)

Sentence: "Steve love Berlin ." - 4 Tokens
The following NER tags are found:
PER-span [1]: "Steve"
LOC-span [3]: "Berlin"


In [9]:
print(sentence.to_dict(tag_type='ner'))

{'text': 'Steve love Berlin .', 'labels': [], 'entities': [{'text': 'Steve', 'start_pos': 0, 'end_pos': 5, 'type': 'PER', 'confidence': 0.9994787573814392}, {'text': 'Berlin', 'start_pos': 11, 'end_pos': 17, 'type': 'LOC', 'confidence': 0.998204231262207}]}


In [20]:
from flair.embeddings import WordEmbeddings

# GloVe embedding
glove_embedding = WordEmbeddings('glove')

# FastText embeddings over Web crawls
crawl_embedding = WordEmbeddings('en-crawl')

In [25]:
# create sentence.
sentence = Sentence('The grass is green and red .')

# embed a sentence using glove.
glove_embedding.embed(sentence)

# now check out the embedded tokens.
for token in sentence:
    print(token)
    print(token.embedding)

[Sentence: "The grass is green and red ." - 7 Tokens]

Token: 1 The
tensor([-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
        -0.5755,  0.0875,  0.2879, -0.0673,  0.3091, -0.2638, -0.1323, -0.2076,
         0.3340, -0.3385, -0.3174, -0.4834,  0.1464, -0.3730,  0.3458,  0.0520,
         0.4495, -0.4697,  0.0263, -0.5415, -0.1552, -0.1411, -0.0397,  0.2828,
         0.1439,  0.2346, -0.3102,  0.0862,  0.2040,  0.5262,  0.1716, -0.0824,
        -0.7179, -0.4153,  0.2033, -0.1276,  0.4137,  0.5519,  0.5791, -0.3348,
        -0.3656, -0.5486, -0.0629,  0.2658,  0.3020,  0.9977, -0.8048, -3.0243,
         0.0125, -0.3694,  2.2167,  0.7220, -0.2498,  0.9214,  0.0345,  0.4674,
         1.1079, -0.1936, -0.0746,  0.2335, -0.0521, -0.2204,  0.0572, -0.1581,
        -0.3080, -0.4162,  0.3797,  0.1501, -0.5321, -0.2055, -1.2526,  0.0716,
         0.7056,  0.4974, -0.4206,  0.2615, -1.5380, -0.3022, -0.0734, -0.2831,
         0.3710, -0.2522,  0.0162, -0.0171, -0.3898,  0.8742, -0.7257, -0.5106,
        -0.5203, -0.1459,  

In [35]:
from scipy.spatial.distance import cosine

s1 = Sentence('queen')
s2 = Sentence('cheese')

glove_embedding.embed(s1)
glove_embedding.embed(s2)

a = s1[0].embedding
b = s2[0].embedding
cosine(a, b)

[Sentence: "queen" - 1 Tokens]

[Sentence: "cheese" - 1 Tokens]

0.8436118066310883