# N-gram lm

[Pytorch Tutorial by Robert Guthrie](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html)


In [1]:
import tqdm

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f605456d1f0>

In [2]:
# Gensim
!pip install gensim==3.8.3
import gensim.downloader as api

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==3.8.3
  Downloading gensim-3.8.3.tar.gz (23.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gensim
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for gensim (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for gensim[0m[31m
[0m[?25h  Running setup.py clean for gensim
Failed to build gensim
[31mERROR: Could not build wheels for gensim, which is required to install pyproject.toml-based projects[0m[31m

In [3]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O shakespeare.txt
# We will load the file
shakespeare = open("shakespeare.txt").readlines()

# It should contains sentences/paragraphs
shakespeare[0]

--2023-06-22 02:38:09--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘shakespeare.txt’


2023-06-22 02:38:09 (20.1 MB/s) - ‘shakespeare.txt’ saved [1115394/1115394]



'First Citizen:\n'

In [4]:
word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings(make vector)
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519]],
       grad_fn=<EmbeddingBackward0>)


In [6]:
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# we should tokenize the input, but we will ignore that for now

In [7]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

# build a list of tuples.
# Each tuple is ([ word_i-CONTEXT_SIZE, ..., word_i-1 ], target word)
training_ngrams = [
    (
        [test_sentence[i - j - 1] for j in range(CONTEXT_SIZE)],
        test_sentence[i]
    )
    for i in range(CONTEXT_SIZE, len(test_sentence))
]
# Print the first 3, just so you can see what they look like.
print(training_ngrams[:3])





[(['forty', 'When'], 'winters'), (['winters', 'forty'], 'shall'), (['shall', 'winters'], 'besiege')]


In [8]:
print(training_ngrams)

[(['forty', 'When'], 'winters'), (['winters', 'forty'], 'shall'), (['shall', 'winters'], 'besiege'), (['besiege', 'shall'], 'thy'), (['thy', 'besiege'], 'brow,'), (['brow,', 'thy'], 'And'), (['And', 'brow,'], 'dig'), (['dig', 'And'], 'deep'), (['deep', 'dig'], 'trenches'), (['trenches', 'deep'], 'in'), (['in', 'trenches'], 'thy'), (['thy', 'in'], "beauty's"), (["beauty's", 'thy'], 'field,'), (['field,', "beauty's"], 'Thy'), (['Thy', 'field,'], "youth's"), (["youth's", 'Thy'], 'proud'), (['proud', "youth's"], 'livery'), (['livery', 'proud'], 'so'), (['so', 'livery'], 'gazed'), (['gazed', 'so'], 'on'), (['on', 'gazed'], 'now,'), (['now,', 'on'], 'Will'), (['Will', 'now,'], 'be'), (['be', 'Will'], 'a'), (['a', 'be'], "totter'd"), (["totter'd", 'a'], 'weed'), (['weed', "totter'd"], 'of'), (['of', 'weed'], 'small'), (['small', 'of'], 'worth'), (['worth', 'small'], 'held:'), (['held:', 'worth'], 'Then'), (['Then', 'held:'], 'being'), (['being', 'Then'], 'asked,'), (['asked,', 'being'], 'wher

In [9]:
len(training_ngrams)

113

In [10]:
vocab = set(test_sentence)

In [11]:
len(vocab)

97

In [12]:
word_to_ix = {word: i for i, word in enumerate(vocab)}

In [14]:
word_to_ix

{'it': 0,
 'thriftless': 1,
 'make': 2,
 'eyes,': 3,
 "excuse,'": 4,
 'old,': 5,
 'forty': 6,
 'much': 7,
 'count,': 8,
 'his': 9,
 'proud': 10,
 'and': 11,
 'shall': 12,
 "youth's": 13,
 'When': 14,
 'in': 15,
 "deserv'd": 16,
 'see': 17,
 "totter'd": 18,
 'be': 19,
 'winters': 20,
 'held:': 21,
 'where': 22,
 'small': 23,
 'dig': 24,
 'succession': 25,
 'old': 26,
 'own': 27,
 'made': 28,
 'thy': 29,
 "feel'st": 30,
 'on': 31,
 'To': 32,
 'answer': 33,
 'beauty': 34,
 'to': 35,
 'use,': 36,
 'Will': 37,
 'treasure': 38,
 'being': 39,
 'within': 40,
 'If': 41,
 'brow,': 42,
 'And': 43,
 'gazed': 44,
 'livery': 45,
 "'This": 46,
 'Thy': 47,
 'my': 48,
 'by': 49,
 'days;': 50,
 'trenches': 51,
 'asked,': 52,
 'were': 53,
 'praise.': 54,
 'the': 55,
 'Were': 56,
 'an': 57,
 'Where': 58,
 'thine': 59,
 'of': 60,
 'worth': 61,
 'Proving': 62,
 'say,': 63,
 'How': 64,
 'cold.': 65,
 'couldst': 66,
 'This': 67,
 'thou': 68,
 'warm': 69,
 'art': 70,
 'sum': 71,
 'thine!': 72,
 'mine': 73,
 'S

In [15]:



class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
model = model.cuda()
#optimizer = optim.SGD(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in tqdm.tqdm(range(10)):
    total_loss = 0
    for context, target in tqdm.tqdm(training_ngrams, position=0, leave=True):

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long).cuda()

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long).cuda())

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

# To get the embedding of a particular word, e.g. "beauty"
print(model.embeddings.weight[word_to_ix["beauty"]])

100%|██████████| 113/113 [00:00<00:00, 174.54it/s]
100%|██████████| 113/113 [00:00<00:00, 823.14it/s]
100%|██████████| 113/113 [00:00<00:00, 815.80it/s]
100%|██████████| 113/113 [00:00<00:00, 800.82it/s]
100%|██████████| 113/113 [00:00<00:00, 808.10it/s]
100%|██████████| 113/113 [00:00<00:00, 835.53it/s]
100%|██████████| 113/113 [00:00<00:00, 777.99it/s]
100%|██████████| 113/113 [00:00<00:00, 831.30it/s]
100%|██████████| 113/113 [00:00<00:00, 799.60it/s]
100%|██████████| 113/113 [00:00<00:00, 792.47it/s]
100%|██████████| 10/10 [00:01<00:00,  5.04it/s]


[537.1345686912537, 410.14361119270325, 295.2904963493347, 182.25382804870605, 99.57522623240948, 54.96700619161129, 33.62207396328449, 23.01584132015705, 17.198101000860333, 13.737301269546151]
tensor([-0.5540, -0.1077,  1.9154, -0.5362, -2.8223, -0.2963,  0.2188,  1.2387,
        -0.6733,  0.0596], device='cuda:0', grad_fn=<SelectBackward0>)


In [16]:

def predict(context):
  model.eval()
  context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long).cuda()
  log_probs = model(context_idxs)
  predict_label = log_probs.argmax(1)

  #print(list(vocab)[predict_label])
  return list(vocab)[predict_label]


In [17]:
import pdb
for ngram in training_ngrams[:100]:
  print(f"Text: {ngram[0][::-1]} Label: {ngram[1]}, Predicted: {predict(ngram[0])}")
  #pdb.set_trace()


Text: ['When', 'forty'] Label: winters, Predicted: winters
Text: ['forty', 'winters'] Label: shall, Predicted: shall
Text: ['winters', 'shall'] Label: besiege, Predicted: besiege
Text: ['shall', 'besiege'] Label: thy, Predicted: thy
Text: ['besiege', 'thy'] Label: brow,, Predicted: brow,
Text: ['thy', 'brow,'] Label: And, Predicted: And
Text: ['brow,', 'And'] Label: dig, Predicted: dig
Text: ['And', 'dig'] Label: deep, Predicted: deep
Text: ['dig', 'deep'] Label: trenches, Predicted: trenches
Text: ['deep', 'trenches'] Label: in, Predicted: in
Text: ['trenches', 'in'] Label: thy, Predicted: thy
Text: ['in', 'thy'] Label: beauty's, Predicted: beauty's
Text: ['thy', "beauty's"] Label: field,, Predicted: use,
Text: ["beauty's", 'field,'] Label: Thy, Predicted: Thy
Text: ['field,', 'Thy'] Label: youth's, Predicted: youth's
Text: ['Thy', "youth's"] Label: proud, Predicted: proud
Text: ["youth's", 'proud'] Label: livery, Predicted: livery
Text: ['proud', 'livery'] Label: so, Predicted: so
Te

In [18]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 300

class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [19]:
N_EPOCHS = 1


losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
model = model.cuda()
#optimizer = optim.SGD(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.1001)

for epoch in tqdm.tqdm(range(N_EPOCHS)):
    total_loss = 0
    for context, target in tqdm.tqdm(training_ngrams, position=0, leave=True):

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long).cuda()

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long).cuda())

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

# To get the embedding of a particular word, e.g. "beauty"
print(model.embeddings.weight[word_to_ix["beauty"]])


def predict(context):
  model.eval()
  context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long).cuda()
  log_probs = model(context_idxs)
  predict_label = log_probs.argmax(1)

  #print(list(vocab)[predict_label])
  return list(vocab)[predict_label]


100%|██████████| 113/113 [00:00<00:00, 792.68it/s]
100%|██████████| 1/1 [00:00<00:00,  6.69it/s]

[20036.717375340522]
tensor([-0.0908, -0.8087, -1.6637, -0.4504,  1.8161,  0.0564,  0.0661, -1.7400,
        -0.8174, -1.8513,  0.7279,  0.9043,  2.0475, -0.7372, -0.8491, -2.5825,
         1.1250, -0.1675, -0.4100,  0.7948, -1.0720, -1.9118, -0.5851,  1.6556,
        -0.5286,  2.1465, -0.5143,  1.1616, -1.1806, -0.7563,  0.7139,  0.3931,
         0.8602,  0.1276, -1.2815,  2.7407,  0.6991, -0.9759, -0.6220,  1.8017,
        -0.7682,  0.0259,  2.3339, -0.0085, -0.4952, -1.1261,  1.0355, -0.1000,
        -0.3299,  2.5545,  0.7274,  0.8049, -1.1641, -1.0855,  0.8145,  0.5730,
         0.8725,  0.2860,  0.2424,  1.5387,  2.9208,  1.5865, -0.6216,  1.3862,
        -0.2063, -0.3378,  1.1560, -1.1425,  0.2372, -1.6971,  0.9343,  0.6855,
         0.5485,  1.8851,  1.1767,  0.2915,  1.2792,  1.4387,  2.3299,  1.1570,
        -1.6389, -1.1059,  0.9326,  1.2882,  0.6588, -0.2751, -2.5688, -0.5891,
         0.0600, -1.0924, -2.1298,  1.2279, -2.8923, -0.6737,  0.2317, -0.6198,
         0.8297, -1




In [None]:
for ngram in test_ngrams[:100]:
  print(f"Text: {ngram[0][::-1]} Label: {ngram[1]} , Predicted: {predict(ngram[0])}")

In [21]:
import gensim.downloader as api
glove_model = api.load("glove-twitter-25")
#glove_model.most_similar(positive=['fruit', 'flower'], topn=1)



In [22]:
glove_model['flower'].shape

(25,)

In [23]:
len(glove_model)

1193514

In [24]:
# Lets create the ngram function
def sent2ngram(sentence, context_size):
  ngrams = [
      (
          [sentence[i - j - 1] for j in range(context_size)],
          sentence[i]
      )
      for i in range(context_size, len(sentence))
  ]
  return ngrams

sent2ngram(shakespeare[1].split(), 2)

[(['we', 'Before'], 'proceed'),
 (['proceed', 'we'], 'any'),
 (['any', 'proceed'], 'further,'),
 (['further,', 'any'], 'hear'),
 (['hear', 'further,'], 'me'),
 (['me', 'hear'], 'speak.')]

In [25]:
import nltk
nltk.download('punkt')
nltk.word_tokenize(shakespeare[0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['First', 'Citizen', ':']

In [26]:
training_ngrams = []
test_ngrams = []

context_size = 4

vocab = set()
for idx, line in enumerate(shakespeare):
  #print(line)
  words = nltk.word_tokenize(line)
  words = [w.lower() for w in words]
  vocab.update(words) # we'll just add train and test to the vocab

  if idx < 3600:
    training_ngrams.extend(sent2ngram(words, context_size))
  else:
    test_ngrams.extend(sent2ngram(words, context_size))
#vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {v:k for k,v in word_to_ix.items()}

print(f"{len(training_ngrams)} train examples")
print(f"{len(test_ngrams)} test examples")

10475 train examples
125796 test examples


In [29]:
CONTEXT_SIZE = 4
# EMBEDDING_DIM = 300
EMBEDDING_DIM = 25
MODEL_DIM = 1024

class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, MODEL_DIM)
        self.linear2 = nn.Linear(MODEL_DIM, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [30]:
def init_embeddings(model, glove_model, vocab):
  n = 0
  for word in vocab:
    #print(word)
    if word in glove_model:
      n += 1
      model.embeddings.weight.data[word_to_ix[word]] = torch.cuda.FloatTensor(glove_model[word])
      #print(word)
  print(f"{n} embeddings initialized, out of {len(vocab)} vocab")


In [31]:
# N_EPOCHS = 10
N_EPOCHS = 3


losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
model = model.cuda()

init_embeddings(model, glove_model, vocab)

#optimizer = optim.SGD(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in tqdm.tqdm(range(N_EPOCHS)):
    total_loss = 0
    for context, target in tqdm.tqdm(training_ngrams, position=0, leave=True):

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long).cuda()

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long).cuda())

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

# To get the embedding of a particular word, e.g. "beauty"
print(model.embeddings.weight[word_to_ix["beauty"]])


def predict(context):
  model.eval()
  context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long).cuda()
  log_probs = model(context_idxs)
  predict_label = log_probs.argmax(1)

  #print(list(vocab)[predict_label])
  return list(vocab)[predict_label]


  model.embeddings.weight.data[word_to_ix[word]] = torch.cuda.FloatTensor(glove_model[word])


9335 embeddings initialized, out of 12338 vocab


100%|██████████| 10475/10475 [01:03<00:00, 165.67it/s]
100%|██████████| 10475/10475 [01:03<00:00, 164.62it/s]
100%|██████████| 10475/10475 [01:03<00:00, 164.75it/s]
100%|██████████| 3/3 [03:10<00:00, 63.49s/it]

[67093.8152495788, 56293.71774948554, 50868.78137513936]
tensor([-1.1144, -0.7477, -0.6001,  0.3623,  0.9579, -0.1059,  1.3109, -0.0423,
        -0.0942, -0.5713, -0.0470,  0.3227, -3.5531, -0.1659, -0.7028,  0.2251,
         0.3579,  0.1432,  0.3692,  0.7876, -0.8665, -0.3546, -0.9315,  0.2978,
        -0.1700], device='cuda:0', grad_fn=<SelectBackward0>)





In [None]:
for ngram in test_ngrams[:100]:
  print(f"Text: {ngram[0][::-1]} Label: {ngram[1]}, Predicted: {predict(ngram[0])}")

In [33]:
import numpy as np

# Get the interactive Tools for Matplotlib
%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.decomposition import PCA

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [34]:
import gensim.downloader as api
# https://github.com/RaRe-Technologies/gensim-data
glove_model = api.load("glove-wiki-gigaword-300")



In [35]:
def analogy(x1, x2, y1):
    """
    The most_similar function finds the top-N most similar keys.
    Positive keys contribute positively towards the similarity, negative keys negatively.

    This method computes cosine similarity between a simple mean of the projection weight vectors of the given keys and the vectors for each key in the model.
    The method corresponds to the word-analogy and distance scripts in the original word2vec implementation.
    """
    result = glove_model.most_similar(positive=[y1, x2], negative=[x1])
    return result[0][0]

In [36]:
analogy('japan', 'japanese', 'korea')


'korean'

In [37]:
analogy('king', 'man', 'queen') # king -> man, queen -> ?


'woman'

In [40]:
analogy('paltry', 'significant', 'banal')

'important'

In [41]:
analogy('opulent', 'wealth', 'powerful')

'strong'

In [42]:
analogy('regatta', 'oarsman', 'marathon') # failure cases

'swimmer'

In [43]:
analogy('diamond', 'baseball', 'court') # failure cases - we want baseball diamond and squash court. word vectors are limited in dealing with multiple semantics

'appeals'

In [44]:
analogy('one', 'ten', 'two',) # semantic vectors often have imprecise representation of numbers

'eleven'

In [45]:
def display_pca_scatterplot(model, words=None, sample=0):
    if words == None:
        if sample > 0:
            words = np.random.choice(list(model.vocab.keys()), sample)
        else:
            words = [ word for word in model.vocab ]

    word_vectors = np.array([model[w] for w in words])

    twodim = PCA().fit_transform(word_vectors)[:,:2]

    plt.figure(figsize=(6,6))
    plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
    for word, (x,y) in zip(words, twodim):
        plt.text(x+0.05, y+0.05, word)

In [46]:
display_pca_scatterplot(glove_model,
                        ['coffee', 'tea', 'beer', 'wine', 'brandy', 'rum', 'champagne', 'water',
                         'spaghetti', 'borscht', 'hamburger', 'pizza', 'falafel', 'sushi', 'meatballs',
                         'dog', 'horse', 'cat', 'monkey', 'parrot', 'koala', 'lizard',
                         'frog', 'toad', 'monkey', 'ape', 'kangaroo', 'wombat', 'wolf',
                         'france', 'germany', 'hungary', 'luxembourg', 'australia', 'fiji', 'china',
                         'homework', 'assignment', 'problem', 'exam', 'test', 'class',
                         'school', 'college', 'university', 'institute'])

<IPython.core.display.Javascript object>

In [47]:
display_pca_scatterplot(glove_model,
                        ['woman', 'queen', 'man', 'king', 'girl', 'boy','prince', 'princess']             )

<IPython.core.display.Javascript object>

In [48]:
display_pca_scatterplot(glove_model, sample=20)

AttributeError: ignored