In [1]:
from collections import Counter
import numpy as np
import torch
import spacy
from sklearn.model_selection import train_test_split

In [3]:
# Prepare the text using the spaCy English pipeline (see https://spacy.io/models/en).
# We'll use it to lemmatize the text and to determine which part of speech each token is.
# Lemmatizing reduces each word to its 'root' form - e.g. holds -> hold; rubs -> rub.
# The part of speech indicates whether the token is a verb, noun, punctuation, space and so on.
# make sure that the text sent to spacy doesn't end with a period immediately followed by a newline,
# instead, make sure there is a space between the period and the newline, so that the period 
# is correctly identified as punctuation.

def prepare_texts(text):
    """Lemmatize ``text`` with spaCy and build a frequency-ranked vocabulary.

    Parameters
    ----------
    text : str
        Raw corpus text, passed as-is to the ``en_core_web_sm`` pipeline.

    Returns
    -------
    lemmas : list of str
        Lemma of every token in reading order, skipping tokens whose part of
        speech is ``PUNCT`` or ``SPACE``.
    v2i : dict
        Lemma -> index; index 0 is the most frequent lemma.
    i2v : dict
        Index -> lemma (the inverse mapping of ``v2i``).

    Side effect: prints the list of ``(lemma, count)`` pairs ordered by
    decreasing count.
    """
    pipeline = spacy.load("en_core_web_sm")
    skipped_pos = ["PUNCT", "SPACE"]

    # Keep the lemma of every token that is neither punctuation nor whitespace.
    lemmas = []
    for token in pipeline(text):
        if token.pos_ in skipped_pos:
            continue
        lemmas.append(token.lemma_)

    # most_common() orders by decreasing count; equal counts stay in first-seen order.
    vocab = Counter(lemmas).most_common()
    print(vocab)

    # Rank in the frequency-sorted list doubles as the word's index.
    v2i, i2v = {}, {}
    for index, (word, _count) in enumerate(vocab):
        v2i[word] = index
        i2v[index] = word

    return lemmas, v2i, i2v

In [13]:
# read in the text file, and prepare the text for use in the model
import os
# Read the corpus inside a context manager so the file handle is closed right away.
with open('SmallSimpleCorpus.txt') as corpus_file:
    text = corpus_file.read()
# prepare_texts returns (lemmas, word->index, index->word), bound here to a, b, c.
a,b,c = prepare_texts(text)
print(c)
print(a)

[('and', 160), ('hold', 128), ('dog', 128), ('cat', 128), ('rub', 128), ('a', 104), ('the', 104), ('can', 104), ('she', 96), ('he', 96), ('I', 80)]
{0: 'and', 1: 'hold', 2: 'dog', 3: 'cat', 4: 'rub', 5: 'a', 6: 'the', 7: 'can', 8: 'she', 9: 'he', 10: 'I'}
['I', 'hold', 'a', 'dog', 'I', 'hold', 'the', 'dog', 'I', 'hold', 'a', 'cat', 'I', 'hold', 'the', 'cat', 'I', 'rub', 'a', 'dog', 'I', 'rub', 'the', 'dog', 'I', 'rub', 'a', 'cat', 'I', 'rub', 'the', 'cat', 'I', 'hold', 'and', 'rub', 'a', 'dog', 'I', 'hold', 'and', 'rub', 'the', 'dog', 'I', 'hold', 'and', 'rub', 'a', 'cat', 'I', 'hold', 'and', 'rub', 'the', 'cat', 'I', 'hold', 'a', 'dog', 'and', 'cat', 'I', 'hold', 'the', 'dog', 'and', 'cat', 'I', 'hold', 'a', 'cat', 'and', 'dog', 'I', 'hold', 'the', 'cat', 'and', 'dog', 'I', 'rub', 'a', 'dog', 'and', 'cat', 'I', 'rub', 'the', 'dog', 'and', 'cat', 'I', 'rub', 'a', 'cat', 'and', 'dog', 'I', 'rub', 'the', 'cat', 'and', 'dog', 'I', 'hold', 'and', 'rub', 'a', 'dog', 'I', 'hold', 'and', 'rub

#### The following function walks through each word and looks at a window (of size 'window') of surrounding words. It creates input/output prediction pairs, predicting each of the words surrounding the current word from the current word. So here we say that we are 'predicting the context' from the word.

In [16]:
def tokenize_and_preprocess_text(textlist, v2i, window=3):
    """Build skip-gram training pairs (centre word -> one context word).

    The text is split into sentences on ``"."``; each sentence is lemmatized
    with spaCy's ``en_core_web_sm`` pipeline (tokens tagged ``PUNCT`` or
    ``SPACE`` are dropped), and every lemma is paired with each neighbour that
    lies within the window inside the same sentence.

    Parameters
    ----------
    textlist : str
        Raw corpus text (despite the name it must be a single string, because
        it is split with ``str.split``).
    v2i : dict
        Lemma -> index mapping; every lemma that takes part in a pair must be
        a key.
    window : int, default 3
        Total window size including the centre word. ``(window - 1) // 2``
        neighbours are taken on each side, so an even value behaves like the
        next smaller odd value.

    Returns
    -------
    X, Y : list of int
        Parallel lists: ``X[k]`` is the index of a centre word and ``Y[k]`` the
        index of one of its context words.

    Raises
    ------
    KeyError
        If a lemma used in a pair is missing from ``v2i``.
    """
    nlp = spacy.load("en_core_web_sm")
    half = (window - 1) // 2  # neighbours taken on each side of the centre word

    X, Y = [], []
    for sentence in textlist.split("."):
        if not sentence.strip():
            continue  # nothing to pair up (e.g. the empty tail after the final period)
        lemmas = [tok.lemma_ for tok in nlp(sentence) if tok.pos_ not in ["PUNCT", "SPACE"]]
        # Walk by position rather than lemmas.index(word): index() always returns the
        # first occurrence, which yields the wrong context for repeated words.
        for pos, word in enumerate(lemmas):
            first = max(0, pos - half)
            last = min(len(lemmas), pos + half + 1)
            for ctx_pos in range(first, last):
                if ctx_pos != pos:
                    X.append(v2i[word])
                    Y.append(v2i[lemmas[ctx_pos]])

    return X, Y

In [33]:
# Read the corpus inside a context manager so the file handle is closed right away.
with open('SmallSimpleCorpus.txt') as corpus_file:
    text = corpus_file.read()
a,v2i, i2v = prepare_texts(text)

# X holds centre-word indices, Y the matching context-word indices.
X, Y = tokenize_and_preprocess_text(text, v2i)

print(X[:20])
print(Y[:20])


[('and', 160), ('hold', 128), ('dog', 128), ('cat', 128), ('rub', 128), ('a', 104), ('the', 104), ('can', 104), ('she', 96), ('he', 96), ('I', 80)]
[10, 1, 1, 5, 5, 2, 10, 1, 1, 6, 6, 2, 10, 1, 1, 5, 5, 3, 10, 1]
[1, 10, 5, 1, 2, 5, 1, 10, 6, 1, 2, 6, 1, 10, 5, 1, 3, 5, 1, 10]


In [26]:
# Translate the index lists back into words: X first, then Y on the next line.
print([i2v[idx] for idx in X], [i2v[idx] for idx in Y], sep="\n")

['I', 'hold', 'hold', 'a', 'a', 'dog', 'I', 'hold', 'hold', 'the', 'the', 'dog', 'I', 'hold', 'hold', 'a', 'a', 'cat', 'I', 'hold', 'hold', 'the', 'the', 'cat', 'I', 'rub', 'rub', 'a', 'a', 'dog', 'I', 'rub', 'rub', 'the', 'the', 'dog', 'I', 'rub', 'rub', 'a', 'a', 'cat', 'I', 'rub', 'rub', 'the', 'the', 'cat', 'I', 'hold', 'hold', 'and', 'and', 'rub', 'rub', 'a', 'a', 'dog', 'I', 'hold', 'hold', 'and', 'and', 'rub', 'rub', 'the', 'the', 'dog', 'I', 'hold', 'hold', 'and', 'and', 'rub', 'rub', 'a', 'a', 'cat', 'I', 'hold', 'hold', 'and', 'and', 'rub', 'rub', 'the', 'the', 'cat', 'I', 'hold', 'hold', 'a', 'a', 'dog', 'dog', 'and', 'and', 'cat', 'I', 'hold', 'hold', 'the', 'the', 'dog', 'dog', 'and', 'and', 'cat', 'I', 'hold', 'hold', 'a', 'a', 'cat', 'cat', 'and', 'and', 'dog', 'I', 'hold', 'hold', 'the', 'the', 'cat', 'cat', 'and', 'and', 'dog', 'I', 'rub', 'rub', 'a', 'a', 'dog', 'dog', 'and', 'and', 'cat', 'I', 'rub', 'rub', 'the', 'the', 'dog', 'dog', 'and', 'and', 'cat', 'I', 'rub',

In [2]:
# Scratch check of len() on a literal list; nothing else in the visible notebook uses the result.
len([1,2,3])

3

## Define Model that will be trained to produce word vectors

In [None]:
class Word2vecModel(torch.nn.Module):
    """Skip-gram model: one trainable vector per word plus a linear predictor.

    Attributes
    ----------
    embedding : torch.nn.Parameter, shape (vocab_size, embedding_size)
        The word vectors. Kept as a plain parameter tensor (rather than an
        ``nn.Embedding`` module) because the notebook later reads
        ``network.embedding`` and calls ``.detach().numpy()`` on it.
    prediction : torch.nn.Linear
        Maps an embedding to one logit per vocabulary word.
    """

    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        # initialize word vectors to random numbers
        self.embedding = torch.nn.Parameter(torch.randn(vocab_size, embedding_size))

        # prediction function takes embedding as input, and predicts which word in vocabulary as output
        self.prediction = torch.nn.Linear(embedding_size, vocab_size)

    def forward(self, x):
        """
        x: torch.tensor of shape (bsz), bsz is the batch size; holds integer word indices

        Returns (logits, e):
          logits: tensor of shape (bsz, vocab_size), unnormalised scores per vocabulary word
          e:      tensor of shape (bsz, embedding_size), the embeddings looked up for x
        """
        e = self.embedding[x]        # row lookup: one vector per index in the batch
        logits = self.prediction(e)
        return logits, e

#### The training function - give it the text and it does the rest

In [None]:
def train_word2vec(textlist, window=3, embedding_size=2, epochs=50, batch_size=4, lr=1e-3, seed=0):
    """Train a skip-gram Word2vecModel (predict context from word) on a corpus.

    Parameters
    ----------
    textlist : str or list of str
        The corpus. A list is joined with single spaces into one string before
        processing, since the preprocessing helpers split a single string.
    window : int, default 3
        Context window size forwarded to ``tokenize_and_preprocess_text``.
    embedding_size : int, default 2
        Dimension of each word vector.
    epochs : int, default 50
        Number of passes over the training split.
    batch_size : int, default 4
        Number of (word, context) pairs per optimisation step.
    lr : float, default 1e-3
        Learning rate for the Adam optimiser.
    seed : int, default 0
        Seeds the train/validation split and, via ``torch.manual_seed``,
        PyTorch's global random generator (weight init and batch shuffling).

    Returns
    -------
    network : Word2vecModel
        The trained model.

    Raises
    ------
    ValueError
        If no (word, context) pair can be built from the text.

    Side effects: ``prepare_texts`` prints the vocabulary counts, and one line
    with the training and validation loss is printed per epoch.
    """
    torch.manual_seed(seed)
    text = textlist if isinstance(textlist, str) else " ".join(textlist)

    # Create the training data
    _, v2i, _ = prepare_texts(text)
    X, Y = tokenize_and_preprocess_text(text, v2i, window)
    if not X:
        raise ValueError("no (word, context) pairs could be built from the text")

    # Split the training data (80% train / 20% validation)
    X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)
    X_train = torch.tensor(X_train, dtype=torch.long)
    Y_train = torch.tensor(Y_train, dtype=torch.long)
    X_val = torch.tensor(X_val, dtype=torch.long)
    Y_val = torch.tensor(Y_val, dtype=torch.long)

    # instantiate the network & set up the optimizer
    network = Word2vecModel(len(v2i), embedding_size)
    optimizer = torch.optim.Adam(network.parameters(), lr=lr)
    loss_fn = torch.nn.CrossEntropyLoss()  # takes raw logits and class-index targets

    # training loop
    for epoch in range(epochs):
        network.train()
        order = torch.randperm(len(X_train))  # fresh shuffle every epoch
        running_loss = 0.0
        for start in range(0, len(order), batch_size):
            batch = order[start:start + batch_size]
            optimizer.zero_grad()
            logits, _ = network(X_train[batch])
            loss = loss_fn(logits, Y_train[batch])
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * len(batch)  # undo the per-batch mean

        network.eval()
        with torch.no_grad():
            val_logits, _ = network(X_val)
            val_loss = loss_fn(val_logits, Y_val).item()
        print(f"epoch {epoch + 1}/{epochs}: train loss {running_loss / len(X_train):.4f}, val loss {val_loss:.4f}")

    return network

### Run Training and retrieve embedding

In [None]:
# Pass the raw corpus string `text` read in an earlier cell (no `lemmas` variable exists at
# notebook scope). window=3 matches the default of tokenize_and_preprocess_text, and
# embedding_size=2 is what visualize_embedding requires.
network = train_word2vec(text, window=3, embedding_size=2)
embedding = network.embedding

#### Evaluate some properties of the word embedding

In [None]:
def visualize_embedding(embedding, most_frequent_from=0, most_frequent_to=40):
    """Visualize a 2-d word embedding (the plotting itself is still a TO DO).

    As written, the function only validates its input: it raises
    AssertionError unless ``embedding.shape[1] == 2``.

    embedding: object exposing ``.shape``; the call below passes the result of
        ``.detach().numpy()``.
    most_frequent_from, most_frequent_to: not used by the code yet; presumably
        the range of frequency ranks of the words to plot (TODO confirm).
    """
    assert embedding.shape[1] == 2, "This only supports visualizing 2-d embeddings!"
    
    # TO DO
    
visualize_embedding(embedding.detach().numpy(), most_frequent_from=0, most_frequent_to=11)