# A simple implementation of CBOW
Author: Pierre Nugues

The imports

In [1]:
from tensorflow.keras import backend
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda, Average
import regex as re
import os
from tensorflow.keras.utils import to_categorical
import numpy as np
from scipy.spatial.distance import cosine

## Parameters

The embedding size and context size

In [2]:
embedding_dim = 100
w_size = 2
c_size = w_size * 2 + 1

## The Corpus

In [3]:
dataset = 'dickens'  # 'homer' dickens' 'selma' 'big'
colab = False # On my machine or on colab
debug = False

In [4]:
if colab:
    BASE_PATH = '/content/drive/My Drive/Colab Notebooks/'
else:
    BASE_PATH = '../../../'

In [5]:
if colab:
    from google.colab import drive
    drive.mount('/content/drive')

We read the files from a folder

In [6]:
def get_files(dir, suffix):
    """
    Returns all the files in a folder ending with suffix
    :param dir:
    :param suffix:
    :return: the list of file names
    """
    files = []
    for file in os.listdir(dir):
        if file.endswith(suffix):
            files.append(file)
    return files


def load_corpus(path):
    files = get_files(path, 'txt')
    files = [path + file for file in files]
    print(files)
    text = ''
    for file in files:
        text += open(file).read()
    return text

In [7]:
if dataset == 'homer':
    #text = 'Sing, O goddess, the anger of Achilles son of Peleus'.lower()
    text1 = open(BASE_PATH + 'corpus/iliad.mb.txt', encoding='utf-8').read().lower()
    text2 = open(BASE_PATH + 'corpus/odyssey.mb.txt', encoding='utf-8').read().lower()
    text = text1 + text2
    test_words = ['he', 'she', 'ulysses', 'penelope', 'achaeans', 'trojans']
if dataset == 'dickens':
    path = BASE_PATH + 'corpus/Dickens/'
    text = load_corpus(path)
    test_words = ['he', 'she', 'paris', 'london', 'table', 'rare', 'monday', 'sunday', 'man', 'woman', 'king', 'queen', 'boy',
                  'girl']
elif dataset == 'selma':
    path = BASE_PATH + 'corpus/Selma/'
    text = load_corpus(path)
    test_words = ['han', 'hon', 'att', 'bord', 'bordet', 'måndag', 'söndag', 'man', 'kvinna', 'kung', 'drottning',
                  'pojke', 'flicka']
elif dataset == 'big':
    path = BASE_PATH + 'corpus/Dickens/'
    text = load_corpus(path)
    path = BASE_PATH + 'corpus/Norvig/'
    text += load_corpus(path)
    test_words = ['he', 'she', 'paris', 'london', 'table', 'rare', 'monday', 'sunday', 'man', 'woman', 'king', 'queen', 'boy',
                  'girl']   

['../../../corpus/Dickens/Hard Times.txt', '../../../corpus/Dickens/Oliver Twist.txt', '../../../corpus/Dickens/Great Expectations.txt', '../../../corpus/Dickens/The Old Curiosity Shop.txt', '../../../corpus/Dickens/A Tale of Two Cities.txt', '../../../corpus/Dickens/Dombey and Son.txt', '../../../corpus/Dickens/The Pickwick Papers.txt', '../../../corpus/Dickens/Bleak House.txt', '../../../corpus/Dickens/Our Mutual Friend.txt', '../../../corpus/Dickens/The Mystery of Edwin Drood.txt', '../../../corpus/Dickens/Nicholas Nickleby.txt', '../../../corpus/Dickens/David Copperfield.txt', '../../../corpus/Dickens/Little Dorrit.txt', '../../../corpus/Dickens/A Christmas Carol in Prose.txt']


## Processing the Corpus

We set all the text in lowercase

In [8]:
text = text.lower()
words = re.findall('\p{L}+', text)
words[:5]

['hard', 'times', 'and', 'reprinted', 'pieces']

In [9]:
unique_words = sorted(list(set(words)))
unique_words[:10]

['a',
 'aaron',
 'aback',
 'abaft',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abandons',
 'abase']

In [10]:
vocab_size = len(unique_words)
vocab_size

35221

In [11]:
word2idx = {word: i for (i, word) in enumerate(unique_words)}
idx2word = {v: k for k, v in word2idx.items()}
#word2idx

In [12]:
X_words = []
y_words = []
for i in range(len(words) - c_size + 1):
    X_words.append(words[i: i + w_size] + words[i + w_size + 1: i + 2 * w_size + 1])
    y_words.append(words[i + w_size])

In [13]:
X_words[:10]

[['hard', 'times', 'reprinted', 'pieces'],
 ['times', 'and', 'pieces', 'by'],
 ['and', 'reprinted', 'by', 'charles'],
 ['reprinted', 'pieces', 'charles', 'dickens'],
 ['pieces', 'by', 'dickens', 'with'],
 ['by', 'charles', 'with', 'illustrations'],
 ['charles', 'dickens', 'illustrations', 'by'],
 ['dickens', 'with', 'by', 'marcus'],
 ['with', 'illustrations', 'marcus', 'stone'],
 ['illustrations', 'by', 'stone', 'maurice']]

In [14]:
X = np.array([list(map(lambda x: word2idx.get(x), x)) for x in X_words])
X.shape

(3355452, 4)

X[:10]

In [15]:
y_words[:10]

['and',
 'reprinted',
 'pieces',
 'by',
 'charles',
 'dickens',
 'with',
 'illustrations',
 'by',
 'marcus']

In [16]:
y = np.array(list(map(lambda x: word2idx.get(x), y_words)))

## The Architecture

In [19]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=2 * w_size),
    Lambda(lambda x: backend.mean(x, axis=1)),
    Dense(vocab_size, activation='softmax')
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            3522100   
_________________________________________________________________
lambda_1 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 35221)             3557321   
Total params: 7,079,421
Trainable params: 7,079,421
Non-trainable params: 0
_________________________________________________________________


In [20]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')

## Training the Model

In [21]:
model.fit(X, y, batch_size=1024, epochs=4,  validation_split=0.10)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7fd3fbe7b790>

In [22]:
vectors = model.get_weights()[0]

In [23]:
def most_sim_vecs(vector, U, nbr_words=10):
    # Here cosine distance and not cosine
    # distance between equal vectors: 0. max distance: 2
    dist = [cosine(vector, U[i, :]) if np.any(U[i, :]) else 2
            for i in range(U.shape[0])]
    sorted_vectors = sorted(range(len(dist)), key=lambda k: dist[k])
    return sorted_vectors[1:nbr_words + 1]

In [24]:
most_sim_words = {}
for w in test_words:
    most_sim_words[w] = most_sim_vecs(vectors[word2idx[w]], vectors)
    most_sim_words[w] = list(map(idx2word.get, most_sim_words[w]))
    print(w, most_sim_words[w])

he ['she', 'it', 'nobody', 'they', 'nicholas', 'i', 'herbert', 'everybody', 'oliver', 'we']
she ['he', 'nobody', 'they', 'florence', 'it', 'i', 'herbert', 'estella', 'edith', 'bella']
paris ['england', 'london', 'france', 'print', 'india', 'yorkshire', 'newgate', 'yarmouth', 'dover', 'parliament']
london ['paris', 'england', 'india', 'france', 'yorkshire', 'town', 'dover', 'greta', 'canterbury', 'yarmouth']
table ['ground', 'wall', 'counter', 'pavement', 'road', 'hearth', 'box', 'staircase', 'grass', 'carpet']
rare ['terrible', 'special', 'trifling', 'delicious', 'mighty', 'desperate', 'tender', 'sturdy', 'singular', 'sober']
monday ['thursday', 'wednesday', 'sunday', 'tuesday', 'noon', 'betimes', 'saturday', 'tiptoe', 'christmas', 'horseback']
sunday ['saturday', 'monday', 'summer', 'wednesday', 'winter', 'day', 'christmas', 'previous', 'stage', 'betimes']
man ['gentleman', 'woman', 'lady', 'person', 'ooman', 'soldier', 'chap', 'un', 'dog', 'creature']
woman ['man', 'lady', 'gentleman