In [47]:
%pip install seaborn

Collecting seaborn
  Downloading seaborn-0.12.1-py3-none-any.whl (288 kB)
[K     |████████████████████████████████| 288 kB 1.3 MB/s eta 0:00:01
Installing collected packages: seaborn
Successfully installed seaborn-0.12.1
Note: you may need to restart the kernel to use updated packages.


In [31]:
import numpy as np
import json
import re

np.random.seed(1)

## Read in text data for training

In [32]:
# Read one JSON record per line
def read_jsonl(f):
  f = open(f)
  res = []
  for line in f:
    res.append(json.loads(line))
  f.close()
  return res

data = read_jsonl('dev.json') + read_jsonl('test.json') + read_jsonl('train.json')
data = [line['text'] for line in data]
doc_count = len(data)
doc_count

2000

## Clean data

In [33]:
def tokenize(text):
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text)

tokens_by_doc = [tokenize(doc) for doc in data[:1]]

In [105]:
tokens_by_doc[0]

['under',
 'any',
 'other',
 'circumstances',
 'i',
 'would',
 'not',
 'be',
 'discussing',
 'the',
 'ending',
 'of',
 'a',
 'film',
 'to',
 'the',
 'extent',
 'that',
 'i',
 'will',
 'in',
 'this',
 'particular',
 'review',
 'however',
 'in',
 'order',
 'to',
 'fully',
 'explain',
 'exactly',
 'how',
 'and',
 'why',
 'this',
 'movie',
 'is',
 'so',
 'awful',
 'a',
 'minute',
 'dissection',
 'of',
 'the',
 'ending',
 'is',
 'necessary',
 'even',
 'though',
 'i',
 'will',
 'not',
 'reveal',
 'the',
 'details',
 'of',
 'the',
 'last',
 'scenes',
 'do',
 'proceed',
 'at',
 'your',
 'own',
 'risk',
 'the',
 'movie',
 'opens',
 'quite',
 'poorly',
 'i',
 'might',
 'add',
 'as',
 'child',
 'psychologist',
 'malcolm',
 'crowe',
 'bruce',
 'willis',
 'looking',
 'like',
 'he',
 'was',
 'dragged',
 'out',
 'of',
 'his',
 'trailer',
 'at',
 'the',
 'wee',
 'hours',
 'of',
 'the',
 'morning',
 'to',
 'shoot',
 'each',
 'scene',
 'and',
 'his',
 'wife',
 'are',
 'intruded',
 'upon',
 'by',
 'one',

In [34]:
# want to treat the tokens as one long list, rather than a list of tokens per document
# to do this, will insert a special character in between each document to stop the training there
# TLDR insert a stop token between each document to avoid using next/prev document as context
SPECIAL_STOP_TOKEN = '*!!!*'

def combine_tokens():
  ret = []
  for token_list in tokens_by_doc:
    ret.extend(token_list)
    ret.append(SPECIAL_STOP_TOKEN)

  return ret

tokens_with_stops = combine_tokens()

# should equal number of documents (2000)
len([token for token in tokens_with_stops if token == SPECIAL_STOP_TOKEN])

1

## Create vocabulary maps

In [35]:
token_to_id = {}
id_to_token = {}

def make_vocabulary(tokens):
  for token in tokens:
    if token not in token_to_id:
      id_no = len(token_to_id) + 1
      token_to_id[token] = id_no
      id_to_token[id_no] = token

for doc in tokens_by_doc:
  make_vocabulary(doc)

token_to_id['the'], id_to_token[10]

(10, 'the')

## Create dataset

In [59]:
def create_one_hot(token_id):
  vector = np.zeros(len(token_to_id) + 1)
  vector[token_id] = 1
  return vector

def yield_range(*ranges):
  for iterable in ranges:
    yield from iterable

def create_dataset(window_size=2):
  X = []
  y = []
  
  # note that this includes stop tokens, which will need to be skipped
  token_count = len(tokens_with_stops)

  for i in range(token_count):
    if tokens_with_stops[i] == SPECIAL_STOP_TOKEN:
      continue

    indexes = yield_range(range(max(0, i - window_size), i), range(i, min(token_count, i + window_size + 1)))
    for j in indexes:
      if j == i:
        continue
      
      if tokens_with_stops[j] == SPECIAL_STOP_TOKEN:
        break

      X.append(create_one_hot(token_to_id[tokens_with_stops[i]]))
      y.append(create_one_hot(token_to_id[tokens_with_stops[j]]))

  return np.asarray(X), np.asarray(y)

X, y = create_dataset()

In [60]:
X.shape, y.shape

((3022, 383), (3022, 383))

## Create and Initialize Network

In [70]:
def init_network(vocab_size: int, n_embeddings: int) -> dict:
  model = {
    'w1': np.random.randn(vocab_size + 1, n_embeddings),
    'w2': np.random.randn(n_embeddings, vocab_size + 1),
  }
  return model

### Forward Propagation and Helper Functions

In [102]:
def softmax(outputs):
  res = []
  for x in outputs:
    res.append(np.exp(x - np.max(x)) / np.exp(x - np.max(x)).sum())
  return res

def cross_entropy_loss(z, y):
  return - np.sum(y * np.log(z))

def forward_prop(model, X):
  cache = {}
  cache['a1'] = X @ model['w1']
  cache['a2'] = cache['a1'] @ model['w2']
  cache['z'] = softmax(cache['a2'])
  return cache

def back_prop(model, X, y, alpha):
  cache = forward_prop(model, X)
  da2 = cache['z'] - y
  dw2 = cache['a1'].T @ da2
  da1 = da2 @ model['w2'].T
  dw1 = X.T @ da1
  model['w1'] -= alpha * dw1
  model['w2'] -= alpha * dw2
  return cross_entropy_loss(cache['z'], y)

In [104]:
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

sns.set()

n_iter = 50
learning_rate = 0.05

model = init_network(len(token_to_id), 10)
model['w1'].shape, model['w2'].shape

for i in tqdm(range(n_iter)):
  back_prop(model, X, y, learning_rate)

  return - np.sum(y * np.log(z))
  return - np.sum(y * np.log(z))
100%|██████████| 50/50 [00:08<00:00,  5.75it/s]


In [113]:
def get_embedding(model, word):
  try:
    idx = token_to_id[word]
  except KeyError:
    print(f'{word} not in corpus')
  one_hot = create_one_hot(idx)
  return forward_prop(model, one_hot)['a1']

In [117]:
get_embedding(model, 'film')

array([-1.31762146e+55, -2.39883080e+55, -1.02480703e+54, -3.27794815e+55,
        1.68949533e+55, -3.94678492e+55,  6.88931867e+55, -2.07986571e+55,
        9.71571054e+55, -1.45384088e+55])