In [1]:
%pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import json
import re

np.random.seed(1)

## Read in text data for training

In [3]:
# Read one JSON record per line
def read_jsonl(f):
  f = open(f)
  res = []
  for line in f:
    res.append(json.loads(line))
  f.close()
  return res

data = read_jsonl('dev.json') + read_jsonl('test.json') + read_jsonl('train.json')
data = [line['text'] for line in data]
doc_count = len(data)
doc_count

2000

## Clean data

In [4]:
def tokenize(text):
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text)

tokens_by_doc = [tokenize(doc) for doc in data]

In [5]:
# want to treat the tokens as one long list, rather than a list of tokens per document
# to do this, will insert a special character in between each document to stop the training there
# TLDR insert a stop token between each document to avoid using next/prev document as context
SPECIAL_STOP_TOKEN = '*!!!*'

def combine_tokens():
  ret = []
  for token_list in tokens_by_doc:
    ret.extend(token_list)
    ret.append(SPECIAL_STOP_TOKEN)

  return ret

tokens_with_stops = combine_tokens()

# should equal number of documents (2000)
len([token for token in tokens_with_stops if token == SPECIAL_STOP_TOKEN])

2000

## Create vocabulary maps

In [6]:
token_to_id = {}
id_to_token = {}

def make_vocabulary(tokens):
  for token in tokens:
    if token not in token_to_id:
      id_no = len(token_to_id) + 1
      token_to_id[token] = id_no
      id_to_token[id_no] = token

for doc in tokens_by_doc:
  make_vocabulary(doc)

token_to_id['the'], id_to_token[10]

(10, 'the')

## Create dataset

In [18]:
def create_one_hot(token_id):
  vector = np.zeros(len(token_to_id))
  vector[token_id] = 1
  return vector

def yield_range(*ranges):
  for iterable in ranges:
    yield from iterable

def create_dataset(window_size=2):
  X = []
  y = []
  
  # note that this includes stop tokens, which will need to be skipped
  token_count = len(tokens_with_stops)

  for i in range(token_count):
    if tokens_with_stops[i] == SPECIAL_STOP_TOKEN:
      continue

    indexes = yield_range(range(max(0, i - window_size), i), range(i, min(token_count, i + window_size + 1)))
    for j in indexes:
      if j == i:
        continue
      
      if tokens_with_stops[j] == SPECIAL_STOP_TOKEN:
        break

      X.append(create_one_hot(token_to_id[tokens_with_stops[i]]))
      y.append(create_one_hot(token_to_id[tokens_with_stops[j]]))

  return X, y

X, y = create_dataset()

: 

: 

StopIteration: 

In [14]:
np.save('dataset_X.npy', all_X)
np.save('dataset_y.npy', all_y)

TypeError: cannot pickle 'generator' object