In [1]:
import pickle
import re
import random
import math
import pandas as pd
from pathlib import Path

import nltk.data

In [2]:
# Clean sentences
def preprocess_text(text):
  """Normalise a raw sentence into space-separated lowercase tokens.

  Steps:
    1. Lowercase the text.
    2. Surround each of ``. , ! ?`` with spaces so punctuation becomes its
       own token.
    3. Collapse every run of characters other than ASCII letters and
       ``. , ! ?`` (digits, quotes, newlines, repeated spaces, ...) into a
       single space.
    4. Strip leading/trailing spaces so that splitting the result on ``' '``
       never yields empty-string tokens.

  Args:
    text: The raw sentence (``str``).

  Returns:
    The cleaned sentence; an empty string if nothing survives cleaning.
  """
  text = text.lower()
  text = re.sub(r"([.,!?])", r" \1 ", text)
  text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
  # Step 2 leaves a trailing space after sentence-final punctuation (and a
  # leading one when the text starts with whitespace/punctuation); without
  # this strip, `cleaned.split(' ')` produces '' tokens.
  return text.strip()

def set_splits(df, val_pct, test_pct=None, seed=None):
  """Randomly label each row of ``df`` as 'train', 'val' or 'test'.

  A ``split`` column is (re)created on ``df`` in place. Row positions are
  shuffled, the first ``ceil(len(df) * val_pct)`` of them become 'val', the
  next ``ceil(len(df) * test_pct)`` become 'test' (when ``test_pct`` is
  truthy), and the remainder stay 'train'.

  Args:
    df: DataFrame to label. It is modified in place.
    val_pct: Fraction of rows to mark as validation.
    test_pct: Optional fraction of rows to mark as test.
    seed: Optional seed for a private RNG. When ``None`` the module-level
      ``random`` generator is used (so ``random.seed(...)`` still applies).

  Returns:
    The same ``df`` object, with the ``split`` column set.
  """
  df['split'] = 'train'
  df_len = len(df)
  split_col = df.columns.get_loc('split')

  rng = random if seed is None else random.Random(seed)
  idxs = list(range(df_len))
  rng.shuffle(idxs)

  # `idxs` holds row *positions*, so assign with iloc. Label-based .loc only
  # happens to work for a default RangeIndex and fails (or hits the wrong
  # rows) for any other index.
  val_idx = math.ceil(df_len * val_pct)
  df.iloc[idxs[:val_idx], split_col] = 'val'

  if test_pct:
    test_idx = val_idx + math.ceil(df_len * test_pct)
    df.iloc[idxs[val_idx:test_idx], split_col] = 'test'

  return df

In [3]:
# Base directories, relative to the notebook's working directory.
path = Path('../data/books')
pretrained = Path('../pretrained')

# Inputs: the raw book text and the pickled sentence tokenizer.
raw_dataset_txt = path.joinpath('frankenstein.txt')
tokenizer_path = pretrained.joinpath('punkt/english.pickle')

# Output: the CBOW dataset with its train/val/test split column.
processed_file = path.joinpath('frankenstein_with_splits.csv')

In [4]:
# Number of context tokens taken on each side of the target token.
window_size = 5
# Padding token placed before/after each sentence so that edge tokens still
# get a full-width window; it is filtered out of the stored context.
MASK_TOKEN = '<MASK>'

# Fractions of the rows assigned to the validation and test splits; the
# remaining rows are left as training data.
val_pct = 0.15
test_pct = 0.15

# The split assignment shuffles with the stdlib `random` module. Seed it here
# so that Restart Kernel -> Run All reproduces the same split every time.
SEED = 1337
random.seed(SEED)

In [5]:
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load a tokenizer pickle that comes from a trusted source.
# The `with` block closes the file handle as soon as the tokenizer is loaded
# (the previous inline `.open('rb')` left it open until garbage collection).
with tokenizer_path.open('rb') as tokenizer_file:
  tokenizer = pickle.load(tokenizer_file)

# Read the whole book and split it into sentences.
with open(raw_dataset_txt, 'r') as fp:
  book = fp.read()
sentences = tokenizer.tokenize(book)
print(f"{len(sentences)} sentences")
print(f"Sample: {sentences[100]}")

3427 sentences
Sample: No incidents have hitherto befallen us that would make a figure in a
letter.


In [6]:
# Apply the text normalisation to every sentence, keeping the original order.
cleaned_sentences = list(map(preprocess_text, sentences))

In [7]:
# window_size and MASK_TOKEN are defined once in the configuration cell; they
# are deliberately not redefined here, so changing them there takes effect.

def flatten(outer):
  """Concatenate an iterable of iterables into a single flat list."""
  return [item for inner in outer for item in inner]

# Pad each sentence with window_size mask tokens on both sides so that every
# real token can sit at the centre of a full (2 * window_size + 1) window.
padding = [MASK_TOKEN] * window_size

# str.split() with no argument drops the empty strings that split(' ') would
# produce for leading/trailing spaces; those '' tokens would otherwise end up
# as bogus targets and context entries.
tokenized_sentences = [sentence.split() for sentence in cleaned_sentences]

windows = flatten(
  nltk.ngrams(padding + tokens + padding, window_size * 2 + 1)
  for tokens in tokenized_sentences
)

# One row per window: [space-joined context tokens, centre (target) token].
data = []
for window in windows:
  target_token = window[window_size]
  # Context = every token except the centre one and the mask padding.
  context = [token for i, token in enumerate(window)
             if i != window_size and token != MASK_TOKEN]
  data.append([' '.join(context), target_token])

In [8]:
# Each entry of `data` is a [context, target] pair; name the columns to match.
cbow_data = pd.DataFrame(
  data,
  columns=['context', 'target'],
)
cbow_data.head()

Unnamed: 0,context,target
0,", or the modern prometheus",frankenstein
1,frankenstein or the modern prometheus by,","
2,"frankenstein , the modern prometheus by mary",or
3,"frankenstein , or modern prometheus by mary wo...",the
4,"frankenstein , or the prometheus by mary wolls...",modern


In [9]:
# Add the 'split' column (train / val / test) using the configured fractions.
cbow_data = set_splits(df=cbow_data, val_pct=val_pct, test_pct=test_pct)
cbow_data.head()

Unnamed: 0,context,target,split
0,", or the modern prometheus",frankenstein,train
1,frankenstein or the modern prometheus by,",",train
2,"frankenstein , the modern prometheus by mary",or,val
3,"frankenstein , or modern prometheus by mary wo...",the,train
4,"frankenstein , or the prometheus by mary wolls...",modern,train


In [10]:
# Persist the labelled dataset; the positional row index is not written.
cbow_data.to_csv(path_or_buf=processed_file, index=False)