In [1]:
import os

from argparse import Namespace
import collections
import nltk.data
import numpy as np
import pandas as pd
import re
import string
import tqdm
from tqdm import tqdm_notebook

In [2]:
args = Namespace(
    raw_data='data/books/frankenstein.txt',
    window_size=5,
    train_prop=0.7,
    val_prop=0.15,
    test_prop=0.15,
    processed_data='data/books/frankeinstein_processed_data.csv',
    seed=1337
)

In [3]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
with open(args.raw_data) as fp:
    book = fp.read()
sentences = tokenizer.tokenize(book)

In [4]:
print(len(sentences), "sentences")
print("Sample: ", sentences[100])

3427 sentences
Sample:  No incidents have hitherto befallen us that would make a figure in a
letter.


In [5]:
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(' '))
    text = re.sub(r'([.,!?])', r' \1 ', text)
    text = re.sub(r'[^a-zA-Z.,!?]+', r' ', text)
    return text

In [6]:
cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]

In [7]:
cleaned_sentences[100]

'no incidents have hitherto befallen us that would make a figure in a letter . '

In [8]:
sentences[1]

'I arrived here yesterday, and my first task is to assure\nmy dear sister of my welfare and increasing confidence in the success\nof my undertaking.'

In [9]:
cleaned_sentences[1]

'i arrived here yesterday , and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking . '

In [10]:
MASK_TOKEN = '<MASK>'

In [11]:
# create windows
flatten = lambda outer_list: [item for inner_list in outer_list for item in inner_list]
windows = flatten([list(nltk.ngrams([MASK_TOKEN] * args.window_size + sentence.split(' ') + \
                 [MASK_TOKEN] * args.window_size, args.window_size * 2 + 1)) \
                  for sentence in tqdm_notebook(cleaned_sentences)])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/3427 [00:00<?, ?it/s]

In [12]:
cleaned_sentences[0]

'frankenstein , or the modern prometheus by mary wollstonecraft godwin shelley letter st . petersburgh , dec . th , to mrs . saville , england you will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings . '

In [13]:
windows[0]

('<MASK>',
 '<MASK>',
 '<MASK>',
 '<MASK>',
 '<MASK>',
 'frankenstein',
 ',',
 'or',
 'the',
 'modern',
 'prometheus')

In [14]:
windows[5]

('frankenstein',
 ',',
 'or',
 'the',
 'modern',
 'prometheus',
 'by',
 'mary',
 'wollstonecraft',
 'godwin',
 'shelley')

In [15]:
data = []
for window in tqdm_notebook(windows):
    target_token = window[args.window_size]
    context = []
    # everything excluding center word or mask token are the context words
    for i, token in enumerate(window):
        if token == MASK_TOKEN or i == args.window_size:
            continue
        else:
            context.append(token)
    data.append([' '.join(token for token in context), target_token])
    
cbow_data = pd.DataFrame(data, columns=['context', 'target'])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/90698 [00:00<?, ?it/s]

In [16]:
n = len(cbow_data)
def get_split(row_num):
    if row_num <= n*args.train_prop:
        return 'train'
    elif (row_num > n*args.train_prop) and (row_num <= n*args.train_prop + n*args.val_prop):
        return 'val'
    else:
        return 'test'
cbow_data['split'] = cbow_data.apply(lambda row: get_split(row.name), axis=1)

In [17]:
cbow_data.head()

Unnamed: 0,context,target,split
0,", or the modern prometheus",frankenstein,train
1,frankenstein or the modern prometheus by,",",train
2,"frankenstein , the modern prometheus by mary",or,train
3,"frankenstein , or modern prometheus by mary wo...",the,train
4,"frankenstein , or the prometheus by mary wolls...",modern,train


In [18]:
cbow_data.to_csv(args.processed_data, index=False)