In [1]:
import os

from argparse import Namespace
import collections
import nltk.data
import numpy as np
import pandas as pd
import re
import string
from tqdm.notebook import tqdm_notebook

In [2]:
args = Namespace(
    raw_dataset_txt="../Data/frankenstein.txt",
    window_size=5,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="../Data/frankenstein_with_splits.csv",
    seed=1337
)

In [3]:
# Split the raw text book into sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
with open(args.raw_dataset_txt) as fp:
    book = fp.read()
sentences = tokenizer.tokenize(book)

In [13]:
print(f"{len(sentences)} Sentences")
print(f"Sample: \n{sentences[np.random.randint(len(sentences))]}")

3427 Sentences
Sample: 
I do not wish to take any unfair advantage, and I beg
therefore that you will take some days to consider of your
determination.'


In [14]:
# general sanitization
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

In [15]:

processed_sentences = [preprocess_text(sentence) for sentence in sentences]

In [16]:
MASKED_TOKENS = "<MASK>"

In [17]:
# creating window and data
flatten = lambda outer_list: [item for inner_list in outer_list for item in inner_list]
windows = flatten([list(nltk.ngrams([MASKED_TOKENS] * args.window_size + sentence.split(' ') + [MASKED_TOKENS] * args.window_size, args.window_size * 2 + 1)) \
    for sentence in tqdm_notebook(processed_sentences)])

# Create cbow data
data = []
for window in tqdm_notebook(windows):
    target_token = window[args.window_size]
    context = []
    for i, token in enumerate(window):
        if token == MASKED_TOKENS or i == args.window_size:
            continue
        else:
            context.append(token)
    data.append([' '.join(token for token in context), target_token])
    
            
# Convert to dataframe
cbow_data = pd.DataFrame(data, columns=["context", "target"])

  0%|          | 0/3427 [00:00<?, ?it/s]

  0%|          | 0/90698 [00:00<?, ?it/s]

In [22]:
# Create split data
n = len(cbow_data)
def get_split(row_num):
    if row_num <= n*args.train_proportion:
        return 'train'
    elif (row_num > n*args.train_proportion) and (row_num <= n*args.train_proportion + n*args.val_proportion):
        return 'val'
    else:
        return 'test'
cbow_data['split']= cbow_data.apply(lambda row: get_split(row.name), axis=1)

In [23]:
cbow_data.head()

Unnamed: 0,context,target,split
0,", or the modern prometheus",frankenstein,train
1,frankenstein or the modern prometheus by,",",train
2,"frankenstein , the modern prometheus by mary",or,train
3,"frankenstein , or modern prometheus by mary wo...",the,train
4,"frankenstein , or the prometheus by mary wolls...",modern,train


In [24]:
# Write split data to file
cbow_data.to_csv(args.output_munged_csv, index=False)