### Import modules

In [1]:
import pickle
from collections import Counter
from random import Random, shuffle

from nltk.tokenize import RegexpTokenizer, sent_tokenize

from general_functions import SENT_BEG, SENT_END

### Preprocessing

* Read text data
* Preprocess and clean the text
* Save clean data

In [2]:
def read_text(filepath) -> str:
    """Load the whole of a UTF-8 text file into a single string
    Parameters:
        filepath (str): location of the file to read
    Returns:
        str: everything the file contains"""

    # the 'utf-8-sig' codec drops a leading byte order mark if the file has one
    with open(filepath, mode='r', encoding='utf-8-sig') as source:
        return source.read()

In [3]:
def preprocess(text) -> (list, Counter):
    """Split raw text into tokenised sentences and tally every token
    Parameters:
        text (str): the raw text to process
    Returns:
        list: one list of tokens per sentence, each wrapped in SENT_BEG and SENT_END
        Counter: occurrences of each token, sentence markers included"""

    # newlines become spaces and underscores are dropped before sentence splitting
    cleaned = text.replace('\n', ' ').replace('_', '')

    sentences = sent_tokenize(cleaned)
    print('Number of sentences:', len(sentences))

    # keeping only runs of word characters discards punctuation while tokenising
    word_tokenizer = RegexpTokenizer(r'\w+')

    token_counter = Counter()
    tokenised_sentences = []

    for sentence in sentences:
        words = word_tokenizer.tokenize(sentence.lower())

        # surround the sentence with its start-of-sent and end-of-sent markers
        tokens = [SENT_BEG] + words + [SENT_END]

        token_counter.update(tokens)
        tokenised_sentences.append(tokens)

    # vocab stats; the vocabulary size still includes the two sentence markers
    marker_total = token_counter[SENT_BEG] + token_counter[SENT_END]
    word_total = sum(token_counter.values()) - marker_total

    print('Vocab size |V|:', len(token_counter))
    print('Total words excluding {} and {}:'.format(SENT_BEG, SENT_END), word_total)

    return tokenised_sentences, token_counter

In [4]:
def write_to_pickle(filepath, obj) -> None:
    """Write an object to a pickle file
    Parameters:
        filepath (str): path to the pickle file; an existing file is overwritten
        obj (Any): object to write
    Returns:
        None"""

    # the with-statement closes the file on exit, so no explicit close() is needed
    with open(filepath, 'wb') as pckl:
        pickle.dump(obj, pckl, pickle.HIGHEST_PROTOCOL)

#### Twenty Thousand Leagues Under the Sea

In [5]:
# Read the novel and turn it straight into tokenised sentences plus token counts
ttl_sents, ttl_counts = preprocess(read_text('./Corpus/TwentyThousandLeagues.txt'))

# Persist the tokenised sentences and the token counts
write_to_pickle(filepath='./Pickles/ttl_sents.pickle', obj=ttl_sents)
write_to_pickle(filepath='./Pickles/ttl_counts.pickle', obj=ttl_counts)

Number of sentences: 6587
Vocab size |V|: 8669
Total words excluding <s> and </s>: 104174


#### Around the World in Eighty Days

In [6]:
# Read the novel and turn it straight into tokenised sentences plus token counts
atw_sents, atw_counts = preprocess(read_text('./Corpus/AroundTheWorld.txt'))

# Persist the tokenised sentences and the token counts
write_to_pickle(filepath='./Pickles/atw_sents.pickle', obj=atw_sents)
write_to_pickle(filepath='./Pickles/atw_counts.pickle', obj=atw_counts)

Number of sentences: 2877
Vocab size |V|: 6829
Total words excluding <s> and </s>: 64343


#### Combined corpus

In [7]:
# Fixed seed so the shuffled sentence order (and therefore the pickled corpus)
# is the same on every Restart & Run All
SHUFFLE_SEED = 42

# Concatenating builds a new list, so shuffling it leaves ttl_sents and atw_sents untouched
cor_sents = ttl_sents + atw_sents

# A dedicated generator keeps the seed from affecting the global random state
Random(SHUFFLE_SEED).shuffle(cor_sents)

# Adding two Counters sums the counts of tokens that occur in both books
cor_counts = ttl_counts + atw_counts

print('Number of sentences:', len(cor_sents))
print('Vocab size |V|:', len(cor_counts))
print('Total words excluding {} and {}:'.format(SENT_BEG, SENT_END), sum(cor_counts.values()) - cor_counts[SENT_BEG] - cor_counts[SENT_END])

Number of sentences: 9464
Vocab size |V|: 11553
Total words excluding <s> and </s>: 168517


In [8]:
# Persist the combined corpus: shuffled sentences and merged token counts
write_to_pickle(filepath='./Pickles/cor_sents.pickle', obj=cor_sents)
write_to_pickle(filepath='./Pickles/cor_counts.pickle', obj=cor_counts)