In [1]:
# -*- coding: utf-8 -*-

In [2]:
import json
import functions as mf

In [3]:
n = 100  # max num of texts per author

# Read the corpus through the project helper, capped at `n` texts per author.
texts = mf.read_texts(n)

# Save the raw texts as pretty-printed JSON.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale; json.dump's default ensure_ascii=True already
# writes ASCII-only output, so the bytes on disk are unchanged.
with open('raw_texts.json', 'w', encoding='utf-8') as f:
    json.dump(texts, f, indent=4)

In [7]:
# 0. Evaluate the size of the corpus before preprocessing
# The value is reported as "Before preprocessing" in the statistics cell,
# where it is formatted with a thousands separator.
# NOTE(review): presumably a token/word count returned as a number — confirm
# in the `functions` module.
corpus_size = mf.get_corpus_size(texts)

In [8]:
# 1. Named entity recognition + tokenization
# Takes the raw `texts`; the result feeds the punctuation-removal step and is
# counted as "After tokenization and NER" in the statistics cell.
# NOTE(review): the statistics cell sums len(doc) over `texts_ne`, so this
# presumably returns one token sequence per document — confirm in the
# `functions` module.

texts_ne = mf.get_named_ents(texts)

In [9]:
# 2. Remove punctuation
# Operates on the NER/tokenization output and feeds the lemmatization step.
# The statistics cell does not report the corpus size after this step.

texts_clean = mf.clean_texts(texts_ne)

In [10]:
# 3. Lemmatize
# Operates on the punctuation-free texts; counted as "After lemmatization" in
# the statistics cell.
# The next step iterates `lemmas` document by document and each document item
# by item, taking len() of every item (presumably lemma strings — confirm in
# the `functions` module).

lemmas = mf.get_lemmas(texts_clean)

In [11]:
# 4. Drop short lemmas: keep only those with at least three characters

lemmas_no_short = [
    [lemma for lemma in doc if len(lemma) >= 3]
    for doc in lemmas
]

In [12]:
# 6. Remove stopwords
# Operates on the lemma lists that already had short lemmas filtered out;
# counted as "After stopwords removal" in the statistics cell.
# NOTE(review): the step numbering jumps from 4 to 6 — a step 5 may have been
# removed from the notebook; confirm and renumber if so.

lemmas_no_sw = mf.remove_stopwords(lemmas_no_short)

In [13]:
# 7. Retrieve collocations/n-grams
# Operates on the stopword-free lemma lists; the result feeds the POS filter
# and is counted as "After n-gram extraction" in the statistics cell.

texts_ngrams = mf.get_ngrams(lemmas_no_sw)

In [15]:
# 8. Keep nouns and adjectives only
# Operates on the n-gram output; counted as "After bad POS removal (nouns and
# adj)" in the statistics cell.

nouns_adj = mf.get_nouns_adj(texts_ngrams)

# Save the final tokens as pretty-printed JSON.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale; json.dump's default ensure_ascii=True already
# writes ASCII-only output, so the bytes on disk are unchanged.
with open('tokens.json', 'w', encoding='utf-8') as f:
    json.dump(nouns_adj, f, indent=4)

In [16]:
# Statistics: document count, then the corpus size after each reported stage.
# Each stage size is the total number of items summed over all documents.

print(f'Number of documents: {len(texts)}')
print(
    f'\nCorpus size\n\tBefore preprocessing: {corpus_size:,}'
    f'\n\tAfter tokenization and NER: {sum(map(len, texts_ne)):,}'
    f'\n\tAfter lemmatization: {sum(map(len, lemmas)):,}'
    f'\n\tAfter stopwords removal: {sum(map(len, lemmas_no_sw)):,}'
    f'\n\tAfter n-gram extraction: {sum(map(len, texts_ngrams)):,}'
    f'\n\tAfter bad POS removal (nouns and adj): {sum(map(len, nouns_adj)):,}'
)

Number of documents: 2643

Corpus size
	Before preprocessing: 2,969,522
	After tokenization and NER: 3,670,204
	After lemmatization: 2,944,566
	After stopwords removal: 1,863,368
	After n-gram extraction: 1,797,966
	After bad POS removal (nouns and adj): 1,296,640
