In [1]:
# -*- coding: utf-8 -*-

In [2]:
import json
import functions as mf

In [3]:
# Upper bound on how many texts are read per author. The same `n` is later
# passed to mf.get_corpus_size and mf.get_stats, so all three calls share it.
n = 100 # max number of texts per author

# Read the texts through the project helper. `texts` is the input of the
# NER/tokenization step, and len(texts) is what the statistics cell reports
# as the number of documents.
texts = mf.read_texts(n)

In [5]:
# 0. Evaluate the size of the corpus before preprocessing
# `corpus_size` is printed (with a thousands separator) as the
# "Before preprocessing" figure in the statistics cell.
# NOTE(review): what exactly is counted (words vs. tokens) is decided inside
# functions.get_corpus_size -- confirm there.
corpus_size = mf.get_corpus_size(n)

In [6]:
# 1. Named entity recognition + tokenization
# Takes the raw `texts`; the result `texts_ne` feeds the punctuation-removal
# step and is iterated per document in the statistics cell (sum of len(doc)).

texts_ne = mf.get_named_ents(texts)

In [7]:
# 2. Remove punctuation
# Operates on the output of the NER/tokenization step; `texts_clean` is the
# input of the lemmatization step.

texts_clean = mf.clean_texts(texts_ne)

In [8]:
# 3. Lemmatize
# `lemmas` is consumed by the short-lemma filter, which iterates it as a
# collection of documents, each a collection of sized items (len(word)).
# It is also the "After lemmatization" figure in the statistics cell.

lemmas = mf.get_lemmas(texts_clean)

In [9]:
# 4. Drop short lemmas: keep only lemmas longer than 2 characters,
#    preserving the per-document structure and the order within each document.

lemmas_no_short = [
    [lemma for lemma in doc_lemmas if len(lemma) >= 3]
    for doc_lemmas in lemmas
]

In [10]:
# 6. Remove stopwords
# Runs on the lemmas that survived the length filter (`lemmas_no_short`).
# NOTE(review): the stopword list itself lives in functions.remove_stopwords
# and is not visible from this notebook.

lemmas_no_sw = mf.remove_stopwords(lemmas_no_short)

In [11]:
# 7. Retrieve collocations/n-grams
# `texts_ngrams` is the common input of both POS filters below (nouns only,
# and nouns + adjectives).
# NOTE(review): the recorded statistics show the total item count dropping
# after this step, presumably because tokens are merged into n-grams --
# confirm in functions.get_ngrams.

texts_ngrams = mf.get_ngrams(lemmas_no_sw)

In [12]:
# 8. Keep nouns only
# In the visible part of this notebook `nouns` is used only by the statistics
# cell ("After bad POS removal (nouns)"); it is not written to disk here.

nouns = mf.get_nouns(texts_ngrams)

In [13]:
# 8 (alt). Keep nouns and adjectives only
# Same input as the nouns-only filter (`texts_ngrams`); this variant is the one
# that gets persisted to disk below.

nouns_adj = mf.get_nouns_adj(texts_ngrams)

# Save: write the tokens to tokens.json (relative to the current working
# directory). The encoding is given explicitly so the file does not depend on
# the platform's locale default; json.dump escapes non-ASCII characters by
# default, so the bytes on disk are plain ASCII either way.
with open('tokens.json', 'w', encoding='utf-8') as f:
    json.dump(nouns_adj, f, indent=4)
# Load: read the file straight back. From here on `nouns_adj` is the JSON
# round-tripped value (for instance, any tuples come back as lists), i.e. the
# same object a run that only loads tokens.json would get.
with open('tokens.json', 'r', encoding='utf-8') as f:
    nouns_adj = json.load(f)

In [14]:
# Statistics
# Reports the number of documents, then the corpus size after each
# preprocessing stage. Every stage figure is the sum of the per-document
# lengths of that stage's output; `corpus_size` comes from
# mf.get_corpus_size. All figures use a thousands separator.

print(f'Number of documents: {len(texts)}')
print(
    f'\nCorpus size\n\tBefore preprocessing: {corpus_size:,}'
    f'\n\tAfter tokenization and NER: {sum(map(len, texts_ne)):,}'
    f'\n\tAfter lemmatization: {sum(map(len, lemmas)):,}'
    f'\n\tAfter stopwords removal: {sum(map(len, lemmas_no_sw)):,}'
    f'\n\tAfter n-gram extraction: {sum(map(len, texts_ngrams)):,}'
    f'\n\tAfter bad POS removal (nouns): {sum(map(len, nouns)):,}'
    f'\n\tAfter bad POS removal (nouns and adj): {sum(map(len, nouns_adj)):,}'
)

Number of documents: 1260

Corpus size
	Before preprocessing: 1,585,992
	After tokenization and NER: 1,952,298
	After lemmatization: 1,570,105
	After stopwords removal: 987,524
	After n-gram extraction: 955,746
	After bad POS removal (nouns): 513,409
	After bad POS removal (nouns and adj): 690,221


In [4]:
# More statistics
# Prints a per-author summary: number of articles, total number of words and
# average number of words per article (see the recorded output below).
# Uses the same per-author cap `n` as the loading cell.
# NOTE(review): 'EL' presumably selects the source/corpus to summarise --
# confirm against functions.get_stats.
# NOTE(review): this cell's execution count (4) is lower than that of the
# preceding cell (14), so it was run out of order; it needs only `mf` and `n`.

mf.get_stats('EL', n)

Александр Козловский:
	Number of articles: 43
	Total number of words: 38259
	Average number of words in an article: 890

Александр Марков:
	Number of articles: 100
	Total number of words: 163181
	Average number of words in an article: 1632

Александр Сергеев:
	Number of articles: 84
	Total number of words: 47791
	Average number of words in an article: 569

Алексей Гиляров:
	Number of articles: 100
	Total number of words: 83601
	Average number of words in an article: 836

Алексей Левин:
	Number of articles: 85
	Total number of words: 178758
	Average number of words in an article: 2103

Алексей Опаев:
	Number of articles: 61
	Total number of words: 60229
	Average number of words in an article: 987

Аркадий Курамшин:
	Number of articles: 42
	Total number of words: 44658
	Average number of words in an article: 1063

Варвара Веденина:
	Number of articles: 67
	Total number of words: 64950
	Average number of words in an article: 969

Вера Башмакова:
	Number of articles: 55
	Total number of wo