In [1]:
# -*- coding: utf-8 -*-

In [2]:
import json
import functions as myf

In [4]:
# Corpus files to load; kept as a list so that more sources can be added.
sources = ['elementy_texts.json']

# Load the listed sources through the project helper.
# NOTE(review): the result is indexed by a 'Full text' column further down, so it
# is presumably a pandas DataFrame — confirm in functions.read_texts.
df = myf.read_texts(sources)

In [5]:
# Pull the 'Full text' column out as a plain list, one entry per document.
texts = df['Full text'].tolist()

In [6]:
# Save unpreprocessed texts
# The encoding is pinned so the result does not depend on the platform default.
# json.dump escapes non-ASCII characters by default, so the bytes written are
# the same as before; the file name is a plain literal (no f-string needed).
with open('raw_texts.json', 'w', encoding='utf-8') as f:
    json.dump(texts, f, indent=4)

In [7]:
# 1. Tokenization with named entity recognition
# Produces `tokens`, the input of the punctuation-removal step.
# NOTE(review): the "(Alt)" tokenization cell assigns the same name, so only one
# of the two cells should take effect in a given run.

tokens = myf.get_named_ents(texts)

In [None]:
# 1. (Alt) Tokenization without named entity recognition
# Opt-in alternative to the NER tokenization cell. Both cells assign `tokens`,
# so an unguarded call here would silently replace the NER result (and repeat
# the tokenization work) on "Restart Kernel -> Run All".
# Set the flag to True to use the plain tokenizer instead.
RUN_ALT_TOKENIZATION = False

if RUN_ALT_TOKENIZATION:
    tokens = myf.get_tokens(texts)

In [8]:
# 2. Remove punctuation
# Delegates to myf.clean_texts; the result is what the lemmatization step consumes.

tokens_clean = myf.clean_texts(tokens)

In [9]:
# 3. Lemmatize
# Runs on the punctuation-free tokens from step 2.
# NOTE(review): later cells iterate the result as a list of documents, each a
# sequence of lemma strings — confirm in functions.get_lemmas.

lemmas = myf.get_lemmas(tokens_clean)

In [10]:
# 4. Drop short lemmas: only lemmas of three or more characters are kept;
#    the per-document structure is preserved.

lemmas_no_short = [
    [lemma for lemma in doc if len(lemma) >= 3]
    for doc in lemmas
]

In [11]:
# 5. Remove stopwords
# Applied to the lemmas that survived the length filter (step 4); the stopword
# list itself lives inside myf.remove_stopwords.

lemmas_clean = myf.remove_stopwords(lemmas_no_short)

In [12]:
# 6. Keep nouns and adjectives only
# Part-of-speech filtering runs on the stopword-free lemmas from step 5.

nouns_adj = myf.get_nouns_adj(lemmas_clean)

In [13]:
# 7. Retrieve collocations/n-grams
# min_count and threshold are tuning values forwarded to myf.get_ngrams; their
# exact meaning is defined by that helper.

texts_ngrams = myf.get_ngrams(nouns_adj, min_count=200, threshold=100)

# Save
# Build the JSON-serialisable payload *before* opening the file: mode 'w'
# truncates immediately, so converting inside the `with` block would leave an
# empty data.json behind if the conversion raised.
data = [list(text) for text in texts_ngrams]
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)

In [15]:
# Statistics
# Report the number of documents, then the total token count left after each
# preprocessing stage (one tab-indented line per stage).

print(f'Number of documents: {len(texts)}\n' + 'Corpus size:')
print('\n'.join(
    f'\t{label}: {sum(len(doc) for doc in corpus):,}'
    for label, corpus in (
        ('After tokenization', tokens),
        ('After lemmatization', lemmas),
        ('After stopwords removal', lemmas_clean),
        ('After bad POS removal', nouns_adj),
        ('After n-gram extraction', texts_ngrams),
    )
))

Number of documents: 2289
Corpus size:
	After tokenization: 3,888,311
	After lemmatization: 3,000,161
	After stopwords removal: 1,949,847
	After bad POS removal: 1,419,004
	After n-gram extraction: 1,415,022
