In [1]:
# -*- coding: utf-8 -*-

In [2]:
import re
import os
import json
import spacy
import string
import pymorphy2
from gensim.models import Phrases
from nltk.tokenize import word_tokenize

In [3]:
# Do NOT split intra-hyphen words (spaCy)
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
from spacy.pipeline import merge_entities

def custom_tokenizer(nlp):
    infix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\“\”\"\'~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search, \
                     suffix_search=suffix_re.search, \
                     infix_finditer=infix_re.finditer, \
                     token_match=None)

# Load spaCy model
nlp = spacy.load('ru_core_news_lg')

nlp.tokenizer = custom_tokenizer(nlp)
nlp.add_pipe('merge_entities') # allows for merging with _

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [4]:
morph = pymorphy2.MorphAnalyzer()

In [5]:
def add_texts_to_json(json, num, texts):
    for i in range(num):
        text = json['Author'][i]['Full text']
        no_hyphen = re.sub('[\xad…]', '', text) # soft hyphen character          
        no_space = re.sub('[\xa0]', ' ', no_hyphen) # no-break space
        texts.append(no_space)

def get_lemmas(texts):
    return [[morph.normal_forms(word)[0] if '_' not in word \
             else '_'.join(morph.normal_forms(i)[0] for i in word.split('_')) \
             for word in text] for text in texts]

def get_named_ents(texts):
    data = []
    for text in texts:
        data.append([str(word) if not word.ent_type_ else str(word).replace(' ', '_') \
                     for word in nlp(text)])
    return data

def get_ngrams(texts):
    ngram = Phrases(texts, min_count=10, threshold=100)
    return ngram[texts]

def get_nouns(texts):
    return [[word for word in text \
             if morph.parse(word)[0].tag.POS == 'NOUN'] \
            for text in texts]

def get_nouns_adj(texts):
    return [[word for word in text \
             if morph.parse(word)[0].tag.POS == 'NOUN' \
             or morph.parse(word)[0].tag.POS == 'ADJF'] \
            for text in texts]

def clean_texts(texts):
    punct = string.punctuation # !"#$%&'()*+, -./:;<=>?@[\]^_`{|}~
    punct += '—“”«»<>…...°1234567890'
    remove = punct.replace('_', '').replace('-', '') # do not remove
    pattern = r"[{}]".format(remove)
    data = [[re.sub(pattern, '', word).strip('_-') for word in text] \
            for text in texts]
    return [[word.replace('__', '_') for word in text if word] \
            for text in data]

def remove_stopwords(texts, words):
    with open('swl_optimum.txt', 'r', encoding='utf-8') as f:
        stopwords = f.read().split('\n') 
        stopwords.extend(words)
        data = [[word for word in text if re.sub('[Ёё]', 'е', word) \
                 not in stopwords or word not in stopwords] for text in texts]
        # ignore words that contain only latin characters
        return [[word for word in text if re.search(r'[^a-zA-Z]+', word)] \
                for text in data]

In [6]:
dirpath = os.getcwd() + '\\corpus\\'

js_docs, meta = [], []

for file in os.listdir(dirpath):
    if file.startswith('__'):
        with open(dirpath+file, 'r') as f:
            js = json.load(f)
            # authors' names to remove later
            meta.append(js['Author'][0]['Author'].replace(' ', '_'))
            
            if len(js['Author']) > 100: # no more than 100 texts per author
                num = 100
            else: 
                num = len(js['Author'])
            add_texts_to_json(js, num, js_docs)

In [7]:
# 0. Evaluate the size of the corpus before preprocessing
tokens = [word_tokenize(doc) for doc in js_docs]
tokens_clean = clean_texts(tokens)
data = [[word for word in text if word] for text in tokens_clean]
corpus_size = sum([len(token) for token in data])

In [8]:
# 1. Named entity recognition + tokenization

texts_named_ents = get_named_ents(js_docs)

In [9]:
# 2. Remove punctuation

texts_no_punct = clean_texts(texts_named_ents)

# Save
with open('tokens.json', 'w') as f:
    json.dump(texts_no_punct, f, indent=4)
# Load
with open(f'tokens.json', 'r') as f:
    texts_no_punct = json.load(f)

In [10]:
# 3. Lemmatize

lemmas = get_lemmas(texts_no_punct)

# Save
with open(f'lemmas.json', 'w') as f:
    json.dump(lemmas, f, indent=4)
# Load
with open(f'lemmas.json', 'r') as f:
    lemmas = json.load(f)

In [11]:
# 4. Remove lemmas with length 2 and less

lemmas_no_short = [[word for word in text if len(word) > 2] \
                   for text in lemmas]

In [12]:
# 6. Remove stopwords

lemmas_no_sw = remove_stopwords(lemmas_no_short, meta)

In [13]:
# 7. Retrieve collocations/n-grams

texts_ngrams = get_ngrams(lemmas_no_sw)

In [14]:
# 8. Keep nouns only

nouns = get_nouns(texts_ngrams)

# Save
with open(f'nouns.json', 'w') as f:
    json.dump(nouns, f, indent=4)
# Load
with open(f'nouns.json', 'r') as f:
    nouns = json.load(f)

In [15]:
# 8 (alt). Keep nouns and adjectives only

nouns_adj = get_nouns_adj(texts_ngrams)

# Save
with open(f'nouns_adj.json', 'w') as f:
    json.dump(nouns_adj, f, indent=4)
# Load
with open(f'nouns_adj.json', 'r') as f:
    nouns_adj = json.load(f)

In [16]:
# Statistics

print(f'Number of documents: {len(js_docs)}')

print(f'\nCorpus size\n\tBefore preprocessing: {corpus_size:,}' +
      f'\n\tAfter tokenization and NER: {sum([len(doc) for doc in texts_named_ents]):,}' +
      f'\n\tAfter lemmatization: {sum([len(doc) for doc in lemmas]):,}' +
      f'\n\tAfter stopwords removal: {sum([len(doc) for doc in lemmas_no_sw]):,}' +
      f'\n\tAfter n-gram extraction: {sum([len(doc) for doc in texts_ngrams]):,}' +
      f'\n\tAfter bad POS removal (nouns): {sum([len(doc) for doc in nouns]):,}' +
      f'\n\tAfter bad POS removal (nouns and adj): {sum([len(doc) for doc in nouns_adj]):,}')

Number of documents: 1260

Corpus size
	Before preprocessing: 1,585,992
	After tokenization and NER: 1,952,298
	After lemmatization: 1,570,105
	After stopwords removal: 987,524
	After n-gram extraction: 955,746
	After bad POS removal (nouns): 513,409
	After bad POS removal (nouns and adj): 690,221


In [17]:
# More statistics

def add_to_docs(file):
    docs = []
    with open(dirpath+file, 'r') as f:
        js = json.load(f)
        name = js['Author'][0]['Author']
        if len(js['Author']) > 100: # no more than 100 texts per author
            num = 100
        else:
            num = len(js['Author'])
        add_texts_to_json(js, num, docs)
        return docs, name, num
            
def count_tokens(texts):
    tokens = [word_tokenize(text) for text in texts]
    tokens_clean = clean_texts(tokens)
    data = [[word for word in text if word] for text in tokens_clean]
    count = sum([len(token) for token in data])
    return count, round(count/len(texts))


n, m, k = 0, 0, 0
for file in os.listdir(dirpath):
    if file.endswith('EL.json'):
        n += 1
        texts, name, num_art = add_to_docs(file)
        count, avg_count = count_tokens(texts)
        k += count
        m += num_art
        print(f'{name}:\n\tNumber of articles: {num_art}' + 
              f'\n\tTotal number of words: {count}' + 
              f'\n\tAverage number of words in an article: {avg_count}\n')

print(f'Total number of authors: {n}' + 
      f'\nTotal number of articles: {m}' + 
      f'\nCorpus size (before preprocessing): {k:,}')

Александр Козловский:
	Number of articles: 43
	Total number of words: 38259
	Average number of words in an article: 890

Александр Марков:
	Number of articles: 100
	Total number of words: 163181
	Average number of words in an article: 1632

Александр Сергеев:
	Number of articles: 84
	Total number of words: 47791
	Average number of words in an article: 569

Алексей Гиляров:
	Number of articles: 100
	Total number of words: 83601
	Average number of words in an article: 836

Алексей Левин:
	Number of articles: 85
	Total number of words: 178758
	Average number of words in an article: 2103

Алексей Опаев:
	Number of articles: 61
	Total number of words: 60229
	Average number of words in an article: 987

Аркадий Курамшин:
	Number of articles: 42
	Total number of words: 44658
	Average number of words in an article: 1063

Варвара Веденина:
	Number of articles: 67
	Total number of words: 64950
	Average number of words in an article: 969

Вера Башмакова:
	Number of articles: 55
	Total number of wo