## Preprocessing options: Stanza and Pymorphy2
*Anastasia Nikiforova. HSE, Computational Linguistics*

*Thesis: Metaphor Identification using Topic Modeling*

This notebook is for reference only; for the actual preprocessing pipeline, refer to Topic_modeling_on_wiki.ipynb.

In [None]:
from nltk.corpus import stopwords
from string import punctuation
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from string import punctuation
from itertools import chain
from collections import Counter

import nltk
nltk.download("stopwords")

# Stopword list: NLTK's Russian stopwords extended with a few extra
# pronouns and function words.
russian_stopwords = stopwords.words("russian")
russian_stopwords.extend(
    ['ваш', 'наш', 'твой', 'который', 'это', 'то', 'что', 'кто', 'какой']
)

# Morphological analyzer used by the pymorphy2 section of the notebook.
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

## Option 1: Tokenization, lemmatization and POS-tagging with Stanza
The second option is pymorphy2, which is significantly faster. Skip to the next section for pymorphy2.

In [None]:
import stanza
stanza.download('ru')

from nltk.corpus import stopwords
from string import punctuation
from tqdm import tqdm
import pandas as pd    # in case if wasn't imported before

import nltk
nltk.download("stopwords")

In [None]:
# Stopword list: NLTK's Russian stopwords extended with a few extra
# pronouns and function words.
russian_stopwords = stopwords.words("russian")
russian_stopwords.extend(
    ['ваш', 'наш', 'твой', 'который', 'это', 'то', 'что', 'кто', 'какой']
)

In [None]:
# Build the Russian Stanza pipeline with the tokenizer, POS tagger and
# lemmatizer; GPU use is requested via use_gpu=True.
nlp = stanza.Pipeline(lang='ru', processors='tokenize,pos,lemma', use_gpu=True)

In [None]:
def clear_stdin():
    '''
    Deregister every tqdm instance currently listed in ``tqdm._instances``.

    Does nothing when tqdm has no ``_instances`` attribute.
    NOTE(review): presumably used to get rid of progress bars left over from
    interrupted runs - confirm.
    '''
    if not hasattr(tqdm, '_instances'):
        return
    # Iterate over a copy: _decr_instances is called on members of the very
    # collection being walked.
    for bar in list(tqdm._instances):
        tqdm._decr_instances(bar)

def preprocess_corpus(corpus):
    '''
    Use Stanza to tokenize, POS-tag and lemmatize the corpus.

    Every article is run through the global ``nlp`` pipeline and split into
    sentences. Within a sentence a word is dropped when its lowercased lemma
    is in ``russian_stopwords`` or when the stripped lemma is empty or a
    substring of ``string.punctuation``.

    Args:
        corpus:   iterable of strings: [article 1, article 2.....]

    Output:
        Tuple (lemmas, pos, lemmas_pos) of three parallel lists that hold one
        space-joined string per *sentence* (not per article):
        lowercased lemmas, POS tags, and "lemma_POS" tokens.
        Choose what's relevant for you.
        The "lemma_POS" strings are also written, one sentence per line, to
        'lemma_pos_wiki_articles.txt' (UTF-8, overwritten on every call).
    '''
    clear_stdin()

    # A set gives O(1) membership tests; the filter runs once per word.
    stopword_set = set(russian_stopwords)

    lemmas = []
    pos = []
    lemmas_pos = []

    # Explicit UTF-8: the platform default encoding may be unable to
    # represent Cyrillic text.
    with open('lemma_pos_wiki_articles.txt', 'w', encoding='utf-8') as f:
        for article in tqdm(corpus):
            for sent in nlp(article).sentences:
                # Apply the stopword/punctuation filter once and reuse the
                # surviving (lemma, pos) pairs for all three outputs.
                kept = []
                for word in sent.words:
                    # A missing lemma is treated as empty and filtered out.
                    lemma = (word.lemma or '').lower()
                    if lemma in stopword_set or lemma.strip() in punctuation:
                        continue
                    kept.append((lemma, word.pos))

                l_p = ' '.join([lemma + "_" + tag for lemma, tag in kept])

                lemmas.append(' '.join([lemma for lemma, _ in kept]))
                pos.append(' '.join([tag for _, tag in kept]))
                lemmas_pos.append(l_p)

                print(l_p, file=f)

    return lemmas, pos, lemmas_pos

In [None]:
# Lowercase every whitespace-separated token of each article and drop the
# tokens that are in the stopword list.
wiki_cleaned = [
    ' '.join(
        token
        for token in map(str.lower, seq.split())
        if token not in russian_stopwords
    )
    for seq in wiki_vw
]

In [None]:
# Run the Stanza preprocessing on the stopword-filtered corpus; the three
# results are parallel lists (lemmas, POS tags, lemma_POS tokens).
lemmas, pos, lemmas_pos = preprocess_corpus(wiki_cleaned)

In [None]:
# Collect the three parallel result lists into one table.
df = pd.DataFrame({
    'lemmas': lemmas,
    'pos': pos,
    'lemmas_pos': lemmas_pos,
})

## Option 2: Pymorphy2 POS Tagger

Для сравнения: Stanza обрабатывает 10 статей за 2 минуты, тогда как у Pymorphy2 на это уходит 5 секунд.
На 50 тыс. статей Stanza необходимо около 120 часов на GPU, тогда как Pymorphy2 обрабатывает этот же корпус за 1,5 часа.

In [None]:
def clear_stdin():
    '''
    Deregister every tqdm instance currently listed in ``tqdm._instances``.

    Does nothing when tqdm has no ``_instances`` attribute.
    NOTE(review): presumably used to get rid of progress bars left over from
    interrupted runs - confirm.
    '''
    if not hasattr(tqdm, '_instances'):
        return
    # Iterate over a copy: _decr_instances is called on members of the very
    # collection being walked.
    for bar in list(tqdm._instances):
        tqdm._decr_instances(bar)

def pos_lemma_word(word):
    '''
    Lemmatize and POS-tag a single token.

    Args:
        word:   token string

    Output:
        Tuple (lemma, pos, lemma_pos).
        * Tokens containing '<num' or made of decimal digits only are not
          analyzed: the result is (word, 'NUMR', '_NUMR'), i.e. every such
          token shares the same lemma_pos value.
        * Otherwise the first pymorphy2 parse is used and the result is
          (normal_form, POS, 'normal_form_POS').
        * (None, None, None) when the parse has no POS or normal form, the
          POS is a single character, or the normal form is a stopword.
    '''
    if '<num' in word or word.isdecimal():
        return word, 'NUMR', '_NUMR'

    # Only the first parse returned by the analyzer is considered.
    parsed = morph.parse(word)[0]
    tag_pos = parsed.tag.POS
    normal = parsed.normal_form

    usable = (
        tag_pos
        and normal
        and len(tag_pos) > 1
        and normal not in russian_stopwords
    )
    if not usable:
        return None, None, None

    return normal, tag_pos, str(normal) + '_' + str(tag_pos)
            
def preprocess_pymorphy(corpus):
    '''
    Lemmatize and POS-tag the corpus with pymorphy2 via pos_lemma_word.

    Each article is split on whitespace; tokens for which pos_lemma_word
    returns a lemma of None are skipped.

    Args:
        corpus:   iterable of strings: [article 1, article 2.....]

    Output:
        Tuple (lemmas, pos, lemmas_pos) of three parallel lists with one
        space-joined string per article.
    '''
    clear_stdin()

    all_lemmas, all_pos, all_lemma_pos = [], [], []

    for article in tqdm(corpus):
        tagged = [pos_lemma_word(token) for token in article.split()]
        kept = [triple for triple in tagged if triple[0] is not None]

        all_lemmas.append(' '.join(triple[0] for triple in kept))
        all_pos.append(' '.join(triple[1] for triple in kept))
        all_lemma_pos.append(' '.join(triple[2] for triple in kept))

    return all_lemmas, all_pos, all_lemma_pos

In [None]:
# Lowercase every whitespace-separated token of each article and drop the
# tokens that are in the stopword list.
wiki_cleaned = [
    ' '.join(
        token
        for token in map(str.lower, seq.split())
        if token not in russian_stopwords
    )
    for seq in wiki_vw
]

In [None]:
# Run the pymorphy2 pipeline first so that the table is built from its output
# rather than from whatever `lemmas`, `pos` and `lemmas_pos` were left in the
# session by an earlier cell (or from nothing at all if the Stanza section
# was skipped).
lemmas, pos, lemmas_pos = preprocess_pymorphy(wiki_cleaned)

# Collect the three parallel result lists into one table.
df = pd.DataFrame()
df['lemmas'] = lemmas
df['pos'] = pos
df['lemmas_pos'] = lemmas_pos

In [None]:
# Save the lemma/POS table as CSV without the index column.
df.to_csv('wiki_articles_pymorphy.csv', index=False)

### Another option - count all word occurrences in the corpus and, thus, tag less and tag faster

BigARTM receives texts in vowpal wabbit format.

It means that:
* Each text is presented as a bag-of-words.
* A document can consist of preprocessed words with repetitions or of preprocessed words with their numbers of occurrences, e.g. "|text this:1 word:3 is:4 repeated:2".
* Word order is not important

In [None]:
# Keep only the non-empty articles and show how many remain out of the total.
wiki_c = [text for text in wiki_cleaned if text]
len(wiki_c), len(wiki_cleaned)

In [None]:
# One {token: number of occurrences} dictionary per article, built with
# Counter over the whitespace-separated tokens.
# NOTE(review): this iterates over wiki_cleaned, not the filtered wiki_c, so
# empty articles produce empty dictionaries - confirm this is intended.
wiki_counter = [dict(Counter(tokens)) for tokens in map(str.split, wiki_cleaned)]

In [None]:
# Spot check: show the lemma_POS string produced for a single word.
pos_lemma_word('литва')[2]

In [None]:
# Convert each article's {word form: count} dictionary into a
# {lemma_POS: count} dictionary.
tagged_wiki_counter = []
for count_dict in wiki_counter:
    proc_count_dict = {}
    for word, counts in count_dict.items():
        # Tag every distinct word form exactly once.
        tagged = pos_lemma_word(word)[2]
        if tagged is None:
            continue
        # Several word forms can share one lemma_POS key (and every numeric
        # token maps to '_NUMR'), so the counts are added up; a plain dict
        # comprehension would keep only the last count for a repeated key.
        proc_count_dict[tagged] = proc_count_dict.get(tagged, 0) + counts
    tagged_wiki_counter.append(proc_count_dict)

In [None]:
# lemma_POS tokens that are left out when the Vowpal Wabbit file is written.
exclusions = ['также_CONJ', 'б_PRCL', 'около_PREP', 'ещё_ADVB', 'э_INTJ', 'её_ADJF',
              'мм_INTJ', 'однако_CONJ', 'например_CONJ', 'из-за_PREP', 'среди_PREP']

In [None]:
# Write one Vowpal Wabbit line per article: "|text " followed by a
# "token:count " entry for every token that is not in the exclusion list.
with open('vw_wiki_counts.txt', 'wt', encoding='utf-8') as f:
    for article in tagged_wiki_counter:
        entries = [
            str(token) + ':' + str(count) + ' '
            for token, count in article.items()
            if token not in exclusions
        ]
        print("|text " + ''.join(entries), file=f)

In [None]:
# Read the written file back as a list of lines; the context manager closes
# the file handle, which a bare open(...).readlines() leaves open.
with open('vw_wiki_counts.txt', encoding='utf-8') as f:
    testing = f.readlines()

In [None]:
# Number of lines read back from the file.
len(testing)