### **Imports**

In [1]:
!pip install -U fastparquet pyarrow

In [None]:
!pip install --upgrade spacy -q
!python -m spacy download ru_core_news_sm -q

In [2]:
!pip install pyLDAvis wordcloud -q

In [4]:
import re
from statistics import mean
from math import ceil
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm.notebook import tqdm

In [3]:
# spaCy for lemmatization

import spacy

In [3]:
# Gensim

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import models

In [3]:
# Plotting tools

import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from wordcloud import WordCloud
%matplotlib inline

In [None]:
# NLTK stop-words

import nltk
# Download the NLTK stop-word corpus, then keep the Russian word list
# that the stop-word removal step filters against.
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words(fileids='russian')

In [5]:
# Load the raw corpus of short stories from CSV

df = pd.read_csv(filepath_or_buffer='november_prose_ds.csv')

In [6]:
# Show the first dozen rows and the number of stories in the corpus
print(df.head(n=12))
print('\n- Total size of the corpus:', df.shape[0], 'short stories')

      id                                               text
0   1046  \nЩепинские рассказы, или Яшкины были. \n \n  ...
1   1052  \n     Речушка была и не широкой, и не глубоко...
2   1061  \nСолнечное весеннее утро. Сквозь нежно-зелены...
3   1067  \nМирабель уверенным шагом шла по мостовой. До...
4   1072  \n                 \n     Первое сентября, пер...
5   1078  \n       Предыдущая глава здесь: \n \n       Т...
6   1080  \n     Бабушка. Слово-то какое доброе, теплое....
7   1088  \nЧудеса  происходят и сегодня. \n \n  \n \n М...
8   1095  \nГ л у п о  п р о д а в а т ь  в е щ и  в  г ...
9   1099  \nЭнергия направленного действия. \nГл. из ром...
10  1108  \nАвтобус с омоновцами остановился на въезде в...
11  1109  \nВ преферанс я научился играть… нет, не раньш...

- Total size of the corpus: 2376 short stories


### **Text preprocessing**

In [7]:
# Raw strings keep the regex escapes (\., \s, \W, ...) out of Python's own
# string-escape processing; the compiled pattern text is unchanged.

# Any character that is not a Cyrillic letter, '.', '?', '!', whitespace or '-',
# or a hyphen that has non-word characters on both sides.
pattern_1 = re.compile(pattern=r'[^а-яё\.?!\s\n-]|\W+-\W+', flags=re.IGNORECASE)
# A run of '!', a run of '?' or a run of '.'.
pattern_2 = re.compile(pattern=r'!+|\?+|\.+')
# A '.' preceded by whitespace and followed by whitespace plus a word character.
pattern_3 = re.compile(pattern=r'(?<=\s)\.(?=\s\w)')
# Two or more consecutive spaces.
pattern_4 = re.compile(pattern=r' {2,}')

In [8]:
# Run the cleanup regexes in their original order; every pass rewrites df['text'].
for cleanup_pattern, replacement in (
    (pattern_1, ' '),
    (pattern_2, '.'),
    (pattern_3, ''),
    (pattern_4, ' '),
):
    df['text'] = df['text'].replace(to_replace=cleanup_pattern, value=replacement, regex=True)

In [10]:
# Per-story length in whitespace-separated words and in characters
story_text = df['text']
df['text_word_len'] = story_text.str.split().str.len()
df['text_char_len'] = story_text.str.len()

# Corpus-wide averages, rounded up to whole numbers
avg_word_text, avg_char_text = (
    ceil(df[column].mean()) for column in ('text_word_len', 'text_char_len')
)

print('\n- Average size of short stories in words:', avg_word_text)
print('\n- Average size of short stories in characters:', avg_char_text)


- Average size of short stories in words: 1760

- Average size of short stories in characters: 11258


In [11]:
# Some sentence statistics for chunking

def get_sent_stats_words(text):
    """Return the mean number of words per '.'-delimited segment, rounded up.

    Every piece produced by ``text.split('.')`` is counted, including empty
    pieces (for example the one after a trailing period).
    """
    word_counts = [len(sentence.split()) for sentence in text.split('.')]
    return ceil(mean(word_counts))

def get_sent_stats_chars(text):
    """Return the mean character length of the '.'-delimited segments, rounded up.

    Every piece produced by ``text.split('.')`` is counted, including empty ones.
    """
    segment_lengths = list(map(len, text.split('.')))
    return ceil(mean(segment_lengths))

# Per-story average sentence length (words / characters)
df['sent_word_len'] = df['text'].map(get_sent_stats_words)
df['sent_char_len'] = df['text'].map(get_sent_stats_chars)

# Corpus-wide averages of the per-story figures, rounded up
avg_word_sent, avg_char_sent = (
    ceil(df[column].mean()) for column in ('sent_word_len', 'sent_char_len')
)

print('\n- Average sentence length in words:', avg_word_sent)
print('\n- Average sentence length in characters:', avg_char_sent)


- Average sentence length in words: 13

- Average sentence length in characters: 79


In [12]:
# Lower-cased copy of the cleaned text; this column feeds the stop-word removal step

df['text_lower'] = df['text'].str.lower()

In [18]:
# Removing stop words

def remove_stops(data):
    """Drop stop words from a text and return the remaining tokens joined by spaces.

    Parameters
    ----------
    data : object
        Text to filter; it is converted with ``str()`` and split on whitespace.

    Returns
    -------
    str
        The tokens that are not in the module-level ``stop_words`` collection,
        in their original order, separated by single spaces.
    """
    # Build a set once per call so each token lookup is O(1) instead of a list scan.
    stops = set(stop_words)
    kept = [word for word in str(data).split() if word not in stops]
    return ' '.join(kept)

In [19]:
# Filter stop words out of every lower-cased story
df['no_stops_text'] = df['text_lower'].map(remove_stops)

In [21]:
# Load the small Russian spaCy pipeline with the parser and NER components disabled
# (for efficiency), then raise the maximum accepted text length for long stories.

nlp = spacy.load('ru_core_news_sm', disable=['parser', 'ner'])
nlp.max_length = 1_500_000

In [22]:
def lemmatize(data, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize every text in ``data`` with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    data : iterable of str
        Texts to process (e.g. a pandas Series of strings).
    allowed_postags : collection of str
        Coarse POS tags (``token.pos_``) to keep; all other tokens are dropped.

    Returns
    -------
    list of list of str
        One list of lemmas per input text, in input order.
    """
    # The total is only known for sized inputs; generators get an open-ended bar.
    try:
        total = len(data)
    except TypeError:
        total = None

    lemmatized = []
    # One progress bar over documents (not one per document over its tokens);
    # nlp.pipe streams the texts through the pipeline in batches.
    for doc in tqdm(nlp.pipe(data), total=total):
        lemmatized.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return lemmatized

In [24]:
df['text_lemma'] = data_lemmatized

In [None]:
# Lemmatizing tokens with spaCy 

data_lemmatized = lemmatize(df['no_stops_text'])

In [25]:
# Plain Python list of per-story lemma lists, used to train the phrase model

lemmatized = df['text_lemma'].to_list()

In [None]:
# Train the bigram phrase detector on the lemmatized stories
# (a higher threshold yields fewer phrases).

bigram = gensim.models.Phrases(
    lemmatized,
    min_count=5,
    threshold=100,
)
# trigram = gensim.models.Phrases(bigram[lemmatized], threshold=100)  

# Wrap the trained detector in a Phraser, the faster way to club a sentence into bigrams

bigram_mod = gensim.models.phrases.Phraser(bigram)
# trigram_mod = gensim.models.phrases.Phraser(trigram)

In [13]:
# Define the bigram helper (the trigram variant is kept below, commented out)
# Bigrams are two words that frequently occur together in a document; trigrams are three such words.

def make_bigrams(texts):
    """Pass each tokenised document through ``bigram_mod`` and return the results as a list."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [28]:
# Merge detected bigrams inside every lemmatized story

data_bigrams = make_bigrams(texts=lemmatized)

In [29]:
# Attach the bigram-merged token lists to the frame, row for row
df['text_bigrams'] = pd.Series(data_bigrams, index=df.index)

In [30]:
# Inspect the first rows with all derived columns
df.head(n=5)

Unnamed: 0,title,text,text_word_len,text_char_len,sent_word_len,sent_char_len,text_lower,no_stops_text,text_lemma,text_bigrams
0,"36. Прогулка с лётчиком. from1988, Из цикла Мо...",. ПРОГУЛКА С ЛЁТЧИКОМ. \n Замерзшая Атлантика...,1663,11131,17,110,. прогулка с лётчиком. \n замерзшая атлантика...,. прогулка лётчиком. замерзшая атлантика нарис...,"[прогулка, лётчик, замёрзнуть, атлантика, нари...","[прогулка, лётчик, замёрзнуть, атлантика, нари..."
1,Награды и награждаемые из книги академия жизни-1,\nНАГРАДЫ И НАГРАЖДАЕМЫЕ \n \n \nВообще наград...,1648,10525,11,67,\nнаграды и награждаемые \n \n \nвообще наград...,награды награждаемые вообще награды дело хорош...,"[награда, награждать, вообще, награда, дело, х...","[награда, награждать, вообще, награда, дело, х..."
2,Отрывок из книги Мукамал,\nМ У К А М А Л -отрывок. \n БЫЛЬ \n .Эта подл...,1287,8106,16,100,\nм у к а м а л -отрывок. \n быль \n .эта подл...,м м л -отрывок. быль .эта подлинная история де...,"[м, м, л, быль, .эта, подлинный, история, деву...","[м, м, л, быль, .эта, подлинный, история, деву..."
3,Князь Владимир 12,Т е б я с ю д а в л о в у ш к у з а м а н и л ...,370,1750,7,32,т е б я с ю д а в л о в у ш к у з а м а н и л ...,т е б ю д л ш з м н л киевский князь ярополк о...,"[е, б, ю, д, л, киевский, князь, рогнеда, выхо...","[е, б_ю, д_л, киевский_князь, рогнеда, выходит..."
4,Кулёма - забытое слово из далёкого детства,\nЕсли быть предельно точным то правильно это ...,472,3286,17,117,\nесли быть предельно точным то правильно это ...,предельно точным правильно это слово звучит ку...,"[предельно, точный, правильно, слово, звучать,...","[предельно, точный, правильно, слово, звучать,..."


In [None]:
# df.to_parquet('proza_lem.parquet')

### **Latent Dirichlet Allocation (LDA)**
##### *(topics as probability distributions for the occurrence of different words)*

In [4]:
# Reload the cached preprocessing result; only the title and bigram tokens are read
df = pd.read_parquet(
    'proza_lem.parquet',
    engine='pyarrow',
    columns=['title', 'text_bigrams'],
)

In [10]:
# The frame loaded from parquet is named `df`; `df1` is never defined in this notebook.
lemmas = df['text_bigrams'].tolist()

In [18]:
# Tokenised documents for LDA, taken from the frame itself rather than from a
# variable left over from the preprocessing section, so this cell also works
# when the notebook is resumed from the cached parquet file.
# list(...) normalises each row to a plain list (parquet may hand rows back as arrays).

texts = [list(doc) for doc in df['text_bigrams']]

# Creating Dictionary (token <-> integer id mapping)

id2word = corpora.Dictionary(texts)

# Term Document Frequency: bag-of-words (token_id, count) pairs per document

corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# Raw bag-of-words of the first document: (token_id, count) pairs

print(corpus[0])

# The same document with token ids mapped back to their words

[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

In [20]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=6):
    """
    Train one LDA model per candidate topic count and score each with c_v coherence.

    Parameters:
    ----------
    dictionary : Gensim dictionary (used both for training and for coherence scoring)
    corpus : Gensim corpus (bag-of-words documents)
    texts : List of input texts (tokenised documents)
    limit : Upper bound of the topic-count range (exclusive, as in ``range``)
    start : First number of topics to try
    step : Increment between consecutive topic counts

    Returns:
    -------
    model_list : List of LDA topic models, one per value in range(start, limit, step)
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train with the dictionary passed in by the caller, not a notebook global,
        # so the function works with any dictionary/corpus pair.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Sweep the number of topics over range(2, 49, 6), training and scoring one model per value

model_list, coherence_values = compute_coherence_values(
    dictionary=id2word,
    corpus=corpus,
    texts=texts,
    start=2,
    limit=49,
    step=6,
)

In [None]:
# Show graph: coherence score against the number of topics

# Same range as the sweep that produced coherence_values
limit = 49
start = 2
step = 6
x = range(start, limit, step)

# Label the line itself: ("coherence_values") is a bare string rather than a
# one-element tuple, so handing it to legend() does not give the intended label.
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
# List every candidate topic count next to its coherence score (4 decimals)

for topic_count, coherence in zip(x, coherence_values):
    print("Num Topics =", topic_count, " has Coherence Value of", round(coherence, 4))

Choosing a ‘k’ that marks the end of a rapid growth of topic coherence usually offers meaningful and interpretable topics. Picking an even higher value can sometimes provide more granular sub-topics.




In [None]:
# Pick the sixth model of the sweep (index 5) by hand and list its topics

optimal_model = model_list[5]
model_topics = optimal_model.show_topics(formatted=False)
top_words_per_topic = optimal_model.print_topics(num_words=10)
pprint(top_words_per_topic)

In [None]:
# One word cloud per topic, built from that topic's 20 highest-weighted words

for topic_id in range(optimal_model.num_topics):
    word_weights = dict(optimal_model.show_topic(topic_id, 20))
    cloud = WordCloud().fit_words(word_weights)
    plt.figure()
    plt.imshow(cloud)
    plt.axis("off")
    plt.title("Topic #" + str(topic_id))
    plt.show()

In [None]:
# Interactive pyLDAvis view of the selected model

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    optimal_model,
    corpus,
    id2word,
    mds='mmds',
)
vis