In [None]:
!pip install pymorphy2[fast]

In [None]:
import re
from collections import Counter
import pandas as pd
import numpy as np
from gensim import corpora, models
import pymorphy2
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Single shared morphological analyzer instance; the lemmatize helper reads it
# as a module-level global.
morph = pymorphy2.MorphAnalyzer()

In [None]:
# Memo of token -> lemma. News text repeats the same word forms constantly,
# so each distinct token only needs to go through the analyzer once.
_LEMMA_CACHE = {}


def lemmatize(token):
    """Return the normal form of ``token``.

    The first parse produced by the module-level ``morph`` analyzer is used.
    Results are memoized in ``_LEMMA_CACHE`` so repeated tokens are not
    re-parsed.

    Args:
        token: a single word (string) to normalise.

    Returns:
        The ``normal_form`` of the first parse of ``token``.
    """
    try:
        return _LEMMA_CACHE[token]
    except KeyError:
        lemma = morph.parse(token)[0].normal_form
        _LEMMA_CACHE[token] = lemma
        return lemma

In [None]:
# Load the news dataset from the Kaggle input directory into a DataFrame.
news_df = pd.read_csv('/kaggle/input/russian-news-2020/news.csv')

In [None]:
# Preview the first rows of the frame.
news_df.head()

In [None]:
# (number of rows, number of columns) of the frame.
news_df.shape

# Data cleaning

In [None]:
# ria.ru: pull the first dd.mm.yyyy fragment out of the raw value and rewrite
# it as yyyy-mm-dd. Rows where nothing matched (non-string results) are left
# untouched.
def _dotted_date_to_iso(value):
    """Turn 'dd.mm.yyyy' into 'yyyy-mm-dd'; pass non-strings through as is."""
    if type(value) is not str:
        return value
    return '-'.join(reversed(value.split('.')))


ria_mask = news_df['source'] == 'ria.ru'
ria_dates = news_df.loc[ria_mask, 'publication_date'].str.extract(
    r'(?P<date>\d{2}\.\d{2}\.\d{4})', expand=False
)
news_df.loc[ria_mask, 'publication_date'] = ria_dates.apply(_dotted_date_to_iso)

In [None]:
# lenta.ru: keep only the part of the value that precedes the first 'T'.
lenta_mask = news_df['source'] == 'lenta.ru'
lenta_dates = news_df.loc[lenta_mask, 'publication_date']
news_df.loc[lenta_mask, 'publication_date'] = lenta_dates.str.split('T').str[0]

In [None]:
# Russian month names (genitive case) mapped to zero-padded month numbers.
month_mapper = {
    name: f'{number:02d}'
    for number, name in enumerate(
        [
            'января', 'февраля', 'марта', 'апреля', 'мая', 'июня',
            'июля', 'августа', 'сентября', 'октября', 'ноября', 'декабря',
        ],
        start=1,
    )
}


def _meduza_date_to_iso(raw_date):
    """Rebuild a meduza.io date string as 'yyyy-mm-dd'.

    The value is split on whitespace; field 1 is used as the day, field 2 as
    the month name (looked up in ``month_mapper``) and field 3 as the year.
    Field 0 is ignored. Non-string values are returned unchanged.
    """
    if type(raw_date) is not str:
        return raw_date
    parts = raw_date.split()
    return f'{parts[3]}-{month_mapper[parts[2]]}-{parts[1].zfill(2)}'


meduza_mask = news_df['source'] == 'meduza.io'
news_df.loc[meduza_mask, 'publication_date'] = (
    news_df.loc[meduza_mask, 'publication_date'].apply(_meduza_date_to_iso)
)

In [None]:
# tjournal.ru: the raw value is interpreted as seconds since the Unix epoch
# and formatted as yyyy-mm-dd.
tjournal_mask = news_df['source'] == 'tjournal.ru'
tjournal_timestamps = pd.to_datetime(news_df.loc[tjournal_mask, 'publication_date'], unit='s')
news_df.loc[tjournal_mask, 'publication_date'] = tjournal_timestamps.dt.strftime('%Y-%m-%d')

In [None]:
# tjournal.ru: drop newlines, then collapse every run of whitespace into a
# single space.
# `regex=` is passed explicitly: the default of Series.str.replace changed to
# regex=False in pandas 2.0, where r'\s+' would otherwise be searched for as
# literal text and the collapse would silently do nothing.
# NOTE(review): newlines are removed rather than replaced by a space, so words
# separated only by a line break are glued together - confirm this is intended.
tjournal_mask = news_df['source'] == 'tjournal.ru'
news_df.loc[tjournal_mask, 'text'] = (
    news_df.loc[tjournal_mask, 'text']
    .str.replace('\n', '', regex=False)
    .str.replace(r'\s+', ' ', regex=True)
)

In [None]:
# tjournal.ru: collect every '#word' found in the article text, join them with
# ', ' and strip the '#' markers to fill the 'tags' column.
is_tjournal = news_df['source'] == 'tjournal.ru'
hashtags = news_df.loc[is_tjournal, 'text'].str.findall(r'#\w+')
news_df.loc[is_tjournal, 'tags'] = hashtags.str.join(', ').str.replace('#', '')

In [None]:
# tjournal.ru: keep only the text that precedes the first '#'.
# Splitting once on '#' leaves texts without any '#' intact; slicing with
# str.find() would return -1 for them and cut off the last character. The
# vectorised .str accessor also passes missing values through instead of
# raising.
tjournal_mask = news_df['source'] == 'tjournal.ru'
news_df.loc[tjournal_mask, 'text'] = (
    news_df.loc[tjournal_mask, 'text'].str.split(pat='#', n=1).str[0]
)

In [None]:
# One raw text per row, in row order. Missing texts become empty strings so
# that the tokenisation step can call .lower() on every item while the list
# stays aligned with the rows of news_df.
documents = news_df['text'].fillna('').tolist()

# Text Preprocessing
Split each text into lowercase word tokens, keep only the tokens longer than two characters, and reduce those to their normal (dictionary) form.

In [None]:
# Tokenise every document: lowercase it, take the \w+ runs, discard tokens of
# one or two characters and lemmatize the rest.
texts = []
for document in documents:
    tokens = re.findall(r'\w+', document.lower())
    texts.append([lemmatize(token) for token in tokens if len(token) > 2])

Let's create a dictionary of words from our texts. We keep only words that appear in at least 5 documents and in no more than 25% of all documents, and cap the vocabulary at the 25,000 most frequent of them.

In [None]:
# Build the vocabulary from the tokenised texts, prune it, then encode every
# text as a bag-of-words vector over the pruned vocabulary.
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(
    no_below=5,
    no_above=0.25,
    keep_n=25000,
)
corpus = list(map(dictionary.doc2bow, texts))

# Training model
![topic_modeling](https://miro.medium.com/max/1200/1*IJw8N-HSEzLpwJDS6JVs-w.png)

In [None]:
# Train a 100-topic LDA model with 50 passes over the corpus.
# random_state is fixed so that re-running the notebook starts from the same
# initialisation; without it every run yields different topics.
# NOTE(review): with several worker processes the result may still vary
# slightly between runs - confirm if exact reproducibility is required.
ldamodel = models.ldamulticore.LdaMulticore(
    corpus,
    id2word=dictionary,
    num_topics=100,
    passes=50,
    alpha='symmetric',
    eta=None,
    decay=0.5,
    random_state=42,
)

# Perplexity


![perplexity](https://wikimedia.org/api/rest_v1/media/math/render/svg/fc7974a9bf394db8698fb76c0fa060c6c21068ed)

In [None]:
# The value returned by log_perplexity is used as a base-2 exponent with the
# sign flipped; the result of that conversion is what gets printed.
perplexity = ldamodel.log_perplexity(corpus)
print(pow(2, -perplexity))

# Analysis of the resulting topics
Let's see the resulting topics and their most frequent words.

In [None]:
# Print every topic with its ten top words, one blank line between topics.
for topic_id, description in ldamodel.print_topics(num_topics=-1, num_words=10):
    print(f"Topic {topic_id} : {description}", end="\n\n")

Let's see the distribution of rubrics, subrubrics and tags by topics

In [None]:
# Label every document with its most probable topic.
# minimum_probability=0.0 asks for the full topic distribution: with the
# default threshold a document can come back with an empty list (e.g. an empty
# bag-of-words after vocabulary pruning), which would make max() raise
# ValueError.
doc_topics = ldamodel.get_document_topics(corpus, minimum_probability=0.0)
news_df['topic'] = [
    max(topic_probs, key=lambda pair: pair[1])[0]
    for topic_probs in doc_topics
]

In [None]:
# For every topic, show the rubrics that occur more than 5 times among the
# documents assigned to it.
# Iterating over ldamodel.num_topics covers all topic ids;
# range(news_df.topic.max()) stopped one short and skipped the last topic.
for topic_id in range(ldamodel.num_topics):
    print(f'Topic: {topic_id}')
    counts = news_df.loc[news_df['topic'] == topic_id, 'rubric'].value_counts()
    print(counts[counts > 5])
    print()

In [None]:
# For every topic, show the subrubrics that occur more than 5 times among the
# documents assigned to it.
# Iterating over ldamodel.num_topics covers all topic ids;
# range(news_df.topic.max()) stopped one short and skipped the last topic.
for topic_id in range(ldamodel.num_topics):
    print(f'Topic: {topic_id}')
    counts = news_df.loc[news_df['topic'] == topic_id, 'subrubric'].value_counts()
    print(counts[counts > 5])
    print()

In [None]:
# For every topic, print the five most common tags among its documents.
# Iterating over ldamodel.num_topics covers all topic ids;
# range(news_df.topic.max()) stopped one short and skipped the last topic.
# The inner loop uses its own variable name instead of re-binding the topic
# index.
for topic_id in range(ldamodel.num_topics):
    print(f'Topic: {topic_id}')
    topic_tags = []
    for tag_string in news_df.loc[news_df['topic'] == topic_id, 'tags'].dropna():
        # Tags are stored as one ', '-separated string per document.
        topic_tags.extend(tag_string.split(', '))
    counts = Counter(topic_tags)
    print('\n'.join(map(str, counts.most_common(5))))
    print()

# Wordcloud


Visualizing each topic with a word cloud

In [None]:
# Draw one word cloud per topic from its 100 highest-weighted words.
# Iterating over ldamodel.num_topics covers all topic ids;
# range(news_df.topic.max()) stopped one short and skipped the last topic.
for topic_id in range(ldamodel.num_topics):
    print(f'Topic: {topic_id}')
    # show_topic yields (word, weight) pairs; the cloud scales words by weight.
    frequencies = dict(ldamodel.show_topic(topic_id, topn=100))
    wordcloud = WordCloud(
        max_font_size=50,
        max_words=100,
        background_color="white",
    ).generate_from_frequencies(frequencies)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Distribution of topics over time

In [None]:
# One bar chart per topic: number of documents assigned to the topic on each
# publication date.
# - The subplot count follows the model instead of a hard-coded 100, and the
#   loop covers every topic (range(news_df.topic.max()) skipped the last one).
# - The stray plt.figure() call, which only produced an extra empty figure,
#   is gone.
# - The sorted list of distinct dates is computed once, not twice per topic.
n_topics = ldamodel.num_topics
all_dates = news_df['publication_date'].dropna().drop_duplicates().sort_values()

# Keeps the original 9-inch height per topic (900 inches for 100 topics).
f, ax = plt.subplots(n_topics, 1, figsize=(75, 9 * n_topics))
# plt.subplots returns a bare Axes when there is a single row.
ax = np.atleast_1d(ax)

for topic_id in range(n_topics):
    counts = (
        news_df.loc[news_df['topic'] == topic_id, 'publication_date']
        .dropna()
        .value_counts()
        .to_dict()
    )
    # Dates on which the topic has no documents get a zero-height bar.
    ax[topic_id].bar(all_dates, all_dates.map(counts).fillna(0))
    ax[topic_id].set_title(topic_id)
    ax[topic_id].tick_params(labelrotation=90)