In [None]:
#import libs
import re

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymorphy2
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import *
from PIL import Image
from wordcloud import WordCloud

In [None]:
# read file
file_path = None    # TODO: set to the path of the input CSV before running this cell
text_column = None  # TODO: set to the name of the column that holds the raw text

# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` is its replacement (skip malformed rows, report them);
# older pandas rejects the new keyword with TypeError, so fall back in that case.
try:
    a = pd.read_csv(file_path, on_bad_lines='warn')
except TypeError:
    a = pd.read_csv(file_path, error_bad_lines=False)

# Empty CSV cells are loaded as NaN, which scikit-learn's text vectorizers refuse
# as a document. Turn them into empty strings so row alignment is preserved.
if text_column is not None:
    a[text_column] = a[text_column].fillna('').astype(str)

In [None]:
# preprocessing
# Snowball stemmer for Russian. NOTE(review): nothing in the visible cells uses
# `stemmer` (lemmatisation is done with pymorphy2) -- confirm whether it is still needed.
stemmer = SnowballStemmer(language='russian')

def tfidf_clean_text(text):
    """Normalise one raw document before it is tokenised.

    Steps, in order: lowercase the text, strip markup with BeautifulSoup (lxml
    parser), turn the '|||' separator into a space, replace every http... run
    of non-whitespace characters with the '<URL>' placeholder, delete every
    'x' character, replace the literal two-character sequence backslash + n
    with a space, and replace '%' with a ' <проценты>' placeholder.

    Args:
        text: the raw document. Missing values (None or a float NaN, which is
            what pandas produces for empty CSV cells) and empty input are
            accepted.

    Returns:
        The cleaned string; '' for missing or empty input.
    """
    # Missing values have no .lower(); treat them as empty documents instead of
    # failing with AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not text:
        return ''

    text = text.lower()
    text = BeautifulSoup(text, 'lxml').text
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'http\S+', r'<URL>', text)
    # NOTE(review): this deletes every Latin 'x', including inside ordinary words.
    # Presumably it targets 'xxxx'-style masking in the data -- confirm.
    text = text.replace('x', '')
    # Matches an escaped newline (backslash followed by 'n'), not a real line break.
    text = text.replace('\\n', ' ')
    text = text.replace('%', ' <проценты>')
    return text

# Project-specific stop words: one entry per line in a cp1251-encoded text file.
# The name is bound to an empty list first so it exists even if the file cannot be opened.
my_stopwords_rus = []

with open('../my_stopwords_rus.txt', encoding="cp1251") as stopword_file:
    my_stopwords_rus = list(map(str.strip, stopword_file))

# Morphological analyser used to reduce tokens to their normal (dictionary) form.
morph = pymorphy2.MorphAnalyzer()
    
_NLTK_RUSSIAN_STOPWORDS = None


def _nltk_russian_stopwords():
    """Return NLTK's Russian stop words as a frozenset, loading them only once."""
    global _NLTK_RUSSIAN_STOPWORDS
    if _NLTK_RUSSIAN_STOPWORDS is None:
        _NLTK_RUSSIAN_STOPWORDS = frozenset(stopwords.words('russian'))
    return _NLTK_RUSSIAN_STOPWORDS


def tfidf_preprocess(text):
    """Tokenise one document for the TF-IDF vectorizer.

    The text is cleaned with `tfidf_clean_text`, split by
    `gensim.utils.simple_preprocess` (tokens of 2 to 30 characters), and a
    token is kept only when neither the token itself nor its pymorphy2 normal
    form is a stop word (NLTK Russian list plus `my_stopwords_rus`).

    Args:
        text: the raw document.

    Returns:
        list of str: the surviving tokens, in document order.
    """
    # Calling stopwords.words('russian') for every token rebuilds the list each
    # time and then scans it linearly; sets make each lookup constant-time.
    nltk_stop = _nltk_russian_stopwords()
    custom_stop = set(my_stopwords_rus)

    result = []
    text = tfidf_clean_text(text)
    for token in gensim.utils.simple_preprocess(text, min_len=2, max_len=30):
        if token in nltk_stop:
            continue
        norm = morph.parse(token)[0].normal_form
        if norm in custom_stop or norm in nltk_stop:
            continue
        # NOTE(review): the surface token is kept, not its normal form; the lemma
        # is only used for stop-word filtering. Confirm this is intended.
        result.append(token)
    return result

In [None]:
#vectorization
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf= TfidfVectorizer( use_idf=True, tokenizer=tfidf_preprocess, analyzer='word', ngram_range=(2,5), max_df=0.9, min_df=10)
tfidf.fit(a[text_column])

In [None]:
# parallellized transformation
%%time
import multiprocessing as mp
import scipy.sparse as sp

num_partitions=176
num_workers=60

def parallelize_dataframe(df, func):
    """Apply `func` to chunks of `df` in parallel and stack the sparse results.

    `df` is cut into `num_partitions` pieces with `np.array_split`; each piece
    is handed to `func` in a pool of `num_workers` processes (both are
    module-level settings), and the matrices returned by `func` are stacked
    vertically in partition order.

    Args:
        df: the data to split row-wise.
        func: callable taking one partition and returning a sparse matrix; it
            is sent to worker processes by `pool.map`.

    Returns:
        A CSR sparse matrix with the per-partition results stacked row-wise.
    """
    df_split = np.array_split(df, num_partitions)
    # The context manager shuts the worker processes down even when func raises,
    # so a failed run does not leave the pool's processes behind.
    with mp.Pool(num_workers) as pool:
        print('Start mapping')
        partial_matrices = pool.map(func, df_split)
    print('Concat together')
    return sp.vstack(partial_matrices, format='csr')

def func(df):
    """Vectorise one partition with the already-fitted global `tfidf`.

    Args:
        df: a partition whose `text_column` entry holds the documents.

    Returns:
        Whatever `tfidf.transform` returns for that partition's documents.
    """
    print('Apply to partition')
    return tfidf.transform(df[text_column])

# TF-IDF matrix for the whole dataset, computed partition by partition.
X = parallelize_dataframe(df=a, func=func)

In [None]:
# LSA aka SVD topic extraction
%%time
from sklearn.decomposition import TruncatedSVD
svd_model = TruncatedSVD(n_components=9,algorithm='randomized',n_iter=50,random_state=22)
svd_model.fit(X)

In [None]:
# top terms for every topic
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists, else the old one.
if hasattr(tfidf, 'get_feature_names_out'):
    terms = list(tfidf.get_feature_names_out())
else:
    terms = tfidf.get_feature_names()

# topics[i] maps each of the 50 highest-weighted terms of component i to its weight.
topics = []
for i, comp in enumerate(svd_model.components_):
    sorted_terms = sorted(zip(terms, comp), key=lambda pair: pair[1], reverse=True)[:50]
    topics.append(dict(sorted_terms))
    print('Topic '+str(i)+': ')
    for term, _weight in sorted_terms:
        print(term)
        print(' ')

In [None]:
# wordcloud generation

# The mask image is loaded into an array and handed to WordCloud as its `mask`.
mask = np.array(Image.open("borders2.png"))

# Word cloud of the term -> weight mapping stored at topics[8].
cloud_builder = WordCloud(
    width=1000,
    height=1000,
    max_words=50,
    background_color='white',
    colormap='plasma',
    mask=mask,
)
wordcloud = cloud_builder.generate_from_frequencies(topics[8])

plt.rcParams.update({'font.size': 20})
plt.figure(figsize=(15, 15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()