In [None]:
# Install the pymorphy2 morphological analyzer into the current environment.
!pip install pymorphy2
# Download the Russian pymorphy2 dictionary package and save it as pymorphy2-dicts-ru.tar.gz.
!wget -O pymorphy2-dicts-ru.tar.gz https://files.pythonhosted.org/packages/b2/b4/732ff6eeac8c9ea22e7e1c7a321b21b6f3ba19d5e0a8925f35da9c8ebbb2/pymorphy2-dicts-ru-2.4.404381.4453942.tar.gz

In [None]:
import pickle

import tarfile
from tqdm import tqdm_notebook as tqdm

from multiprocessing import Pool


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Russian stop-word list from NLTK; `tokenize` drops any token found in it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be present locally — confirm.
stop = stopwords.words('russian')

from string import punctuation
# Tokens filtered out as punctuation: every character of string.punctuation
# plus a few backtick / quote tokens (presumably the ones the tokenizer emits).
punkt = list(punctuation) + ["`", "``" ,"''", "'"]


import gensim
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import fasttext
import pymorphy2

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split


# !pip install MulticoreTSNE
from sklearn.decomposition import TruncatedSVD
from MulticoreTSNE import MulticoreTSNE as TSNE


In [None]:
# Unpack the downloaded dictionary archive into the current working directory.
# The context manager closes the archive even if extraction fails part-way,
# which the explicit open()/close() pair did not guarantee.
with tarfile.open('./pymorphy2-dicts-ru.tar.gz') as rudict:
    rudict.extractall()

In [None]:
# Morphological analyzer for Russian, loaded from the `data` folder inside the
# unpacked pymorphy2-dicts-ru directory. Used by `lemmatize` to get normal forms.
lemmatizer = pymorphy2.MorphAnalyzer(path='./pymorphy2-dicts-ru-2.4.404381.4453942/pymorphy2_dicts_ru/data',lang='ru')

In [None]:

# Load the Lenta news CSV into a DataFrame (columns used below: date, topic, title, text).
data = pd.read_csv('../input/corpus-of-russian-news-articles-from-lenta/lenta-ru-news.csv')

In [None]:
# Parse the `date` column into pandas datetimes so the `.dt` accessor can be used.
data['date'] = pd.to_datetime(data['date'])

In [None]:
# Topics that occur in more than 5000 rows; only these are kept for modelling.
topic_counts = data['topic'].value_counts()
topics_to_use = topic_counts[topic_counts > 5000].index

In [None]:
# Keep only the rows whose topic is one of the frequent topics selected above.
topic_mask = data['topic'].isin(topics_to_use)
data = data[topic_mask]

In [None]:
# Share of rows per remaining topic (fractions rather than raw counts).
data.topic.value_counts(normalize=True)

In [None]:
# Histogram of the year component of the `date` column.
data.date.dt.year.hist()

In [None]:
# Preview the first 10 rows of the filtered frame.
data.head(10)

In [None]:
# Total number of cells (rows x columns), not the row count; use `data.shape` for the dimensions.
data.size

In [None]:
# Rows with a missing topic. This should be empty here, because the `isin` filter
# applied earlier does not match missing values.
data[data.topic.isna()]

In [None]:
# Pull the three columns of interest out of the frame as separate Series.
news_title, news_text, news_topic = data['title'], data['text'], data['topic']

In [None]:
def tokenize(sent):
    """Split text into word tokens, dropping stop words and punctuation.

    Args:
        sent: Raw text. Any non-string value (for example a missing value
            coming out of pandas) is treated as empty input.

    Returns:
        list[str]: Tokens produced by ``word_tokenize`` that are neither in
        ``stop`` (compared case-insensitively) nor in ``punkt``. Returns
        ``[]`` for non-string input.
    """
    # Explicit guard instead of a bare ``except``: bad input still yields [],
    # but genuine failures (e.g. missing tokenizer data) and KeyboardInterrupt
    # are no longer silently turned into empty results.
    if not isinstance(sent, str):
        return []

    # NLTK's stop-word list is lowercase, so compare the lowercased token;
    # otherwise capitalised stop words (typically the first word of a title)
    # slip through the filter.
    return [
        word for word in word_tokenize(sent)
        if word.lower() not in stop and word not in punkt
    ]
    
def lemmatize(sent):
    """Join the normal forms of the given tokens into one space-separated string.

    Args:
        sent: Iterable of word tokens (as returned by ``tokenize``).

    Returns:
        str: The first normal form reported by ``lemmatizer.normal_forms`` for
        each token, joined with single spaces. An empty iterable gives ``""``.
        If lemmatization fails for any reason, a single space ``" "`` is returned.
    """
    try:
        return " ".join(lemmatizer.normal_forms(word)[0] for word in sent)
    except Exception:
        # Best-effort fallback kept from the original behaviour. Catching
        # ``Exception`` rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate (important inside Pool workers).
        return " "

    
def preprocess_sent(sent):
    """Tokenize raw text and return its lemmatized tokens as one space-joined string."""
    tokens = tokenize(sent)
    return lemmatize(tokens)

In [None]:
# Small sample of article texts (first 100) as a plain list, for manual spot checks.
s = news_text[:100].tolist()

In [None]:
# Spot-check the tokenize + lemmatize pipeline on the first sampled text.
preprocess_sent(s[0])

In [None]:
# Preprocess every title in parallel on 8 worker processes.
# `imap` yields results in input order, so `titles_preprocessed` stays aligned
# with `news_title` / `news_topic`.
# chunksize > 1 hands titles to the workers in batches; with the default of 1
# every short title is a separate inter-process round trip.
with Pool(8) as p:
    titles_preprocessed = list(tqdm(
                                    p.imap(preprocess_sent, news_title.tolist(), chunksize=1000),
                                    total=news_title.size
                                    )
                             )

In [None]:
# with open('titles_preprocessed.pkl', 'wb') as write_titles:
#     pickle.dump(titles_preprocessed, write_titles)

In [None]:
# Topic labels as a plain list, in the same row order as the titles.
y = news_topic.tolist()

In [None]:
# Stratified 75/25 split of the preprocessed titles and their topic labels,
# with a fixed seed so the split is repeatable.
title_train, title_test, y_train, y_test = train_test_split(
    titles_preprocessed,
    y,
    test_size=0.25,
    stratify=y,
    random_state=33,
)

In [None]:
def _write_fasttext_file(path, labels, texts):
    """Write one `__label__<label> <text>` line per example in fastText's supervised format.

    Args:
        path: Output file path (overwritten if it exists).
        labels: Sequence of topic strings.
        texts: Sequence of preprocessed titles, aligned with `labels`.
    """
    with open(path, 'w', encoding='utf-8') as out:
        for label, text in zip(labels, texts):
            # fastText splits each line on whitespace, so a multi-word topic
            # would be cut at its first space and the remaining words would be
            # read as part of the title. Join the label's words with "_" so
            # the whole topic stays a single label token.
            out.write('__label__' + '_'.join(label.split()) + ' ' + text + '\n')


_write_fasttext_file('train_data_titles.txt', y_train, title_train)
_write_fasttext_file('test_data_titles.txt', y_test, title_test)

In [None]:
%%time
# Baseline supervised fastText classifier trained on the title file with library defaults.
ft_model = fasttext.train_supervised(input='train_data_titles.txt')

In [None]:
# Top-3 predicted topics for a made-up headline, preprocessed the same way as the training titles.
ft_model.predict(preprocess_sent('Акула съела банкира: акции банка упали на 25%'), k=3)

In [None]:
# Top-3 predicted topics for a second made-up headline.
ft_model.predict(preprocess_sent('Кремль обеспокоен: висит груша, нельзя скушать'), k=3)

In [None]:
# Top-3 predicted topics for a third made-up headline.
ft_model.predict(preprocess_sent('В Австралии перестали интересоваться футболом'), k=3)

In [None]:
# Evaluate the baseline on the held-out file, considering only the top prediction (k=1).
# fastText reports (number of examples, precision, recall).
ft_model.test('test_data_titles.txt', k=1)

In [None]:
%%time
# Hand-tuned variant: 10 epochs, word bigrams, hierarchical softmax loss, learning rate 1.0.
ft_model_p1 = fasttext.train_supervised(input='train_data_titles.txt', epoch=10, wordNgrams=2, loss='hs', lr=1.0)

In [None]:
# Evaluate the hand-tuned model on the same held-out file (top prediction only).
ft_model_p1.test('test_data_titles.txt',k=1)

In [None]:
%%time
# Let fastText's autotune search hyperparameters against a validation file.
# NOTE(review): the validation file is the test file itself, so hyperparameters are
# selected on the data later used for evaluation — consider a separate validation split.
ft_model_p_o = fasttext.train_supervised(input='train_data_titles.txt', autotuneValidationFile='test_data_titles.txt')

In [None]:
# Evaluate the autotuned model.
# NOTE(review): this is the same file autotune optimised against, so the score is likely optimistic.
ft_model_p_o.test('test_data_titles.txt')

In [None]:
# Report the learning rate, epoch count and word n-gram length stored on the autotuned model.
print(f'Best learning rate: {ft_model_p_o.lr} \nBest epochs: {ft_model_p_o.epoch} \nBest_word_Ngrams: {ft_model_p_o.wordNgrams}')

In [None]:
# ft_model_p_o.save_model('lenta_titles_ft_model.bin')

In [None]:
# Dump all preprocessed titles, one per line, as the corpus for unsupervised training.
with open('titles_unsupervised.txt', 'w+', encoding='utf-8') as titles:
    for title in titles_preprocessed:
        titles.write(title+'\n')

In [None]:
%%time
# Unsupervised fastText embeddings on the title corpus: 50 dimensions,
# character n-grams of length 3 to 5.
ft_vectors = fasttext.train_unsupervised('titles_unsupervised.txt', minn=3,maxn=5, dim=50)

In [None]:
# Embedding vector for the word 'инвестиция' ("investment").
ft_vectors.get_word_vector('инвестиция')

In [None]:
# Subword units fastText uses for this word.
ft_vectors.get_subwords('инвестиция')

In [None]:
# Nearest neighbours of the correctly spelled word in the fastText embedding space.
ft_vectors.get_nearest_neighbors('инвестиция')

In [None]:
# Same query with a misspelled form of the word — presumably deliberate, to show
# that fastText can still place an unseen spelling via its subwords.
ft_vectors.get_nearest_neighbors('инвестеция')

In [None]:
# Vocabulary size of the unsupervised model.
print(len(ft_vectors.words))
# First 3000 vocabulary entries — presumably the most frequent words; TODO confirm fastText's ordering.
top3k = ft_vectors.words[:3000]
# Peek at the first few entries.
top3k[:10]

In [None]:
# Look up the embedding of every selected word, keeping the order of `top3k`.
top3k_vectors = list(map(ft_vectors.get_word_vector, top3k))

In [None]:
%%time
tsne_emb = TSNE(n_components=2, n_iter=2000, n_jobs=-1).fit_transform(np.array(top3k_vectors))

In [None]:
# Sanity check: should be (len(top3k), 2), i.e. one 2-D point per word.
tsne_emb.shape

In [None]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook

# Render Bokeh output inline in the notebook.
output_notebook()


# Interactive figure with pan / wheel-zoom / reset / save tools.
p = figure(tools="pan,wheel_zoom,reset,save", title="TSNE representation of FastText vectors (top 3k words)")

# One record per word: its 2-D t-SNE coordinates and the word itself for labelling.
source = ColumnDataSource(data=dict(x1=tsne_emb[:,0], x2=tsne_emb[:,1], names=top3k))

# Draw each word as a point.
p.scatter(x="x1", y="x2", source=source, size=5)

# Text labels drawn from the same source, shifted up by 6 so they sit above the points.
words = LabelSet(x="x1", y="x2", text="names", source=source,
                y_offset=6, text_font_size="6pt",text_color="#555555", text_align="center")

p.add_layout(words)

show(p)


In [None]:
# Whitespace-tokenise each preprocessed title for Word2Vec.
# split() without an argument discards empty strings; split(" ") turned blank
# titles and the " " lemmatization fallback into '' tokens that would end up
# in the Word2Vec vocabulary.
titles_for_w2v = [sent.split() for sent in titles_preprocessed]

In [None]:
%%time
from gensim.models import Word2Vec
# Train Word2Vec on the tokenised titles: 50-d vectors, context window 6,
# words seen fewer than 3 times ignored, fixed seed, 4 worker threads.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm against the installed gensim version.
w2v = Word2Vec(sentences=titles_for_w2v, min_count=3, size=50, window=6, seed=33, workers=4)


In [None]:
# Persist the trained Word2Vec model to disk.
w2v.save('lenta_titles_w2v_model.bin.gz')

In [None]:
# Reload the model from the file just written and confirm the type of the loaded object.
w2v = Word2Vec.load('lenta_titles_w2v_model.bin.gz')
type(w2v)

In [None]:
# Keyed word vectors of the trained model, used for the lookups below.
w2v_vectors = w2v.wv

In [None]:
# Word2Vec embedding of the correctly spelled word 'инвестиция' ("investment").
w2v_vectors['инвестиция']

In [None]:
# Look up a misspelled form of the word (presumably a deliberate contrast with
# fastText). Word2Vec has no subword fallback, so a word missing from the
# vocabulary raises KeyError. Catch and report it so the demonstration does
# not abort a Restart & Run All.
try:
    oov_vector = w2v_vectors['инвестеция']
except KeyError as err:
    oov_vector = None
    print(f'Out-of-vocabulary word: {err}')
oov_vector

In [None]:
# Nearest neighbours of 'инвестиция' in the Word2Vec space.
# Queried through `.wv`: calling most_similar on the model object itself is
# deprecated in gensim 3.x and removed in gensim 4.
w2v.wv.most_similar('инвестиция')

In [None]:
# Nearest neighbours of 'коррупция' ("corruption") in the Word2Vec space.
# Queried through `.wv`: calling most_similar on the model object itself is
# deprecated in gensim 3.x and removed in gensim 4.
w2v.wv.most_similar('коррупция')

In [None]:
# Vector arithmetic query: words close to 'чиновник' ("official") and far from
# 'взяточничество' ("bribery").
# Queried through `.wv`: calling most_similar on the model object itself is
# deprecated in gensim 3.x and removed in gensim 4.
w2v.wv.most_similar(positive=['чиновник'], negative=['взяточничество'])