In [1]:
import json
import re
import string

import numpy as np
import pandas as pd

import nltk
# nltk.download("stopwords")
# nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from pymystem3 import Mystem

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

import fasttext
from catboost import CatBoostClassifier
from tqdm import tqdm_notebook as tqdm

In [2]:
# Load the two labelled corpora: tn.json supplies the negative class
# (event = 0) and tp.json the positive class (event = 1).
# Presumably each file holds a flat list of sentence strings -- verify against the data.
with open(r'tn.json', 'r', encoding="utf8") as f:
    data_tn = json.load(f)

with open(r'tp.json', 'r', encoding="utf8") as f:
    data_tp = json.load(f)

# One `sentences` column per corpus plus a constant integer label.
df_tn = pd.DataFrame(data_tn, columns=['sentences'])
df_tn['event'] = np.zeros(len(data_tn), dtype=int)

df_tp = pd.DataFrame(data_tp, columns=['sentences'])
df_tp['event'] = np.ones(len(data_tp), dtype=int)

# Negatives first, then positives, with a fresh 0..N-1 index.
train_df = pd.concat([df_tn, df_tp], ignore_index = True)

In [3]:
# Hold out 20% of the sentences for evaluation.
# NOTE(review): the cached lem_prep_*.pickle files loaded further down appear to
# have been produced from this exact split (see the commented-out preprocessing
# calls), so changing test_size or random_state means regenerating them.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.sentences,
    train_df.event,
    test_size=0.2,
    random_state=17,
)

In [4]:
# Sanity check: size of the training split next to the full labelled set.
X_train.shape, train_df.shape

((1335,), (1669, 2))

In [5]:
# Class balance of the labelled data (0 = negative, 1 = positive).
train_df['event'].value_counts()

0    1340
1     329
Name: event, dtype: int64

Очень мало данных для обучения: всего 1669 предложений, из которых только 329 относятся к положительному классу.

## Create preprocessed df

In [6]:
m = Mystem()


def _normalize_text(text):
    """Lower-case `text` and reduce it to space-separated word characters."""
    # Every non-word character becomes a space (`_` counts as a word character
    # for `\W`, so underscores survive).
    text = re.sub(r'\W', ' ', text)
    # Digits are removed outright rather than replaced by a space.
    text = re.sub(r'\d', '', text.lower())
    # Collapse the whitespace runs left behind by the two substitutions above.
    return re.sub(r'\s+', ' ', text)


def preprocessing(data):
    """Clean and lemmatize every text in `data`.

    Each value is normalized (non-word characters -> spaces, lower-cased,
    digits removed, whitespace collapsed) and then lemmatized with the
    module-level Mystem instance `m`.

    Parameters
    ----------
    data : pandas.Series of str
        Texts to process. The input is not modified.

    Returns
    -------
    pandas.Series
        Lemmatized texts with the same index, name and dtype as `data`.

    Notes
    -----
    Values are processed positionally instead of through per-label
    `series[label] = ...` writes, so a non-unique index no longer breaks the
    function.
    """
    lemmatized = [
        # lemmatize() yields the text as a sequence of string pieces; joining
        # them with '' rebuilds one string per input text.
        ''.join(m.lemmatize(_normalize_text(text)))
        for text in tqdm(data)
    ]
    return pd.Series(lemmatized, index=data.index, name=data.name, dtype=data.dtype)

In [7]:
# NLTK's Russian stop-word list; needs the "stopwords" corpus to be available
# locally (see the commented-out nltk.download lines in the imports cell).
stopwords_ru = stopwords.words("russian")

In [8]:
# X_train_prep = preprocessing(X_train)

In [9]:
# Lemmatization is slow, so its result is cached on disk. If the cache file is
# missing it is rebuilt and saved, which keeps "Restart & Run All" working on a
# fresh checkout without un-commenting code.
# NOTE: the cache is not invalidated automatically -- delete the pickle after
# changing the train/test split or the preprocessing function.
try:
    X_train_prep = pd.read_pickle('lem_prep_train_df.pickle')
except FileNotFoundError:
    X_train_prep = preprocessing(X_train)
    X_train_prep.to_pickle('lem_prep_train_df.pickle')

In [10]:
# X_test_prep = preprocessing(X_test)

In [11]:
# Same on-disk cache as for the training split: load the lemmatized test
# sentences if present, otherwise compute and store them.
# NOTE: delete the pickle after changing the split or the preprocessing function.
try:
    X_test_prep = pd.read_pickle('lem_prep_test_df.pickle')
except FileNotFoundError:
    X_test_prep = preprocessing(X_test)
    X_test_prep.to_pickle('lem_prep_test_df.pickle')

In [12]:
def tfidf_vect(X_train, X_test, stopwords = None):
    """Fit a TF-IDF vectorizer on `X_train` and transform both splits.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Raw or preprocessed documents. Only `X_train` is used for fitting.
    stopwords : list of str or None, optional
        Passed through as the vectorizer's `stop_words`.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Sparse-backed frames for train and test, one column per vocabulary
        term, with a default positional index.
    """
    vectorizer = TfidfVectorizer(analyzer='word', max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords)
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)
    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new accessor and fall back on old versions.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = list(get_names())
    X_train_tfidf = pd.DataFrame.sparse.from_spmatrix(train_matrix, columns=feature_names)
    X_test_tfidf = pd.DataFrame.sparse.from_spmatrix(test_matrix, columns=feature_names)
    return X_train_tfidf, X_test_tfidf

In [13]:
def count_vect(X_train, X_test, stopwords = None, ngram = (1, 1)):
    """Fit a bag-of-words CountVectorizer on `X_train` and transform both splits.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Raw or preprocessed documents. Only `X_train` is used for fitting.
    stopwords : list of str or None, optional
        Passed through as the vectorizer's `stop_words`.
    ngram : tuple of (int, int), optional
        Passed through as the vectorizer's `ngram_range`; the default (1, 1)
        keeps unigrams only.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Sparse-backed frames for train and test, one column per vocabulary
        term, with a default positional index.
    """
    vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords, ngram_range=ngram)
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)
    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new accessor and fall back on old versions.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = list(get_names())
    X_train_count = pd.DataFrame.sparse.from_spmatrix(train_matrix, columns=feature_names)
    X_test_count = pd.DataFrame.sparse.from_spmatrix(test_matrix, columns=feature_names)
    return X_train_count, X_test_count

In [14]:
stemmer = SnowballStemmer("russian", ignore_stopwords=True)


def do_stemming(data):
    """Clean, tokenize, stop-word-filter and stem every text in `data`.

    Each value has non-word characters replaced by spaces, is lower-cased and
    stripped of digits, then tokenized with `word_tokenize`. Tokens found in
    the module-level `stopwords_ru` list, and punctuation-only tokens, are
    dropped; the rest are stemmed with the module-level Snowball `stemmer`
    and re-joined with single spaces.

    Parameters
    ----------
    data : pandas.Series of str
        Texts to process. The input is not modified.

    Returns
    -------
    pandas.Series
        Stemmed texts with the same index, name and dtype as `data`.

    Notes
    -----
    Values are processed positionally instead of through per-label
    `series[label] = ...` writes, so a non-unique index no longer breaks the
    function.
    """
    # Loop invariants: a set gives O(1) stop-word lookups, and the punctuation
    # string is built once instead of once per token.
    stop_set = set(stopwords_ru)
    punctuation = string.punctuation + '«—»'

    def stem_text(text):
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\d', '', text.lower())
        return ' '.join(
            stemmer.stem(token)
            for token in word_tokenize(text)
            # `not in punctuation` is a substring test against the punctuation
            # string, exactly as in the original filter.
            if token not in stop_set and token.strip() not in punctuation
        )

    stemmed = [stem_text(text) for text in tqdm(data)]
    return pd.Series(stemmed, index=data.index, name=data.name, dtype=data.dtype)

# Stem both splits once; the stemmed series are reused by several experiments below.
X_train_stem, X_test_stem = do_stemming(X_train), do_stemming(X_test)

HBox(children=(IntProgress(value=0, max=1335), HTML(value='')))




HBox(children=(IntProgress(value=0, max=334), HTML(value='')))




## CountVect + LogReg

In [15]:
# Baseline: raw sentences -> bag-of-words counts -> logistic regression.
X_train_count, X_test_count = count_vect(X_train, X_test)

model = LogisticRegression().fit(X_train_count, y_train)
y_pred = model.predict(X_test_count)

# F1 of the positive class (event = 1) on the held-out split.
f1_score(y_true=y_test, y_pred=y_pred)



0.547008547008547

In [16]:
# Per-class breakdown for the predictions above: the returned tuple holds
# precision, recall, F-score and support arrays, each ordered class 0, class 1.
precision_recall_fscore_support(y_test, y_pred)

(array([0.87368421, 0.65306122]),
 array([0.93609023, 0.47058824]),
 array([0.90381125, 0.54700855]),
 array([266,  68], dtype=int64))

## CountVec + nltk stopwords + LogReg

In [17]:
# Raw sentences -> counts with NLTK Russian stop words removed -> logistic regression.
X_train_count, X_test_count = count_vect(X_train, X_test, stopwords=stopwords_ru)

model = LogisticRegression().fit(X_train_count, y_train)
y_pred = model.predict(X_test_count)

# F1 of the positive class on the held-out split.
f1_score(y_true=y_test, y_pred=y_pred)



0.5087719298245614

## Stemming + CountVec(with nltk stopwords) + LogReg

In [18]:
# Stemmed sentences -> counts (stop words passed to the vectorizer as well) -> logistic regression.
X_train_count, X_test_count = count_vect(X_train_stem, X_test_stem, stopwords=stopwords_ru)

model = LogisticRegression().fit(X_train_count, y_train)
y_pred = model.predict(X_test_count)

# F1 of the positive class on the held-out split.
f1_score(y_true=y_test, y_pred=y_pred)



0.5932203389830508

## Stemming (with nltk stopwords) + TF-IDF Vec + LogReg

In [19]:
# Stemmed sentences -> TF-IDF features -> logistic regression.
X_train_tfidf, X_test_tfidf = tfidf_vect(X_train_stem, X_test_stem, stopwords=stopwords_ru)

model = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# F1 of the positive class on the held-out split.
f1_score(y_true=y_test, y_pred=y_pred)



0.39583333333333337

## Lemmatization + TF-IDF Vec + LogReg

In [20]:
# Lemmatized sentences -> TF-IDF features -> logistic regression.
X_train_tfidf, X_test_tfidf = tfidf_vect(X_train_prep, X_test_prep, stopwords=stopwords_ru)

model = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# F1 of the positive class on the held-out split.
f1_score(y_true=y_test, y_pred=y_pred)



0.41237113402061853

## Lemmatization + Count Vec + LogReg

In [21]:
# Lemmatized sentences -> bag-of-words counts -> logistic regression.
X_train_count, X_test_count = count_vect(X_train_prep, X_test_prep, stopwords=stopwords_ru)

model = LogisticRegression().fit(X_train_count, y_train)
y_pred = model.predict(X_test_count)

# F1 of the positive class on the held-out split.
f1_score(y_true=y_test, y_pred=y_pred)



0.6050420168067226

## Lemmatization + Count Vec + CatBoost

In [22]:
# Lemmatized sentences -> bag-of-words counts -> CatBoost with default settings.
X_train_count, X_test_count = count_vect(X_train_prep, X_test_prep, stopwords=stopwords_ru)

model = CatBoostClassifier(verbose=False)
model.fit(X_train_count, y=y_train)
y_pred = model.predict(X_test_count)

# F1 of the positive class on the held-out split.
f1_score(y_true=y_test, y_pred=y_pred)

0.5739130434782609

## Stemming + Count Vec + CatBoost

In [23]:
# Stemmed sentences -> bag-of-words counts -> CatBoost with default settings.
X_train_count, X_test_count = count_vect(X_train_stem, X_test_stem, stopwords=stopwords_ru)

model = CatBoostClassifier(verbose=False)
model.fit(X_train_count, y=y_train)
y_pred = model.predict(X_test_count)

# F1 of the positive class on the held-out split.
f1_score(y_true=y_test, y_pred=y_pred)

0.6140350877192983

In [24]:
# ROC AUC for whichever `model` / `X_test_count` the preceding cell left in
# scope (in notebook order: CatBoost on stemmed count features).
# Column 1 of predict_proba is the probability of the positive class.
# `roc_auc_score` is imported in the imports cell at the top of the notebook.
y_proba = model.predict_proba(X_test_count)[:, 1]

roc_auc_score(y_test, y_proba)

0.870217823971694

## Lemmatization + Count Vec (n_gram) + LogReg

In [25]:
# Lemmatized sentences -> unigram + bigram counts -> logistic regression.
X_train_count, X_test_count = count_vect(X_train_prep, X_test_prep, stopwords=stopwords_ru, ngram=(1, 2))

model = LogisticRegression().fit(X_train_count, y_train)
y_pred = model.predict(X_test_count)

# F1 of the positive class on the held-out split.
f1_score(y_true=y_test, y_pred=y_pred)



0.5999999999999999

In [26]:
# from varname import varname
# models = [('logreg', LogisticRegression()), ('catboost', CatBoostClassifier(verbose=False))]
# word_proc = [('stem', X_train_stem, X_test_stem), ('lem', X_train_prep, X_test_prep)]
# vector = [('count', X_train_count)]
# for name, model in models:
#     print(name)
# https://www.codeastar.com/choose-machine-learning-models-python/

## Word2Vec

### Для обучения эмбеддингов будем использовать также данные из тестовой выборки.

In [27]:
# Unlabelled articles; the cells below read the `title` and `text` columns of this frame.
with open(r'test_data.json', 'r', encoding="utf8") as f:
    data = json.load(f)
test_df = pd.DataFrame(data)

In [28]:
# Make article titles unique: the first occurrence of a title is kept as is,
# every later duplicate gets its row label appended. Done as one vectorized
# assignment instead of ~10k per-row `.loc` writes.
ind = test_df.loc[test_df.duplicated('title', keep = 'first'), 'title'].index
is_repeat = test_df.index.isin(ind)
row_labels = pd.Series(test_df.index.astype(str), index=test_df.index)
test_df['new_title'] = test_df['title'].where(~is_repeat, test_df['title'] + row_labels)

HBox(children=(IntProgress(value=0, max=10782), HTML(value='')))




In [29]:
# Split every article into sentences, producing one (title, sentence) row per
# sentence. All rows are collected first and the frame is built once:
# DataFrame.append inside a loop is quadratic and was removed in pandas 2.0.
sentence_rows = [
    (title, sentence)
    for title, text in tqdm(zip(test_df['new_title'], test_df['text']), total=len(test_df))
    for sentence in nltk.tokenize.sent_tokenize(text, 'russian')
]
new_test_df = pd.DataFrame(sentence_rows, columns=['title', 'sentences'])

HBox(children=(IntProgress(value=0, max=10782), HTML(value='')))




In [30]:
# Stemming every sentence of the unlabelled corpus is slow, so the frame with
# the extra `stem_sent` column is cached on disk. If the cache is missing it is
# rebuilt from the `new_test_df` created above and saved.
# NOTE: delete the pickle after changing do_stemming or the input data.
try:
    new_test_df = pd.read_pickle('stem_new_test_data.pickle')
except FileNotFoundError:
    new_test_df['stem_sent'] = do_stemming(new_test_df.sentences)
    new_test_df.to_pickle('stem_new_test_data.pickle')

In [31]:
# Inspect short sentences: rows whose stemmed text is under 50 characters long.
new_test_df[new_test_df.stem_sent.str.len() < 50]

Unnamed: 0,title,sentences,stem_sent
2,Уралкуз” изготовил рекордное количество осей,За 10 месяцев 2019 года отгружено более 12 тыс...,месяц год отгруж тыс единиц продукц
3,Уралкуз” изготовил рекордное количество осей,"""Мы побили рекорд 2013 года.",поб рекорд год
4,Уралкуз” изготовил рекордное количество осей,"Тогда за весь год мы отгрузили почти 11,2 тыс....",ве год отгруз тыс штук локомотивн ос
20,Отключение Ирана от интернета выявило сильные ...,Рост тарифов на бензин стал таким вынужденным ...,рост тариф бензин стал так вынужден шаг
25,Отключение Ирана от интернета выявило сильные ...,О новых тарифах было объявлено ночью в выходно...,нов тариф объявл ноч выходн ден ноябр
...,...,...,...
165150,Энергия старта: участники программы выступили ...,"""Я в ''Океане'' уже в четвертый раз, мне очень...",океан четверт очен понрав эт смен
165151,Энергия старта: участники программы выступили ...,До этого я никогда раньше не имела дела с отра...,раньш имел дел отрасл энергетик
165153,Энергия старта: участники программы выступили ...,По итогам форума эксперты определили лучшие пр...,итог форум эксперт определ лучш проект
165154,Энергия старта: участники программы выступили ...,Победители и призёры были отмечены на дружинно...,победител призер отмеч дружин церемон награжден


In [32]:
# Series.rename returns a new object, so its result must be assigned; the
# original call discarded it and `data` kept the name `stem_sent`.
data = new_test_df.stem_sent.rename('sentences')
data

0         па уральск кузниц вход групп мечел итог месяц ...
1         уралкуз октябр год постав абсолютн рекорд исто...
2                       месяц год отгруж тыс единиц продукц
3                                            поб рекорд год
4                      ве год отгруз тыс штук локомотивн ос
                                ...                        
165151                      раньш имел дел отрасл энергетик
165152    эт заинтересова прекрасн куратор дума одн сам ...
165153               итог форум эксперт определ лучш проект
165154      победител призер отмеч дружин церемон награжден
165155             фотограф энергофорум ожн посмотрет ссылк
Name: sentences, Length: 165156, dtype: object

In [33]:
# Corpus for embedding training: stemmed sentences of the unlabelled articles
# plus both labelled splits. It is built from the source series rather than
# from `data` itself, so re-running this cell does not append the labelled
# sentences a second time.
data = pd.concat(
    [new_test_df.stem_sent.rename('sentences'), X_train_stem, X_test_stem],
    ignore_index=True,
)

In [34]:
# Plain Python list with one stemmed sentence string per element.
data_list = data.to_list()

In [35]:
# data_list

In [36]:
# data.to_csv('data.txt', sep='\n', index=False)

In [37]:
# from gensim.models.wrappers import FastText 
# The gensim import stays local to this cell: gensim was not installed in the
# environment the notebook was last run in, and a top-level import would break
# every cell above.
from gensim.models import Word2Vec
# model = fasttext.train_unsupervised('data.txt', model='cbow', dim=300, thread=2)

# Word2Vec expects an iterable of token lists. Passing the raw strings would
# make gensim iterate over each string and treat single characters as tokens.
# The stemmed sentences are space-joined tokens, so split() recovers them.
tokenized_sentences = [sentence.split() for sentence in data_list]

# sg=0 selects CBOW.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; adjust the
# keyword when running on a newer gensim.
model = Word2Vec(tokenized_sentences, sg=0, size=100, workers=2)

ModuleNotFoundError: No module named 'gensim'

In [None]:
# !pip install gensim
# !conda install -c anaconda gensim=0.12.4

In [27]:
# import os
# path = r'C:\Users\safre\Documents\Sber_NLP\tn.json'
# print(os.path.basename(path))

# path = 'C:\Users\safre\Documents\Sber_NLP\tn.json'
# path = r''+'C:\Users\safre\Documents\Sber_NLP\tn.json'
# path
# # with open(path, 'r', encoding="utf8") as f:
# #     data_tp = json.loads(f.read())

In [28]:
# print('Введите путь к данным')
# path = input()

In [29]:
# path = u'test_data.json'
# with open(path, 'r', encoding="utf8") as f:
#     data_test = json.loads(f.read())

# data_test = pd.DataFrame(data_test)