In [50]:
import json
import pandas as pd
import numpy as np

import nltk
# nltk.download("stopwords")
# nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

import string
import re
from pymystem3 import Mystem

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression


In [2]:
path = u'tn.json'
with open(path, 'r', encoding="utf8") as f:
    data_tn = json.loads(f.read())

with open(path, 'r', encoding="utf8") as f:
    data_tp = json.loads(f.read())

In [3]:
df_tp = pd.DataFrame(data_tp, columns=['sentences'])
df_tp['event'] = np.ones(len(data_tp)).astype(int)

df_tn = pd.DataFrame(data_tn, columns=['sentences'])
df_tn['event'] = np.zeros(len(data_tn)).astype(int)

train_df = pd.concat([df_tn, df_tp], ignore_index = True)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(train_df.sentences, train_df.event, test_size=0.3, random_state = 17)

## CountVect + LogReg

In [25]:
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7)
X_train_trans = vectorizer.fit_transform(X_train)
X_test_trans = vectorizer.transform(X_test)

X_train_trans = pd.DataFrame.sparse.from_spmatrix(X_train_trans, columns=vectorizer.get_feature_names())
X_test_trans = pd.DataFrame.sparse.from_spmatrix(X_test_trans, columns=vectorizer.get_feature_names())

model = LogisticRegression()
model.fit(X_train_trans, y_train)
y_pred = model.predict(X_test_trans)

f1_score(y_test, y_pred)



0.15869017632241816

In [20]:
precision_recall_fscore_support(y_test, y_pred)

(array([0.17718447, 0.16071429]),
 array([0.18159204, 0.15671642]),
 array([0.17936118, 0.15869018]),
 array([402, 402], dtype=int64))

## CountVec + nltk stopwords + LogReg

In [24]:
stopwords_ru = stopwords.words("russian")

vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords_ru)
X_train_trans = vectorizer.fit_transform(X_train)
X_test_trans = vectorizer.transform(X_test)

X_train_trans = pd.DataFrame.sparse.from_spmatrix(X_train_trans, columns=vectorizer.get_feature_names())
X_test_trans = pd.DataFrame.sparse.from_spmatrix(X_test_trans, columns=vectorizer.get_feature_names())

model = LogisticRegression()
model.fit(X_train_trans, y_train)
y_pred = model.predict(X_test_trans)

f1_score(y_test, y_pred)



0.17336683417085427

## Stemming + CountVec(with nltk stopwords) + LogReg

In [62]:
# word_tokenize(X_train[2051])
[token for token in word_tokenize(X_train[2051]) if token not in stopwords_ru and token.strip() not in string.punctuation+'«—»']

['Летом',
 '2017',
 'года',
 'перенесли',
 'указом',
 'президента',
 'года',
 'май',
 '2019',
 'года',
 'серийных',
 'ЛК-60',
 'Сибирь',
 'Урал',
 '2021',
 '2022',
 'годы',
 'соответственно']

In [65]:
def do_stemming(data):
    for i in data.index:
        tokens = [token for token in word_tokenize(data[i]) if token not in stopwords_ru and token.strip() not in string.punctuation+'«—»']
        for token in tokens:
            
            

In [31]:
stemmer = SnowballStemmer("russian", ignore_stopwords=True)
X_train_stem = X_train.copy().apply(lambda x: stemmer.stem(word) for word in x)
X_train_stem

NameError: name 'x' is not defined

In [26]:
stemmer = SnowballStemmer("russian", ignore_stopwords=True)


stopwords_ru = stopwords.words("russian")

vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords_ru)
X_train_trans = vectorizer.fit_transform(X_train)
X_test_trans = vectorizer.transform(X_test)

X_train_trans = pd.DataFrame.sparse.from_spmatrix(X_train_trans, columns=vectorizer.get_feature_names())
X_test_trans = pd.DataFrame.sparse.from_spmatrix(X_test_trans, columns=vectorizer.get_feature_names())

model = LogisticRegression()
model.fit(X_train_trans, y_train)
y_pred = model.predict(X_test_trans)

f1_score(y_test, y_pred)

['04',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '2003',
 '2007',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '21',
 '22',
 '24',
 '25',
 '27',
 '28',
 '29',
 '30',
 '300',
 '40',
 '450',
 '48',
 '50',
 '500',
 '562',
 '60',
 '70',
 '73',
 '74',
 '740',
 '96',
 'com',
 'ru',
 'znak',
 'аварии',
 'аварийного',
 'аварийным',
 'августа',
 'августе',
 'агентства',
 'агентство',
 'агрономическая',
 'администрации',
 'администрацию',
 'администрация',
 'адресу',
 'активно',
 'акты',
 'александр',
 'алексей',
 'андерсен',
 'андрей',
 'ао',
 'апреле',
 'апрель',
 'апреля',
 'арбитражном',
 'арбитражный',
 'аренду',
 'аренды',
 'артстройтехно',
 'архитектуры',
 'аэропорта',
 'балтийская',
 'банкротства',
 'беглов',
 'без',
 'безопасности',
 'благоустройство',
 'ближайшее',
 'ближайшие',
 'блок',
 'более',
 'больше',
 'брусника',
 'будет',
 'будут',
 'бы',
 'был',
 'была',
 'были

In [3]:
# import os
# path = r'C:\Users\safre\Documents\Sber_NLP\tn.json'
# print(os.path.basename(path))

# path = 'C:\Users\safre\Documents\Sber_NLP\tn.json'
# path = r''+'C:\Users\safre\Documents\Sber_NLP\tn.json'
# path
# # with open(path, 'r', encoding="utf8") as f:
# #     data_tp = json.loads(f.read())

tn.json


In [26]:
# print('Введите путь к данным')
# path = input()

Введите путь к данным
tp.json


In [16]:
# path = u'test_data.json'
# with open(path, 'r', encoding="utf8") as f:
#     data_test = json.loads(f.read())

# data_test = pd.DataFrame(data_test)

In [6]:
# stopwords_ru = stopwords.words("russian")

In [7]:
# def remove_punct(text):    
# string.punctuation
m = Mystem()
train_df.sentences = train_df.sentences.apply(lambda x: m.lemmatize(x))
# train_df.sentences = train_df.sentences.apply(lambda x: x.lower())
train_df

Installing mystem to C:\Users\safre/.local/bin\mystem.exe from http://download.cdn.yandex.net/mystem/mystem-3.1-win-64bit.zip


Unnamed: 0,sentences,event
0,"[причем, , часть, , котельный, , быть, , з...",0
1,"[ооо, «, балтинвестстрой, » , взять, , обяз...",0
2,"[школа, , на, , южный, , шоссе, , на, , 8...",0
3,"[школа, , на, , южный, , шоссе, , планиров...",0
4,"[беглов, , перечислять, , школа, , , который...",0
...,...,...
2675,"[подрядчик, , ооо, «, альянсдорстрой, » , до...",1
2676,"[он, , заявлять, , , что, , запуск, , ракет...",1
2677,"[застройщик, , ЖК, "", медовый, , долина, "" ...",1
2678,"[официальный, , срок, , , указывать, , в, ,...",1
