In [6]:
import json
import pandas as pd
import numpy as np

import nltk
# nltk.download("stopwords")
# nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

import string
import re
from pymystem3 import Mystem

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression

import fasttext
from catboost import CatBoostClassifier

In [3]:
path = u'tn.json'
with open(path, 'r', encoding="utf8") as f:
    data_tn = json.loads(f.read())

with open(path, 'r', encoding="utf8") as f:
    data_tp = json.loads(f.read())

In [4]:
df_tp = pd.DataFrame(data_tp, columns=['sentences'])
df_tp['event'] = np.ones(len(data_tp)).astype(int)

df_tn = pd.DataFrame(data_tn, columns=['sentences'])
df_tn['event'] = np.zeros(len(data_tn)).astype(int)

train_df = pd.concat([df_tn, df_tp], ignore_index = True)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(train_df.sentences, train_df.event, test_size=0.3, random_state = 17)

## CountVect + LogReg

In [5]:
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7)
X_train_trans = vectorizer.fit_transform(X_train)
X_test_trans = vectorizer.transform(X_test)

X_train_trans = pd.DataFrame.sparse.from_spmatrix(X_train_trans, columns=vectorizer.get_feature_names())
X_test_trans = pd.DataFrame.sparse.from_spmatrix(X_test_trans, columns=vectorizer.get_feature_names())

model = LogisticRegression()
model.fit(X_train_trans, y_train)
y_pred = model.predict(X_test_trans)

f1_score(y_test, y_pred)



0.15869017632241816

In [6]:
precision_recall_fscore_support(y_test, y_pred)

(array([0.17718447, 0.16071429]),
 array([0.18159204, 0.15671642]),
 array([0.17936118, 0.15869018]),
 array([402, 402], dtype=int64))

## CountVec + nltk stopwords + LogReg

In [7]:
stopwords_ru = stopwords.words("russian")

vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords_ru)
X_train_trans = vectorizer.fit_transform(X_train)
X_test_trans = vectorizer.transform(X_test)

X_train_trans = pd.DataFrame.sparse.from_spmatrix(X_train_trans, columns=vectorizer.get_feature_names())
X_test_trans = pd.DataFrame.sparse.from_spmatrix(X_test_trans, columns=vectorizer.get_feature_names())

model = LogisticRegression()
model.fit(X_train_trans, y_train)
y_pred = model.predict(X_test_trans)

f1_score(y_test, y_pred)



0.17336683417085427

## Stemming + CountVec(with nltk stopwords) + LogReg

In [8]:
stemmer = SnowballStemmer("russian", ignore_stopwords=True)
def do_stemming(data):
    data_stem = data.copy()
    for i in data_stem.index:
        text = []
        tokens = [token for token in word_tokenize(data_stem[i]) if token not in stopwords_ru and token.strip() not in string.punctuation+'«—»']
        for token in tokens:
            text.append(stemmer.stem(token))
        data_stem[i] = ' '.join(text)
    return data_stem

In [9]:
stopwords_ru = stopwords.words("russian")

X_train_stem = do_stemming(X_train)
X_test_stem = do_stemming(X_test)

vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords_ru)
X_train_trans = vectorizer.fit_transform(X_train_stem)
X_test_trans = vectorizer.transform(X_test_stem)

X_train_trans = pd.DataFrame.sparse.from_spmatrix(X_train_trans, columns=vectorizer.get_feature_names())
X_test_trans = pd.DataFrame.sparse.from_spmatrix(X_test_trans, columns=vectorizer.get_feature_names())

model = LogisticRegression()
model.fit(X_train_trans, y_train)
y_pred = model.predict(X_test_trans)

f1_score(y_test, y_pred)



0.17772215269086358

## Stemming (with nltk stopwords) + TF-IDF Vec + LogReg

In [10]:
stopwords_ru = stopwords.words("russian")

X_train_stem = do_stemming(X_train)
X_test_stem = do_stemming(X_test)

vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=1500)
X_train_trans = vectorizer.fit_transform(X_train_stem)
X_test_trans = vectorizer.transform(X_test_stem)

X_train_trans = pd.DataFrame.sparse.from_spmatrix(X_train_trans, columns=vectorizer.get_feature_names())
X_test_trans = pd.DataFrame.sparse.from_spmatrix(X_test_trans, columns=vectorizer.get_feature_names())

model = LogisticRegression()
model.fit(X_train_trans, y_train)
y_pred = model.predict(X_test_trans)

f1_score(y_test, y_pred)



0.17632241813602018

## Lemmatization + TF-IDF Vec + LogReg

In [11]:
def do_lemmatization(data):
    a = data.copy()
    for i in a.index:
        # remove special characters
        a[i] = re.sub(r'\W', ' ', a[i])
        #remove numbers
        a[i] = re.sub(r'\d', '', a[i].lower())
        # Substituting multiple spaces with single space
        a[i] = re.sub(r'\s+', ' ', a[i], flags=re.I)
        # Converting to Lowercase
#         a[i] = a[i].lower()
        m = Mystem()
#         a[i] = [m.lemmatize(word)[0] for word in a[i].split()]
        a[i] = m.lemmatize(a[i])
        a[i] = ''.join(a[i])
    return a

In [12]:
%%time
X_train_lem = do_lemmatization(X_train)
X_test_lem = do_lemmatization(X_test)

vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=1500)
X_train_trans = vectorizer.fit_transform(X_train_lem)
X_test_trans = vectorizer.transform(X_test_lem)

X_train_trans = pd.DataFrame.sparse.from_spmatrix(X_train_trans, columns=vectorizer.get_feature_names())
X_test_trans = pd.DataFrame.sparse.from_spmatrix(X_test_trans, columns=vectorizer.get_feature_names())

model = LogisticRegression()
model.fit(X_train_trans, y_train)
y_pred = model.predict(X_test_trans)

f1_score(y_test, y_pred)



Wall time: 1h 11min 58s


0.17861635220125782

In [13]:
X_train_trans.to_pickle('lem_trans_train_df.pickle')
X_test_trans.to_pickle('lem_trans_test_df.pickle')
# df2 = pd.read_pickle('my_df.pickle')

In [19]:
X_train_trans = pd.read_pickle('lem_trans_train_df.pickle')
X_train_trans['два'].head()

0    0.162982
1    0.000000
2    0.000000
3    0.000000
4    0.000000
Name: два, dtype: Sparse[float64, 0.0]

Очень много времени считается(

In [20]:
%%time
X_train_lem = do_lemmatization(X_train)
X_test_lem = do_lemmatization(X_test)

vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords_ru)
X_train_trans = vectorizer.fit_transform(X_train_lem)
X_test_trans = vectorizer.transform(X_test_lem)

X_train_trans = pd.DataFrame.sparse.from_spmatrix(X_train_trans, columns=vectorizer.get_feature_names())
X_test_trans = pd.DataFrame.sparse.from_spmatrix(X_test_trans, columns=vectorizer.get_feature_names())

model = LogisticRegression()
model.fit(X_train_trans, y_train)
y_pred = model.predict(X_test_trans)

f1_score(y_test, y_pred)



Wall time: 58min 54s


0.1639344262295082

## Lemmatization + TF-IDF Vec + CatBoost

In [None]:
%%time
X_train_lem = do_lemmatization(X_train)
X_test_lem = do_lemmatization(X_test)

vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=1500)
X_train_trans = vectorizer.fit_transform(X_train_lem)
X_test_trans = vectorizer.transform(X_test_lem)

X_train_trans = pd.DataFrame.sparse.from_spmatrix(X_train_trans, columns=vectorizer.get_feature_names())
X_test_trans = pd.DataFrame.sparse.from_spmatrix(X_test_trans, columns=vectorizer.get_feature_names())

model = CatBoostClassifier()
model.fit(X_train_trans, y_train)
y_pred = model.predict(X_test_trans)

f1_score(y_test, y_pred)

## FastText word embeding + LogReg

In [3]:
# import os
# path = r'C:\Users\safre\Documents\Sber_NLP\tn.json'
# print(os.path.basename(path))

# path = 'C:\Users\safre\Documents\Sber_NLP\tn.json'
# path = r''+'C:\Users\safre\Documents\Sber_NLP\tn.json'
# path
# # with open(path, 'r', encoding="utf8") as f:
# #     data_tp = json.loads(f.read())

tn.json


In [26]:
# print('Введите путь к данным')
# path = input()

Введите путь к данным
tp.json


In [16]:
# path = u'test_data.json'
# with open(path, 'r', encoding="utf8") as f:
#     data_test = json.loads(f.read())

# data_test = pd.DataFrame(data_test)

In [6]:
# stopwords_ru = stopwords.words("russian")

In [7]:
# # def remove_punct(text):    
# # string.punctuation
# m = Mystem()
# train_df.sentences = train_df.sentences.apply(lambda x: m.lemmatize(x))
# # train_df.sentences = train_df.sentences.apply(lambda x: x.lower())
# train_df

Installing mystem to C:\Users\safre/.local/bin\mystem.exe from http://download.cdn.yandex.net/mystem/mystem-3.1-win-64bit.zip


Unnamed: 0,sentences,event
0,"[причем, , часть, , котельный, , быть, , з...",0
1,"[ооо, «, балтинвестстрой, » , взять, , обяз...",0
2,"[школа, , на, , южный, , шоссе, , на, , 8...",0
3,"[школа, , на, , южный, , шоссе, , планиров...",0
4,"[беглов, , перечислять, , школа, , , который...",0
...,...,...
2675,"[подрядчик, , ооо, «, альянсдорстрой, » , до...",1
2676,"[он, , заявлять, , , что, , запуск, , ракет...",1
2677,"[застройщик, , ЖК, "", медовый, , долина, "" ...",1
2678,"[официальный, , срок, , , указывать, , в, ,...",1
