In [1]:
import json
import os
import re
import string

import numpy as np
import pandas as pd

import nltk
# nltk.download("stopwords")
# nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from pymystem3 import Mystem

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression

import fasttext
from catboost import CatBoostClassifier
from tqdm import tqdm_notebook as tqdm

In [2]:
# Each JSON file supplies the 'sentences' column for one class:
# tn.json rows get event == 0, tp.json rows get event == 1.
with open(r'tn.json', 'r', encoding="utf8") as f:
    data_tn = json.load(f)

with open(r'tp.json', 'r', encoding="utf8") as f:
    data_tp = json.load(f)

df_tn = pd.DataFrame(data_tn, columns=['sentences'])
df_tn['event'] = np.zeros(len(data_tn), dtype=int)

df_tp = pd.DataFrame(data_tp, columns=['sentences'])
df_tp['event'] = np.ones(len(data_tp), dtype=int)

# Negatives first, then positives, with a fresh 0..n-1 index.
train_df = pd.concat([df_tn, df_tp], ignore_index=True)

In [3]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
# NOTE(review): the lemmatized pickles loaded further down were presumably built from
# this exact split - regenerate them if test_size or random_state changes.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['sentences'],
    train_df['event'],
    test_size=0.2,
    random_state=17,
)

In [4]:
# Sanity check: size of the training part versus the full labelled frame.
X_train.shape, train_df.shape

((1335,), (1669, 2))

In [5]:
# Class balance of the labels (0 = rows from tn.json, 1 = rows from tp.json).
train_df.event.value_counts()

0    1340
1     329
Name: event, dtype: int64

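Очень мало данных для обучения (всего 1669 предложений), к тому же классы несбалансированы: 1340 отрицательных примеров против 329 положительных.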

## Create preprocessed df

In [6]:
def preprocessing(data):
    """Normalise and lemmatize every sentence of a pandas Series.

    Each sentence is processed in this order:
      1. every non-word character (``\\W``) is replaced with a space;
      2. the text is lower-cased and all digits are removed;
      3. runs of whitespace are collapsed to a single space;
      4. the text is lemmatized with Mystem and the returned pieces are
         concatenated back into one string.

    Parameters
    ----------
    data : pandas.Series of str
        Raw sentences. The input Series is not modified.

    Returns
    -------
    pandas.Series
        Processed sentences with the same index and name as ``data``.
    """
    # One lemmatizer for the whole Series. The previous version built a new
    # Mystem() for every row, which repeats the (expensive) setup of the
    # underlying mystem analyzer once per sentence.
    lemmatizer = Mystem()

    def normalise(text):
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\d', '', text.lower())
        # No regex flags: case-insensitivity has no effect on a whitespace pattern.
        text = re.sub(r'\s+', ' ', text)
        # lemmatize() returns a list of pieces; they are joined without a
        # separator, exactly as before, so any delimiters Mystem emits are kept.
        return ''.join(lemmatizer.lemmatize(text))

    lemmatized = [normalise(text) for text in tqdm(data)]
    return pd.Series(lemmatized, index=data.index, name=data.name, dtype=object)

In [7]:
# NLTK's Russian stop-word list; needs the "stopwords" corpus
# (see the commented-out nltk.download call in the imports cell).
stopwords_ru = stopwords.words("russian")

In [8]:
# X_train_prep = preprocessing(X_train)

In [9]:
# Lemmatization is slow, so its result is cached on disk. When the cache file
# is missing it is rebuilt and saved, so the notebook also runs from a clean
# checkout instead of failing on read_pickle.
# NOTE: delete the pickle whenever the train/test split or preprocessing()
# changes, otherwise stale rows end up paired with the wrong labels.
# NOTE: unpickling can execute arbitrary code - only load a file produced by this notebook.
if os.path.exists('lem_prep_train_df.pickle'):
    X_train_prep = pd.read_pickle('lem_prep_train_df.pickle')
else:
    X_train_prep = preprocessing(X_train)
    X_train_prep.to_pickle('lem_prep_train_df.pickle')

In [10]:
# X_test_prep = preprocessing(X_test)

In [11]:
# Lemmatization is slow, so its result is cached on disk. When the cache file
# is missing it is rebuilt and saved, so the notebook also runs from a clean
# checkout instead of failing on read_pickle.
# NOTE: delete the pickle whenever the train/test split or preprocessing()
# changes, otherwise stale rows end up paired with the wrong labels.
# NOTE: unpickling can execute arbitrary code - only load a file produced by this notebook.
if os.path.exists('lem_prep_test_df.pickle'):
    X_test_prep = pd.read_pickle('lem_prep_test_df.pickle')
else:
    X_test_prep = preprocessing(X_test)
    X_test_prep.to_pickle('lem_prep_test_df.pickle')

In [12]:
def tfidf_vect(X_train, X_test, stopwords = None):
    """Build TF-IDF features for a train/test pair of text collections.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``, so the test texts do not influence the vocabulary or IDF
    weights.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Training and test documents.
    stopwords : list of str or None, optional
        Passed to ``TfidfVectorizer(stop_words=...)``. Note that this
        parameter name shadows the ``nltk.corpus.stopwords`` import inside
        the function body.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Sparse-backed frames for train and test, one column per vocabulary
        term, both with a default positional index.
    """
    vectorizer = TfidfVectorizer(analyzer='word', max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords)
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    X_train_tfidf = pd.DataFrame.sparse.from_spmatrix(train_matrix, columns=feature_names)
    X_test_tfidf = pd.DataFrame.sparse.from_spmatrix(test_matrix, columns=feature_names)
    return X_train_tfidf, X_test_tfidf

In [13]:
def count_vect(X_train, X_test, stopwords = None):
    """Build bag-of-words count features for a train/test pair of text collections.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``, so the test texts do not influence the vocabulary.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Training and test documents.
    stopwords : list of str or None, optional
        Passed to ``CountVectorizer(stop_words=...)``. Note that this
        parameter name shadows the ``nltk.corpus.stopwords`` import inside
        the function body.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Sparse-backed frames for train and test, one column per vocabulary
        term, both with a default positional index.
    """
    vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords)
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    X_train_count = pd.DataFrame.sparse.from_spmatrix(train_matrix, columns=feature_names)
    X_test_count = pd.DataFrame.sparse.from_spmatrix(test_matrix, columns=feature_names)
    return X_train_count, X_test_count

In [14]:
stemmer = SnowballStemmer("russian", ignore_stopwords=True)
def do_stemming(data):
    """Return a copy of ``data`` in which every sentence is tokenized and stemmed.

    Each sentence is split with ``word_tokenize``. Tokens found in
    ``stopwords_ru`` are dropped, as are tokens whose stripped form occurs
    inside the punctuation string. The remaining tokens are stemmed with the
    module-level Snowball stemmer and joined with single spaces.

    Parameters
    ----------
    data : pandas.Series of str
        Sentences to stem; the input is left untouched.

    Returns
    -------
    pandas.Series
        Stemmed sentences under the same index as ``data``.
    """
    punctuation_chars = string.punctuation+'«—»'

    def stem_sentence(sentence):
        stems = []
        for word in word_tokenize(sentence):
            if word in stopwords_ru:
                continue
            # This is a substring test: single punctuation marks (and the
            # empty string) are dropped, but multi-character tokens such as
            # "..." are kept unless they occur verbatim in punctuation_chars.
            if word.strip() in punctuation_chars:
                continue
            stems.append(stemmer.stem(word))
        return ' '.join(stems)

    return data.map(stem_sentence)

# Stem the raw (not lemmatized) train and test sentences.
X_train_stem = do_stemming(X_train)
X_test_stem = do_stemming(X_test)

## CountVect + LogReg

In [15]:
# Baseline: count features on the raw sentences, no stop-word filtering.
X_train_count, X_test_count = count_vect(X_train, X_test)

# Logistic regression with library-default hyperparameters.
model = LogisticRegression()
model.fit(X_train_count, y_train)
y_pred = model.predict(X_test_count)

# F1 on the held-out split (labels are 0/1, so this scores the event == 1 class).
f1_score(y_test, y_pred)



0.547008547008547

In [16]:
# Per-class breakdown for the predictions above: (precision, recall, f-score, support),
# each array ordered by label (0, then 1).
precision_recall_fscore_support(y_test, y_pred)

(array([0.87368421, 0.65306122]),
 array([0.93609023, 0.47058824]),
 array([0.90381125, 0.54700855]),
 array([266,  68], dtype=int64))

## CountVec + nltk stopwords + LogReg

In [17]:
# Raw sentences again, but the NLTK Russian stop-word list is passed to the vectorizer.
X_train_count, X_test_count = count_vect(X_train, X_test, stopwords=stopwords_ru)

# Logistic regression with library-default hyperparameters.
model = LogisticRegression()
model.fit(X_train_count, y_train)
y_pred = model.predict(X_test_count)

# F1 on the held-out split.
f1_score(y_test, y_pred)



0.5087719298245614

## Stemming + CountVec(with nltk stopwords) + LogReg

In [18]:
# Count features on the Snowball-stemmed text. do_stemming has already dropped
# tokens found in stopwords_ru; the same list is also handed to the vectorizer.
X_train_count, X_test_count = count_vect(X_train_stem, X_test_stem, stopwords_ru)

# Logistic regression with library-default hyperparameters.
model = LogisticRegression()
model.fit(X_train_count, y_train)
y_pred = model.predict(X_test_count)

# F1 on the held-out split.
f1_score(y_test, y_pred)



0.608695652173913

## Stemming (with nltk stopwords) + TF-IDF Vec + LogReg

In [19]:
# TF-IDF features on the Snowball-stemmed text, with the NLTK stop-word list.
X_train_tfidf, X_test_tfidf = tfidf_vect(X_train_stem, X_test_stem, stopwords_ru)

# Logistic regression with library-default hyperparameters.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# F1 on the held-out split.
f1_score(y_test, y_pred)



0.3829787234042553

## Lemmatization + TF-IDF Vec + LogReg

In [20]:
# TF-IDF features on the lemmatized text loaded from the pickle cache,
# with the NLTK stop-word list.
X_train_tfidf, X_test_tfidf = tfidf_vect(X_train_prep, X_test_prep, stopwords_ru)

# Logistic regression with library-default hyperparameters.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# F1 on the held-out split.
f1_score(y_test, y_pred)



0.41237113402061853

## Lemmatization + Count Vec + LogReg

In [21]:
# Count features on the lemmatized text loaded from the pickle cache,
# with the NLTK stop-word list.
X_train_count, X_test_count = count_vect(X_train_prep, X_test_prep, stopwords_ru)

# Logistic regression with library-default hyperparameters.
model = LogisticRegression()
model.fit(X_train_count, y_train)
y_pred = model.predict(X_test_count)

# F1 on the held-out split.
f1_score(y_test, y_pred)



0.6050420168067226

## Lemmatization + Count Vec + CatBoost

In [22]:
# Same lemmatized count features as the logistic-regression run, rebuilt here
# so the cell does not rely on leftover variables.
X_train_count, X_test_count = count_vect(X_train_prep, X_test_prep, stopwords_ru)

# CatBoost with default hyperparameters; verbose=False turns off the training log.
model = CatBoostClassifier(verbose=False)
model.fit(X_train_count, y_train)
y_pred = model.predict(X_test_count)

# F1 on the held-out split.
f1_score(y_test, y_pred)

0.5739130434782609

In [27]:
## Stemming + Count Vec + CatBoost

In [26]:
# Count features on the Snowball-stemmed text, with the NLTK stop-word list.
X_train_count, X_test_count = count_vect(X_train_stem, X_test_stem, stopwords_ru)

# CatBoost with default hyperparameters; verbose=False turns off the training log.
model = CatBoostClassifier(verbose=False)
model.fit(X_train_count, y_train)
y_pred = model.predict(X_test_count)

# F1 on the held-out split.
f1_score(y_test, y_pred)

0.6017699115044248

In [23]:
# import os
# path = r'C:\Users\safre\Documents\Sber_NLP\tn.json'
# print(os.path.basename(path))

# path = 'C:\Users\safre\Documents\Sber_NLP\tn.json'
# path = r''+'C:\Users\safre\Documents\Sber_NLP\tn.json'
# path
# # with open(path, 'r', encoding="utf8") as f:
# #     data_tp = json.loads(f.read())

In [24]:
# print('Введите путь к данным')
# path = input()

In [25]:
# path = u'test_data.json'
# with open(path, 'r', encoding="utf8") as f:
#     data_test = json.loads(f.read())

# data_test = pd.DataFrame(data_test)