In [45]:
import pandas as pd
import numpy as np
import string
import re
import emoji

from pymorphy2 import MorphAnalyzer
from razdel import tokenize
from nltk.corpus import stopwords

from tqdm import tqdm

from gensim.models import Word2Vec, KeyedVectors

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [46]:
def give_emoji_free_text(text):
    allchars = [str for str in text]
    text = [c for c in allchars if c not in emoji.UNICODE_EMOJI]
    return "".join(text)


morph = MorphAnalyzer()
stop = set(stopwords.words('russian'))


def my_preprocess(text: str):
    text = str(text)
    text = give_emoji_free_text(text)
    text = text.replace("\n", " ")
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokenized_text = list(tokenize(text))
    lemm = [morph.parse(i.text)[0].normal_form for i in tokenized_text]
    words = [i for i in lemm if i not in stop and not i.isdigit() and len(i) > 2]
    return " ".join(words)

In [47]:
def logreg(X_train, X_test, y_train, y_test):
    y_train = pd.get_dummies(y_train)
    y_test = pd.get_dummies(y_test)
    clf = MultiOutputClassifier(LogisticRegression()).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("LogisticRegression classifier score: ")
    print("f1 score: ", f1_score(y_test, y_pred, average='weighted'))
    print("precision: ", precision_score(y_test, y_pred, average='weighted'))
    print("recall: ", recall_score(y_test, y_pred, average='weighted'))
    print("accuracy score: ", accuracy_score(y_test, y_pred))
    

def SDG(X_train, X_test, y_train, y_test):
    y_train = pd.get_dummies(y_train)
    y_test = pd.get_dummies(y_test)
    clf = MultiOutputClassifier(SGDClassifier(loss='hinge',
                                              penalty='l2',
                                              alpha=1e-3,
                                              random_state=42,
                                              max_iter=5,
                                              tol=None)).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("SGDClassifier classifier score: ")
    print("f1 score: ", f1_score(y_test, y_pred, average='weighted'))
    print("precision: ", precision_score(y_test, y_pred, average='weighted'))
    print("recall: ", recall_score(y_test, y_pred, average='weighted'))
    print("accuracy score: ", accuracy_score(y_test, y_pred))
    
    
def XGBoost(X_train, X_test, y_train, y_test):
    y_train = pd.get_dummies(y_train)
    y_test = pd.get_dummies(y_test)
    clf = MultiOutputClassifier(XGBClassifier()).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("XGBClassifier classifier score: ")
    print("f1 score: ", f1_score(y_test, y_pred, average='weighted'))
    print("precision: ", precision_score(y_test, y_pred, average='weighted'))
    print("recall: ", recall_score(y_test, y_pred, average='weighted'))
    print("accuracy score: ", accuracy_score(y_test, y_pred))

In [48]:
def check_tf_idf_score(X_train, X_test):
    labelToId = {'negative': 0, 'neutral': 1, 'positive': 2, 'skip': 1, 'speech': 1}
    y_train = []
    y_test = []

    for label in tqdm(X_train['разметка'].values):
        y_train.append(labelToId[label])
    for label in tqdm(X_test['разметка'].values):
        y_test.append(labelToId[label])
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    vectorizer = TfidfVectorizer()
    vectorizer.fit(train.lemms.values)
    X_train = vectorizer.fit_transform(X_train['lemms'].values)
    X_test = vectorizer.transform(X_test['lemms'].values)

    logreg(X_train, X_test, y_train, y_test)
    print()
    SDG(X_train, X_test, y_train, y_test)
    print()
    XGBoost(X_train, X_test, y_train, y_test)

In [49]:
def create_doc_vector(model, text):
    lemmas = text.split()
    lemmas_vectors = np.zeros((len(lemmas), model.vector_size))
    vec = np.zeros((model.vector_size,))

    for idx, lemma in enumerate(lemmas):
        if lemma in model:
            lemmas_vectors[idx] = model[lemma] / np.linalg.norm(model[lemma])
    res = lemmas_vectors.mean(axis=0)
    if np.all(np.isfinite(res)):
        return res
    else:
        return np.zeros(300)
    


def check_fasttext_score(X_train, X_test, pathToModelFolder=None):
    labelToId = {'negative': 0, 'neutral': 1, 'positive': 2, 'skip': 1, 'speech': 1}
    y_train = []
    y_test = []

    for label in tqdm(X_train['разметка'].values):
        y_train.append(labelToId[label])
    for label in tqdm(X_test['разметка'].values):
        y_test.append(labelToId[label])
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    # загрузим модель
    if pathToModelFolder is None:
        pathToModelFolder = 'araneum/araneum_none_fasttextcbow_300_5_2018.model'
    else:
        pathToModelFolder = pathToModelFolder
    model = KeyedVectors.load(pathToModelFolder)

    X_train = np.array([create_doc_vector(model, doc) for doc in X_train['lemms'].values])
    X_test = np.array([create_doc_vector(model, doc) for doc in X_test['lemms'].values])

    logreg(X_train, X_test, y_train, y_test)
    print()
    SDG(X_train, X_test, y_train, y_test)
    print()
    XGBoost(X_train, X_test, y_train, y_test)

## Создайте в датафрейме колонку "lemms", в котором буду лежать предобработанные предложения, затем разделите датафрейм на train и test подмножества и вызывайте на них функции:

### check_tf_idf_score(X_train, X_test)
векторизует и выдаст скор для переданных даных

### check_fasttext_score(X_train, X_test, pathToModelFolder=None)
использует усреднённые вектора fasttext, последним аргументом передаётся путь к модели (необходимо 4 файла, лежащих в одной папке), по умолчанию равен $araneum/araneum_none_fasttextcbow_300_5_2018.model$


##### Стоит знать, что внутри функции преобразуют входящую разметку в числа по соответствию:

{'negative': 0, 'neutral': 1, 'positive': 2, 'skip': 1, 'speech': 1} - то есть метки skip, neutral и speech считаются одним классом

### Далее пример, предобработаем тексты (код препроцессинга выше), разделим на test и train, закинем в модели

In [50]:
data = pd.read_csv("df.csv")
data = data.dropna()
data.head()

Unnamed: 0.1,Unnamed: 0,разметка,текст,контекст,community_id,community_type
0,0,speech,Спасибо!,post,1,nontoxic
1,1,speech,вау спасибо большое!!,post,1,nontoxic
2,2,speech,Супер! Благодарю🙏,post,1,nontoxic
3,3,speech,"офигеть, спасибо!",post,1,nontoxic
4,4,speech,спасибо за проделанную работу 💙,post,1,nontoxic


In [51]:
buf = []
for doc in tqdm(data['текст'].values):
    buf.append(my_preprocess(doc))

data['lemms'] = buf
data.head()

100%|██████████| 3524/3524 [00:15<00:00, 230.72it/s]


Unnamed: 0.1,Unnamed: 0,разметка,текст,контекст,community_id,community_type,lemms
0,0,speech,Спасибо!,post,1,nontoxic,спасибо
1,1,speech,вау спасибо большое!!,post,1,nontoxic,вау спасибо большой
2,2,speech,Супер! Благодарю🙏,post,1,nontoxic,супер благодарить
3,3,speech,"офигеть, спасибо!",post,1,nontoxic,офигеть спасибо
4,4,speech,спасибо за проделанную работу 💙,post,1,nontoxic,спасибо проделать работа


In [42]:
train, test = train_test_split(data, test_size=0.3)

check_tf_idf_score(train, test)

100%|██████████| 2466/2466 [00:00<00:00, 551723.14it/s]
100%|██████████| 1058/1058 [00:00<00:00, 550294.35it/s]


LogisticRegression classifier score: 
f1 score:  0.90046375065592
precision:  0.9157519754228688
recall:  0.9272211720226843
accuracy score:  0.9272211720226843

SGDClassifier classifier score: 
f1 score:  0.9090314880050534
precision:  0.9201929176850433
recall:  0.9310018903591682
accuracy score:  0.9310018903591682



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


XGBClassifier classifier score: 
f1 score:  0.9136237963536673
precision:  0.9077677932556515
recall:  0.9253308128544423
accuracy score:  0.9234404536862004


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [43]:
check_fasttext_score(train, test)

100%|██████████| 2466/2466 [00:00<00:00, 463444.47it/s]
100%|██████████| 1058/1058 [00:00<00:00, 437630.54it/s]


LogisticRegression classifier score: 
f1 score:  0.9103134015700511
precision:  0.9216918562938894
recall:  0.9310018903591682
accuracy score:  0.9310018903591682

SGDClassifier classifier score: 
f1 score:  0.9121874665620431
precision:  0.9304138795986622
recall:  0.9328922495274102
accuracy score:  0.9328922495274102



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


XGBClassifier classifier score: 
f1 score:  0.9287504555935558
precision:  0.9336986240263773
recall:  0.94234404536862
accuracy score:  0.9376181474480151


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
