<a href="https://colab.research.google.com/github/shitkov/categorizer/blob/main/categorizer_datamaker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!wget https://github.com/shitkov/categorizer/raw/main/data_train.zip
!wget https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt
!unzip /content/data_train.zip

In [None]:
# Path of the training CSV read below; presumably extracted from
# data_train.zip by the download cell — TODO confirm the archive contents.
train_path = '/content/HeadHunter_train.csv'

In [None]:
import pandas as pd

In [None]:
# Load the training table into a DataFrame.
data = pd.read_csv(train_path)

In [None]:
# Replace missing values with empty strings so that the string operations
# applied later (e.g. `.split(',')` on `target`) never receive NaN.
data = data.fillna('')

In [None]:
# One-hot column for every target class: tag_<k> is 1 when "<k>" is one of the
# comma-separated ids in `target`, otherwise 0.
for tag_id in range(9):
    tag = str(tag_id)
    data[f'tag_{tag_id}'] = [int(tag in raw.split(',')) for raw in data['target']]

In [None]:
# Review length in characters, for both the positive and the negative part.
for side in ('positive', 'negative'):
    data[f'{side}_len'] = [len(str(review)) for review in data[side]]

In [None]:
import re

In [None]:
# Read the raw stop-word lines (trailing newlines are still attached here).
# The file is Cyrillic text, so the encoding is pinned to UTF-8 instead of
# relying on the platform/locale default.
with open('/content/stopwords-ru.txt', encoding='utf-8') as f:
    stopwords = f.readlines()

In [None]:
# Strip the trailing newline that readlines() leaves on every entry.
stopwords = [line.rstrip('\n') for line in stopwords]

In [None]:
# Install the pymystem3 wrapper.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
!pip install pymystem3

from pymystem3 import Mystem
# NOTE(review): `mstm` is not referenced anywhere else in the visible notebook.
# Presumably this first instantiation is kept for its side effects (pymystem3
# preparing its binary location) before the manual copy below — confirm before
# removing or reordering.
mstm = Mystem()

# Download the mystem 3.0 Linux binary, unpack it and copy it to
# /root/.local/bin/mystem.
!wget http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz
!tar -xvf mystem-3.0-linux3.1-64bit.tar.gz
!cp mystem /root/.local/bin/mystem

In [None]:
from pymystem3 import Mystem
# Lemmatizer instance that is passed to get_clean_text in the cleaning cells.
lemmatizer = Mystem()

In [None]:
# Clean and lemmatize review texts.
def get_clean_text(texts, stopwords, lemmatizer):
    """Normalize, lemmatize and stop-word-filter a sequence of texts.

    Every text is lower-cased, each character that is not a lowercase Cyrillic
    letter or a space is replaced by a space, and runs of spaces are collapsed.
    Non-empty results are lemmatized, stop words are removed and the remaining
    lemmas are joined with single spaces.

    Args:
        texts: iterable of raw texts; every item is passed through ``str()``.
        stopwords: collection of lemmas to drop.
        lemmatizer: object with ``lemmatize(text) -> list[str]``. The last
            token of its output is discarded (for pymystem3's ``Mystem`` this
            is presumably the trailing newline — confirm) and ``' '``
            separator tokens are skipped.

    Returns:
        list[str] with one cleaned text per input, in input order. Texts that
        are empty after cleaning stay ``''``.
    """
    # Set membership is O(1); the stop-word list is scanned once here instead
    # of once per token.
    stopword_set = set(stopwords)
    clean_texts = []
    for raw in texts:
        text = re.sub('[^а-яё ]', ' ', str(raw).lower())
        text = re.sub(r' +', ' ', text).strip()
        if text:
            lemmas = [token for token in lemmatizer.lemmatize(text)[:-1] if token != ' ']
            text = ' '.join(lemma for lemma in lemmas if lemma not in stopword_set)
            # Workaround for the abbreviation "з/п", which the cleaning above
            # turns into the two tokens "з п". Word boundaries restrict the
            # merge to standalone tokens; a plain str.replace would also glue
            # any word ending in "з" to a following word starting with "п"
            # (e.g. "отказ платить" -> "отказплатить").
            text = re.sub(r'\bз п\b', 'зп', text)
        clean_texts.append(text)
    return clean_texts

In [None]:
# Cleaned, lemmatized version of the `positive` review column.
data['clean_positive'] = get_clean_text(list(data['positive']), stopwords, lemmatizer)

In [None]:
# Cleaned, lemmatized version of the `negative` review column.
data['clean_negative'] = get_clean_text(list(data['negative']), stopwords, lemmatizer)

In [None]:
# Number of unique words in each cleaned review.
# str.split() without arguments returns [] for an empty string, so a review
# that is empty after cleaning counts 0 words (''.split(' ') yields [''] and
# would be counted as 1 unique word).
data['unique_positive'] = [len(set(text.split())) for text in data['clean_positive']]
data['unique_negative'] = [len(set(text.split())) for text in data['clean_negative']]

In [None]:
!pip install transformers sentencepiece

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Sentiment model. `tokenizer` and `model` are module-level names that the
# later model cells rebind.
model_checkpoint = 'cointegrated/rubert-tiny-sentiment-balanced'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
# Move the model to the GPU when one is available; otherwise it stays on the CPU.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

def get_sentiment(text, return_type='label'):
    """Run the sentiment classifier on `text`.

    Reads the module-level `tokenizer` and `model` at call time, so it must be
    called while they still refer to the sentiment checkpoint (later cells
    rebind both names).

    Args:
        text: input passed to the tokenizer; only the first row of the model
            output is used.
        return_type: 'label' returns the `model.config.id2label` entry of the
            highest-scoring class; 'score' returns the class probabilities
            weighted by [-1, 0, 1]; any other value returns the per-class
            sigmoid vector itself.
    """
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        logits = model(**encoded.to(model.device)).logits
        proba = torch.sigmoid(logits).cpu().numpy()[0]
    if return_type == 'score':
        class_weights = [-1, 0, 1]
        return proba.dot(class_weights)
    if return_type == 'label':
        top_class = proba.argmax()
        return model.config.id2label[top_class]
    return proba

In [None]:
from tqdm import tqdm

In [None]:
# Sentiment label of every `positive` text (one model call per row).
data['sentiment_positive_label'] = [get_sentiment(text, 'label') for text in tqdm(list(data['positive']))]

In [None]:
# Sentiment label of every `negative` text (one model call per row).
data['sentiment_negative_label'] = [get_sentiment(text, 'label') for text in tqdm(list(data['negative']))]

In [None]:
# Sentiment score of every `positive` text.
# NOTE(review): this runs the model over the column again; the label column
# was produced by a separate pass over the same texts.
data['sentiment_positive_score'] = [get_sentiment(text, 'score') for text in tqdm(list(data['positive']))]

In [None]:
# Sentiment score of every `negative` text.
# NOTE(review): this runs the model over the column again; the label column
# was produced by a separate pass over the same texts.
data['sentiment_negative_score'] = [get_sentiment(text, 'score') for text in tqdm(list(data['negative']))]

In [None]:
# Emotion detection model. Rebinds the module-level `tokenizer` and `model`.
# LABELS = ['no_emotion', 'joy', 'sadness', 'surprise', 'fear', 'anger']
model_checkpoint = 'cointegrated/rubert-tiny2-cedr-emotion-detection'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
# Move the model to the GPU when one is available; otherwise it stays on the CPU.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

def get_emotion(text, return_type='label'):
    """Run the emotion classifier on `text`.

    Reads the module-level `tokenizer` and `model` at call time, so it must be
    called while they refer to the emotion checkpoint.

    Args:
        text: input passed to the tokenizer; only the first row of the model
            output is used.
        return_type: 'label' returns the `model.config.id2label` entry of the
            highest-scoring class; 'score' returns the class probabilities
            weighted by [0, 1, -1, 0, -1, -1] (+1 for class index 1, -1 for
            indices 2, 4 and 5); any other value returns the per-class sigmoid
            vector itself.
    """
    with torch.no_grad():
        batch = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        batch = batch.to(model.device)
        output = model(**batch)
        proba = torch.sigmoid(output.logits).cpu().numpy()[0]
    if return_type == 'label':
        return model.config.id2label[proba.argmax()]
    if return_type == 'score':
        class_weights = [0,1,-1,0,-1,-1]
        return proba.dot(class_weights)
    return proba

In [None]:
# Emotion label of every `positive` text (one model call per row).
data['emotion_positive_label'] = [get_emotion(text, 'label') for text in tqdm(list(data['positive']))]

In [None]:
# Emotion label of every `negative` text (one model call per row).
data['emotion_negative_label'] = [get_emotion(text, 'label') for text in tqdm(list(data['negative']))]

In [None]:
# Emotion score of every `positive` text.
# NOTE(review): this runs the model over the column again; the label column
# was produced by a separate pass over the same texts.
data['emotion_positive_score'] = [get_emotion(text, 'score') for text in tqdm(list(data['positive']))]

In [None]:
# Emotion score of every `negative` text.
# NOTE(review): this runs the model over the column again; the label column
# was produced by a separate pass over the same texts.
data['emotion_negative_score'] = [get_emotion(text, 'score') for text in tqdm(list(data['negative']))]

In [None]:
# Toxicity model. Rebinds the module-level `tokenizer` and `model`.
model_checkpoint = 'cointegrated/rubert-tiny-toxicity'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
# Move the model to the GPU when one is available; otherwise it stays on the CPU.
model.to('cuda' if torch.cuda.is_available() else 'cpu')
    
def text2toxicity(text, return_type = 'label'):
    """Score the toxicity of one text or of a batch of texts.

    Reads the module-level `tokenizer` and `model` at call time, so it must be
    called while they refer to the toxicity checkpoint.

    Args:
        text: a single string, or a list of strings (batch).
        return_type:
            'label' - name of the highest-scoring class taken from
                `model.config.id2label`; a list with one name per text for
                batch input.
            'score' - aggregated value `1 - p[first] * (1 - p[last])` computed
                from the first and last class probabilities (presumably the
                "non-toxic" and "dangerous" classes of the checkpoint —
                confirm against the model card).
            anything else - the per-class sigmoid probabilities themselves,
                as get_sentiment / get_emotion do.

    Returns:
        A str (or list of str), a float-like scalar (or 1-D array), or the
        probability array, depending on `return_type` and on whether `text`
        is a single string or a batch.
    """
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
        proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()
    if isinstance(text, str):
        proba = proba[0]
    if return_type == 'label':
        # Take the argmax along the class axis. A plain argmax() on a batch
        # would index into the flattened (texts x classes) matrix and yield a
        # wrong label or a KeyError.
        best = proba.argmax(axis=-1)
        if proba.ndim == 1:
            return model.config.id2label[int(best)]
        return [model.config.id2label[int(class_id)] for class_id in best]
    if return_type == 'score':
        return 1 - proba.T[0] * (1 - proba.T[-1])
    return proba

In [None]:
# Toxicity label of every `positive` text (one model call per row, no progress bar).
data['toxic_positive_label'] = [text2toxicity(text, 'label') for text in (list(data['positive']))]

In [None]:
# Toxicity label of every `negative` text (one model call per row, no progress bar).
data['toxic_negative_label'] = [text2toxicity(text, 'label') for text in (list(data['negative']))]

In [None]:
# Aggregated toxicity score of every `positive` text.
# NOTE(review): this runs the model over the column again; the label column
# was produced by a separate pass over the same texts.
data['toxic_positive_score'] = [text2toxicity(text, 'score') for text in (list(data['positive']))]

In [None]:
# Aggregated toxicity score of every `negative` text.
# NOTE(review): this runs the model over the column again; the label column
# was produced by a separate pass over the same texts.
data['toxic_negative_score'] = [text2toxicity(text, 'score') for text in (list(data['negative']))]

In [None]:
# To keep the per-label proportions, the rows of each label are split
# separately (70% train / 15% valid / 15% test) and the parts are concatenated
# afterwards.
from sklearn.model_selection import train_test_split

targets = ['tag_0', 'tag_1', 'tag_2', 'tag_3', 'tag_4', 'tag_5', 'tag_6', 'tag_7', 'tag_8']
df_list_train = []
df_list_valid = []
df_list_test = []
# A review can carry several labels. Each row is split exactly once, together
# with the first of its labels in `targets` order. Splitting every label's rows
# independently would draw the same review several times, so it could end up
# in train and in valid/test at once (leakage) and be duplicated after concat.
# Rows with none of the tags set are not part of any split.
already_split = pd.Series(False, index=data.index)
for target in targets:
    has_target = data[target] == 1
    df = data[has_target & ~already_split]
    already_split |= has_target
    if len(df) < 4:
        # With fewer than 4 rows the two-stage split cannot give train, valid
        # and test at least one row each (train_test_split would raise), so
        # the remainder is kept for training.
        df_list_train.append(df)
        continue
    df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
    df_valid, df_test = train_test_split(df_test, test_size=0.5, random_state=42)
    df_list_train.append(df_train)
    df_list_valid.append(df_valid)
    df_list_test.append(df_test)

In [None]:
def _merge_split(frames):
    """Concatenate per-label parts, order them by review_id and renumber the index."""
    merged = pd.concat(frames)
    ordered = merged.sort_values(by='review_id')
    return ordered.reset_index(drop=True)


data_train = _merge_split(df_list_train)
data_valid = _merge_split(df_list_valid)
data_test = _merge_split(df_list_test)

In [None]:
# Write the three splits to CSV files in the working directory, without the
# index column.
for csv_name, frame in (
    ('data_train.csv', data_train),
    ('data_valid.csv', data_valid),
    ('data_test.csv', data_test),
):
    frame.to_csv(csv_name, index=False)