<a href="https://colab.research.google.com/github/shitkov/categorizer/blob/main/get_data_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Download the HeadHunter test split and a Russian stop-word list into the working directory.
!wget https://boosters.pro/api/ch/files/pub/HeadHunter_test.csv
!wget https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt
# Install third-party packages; transformers and pymystem3 are imported by later cells.
!pip install transformers sentencepiece
!pip install pymystem3

# NOTE(review): `mstm` is not referenced again in the visible cells; presumably it is created
# so pymystem3 performs its first-run setup before the manual install below - confirm.
from pymystem3 import Mystem
mstm = Mystem()

# Fetch the MyStem 3.0 Linux binary, unpack it and copy it to /root/.local/bin/mystem.
!wget http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz
!tar -xvf mystem-3.0-linux3.1-64bit.tar.gz
!cp mystem /root/.local/bin/mystem

In [2]:
import re
import pandas as pd
from pymystem3 import Mystem
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
# Names of the engineered feature columns, grouped by the step that produces them.
columns = (
    # Text length, cleaned text and unique-word count for each review field.
    [
        'positive_len', 'negative_len',
        'clean_positive', 'clean_negative',
        'unique_positive', 'unique_negative',
    ]
    # Sentiment model outputs.
    + [
        'sentiment_positive_label', 'sentiment_negative_label',
        'sentiment_positive_score', 'sentiment_negative_score',
    ]
    # Emotion model outputs.
    + [
        'emotion_positive_label', 'emotion_negative_label',
        'emotion_positive_score', 'emotion_negative_score',
    ]
    # Toxicity model outputs.
    + [
        'toxic_positive_label', 'toxic_negative_label',
        'toxic_positive_score', 'toxic_negative_score',
    ]
)

In [4]:
# Load the HeadHunter test CSV fetched by the setup cell (assumes wget ran with /content as the working directory).
data = pd.read_csv('/content/HeadHunter_test.csv')

In [5]:
# Replace every missing value in the frame with an empty string; later cells treat the review fields as text.
data = data.fillna('')

In [6]:
def get_len(data):
    """Return character counts for the 'positive' and 'negative' columns.

    Args:
        data: DataFrame (or mapping) with 'positive' and 'negative' columns.

    Returns:
        Tuple ``(positive_lengths, negative_lengths)``; each is a list with
        ``len(str(value))`` for every value of the corresponding column.
    """
    def _char_counts(column):
        # str() makes non-string cells measurable too.
        return [len(str(value)) for value in list(data[column])]

    return _char_counts('positive'), _char_counts('negative')

In [7]:
# Character-length features for both review fields.
positive_len_list, negative_len_list = get_len(data)

data['positive_len'] = positive_len_list
# `columns` declares this feature as 'negative_len', matching 'positive_len';
# without this name, selecting data[columns] raises KeyError.
data['negative_len'] = negative_len_list
# Alias under the previously used name so existing consumers of the saved CSV keep working.
data['negative_len_list'] = negative_len_list

In [8]:
# The stop-word file is Cyrillic text: decode it explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open('/content/stopwords-ru.txt', encoding='utf-8') as f:
    # One stop word per line; drop only the trailing newline.
    stopwords = [line.rstrip('\n') for line in f]

# Lemmatizer instance passed to get_clean_text.
lemmatizer = Mystem()

def get_clean_text(texts, stopwords, lemmatizer):
    """Normalize, lemmatize and stop-word-filter a sequence of texts.

    Each text is lower-cased, every character other than a lowercase Russian
    letter or a space is replaced by a space, and runs of spaces are
    collapsed. Non-empty results are lemmatized, stop words are dropped and
    the standalone two-token form 'з п' is merged into 'зп'.

    Note: 'з п' is merged only when both letters are standalone tokens.
    A plain substring replace would also glue unrelated neighbours such as
    'груз перевозить' into 'грузперевозить'.

    Args:
        texts: Iterable of values; each one is converted with ``str``.
        stopwords: Iterable of lemmas to remove from the output.
        lemmatizer: Object whose ``lemmatize(text)`` returns a list of tokens.

    Returns:
        List of cleaned strings, one per input, in input order. Inputs that
        contain no Russian letters yield ''.
    """
    # Set membership is O(1); scanning a list once per token is not.
    stopword_set = set(stopwords)

    texts = [re.sub('[^а-яё ]', ' ', str(t).lower()) for t in texts]
    texts = [re.sub(r" +", " ", t).strip() for t in texts]

    clean_texts = []
    for text in tqdm(texts):
        if text:
            # The final token is dropped (presumably the trailing newline the
            # lemmatizer appends - confirm); bare-space tokens are separators.
            lemmas = [token for token in lemmatizer.lemmatize(text)[:-1] if token != ' ']
            text = ' '.join(word for word in lemmas if word not in stopword_set)
            # Workaround for the abbreviation: merge 'з п' only when neither
            # letter is part of a longer word.
            text = re.sub(r'(?<![а-яё])з п(?![а-яё])', 'зп', text)
        clean_texts.append(text)
    return clean_texts

# Store the lemmatized, stop-word-filtered version of each review field.
data['clean_positive'] = get_clean_text(list(data['positive']), stopwords, lemmatizer)
data['clean_negative'] = get_clean_text(list(data['negative']), stopwords, lemmatizer)

100%|██████████| 50651/50651 [00:53<00:00, 944.99it/s] 
100%|██████████| 50651/50651 [00:34<00:00, 1469.68it/s]


In [9]:
def get_unique(data):
    """Count distinct words in the cleaned 'positive' and 'negative' texts.

    Note: an empty text counts 0 words. Splitting on a literal ' ' would
    report 1 for it, because ``''.split(' ')`` is ``['']``; feature files
    built that way need regenerating to stay comparable.

    Args:
        data: DataFrame (or mapping) with 'clean_positive' and
            'clean_negative' columns of strings.

    Returns:
        Tuple ``(unique_positive, unique_negative)`` of lists of ints, one
        entry per row.
    """
    def _distinct_words(column):
        # split() without an argument yields [] for empty text and ignores
        # repeated whitespace, so no empty "word" is ever counted.
        return [len(set(text.split())) for text in list(data[column])]

    return _distinct_words('clean_positive'), _distinct_words('clean_negative')

# Distinct-word counts of the cleaned texts, stored as one feature per review field.
unique_positive_list, unique_negative_list = get_unique(data)

data['unique_positive'] = unique_positive_list
data['unique_negative'] = unique_negative_list

In [10]:
# Sentiment
# Load the sentiment checkpoint and its tokenizer; the model is moved to the GPU when CUDA is available.
# get_sentiment reads the module-level `tokenizer` and `model` at call time, and later cells
# rebind both names to other checkpoints, so the sentiment feature cells must run before those.
model_checkpoint = 'cointegrated/rubert-tiny-sentiment-balanced'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model.cuda()

def get_sentiment(text, return_type='label'):
    """Run the currently loaded sentiment model on one text.

    Reads the module-level ``tokenizer`` and ``model`` at call time.

    Args:
        text: Input passed to the tokenizer; only the first row of the model
            output is used.
        return_type: 'label' for the name of the highest-probability class,
            'score' for the probabilities weighted by [-1, 0, 1], anything
            else for the raw per-class probability vector.

    Returns:
        A label from ``model.config.id2label``, a scalar score, or a 1-D
        array of sigmoid probabilities.
    """
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
        logits = model(**encoded).logits
        probabilities = torch.sigmoid(logits).cpu().numpy()[0]

    if return_type == 'score':
        # Weighted sum: first class counts -1, second 0, third +1.
        return probabilities.dot([-1, 0, 1])
    if return_type == 'label':
        return model.config.id2label[probabilities.argmax()]
    return probabilities

Downloading:   0%|          | 0.00/377 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/884 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.0M [00:00<?, ?B/s]

In [11]:
# Predicted sentiment label for each 'positive' text; get_sentiment is called once per row.
data['sentiment_positive_label'] = [get_sentiment(text, 'label') for text in tqdm(list(data['positive']))]

100%|██████████| 50651/50651 [02:17<00:00, 367.16it/s]


In [12]:
# Predicted sentiment label for each 'negative' text; get_sentiment is called once per row.
data['sentiment_negative_label'] = [get_sentiment(text, 'label') for text in tqdm(list(data['negative']))]

100%|██████████| 50651/50651 [02:16<00:00, 370.33it/s]


In [13]:
# Sentiment score (class probabilities weighted by [-1, 0, 1]) for each 'positive' text.
data['sentiment_positive_score'] = [get_sentiment(text, 'score') for text in tqdm(list(data['positive']))]

100%|██████████| 50651/50651 [02:15<00:00, 374.13it/s]


In [14]:
# Sentiment score (class probabilities weighted by [-1, 0, 1]) for each 'negative' text.
data['sentiment_negative_score'] = [get_sentiment(text, 'score') for text in tqdm(list(data['negative']))]

100%|██████████| 50651/50651 [02:15<00:00, 374.84it/s]


In [15]:
# Emotion detection
# Rebind the module-level `tokenizer` and `model` to the emotion checkpoint; the model is moved
# to the GPU when CUDA is available. From here on every function that reads these globals
# (including get_sentiment) uses the emotion model.
model_checkpoint = 'cointegrated/rubert-tiny2-cedr-emotion-detection'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model.cuda()

def get_emotion(text, return_type='label'):
    """Run the currently loaded emotion model on one text.

    Reads the module-level ``tokenizer`` and ``model`` at call time.

    Args:
        text: Input passed to the tokenizer; only the first row of the model
            output is used.
        return_type: 'label' for the name of the highest-probability class,
            'score' for the probabilities weighted by [0, 1, -1, 0, -1, -1],
            anything else for the raw per-class probability vector.

    Returns:
        A label from ``model.config.id2label``, a scalar score, or a 1-D
        array of sigmoid probabilities.
    """
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
        logits = model(**encoded).logits
        probabilities = torch.sigmoid(logits).cpu().numpy()[0]

    if return_type == 'score':
        # Per-class weights in the model's class order; presumably +1 marks
        # the positive emotion and -1 the negative ones - confirm against
        # model.config.id2label.
        return probabilities.dot([0,1,-1,0,-1,-1])
    if return_type == 'label':
        return model.config.id2label[probabilities.argmax()]
    return probabilities

Downloading:   0%|          | 0.00/379 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/111M [00:00<?, ?B/s]

In [16]:
# Predicted emotion label for each 'positive' text; get_emotion is called once per row.
data['emotion_positive_label'] = [get_emotion(text, 'label') for text in tqdm(list(data['positive']))]

100%|██████████| 50651/50651 [02:14<00:00, 375.27it/s]


In [17]:
# Predicted emotion label for each 'negative' text; get_emotion is called once per row.
data['emotion_negative_label'] = [get_emotion(text, 'label') for text in tqdm(list(data['negative']))]

100%|██████████| 50651/50651 [02:16<00:00, 370.40it/s]


In [18]:
# Emotion score (class probabilities weighted by [0, 1, -1, 0, -1, -1]) for each 'positive' text.
data['emotion_positive_score'] = [get_emotion(text, 'score') for text in tqdm(list(data['positive']))]

100%|██████████| 50651/50651 [02:14<00:00, 377.22it/s]


In [19]:
# Emotion score (class probabilities weighted by [0, 1, -1, 0, -1, -1]) for each 'negative' text.
data['emotion_negative_score'] = [get_emotion(text, 'score') for text in tqdm(list(data['negative']))]

100%|██████████| 50651/50651 [02:13<00:00, 378.97it/s]


In [20]:
# toxicity
# Rebind the module-level `tokenizer` and `model` to the toxicity checkpoint; the model is moved
# to the GPU when CUDA is available. text2toxicity reads these globals at call time.
model_checkpoint = 'cointegrated/rubert-tiny-toxicity'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model.cuda()
    
def text2toxicity(text, return_type='label'):
    """Run the currently loaded toxicity model on one text or a batch of texts.

    Reads the module-level ``tokenizer`` and ``model`` at call time.

    Args:
        text: A single string, or a list of strings scored as one batch.
        return_type: 'label' for the name of the highest-probability class,
            'score' for the aggregate ``1 - p[first] * (1 - p[last])``,
            anything else for the raw per-class sigmoid probabilities.

    Returns:
        For a single string: a label, a scalar score, or a 1-D probability
        vector. For a list: a list of labels, a 1-D array of scores, or a
        2-D ``(n_texts, n_classes)`` probability matrix.
    """
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
        proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()
    if isinstance(text, str):
        proba = proba[0]

    if return_type == 'label':
        id2label = model.config.id2label
        if proba.ndim == 1:
            return id2label[int(proba.argmax())]
        # Batch input: take the best class per row. A flat argmax over the
        # whole matrix would give an index into the flattened array instead.
        return [id2label[int(index)] for index in proba.argmax(axis=-1)]
    elif return_type == 'score':
        # Presumably the first class is "non-toxic" and the last one is the
        # most severe category - confirm against model.config.id2label.
        return 1 - proba.T[0] * (1 - proba.T[-1])
    # Same fallback as get_sentiment / get_emotion: raw probabilities, not None.
    return proba

Downloading:   0%|          | 0.00/377 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/957 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.0M [00:00<?, ?B/s]

In [21]:
# Predicted toxicity label for each 'positive' text; one model call per row, no progress bar.
data['toxic_positive_label'] = [text2toxicity(text, 'label') for text in (list(data['positive']))]

In [22]:
# Predicted toxicity label for each 'negative' text; one model call per row, no progress bar.
data['toxic_negative_label'] = [text2toxicity(text, 'label') for text in (list(data['negative']))]

In [23]:
# Aggregate toxicity score (text2toxicity 'score' mode) for each 'positive' text.
data['toxic_positive_score'] = [text2toxicity(text, 'score') for text in (list(data['positive']))]

In [24]:
# Aggregate toxicity score (text2toxicity 'score' mode) for each 'negative' text.
data['toxic_negative_score'] = [text2toxicity(text, 'score') for text in (list(data['negative']))]

In [26]:
# Mount Google Drive at /drive so the result can be written there (requires the Colab runtime).
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


In [27]:
# Save the frame, including every feature column added above, to Google Drive.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is written as an
# extra unnamed first column - confirm that downstream readers expect it.
data.to_csv('/drive/MyDrive/ml/hh/data_test_featured.csv')