In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 7.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 69.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [56]:
import os

# For data manipulation
import numpy as np
import pandas as pd
import warnings
import pickle

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
from pylab import rcParams

import torch
import torch.nn as nn

# For Transformer Models
from transformers import (
    AdamW,
    AutoTokenizer, 
    pipeline,
    AutoModel,
    AutoConfig,
    AutoModelForSequenceClassification, 
    AutoModelForTokenClassification,
    DataCollatorForLanguageModeling, 
    get_scheduler,
    Trainer, 
    TrainingArguments
)

from tqdm.auto import tqdm

## Загрузка данных

In [57]:
df = pd.read_csv("https://raw.githubusercontent.com/skillfactory-hackaton-team9/twitter-analyzer/main/parsed_data/twitter.csv", index_col=0)

In [58]:
df.head()

Unnamed: 0,date,author,twitter_name,text,number_of_reply,number_of_retweets,number_of_likes
0,2016-10-30T10:16:12.000Z,Виталий Матросов,@vit_matrosov,"Тренинг ""Управление конфликтом"" @ Семинар ""Бла...",0,0,0
1,2016-10-25T07:52:13.000Z,Марцева Милана,@kp_milana,"На субботнике в фонде ""Подари жизнь""\n#субботн...",0,0,0
2,2016-10-20T21:51:02.000Z,Евгения Малеваная,@EjonokEvgenia,благотворительный фонд подари жизнь презентаци...,0,0,0
3,2016-10-20T08:31:43.000Z,Новости Улан-Удэ,@UlanUde_Novosti,Фонд «Подари мне жизнь» вручил подарки осужде...,0,0,0
4,2016-10-20T08:11:19.000Z,Новости России,@Russia_all_News,Фонд «Подари мне жизнь» вручил подарки осужден...,0,0,0


In [59]:
df.shape

(1110, 7)

In [60]:
def text2toxicity(text, aggregate=False):
    """ Calculate toxicity of a text (if aggregate=True) or a vector of toxicity aspects (if aggregate=False)"""
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
        proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()
    if isinstance(text, str):
        proba = proba[0]
    if aggregate:
        return 1 - proba.T[0] * (1 - proba.T[-1])
    return proba

## Предсказания первой модели

In [61]:
MODEL_NAME = 'cointegrated/rubert-tiny-toxicity'

In [62]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
if torch.cuda.is_available():
    model.cuda()

Downloading:   0%|          | 0.00/377 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/957 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.0M [00:00<?, ?B/s]

In [63]:
df.text.head(10)

0    Тренинг "Управление конфликтом" @ Семинар "Бла...
1    На субботнике в фонде "Подари жизнь"\n#субботн...
2    благотворительный фонд подари жизнь презентаци...
3    Фонд «Подари мне жизнь»  вручил подарки осужде...
4    Фонд «Подари мне жизнь» вручил подарки осужден...
5    Кружка из-под кофе с молоком недельной давност...
6    фонд "Подари жизнь" передал в дар лаборатории ...
7    Фонд "Подари жизнь" и "ТВ Центр" собирают сред...
8    Фонд "Подари жизнь" и "ТВ Центр" собирают сред...
9    HAPPY BIRTHDAY, CHULPAN! Today is the birthday...
Name: text, dtype: object

In [64]:
sentiment = df.text.apply(text2toxicity)

In [65]:
sentiment_m1_df = pd.DataFrame.from_records(sentiment, columns=["m1_notoxic", "m1_insult", "m1_obscenity", "m1_threat", "m1_dangerous"])

In [66]:
df.join(sentiment_m1_df)

Unnamed: 0,date,author,twitter_name,text,number_of_reply,number_of_retweets,number_of_likes,m1_notoxic,m1_insult,m1_obscenity,m1_threat,m1_dangerous
0,2016-10-30T10:16:12.000Z,Виталий Матросов,@vit_matrosov,"Тренинг ""Управление конфликтом"" @ Семинар ""Бла...",0,0,0,0.999103,0.000713,0.000271,0.000153,0.228175
1,2016-10-25T07:52:13.000Z,Марцева Милана,@kp_milana,"На субботнике в фонде ""Подари жизнь""\n#субботн...",0,0,0,0.995868,0.003864,0.000246,0.000664,0.703467
2,2016-10-20T21:51:02.000Z,Евгения Малеваная,@EjonokEvgenia,благотворительный фонд подари жизнь презентаци...,0,0,0,0.998906,0.001153,0.000230,0.000164,0.586002
3,2016-10-20T08:31:43.000Z,Новости Улан-Удэ,@UlanUde_Novosti,Фонд «Подари мне жизнь» вручил подарки осужде...,0,0,0,0.999766,0.000276,0.000130,0.000217,0.288635
4,2016-10-20T08:11:19.000Z,Новости России,@Russia_all_News,Фонд «Подари мне жизнь» вручил подарки осужден...,0,0,0,0.999573,0.000562,0.000200,0.000365,0.573881
...,...,...,...,...,...,...,...,...,...,...,...,...
1105,2016-11-15T17:20:30.000Z,Елена Иванова,@Elena81985,Я перечисляю каждый месяц в эти два благотвори...,0,0,0,0.999427,0.000596,0.000206,0.000157,0.190671
1106,2016-11-15T16:48:23.000Z,Irina Adnan,@IreneCh87,"Поможем деткам! Фонд ""Подари жизнь"": https://d...",0,0,0,0.999227,0.000762,0.000175,0.000170,0.484968
1107,2016-11-15T08:10:00.000Z,Елена Иванова,@Elena81985,"Чтобы сделать пожертвование в Фонд ""Подари жиз...",0,0,0,0.999605,0.000434,0.000252,0.000277,0.199245
1108,2016-11-14T14:17:10.000Z,Melon Rich,@Melon_Rich,"Фонд «Подари жизнь», «Азбука Вкуса» и российск...",0,0,0,0.999619,0.000376,0.000241,0.000146,0.299836


In [67]:
df.shape

(1110, 7)

## Вторая модель

In [68]:
MODEL_NAME = 'SkolkovoInstitute/russian_toxicity_classifier'

In [69]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
if torch.cuda.is_available():
    model.cuda()

Downloading:   0%|          | 0.00/585 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/679M [00:00<?, ?B/s]

In [70]:
df.text.head(10).apply(text2toxicity)

0      [0.930439, 0.10373296]
1      [0.957753, 0.07210839]
2    [0.9781948, 0.035129562]
3     [0.9561413, 0.08019254]
4     [0.9458012, 0.09931419]
5     [0.9470253, 0.07730452]
6    [0.9702223, 0.048114214]
7    [0.9635194, 0.061070666]
8     [0.9408365, 0.09904781]
9     [0.18996058, 0.6976328]
Name: text, dtype: object

In [71]:
df.loc[9,"text"]

'HAPPY BIRTHDAY, CHULPAN! Today is the birthday of famous actress, mother of three and Фонд Подари жизнь... http://fb.me/SofWKYfJ'

In [72]:
sentiment2 = df.text.apply(text2toxicity)

In [73]:
sentiment_m2_df = pd.DataFrame.from_records(sentiment2, columns=["m2_neutral", "m2_toxic",])

## Третья модель (определение эмоций)

In [74]:
MODEL_NAME = 'cointegrated/rubert-tiny2-cedr-emotion-detection'

In [75]:
labels = ["no_emotion", "joy", "sadness", "surprise", "fear", "anger"]

In [76]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
if torch.cuda.is_available():
    model.cuda()

In [77]:
df.text.head(10).apply(text2toxicity)

0    [0.99043334, 0.0045969104, 0.005468997, 0.0045...
1    [0.66048443, 0.17482068, 0.0027271644, 0.00532...
2    [0.20940742, 0.7338375, 0.0023706034, 0.015222...
3    [0.94594383, 0.031731542, 0.001756635, 0.00696...
4    [0.8790501, 0.06935868, 0.0020480987, 0.005457...
5    [0.9901203, 0.0051632514, 0.007601077, 0.00477...
6    [0.99127626, 0.0048513245, 0.0060705934, 0.005...
7    [0.99079984, 0.0054704007, 0.0053896694, 0.004...
8    [0.99038434, 0.006141322, 0.004957042, 0.00465...
9    [0.014773562, 0.9481621, 0.004153302, 0.034658...
Name: text, dtype: object

In [78]:
df.loc[2,"text"]

'благотворительный фонд подари жизнь презентация http://amnagarments.com/layouts/board/blagotvoritelnyy-fond-podari-zhizn-prezentaciya-ce.html…'

In [79]:
emotions3 = df.text.apply(text2toxicity)

In [80]:
emotions_m3_df = pd.DataFrame.from_records(emotions3, columns=[f"m3_{label}" for label in labels])

## 4-ая модель 

In [81]:
MODEL_NAME = 'Aniemore/rubert-tiny2-russian-emotion-detection'

In [82]:
labels = ['neutral', 'happiness', 'sadness', 'enthusiasm', 'fear', 'anger', 'disgust']

In [83]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
if torch.cuda.is_available():
    model.cuda()

In [84]:
emotions4 = df.text.apply(text2toxicity)

In [85]:
emotions_m4_df = pd.DataFrame.from_records(emotions4, columns=[f"m4_{label}" for label in labels])

## Объединим результаты

In [86]:
result_df = (
    df.join(sentiment_m1_df).
      join(sentiment_m2_df).
      join(emotions_m3_df).
      join(emotions_m4_df)
)

In [87]:
result_df.head()

Unnamed: 0,date,author,twitter_name,text,number_of_reply,number_of_retweets,number_of_likes,m1_notoxic,m1_insult,m1_obscenity,...,m3_surprise,m3_fear,m3_anger,m4_neutral,m4_happiness,m4_sadness,m4_enthusiasm,m4_fear,m4_anger,m4_disgust
0,2016-10-30T10:16:12.000Z,Виталий Матросов,@vit_matrosov,"Тренинг ""Управление конфликтом"" @ Семинар ""Бла...",0,0,0,0.999103,0.000713,0.000271,...,0.0045,0.005192,0.00535,0.985386,0.005814,0.005619,0.005058,0.004997,0.00588,0.002905
1,2016-10-25T07:52:13.000Z,Марцева Милана,@kp_milana,"На субботнике в фонде ""Подари жизнь""\n#субботн...",0,0,0,0.995868,0.003864,0.000246,...,0.005324,0.002251,0.025031,0.595108,0.051616,0.007104,0.020108,0.013577,0.371969,0.011223
2,2016-10-20T21:51:02.000Z,Евгения Малеваная,@EjonokEvgenia,благотворительный фонд подари жизнь презентаци...,0,0,0,0.998906,0.001153,0.00023,...,0.015223,0.002595,0.041638,0.337133,0.128092,0.005587,0.071242,0.006938,0.310579,0.010395
3,2016-10-20T08:31:43.000Z,Новости Улан-Удэ,@UlanUde_Novosti,Фонд «Подари мне жизнь» вручил подарки осужде...,0,0,0,0.999766,0.000276,0.00013,...,0.006967,0.002752,0.007393,0.812209,0.066211,0.003152,0.033642,0.003603,0.052148,0.005064
4,2016-10-20T08:11:19.000Z,Новости России,@Russia_all_News,Фонд «Подари мне жизнь» вручил подарки осужден...,0,0,0,0.999573,0.000562,0.0002,...,0.005458,0.002653,0.016325,0.311728,0.100365,0.012316,0.035802,0.01719,0.638166,0.019009


In [88]:
result_df.shape

(1110, 27)

In [89]:
result_df.to_csv("sentiment_from_4_models.csv")

In [91]:
result_df.sort_values("m4_disgust", ascending=False).text.head().values

array(['пиздец я только сейчас узнала что «подари жизнь» которым столько прикрывали и прикрывают хаматову это даже не фонд хаматовой',
       'С подачи И.В.Безруковой интернет разукрашен бредом обо мне и С.В.Безрукове, и опять же устами той, что всех детей предлагала травить зарином. Весёлая компания идиоток, учитывая, что фонд их Подари жизнь тоже для детей, но строящий хосписы для умирающих, а не больницы для лечения',
       'Фонд «Зекет» запустил акцию «Подари надежду на жизнь» https://kazislam.kz/%d1%84%d0%be%d0%bd%d0%b4-%d0%b7%d0%b5%d0%ba%d0%b5%d1%82-%d0%b7%d0%b0%d0%bf%d1%83%d1%81%d1%82%d0%b8%d0%bb-%d0%b0%d0%ba%d1%86%d0%b8%d1%8e-%d0%bf%d0%be%d0%b4%d0%b0%d1%80%d0%b8-%d0%bd/?lang=ru… через @Қазақстандағы Ислам',
       'круто прикрываться больными? Фондом  с хозяевами мз Великобритании? "Вы плюнете мне в лицо, скажете все гадости, какие только хотели, но переведете деньги в фонд «Подари жизнь»\n\nПодробнее на РБК:\nhttps://rbc.ru/society/23/04/2020/5ea120159a79476a5a32a0de?utm_sour

## Определение объектов в тексте

In [41]:
MODEL_NAME = 'KoichiYasuoka/bert-base-russian-upos'

In [48]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
if torch.cuda.is_available():
    model.cuda()
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [55]:
text = "Я ездил в барселону"
with torch.no_grad():
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
    result = model(**inputs).cpu()

AttributeError: ignored

In [54]:
result

TokenClassifierOutput([('logits',
                        tensor([[[ 2.8388e-01, -2.2314e+00, -5.0003e-02, -1.0622e+00,  2.3878e-02,
                                  -3.8417e-02, -6.9057e-01, -1.2558e+00, -9.0557e-01, -1.7249e+00,
                                  -2.1065e+00, -6.6369e-01, -9.3004e-01, -5.4125e-01, -1.7381e+00,
                                  -6.9241e-01, -1.1078e+00,  1.5367e-01, -3.5232e-01,  3.6398e-01,
                                   3.2950e-01, -1.8395e+00, -8.7411e-01, -1.5958e+00, -8.9268e-01,
                                  -8.7887e-01, -2.0593e+00,  2.1465e-01, -4.1670e-01, -7.4354e-01,
                                  -6.0911e-01, -7.1134e-01, -4.4692e-01, -1.0846e+00, -1.4355e+00,
                                  -2.4782e-01, -1.4506e+00,  4.3895e-01, -2.6500e-01, -5.5100e-01,
                                  -1.3554e+00, -1.0825e+00, -1.8040e+00, -1.4799e+00, -1.0781e+00,
                                  -1.4917e+00, -1.8832e-01, -1.4016e+00, -6