In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

from transformers.trainer_callback import EarlyStoppingCallback
from sklearn.metrics import roc_auc_score

import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
df = pd.read_csv('./compare_manual_manuallVSsemiauto/train_manualVSsemiauto_rst_2.csv')
df_val = pd.read_csv('./compare_manual_manuallVSsemiauto/val_manual_only_rst_2.csv')
df_test = pd.read_csv('./compare_manual_manuallVSsemiauto/test_manual_only_rst_2.csv')

In [3]:
df.head()

Unnamed: 0,text,offline_crime,online_crime,drugs,gambling,pornography,prostitution,slavery,suicide,terrorism,weapons,body_shaming,health_shaming,politics,racism,religion,sexual_minorities,sexism,social_injustice
0,есть такой лайфхак у футбольных фанатов перед ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Еще один йоба-знаток, у которого грабеж==разбой",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17
2,Зря тут этот пост. Теперь дом спиздят,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Не знаю статью, но один мужчина заступился физ...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.0
4,idДвачую адвоката адеквата.,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
necessary_columns = list(df.columns)[1:] 
# necessary_columns

In [5]:
def get_labels(dataframe):
    labels =[]
    for i, el in dataframe.iterrows():
        current_sample_labels = []
        any_class = False
        for clm in necessary_columns:
            if el[clm] == 1:
                any_class = True
                current_sample_labels.append(clm)
        if any_class == False:
            current_sample_labels.append("none")
        current_sample_labels = ','.join(current_sample_labels)
        labels.append(current_sample_labels)
    return labels
train_labels = get_labels(df)
val_labels = get_labels(df_val)
test_labels = get_labels(df_test)

In [6]:
len(set(test_labels))

194

In [7]:
df_train_adjusted = pd.DataFrame({'text':list(df['text']), 'labels':train_labels})
df_val_adjusted = pd.DataFrame({'text':list(df_val['text']), 'labels':val_labels})
df_test_adjusted = pd.DataFrame({'text':list(df_test['text']), 'labels':test_labels})

In [8]:
mapping = dict()
mapping['none'] = 0

for label in train_labels:
    if label not in mapping:
        mapping[label] = len(mapping)

for label in test_labels:
    if label not in mapping:
        mapping[label] = len(mapping)
        
for label in val_labels:
    if label not in mapping:
        mapping[label] = len(mapping)

In [9]:
len(mapping)

337

In [10]:
df_train_adjusted['class'] = df_train_adjusted['labels'].apply(lambda x: mapping[x])
df_test_adjusted['class'] = df_test_adjusted['labels'].apply(lambda x: mapping[x])
df_val_adjusted['class'] = df_val_adjusted['labels'].apply(lambda x: mapping[x])

In [11]:
labels_val = df_val_adjusted['labels'].tolist()

In [12]:
x_train = df_train_adjusted['text'].tolist()
y_train = df_train_adjusted['class'].tolist()
x_test = df_test_adjusted['text'].tolist()
y_test = df_test_adjusted['class'].tolist()
x_val = df_val_adjusted['text'].tolist()
y_val = df_val_adjusted['class'].tolist()

In [13]:
class UnsafeData(Dataset):

    def __init__(self, texts, targets, tokenizer, max_len):
        
        super().__init__()
        
        self.texts = texts
        self.targets = targets        
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        
        return len(self.texts)

    
    def __getitem__(self, index):
        x = self.texts[index]

        enc_dict = self.tokenizer(x, truncation=True, max_length=self.max_len, padding='max_length')
      
        item = {key: torch.tensor(val).long() for key, val in enc_dict.items()}
        item['labels'] = torch.tensor(self.targets[index]).long()

        return item 

In [14]:
model_name = 'DeepPavlov/rubert-base-cased-conversational'

In [15]:
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels = len(mapping))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# !nvidia-smi

In [17]:
train_dataset = UnsafeData(x_train, y_train, tokenizer, max_len = 60)
test_dataset = UnsafeData(x_test, y_test, tokenizer, max_len = 60)
val_dataset = UnsafeData(x_val, y_val, tokenizer, max_len = 60)

In [18]:
len(train_dataset), len(test_dataset), len(val_dataset)

(33100, 1585, 1442)

In [19]:
# val_dataset[10]

In [20]:
topics_list = necessary_columns + ['none']
# topics_list

In [21]:
target_vaiables_id2topic_dict = {val:key for key, val in mapping.items()}

In [22]:
# import json
# with open("id2topic.json","w") as f:
#     json.dump(target_vaiables_id2topic_dict, f, indent = 2)
    

In [23]:
len(target_vaiables_id2topic_dict)

337

In [24]:
def adjust_multilabel(y, is_pred = False):
    y_adjusted = []
    for y_c in y:
        y_test_curr = [0]*19
        if is_pred == True:
            y_c = target_vaiables_id2topic_dict[np.argmax(y_c)]
        else:
            y_c = target_vaiables_id2topic_dict[y_c]
        for tag in y_c.split(","):
            topic_index = topics_list.index(tag)
            y_test_curr[topic_index] = 1
        y_adjusted.append(y_test_curr)
    return y_adjusted



In [25]:
# X, y = load_breast_cancer(return_X_y=True)
# clf = LogisticRegression(solver="liblinear", random_state=0).fit(X, y)
# roc_auc_score(y, clf.predict_proba(X)[:, 1])

In [26]:
def compute_metrics(pred):
    labels = pred.label_ids
    labels = adjust_multilabel(labels, is_pred = False)
    preds = pred.predictions
    
    preds = adjust_multilabel(preds, is_pred = True)
        
    rauc = roc_auc_score(labels,preds)
    
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division = 0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'rauc':rauc
    }

In [27]:
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple

device = torch.device('cuda:1')

class TrAr(TrainingArguments):
    @cached_property
    def _setup_devices(self) -> Tuple["torch.device", int]:
        return device

In [28]:
torch.cuda.set_device(device)
model.to(device);

In [29]:
training_args = TrAr(
    output_dir='/multi_model/publ',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps = 600,
    evaluation_strategy = 'steps',
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_steps = 500,
    eval_steps = 500, #500,
    metric_for_best_model  = 'f1',
    greater_is_better = True,
    load_best_model_at_end = True, report_to = 'none' 
)

In [30]:
import gc
def cleanup():
    gc.collect()
    torch.cuda.empty_cache()
    
cleanup()

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [33]:
trainer.add_callback(EarlyStoppingCallback(2))

In [34]:
trainer.train()

***** Running training *****
  Num examples = 33100
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 10345


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Rauc
500,No log,2.630738,0.478502,0.486983,0.724036,0.477069,0.71794
1000,3.040000,2.387581,0.514563,0.608976,0.771198,0.551924,0.782451
1500,1.129600,2.178283,0.538141,0.621697,0.772885,0.568793,0.766451
2000,0.978200,1.961584,0.5638,0.681448,0.776126,0.616236,0.7884
2500,0.804900,1.902267,0.573509,0.677517,0.781782,0.617818,0.784402
3000,0.665100,1.934716,0.576283,0.687686,0.78035,0.631523,0.791105
3500,0.665100,1.92318,0.578363,0.684015,0.786313,0.625198,0.806356
4000,0.655600,1.742534,0.602635,0.701976,0.801302,0.647865,0.795234
4500,0.618000,1.87351,0.594313,0.718118,0.795319,0.665261,0.817082
5000,0.416600,1.836406,0.60957,0.716671,0.786064,0.672114,0.795633


***** Running Evaluation *****
  Num examples = 1442
  Batch size = 16
Saving model checkpoint to /multi_model/publ/checkpoint-500
Configuration saved in /multi_model/publ/checkpoint-500/config.json
Model weights saved in /multi_model/publ/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1442
  Batch size = 16
Saving model checkpoint to /multi_model/publ/checkpoint-1000
Configuration saved in /multi_model/publ/checkpoint-1000/config.json
Model weights saved in /multi_model/publ/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1442
  Batch size = 16
Saving model checkpoint to /multi_model/publ/checkpoint-1500
Configuration saved in /multi_model/publ/checkpoint-1500/config.json
Model weights saved in /multi_model/publ/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1442
  Batch size = 16
Saving model checkpoint to /multi_model/publ/checkpoint-2000
Configuration saved in /multi_model/publ/ch

TrainOutput(global_step=8000, training_loss=0.7502470574378968, metrics={'train_runtime': 944.2284, 'train_samples_per_second': 175.275, 'train_steps_per_second': 10.956, 'total_flos': 3958165616781600.0, 'train_loss': 0.7502470574378968, 'epoch': 3.87})

In [35]:
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 1585
  Batch size = 16


{'eval_loss': 2.443035364151001,
 'eval_accuracy': 0.5709779179810726,
 'eval_f1': 0.6998859950532954,
 'eval_precision': 0.7702140126538036,
 'eval_recall': 0.651976374375284,
 'eval_rauc': 0.8050120193381921,
 'eval_runtime': 2.9449,
 'eval_samples_per_second': 538.214,
 'eval_steps_per_second': 33.957,
 'epoch': 3.87}

In [1]:
rst2 = {'eval_loss': 2.443035364151001,
 'eval_accuracy': 0.5709779179810726,
 'eval_f1': 0.6998859950532954,
 'eval_precision': 0.7702140126538036,
 'eval_recall': 0.651976374375284,
 'eval_rauc': 0.8050120193381921,
 'eval_runtime': 2.9449,
 'eval_samples_per_second': 538.214,
 'eval_steps_per_second': 33.957,
 'epoch': 3.87}

In [2]:
rst1 = {'eval_loss': 2.356205701828003,
 'eval_accuracy': 0.5754895767530006,
 'eval_f1': 0.7109142859770795,
 'eval_precision': 0.7573820406086228,
 'eval_recall': 0.6744186046511628,
 'eval_rauc': 0.8187230883371196,
 'eval_runtime': 3.0342,
 'eval_samples_per_second': 521.714,
 'eval_steps_per_second': 32.628,
 'epoch': 4.83}

In [3]:
rst0 = {'eval_loss': 2.919194221496582,
 'eval_accuracy': 0.6082018927444794,
 'eval_f1': 0.7347078777111259,
 'eval_precision': 0.7787538848299613,
 'eval_recall': 0.7023004059539919,
 'eval_rauc': 0.8316562762180016,
 'eval_runtime': 3.0124,
 'eval_samples_per_second': 526.152,
 'eval_steps_per_second': 33.196,
 'epoch': 9.18}

In [4]:
res_list = [rst0, rst1, rst2]

In [7]:
import numpy as np
collected_data = []
for r in res_list:
    collected_data.append([r['eval_precision'],r['eval_recall'],r['eval_f1'],r['eval_rauc']])

In [8]:
np.mean(collected_data, axis = 0)

array([0.76878331, 0.67623179, 0.71516939, 0.81846379])

In [9]:
np.std(collected_data, axis = 0)

array([0.00878347, 0.02058467, 0.01453089, 0.01087902])

In [None]:
trainer.save_model('multi-class')

Оценка на val_dataset

In [50]:
pred = trainer.predict(val_dataset)



In [51]:
pr = pred.predictions

In [52]:
len(df),len(df_test), len(df_val), len(adjust_multilabel(y_val, is_pred = False))

(31130, 1481, 692, 692)

In [53]:
print(classification_report(adjust_multilabel(y_val, is_pred = False), adjust_multilabel(pr, is_pred = True),
                           target_names=topics_list, zero_division = 0))

                   precision    recall  f1-score   support

    offline_crime       0.64      0.54      0.58        52
     online_crime       0.46      0.43      0.44        14
            drugs       0.88      0.88      0.88        41
         gambling       0.50      0.50      0.50         2
      pornography       0.77      0.68      0.72        87
     prostitution       0.87      0.80      0.84        41
          slavery       0.72      0.87      0.79        15
          suicide       0.50      0.67      0.57         3
        terrorism       0.50      0.39      0.44        18
          weapons       0.90      0.94      0.92        65
     body_shaming       0.86      0.67      0.75        48
   health_shaming       0.86      0.65      0.74        49
         politics       0.73      0.56      0.63       109
           racism       0.82      0.59      0.69        86
         religion       0.90      0.80      0.84        44
sexual_minorities       0.69      0.55      0.61       

Оценка на test_dataset

In [105]:
pred2 = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 1585
  Batch size = 16


In [106]:
pr2 = pred2.predictions

In [107]:
print(classification_report(adjust_multilabel(y_test, is_pred = False), adjust_multilabel(pr2, is_pred = True),
                           target_names=topics_list, zero_division = 0))

                   precision    recall  f1-score   support

    offline_crime       0.69      0.60      0.64       124
     online_crime       0.88      0.50      0.64        42
            drugs       0.87      0.80      0.84        86
         gambling       0.67      0.67      0.67         9
      pornography       0.79      0.67      0.72       180
     prostitution       0.77      0.82      0.80        88
          slavery       0.81      0.85      0.83        34
          suicide       0.44      0.44      0.44         9
        terrorism       0.67      0.66      0.67        44
          weapons       0.92      0.90      0.91       233
     body_shaming       0.84      0.78      0.81       107
   health_shaming       0.83      0.74      0.78       104
         politics       0.76      0.60      0.67       222
           racism       0.85      0.65      0.73       177
         religion       0.93      0.67      0.78        95
sexual_minorities       0.78      0.54      0.63       

In [60]:
import os
path = "../../../../../russian-sensitive-topics"
os.listdir(path)

['.git', '.gitattributes']

In [61]:
trainer.save_model(path)

In [62]:
tokenizer.save_pretrained(path)

('../../../../../russian-sensitive-topics/tokenizer_config.json',
 '../../../../../russian-sensitive-topics/special_tokens_map.json',
 '../../../../../russian-sensitive-topics/vocab.txt',
 '../../../../../russian-sensitive-topics/added_tokens.json')

In [63]:
from transformers import TFBertForSequenceClassification

In [64]:
tf_model = TFBertForSequenceClassification.from_pretrained(path, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [65]:
tf_model.save_pretrained(path)