# Использование

## pipeline

In [1]:
from transformers import pipeline

# Two sample comments to classify.
text = [
    'У нас в есть убунты и текникал превью.',
    'Как минимум два малолетних дегенерата в треде, мда.',
]

# Build a classification pipeline around the Russian toxicity checkpoint.
clf = pipeline(
    task='sentiment-analysis',
    model='SkolkovoInstitute/russian_toxicity_classifier',
)

# Bare last expression: the notebook displays one {'label', 'score'} dict per input.
clf(text)

[{'label': 'neutral', 'score': 0.9872767329216003},
 {'label': 'toxic', 'score': 0.985331654548645}]

In [3]:
# Build a question-answering pipeline with an explicitly chosen tokenizer;
# the bare expression makes the notebook display the resulting pipeline object.
pipeline(
    task='question-answering',
    model='distilbert-base-cased-distilled-squad',
    tokenizer='bert-base-cased',
)

<transformers.pipelines.question_answering.QuestionAnsweringPipeline at 0x7fc057f79490>

In [6]:
from transformers import pipeline

# Two sample comments to classify.
text = [
    'У нас в есть убунты и текникал превью.',
    'Как минимум два малолетних дегенерата в треде, мда.',
]

# Build a classification pipeline around the Russian toxicity checkpoint.
clf = pipeline(
    task='sentiment-analysis',
    model='SkolkovoInstitute/russian_toxicity_classifier',
)

# top_k=None asks for the score of every label, not only the best one,
# so each input yields a list of {'label', 'score'} dicts.
clf(text, top_k=None)

[[{'label': 'neutral', 'score': 0.9872767329216003},
  {'label': 'toxic', 'score': 0.012723307125270367}],
 [{'label': 'toxic', 'score': 0.985331654548645},
  {'label': 'neutral', 'score': 0.01466838177293539}]]

In [7]:
from transformers import pipeline

# Two sample comments to classify.
text = [
    'У нас в есть убунты и текникал превью.',
    'Как минимум два малолетних дегенерата в треде, мда.',
]

# Build a classification pipeline around the Russian toxicity checkpoint.
clf = pipeline(
    task='sentiment-analysis',
    model='SkolkovoInstitute/russian_toxicity_classifier',
)

def data(text):
    """Lazily yield the items of ``text`` one at a time, in their original order."""
    yield from text

# Feed the pipeline a generator instead of a list and print each prediction
# as it is produced.
for out in clf(data(text)):
    print(out)

{'label': 'neutral', 'score': 0.9872767329216003}
{'label': 'toxic', 'score': 0.985331654548645}


## torch

In [14]:
import torch
import requests
from PIL import Image
from io import BytesIO
from transformers import AutoImageProcessor, AutoModelForImageClassification

# Download the sample image. The timeout stops a hung connection from stalling
# the cell forever, and raise_for_status() surfaces an HTTP error here instead
# of letting PIL fail later on an error page.
response = requests.get(
    'https://github.com/laxmimerit/dog-cat-full-dataset/blob/master/data/train/cats/cat.10055.jpg?raw=true',
    timeout=30)
response.raise_for_status()
img = Image.open(BytesIO(response.content))

# Image processor and classifier are loaded from the same checkpoint id.
feature_extractor = AutoImageProcessor.from_pretrained('google/vit-base-patch16-224')
model = AutoModelForImageClassification.from_pretrained('google/vit-base-patch16-224')

# Convert the PIL image into the PyTorch tensors the model takes as keyword arguments.
inputs = feature_extractor(img, return_tensors='pt')

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model(**inputs).logits

# Highest-scoring class index, mapped to its label through the model config.
predicted_id = logits.argmax(-1).item()
predicted_label = model.config.id2label[predicted_id]
print(predicted_id, '-', predicted_label)

281 - tabby, tabby cat


# Обучение

## trainer

In [1]:
import datasets
import evaluate
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Fixed seed so the sampled subset and the train/test split are identical on
# every run; without it the reported metrics are not reproducible.
SEED = 42

# Work on a 1000-row random subset to keep the demo fast.
df = pd.read_csv('toxic.csv').sample(1000, random_state=SEED)
# NOTE(review): assumes toxic.csv has exactly two columns (text first, label second) - confirm.
df.columns = ['text','label']
df['label'] = df['label'].astype(int)

# 70/30 split, then wrap both parts as Hugging Face datasets.
train, test = train_test_split(df, test_size=0.3, random_state=SEED)
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)

tokenizer = AutoTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
def tokenize_function(examples):
    """Tokenize the 'text' field, padding to the tokenizer's max length and truncating longer inputs."""
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_train = train.map(tokenize_function)
tokenized_test = test.map(tokenize_function)

  0%|          | 0/700 [00:00<?, ?ex/s]

  0%|          | 0/300 [00:00<?, ?ex/s]

In [4]:
# Binary sequence classifier initialised from the toxicity checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    'SkolkovoInstitute/russian_toxicity_classifier', num_labels=2,
)

metric = evaluate.load('f1')

def compute_metrics(eval_pred):
    """Convert the (logits, labels) pair passed by the Trainer into an F1 result dict."""
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)

# Evaluate once per epoch; report_to='none' disables external experiment loggers.
training_args = TrainingArguments(
    output_dir='test_trainer_log',
    evaluation_strategy='epoch',
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=5,
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

loading configuration file config.json from cache at /home/slivka_83/.cache/huggingface/hub/models--SkolkovoInstitute--russian_toxicity_classifier/snapshots/2b9a086ec05c2dc202fea11ed15f317b1676b18c/config.json
Model config BertConfig {
  "_name_or_path": "SkolkovoInstitute/russian_toxicity_classifier",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "neutral",
    "1": "toxic"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "neutral": 0,
    "toxic": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_

In [5]:
# Run fine-tuning with the configured Trainer; the bare expression makes the
# notebook display the returned TrainOutput.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 700
  Num Epochs = 5
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 585
  Number of trainable parameters = 177854978


Epoch,Training Loss,Validation Loss,F1
1,No log,0.401824,0.86758
2,No log,0.617232,0.855769
3,No log,0.696998,0.877828
4,No log,0.708729,0.875576
5,0.079500,0.714718,0.875576


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 300
  Batch size = 6
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 300
  Batch size = 6
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can sa

TrainOutput(global_step=585, training_loss=0.06792833231405443, metrics={'train_runtime': 244.4831, 'train_samples_per_second': 14.316, 'train_steps_per_second': 2.393, 'total_flos': 920888693760000.0, 'train_loss': 0.06792833231405443, 'epoch': 5.0})

In [4]:
save_directory = './pt_save_pretrained'
# Save the tokenizer files next to the model weights so the directory is
# self-contained and can be reloaded with from_pretrained(save_directory)
# without going back to the original checkpoint for the vocabulary.
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

Configuration saved in ./pt_save_pretrained/config.json
Model weights saved in ./pt_save_pretrained/pytorch_model.bin


## torch

In [1]:
import torch
import evaluate
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm

# Fixed seed so the sampled subset, the train/test split and the shuffled
# batch order are the same on every run.
SEED = 42
torch.manual_seed(SEED)

# Work on a 3000-row random subset to keep the demo fast.
df = pd.read_csv('toxic.csv').sample(3_000, random_state=SEED)
# NOTE(review): assumes toxic.csv has exactly two columns (text first, label second) - confirm.
df.columns = ['text','label']
df['label'] = df['label'].astype(int)

# 80/20 split, then wrap both parts as Hugging Face datasets.
train, test = train_test_split(df, test_size=0.2, random_state=SEED)
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)

tokenizer = AutoTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
def tokenize_function(examples):
    """Tokenize the 'text' field, padding to the tokenizer's max length and truncating longer inputs."""
    return tokenizer(examples['text'], padding='max_length', truncation=True)

def ds_preproc(ds):
    """Tokenize a dataset and reshape it for a plain PyTorch loop.

    Drops the raw text and the pandas index column, renames 'label' to
    'labels' and switches the dataset to return torch tensors.
    """
    ds = ds.map(tokenize_function)
    ds = ds.remove_columns(['text', '__index_level_0__'])
    ds = ds.rename_column('label', 'labels')
    ds.set_format('torch')
    return ds

tokenized_train = ds_preproc(train)
tokenized_test = ds_preproc(test)

# Only the training batches are shuffled; evaluation order stays fixed.
train_dataloader = DataLoader(tokenized_train, shuffle=True, batch_size=8)
test_dataloader = DataLoader(tokenized_test, batch_size=8)

  0%|          | 0/2400 [00:00<?, ?ex/s]

  0%|          | 0/600 [00:00<?, ?ex/s]

In [2]:
# Binary sequence classifier initialised from the toxicity checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    'SkolkovoInstitute/russian_toxicity_classifier',
    num_labels=2)

# NOTE(review): lr=1e-7 is a very small step size for fine-tuning - confirm it is intentional.
optimizer = AdamW(model.parameters(), lr=1e-7)

# Linear decay of the learning rate over every optimizer step of the run, no warm-up.
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name = 'linear', 
    optimizer = optimizer, 
    num_warmup_steps = 0, 
    num_training_steps = num_training_steps
)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [3]:
# Load the metric once instead of once per epoch: compute() clears the batches
# accumulated through add_batch(), so the same object can be reused.
metric = evaluate.load('f1')

for epoch in tqdm(range(num_epochs)):
    # --- training pass ---
    model.train()
    for batch in tqdm(train_dataloader, leave=False):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # --- evaluation pass ---
    model.eval()
    for batch in tqdm(test_dataloader, leave=False):
        batch = {k: v.to(device) for k, v in batch.items()}
        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        metric.add_batch(predictions=predictions, references=batch['labels'])

    print(f'epoch {epoch} -', metric.compute())

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

epoch 0 - {'f1': 0.8862559241706162}


  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

epoch 1 - {'f1': 0.8941176470588236}


  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

epoch 2 - {'f1': 0.9018691588785047}


  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

epoch 3 - {'f1': 0.9018691588785047}


  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

epoch 4 - {'f1': 0.9018691588785047}


## embeddings

### sentence_transformers

In [None]:
#https://www.sbert.net/

In [4]:
# pandas is imported here so the cell does not depend on an earlier cell
# having been run first.
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the toxicity checkpoint as a sentence encoder.
model = SentenceTransformer('SkolkovoInstitute/russian_toxicity_classifier')

text = ['У нас в есть убунты и текникал превью.',
        'Как минимум два малолетних дегенерата в треде, мда.']

# One embedding vector per input sentence.
embeddings = model.encode(text)

# One row per sentence, one column per embedding dimension.
df = pd.DataFrame(embeddings)

df

No sentence-transformers model found with name /home/slivka_83/.cache/torch/sentence_transformers/SkolkovoInstitute_russian_toxicity_classifier. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/slivka_83/.cache/torch/sentence_transformers/SkolkovoInstitute_russian_toxicity_classifier were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.168708,-0.379199,0.288084,0.100402,0.412716,0.362394,0.127428,-0.517129,0.379522,-0.71949,...,-0.061971,-0.007156,-0.701285,-0.724906,-0.102113,0.10003,-0.452336,-0.047709,0.368882,-0.322735
1,-0.045079,-0.097835,-0.479516,-0.319487,0.071947,-0.116214,1.176675,1.031475,-0.618434,-0.281351,...,0.314532,0.450745,-0.385783,0.696997,0.175919,-0.393985,0.645511,0.190868,0.15747,0.147497


### Усреднение

In [5]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, counting only unmasked tokens.

    ``model_output[0]`` holds the per-token embeddings. ``attention_mask`` is
    broadcast to the same shape so masked positions contribute nothing to the
    sum, and the divisor is the number of unmasked tokens (clamped away from
    zero so an all-masked row does not divide by zero).
    """
    hidden = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_total = (hidden * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

# Sentences we want sentence embeddings for
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of string.',
             'The quick brown fox jumps over the lazy dog.']

# Load tokenizer and bare encoder from the Hugging Face model repository
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Tokenize the sentences into one padded batch of PyTorch tensors
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

# Compute token embeddings (inference only, so no gradients)
with torch.no_grad():
    model_output = model(**encoded_input)

# Pool the token embeddings into one vector per sentence (mean pooling)
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Convert to a NumPy array before building the frame, so pandas gets a plain
# numeric 2-D array rather than individual tensor elements; the cast keeps the
# 64-bit float columns the cell produced before.
df = pd.DataFrame(sentence_embeddings.detach().cpu().numpy()).astype('float')

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.0746,-0.233042,-0.084994,0.076429,0.215108,0.662372,0.160069,-0.172681,0.19304,-0.431338,...,0.307956,0.13679,-0.092838,-0.06786,0.173766,0.075278,-0.084738,0.544805,0.672492,-0.230025
1,0.260372,0.253684,0.14473,0.156579,-0.163387,0.384969,0.456058,0.033556,-0.03084,-0.035321,...,0.125319,-0.011805,0.141031,-0.213023,0.021587,-0.168322,0.314002,0.306754,0.391649,-0.153518
2,0.23829,0.319653,0.261314,0.420612,0.145059,-0.204098,-0.01413,-0.325124,-0.013538,0.11972,...,-0.356111,-0.029595,-0.101007,-0.34251,-0.210141,0.188064,0.301274,0.282925,0.304315,0.553583


# Токенайзер

In [14]:
from transformers import AutoTokenizer

# Load the tokenizer published under the toxicity-classifier checkpoint id.
tokenizer = AutoTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')

In [15]:
# Show the token strings the sentence is split into (sub-word pieces carry a '##' prefix).
tokenizer.tokenize('У нас в есть убунты и текникал превью.')

['У',
 'нас',
 'в',
 'есть',
 'убу',
 '##нты',
 'и',
 'тек',
 '##ника',
 '##л',
 'превью',
 '.']

In [16]:
# Calling the tokenizer directly returns input_ids together with token_type_ids
# and attention_mask; the bare name on the last line displays the result.
encoding = tokenizer('У нас в есть убунты и текникал превью.')
encoding

{'input_ids': [101, 486, 1159, 340, 999, 63692, 10285, 322, 3100, 1352, 343, 85379, 132, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [17]:
# Map the ids back to text; the decoded string also shows the special
# [CLS] / [SEP] tokens that were added during encoding.
tokenizer.decode(encoding['input_ids'])

'[CLS] У нас в есть убунты и текникал превью. [SEP]'

In [66]:
# A short and a longer sentence, so the effect of padding is visible.
text = [
    'У нас в есть убунты',
    'Как минимум два малолетних дегенерата в треде, мда.',
]

# Encode both sentences as one batch: pad the shorter one to the length of the
# longest and truncate anything beyond 512 tokens.
encoding = tokenizer(text, padding=True, truncation=True, max_length=512)

encoding

{'input_ids': [[101, 486, 1159, 340, 999, 63692, 10285, 102, 0, 0, 0, 0, 0], [101, 1235, 3932, 1617, 53502, 97527, 303, 340, 39685, 128, 48557, 132, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

# ImageProcessor

In [6]:
from transformers import AutoImageProcessor
from PIL import Image
from io import BytesIO
import requests

# Download the sample image. The timeout stops a hung connection from stalling
# the cell forever, and raise_for_status() surfaces an HTTP error here instead
# of letting PIL fail later on an error page.
response = requests.get(
    'https://github.com/laxmimerit/dog-cat-full-dataset/blob/master/data/train/cats/cat.10055.jpg?raw=true',
    timeout=30)
response.raise_for_status()
img = Image.open(BytesIO(response.content))

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

# Convert the PIL image into a dict of PyTorch tensors ('pixel_values').
inputs = image_processor(img, return_tensors='pt')

inputs

{'pixel_values': tensor([[[[ 0.4275,  0.4275,  0.4196,  ...,  0.0902,  0.1216,  0.0667],
          [ 0.4431,  0.4353,  0.4118,  ...,  0.0902,  0.0588,  0.0118],
          [ 0.4431,  0.4353,  0.4039,  ...,  0.1686,  0.1059,  0.0431],
          ...,
          [-0.1373, -0.0745, -0.0431,  ...,  0.2941,  0.2863,  0.2627],
          [-0.1529, -0.1137, -0.0588,  ...,  0.2784,  0.2627,  0.2627],
          [-0.1529, -0.1294, -0.0745,  ...,  0.2706,  0.2392,  0.2392]],

         [[ 0.4275,  0.4431,  0.4588,  ...,  0.0275,  0.0667,  0.0588],
          [ 0.4431,  0.4510,  0.4510,  ...,  0.0275,  0.0039,  0.0039],
          [ 0.4431,  0.4431,  0.4431,  ...,  0.1059,  0.0510,  0.0275],
          ...,
          [-0.2392, -0.1765, -0.1451,  ...,  0.1922,  0.1922,  0.1765],
          [-0.2549, -0.2157, -0.1608,  ...,  0.1765,  0.1765,  0.1922],
          [-0.2549, -0.2314, -0.1765,  ...,  0.1686,  0.1529,  0.1765]],

         [[ 0.4431,  0.4510,  0.4275,  ..., -0.0902, -0.0824, -0.0980],
          [ 0