In [None]:
!pip install transformers
!pip install datasets



# подготовка данных

In [1]:
# тут мы подгружаем датасет
import pandas as pd
from datasets import Dataset

# df_0k1k = pd.read_csv('cases_random_0kto1k.csv')
# df_1k2k = pd.read_csv('cases_random_1kto2k.csv')
# df_2k3k = pd.read_csv('cases_random_2kto3k.csv')
# df_3k4k = pd.read_csv('cases_random_3kto4k.csv')
# df_4k5k = pd.read_csv('cases_random_4kto5k.csv')
# df_5k6k = pd.read_csv('cases_random_5kto6k.csv')
# df_6k7k = pd.read_csv('cases_random_6kto7k.csv')
# df_7k8k = pd.read_csv('cases_random_7kto8k.csv')
# df_8k9k = pd.read_csv('cases_random_8kto9k.csv')
df_9k10k = pd.read_csv('cases_random_9kto10k.csv')

In [2]:
# df_50 = pd.read_csv('example50.csv')  # сейчас тут 50 строк, нужно добавить тексты приговоров до 150
# df_100 = pd.read_csv('example100.csv')
df_150 = pd.read_csv('example150.csv')

In [3]:
# тут мы выделяем последнюю часть текста, в которой содержится информация о статьях и наказании
import re

def extract_text_part(text):
    import re
    if not isinstance(text, str):
        return ""
    match1 = re.search(r"(составляет|является|составляло|являлось|)\s*[:]", text.lower())
    if match1:
      match2 = re.search(r"(размер|размера|размером)", text[match1.end():].lower())
    if match1 and match2:
        return text[match1.end():match2.start()].strip()
    else:
        return text  # если нет, брать весь текст

# подключение и обучение модели

In [4]:
# это чтобы не подключаться к WabdB и чтобы он не беспокоил просьбами об апи ключе
import os
os.environ["WANDB_DISABLED"] = "true"

In [6]:
# подключаемся к большой модели, обученной на русском языке, с пайпланом классификации текстов
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=3)

# Метки
label2id = {
    "значительный": 0,
    "крупный": 1,
    "особо крупный": 2
}
id2label = {v: k for k, v in label2id.items()}

def preprocess(example):
    text = extract_text_part(example['result_text'])
    encoding = tokenizer(text, truncation=True, padding='max_length', max_length=512)
    encoding['labels'] = label2id[example['drug_amount']]
    return encoding

# берем из размеченного нами датасета только текст приговора и нужную сейчас характеристику (amount)
train_dataset = Dataset.from_pandas(df_150[['result_text', 'drug_amount']])
train_dataset = train_dataset.map(preprocess, batched=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/149 [00:00<?, ? examples/s]

In [7]:
import torch

# class weights — инверсно пропорционально частотам
class_counts = torch.tensor([110, 39, 0], dtype=torch.float)
class_weights = 1.0 / (class_counts + 1)
class_weights = class_weights / class_weights.sum()  # нормировка

class_weights = class_weights.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [8]:
# настройка обучения
from transformers import Trainer
from torch.nn import CrossEntropyLoss

training_args = TrainingArguments(
    output_dir="./results",
    # evaluation_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


TrainOutput(global_step=95, training_loss=0.6918534128289474, metrics={'train_runtime': 79.2026, 'train_samples_per_second': 9.406, 'train_steps_per_second': 1.199, 'total_flos': 196019496207360.0, 'train_loss': 0.6918534128289474, 'epoch': 5.0})

In [9]:
# сохраняем модель и токенизатор, чтобы потом снова не обучать (ну, придется обучить еще раз на датасете из 150 кейсов)
model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")

('./my_model/tokenizer_config.json',
 './my_model/special_tokens_map.json',
 './my_model/vocab.txt',
 './my_model/added_tokens.json')

# загрузка нашей обученной модели и предсказание значений

In [10]:
# загружаем нашу обученную на примере модель
from transformers import BertTokenizer, BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("./my_model")
tokenizer = BertTokenizer.from_pretrained("./my_model")
model.eval()  # переводим в режим инференса

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [11]:
# предсказываем значение характеристики для неразмеченного большого датасета
from torch import no_grad

def predict_amount(text):
    text = extract_text_part(text)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=512)
    with no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = logits.argmax().item()
    return id2label[predicted_class_id]



In [None]:
# 1k
big_df = df_0k1k
big_df['predicted_amount'] = big_df['result_text'].apply(predict_amount)
# big_df.to_csv("predicted_df_0k1k.csv", index=False)

predicted_amount_df_0k1k = big_df[['id', 'predicted_amount']]
predicted_amount_df_0k1k.to_csv("predicted_amount_0k1k.csv", index=False)

In [None]:
# 1k to 2k
big_df = df_1k2k
big_df['predicted_amount'] = big_df['result_text'].apply(predict_amount)
# big_df.to_csv("predicted_df_1k2k.csv", index=False)

predicted_amount_df_1k2k = big_df[['id', 'predicted_amount']]
predicted_amount_df_1k2k.to_csv("predicted_amount_1k2k.csv", index=False)

In [None]:
# 2k to 3k
big_df = df_2k3k
big_df['predicted_amount'] = big_df['result_text'].apply(predict_amount)
# big_df.to_csv("predicted_df_2k3k.csv", index=False)

predicted_amount_df_2k3k = big_df[['id', 'predicted_amount']]
predicted_amount_df_2k3k.to_csv("predicted_amount_2k3k.csv", index=False)

In [None]:
# 3k to 4k
big_df = df_3k4k
big_df['predicted_amount'] = big_df['result_text'].apply(predict_amount)
# big_df.to_csv("predicted_df_3k4k.csv", index=False)

predicted_amount_df_3k4k = big_df[['id', 'predicted_amount']]
predicted_amount_df_3k4k.to_csv("predicted_amount_3k4k.csv", index=False)

In [None]:
# 4k to 5k
big_df = df_4k5k
big_df['predicted_amount'] = big_df['result_text'].apply(predict_amount)
# big_df.to_csv("predicted_df_4k5k.csv", index=False)

predicted_amount_df_4k5k = big_df[['id', 'predicted_amount']]
predicted_amount_df_4k5k.to_csv("predicted_amount_4k5k.csv", index=False)

In [None]:
# 5k to 6k
big_df = df_5k6k
big_df['predicted_amount'] = big_df['result_text'].apply(predict_amount)
# big_df.to_csv("predicted_df_5k6k.csv", index=False)

predicted_amount_df_5k6k = big_df[['id', 'predicted_amount']]
predicted_amount_df_5k6k.to_csv("predicted_amount_5k6k.csv", index=False)

In [None]:
# 6k to 7k
big_df = df_6k7k
big_df['predicted_amount'] = big_df['result_text'].apply(predict_amount)
# big_df.to_csv("predicted_df_6k7k.csv", index=False)

predicted_amount_df_6k7k = big_df[['id', 'predicted_amount']]
predicted_amount_df_6k7k.to_csv("predicted_amount_6k7k.csv", index=False)

In [None]:
# 7k to 8k
big_df = df_7k8k
big_df['predicted_amount'] = big_df['result_text'].apply(predict_amount)
# big_df.to_csv("predicted_df_7k8k.csv", index=False)

predicted_amount_df_7k8k = big_df[['id', 'predicted_amount']]
predicted_amount_df_7k8k.to_csv("predicted_amount_7k8k.csv", index=False)

In [None]:
# 8k to 9k
big_df = df_8k9k
big_df['predicted_amount'] = big_df['result_text'].apply(predict_amount)
# big_df.to_csv("predicted_df_8k9k.csv", index=False)

predicted_amount_df_8k9k = big_df[['id', 'predicted_amount']]
predicted_amount_df_8k9k.to_csv("predicted_amount_8k9k.csv", index=False)

In [12]:
# 9k to 10k
big_df = df_9k10k
big_df['predicted_amount'] = big_df['result_text'].apply(predict_amount)
# big_df.to_csv("predicted_df_9k10k.csv", index=False)

predicted_amount_df_9k10k = big_df[['id', 'predicted_amount']]
predicted_amount_df_9k10k.to_csv("predicted_amount_9k10k.csv", index=False)

In [None]:
# for 100 cases
big_df = df_100
big_df['predicted_amount'] = big_df['result_text'].apply(predict_amount)
# big_df.to_csv("predicted_df_ex100.csv", index=False)

predicted_df_100 = big_df
predicted_amount_df_ex100 = big_df[['id', 'predicted_amount']]
predicted_amount_df_ex100.to_csv("predicted_amount_ex100.csv", index=False)

In [None]:
# for 150 cases
big_df = df_150
big_df['predicted_amount'] = big_df['result_text'].apply(predict_amount)
# big_df.to_csv("predicted_df_ex150.csv", index=False)

predicted_df_150 = big_df
predicted_amount_df_ex150 = big_df[['id', 'predicted_amount']]
predicted_amount_df_ex150.to_csv("predicted_amount_ex150.csv", index=False)

# расчёт метрик accuracy и macro F1

In [None]:
# расчет таких метрик не актуален, потому что у нас нет колонки с истинными значениями
# можем посчитать эти метрики только для других размеченных частей (например, обучить на 50 и проверить на 100 строках)

from sklearn.metrics import accuracy_score, f1_score, classification_report

# accuracy
accuracy = accuracy_score(big_df['drug_amount'], big_df['predicted_amount'])

# F1 (макроусреднение – чтобы учитывать все классы одинаково)
f1 = f1_score(big_df['drug_amount'], big_df['predicted_amount'], average='macro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Macro F1-score: {f1:.4f}")

# более подробный отчёт по каждому классу:
print(classification_report(big_df['drug_amount'], big_df['predicted_amount']))

Accuracy: 0.9060
Macro F1-score: 0.8607
              precision    recall  f1-score   support

значительный       0.89      1.00      0.94       110
     крупный       1.00      0.64      0.78        39

    accuracy                           0.91       149
   macro avg       0.94      0.82      0.86       149
weighted avg       0.92      0.91      0.90       149

