# Восстановление пробелов в тексте

Подготовим импорты

In [3]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Using cached https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu118/torchvision-0.22.1%2Bcu118-cp311-cp311-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.7.1%2Bcu118-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Using cached https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp311-cp311-win_amd64.whl (2817.2 MB)
Using cached https://download.pytorch.org/whl/cu118/torchvision-0.22.1%2Bcu118-cp311-cp311-win_amd64.whl (5.5 MB)
Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.7.1%2Bcu118-cp311-cp311-win_amd64.whl (4.1 MB)
Installing collected packages: torch, torchvision, torchaudio

   ---------------------------------------- 0/3 [torch]
   ---------------------------------------- 0/3 [torch]
   

In [4]:
import os
import glob
from bs4 import BeautifulSoup
import json
import random
import pandas as pd
from razdel import sentenize
import numpy as np
from sklearn.model_selection import train_test_split
import joblib
import ast
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification
from torch.optim import AdamW 
from sklearn.metrics import f1_score, precision_score, recall_score
from tqdm import tqdm

Создание корпуса для обучения

In [42]:
input_folder = "news_raw/texts"
output_file = "corpus.txt"

all_files = glob.glob(os.path.join(input_folder, "**/*.txt"), recursive=True)
lines = []

for f in all_files:
    with open(f, "r", encoding="utf-8") as file:
        html = file.read()
        soup = BeautifulSoup(html, "html.parser")

        # Удаляем script и style
        for tag in soup(["script", "style"]):
            tag.decompose()

        text = soup.get_text(separator=" ", strip=True)

        # Убираем лишние символы переноса строк
        text = text.replace('\u2028',' ').replace('\u2029',' ')

        # Сжимаем множественные пробелы в один
        text = " ".join(text.split())

        if text:
            lines.append(text)

with open(output_file, "w", encoding="utf-8") as f:
    for line in lines:
        f.write(line + "\n")

print("corpus.txt готов! Всего строк:", len(lines))

corpus.txt готов! Всего строк: 36445


Подготовим корпус к обучению

In [6]:
def positions_from_text(s):
    """Возвращает индексы пробелов в строке"""
    return [i+1 for i, c in enumerate(s[:-1]) if s[i+1].isspace()]

def remove_spaces(s):
    """Удаляет пробелы"""
    return s.replace(" ", "")

def extract_sentences_from_file(filename):
    """Читает файл и разбивает на предложения с помощью razdel"""
    sentences = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            for sent in sentenize(line):
                text = sent.text.strip()
                if len(text) > 3:  # минимальная длина предложения
                    sentences.append(text)
    return sentences

def generate_synthetic_dataset(sentences, seed=42, val_size=0.1, test_size=0.1):
    random.seed(seed)
    data = []
    for i, line in enumerate(sentences):
        line_norm = " ".join(line.split())  # убираем лишние пробелы
        no_space = remove_spaces(line_norm)
        positions = positions_from_text(line_norm)
        data.append({
            "id": i,
            "text": line_norm,
            "no_space": no_space,
            "space_positions": json.dumps(positions)
        })

    df = pd.DataFrame(data)
    # Делим на train/val/test
    train_df, temp_df = train_test_split(df, test_size=val_size+test_size, random_state=seed)
    val_ratio = val_size / (val_size + test_size)
    val_df, test_df = train_test_split(temp_df, test_size=1-val_ratio, random_state=seed)

    return train_df, val_df, test_df

In [7]:
corpus_file = "corpus.txt"  
if not os.path.exists(corpus_file):
    raise FileNotFoundError(f"Файл {corpus_file} не найден!")

print("Извлекаем предложения...")
sentences = extract_sentences_from_file(corpus_file)
print(f"Всего предложений: {len(sentences)}")

print("Генерируем синтетический датасет...")
train_df, val_df, test_df = generate_synthetic_dataset(sentences)
os.makedirs("data", exist_ok=True)
train_df.to_csv("data/train.csv", index=False)
val_df.to_csv("data/val.csv", index=False)
test_df.to_csv("data/test.csv", index=False)

print("Синтетический датасет создан!")
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Извлекаем предложения...
Всего предложений: 293223
Генерируем синтетический датасет...
Синтетический датасет создан!
Train: 234578, Val: 29322, Test: 29323


Так как данные из новостей, они могут пересекаться, проверим и удалим дубликаты во всех датасетах

In [8]:
def check_dataset_overlap(train_file, val_file, test_file):
    train_df = pd.read_csv(train_file)
    val_df = pd.read_csv(val_file)
    test_df = pd.read_csv(test_file)

    datasets = {"train": train_df, "val": val_df, "test": test_df}
    pairs = [("train", "val"), ("train", "test"), ("val", "test")]

    for a, b in pairs:
        df_a, df_b = datasets[a], datasets[b]

        id_overlap = set(df_a['id']).intersection(df_b['id'])
        text_overlap = set(df_a['text']).intersection(df_b['text'])

        print(f"\nПересечение {a} & {b}:")
        print(f"  ID пересекаются: {len(id_overlap)}")
        print(f"  Text пересекаются: {len(text_overlap)}")

check_dataset_overlap(
    "data/train.csv",
    "data/val.csv",
    "data/test.csv"
)


Пересечение train & val:
  ID пересекаются: 0
  Text пересекаются: 595

Пересечение train & test:
  ID пересекаются: 0
  Text пересекаются: 603

Пересечение val & test:
  ID пересекаются: 0
  Text пересекаются: 130


In [9]:
val_df = val_df[~val_df['text'].isin(train_df['text'])]
test_df = test_df[~test_df['text'].isin(train_df['text'])]
test_df = test_df[~test_df['text'].isin(val_df['text'])]

val_df.to_csv("data/val.csv", index=False)
test_df.to_csv("data/test.csv", index=False)

print("Пересечения удалены. Новые размеры:")
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Пересечения удалены. Новые размеры:
Train: 234578, Val: 28628, Test: 28568


Подготовка к обучению

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device:", device)

MAX_LEN = 128    
BATCH_SIZE = 16
EPOCHS = 50

CHECKPOINT_DIR = "./checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

Using device: cuda


In [11]:
train_df = pd.read_csv("data/train.csv")
val_df   = pd.read_csv("data/val.csv")
test_df  = pd.read_csv("data/test.csv") 

In [12]:
train_df.sample(20)

Unnamed: 0,id,text,no_space,space_positions
10782,221169,"Атмосфера Земли, для сравнения, состоит на 21 ...","АтмосфераЗемли,длясравнения,состоитна21процент...","[9, 16, 20, 31, 39, 42, 45, 53, 56]"
57340,172891,"Компания также заверила, что этот факт не окаж...","Компаниятакжезаверила,чтоэтотфактнеокажетнегат...","[8, 14, 24, 28, 33, 38, 41, 48, 60, 72, 75, 83..."
26451,184070,Об этом сообщается на сайте кинотеатра.,Обэтомсообщаетсянасайтекинотеатра.,"[2, 7, 18, 21, 27]"
47327,31904,"По данным газеты ""Комсомольская правда"", его з...","Поданнымгазеты""Комсомольскаяправда"",егозадержа...","[2, 9, 16, 31, 40, 44, 54, 58, 66, 74, 82, 92,..."
161950,22579,Об этом заявил министр обороны временного прав...,Обэтомзаявилминистроборонывременногоправительс...,"[2, 7, 14, 22, 30, 41, 55, 62, 75, 79, 89, 99,..."
28327,59409,"При этом не уточняется, почему для ответной ак...","Приэтомнеуточняется,почемудляответнойакциипотр...","[3, 8, 11, 23, 30, 34, 43, 49, 63, 72, 75, 84,..."
194763,221179,"Кроме проекта MOXIE, на ней будут представлены...","КромепроектаMOXIE,нанейбудутпредставленыдругие...","[5, 13, 20, 23, 27, 33, 46, 53, 64, 67, 80, 89]"
27717,211403,Прибыль в расчете на одну акцию выросла на три...,Прибыльврасчетенаоднуакциювыросланатрипроцента...,"[7, 9, 17, 20, 25, 31, 39, 42, 46, 56, 59, 64,..."
152339,72669,"Впрочем, гендиректор ТНТ Роман Петренко опрове...","Впрочем,гендиректорТНТРоманПетренкоопровергэту...","[8, 20, 24, 30, 39, 48, 52, 66, 73, 80, 87, 96..."
70673,118406,Данные о численности обеих акций были озвучены...,Данныеочисленностиобеихакцийбылиозвученывластя...,"[6, 8, 20, 26, 32, 37, 46, 55, 62, 65, 71, 77,..."


Так как в тестовом файле от Авито в основном короткие словосочетания, такие как книгахорошая, оставим из исходного датасета только короткие строки

In [13]:
train_df = train_df[train_df['no_space'].str.len() <= MAX_LEN/2].copy()
val_df = val_df[val_df['no_space'].str.len() <= MAX_LEN/2].copy()
test_df = test_df[test_df['no_space'].str.len() <= MAX_LEN/2].copy()
print(f"Количество коротких строк: {len(train_df)}")
print(f"Количество коротких строк: {len(val_df)}")
print(f"Количество коротких строк: {len(test_df)}")

Количество коротких строк: 36160
Количество коротких строк: 4285
Количество коротких строк: 4023


In [14]:
train_df.sample(20)

Unnamed: 0,id,text,no_space,space_positions
44166,233020,Об этом сообщает Bloomberg.,ОбэтомсообщаетBloomberg.,"[2, 7, 16]"
228459,43539,"Позднее, правда, сайт возобновил работу.","Позднее,правда,сайтвозобновилработу.","[8, 16, 21, 32]"
33990,67038,"Отмечается лишь, что женщина - московская домо...","Отмечаетсялишь,чтоженщина-московскаядомохозяйка.","[10, 16, 20, 28, 30, 41]"
201703,109444,Эту величину они получили раньше срока - в сер...,Этувеличинуониполучилираньшесрока-всерединеиюн...,"[3, 12, 16, 25, 32, 38, 40, 42, 51, 56, 61]"
160018,236352,"«Не все», — ответил президент.","«Невсе»,—ответилпрезидент.","[3, 9, 11, 19]"
39875,16520,Ракеты могут быть пущены с любого направления ...,Ракетымогутбытьпущеныслюбогонаправленияподлюбы...,"[6, 12, 17, 24, 26, 33, 45, 49, 55, 61]"
111376,223178,"Среди кредиторов — Сбербанк, Райффайзенбанк, Б...","Средикредиторов—Сбербанк,Райффайзенбанк,БанкМо...","[5, 16, 18, 28, 44, 49]"
93677,258502,Штурмовики станцевали перед зданием лезгинку.,Штурмовикистанцевалипередзданиемлезгинку.,"[10, 21, 27, 35]"
97177,73249,Свою дебютную пластинку коллектив выпустил в 1...,Своюдебютнуюпластинкуколлективвыпустилв1984году.,"[4, 13, 23, 33, 42, 44, 49]"
165129,147587,"В интервью ""Дождю"" он называл Шойгу своим крес...","Винтервью""Дождю""онназывалШойгусвоимкрестнымотц...","[1, 10, 18, 21, 29, 35, 41, 50, 56, 58]"


Подготовка данных и датасета для BERT

In [15]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class SpaceDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=MAX_LEN):
        self.texts = df['no_space'].tolist()
        self.labels = []
        for pos_str, txt in zip(df['space_positions'], df['no_space']):
            pos = [int(p) for p in pos_str.strip("[]").split(",") if p]
            label = [0]*len(txt)
            for p in pos:
                if p < len(label):
                    label[p] = 1
            if len(label) > max_len:
                label = label[:max_len]
            self.labels.append(label)

        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        txt = self.texts[idx]
        label = self.labels[idx]
        enc = self.tokenizer(txt, truncation=True, max_length=self.max_len, padding='max_length', return_tensors='pt')
        input_ids = enc['input_ids'].squeeze(0)
        attention_mask = enc['attention_mask'].squeeze(0)
        labels = torch.tensor(label[:self.max_len], dtype=torch.long)
        if len(labels) < self.max_len:
            pad_len = self.max_len - len(labels)
            labels = torch.cat([labels, torch.zeros(pad_len, dtype=torch.long)])
        return input_ids, attention_mask, labels

train_dataset = SpaceDataset(train_df, tokenizer)
val_dataset   = SpaceDataset(val_df, tokenizer)
train_loader  = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader    = DataLoader(val_dataset, batch_size=BATCH_SIZE)

Архитектура и инициализация модели

In [16]:
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [17]:
optimizer = AdamW(model.parameters(), lr=3e-5)
criterion = nn.CrossEntropyLoss()

Функции для обучения и валидации

In [18]:
def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in tqdm(loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
        loss = criterion(outputs.view(-1, 2), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def validate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
            loss = criterion(outputs.view(-1, 2), labels.view(-1))
            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=-1)
            mask = attention_mask.bool()
            all_preds.extend(preds[mask].cpu().numpy())
            all_labels.extend(labels[mask].cpu().numpy())

    f1 = f1_score(all_labels, all_preds)
    p = precision_score(all_labels, all_preds)
    r = recall_score(all_labels, all_preds)
    return total_loss / len(loader), f1, p, r


Обучение модели

In [None]:
best_f1 = 0
epochs_no_improve = 0
patience = 7
min_delta = 0.001

for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, f1, p, r = validate(model, val_loader, criterion, device)
    
    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}, F1={f1:.4f}, P={p:.4f}, R={r:.4f}")

    # Проверка на улучшение F1
    if f1 - best_f1 > min_delta:
        best_f1 = f1
        epochs_no_improve = 0
        torch.save(model.state_dict(), os.path.join(CHECKPOINT_DIR, "best_epoch_5.pt"))
        print("Saved best checkpoint.")
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print(f"No improvement for {patience} epochs. Stopping training.")
        break

    # уменьшение learning rate, если нет улучшений
    if epochs_no_improve > 0 and epochs_no_improve % 2 == 0:
        for g in optimizer.param_groups:
            g['lr'] *= 0.5
        print(f"Learning rate reduced to {optimizer.param_groups[0]['lr']:.6f}")

100%|██████████| 2260/2260 [04:14<00:00,  8.89it/s]


Epoch 1: Train Loss=0.1175, Val Loss=0.0943, F1=0.4697, P=0.8176, R=0.3295
Saved best checkpoint.


100%|██████████| 2260/2260 [04:11<00:00,  8.97it/s]


Epoch 2: Train Loss=0.0883, Val Loss=0.0776, F1=0.6048, P=0.8208, R=0.4788
Saved best checkpoint.


100%|██████████| 2260/2260 [04:18<00:00,  8.75it/s]


Epoch 3: Train Loss=0.0752, Val Loss=0.0697, F1=0.6978, P=0.7819, R=0.6300
Saved best checkpoint.


100%|██████████| 2260/2260 [04:18<00:00,  8.74it/s]


Epoch 4: Train Loss=0.0655, Val Loss=0.0628, F1=0.7403, P=0.7977, R=0.6906
Saved best checkpoint.


100%|██████████| 2260/2260 [04:20<00:00,  8.68it/s]


Epoch 5: Train Loss=0.0579, Val Loss=0.0585, F1=0.7566, P=0.8271, R=0.6972
Saved best checkpoint.


100%|██████████| 2260/2260 [04:23<00:00,  8.58it/s]


Epoch 6: Train Loss=0.0521, Val Loss=0.0549, F1=0.7914, P=0.8382, R=0.7496
Saved best checkpoint.


100%|██████████| 2260/2260 [04:23<00:00,  8.59it/s]


Epoch 7: Train Loss=0.0474, Val Loss=0.0601, F1=0.7922, P=0.8143, R=0.7713


100%|██████████| 2260/2260 [04:23<00:00,  8.58it/s]


Epoch 8: Train Loss=0.0433, Val Loss=0.0556, F1=0.8097, P=0.8387, R=0.7826
Saved best checkpoint.


100%|██████████| 2260/2260 [04:22<00:00,  8.60it/s]


Epoch 9: Train Loss=0.0397, Val Loss=0.0545, F1=0.8250, P=0.8448, R=0.8061
Saved best checkpoint.


100%|██████████| 2260/2260 [04:19<00:00,  8.71it/s]


Epoch 10: Train Loss=0.0370, Val Loss=0.0531, F1=0.8315, P=0.8557, R=0.8085
Saved best checkpoint.


100%|██████████| 2260/2260 [04:16<00:00,  8.81it/s]


Epoch 11: Train Loss=0.0340, Val Loss=0.0537, F1=0.8430, P=0.8681, R=0.8193
Saved best checkpoint.


100%|██████████| 2260/2260 [04:10<00:00,  9.03it/s]


Epoch 12: Train Loss=0.0317, Val Loss=0.0557, F1=0.8396, P=0.8619, R=0.8185


100%|██████████| 2260/2260 [04:12<00:00,  8.94it/s]


Epoch 13: Train Loss=0.0303, Val Loss=0.0573, F1=0.8169, P=0.8416, R=0.7936
Learning rate reduced to 0.000015


100%|██████████| 2260/2260 [04:10<00:00,  9.02it/s]


Epoch 14: Train Loss=0.0230, Val Loss=0.0497, F1=0.8643, P=0.8809, R=0.8484
Saved best checkpoint.


100%|██████████| 2260/2260 [04:10<00:00,  9.02it/s]


Epoch 15: Train Loss=0.0203, Val Loss=0.0506, F1=0.8673, P=0.8824, R=0.8526
Saved best checkpoint.


100%|██████████| 2260/2260 [04:11<00:00,  9.00it/s]


Epoch 16: Train Loss=0.0191, Val Loss=0.0514, F1=0.8626, P=0.8796, R=0.8463


100%|██████████| 2260/2260 [04:11<00:00,  8.99it/s]


Epoch 17: Train Loss=0.0178, Val Loss=0.0574, F1=0.8673, P=0.8779, R=0.8569
Learning rate reduced to 0.000008


100%|██████████| 2260/2260 [04:13<00:00,  8.90it/s]


Epoch 18: Train Loss=0.0147, Val Loss=0.0564, F1=0.8720, P=0.8813, R=0.8628
Saved best checkpoint.


100%|██████████| 2260/2260 [04:11<00:00,  8.98it/s]


Epoch 19: Train Loss=0.0135, Val Loss=0.0589, F1=0.8718, P=0.8835, R=0.8603


100%|██████████| 2260/2260 [04:11<00:00,  8.98it/s]


Epoch 20: Train Loss=0.0128, Val Loss=0.0616, F1=0.8743, P=0.8821, R=0.8666
Saved best checkpoint.


 44%|████▍     | 997/2260 [01:53<02:23,  8.80it/s]


KeyboardInterrupt: 

Функция для проверки метрики на тестовом датасете

In [19]:
def evaluate_file(model, tokenizer, df, max_len=MAX_LEN):
    if 'no_space' not in df.columns:
        df['no_space'] = df['tetext_no_spaces'] 
    if 'space_positions' not in df.columns and 'true_positions' in df.columns:
        df['space_positions'] = df['true_positions']

    dataset = SpaceDataset(df, tokenizer, max_len=max_len)
    loader = DataLoader(dataset, batch_size=16)
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
            preds = torch.argmax(outputs, dim=-1)

            mask = attention_mask.bool()
            all_preds.extend(preds[mask].cpu().numpy())
            all_labels.extend(labels[mask].cpu().numpy())

    f1 = f1_score(all_labels, all_preds)
    p = precision_score(all_labels, all_preds)
    r = recall_score(all_labels, all_preds)
    print(f"Evaluation: F1={f1:.4f}, Precision={p:.4f}, Recall={r:.4f}")

In [20]:
best_model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=2)
best_model.load_state_dict(torch.load(os.path.join(CHECKPOINT_DIR, "best_epoch_5.pt"), map_location=device))
best_model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [21]:
evaluate_file(best_model, tokenizer, test_df)

Evaluation: F1=0.8725, Precision=0.8794, Recall=0.8657


Обработка файла задания от Авито

In [24]:
with open("dataset_1937770_3.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Заменяем первую запятую в каждой строке на точку с запятой
corrected_lines = []
for line in lines:
    # Разделяем строку по первой запятой
    parts = line.split(",", 1)
    if len(parts) == 2:
        # Объединяем с разделителем ';'
        corrected_line = f"{parts[0]};{parts[1]}"
        corrected_lines.append(corrected_line)
    else:
        # Если строка не содержит запятых (например, заголовок), оставляем как есть
        corrected_lines.append(line)

with open("corrected_dataset.txt", "w", encoding="utf-8") as file:
    file.writelines(corrected_lines)

task_data = pd.read_csv("corrected_dataset.txt", sep=";", encoding="utf-8")

In [25]:
task_data

Unnamed: 0,id,text_no_spaces
0,0,куплюайфон14про
1,1,ищудомвПодмосковье
2,2,сдаюквартирусмебельюитехникой
3,3,новыйдивандоставканедорого
4,4,отдамдаромкошку
...,...,...
1000,1000,Янеусну.
1001,1001,Весна-яуженегреюпио.
1002,1002,Весна-скоровырастеттрава.
1003,1003,"Весна-выпосмотрите,каккрасиво."


Подготовка датасета для инференса

In [26]:
class TaskDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.texts = df['text_no_spaces'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        txt = self.texts[idx]
        enc = self.tokenizer(
            txt, truncation=True, max_length=self.max_len,
            padding='max_length', return_tensors='pt'
        )
        input_ids = enc['input_ids'].squeeze(0)
        attention_mask = enc['attention_mask'].squeeze(0)
        return input_ids, attention_mask, txt

task_dataset = TaskDataset(task_data, tokenizer, MAX_LEN)
task_loader = DataLoader(task_dataset, batch_size=BATCH_SIZE)

Загрузка сохраненной модели и подготовка ее к инференсу

In [27]:
model.load_state_dict(torch.load(os.path.join(CHECKPOINT_DIR, "best_epoch_5.pt")))
model.to(device)
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

Инференс модели и восстановление текста с пробелами

In [28]:
pred_positions = []
pred_texts = []

with torch.no_grad():
    for input_ids, attention_mask, texts in task_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits
        preds = torch.argmax(outputs, dim=-1)

        for i, txt in enumerate(texts):
            mask = attention_mask[i].bool()
            pred_labels = preds[i][mask].cpu().numpy()
            positions = [j for j, label in enumerate(pred_labels) if label == 1]

            # Восстанавливаем текст с пробелами
            new_text = ""
            for idx, char in enumerate(txt):
                if idx in positions:
                    new_text += " "
                new_text += char

            pred_positions.append(positions)
            pred_texts.append(new_text)

task_data['predicted_positions'] = pred_positions
task_data['predicted_text'] = pred_texts

task_data.to_csv("submission.csv", index=False)
print("Predictions added to task_data and saved to submission.csv")

Predictions added to task_data and saved to submission.csv
