In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

# Константы
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_LENGTH = 128
BATCH_SIZE = 16
EPOCHS = 30
LEARNING_RATE = 2e-5
WARMUP_STEPS = 100
PATIENCE = 5
MAX_PRED_LABELS = 3

class TopicDataset(Dataset):
    def __init__(self, questions, labels, tokenizer, max_length):
        self.questions = list(questions)
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.questions)
    
    def __getitem__(self, idx):
        question = str(self.questions[idx])
        
        encoding = self.tokenizer.encode_plus(
            question,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.FloatTensor(self.labels[idx])
        }

class TopicAttention(nn.Module):
    def __init__(self, hidden_size):
        super(TopicAttention, self).__init__()
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.Tanh(),
            nn.Linear(hidden_size // 2, 1)
        )
    
    def forward(self, hidden_states):
        attention_weights = self.attention(hidden_states)
        attention_weights = torch.softmax(attention_weights, dim=1)
        attended_output = torch.sum(attention_weights * hidden_states, dim=1)
        return attended_output

class CustomLoss(nn.Module):
    def __init__(self):
        super(CustomLoss, self).__init__()
        self.bce = nn.BCEWithLogitsLoss(reduction='none')
        
    def forward(self, outputs, targets):
        # BCE loss
        bce_loss = self.bce(outputs, targets)
        
        # Штраф за количество предсказаний больше MAX_PRED_LABELS
        pred_probs = torch.sigmoid(outputs)
        num_preds = torch.sum(pred_probs > 0.5, dim=1)
        count_penalty = torch.relu(num_preds - MAX_PRED_LABELS) * 0.1
        
        # Штраф за отсутствие правильных меток в топ предсказаниях
        top_k_values, top_k_indices = torch.topk(pred_probs, k=MAX_PRED_LABELS, dim=1)
        correct_in_top_k = torch.zeros_like(num_preds, dtype=torch.float)
        
        for i in range(targets.size(0)):
            correct_in_top_k[i] = torch.sum(targets[i][top_k_indices[i]] == 1)
        
        accuracy_penalty = torch.exp(-correct_in_top_k) * 0.2
        
        total_loss = bce_loss.mean() + count_penalty.mean() + accuracy_penalty.mean()
        return total_loss

class TopicClassifier(nn.Module):
    def __init__(self, n_classes):
        super(TopicClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained('DeepPavlov/rubert-base-cased')
        
        # Заморозка BERT
        for param in self.bert.parameters():
            param.requires_grad = False
        
        # Разморозка только последних двух слоев
        for layer in self.bert.encoder.layer[-2:]:
            for param in layer.parameters():
                param.requires_grad = True
        
        hidden_size = self.bert.config.hidden_size
        
        self.attention = TopicAttention(hidden_size)
        
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size // 2, n_classes)
        )
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        )
        
        # Используем выходы последних 4 слоев
        last_hidden_states = torch.stack(outputs.hidden_states[-4:])
        last_hidden_states = last_hidden_states.permute(1, 0, 2, 3)
        batch_size = last_hidden_states.size(0)
        last_hidden_states = last_hidden_states.reshape(batch_size, -1, outputs.last_hidden_state.size(-1))
        
        # Применяем механизм внимания
        attended_output = self.attention(last_hidden_states)
        
        # Классификация
        logits = self.classifier(attended_output)
        return logits

def train_epoch(model, data_loader, optimizer, criterion, device, scheduler=None):
    model.train()
    total_loss = 0
    
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        if scheduler:
            scheduler.step()
        
        total_loss += loss.item()
    
    return total_loss / len(data_loader)

In [18]:
def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            # Получаем top-k предсказаний
            probs = torch.sigmoid(outputs)
            top_k_values, top_k_indices = torch.topk(probs, k=MAX_PRED_LABELS, dim=1)
            
            # Создаем бинарную матрицу предсказаний
            batch_predictions = torch.zeros_like(probs)
            for i in range(batch_predictions.size(0)):
                batch_predictions[i][top_k_indices[i]] = 1
            
            total_loss += loss.item()
            all_predictions.extend(batch_predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    return total_loss / len(data_loader), np.array(all_predictions), np.array(all_labels)

def main(df):
    # Подготовка данных
    df = df.reset_index(drop=True)
    
    # Подготовка меток
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(df['name'].apply(lambda x: [x] if isinstance(x, str) else x))
    
    # Разделение данных
    train_idx, test_idx = train_test_split(df.index, test_size=0.2, random_state=42)
    train_idx, val_idx = train_test_split(train_idx, test_size=0.2, random_state=42)
    
    X_train = df.loc[train_idx, 'question']
    y_train = labels[train_idx]
    X_val = df.loc[val_idx, 'question']
    y_val = labels[val_idx]
    X_test = df.loc[test_idx, 'question']
    y_test = labels[test_idx]
    
    # Токенизатор
    tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
    
    # Создание датасетов
    train_dataset = TopicDataset(X_train, y_train, tokenizer, MAX_LENGTH)
    val_dataset = TopicDataset(X_val, y_val, tokenizer, MAX_LENGTH)
    test_dataset = TopicDataset(X_test, y_test, tokenizer, MAX_LENGTH)
    
    # Создание даталоадеров
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    # Инициализация модели
    model = TopicClassifier(len(mlb.classes_))
    model.to(DEVICE)
    
    # Оптимизатор и планировщик
    optimizer = torch.optim.AdamW(
        [
            {"params": model.bert.encoder.layer[-2:].parameters(), "lr": LEARNING_RATE},
            {"params": model.attention.parameters(), "lr": LEARNING_RATE * 2},
            {"params": model.classifier.parameters(), "lr": LEARNING_RATE * 2}
        ],
        weight_decay=0.01
    )
    
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_STEPS,
        num_training_steps=total_steps
    )
    
    criterion = CustomLoss()
    
    # Обучение с early stopping
    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None
    
    for epoch in range(EPOCHS):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, DEVICE, scheduler)
        val_loss, val_preds, val_true = evaluate(model, val_loader, criterion, DEVICE)
        
        print(f'Epoch {epoch + 1}/{EPOCHS}:')
        print(f'Train Loss: {train_loss:.4f}')
        print(f'Val Loss: {val_loss:.4f}')
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict().copy()
            patience_counter = 0
        else:
            patience_counter += 1
            
        if patience_counter >= PATIENCE:
            print(f'Early stopping triggered after epoch {epoch + 1}')
            break
    
    # Загрузка лучшей модели
    model.load_state_dict(best_model_state)
    
    # Получение предсказаний
    _, train_preds, train_true = evaluate(model, train_loader, criterion, DEVICE)
    _, val_preds, val_true = evaluate(model, val_loader, criterion, DEVICE)
    _, test_preds, test_true = evaluate(model, test_loader, criterion, DEVICE)
    
    def format_predictions(preds):
        return [mlb.classes_[np.where(pred == 1)[0]].tolist() for pred in preds]
    
    # Создание финального датафрейма
    results_df = pd.DataFrame({
        'question': list(X_test),
        'sample': ['test'] * len(X_test),
        'text': df.loc[test_idx, 'text'].values,
        'true_topic': [mlb.classes_[np.where(y)[0]].tolist() for y in test_true],
        'predicted_topic': format_predictions(test_preds)
    })
    
    train_results = pd.DataFrame({
        'question': list(X_train),
        'sample': ['train'] * len(X_train),
        'text': df.loc[train_idx, 'text'].values,
        'true_topic': [mlb.classes_[np.where(y)[0]].tolist() for y in train_true],
        'predicted_topic': format_predictions(train_preds)
    })
    
    val_results = pd.DataFrame({
        'question': list(X_val),
        'sample': ['val'] * len(X_val),
        'text': df.loc[val_idx, 'text'].values,
        'true_topic': [mlb.classes_[np.where(y)[0]].tolist() for y in val_true],
        'predicted_topic': format_predictions(val_preds)
    })
    
    final_df = pd.concat([train_results, val_results, results_df], axis=0, ignore_index=True)
    
    # Вычисление финальных метрик
    test_f1 = f1_score(test_true, test_preds, average='macro')
    print(f"\nФинальный F1-score на тестовой выборке: {test_f1:.4f}")
    
    return final_df, model, test_f1

def analyze_predictions(df):
    print("\nАнализ предсказаний:")
    
    # Подсчет точности попадания истинного топика в предсказанные
    def has_correct_topic(row):
        return any(topic in row['predicted_topic'] for topic in row['true_topic'])
    
    df['has_correct'] = df.apply(has_correct_topic, axis=1)
    accuracy = df['has_correct'].mean()
    print(f"\nТочность попадания истинного топика в предсказанные: {accuracy:.4f}")
    
    print("\nРаспределение количества предсказанных топиков:")
    df['num_predicted_topics'] = df['predicted_topic'].apply(len)
    print(df.groupby('sample')['num_predicted_topics'].describe())
    
    print("\nПримеры предсказаний:")
    for sample_type in ['train', 'val', 'test']:
        print(f"\n{sample_type.upper()} примеры:")
        sample_predictions = df[df['sample'] == sample_type].sample(min(3, len(df[df['sample'] == sample_type])))
        for _, row in sample_predictions.iterrows():
            print(f"\nВопрос: {row['question']}")
            print(f"Истинные топики: {row['true_topic']}")
            print(f"Предсказанные топики: {row['predicted_topic']}")
            print(f"Правильное предсказание: {'Да' if row['has_correct'] else 'Нет'}")

if __name__ == "__main__":
    df = pd.read_csv('/kaggle/input/texts-with-answers/texts_with_answers.csv')
    results_df, model, test_f1 = main(df)
    analyze_predictions(results_df)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/30:
Train Loss: 1.0854
Val Loss: 1.0509
Epoch 2/30:
Train Loss: 1.0785
Val Loss: 0.9221
Epoch 3/30:
Train Loss: 1.0297
Val Loss: 0.8537
Epoch 4/30:
Train Loss: 0.9374
Val Loss: 0.7658
Epoch 5/30:
Train Loss: 0.8319
Val Loss: 0.7159
Epoch 6/30:
Train Loss: 0.7554
Val Loss: 0.6590
Epoch 7/30:
Train Loss: 0.6798
Val Loss: 0.6012
Epoch 8/30:
Train Loss: 0.6482
Val Loss: 0.5607
Epoch 9/30:
Train Loss: 0.5944
Val Loss: 0.5097
Epoch 10/30:
Train Loss: 0.5625
Val Loss: 0.4903
Epoch 11/30:
Train Loss: 0.5306
Val Loss: 0.4816
Epoch 12/30:
Train Loss: 0.5242
Val Loss: 0.4725
Epoch 13/30:
Train Loss: 0.5038
Val Loss: 0.4667
Epoch 14/30:
Train Loss: 0.4861
Val Loss: 0.4593
Epoch 15/30:
Train Loss: 0.4818
Val Loss: 0.4530
Epoch 16/30:
Train Loss: 0.4881
Val Loss: 0.4518
Epoch 17/30:
Train Loss: 0.4715
Val Loss: 0.4508
Epoch 18/30:
Train Loss: 0.4560
Val Loss: 0.4455
Epoch 19/30:
Train Loss: 0.4667
Val Loss: 0.4517
Epoch 20/30:
Train Loss: 0.4559
Val Loss: 0.4421
Epoch 21/30:
Train Loss: 0.44

In [21]:
len(results_df[(results_df['sample'] == 'test') &
            (results_df['has_correct'] == True)]) / len(results_df[(results_df['sample'] == 'test')])

0.4583333333333333

In [22]:
len(results_df[(results_df['sample'] == 'train') &
            (results_df['has_correct'] == True)]) / len(results_df[(results_df['sample'] == 'train')])

0.8552631578947368

In [23]:
len(results_df[(results_df['sample'] == 'val') &
            (results_df['has_correct'] == True)]) / len(results_df[(results_df['sample'] == 'val')])

0.5