In [1]:
from transformers import BartForSequenceClassification, BartTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-xsum')

    def __getitem__(self, index):
        title = self.data[index]['title']
        text = self.data[index]['text']
        cluster = self.data[index]['cluster']
        input_ids = self.tokenizer.encode(title, text, add_special_tokens=True, truncation=True)
        input_ids = torch.tensor(input_ids)
        return input_ids, cluster

    def __len__(self):
        return len(self.data)

    def collate_fn(self, batch):
        input_ids, cluster = zip(*batch)
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        cluster = torch.tensor(cluster)
        return input_ids, cluster
    

In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv("1k_new.csv")

data = []

for index, row in df.iterrows():
    title = str(row['Название'])
    text = str(row['Текст'])
    cluster = row['Cluster']
    # text = text.replace('\n', '')
    if len(text) > 512:
        text = text[:512]
    data.append({'title': title, 'text': text, 'cluster': cluster})

In [4]:
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)

train_dataset = CustomDataset(train_data)
test_dataset = CustomDataset(test_data)

train_dataloader = DataLoader(train_dataset, batch_size=6, shuffle=True, collate_fn=train_dataset.collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=6, shuffle=True, collate_fn=test_dataset.collate_fn)

print(len(train_dataset))
print(len(test_dataset))

972
109


In [5]:
model = BartForSequenceClassification.from_pretrained('facebook/bart-large-xsum', num_labels=2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)
# 1e-5 было

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
model.to(device)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-xsum and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BartForSequenceClassification(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): L

In [6]:
from tqdm import tqdm
import gc

num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_dataloader):
        input_ids, cluster = batch
        input_ids = input_ids.to(device)
        cluster = cluster.to(device)

        optimizer.zero_grad()
        output = model(input_ids=input_ids, labels=cluster)
        loss = output.loss
        loss.backward()
        optimizer.step()
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.memory_summary()
    print(loss)

100%|██████████| 162/162 [01:38<00:00,  1.65it/s]


tensor(0.5589, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|██████████| 162/162 [01:39<00:00,  1.63it/s]


tensor(0.6763, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|██████████| 162/162 [01:38<00:00,  1.65it/s]


tensor(0.0310, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|██████████| 162/162 [01:38<00:00,  1.65it/s]


tensor(0.0423, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|██████████| 162/162 [01:39<00:00,  1.63it/s]

tensor(0.1210, device='cuda:0', grad_fn=<NllLossBackward0>)





In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model.eval()

with torch.no_grad():
    true_labels = []
    predicted_labels = []

    for batch in tqdm(test_dataloader):
        input_ids, cluster = batch
        input_ids = input_ids.to(device)
        cluster = cluster.to(device)

        output = model(input_ids=input_ids)
        _, predicted = torch.max(output.logits, dim=1)

        true_labels.extend(cluster.tolist())
        predicted_labels.extend(predicted.tolist())

    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


100%|██████████| 19/19 [00:02<00:00,  6.68it/s]


Accuracy: 0.9357798165137615
Precision: 0.9076923076923077
Recall: 0.9833333333333333
F1 Score: 0.944


In [8]:
from sklearn.metrics import classification_report


print(classification_report(true_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.98      0.88      0.92        49
           1       0.91      0.98      0.94        60

    accuracy                           0.94       109
   macro avg       0.94      0.93      0.93       109
weighted avg       0.94      0.94      0.94       109



In [9]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-xsum')

def predict_news(model, title, text):
    if len(text) > 512:
        text = text[:512]
        print("Текст новости превышает 512 символов\n")

    input_ids = tokenizer.encode(title, text, add_special_tokens=True, truncation=True)
    input_ids = torch.tensor(input_ids).unsqueeze(0)

    model.eval()
    with torch.no_grad():
        input_ids = input_ids.to(device)
        output = model(input_ids=input_ids)
        _, predicted = torch.max(output.logits, dim=1)

    return predicted.item()

In [19]:
title = input("Введите название новости: ")
text = input("Введите текст новости: ")

prediction = predict_news(model, title, text)

if prediction == 1:
    print("Название соответствует тексту новости.")
else:
    print("Название не соответствует тексту новости.")

Текст новости превышает 512 символов

Название не соответствует тексту новости.
