In [None]:
# !pip install pandas numpy transformers torch scikit-learn tqdm openpyxl
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re
from tqdm import tqdm

In [None]:
class CompanyMentionDataset(Dataset):
  def __init__(self, texts, companies, labels, tokenizer, max_length=512):
    self.texts = texts
    self.companies = companies
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self): return len(self.texts)

  def __getitem__(self, idx):
    text = self.texts[idx]
    company = self.companies[idx]
    label = self.labels[idx]

    modified_text = text.replace(company, f"<company>{company}</company>")

    encoding = self.tokenizer(
      modified_text,
      return_tensors='pt',
      max_length=self.max_length,
      padding='max_length',
      truncation=True
    )

    return {
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'labels': torch.tensor(label)
    }

In [None]:
def preprocess_data(df):
  sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
  df['label'] = df['tonality'].map(sentiment_map)

  def clean_text(text):
    text = re.sub(r'[^\w\s\.\,\-\"\«\»]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
  df['clean_sentence'] = df['sentence'].apply(clean_text)
  return df

def train_model(train_dataloader, model, optimizer, device, num_epochs=10):
  model.train()
  for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}')

    for batch in progress_bar:
      optimizer.zero_grad()

      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels
      )
      loss = outputs.loss
      total_loss += loss.item()

      loss.backward()
      optimizer.step()

      progress_bar.set_postfix({'loss': total_loss / (progress_bar.n + 1)})

def evaluate_model(eval_dataloader, model, device):
  model.eval()
  all_preds, all_labels = [], []

  with torch.no_grad():
    for batch in eval_dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      preds = torch.argmax(outputs.logits, dim=1)
      all_preds.extend(preds.cpu().numpy())
      all_labels.extend(labels.cpu().numpy())
  return all_preds, all_labels

In [None]:
def main():
  df = preprocess_data(pd.read_excel('/content/sample_data/ru_data_test.xlsx')) # измените путь до вашего .xlsx файла

  train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

  model_name = "DeepPavlov/rubert-base-cased-sentence"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

  special_tokens = {'additional_special_tokens': ['<company>', '</company>']}
  tokenizer.add_special_tokens(special_tokens)
  model.resize_token_embeddings(len(tokenizer))

  train_dataset = CompanyMentionDataset(
    train_df['clean_sentence'].tolist(),
    train_df['object'].tolist(),
    train_df['label'].tolist(),
    tokenizer
  )
  test_dataset = CompanyMentionDataset(
    test_df['clean_sentence'].tolist(),
    test_df['object'].tolist(),
    test_df['label'].tolist(),
    tokenizer
  )
  train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
  test_dataloader = DataLoader(test_dataset, batch_size=8)
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)
  optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
  train_model(train_dataloader, model, optimizer, device)
  preds, labels = evaluate_model(test_dataloader, model, device)
  label_names = ['negative', 'neutral', 'positive']
  print(classification_report(labels, preds, target_names=label_names))

  return model, tokenizer

def predict_sentiment(text, company, model, tokenizer, device):
  model.eval()
  modified_text = text.replace(company, f"<company>{company}</company>")

  encoding = tokenizer(
    modified_text,
    return_tensors='pt',
    max_length=512,
    padding='max_length',
    truncation=True
  )
  with torch.no_grad():
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    prediction = torch.argmax(outputs.logits, dim=1)

  sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
  return sentiment_map[prediction.item()]

In [None]:
if __name__ == "__main__":
  model, tokenizer = main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 11/11 [00:31<00:00,  2.87s/it, loss=1.03]
Epoch 2: 100%|██████████| 11/11 [00:30<00:00,  2.75s/it, loss=0.824]
Epoch 3: 100%|██████████| 11/11 [00:31<00:00,  2.89s/it, loss=0.531]
Epoch 4: 100%|██████████| 11/11 [00:33<00:00,  3.02s/it, loss=0.328]


              precision    recall  f1-score   support

    negative       0.86      0.55      0.67        11
     neutral       0.00      0.00      0.00         3
    positive       0.47      0.78      0.58         9

    accuracy                           0.57        23
   macro avg       0.44      0.44      0.42        23
weighted avg       0.59      0.57      0.55        23



## Описание подхода:
Подход состоит в файн-тюнинге предобученной модели RuBERT.
Также были добавлены:
1. Маркеры < company> и </ company> вокруг предложений для лучшей обработки текстов об определенной компании
2. Очистка данных, которая заключается в удалении специальных символов и сохранение знаков препинания

## Проблемы:
1. Маленький датасет, из-за которого получилось очень неточное распределение классов