In [1]:
%pip install tokenizers datasets evaluate accelerate torch

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [3]:
%pip install transformers

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [3]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel



In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Root directory of the review corpus; load_data() reads
# <DATA_DIR>/<split>/<pos|neg>/*.txt beneath it.
# The IMDB_DATA_DIR environment variable overrides the machine-specific default
# so the notebook can run elsewhere without editing this cell.
DATA_DIR = os.environ.get(
    'IMDB_DATA_DIR',
    '/home/jupyter/datasphere/project/MovieReview/Users/daniil/Desktop/University/AI/GreenAtom/aclImdb',
)


class IMDBDataset(Dataset):
    """Map-style dataset that pairs a review text with its rating and sentiment.

    The three sequences are indexed in parallel. Tokenization happens lazily,
    one sample at a time, whenever an item is requested.
    """

    def __init__(self, texts, ratings, sentiments, tokenizer, max_length):
        """Store the sample sequences and the tokenization settings.

        Args:
            texts: Indexable collection of review strings.
            ratings: Indexable collection of integer ratings, parallel to ``texts``.
            sentiments: Indexable collection of integer sentiment labels,
                parallel to ``texts``.
            tokenizer: Object exposing ``encode_plus``; called once per item.
            max_length: Length every encoded sequence is padded/truncated to.
        """
        self.texts, self.ratings, self.sentiments = texts, ratings, sentiments
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors.

        Returns:
            dict with keys ``input_ids`` and ``attention_mask`` (the tokenizer
            outputs flattened to 1-D) plus ``rating`` and ``sentiment`` as
            ``torch.long`` scalar tensors.
        """
        review = self.texts[idx]
        stars = self.ratings[idx]
        polarity = self.sentiments[idx]

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(review, **tokenizer_kwargs)

        # return_tensors='pt' yields a leading batch axis; flatten drops it.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['rating'] = torch.tensor(stars, dtype=torch.long)
        sample['sentiment'] = torch.tensor(polarity, dtype=torch.long)
        return sample


def load_data(data_dir, split, encoding='latin-1'):
    """Read every review of one split from ``<data_dir>/<split>/{pos,neg}``.

    Only ``*.txt`` files whose name does not start with ``._`` are read. The
    rating is the integer found between the first underscore and the following
    dot of the file name (e.g. ``123_7.txt`` -> 7). Files from ``pos`` get
    sentiment 1 and files from ``neg`` get sentiment 0.

    Args:
        data_dir: Root directory of the corpus.
        split: Sub-directory name of the split, e.g. ``'train'`` or ``'test'``.
        encoding: Text encoding used to read the files. Defaults to
            ``'latin-1'``, which is what this function always used.

    Returns:
        Tuple ``(texts, ratings, sentiments)`` of three parallel lists, with
        all ``pos`` samples first and files within a folder in sorted
        file-name order.

    Raises:
        FileNotFoundError: If one of the label directories does not exist.
        ValueError / IndexError: If a matching file name has no parsable rating.
    """
    texts, ratings, sentiments = [], [], []

    for label in ['pos', 'neg']:
        sentiment = 1 if label == 'pos' else 0
        dir_path = os.path.join(data_dir, split, label)
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sample order (and anything derived from it) reproducible.
        for filename in sorted(os.listdir(dir_path)):
            if not filename.endswith('.txt') or filename.startswith('._'):
                continue
            rating = int(filename.split('_')[1].split('.')[0])
            # The training code subtracts 1 and feeds a 10-class loss, so the
            # only usable ratings are 1..10; report anything else (including 0).
            # The sample is still returned, exactly as before.
            if rating < 1 or rating > 10:
                print(filename)
            with open(os.path.join(dir_path, filename), 'r', encoding=encoding) as f:
                texts.append(f.read())
            ratings.append(rating)
            sentiments.append(sentiment)
    return texts, ratings, sentiments


# Pretrained WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Every review is padded/truncated to MAX_LENGTH tokens; BATCH_SIZE samples per step.
MAX_LENGTH, BATCH_SIZE = 256, 96

# Read the raw texts and labels of both splits from disk.
train_texts, train_ratings, train_sentiments = load_data(DATA_DIR, split='train')
test_texts, test_ratings, test_sentiments = load_data(DATA_DIR, split='test')

# Wrap each split in a dataset that tokenizes lazily, item by item.
train_dataset = IMDBDataset(
    texts=train_texts,
    ratings=train_ratings,
    sentiments=train_sentiments,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
test_dataset = IMDBDataset(
    texts=test_texts,
    ratings=test_ratings,
    sentiments=test_sentiments,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)


class SentimentRatingModel(nn.Module):
    """BERT encoder with two linear heads: 2-way sentiment and n-way rating."""

    def __init__(self, n_ratings):
        """Load the 'bert-base-uncased' encoder and create both heads.

        Args:
            n_ratings: Number of output classes of the rating head.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p=0.3)

        # Both heads consume the same vector, sized by the encoder config.
        hidden_size = self.bert.config.hidden_size
        self.sentiment_classifier = nn.Linear(hidden_size, 2)
        self.rating_classifier = nn.Linear(hidden_size, n_ratings)

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return ``(sentiment_logits, rating_logits)``.

        The second element of the encoder output is passed through dropout and
        then fed to both heads.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        features = self.dropout(encoder_out[1])
        return self.sentiment_classifier(features), self.rating_classifier(features)
    
# Script entry point: fine-tune the two-headed model on train_loader, checkpoint
# the weights along the way, then report accuracy of both heads on test_loader.
if __name__ == "__main__":
    # Rating head gets 10 output classes.
    model = SentimentRatingModel(n_ratings=10)
    model = model.to(device)

    # One Adam optimizer over all model parameters; one cross-entropy loss per head.
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    criterion_sentiment = nn.CrossEntropyLoss()
    criterion_rating = nn.CrossEntropyLoss()

    # Checkpoint written before any training step, i.e. with the initial weights.
    # NOTE(review): presumably a check that the path is writable — confirm intent.
    torch.save(model.state_dict(), 'sentiment_rating_model.pth')

    EPOCHS = 3
    save_interval = 100  # write a checkpoint every `save_interval` batches
    # Counts batches across the whole run; it is not reset at the start of an epoch.
    batch_count = 0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0  # sum of the per-batch combined losses for this epoch
        for batch in train_loader:
            batch_count += 1
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            sentiments = batch['sentiment'].to(device)
            # Shift ratings down by one to get class indices for the rating head.
            # Assumes stored ratings are 1..10 so targets become 0..9 — TODO confirm.
            ratings = batch['rating'].to(device) - 1  

            sentiment_logits, rating_logits = model(input_ids, attention_mask)

            loss_sentiment = criterion_sentiment(sentiment_logits, sentiments)
            loss_rating = criterion_rating(rating_logits, ratings)

            # Both tasks contribute to the objective with equal (unit) weight.
            loss = loss_sentiment + loss_rating
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

            # Periodic checkpoint; always overwrites the same file.
            if batch_count % save_interval == 0:
                torch.save(model.state_dict(), 'sentiment_rating_model.pth')
                print(f'Сохранение модели после {batch_count} batch\'ей')

        # Mean combined loss per batch over this epoch.
        avg_loss = total_loss / len(train_loader)
        print(f'Эпоха {epoch + 1}/{EPOCHS}, Потеря: {avg_loss:.4f}')

        # End-of-epoch checkpoint (same file as the periodic one).
        torch.save(model.state_dict(), 'sentiment_rating_model.pth')
        print(f'Модель сохранена после эпохи {epoch + 1}')

    # ---- Evaluation on the test split ----
    model.eval()
    correct_sentiment = 0
    correct_rating = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            sentiments = batch['sentiment'].to(device)
            # Same one-step shift as in training so predictions and targets align.
            ratings = batch['rating'].to(device) - 1 

            sentiment_logits, rating_logits = model(input_ids, attention_mask)

            # torch.max over dim=1 returns (values, indices); the indices are the
            # predicted classes.
            _, sentiment_preds = torch.max(sentiment_logits, dim=1)
            _, rating_preds = torch.max(rating_logits, dim=1)

            correct_sentiment += (sentiment_preds == sentiments).sum().item()
            correct_rating += (rating_preds == ratings).sum().item()
            total += sentiments.size(0)

    # Accuracy of each head in percent (exact-match for the rating head).
    print(f'Точность тональности: {correct_sentiment / total * 100:.2f}%')
    print(f'Точность рейтинга: {correct_rating / total * 100:.2f}%')

    # Final save to the same file; no optimizer step has run since the last
    # end-of-epoch checkpoint.
    torch.save(model.state_dict(), 'sentiment_rating_model.pth')