In [1]:
import os
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizerFast

class ReviewsDataset(Dataset):
    def __init__(self, directory, tokenizer, max_length=512):
        self.examples = []

        for label_dir in ['pos', 'neg']:
            label = 1 if label_dir == 'pos' else 0
            subdir = os.path.join(directory, label_dir)
            for filename in os.listdir(subdir):
                file_path = os.path.join(subdir, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read().strip()
                    inputs = tokenizer(text, max_length=max_length, truncation=True, padding="max_length")
                    self.examples.append((inputs.input_ids, inputs.attention_mask, label))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        input_ids, attention_mask, label = self.examples[idx]
        return {'input_ids': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask), 'labels': torch.tensor(label)}

tokenizer = RobertaTokenizerFast.from_pretrained("pdelobelle/robbert-v2-dutch-base")


train_dataset = ReviewsDataset(r"C:\Users\shital.nerkar\Desktop\SentimentAnalysis\reviews_SA\data\train",tokenizer)
test_dataset = ReviewsDataset(r"C:\Users\shital.nerkar\Desktop\SentimentAnalysis\reviews_SA\data\test",tokenizer)

In [2]:
from transformers import RobertaForSequenceClassification

model = RobertaForSequenceClassification.from_pretrained("pdelobelle/robbert-v2-dutch-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # Where to store the final model
    num_train_epochs=2,              # Total number of training epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=100,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.05,               # Weight decay if applied
#     logging_dir='./logs',         
#     logging_steps=10,
    no_cuda=True                     # Set this to False to use CUDA if available
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

Step,Training Loss
500,0.3183


TrainOutput(global_step=500, training_loss=0.3182547607421875, metrics={'train_runtime': 19261.9428, 'train_samples_per_second': 0.208, 'train_steps_per_second': 0.026, 'total_flos': 1052444221440000.0, 'train_loss': 0.3182547607421875, 'epoch': 2.0})

In [5]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.5006716847419739, 'eval_runtime': 252.8828, 'eval_samples_per_second': 0.807, 'eval_steps_per_second': 0.051, 'epoch': 2.0}


In [18]:
def predict_sentiment(text, model, tokenizer):

    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")

    model.to('cpu')  
    model.eval()

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)

    prediction = torch.argmax(probabilities, dim=1).item()

    sentiments = ["Negative", "Positive"]  
    predicted_sentiment = sentiments[prediction]

    probabilities = probabilities.numpy().flatten().tolist()

    return predicted_sentiment, probabilities

text = '''
Het horloge is net aangekomen en na het opladen begon ik met het koppelingsproces.
Wat mij is opgevallen en wat zo vervelend is, is het feit dat het horloge zichzelf blijft bellen, waardoor de gebruikersinterface niet zo soepel verloopt.
Dus onder mijn verwachtingen!
Nu weet ik niet meer wat ik ermee ga doen!
'''

# """
# Het horloge heeft een goed prijs kwaliteit verhouding met verbazingwekkend veel functies! 
# De bouwkwaliteit is top en alles werkt goed en soepel. 
# Verder had ik ook wat vragen over het gebruik van het horloge, en het support team hielp mij hier snel en goed mee!
# """

predicted_sentiment, probabilities = predict_sentiment(text, model, tokenizer)
print(f"Predicted sentiment: {predicted_sentiment}")
print(f"Probabilities: {probabilities}")

Predicted sentiment: Negative
Probabilities: [0.9979313611984253, 0.0020686383359134197]


In [16]:
def predict_sentiment_from_file(file_path, model, tokenizer):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    predicted_sentiment, probabilities = predict_sentiment(text, model, tokenizer)

    return predicted_sentiment, probabilities

file_path = r"C:\Users\shital.nerkar\Desktop\SentimentAnalysis\DatchData\DBRD_v3\DBRD\test\pos\22243_5.txt"

predicted_sentiment, probabilities = predict_sentiment_from_file(file_path, model, tokenizer)
print(f"Predicted Sentiment: {predicted_sentiment}")
print(f"Probabilities: {probabilities}")

Predicted Sentiment: Negative
Probabilities: [0.991441547870636, 0.008558427914977074]
