In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
import torch
from collections import Counter
from torch.utils.data import WeightedRandomSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW,get_linear_schedule_with_warmup
# Load the dataset
df = pd.read_excel('sentiment_analysis_dataset.xlsx')

# Calculate class weights to handle class imbalance
class_weights = torch.tensor([1.0 / count for count in Counter(df["sentiment"]).values()])

# Map category labels to integers
label_map = {label: i for i, label in enumerate(df["sentiment"].unique())}

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [2]:
label_map

{'positive': 0, 'negative': 1, 'neutral': 2}

In [3]:
# Define custom dataset class
class ReviewDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        review = self.dataframe.iloc[idx]["review"]
        category = self.dataframe.iloc[idx]["sentiment"]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        label = torch.tensor(label_map[category])

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": label,
        }


In [4]:
# Initialize tokenizer and model
hf_token = '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'
# Load model directly
tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
model = AutoModelForSequenceClassification.from_pretrained(
    "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
    num_labels=len(label_map)
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./sentiment_results",  # Specify the output directory
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    logging_dir="./logs",
    gradient_accumulation_steps=4
)

# Create datasets and dataloaders
train_dataset = ReviewDataset(train_df, tokenizer, max_length=64)
test_dataset = ReviewDataset(test_df, tokenizer, max_length=64)

# Compute weights for each sample in the dataset
class_weights = [class_weights[label_map[label]] for label in train_df["sentiment"]]
sampler = WeightedRandomSampler(class_weights, len(train_dataset), replacement=True)

train_dataloader = DataLoader(train_dataset, batch_size=8, sampler=sampler)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * training_args.num_train_epochs)

# Define metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)

print(results)


***** Running training *****
  Num examples = 7232
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 226


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 1809
  Batch size = 16


{'eval_loss': 0.048052143305540085, 'eval_accuracy': 0.9878385848535103, 'eval_f1': 0.98780200717812, 'eval_precision': 0.9878098723972308, 'eval_recall': 0.9878385848535103, 'eval_runtime': 287.8573, 'eval_samples_per_second': 6.284, 'eval_steps_per_second': 0.396, 'epoch': 2.0}


In [5]:
trainer.save_model()

Saving model checkpoint to ./sentiment_results
Configuration saved in ./sentiment_results\config.json
Model weights saved in ./sentiment_results\pytorch_model.bin


In [None]:
!pip install accelerate>=0.21.0
!pip install transformers[torch]
!pip install accelerate -U

In [None]:
from transformers import AutoTokenizer,TFAutoModelForSequenceClassification
label_map={'positive': 0, 'negative': 1, 'neutral': 2}
model1=TFAutoModelForSequenceClassification.from_pretrained("sentiment_results", num_labels=len(label_map),from_pt=True)
tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

In [None]:
user_review = input('Enter review: ')

# Tokenize the user review (assuming you have a tokenizer)
tokenized_user_review = tokenizer(user_review, truncation=True, padding=True, return_tensors="tf")

# Make predictions
predictions = model1.predict(dict(tokenized_user_review))

# Extract the predicted probabilities for each class
predicted_probabilities = predictions[0][0]

# Get the predicted class index
predicted_class_index = predicted_probabilities.argmax()

def print_key_by_value(dict, value1):
  for key, value in dict.items():
    if value == value1:
      return key

# Example usage:
predicted_category=print_key_by_value(label_map,predicted_class_index )
# Print the predicted class index
print('predicted index',predicted_class_index)
print("Predicted category:", predicted_category)