In [1]:
import transformers
print(transformers.__version__)


4.49.0


In [3]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split

file_path = "/content/drive/MyDrive/AI_Model/data.csv"  # Update if using a different location
df = pd.read_csv(file_path)

print("Dataset Loaded Successfully!")
df.head()


Dataset Loaded Successfully!


Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [4]:
def clean_text(text):
    text = str(text).lower().strip()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text

df['Cleaned_Sentence'] = df['Sentence'].apply(clean_text)


In [5]:
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['Sentiment'].map(label_mapping)

In [6]:
from sklearn.utils import resample

# Separate by class
negative_samples = df[df['label'] == 0]
neutral_samples = df[df['label'] == 1]
positive_samples = df[df['label'] == 2]

# Find the largest class size
max_class_size = max(len(negative_samples), len(neutral_samples), len(positive_samples))

# Oversample minority classes
negative_oversampled = resample(negative_samples, replace=True, n_samples=max_class_size, random_state=42)
neutral_oversampled = resample(neutral_samples, replace=True, n_samples=max_class_size, random_state=42)
positive_oversampled = resample(positive_samples, replace=True, n_samples=max_class_size, random_state=42)

# Merge back into a balanced dataset
df_balanced = pd.concat([negative_oversampled, neutral_oversampled, positive_oversampled])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print("Class Balancing Done!")
df_balanced['label'].value_counts()


Class Balancing Done!


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,3130
2,3130
1,3130


In [7]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_data(texts):
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    )

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_balanced['Cleaned_Sentence'], df_balanced['label'], test_size=0.2, random_state=42
)

train_encodings = tokenize_data(list(train_texts))
test_encodings = tokenize_data(list(test_texts))

print("Tokenization Completed Successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Tokenization Completed Successfully!


In [8]:
import torch
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = {key: torch.tensor(val) for key, val in encodings.items()}  # Convert to tensors
        self.labels = torch.tensor(labels, dtype=torch.long)  # Convert labels to tensor

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}  # Correct indexing
        item["labels"] = self.labels[idx]
        return item

# Convert labels to list
train_labels = train_labels.tolist()
test_labels = test_labels.tolist()

# Create dataset
train_dataset = SentimentDataset(train_encodings, train_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

print("Dataset Prepared for Training!")


Dataset Prepared for Training!


  self.encodings = {key: torch.tensor(val) for key, val in encodings.items()}  # Convert to tensors


In [9]:
from transformers import RobertaForSequenceClassification, TrainingArguments, Trainer

model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    weight_decay=0.02,
    warmup_steps=500,
    load_best_model_at_end=True,
    save_total_limit=2
)


In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mreaz-pfec[0m ([33mreaz-pfec-rezan-fze-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.458873
2,No log,0.293225
3,0.593300,0.254059
4,0.593300,0.249438
5,0.235400,0.236957


TrainOutput(global_step=1175, training_loss=0.3781582008524144, metrics={'train_runtime': 216.1635, 'train_samples_per_second': 173.757, 'train_steps_per_second': 5.436, 'total_flos': 2470634992465920.0, 'train_loss': 0.3781582008524144, 'epoch': 5.0})

In [12]:
from sklearn.metrics import accuracy_score, classification_report

predictions = trainer.predict(test_dataset)
preds = torch.argmax(torch.tensor(predictions.predictions), axis=-1).numpy()

accuracy = accuracy_score(test_labels, preds)
print(f"Model Accuracy: {accuracy:.4f}")

print(classification_report(test_labels, preds, target_names=["Negative", "Neutral", "Positive"]))


Model Accuracy: 0.9244
              precision    recall  f1-score   support

    Negative       0.85      0.99      0.91       665
     Neutral       0.97      0.80      0.88       603
    Positive       0.97      0.98      0.98       610

    accuracy                           0.92      1878
   macro avg       0.93      0.92      0.92      1878
weighted avg       0.93      0.92      0.92      1878



In [13]:
model.save_pretrained("/content/drive/MyDrive/saved_model")
tokenizer.save_pretrained("/content/drive/MyDrive/saved_model")

print("Model and tokenizer saved successfully in Google Drive!")

Model and tokenizer saved successfully in Google Drive!
