<a href="https://colab.research.google.com/github/shubham3032002/Fine--tuning-BERT-for-setiment-analysis/blob/main/Fine_tuning_BERT_for_setiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets evaluate accelerate -q
!pip install optuna

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/485.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)

In [5]:
dataset = load_dataset("carblacac/twitter-sentiment-analysis")
print("\nSample dataset entry:", dataset["train"][0])


Sample dataset entry: {'text': '@fa6ami86 so happy that salman won.  btw the 14sec clip is truely a teaser', 'feeling': 0}


In [6]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
def preprocess(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

In [8]:
# Process dataset
tokenized_dataset = dataset.map(preprocess, batched=True)

# Fix column names (original label column is 'feeling')
tokenized_dataset = tokenized_dataset.rename_column("feeling", "labels")

# Set format for PyTorch
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/29997 [00:00<?, ? examples/s]

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Mixed precision
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)



In [11]:
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics
)


In [14]:
print("\nStarting training...")
trainer.train()


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3818,0.371357,0.835269
2,0.3213,0.371613,0.842027
3,0.2643,0.388988,0.841011


TrainOutput(global_step=5625, training_loss=0.3322018866644965, metrics={'train_runtime': 1173.2473, 'train_samples_per_second': 306.81, 'train_steps_per_second': 4.794, 'total_flos': 1.1921086264310784e+16, 'train_loss': 0.3322018866644965, 'epoch': 3.0})

In [15]:
results = trainer.evaluate()
print(f"\nValidation Accuracy: {results['eval_accuracy']:.2f}")


Validation Accuracy: 0.84


In [18]:
import torch
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)


Device set to use cuda:0


In [27]:
def predict(text):
    result = classifier(text)[0]

    label_map = {
        0: "Negative",
        1: "Neutral",
        2: "Positive"
    }

    label_text = label_map[int(result["label"].split("_")[-1])]


    if label_text == "Neutral" and result["score"] > 0.90:
        label_text = "Positive"

    return {
        "Sentiment": label_text,
        "Confidence": float(result["score"])
    }


In [29]:
print(predict(""))


{'Sentiment': 'Positive', 'Confidence': 0.9932340979576111}
