In [None]:
# Sentiment Analysis using Pre-trained BERT


In [None]:
# STEP 1: Install necessary libraries
!pip install transformers
!pip install datasets
!pip install torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# STEP 2: Import required libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, TextClassificationPipeline
from sklearn.model_selection import train_test_split
from google.colab import files
import zipfile


In [None]:
# STEP 3: Upload the dataset (must be Tweets_sampled.csv or a zip containing it)
uploaded = files.upload()

# Unpack every uploaded zip archive into the current working directory;
# any other uploaded file is left exactly where Colab saved it.
for uploaded_name in uploaded:
    if not uploaded_name.endswith(".zip"):
        continue
    with zipfile.ZipFile(uploaded_name, "r") as archive:
        archive.extractall(".")


Saving Tweets_sampled.csv to Tweets_sampled.csv


In [None]:
# STEP 4: Load CSV
# Read the tweets, expose the sentiment column under the name "label",
# and keep only the two columns the rest of the notebook uses.
df = (
    pd.read_csv("Tweets_sampled.csv")
    .rename(columns={"airline_sentiment": "label"})
    [["text", "label"]]
)

In [None]:
# STEP 5: Preprocess Labels
# Sentiment name -> integer class id used as the training target.
label_map = {"positive": 2, "neutral": 1, "negative": 0}

encoded_labels = df["label"].map(label_map)

# Series.map() yields NaN for every value that is not a key of label_map
# (missing or unexpected sentiment strings, or labels that were already
# encoded because this cell was run twice). NaN would also turn the column
# into floats, so fail loudly here instead of passing bad targets downstream.
unmapped_mask = encoded_labels.isna()
if unmapped_mask.any():
    unexpected_values = sorted(df.loc[unmapped_mask, "label"].astype(str).unique())
    raise ValueError(
        f"Cannot encode sentiment labels {unexpected_values}; "
        f"expected only {sorted(label_map)}. "
        "If this cell was already run, reload the CSV (STEP 4) first."
    )

df["label"] = encoded_labels.astype("int64")

In [None]:
# STEP 6: Train-test split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
all_texts = df["text"].tolist()
all_labels = df["label"].tolist()

train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:

# STEP 7: Load tokenizer and model
MODEL_NAME = "bert-base-uncased"

# The classification head is not part of the checkpoint and is initialised
# randomly when the model is loaded, so seed torch first to make runs repeatable.
torch.manual_seed(42)

# Invert the sentiment encoding (class id -> sentiment name) and store both
# directions in the model config so predictions are reported as
# "negative"/"neutral"/"positive" instead of the generic LABEL_0/1/2.
id2label = {class_id: sentiment for sentiment, class_id in label_map.items()}

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_map),
    id2label=id2label,
    label2id=dict(label_map),
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# STEP 8: Tokenize data
# Both splits are tokenized with the same settings: truncate to at most
# 128 tokens and pad each split to its own longest sequence.
tokenizer_options = {"truncation": True, "padding": True, "max_length": 128}

train_encodings = tokenizer(train_texts, **tokenizer_options)
test_encodings = tokenizer(test_texts, **tokenizer_options)


In [None]:
# STEP 9: Prepare Dataset
class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name to a per-sample indexable
            collection (its ``.items()`` is iterated for every sample).
        labels: Indexable collection of targets, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding feature plus the target
        under the key ``"labels"``.
        """
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so they can be indexed sample by sample.
train_dataset = SimpleDataset(encodings=train_encodings, labels=train_labels)
test_dataset = SimpleDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# STEP 10: Training arguments
# Collected in a dict first so the run configuration can be read at a glance.
training_config = {
    "output_dir": "./results",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "do_eval": True,
    "logging_dir": "./logs",
    "logging_steps": 10,
    "save_strategy": "no",
    "report_to": "none",  # disables wandb
}

training_args = TrainingArguments(**training_config)


In [None]:

# STEP 11: Train the model
def compute_accuracy(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: Evaluation result object exposing ``predictions`` and
            ``label_ids``. Assumes ``predictions`` is the
            (n_samples, n_classes) logits array; confirm if the model
            output changes.

    Returns:
        dict: ``{"accuracy": <fraction of samples predicted correctly>}``.
    """
    predicted_classes = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": float((predicted_classes == eval_pred.label_ids).mean())}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_accuracy,
)

train_output = trainer.train()

# The held-out split was handed to the Trainer but never scored; evaluate it
# explicitly so the notebook reports how well the fine-tuned model generalises.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Keep the training summary as the cell's displayed result.
train_output


Step,Training Loss
10,1.0679
20,0.9684
30,0.932
40,0.8447
50,0.8421
60,0.8377
70,0.8272
80,0.7907
90,0.694
100,0.6539


TrainOutput(global_step=110, training_loss=0.820654973116788, metrics={'train_runtime': 614.4817, 'train_samples_per_second': 1.432, 'train_steps_per_second': 0.179, 'total_flos': 26681345016480.0, 'train_loss': 0.820654973116788, 'epoch': 1.0})

In [None]:
# STEP 12: Inference with a few sample texts
# The deprecated `return_all_scores` argument is omitted: by default the
# pipeline already returns only the top-scoring label for each input.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # first GPU if present, else CPU
)

examples = [
    "I absolutely loved the flight service!",
    "This airline is terrible, I want a refund!",
    "The flight was delayed, but it was okay overall."
]

# Run all examples through the pipeline in one call; results come back in
# the same order as the inputs, one {"label", "score"} dict per text.
for text, result in zip(examples, pipe(examples)):
    print(f"Text: '{text}' → Predicted Sentiment: {result['label']}, Score: {round(result['score'], 2)}")


Device set to use cpu


Text: 'I absolutely loved the flight service!' → Predicted Sentiment: LABEL_2, Score: 0.43
Text: 'This airline is terrible, I want a refund!' → Predicted Sentiment: LABEL_0, Score: 0.78
Text: 'The flight was delayed, but it was okay overall.' → Predicted Sentiment: LABEL_0, Score: 0.87
