In [None]:
# Sentiment Analysis using Pre-trained BERT


In [None]:
# STEP 1: Install necessary libraries
!pip install transformers
!pip install datasets
!pip install torch


In [None]:
# STEP 2: Import required libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, TextClassificationPipeline
from sklearn.model_selection import train_test_split
from google.colab import files
import zipfile


In [None]:
# STEP 3: Upload the dataset (must be Tweets_sampled.csv or a zip containing it)
uploaded = files.upload()

# Unpack every uploaded zip archive into the current working directory
zip_names = [name for name in uploaded if name.endswith(".zip")]
for zip_name in zip_names:
    with zipfile.ZipFile(zip_name) as archive:
        archive.extractall(".")


In [None]:
# STEP 4: Load CSV
# Keep only the tweet text and its sentiment, exposing the sentiment column
# under the name "label" used by the rest of the notebook.
# The explicit .copy() makes `df` an independent frame, so the later
# `df["label"] = ...` assignment does not operate on a slice of the raw frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = (
    pd.read_csv("Tweets_sampled.csv")
    .rename(columns={"airline_sentiment": "label"})
    .loc[:, ["text", "label"]]
    .copy()
)

In [None]:
# STEP 5: Preprocess Labels
label_map = {"positive": 2, "neutral": 1, "negative": 0}

# Translate sentiment names to class ids. Values that are not keys of
# label_map (in particular, already-encoded ids) pass through unchanged, so
# re-running this cell is harmless; a plain .map(label_map) would turn every
# already-encoded id into NaN on the second run.
encoded_labels = df["label"].map(lambda value: label_map.get(value, value))

# Fail loudly on anything that is neither a known name nor a known id
# (including missing values) instead of letting NaN labels reach training.
invalid_rows = ~encoded_labels.isin(list(label_map.values()))
if invalid_rows.any():
    unexpected = sorted(str(value) for value in df.loc[invalid_rows, "label"].unique())
    raise ValueError(f"Unexpected sentiment labels: {unexpected}")

df["label"] = encoded_labels.astype("int64")

In [None]:
# STEP 6: Train-test split (20% of the rows held out, fixed random_state)
all_texts = df["text"].tolist()
all_labels = df["label"].tolist()

train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:

# STEP 7: Load the tokenizer and a 3-label classification model from one checkpoint
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)


In [None]:
# STEP 8: Tokenize data
# Both splits are tokenized with identical settings: truncation enabled,
# padding enabled, and a 128-token maximum length.
tokenize_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **tokenize_options)
test_encodings = tokenizer(test_texts, **tokenize_options)


In [None]:
# STEP 9: Prepare Dataset
class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with integer class labels.

    Args:
        encodings: Mapping from feature name (e.g. ``"input_ids"``) to a
            sequence holding one entry per example.
        labels: Sequence of class ids, one per example.

    Raises:
        ValueError: If any feature holds a different number of entries than
            ``labels``.
    """

    def __init__(self, encodings, labels):
        # A length mismatch would otherwise surface later as an IndexError
        # or as silently ignored examples.
        for key, values in encodings.items():
            if len(values) != len(labels):
                raise ValueError(
                    f"Feature '{key}' has {len(values)} entries but there are {len(labels)} labels"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``"labels"``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # int() rejects NaN with a clear error, and the explicit long dtype
        # keeps float-typed ids (e.g. 2.0) usable as class indices.
        item["labels"] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# Wrap each split's encodings and labels in a SimpleDataset
train_dataset = SimpleDataset(encodings=train_encodings, labels=train_labels)
test_dataset = SimpleDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# STEP 10: Training arguments
# Settings are collected in one dict so they can be inspected or tweaked
# before the TrainingArguments object is built.
training_config = {
    "output_dir": "./results",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "do_eval": True,
    "logging_dir": "./logs",
    "logging_steps": 10,
    "save_strategy": "no",
    "report_to": "none",  # disables wandb
}
training_args = TrainingArguments(**training_config)


In [None]:

# STEP 11: Train the model
def compute_metrics(eval_pred):
    """Return classification accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids``, as
            supplied by ``Trainer``. Assumes ``predictions`` is a single
            array of per-class scores with the class axis last -- TODO
            confirm if the model is changed to return extra outputs.

    Returns:
        dict: ``{"accuracy": fraction of examples predicted correctly}``.
    """
    import numpy as np  # local import: numpy is not among the notebook's imports

    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# eval_dataset was supplied but nothing ever evaluated on it; run the
# evaluation explicitly so the held-out loss and accuracy are reported.
eval_metrics = trainer.evaluate()
eval_metrics


In [None]:
# STEP 12: Inference with a few sample texts
# Attach human-readable class names so the pipeline reports e.g. "positive"
# instead of a generic "LABEL_2" placeholder.
model.config.id2label = {class_id: name for name, class_id in label_map.items()}
model.config.label2id = dict(label_map)

# Make sure dropout is disabled for inference, whatever mode training left
# the model in.
model.eval()

# The deprecated `return_all_scores=False` argument is omitted: returning only
# the top-scoring label is already the pipeline's default behaviour.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # first GPU if present, else CPU
)

examples = [
    "I absolutely loved the flight service!",
    "This airline is terrible, I want a refund!",
    "The flight was delayed, but it was okay overall."
]

for text in examples:
    # The pipeline returns a list with one result per input text.
    result = pipe(text)[0]
    print(f"Text: '{text}' → Predicted Sentiment: {result['label']}, Score: {round(result['score'], 2)}")
