In [None]:
!pip install -q transformers datasets scikit-learn torch pandas

In [None]:
import os
import pickle
import pandas as pd

# CSV files for the three splits (bare names, resolved against the working directory)
train_csv, val_csv, test_csv = (
    "train_cleaned.csv",
    "val_cleaned.csv",
    "test_cleaned.csv",
)

def load_and_inspect(csv_path):
    """Read a CSV split, print a short summary of it, and return the DataFrame.

    The summary consists of the shape, the column names, the first three rows
    and the value counts of the ``mood`` column.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        AssertionError: if ``csv_path`` does not exist.
        KeyError: if the file has no ``mood`` column.
    """
    assert os.path.exists(csv_path), f"File not found: {csv_path}"
    frame = pd.read_csv(csv_path)

    # Basic structure of the split
    print(f"\nLoaded {csv_path} — shape: {frame.shape}")
    print("Columns:", [*frame.columns])

    # A peek at the data followed by the class balance
    preview = frame.head(n=3)
    print(preview)
    print("\nMood distribution:\n", frame['mood'].value_counts())
    return frame

# Read each split in turn; every call prints its own summary before returning.
train_df = load_and_inspect(csv_path=train_csv)
val_df = load_and_inspect(csv_path=val_csv)
test_df = load_and_inspect(csv_path=test_csv)



Loaded train_cleaned.csv — shape: (43222, 2)
Columns: ['text', 'mood']
                                                text   mood
0  if you dont wear brown and orangeyou dont matt...  angry
1  what do scottish people look like how i would ...   calm
2            a surprise to be sure but a welcome one   calm

Mood distribution:
 mood
calm     19470
happy    15239
sad       4258
angry     4255
Name: count, dtype: int64

Loaded val_cleaned.csv — shape: (5423, 2)
Columns: ['text', 'mood']
                                                text   mood
0  calm down and relax are the worst thing to say...  angry
1  it sound like youre setting up for a good 2019...  happy
2  thanks bot you too mansoooooo what doe your ha...  happy

Mood distribution:
 mood
calm     2409
happy    1956
sad       545
angry     513
Name: count, dtype: int64

Loaded test_cleaned.csv — shape: (5421, 2)
Columns: ['text', 'mood']
                                                text   mood
0  i have worked at a shitty 

In [None]:
# Manual mapping from mood name to integer class id
custom_map = {"sad": 0, "happy": 1, "angry": 2, "calm": 3}


def _encode_moods(df, split_name):
    """Translate the ``mood`` column of ``df`` into an int64 array of class ids.

    Args:
        df: DataFrame with a ``mood`` column.
        split_name: Name of the split, used only in the error message.

    Returns:
        A NumPy int64 array with one class id per row.

    Raises:
        ValueError: if a row's mood is missing or is not a key of ``custom_map``.
    """
    encoded = df["mood"].map(custom_map)
    unmapped = encoded.isna()
    # Series.map yields NaN for unknown keys, which would silently turn the
    # labels into floats; fail loudly instead.
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, "mood"].astype(str).unique())
        raise ValueError(
            f"{split_name}: {int(unmapped.sum())} rows have moods outside custom_map: {unknown}"
        )
    return encoded.astype("int64").to_numpy()


# Apply mapping directly
train_labels = _encode_moods(train_df, "train")
val_labels = _encode_moods(val_df, "val")
test_labels = _encode_moods(test_df, "test")


In [None]:
# Echo the encoded training labels (the integer ids produced via custom_map).
train_labels

array([2, 3, 3, ..., 2, 3, 0])

In [None]:
import pickle
import torch
from torch.utils.data import Dataset

In [None]:
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; only load files that you produced yourself.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Pre-computed tokenizer outputs, one pickle per split
train_encodings = _load_pickle("train_encodings.pkl")
val_encodings = _load_pickle("val_encodings.pkl")
test_encodings = _load_pickle("test_encodings.pkl")

In [None]:
class EmotionDataset(Dataset):
    """Map-style dataset pairing pre-computed encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection,
    and ``labels`` is an indexable collection with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The label collection defines how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including a ``labels`` entry."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [None]:
def _build_split(encodings, labels, split_name):
    """Wrap one split in an EmotionDataset after checking that its sizes agree.

    The encodings and the labels are loaded from separate files, so a
    mismatch would otherwise go unnoticed until examples are mispaired or an
    index error occurs during iteration.

    Raises:
        ValueError: if the number of encoded examples differs from the number of labels.
    """
    n_encoded = len(encodings["input_ids"])
    n_labels = len(labels)
    if n_encoded != n_labels:
        raise ValueError(
            f"{split_name}: {n_encoded} encoded examples but {n_labels} labels"
        )
    return EmotionDataset(encodings, labels)


train_dataset = _build_split(train_encodings, train_labels, "train")
val_dataset   = _build_split(val_encodings, val_labels, "val")
test_dataset  = _build_split(test_encodings, test_labels, "test")

In [None]:
from torch.utils.data import DataLoader

# Batches of 16 for every split; only the training split is reshuffled.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=16)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)

In [None]:
# Report how many examples each split contains
n_train, n_val, n_test = len(train_dataset), len(val_dataset), len(test_dataset)
print("Train dataset size:", n_train)
print("Validation dataset size:", n_val)
print("Test dataset size:", n_test)

# Pull the first training example and show its structure
sample = train_dataset[0]
sample_ids = sample["input_ids"]
sample_mask = sample["attention_mask"]
print("\nSample keys:", sample.keys())
print("Input IDs length:", len(sample_ids))
print("Attention mask length:", len(sample_mask))
print("Label:", sample["labels"])


Train dataset size: 43222
Validation dataset size: 5423
Test dataset size: 5421

Sample keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
Input IDs length: 119
Attention mask length: 119
Label: tensor(2)


In [None]:
import torch
from transformers import BertForSequenceClassification

In [None]:
# Derive the class count from the label map so the two cannot drift apart.
num_labels = len(custom_map)

In [None]:
# Load the pretrained encoder with a fresh classification head. The label
# names are stored in the config so that a saved checkpoint reports moods
# rather than generic LABEL_<n> names.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",    # pretrained weights
    num_labels=num_labels,  # our task → 4 moods
    id2label={idx: mood for mood, idx in custom_map.items()},
    label2id=dict(custom_map),
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Prefer the GPU when CUDA is usable, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Show the model's printed representation.
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
from torch.optim import AdamW

# AdamW over every model parameter with a learning rate of 2e-5
optimizer = AdamW(params=model.parameters(), lr=2e-5)


In [None]:
from transformers import get_scheduler

num_epochs = 3  # start small, you can increase later
# Total number of batches that will be processed across all epochs
num_training_steps = len(train_loader) * num_epochs

# Linear schedule without warmup (~10% of the steps is a common warmup choice)
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [None]:
import torch.nn as nn

# Standard cross-entropy criterion with default settings.
# NOTE(review): the training loop in this notebook reads `outputs.loss` from
# the model, so this object appears unused in the cells shown — confirm
# before relying on it.
loss_fn = torch.nn.CrossEntropyLoss()


In [None]:
import torch
# Report whether PyTorch can use a CUDA device in this session.
print(torch.cuda.is_available())


True


In [None]:
from tqdm import tqdm

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        # Transfer only the tensors the forward pass consumes.
        on_device = {
            name: batch[name].to(device)
            for name in ("input_ids", "attention_mask", "labels")
        }

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(**on_device)
        loss = outputs.loss
        total_loss += loss.item()

        # Clear stale gradients, backpropagate, then advance optimizer and schedule.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Mean of the per-batch losses for this epoch
    avg_train_loss = total_loss / len(train_loader)
    print(f"  Average training loss: {avg_train_loss:.4f}")



Epoch 1/3


100%|██████████| 2702/2702 [14:04<00:00,  3.20it/s]


  Average training loss: 0.7332

Epoch 2/3


100%|██████████| 2702/2702 [14:06<00:00,  3.19it/s]


  Average training loss: 0.5613

Epoch 3/3


100%|██████████| 2702/2702 [14:06<00:00,  3.19it/s]

  Average training loss: 0.4115





In [None]:
# Save both model and tokenizer into the same directory so the checkpoint
# can be reloaded on its own.
from transformers import BertTokenizer

model.save_pretrained("bert_mood_model")
# The tokenizer is loaded from the same checkpoint the model was initialized from.
# NOTE(review): assumes the pickled encodings were produced with this tokenizer — confirm.
BertTokenizer.from_pretrained("bert-base-uncased").save_pretrained("bert_mood_model")



In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Switch off dropout and gradient tracking for evaluation.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        # Labels are only compared on the host, so they never need to visit the device.
        all_labels.extend(batch['labels'].numpy())

acc = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {acc:.4f}")
# Pin the id → name pairing explicitly; without `labels`, the report infers
# the classes from the data and fails if one of them never occurs.
print(classification_report(
    all_labels,
    all_preds,
    labels=list(custom_map.values()),
    target_names=list(custom_map.keys()),
))


Validation Accuracy: 0.7299
              precision    recall  f1-score   support

         sad       0.54      0.49      0.51       545
       happy       0.79      0.82      0.80      1956
       angry       0.58      0.59      0.59       513
        calm       0.75      0.74      0.75      2409

    accuracy                           0.73      5423
   macro avg       0.67      0.66      0.66      5423
weighted avg       0.73      0.73      0.73      5423



In [None]:
from transformers import BertTokenizer

# Create the tokenizer in this cell so the prediction does not depend on
# cell execution order.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Invert the label map so the predicted id is looked up by value, not by position.
id_to_mood = {idx: mood for mood, idx in custom_map.items()}

text = "I am so excited today."
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(device)

# Inference only: disable dropout and skip gradient bookkeeping.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
pred = torch.argmax(outputs.logits, dim=1).item()
print(f"Predicted Mood: {id_to_mood[pred]}")


Predicted Mood: happy


In [None]:
from transformers import BertTokenizer

# Tokenizer belonging to the bert-base-uncased checkpoint
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]