In [None]:
!pip install transformers datasets torch scikit-learn

# ==================== STEP 2: Load & Preprocess ====================
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import BertTokenizer

# Pull the DailyDialog corpus and keep only its training split.
raw_datasets = load_dataset("daily_dialog")
train_data = raw_datasets["train"]

# Flatten every dialog into (utterance, emotion) pairs, pairing each utterance
# with the emotion id at the same position and dropping pairs labelled -1.
utterance_label_pairs = [
    (utterance, emotion)
    for dialog, emotions in zip(train_data["dialog"], train_data["emotion"])
    for utterance, emotion in zip(dialog, emotions)
    if emotion != -1
]
texts = [utterance for utterance, _ in utterance_label_pairs]
labels = [emotion for _, emotion in utterance_label_pairs]

# Fit the encoder on the collected labels and replace them with the encoded ids.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Tokenizer matching the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class EmotionDataset(Dataset):
    """Torch dataset pairing tokenized texts with their labels.

    All texts are tokenized once in ``__init__``; ``__getitem__`` only turns
    the stored encodings and label for one example into tensors.
    """

    def __init__(self, texts, labels, text_tokenizer=None, max_length=64):
        """Tokenize ``texts`` and store the result alongside ``labels``.

        Args:
            texts: Sequence of input strings.
            labels: Sequence of labels, index-aligned with ``texts``.
            text_tokenizer: Callable used to encode ``texts``. It is invoked
                as ``text_tokenizer(texts, truncation=True, padding=True,
                max_length=max_length)`` and must return a mapping whose
                values can be indexed per example. When ``None`` (the
                default) the module-level ``tokenizer`` is used.
            max_length: Maximum sequence length handed to the tokenizer.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # Fail early: a mismatch would otherwise surface as an IndexError
        # mid-training or silently drop the trailing texts.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        encode = tokenizer if text_tokenizer is None else text_tokenizer
        self.encodings = encode(
            texts, truncation=True, padding=True, max_length=max_length
        )
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``.

        The dict holds one tensor per key of the stored encodings plus a
        ``'labels'`` tensor built from ``self.labels[idx]``.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# Hold out 20% of the examples for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in an EmotionDataset (train first, then validation).
train_dataset, val_dataset = (
    EmotionDataset(X_train, y_train),
    EmotionDataset(X_val, y_val),
)

# Batches of 16; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)

# ==================== STEP 3: Train the Model ====================
from transformers import BertForSequenceClassification
# transformers' own AdamW is deprecated upstream in favour of the PyTorch
# implementation and may be missing from newer releases, so import the
# optimizer from torch instead (the name `AdamW` stays available).
from torch.optim import AdamW

# Train on a GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output logit per distinct encoded label.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(set(labels)))
model.to(device)

# eps and weight_decay are pinned to the defaults of the former
# transformers.AdamW (1e-6 and 0.0) so switching implementations does not
# change the optimisation; torch's own defaults differ (1e-8 and 1e-2).
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

model.train()
for epoch in range(3):
    print(f"Epoch {epoch+1}")
    for batch in train_loader:
        # DataLoader yields CPU tensors; move them next to the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        optimizer.zero_grad()
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Return the model to the CPU so code that later feeds it CPU tensors behaves
# as it did before device placement was introduced.
model.to("cpu")

# Write model weights/config and tokenizer files into the same directory.
model.save_pretrained("emotion_model")
tokenizer.save_pretrained("emotion_model")

# Pickle the fitted label encoder into the saved-model directory.
import pickle

label_encoder_path = "emotion_model/label_encoder.pkl"
with open(label_encoder_path, mode="wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)
