In [None]:
import numpy as np
import zipfile
import pickle
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, Conv1D, GlobalMaxPooling1D,
    concatenate, Dropout, Dense
)
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, f1_score, accuracy_score

### Loading the dataset

In [None]:
# Load GoEmotions through `datasets.load_dataset` and unpack its three splits
dataset = load_dataset("go_emotions")

_split_names = ("train", "validation", "test")

# Raw text of every example, one sequence per split
train_texts, val_texts, test_texts = (
    dataset[name]["text"] for name in _split_names
)
# Per-example lists of label ids, one sequence per split
train_labels, val_labels, test_labels = (
    dataset[name]["labels"] for name in _split_names
)

# Label names taken from the train split's feature metadata (28 emotion labels)
label_names = dataset["train"].features["labels"].feature.names

### Tokenization and encoding

In [None]:
# Multi-hot encode labels
# Width of every label vector: one column per entry in `label_names`.
num_labels = len(label_names)
def to_multi_hot(label_lists, n_labels=None):
    """Convert per-example label-id collections into a multi-hot matrix.

    Args:
        label_lists: Sequence whose i-th item is an iterable of integer label
            ids for example i (it may be empty).
        n_labels: Number of columns in the output. When omitted, the
            module-level `num_labels` is used, which keeps existing
            single-argument calls working unchanged.

    Returns:
        An int32 array of shape (len(label_lists), n_labels) with 1 at
        [i, j] when label j is present for example i, and 0 elsewhere.
    """
    width = num_labels if n_labels is None else n_labels
    m = np.zeros((len(label_lists), width), dtype=np.int32)
    for i, labs in enumerate(label_lists):
        # Coerce to an explicit index array: a tuple such as (0, 2) would
        # otherwise be read by numpy as a multi-axis index rather than a
        # list of columns, and an empty collection stays a valid no-op.
        cols = np.asarray(list(labs), dtype=np.intp)
        m[i, cols] = 1
    return m

# Multi-hot label matrices, one per split
y_train, y_val, y_test = map(
    to_multi_hot, (train_labels, val_labels, test_labels)
)

# Tokenize and pad
vocab_size = 20000  # passed to Tokenizer as num_words
max_len = 100       # fixed sequence length used when padding/truncating

# The vocabulary is fitted on the training texts only
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(train_texts)

In [None]:
# Persist the fitted tokenizer so the same word index can be reused later.
# The path is kept in one variable so the log message always matches the
# file that was actually written (it previously reported "tokenizer.pkl").
# NOTE: pickle files should only ever be loaded from trusted sources.
tokenizer_path = "tokenizer_goemotions.pkl"
with open(tokenizer_path, "wb") as f:
    pickle.dump(tokenizer, f)
print(f"Tokenizer saved to {tokenizer_path}")

In [None]:
def encode(texts):
    """Map raw strings to a padded matrix of token ids.

    The texts are converted with the module-level `tokenizer`, then every
    sequence is padded or truncated at the end ("post") to `max_len`.

    Args:
        texts: Iterable of strings to encode.

    Returns:
        Whatever `pad_sequences` returns for the tokenized texts.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        padding="post",
        truncating="post",
    )

# Encode every split with the same tokenizer and padding settings
X_train, X_val, X_test = (
    encode(split_texts) for split_texts in (train_texts, val_texts, test_texts)
)

### Model architecture and training

In [None]:
# CNN model architecture: parallel 1-D convolutions of several kernel widths
# over a shared embedding, each max-pooled over time, concatenated, and fed
# through dropout into one sigmoid unit per label.
embedding_dim = 128       # size of each token embedding vector
filter_sizes  = [3,4,5]   # kernel widths of the parallel Conv1D branches
num_filters   = 128       # filters per Conv1D branch
drop_rate     = 0.5       # dropout rate applied before the output layer

inputs = Input(shape=(max_len,), dtype="int32")
# `input_length` is intentionally not passed: the Input layer already fixes
# the sequence length, and the argument is deprecated in Keras 3.
embed  = Embedding(vocab_size, embedding_dim)(inputs)

conv_blocks = []
for sz in filter_sizes:
    conv = Conv1D(filters=num_filters, kernel_size=sz, activation="relu")(embed)
    pool = GlobalMaxPooling1D()(conv)
    conv_blocks.append(pool)

concat = concatenate(conv_blocks)
drop   = Dropout(drop_rate)(concat)
# Independent sigmoid per label, since one text can carry several emotions
output = Dense(num_labels, activation="sigmoid")(drop)

model = Model(inputs, output)

# Compile & train
learning_rate = 1e-3
batch_size    = 64
epochs        = 5

model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss="binary_crossentropy",
    # Name the metric explicitly. The generic "accuracy" alias is resolved by
    # Keras from the output shape, and for a multi-unit sigmoid head newer
    # Keras versions pick categorical (argmax) accuracy rather than the
    # element-wise binary accuracy that a multi-label problem needs.
    metrics=["binary_accuracy"]
)

# history.history holds the per-epoch loss/metric values for both the
# training data and the validation split.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs
)

### Evaluation and metrics

In [None]:
# Evaluate on the test split.
# model.evaluate is run once (it was previously executed twice, costing a
# second full pass over the test set for the same numbers).
# NOTE(review): the value is element-wise only if the metric compiled into
# the model is a binary accuracy — confirm against the compile() call.
loss, keras_acc = model.evaluate(X_test, y_test, batch_size=batch_size)
acc = keras_acc  # alias kept so any later cell that reads `acc` still works
print(f"\nTest Loss: {loss:.4f}")
print(f"Keras Accuracy (element‑wise): {keras_acc:.4f}")

# Turn per-label sigmoid scores into hard 0/1 decisions
decision_threshold = 0.5
pred_probs = model.predict(X_test, batch_size=batch_size)
y_pred     = (pred_probs >= decision_threshold).astype(int)

# Raw accuracy (fraction of individual label predictions correct)
raw_acc = accuracy_score(
    y_test.flatten(),
    y_pred.flatten()
)
print(f"Raw Accuracy (sklearn, element‑wise): {raw_acc:.4f}")

# Micro-averaged F1: true/false positives and false negatives are pooled over
# every (sample, label) decision before computing F1, so frequent labels
# weigh more than rare ones (unlike macro averaging).
micro_f1 = f1_score(
    y_test,
    y_pred,
    average='micro',
    zero_division=0
)
print(f"Micro-averaged F1 score: {micro_f1:.4f}\n")

# Full per-label classification report
print("Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    target_names=label_names,
    zero_division=0,
    digits=4
))

### Code cell for interactive testing

In [None]:
# Interactive prediction function
def predict_emotions(text, top_k=5, threshold=0.2):
    """Return the highest-scoring emotion labels for one piece of text.

    Args:
        text: Raw input string to classify.
        top_k: Maximum number of (label, score) pairs to return.
        threshold: Minimum model score a label needs to be included.

    Returns:
        A list of (label, score) tuples sorted by descending score, with
        scores converted to plain Python floats. The list is empty when no
        label reaches `threshold`.
    """
    # Reuse the notebook's encode() helper so inference always goes through
    # the same tokenization and padding as the training data.
    padded = encode([text])
    # verbose=0 stops Keras from printing a progress bar for every sentence
    preds = model.predict(padded, verbose=0)[0]
    # pair labels with scores and sort, best first
    ranked = sorted(zip(label_names, preds), key=lambda pair: pair[1], reverse=True)
    return [(label, float(score)) for label, score in ranked if score >= threshold][:top_k]

def interactive_predict_emotions():
    """Run a simple read-predict-print loop on standard input.

    Each entered line is passed to `predict_emotions` and the returned
    (label, score) pairs are printed. The loop ends when the user types
    "quit" or "exit" (any case, surrounding whitespace ignored), or when
    input is closed or interrupted.
    """
    print("\nEnter a sentence to classify emotions (type 'quit' to exit):")
    while True:
        try:
            # Strip so that inputs like "quit " are still recognised
            user_input = input("> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the session without a traceback
            print("Goodbye!")
            break
        if user_input.lower() in ("quit", "exit"):
            print("Goodbye!")
            break
        if not user_input:
            # Nothing to classify; prompt again instead of calling the model
            continue
        results = predict_emotions(user_input)
        if results:
            print("Predicted emotions:")
            for label, score in results:
                print(f"  {label:>10s}: {score:.3f}")
        else:
            print("No emotion score exceeded the threshold. Try a different sentence.")

#interactive_predict_emotions()

### Code cell to export and save the model

In [None]:
def export_and_zip(model,
                   model_filename='cnn_goemotions.h5',
                   zip_filename='cnn_goemotions.zip'):
    """Save the model to disk and pack the saved artifact into a zip archive.

    Args:
        model: Object exposing a `save(path)` method (e.g. the trained model).
        model_filename: Path the model is saved to.
        zip_filename: Path of the zip archive to create (overwritten if it
            already exists).

    The archive stores the saved model under its base name. If `save`
    produced a directory instead of a single file, all files under that
    directory are added with paths relative to its parent.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    # Save the model
    model.save(model_filename)
    print(f"Model saved to {model_filename}")

    # Zip the saved artifact (previously the function stopped after saving,
    # leaving `zip_filename` unused)
    with zipfile.ZipFile(zip_filename, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        if os.path.isdir(model_filename):
            parent = os.path.dirname(os.path.abspath(model_filename))
            for folder, _subdirs, files in os.walk(model_filename):
                for fname in files:
                    full_path = os.path.abspath(os.path.join(folder, fname))
                    archive.write(full_path, arcname=os.path.relpath(full_path, parent))
        else:
            archive.write(model_filename, arcname=os.path.basename(model_filename))
    print(f"Model archived to {zip_filename}")

#export_and_zip(model)