In [9]:
import pandas as pd
import numpy as np
import zipfile
import re
from sklearn.metrics import f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

In [11]:
# Model and training configuration step
MAX_VOCAB = 10000        # I put to limit the vocab size to only the first 10,000 words 
MAX_LEN = 100            # pads all the sequences to 100 tokens
EMBED_DIM = 64           # Embeds the output size
BATCH_SIZE = 32
EPOCHS = 5

# All of the emotion labels that are used for application to each text
LABELS = ['admiration', 'amusement', 'gratitude', 'love', 'pride', 'relief', 'remorse']

# Function used to clean input text and remove any special characters 
def clean_text(text):
    text = text.lower()  # converts all text to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # removes any URLs within the text inputs
    text = re.sub(r"@\w+", "", text)  # removes any mentions like seen in discord
    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", "", text)  # remove emojis/special chars
    text = re.sub(r"\s+", " ", text).strip()  # remove extra whitespace
    return text

In [13]:
# Loads the training CSV and cleans the text within each input
train_df = pd.read_csv("train.csv")
train_df['text'] = train_df['text'].astype(str).apply(clean_text)

# Tokenizes the text and prepares the input features
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['text'].tolist())
train_sequences = tokenizer.texts_to_sequences(train_df['text'].tolist())
X_train = pad_sequences(train_sequences, maxlen=MAX_LEN, padding='post', truncating='post')

# Extracts the label matrix for multi-hot encoding
y_train = train_df[LABELS].values

In [14]:
# Creates the sequential Bidirectional Long-Short Term Memory model
model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM),
    Bidirectional(LSTM(64)),  # I input it as Bidirectional for better sequence understanding within the context
    Dense(64, activation='relu'),
    Dense(len(LABELS), activation='sigmoid')  # Sigmoid for interpreting outputs as probabilities
])

# Compile the model using binary crossentropy 
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [15]:
# Defined an early stopping to prevent overfitting since I noticed it was overfitting significantly at higher epochs
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train the model using a 90/10 split where 90% is training and 10% is validation
model.fit(
    X_train, y_train,
    validation_split=0.1,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/5
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 45ms/step - accuracy: 0.5844 - loss: 0.2250 - val_accuracy: 0.6401 - val_loss: 0.1223
Epoch 2/5
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 45ms/step - accuracy: 0.6018 - loss: 0.0883 - val_accuracy: 0.8063 - val_loss: 0.0810
Epoch 3/5
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 44ms/step - accuracy: 0.6846 - loss: 0.0606 - val_accuracy: 0.7075 - val_loss: 0.0799
Epoch 4/5
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 43ms/step - accuracy: 0.6762 - loss: 0.0498 - val_accuracy: 0.5254 - val_loss: 0.0858
Epoch 5/5
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 42ms/step - accuracy: 0.6702 - loss: 0.0391 - val_accuracy: 0.6429 - val_loss: 0.0967


<keras.src.callbacks.history.History at 0x175ccf89910>

In [21]:
#This portion is to check the F1 scores of the model through the dev.csv dataset
try:
    # Loads and cleans the development set
    dev_df = pd.read_csv("dev.csv")
    dev_df['text'] = dev_df['text'].astype(str).apply(clean_text)
    
    # Prepares the dev inputs
    dev_sequences = tokenizer.texts_to_sequences(dev_df['text'].tolist())
    X_dev = pad_sequences(dev_sequences, maxlen=MAX_LEN, padding='post', truncating='post')
    y_dev = dev_df[LABELS].values

    # Predicts the probabilities and threshold at 0.5
    dev_preds = model.predict(X_dev)
    dev_binary_preds = (dev_preds > 0.5).astype(int)

    # Provides the calculations of the micro and macro F1 scores
    micro_f1 = f1_score(y_dev, dev_binary_preds, average='micro')
    macro_f1 = f1_score(y_dev, dev_binary_preds, average='macro')

    print(f"Dev Set Micro F1 Score: {micro_f1:.4f}")
    print(f"Dev Set Macro F1 Score: {macro_f1:.4f}")
except FileNotFoundError:
    print("dev.csv not found. Skipping evaluation.")


[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
Dev Set Micro F1 Score: 0.8135
Dev Set Macro F1 Score: 0.5966


In [23]:
# Loads and cleans the test set
test_df = pd.read_csv("test.csv")
test_df['text'] = test_df['text'].astype(str).apply(clean_text)

# Tokenizes and pads the test data
test_sequences = tokenizer.texts_to_sequences(test_df['text'].tolist())
X_test = pad_sequences(test_sequences, maxlen=MAX_LEN, padding='post', truncating='post')

# Predicts the probabilities and converts them to binary labels
test_preds = model.predict(X_test)
test_binary_preds = (test_preds > 0.5).astype(int)

# Prepares the final submission dataframe
submission_df = pd.DataFrame(test_binary_preds, columns=LABELS)
submission_df.insert(0, "text", test_df["text"])
submission_df.to_csv("submission.csv", index=False)


[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step


In [25]:
# Zips the CSV into submission.zip for proper formatting to submit to codabench
with zipfile.ZipFile("submission.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
    zipf.write("submission.csv")

print("Submission saved as 'submission.zip'")


Submission saved as 'submission.zip'
