In [None]:
import os
# Print the full path of every file found anywhere below /kaggle/input.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# DL Models' Pipeline

In [None]:
# Importing necessary libraries :)
import pandas as pd
import numpy as np
import os
import re
import emoji
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, Bidirectional, GRU, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Bidirectional, LSTM, GRU, BatchNormalization

In [None]:
# Make sure the NLTK stopword corpus is available.
# Look for an installed copy first so that re-running the notebook does not
# hit the download server again; only fetch the corpus when the lookup fails.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [None]:
# Language codes handled by the pipeline. Each code selects its own
# train/validation/test CSV files and its stopword list further down.
# (Presumably ISO 639-3 codes: Algerian Arabic, Amharic, Hausa, Oromo,
# Somali - confirm against the dataset documentation.)
languages = [
    "arq",
    "amh",
    "hau",
    "orm",
    "som",
]

In [None]:
# Stopword sets keyed by language code.
# Each code is mapped to the name of the NLTK stopword list to load for it;
# only "arq" uses the Arabic list, every other code uses the English list.
# NOTE(review): the English list looks like a stand-in for languages that
# have no list of their own in NLTK - confirm this is intended.
_stopword_corpus_by_lang = {
    "arq": 'arabic',
    "amh": 'english',
    "hau": 'english',
    "orm": 'english',
    "som": 'english',
}
stopwords_dict = {
    lang_code: set(stopwords.words(corpus_name))
    for lang_code, corpus_name in _stopword_corpus_by_lang.items()
}

In [None]:
# Text preprocessing function
def preprocess_text(text, lang="English"):
    """Clean a single text string before tokenisation.

    The steps run in this order: emojis are removed, every character that is
    neither a word character nor whitespace is deleted, the text is
    lowercased and split on whitespace, and finally stopwords are dropped.

    Args:
        text: The string to clean.
        lang: Key looked up in `stopwords_dict`. When the key is absent
            (which includes the default, given the keys defined in this
            notebook) no stopword filtering happens.

    Returns:
        The remaining tokens joined by single spaces.
    """
    without_emoji = emoji.replace_emoji(text, replace="")
    without_punctuation = re.sub(r'[^\w\s]', '', without_emoji)
    tokens = without_punctuation.lower().split()

    # No stopword list for this language: keep every token.
    if lang not in stopwords_dict:
        return " ".join(tokens)

    blocked = stopwords_dict[lang]
    return " ".join(token for token in tokens if token not in blocked)

In [None]:
# Model training function
def train_model(X_train, y_train, X_dev, y_dev, model_type="GRU",
                vocab_size=10000, embedding_dim=128, max_length=100,
                num_labels=6, epochs=45, batch_size=32, learning_rate=0.0001):
    """Build, compile and fit one multi-label text classifier.

    Every architecture shares the same front end (Embedding followed by
    SpatialDropout1D) and the same classification head (Flatten, two
    Dense/Dropout pairs, sigmoid output). `model_type` selects the feature
    extractor placed between them.

    Args:
        X_train: Padded integer token sequences used for fitting. The caller
            in this notebook pads them to 100 tokens, matching `max_length`.
        y_train: Binary label matrix with one column per label.
        X_dev: Validation sequences, same layout as `X_train`.
        y_dev: Validation labels, same layout as `y_train`.
        model_type: One of "CNN", "BiLSTM+CNN", "BiLSTM+BiGRU" or "GRU".
        vocab_size: Input dimension of the embedding layer.
        embedding_dim: Output dimension of the embedding layer.
        max_length: Sequence length passed to the embedding layer.
        num_labels: Number of sigmoid output units.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used by `fit`.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The fitted Keras `Sequential` model.

    Raises:
        ValueError: If `model_type` is not one of the supported names.
    """
    # Validate first: previously an unknown name fell through every branch and
    # produced a model without any feature extractor or Flatten layer.
    if model_type not in _FEATURE_EXTRACTORS:
        supported = ", ".join(sorted(_FEATURE_EXTRACTORS))
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of: {supported}"
        )

    model = Sequential()
    # NOTE(review): newer Keras releases deprecate `input_length`; confirm
    # against the installed version before removing it.
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(SpatialDropout1D(0.2))

    # Architecture-specific feature extractor.
    _FEATURE_EXTRACTORS[model_type](model)

    # First dense block (identical for every architecture).
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))

    # Second dense block and the multi-label output layer.
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_labels, activation='sigmoid'))

    # With several independent sigmoid outputs the generic 'accuracy' alias
    # may be resolved to categorical accuracy depending on the Keras version,
    # so the per-label metric is named explicitly.
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=learning_rate),
        metrics=['binary_accuracy'],
    )

    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_dev, y_dev),
        verbose=1,
    )
    return model


def _add_conv_stack(model):
    """Append the two-stage Conv1D/MaxPooling1D extractor used by the CNN variants."""
    model.add(Conv1D(128, 5, activation='relu'))  # other activations can be used here
    model.add(MaxPooling1D(2))
    model.add(BatchNormalization())
    model.add(Conv1D(64, 3, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.3))


def _add_bilstm_conv_stack(model):
    """Append a bidirectional LSTM followed by the shared convolutional stack."""
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Dropout(0.3))
    _add_conv_stack(model)


def _add_bilstm_bigru_stack(model):
    """Append one bidirectional LSTM followed by two bidirectional GRU layers."""
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Dropout(0.3))
    model.add(Bidirectional(GRU(128, return_sequences=True)))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Bidirectional(GRU(64, return_sequences=True)))
    model.add(Dropout(0.3))


def _add_gru_stack(model):
    """Append three stacked GRU layers of decreasing width."""
    model.add(GRU(128, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(GRU(64, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(GRU(32, return_sequences=True))
    model.add(Dropout(0.3))


# Maps each supported `model_type` to the helper that adds its feature extractor.
_FEATURE_EXTRACTORS = {
    "CNN": _add_conv_stack,
    "BiLSTM+CNN": _add_bilstm_conv_stack,
    "BiLSTM+BiGRU": _add_bilstm_bigru_stack,
    "GRU": _add_gru_stack,
}

In [None]:
# Load datasets, train models, and make predictions for all languages
def solver_function(langs=None, data_dir=""):
    """Train every model on every language and save test-set predictions.

    For each language code the function reads `train_path_{lang}.csv`,
    `validation_path_{lang}.csv` and `test_path_{lang}.csv` from `data_dir`,
    preprocesses the `text` column, fits a tokenizer on the training text and
    trains each architecture through `train_model`. Test probabilities are
    thresholded at 0.5 and written to `{lang}_{model_name}_predictions.csv`
    in the current working directory.

    Args:
        langs: Iterable of language codes to process. Defaults to the
            module-level `languages` list.
        data_dir: Directory containing the CSV files. The default empty
            string keeps the file names relative to the working directory.

    Returns:
        A dict mapping each processed language code to a dict of
        `model_name -> DataFrame`. Every frame holds the test `id`, the
        preprocessed `text` and one 0/1 column per emotion label. Languages
        with a missing CSV file are skipped and do not appear in the result.
    """
    max_length = 100
    vocab_size = 10000
    emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise"]
    model_names = ["CNN", "BiLSTM+CNN", "BiLSTM+BiGRU"]
    lang_codes = languages if langs is None else langs

    results = {}

    for lang in lang_codes:
        print(f"\n Processing {lang}..")

        split_paths = {
            "train": os.path.join(data_dir, f"train_path_{lang}.csv"),
            "dev": os.path.join(data_dir, f"validation_path_{lang}.csv"),
            "test": os.path.join(data_dir, f"test_path_{lang}.csv"),
        }
        if not all(os.path.exists(path) for path in split_paths.values()):
            # The language is skipped, so the message must say so.
            print(f"Missing dataset for {lang}, skipping...")
            continue

        # Load each split and clean its text column.
        frames = {}
        for split_name, path in split_paths.items():
            frame = pd.read_csv(path)
            frame['text'] = frame['text'].apply(
                lambda raw: preprocess_text(str(raw), lang)
            )
            frames[split_name] = frame
        train_df, dev_df, test_df = frames["train"], frames["dev"], frames["test"]

        # Extract labels
        y_train = train_df[emotion_labels].values
        y_dev = dev_df[emotion_labels].values

        # Tokenization & padding; the vocabulary comes from the training text only.
        tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
        tokenizer.fit_on_texts(train_df['text'])

        def encode(texts):
            """Convert texts to post-padded integer sequences of `max_length`."""
            sequences = tokenizer.texts_to_sequences(texts)
            return pad_sequences(sequences, maxlen=max_length, padding='post')

        X_train = encode(train_df['text'])
        X_dev = encode(dev_df['text'])
        X_test = encode(test_df['text'])

        lang_results = {}
        for model_name in model_names:
            print(f"\n🔹 Training {model_name} for {lang}...")
            # Many models are built in this loop; drop the global Keras state
            # left by the previous one so memory use does not keep growing.
            tf.keras.backend.clear_session()
            model = train_model(X_train, y_train, X_dev, y_dev, model_type=model_name)

            # A label is predicted when its sigmoid output exceeds 0.5.
            y_pred = model.predict(X_test)
            predictions = (y_pred > 0.5).astype(int)

            # Save predictions next to the id and (preprocessed) text columns.
            pred_df = test_df[['id', 'text']].copy()
            pred_df[emotion_labels] = predictions
            pred_df.to_csv(f"{lang}_{model_name}_predictions.csv", index=False)
            lang_results[model_name] = pred_df

        results[lang] = lang_results

    return results

In [None]:
# Driver: run the whole pipeline and report whether anything was produced.
if __name__ == "__main__":
    predictions = solver_function()
    if predictions:
        print("\n Predictions saved!!!")
    else:
        # solver_function skips a language when one of its CSV files is
        # missing, so an empty result means no prediction file was written.
        print("\n No predictions were saved: no language had a complete dataset.")