In [None]:
import os
# Print the full path of every file found anywhere below /kaggle/input.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# ML Models' Pipeline

In [None]:
# Importing necessary libraries :)
import os
import re

import emoji
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    multilabel_confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from tqdm import tqdm

In [None]:
# Fetch NLTK's "stopwords" corpus; the stopwords.words(...) calls further down
# in this notebook read from it.
nltk.download('stopwords')

In [None]:
# Language codes to process. solver_function iterates over this list and uses
# each code as the base name of the per-language CSV files ("<code>.csv").
# NOTE(review): these look like ISO 639-3 codes (Algerian Arabic, Amharic,
# Hausa, Oromo, Somali) -- confirm against the dataset description.
languages = ["arq", "amh", "hau", "orm", "som"]

In [None]:
# Stopword set per language code, built from NLTK corpora.
# Only "arq" uses a language-specific list (NLTK's 'arabic'); every other code
# is mapped to NLTK's 'english' list.
# NOTE(review): presumably a fallback because no dedicated list was available
# for those languages -- confirm that filtering English words is intended.
stopwords_dict = {
    lang_code: set(stopwords.words(corpus_name))
    for lang_code, corpus_name in (
        ("arq", 'arabic'),
        ("amh", 'english'),
        ("hau", 'english'),
        ("orm", 'english'),
        ("som", 'english'),
    )
}

In [None]:
# Text preprocessing function
def preprocess_text(text, lang="English"):
    """Normalise one piece of text before vectorisation.

    Steps, in order: remove emojis, delete every character that is neither a
    word character nor whitespace, lowercase, split on whitespace and, when
    ``lang`` is a key of ``stopwords_dict``, drop the tokens contained in that
    language's stopword set.

    Args:
        text: The string to clean.
        lang: Key into ``stopwords_dict``. The default ``"English"`` is not one
            of the keys defined in this notebook, so calling without ``lang``
            performs no stopword removal.

    Returns:
        The remaining tokens joined by single spaces.
    """
    emoji_free = emoji.replace_emoji(text, replace="")
    # Punctuation is deleted (not replaced by a space), so tokens separated
    # only by punctuation end up merged into one token.
    stripped = re.sub(r'[^\w\s]', '', emoji_free)
    tokens = stripped.lower().split()

    # Unknown language: nothing to filter.
    if lang not in stopwords_dict:
        return " ".join(tokens)

    blocked = stopwords_dict[lang]
    return " ".join(token for token in tokens if token not in blocked)

In [None]:
def train_models(X_train, y_train, X_dev, y_dev, X_test, test_df, model_params, lang,
                 label_names=("anger", "disgust", "fear", "joy", "sadness", "surprise")):
    """Train four multi-label baselines for one language and save test predictions.

    Each base estimator (Logistic Regression, Random Forest, SVM, Naive Bayes)
    is wrapped in ``MultiOutputClassifier``, i.e. one independent binary
    classifier is fitted per label column. For every model the dev set is
    scored and the test-set predictions are written to
    ``predictions_<lang>_<Model_Name>_test.csv`` in the working directory.

    Args:
        X_train, X_dev, X_test: Feature matrices for the three splits.
        y_train, y_dev: Label arrays with one column per entry of
            ``label_names`` (assumed to be 0/1 indicators -- TODO confirm
            against the dataset). ``y_dev`` must support ``.argmax(axis=1)``.
        test_df: DataFrame providing the ``id`` column for the output file;
            its row order must match ``X_test``.
        model_params: Mapping with the keys ``"Logistic Regression"``
            (``max_iter``), ``"Random Forest"`` (``n_estimators``,
            ``max_depth`` and optionally ``random_state``), ``"SVM"``
            (``kernel``, ``C``) and ``"Naive Bayes"`` (``alpha``).
        lang: Language code, used only in the output file name.
        label_names: Column names for the prediction file and the
            classification report, in the same order as the label columns.

    Returns:
        dict mapping model name to a dict with the keys ``"model"``,
        ``"f1_score"`` (macro F1 on dev), ``"classification_report"``,
        ``"confusion_matrix"`` (legacy, argmax-based) and
        ``"multilabel_confusion_matrix"`` (one 2x2 matrix per label).
    """
    label_names = list(label_names)
    rf_params = model_params["Random Forest"]
    models = {
        "Logistic Regression": LogisticRegression(max_iter=model_params["Logistic Regression"]["max_iter"]),
        "Random Forest": RandomForestClassifier(
            n_estimators=rf_params["n_estimators"],
            max_depth=rf_params["max_depth"],
            # Seed the forest so repeated runs give the same predictions;
            # callers may override the seed through model_params.
            random_state=rf_params.get("random_state", 42),
        ),
        "SVM": SVC(kernel=model_params["SVM"]["kernel"], C=model_params["SVM"]["C"]),
        "Naive Bayes": MultinomialNB(alpha=model_params["Naive Bayes"]["alpha"])
    }
    results = {}
    for model_name, model in models.items():
        print(f"🔹 Training {model_name}...")
        multi_model = MultiOutputClassifier(model)
        multi_model.fit(X_train, y_train)

        # zero_division=0 yields the same scores as the default but makes the
        # handling of labels that are never predicted explicit and warning-free.
        y_dev_pred = multi_model.predict(X_dev)
        f1 = f1_score(y_dev, y_dev_pred, average='macro', zero_division=0)

        y_test_pred = multi_model.predict(X_test)
        y_test_pred_df = pd.DataFrame(y_test_pred, columns=label_names)
        # Put the sample id first so the file reads "id, <labels...>".
        y_test_pred_df.insert(0, "id", test_df['id'].values)

        # Save predictions
        predictions_filename = f"predictions_{lang}_{model_name.replace(' ', '_')}_test.csv"
        y_test_pred_df.to_csv(predictions_filename, index=False)

        results[model_name] = {
            "model": multi_model,
            "f1_score": f1,
            "classification_report": classification_report(
                y_dev, y_dev_pred, target_names=label_names, zero_division=0
            ),
            # Legacy single-label view kept for backward compatibility: argmax
            # keeps only the first active label per row and maps rows with no
            # active label to column 0, so it misrepresents multi-label data.
            "confusion_matrix": confusion_matrix(y_dev.argmax(axis=1), y_dev_pred.argmax(axis=1)),
            # Proper multi-label view: one [[TN, FP], [FN, TP]] matrix per label.
            "multilabel_confusion_matrix": multilabel_confusion_matrix(y_dev, y_dev_pred),
        }
    return results

In [None]:
def solver_function(model_params, train_dir="train_folder_path",
                    dev_dir="validation_folder_path", test_dir="test_folder_path"):
    """Run the full pipeline for every language in ``languages``.

    For each language code the function loads ``<code>.csv`` from the train,
    validation and test directories, cleans the ``text`` column with
    ``preprocess_text``, builds TF-IDF features (vocabulary fitted on the
    training split only) and hands everything to ``train_models``, which trains
    the classifiers and writes the prediction files. Languages for which any of
    the three CSV files is missing are skipped.

    Args:
        model_params: Hyperparameter mapping forwarded to ``train_models``.
        train_dir: Directory holding the training CSVs.
        dev_dir: Directory holding the validation CSVs.
        test_dir: Directory holding the test CSVs.
            The defaults are the placeholder directory names this notebook
            shipped with; pass the real locations to run on actual data.

    Returns:
        dict mapping language code to the result dict returned by
        ``train_models``. Skipped languages are absent.
    """
    emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise"]
    results = {}
    for lang in languages:
        print(f"\n🔹 Processing {lang}...")
        train_path = os.path.join(train_dir, f"{lang}.csv")
        dev_path = os.path.join(dev_dir, f"{lang}.csv")
        test_path = os.path.join(test_dir, f"{lang}.csv")

        if not all(os.path.exists(path) for path in (train_path, dev_path, test_path)):
            # This branch skips the language, so the message must say so.
            print(f"Missing dataset for {lang}, skipping...")
            continue

        train_df = pd.read_csv(train_path)
        dev_df = pd.read_csv(dev_path)
        test_df = pd.read_csv(test_path)

        # Preprocessing text columns. Missing text is read as NaN; replace it
        # with an empty string first, otherwise str(NaN) would inject the
        # literal token "nan" into the vocabulary.
        for split_df in (train_df, dev_df, test_df):
            split_df['text'] = (
                split_df['text']
                .fillna("")
                .astype(str)
                .apply(lambda raw: preprocess_text(raw, lang))
            )

        y_train = train_df[emotion_labels].values
        y_dev = dev_df[emotion_labels].values

        # Text Vectorization: fit on train only so dev/test vocabulary does
        # not leak into the features.
        vectorizer = TfidfVectorizer(max_features=10512)
        X_train = vectorizer.fit_transform(train_df['text']).toarray()
        X_dev = vectorizer.transform(dev_df['text']).toarray()
        X_test = vectorizer.transform(test_df['text']).toarray()

        # Train models and save predictions
        lang_results = train_models(X_train, y_train, X_dev, y_dev, X_test, test_df, model_params, lang)
        results[lang] = lang_results
    return results

In [None]:
# Hyperparameters per classifier, keyed by the model names that train_models
# looks up.
model_params = {
    "Logistic Regression": dict(max_iter=256),
    "Random Forest": dict(n_estimators=120, max_depth=12),
    "SVM": dict(kernel="rbf", C=2),  # "linear" was also used as the kernel
    "Naive Bayes": dict(alpha=1.0),
}

In [None]:
# Driver: run the pipeline for every configured language.
# `predictions` maps each processed language code to its per-model results
# (fitted model and dev-set metrics); the prediction CSVs themselves are
# written to disk while the pipeline runs.
if __name__ == "__main__":
    predictions = solver_function(model_params)
    if predictions:
        print("\n Predictions saved!!!")
    else:
        # Every language was skipped, so nothing was written; do not claim success.
        print("\n No predictions saved: no language had train, validation and test files.")