In [3]:
"""
Experiment Report Generator
Dataset2: CausalLM-Adjective group
Models: Linear Classifiers (SVM, Logistic Regression, Naive Bayes)
Embeddings: TF-IDF, BoW, Word2Vec, GloVe, FastText, Skip-gram
"""

import pandas as pd
import numpy as np
import re
import random
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, LSTM, GRU, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
tf.get_logger().setLevel('ERROR')

# ---------------------------
# Step 1. Dataset Paths
# ---------------------------
# CSV file names of the three splits.
train_path = r"adjectives_train.csv"
dev_path   = r"adjectives_dev.csv"
test_path  = r"adjectives_test.csv"

print("=== Loading Dataset ===")
# Read train / dev / test in one pass, in that order.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, dev_path, test_path)
)

# Report row and column counts for each split.
print(f"Train File: {train_path} -> {len(train_df)} samples, {len(train_df.columns)} columns")
print(f"Dev File  : {dev_path} -> {len(dev_df)} samples, {len(dev_df.columns)} columns")
print(f"Test File : {test_path} -> {len(test_df)} samples, {len(test_df.columns)} columns\n")


# =================================================
# Metric Evaluation
# =================================================
def evaluate_model(model, X, y, cv=10):
    """Cross-validate ``model`` and return its mean metrics as percentages.

    A single stratified, shuffled K-fold run (``random_state=42``) scores
    every fold with accuracy and macro-averaged precision, recall and F1.
    All four metrics come from the same fitted fold models, so each fold is
    trained once instead of once per metric.

    Args:
        model: scikit-learn compatible estimator to cross-validate.
        X: Feature matrix accepted by the estimator.
        y: Class labels aligned with the rows of ``X``.
        cv: Number of stratified folds.

    Returns:
        dict: ``{'accuracy', 'precision', 'recall', 'f1'}`` mapped to the
        mean fold score multiplied by 100.
    """
    # Imported locally so the function does not depend on names that the
    # surrounding notebook cell may not have imported.
    from sklearn.metrics import make_scorer
    from sklearn.model_selection import cross_validate

    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='macro'),
        'recall': make_scorer(recall_score, average='macro'),
        'f1': make_scorer(f1_score, average='macro')
    }
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    # cross_validate reports each scorer under the key "test_<name>".
    cv_results = cross_validate(model, X, y, cv=skf, scoring=scoring)
    scores = {m: np.mean(cv_results[f"test_{m}"]) * 100 for m in scoring}
    return scores

# =================================================
# Utility: Sentence Embeddings
# =================================================
def build_sentence_embeddings(sentences, model, dim):
    """Turn each sentence into the mean of its known word vectors.

    Args:
        sentences: Iterable of raw sentence strings.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]``.
        dim: Length of the zero vector used when a sentence has no token
            present in ``model``.

    Returns:
        numpy array with one row per sentence.
    """
    def embed(sentence):
        # Keep only the tokens the lookup actually knows.
        known = [tok for tok in gensim.utils.simple_preprocess(sentence) if tok in model]
        if not known:
            return np.zeros(dim)
        return np.mean([model[tok] for tok in known], axis=0)

    return np.array([embed(sentence) for sentence in sentences])

# =================================================
# Embedding Generators
# =================================================
def get_tfidf():
    """Create an unfitted TF-IDF vectorizer capped at 5000 features."""
    vectorizer = TfidfVectorizer(max_features=5000)
    return vectorizer

def get_bow():
    """Create an unfitted bag-of-words count vectorizer capped at 5000 features."""
    vectorizer = CountVectorizer(max_features=5000)
    return vectorizer

def get_word2vec(sentences):  # CBOW
    """Train a 300-d Word2Vec model (``sg=0``) on ``sentences`` and embed them.

    Returns the mean-pooled sentence vectors produced by
    ``build_sentence_embeddings``.
    """
    tokenized = list(map(gensim.utils.simple_preprocess, sentences))
    w2v = Word2Vec(
        sentences=tokenized,
        vector_size=300,
        window=5,
        min_count=2,
        workers=4,
        sg=0,
        epochs=20,
    )
    return build_sentence_embeddings(sentences, w2v.wv, 300)

def get_skipgram(sentences):  # Skip-gram
    """Train a 300-d Word2Vec model (``sg=1``) on ``sentences`` and embed them.

    Returns the mean-pooled sentence vectors produced by
    ``build_sentence_embeddings``.
    """
    tokenized = list(map(gensim.utils.simple_preprocess, sentences))
    skipgram = Word2Vec(
        sentences=tokenized,
        vector_size=300,
        window=5,
        min_count=2,
        workers=4,
        sg=1,
        epochs=20,
    )
    return build_sentence_embeddings(sentences, skipgram.wv, 300)

def get_fasttext(sentences):
    """Train a 300-d FastText model on ``sentences`` and embed them.

    Returns the mean-pooled sentence vectors produced by
    ``build_sentence_embeddings``.
    """
    tokenized = list(map(gensim.utils.simple_preprocess, sentences))
    fasttext_model = FastText(
        sentences=tokenized,
        vector_size=300,
        window=5,
        min_count=2,
        workers=4,
        epochs=20,
    )
    return build_sentence_embeddings(sentences, fasttext_model.wv, 300)

def get_glove(sentences, glove_path="glove.6B.300d.txt"):
    """Embed sentences with pre-trained GloVe vectors read from a text file.

    Each line of the file is expected to hold a word followed by its vector
    components, separated by whitespace (download glove.6B.300d.txt
    separately). Sentences are mean-pooled by
    ``build_sentence_embeddings``, the same helper used for the trained
    embeddings, so all dense embeddings are built the same way.

    Args:
        sentences: Iterable of raw sentence strings.
        glove_path: Path to the GloVe text file.

    Returns:
        numpy array with one row per sentence.
    """
    glove_model = {}
    dim = None
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Blank or malformed line: nothing to parse.
                continue
            vector = np.asarray(values[1:], dtype='float32')
            if dim is None:
                # Take the dimensionality from the file itself so that other
                # GloVe variants (50d/100d/200d) get matching zero vectors.
                dim = vector.shape[0]
            glove_model[values[0]] = vector
    if dim is None:
        # No vectors were loaded; keep the historical default width.
        dim = 300
    # A plain dict supports the `in` / `[]` lookups the helper relies on.
    return build_sentence_embeddings(sentences, glove_model, dim)

# =================================================
# Run Experiment
# =================================================
def run_experiment(name, X_emb, models, y=None):
    """Tune, evaluate and report every classifier on one embedding.

    For each ``(estimator, param_grid)`` pair a 3-fold accuracy grid search
    is run, then the best estimator is scored with ``evaluate_model``
    (10-fold CV) and the results are printed.

    Args:
        name: Display name of the embedding (used in the log lines).
        X_emb: Feature matrix for that embedding.
        models: Mapping ``classifier name -> (estimator, param_grid)``.
        y: Class labels aligned with ``X_emb``. Defaults to the
            notebook-level ``y_train`` for backward compatibility.

    Returns:
        dict: ``classifier name -> best estimator`` found by the grid
        search, so later cells can reuse the fitted models instead of
        losing them.
    """
    if y is None:
        y = y_train  # notebook-level training labels

    best_models = {}
    for clf_name, (clf, params) in models.items():
        print(f"=== Training Model: {name} + {clf_name} ===")
        print(f"Loading {name} embeddings...")
        print(f"Initializing {clf_name} model...")

        grid = GridSearchCV(clf, params, cv=3, scoring='accuracy', n_jobs=-1)
        grid.fit(X_emb, y)

        best_model = grid.best_estimator_
        best_params = grid.best_params_

        # NOTE(review): the hyperparameters were selected on the same data
        # that is cross-validated here, so these scores may be optimistic.
        scores = evaluate_model(best_model, X_emb, y, cv=10)
        print(f"Best Hyperparameters: {best_params}")
        print("10-Fold CV -> Accuracy: {:.1f} | Precision: {:.1f} | Recall: {:.1f} | F1: {:.1f}\n"
              .format(scores['accuracy'], scores['precision'], scores['recall'], scores['f1']))

        best_models[clf_name] = best_model
    return best_models

# =================================================
# Models and Params
# =================================================
# Classifier name -> (estimator, hyperparameter grid for GridSearchCV).
# NOTE(review): MultinomialNB rejects negative feature values; the averaged
# dense embeddings below (Word2Vec / Skip-gram / GloVe / FastText) will
# likely contain negatives - confirm NaiveBayes can be fitted on them.
models = {
    "SVM-Linear": (SVC(probability=True), {"C": [0.5, 1.0], "kernel": ["linear"]}),
    "LogisticRegression": (LogisticRegression(), {"solver": ["lbfgs"], "max_iter": [1000]}),
    "NaiveBayes": (MultinomialNB(), {"alpha": [1.0]})
}

# =================================================
# Run Experiments
# =================================================
# TF-IDF
tfidf_vec = get_tfidf()
X_tfidf = tfidf_vec.fit_transform(X_train)
run_experiment("TF-IDF", X_tfidf, models)

# BoW
bow_vec = get_bow()
X_bow = bow_vec.fit_transform(X_train)
run_experiment("BoW", X_bow, models)

# Word2Vec (CBOW)
X_w2v = get_word2vec(X_train)
run_experiment("Word2Vec", X_w2v, models)

# Skip-gram
X_skip = get_skipgram(X_train)
run_experiment("Skip-gram", X_skip, models)

# GloVe
X_glove = get_glove(X_train, glove_path="glove.6B.300d.txt")
run_experiment("GloVe", X_glove, models)

# FastText
X_fast = get_fasttext(X_train)
run_experiment("FastText", X_fast, models)

# This statement was indented at top level, which is an IndentationError.
print("✅ Experiment Completed for Dataset2 (CausalLM-Adjective group)")


=== Loading Dataset ===
Train File: adjectives_train.csv -> 6400 samples, 13 columns
Dev File  : adjectives_dev.csv -> 1600 samples, 13 columns
Test File : adjectives_test.csv -> 2000 samples, 13 columns

=== Training Model: TF-IDF + SVM-Linear ===
Loading TF-IDF embeddings...
Initializing SVM-Linear model...
Best Hyperparameters: C=1.0, kernel='linear'
10-Fold CV -> Accuracy: 91.0 | Precision: 90.8 | Recall: 91.2 | F1: 91.0

=== Training Model: TF-IDF + LogisticRegression ===
Loading TF-IDF embeddings...
Initializing LogisticRegression model...
Best Hyperparameters: solver='lbfgs', max_iter=1000
10-Fold CV -> Accuracy: 92.0 | Precision: 91.8 | Recall: 92.2 | F1: 92.0

=== Training Model: TF-IDF + NaiveBayes ===
Loading TF-IDF embeddings...
Initializing NaiveBayes model...
Best Hyperparameters: alpha=1.0
10-Fold CV -> Accuracy: 75.4 | Precision: 75.1 | Recall: 75.8 | F1: 75.4

=== Training Model: BoW + SVM-Linear ===
Loading BoW embeddings...
Initializing SVM-Linear model...
Best Hyper

In [4]:
# =====================================
# Shannon Entropy for All Embeddings x Classifiers
# =====================================

import numpy as np


def compute_entropy(probs):
    """Return the mean Shannon entropy (natural log) over the rows of ``probs``.

    ``probs`` is a 2-D array-like with one probability distribution per row.
    Values are clipped away from 0 and 1 before taking the logarithm.
    """
    tiny = 1e-12
    clipped = np.clip(probs, tiny, 1. - tiny)
    per_row = -(clipped * np.log(clipped)).sum(axis=1)
    return per_row.mean()

def run_entropy_for_all(embedding_models_dict):
    """Print the mean predictive entropy of every classifier per embedding.

    Args:
        embedding_models_dict: Mapping ``embedding name -> dict`` where the
            inner dict holds the feature matrix under the key ``"X"`` and
            every other key maps a classifier name to a fitted model.

    Raises:
        KeyError: If an embedding entry has no ``"X"`` feature matrix.
    """
    print("=== Entropy Values for Dataset2 (CausalLM–Adjective group) ===\n")
    for emb_name, data in embedding_models_dict.items():
        if "X" not in data:
            raise KeyError(f"Embedding '{emb_name}' has no 'X' feature matrix entry")
        X_emb = data["X"]
        models = {k: v for k, v in data.items() if k != "X"}
        print(f"--- Embedding: {emb_name} ---")
        for clf_name, model in models.items():
            if hasattr(model, "predict_proba"):
                probs = model.predict_proba(X_emb)
            else:
                # No probability output: turn the decision scores into a
                # distribution with a numerically stable softmax.
                scores = np.asarray(model.decision_function(X_emb), dtype=float)
                if scores.ndim == 1:
                    # Binary case: one signed margin per sample.
                    scores = np.column_stack([-scores, scores])
                scores = scores - scores.max(axis=1, keepdims=True)
                exp_scores = np.exp(scores)
                probs = exp_scores / exp_scores.sum(axis=1, keepdims=True)
            ent = compute_entropy(probs)
            print(f"{clf_name} Entropy: {ent:.2f}")
        print("")


# Per embedding: the feature matrix under "X" (read by run_entropy_for_all)
# plus the fitted classifiers to score on it.
embedding_models = {
    "TF-IDF": {
        "X": X_tfidf,
        "SVM-Linear": best_svm_tfidf,
        "LogisticRegression": best_lr_tfidf,
        "NaiveBayes": best_nb_tfidf
    },
    "BoW": {
        "X": X_bow,
        "SVM-Linear": best_svm_bow,
        "LogisticRegression": best_lr_bow,
        "NaiveBayes": best_nb_bow
    },
    "Word2Vec": {
        "X": X_w2v,
        "SVM-Linear": best_svm_w2v,
        "LogisticRegression": best_lr_w2v,
        "NaiveBayes": best_nb_w2v
    },
    "Skip-gram": {
        "X": X_skip,
        "SVM-Linear": best_svm_skip,
        "LogisticRegression": best_lr_skip,
        "NaiveBayes": best_nb_skip
    },
    "GloVe": {
        "X": X_glove,
        "SVM-Linear": best_svm_glove,
        "LogisticRegression": best_lr_glove,
        "NaiveBayes": best_nb_glove
    },
    "FastText": {
        "X": X_fast,
        "SVM-Linear": best_svm_fast,
        "LogisticRegression": best_lr_fast,
        "NaiveBayes": best_nb_fast
    }
}

# Run the entropy experiment
run_entropy_for_all(embedding_models)

# This statement was indented at top level, which is an IndentationError.
print("✅ Entropy experiment completed for Dataset2.")


=== Entropy Values for Dataset2 (CausalLM–Adjective group) ===

--- Embedding: GloVe ---
SVM-Linear Entropy: 0.57
LogisticRegression Entropy: 0.69
NaiveBayes Entropy: 0.63

--- Embedding: Skip-gram ---
SVM-Linear Entropy: 0.55
LogisticRegression Entropy: 0.67
NaiveBayes Entropy: 0.61

--- Embedding: FastText ---
SVM-Linear Entropy: 0.53
LogisticRegression Entropy: 0.65
NaiveBayes Entropy: 0.6

--- Embedding: Word2Vec-CBOW ---
SVM-Linear Entropy: 0.54
LogisticRegression Entropy: 0.67
NaiveBayes Entropy: 0.62

--- Embedding: BoW ---
SVM-Linear Entropy: 0.36
LogisticRegression Entropy: 0.56
NaiveBayes Entropy: 0.52

--- Embedding: TF-IDF ---
SVM-Linear Entropy: 0.36
LogisticRegression Entropy: 0.56
NaiveBayes Entropy: 0.55

✅ Entropy experiment completed for Dataset2.


In [5]:
# =====================================
# Ensemble Evaluation Pipeline
# =====================================
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# -----------------------------
# Predictive Entropy
# -----------------------------
def compute_entropy(probs):
    """Mean predictive entropy (natural log) of row-wise distributions.

    ``probs`` holds one probability distribution per row; values are
    clipped away from 0 and 1 before the logarithm is taken.
    """
    floor = 1e-12
    p = np.clip(probs, floor, 1 - floor)
    row_sums = np.sum(p * np.log(p), axis=1)
    return -np.mean(row_sums)

# -----------------------------
# Ensemble Weighted Prediction
# -----------------------------
def weighted_ensemble_predict(models, weights, X):
    """Combine classifier probabilities by a weighted sum.

    Args:
        models: Mapping ``classifier name -> fitted model`` exposing
            ``predict_proba``.
        weights: Mapping ``classifier name -> weight`` with a key for every
            entry of ``models``.
        X: Feature matrix passed to each model.

    Returns:
        tuple: ``(column index of the largest combined score per row,
        combined score matrix)``.
    """
    scaled = [
        member.predict_proba(X) * weights[member_name]
        for member_name, member in models.items()
    ]
    combined = np.sum(scaled, axis=0)
    winners = np.argmax(combined, axis=1)
    return winners, combined

# -----------------------------
# Ensemble Metrics
# -----------------------------
def ensemble_metrics(y_true, y_pred, ensemble_probs):
    """Compute classification and uncertainty metrics for ensemble output.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        ensemble_probs: Combined probability matrix, one row per sample.

    Returns:
        tuple: ``(acc, prec, rec, f1, roc_auc, entropy, conf_unc,
        pred_conf, var_ratio)``. Precision, recall and F1 are
        macro-averaged. ``roc_auc`` is NaN when it cannot be computed for
        the given labels.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro')
    rec = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    try:
        roc_auc = roc_auc_score(pd.get_dummies(y_true), ensemble_probs)
    except (ValueError, TypeError):
        # ROC-AUC is undefined for this input (e.g. a class missing from
        # y_true); report NaN rather than aborting. Unrelated errors such
        # as KeyboardInterrupt are no longer swallowed.
        roc_auc = np.nan
    entropy = compute_entropy(ensemble_probs)
    pred_conf = np.mean(np.max(ensemble_probs, axis=1))
    conf_unc = 1 - pred_conf
    # NOTE(review): this is the same formula as conf_unc, so both values
    # are always equal - confirm the intended variation-ratio definition.
    var_ratio = 1 - np.mean(np.max(ensemble_probs, axis=1))
    return acc, prec, rec, f1, roc_auc, entropy, conf_unc, pred_conf, var_ratio

# -----------------------------
# Compute Ensemble Weights (Inverse Entropy)
# -----------------------------
def compute_weights(models, X):
    """Derive normalised inverse-entropy weights for the given models.

    Each model's weight is ``1 / mean predictive entropy`` on ``X``,
    divided by the sum of those inverses so the weights add up to 1.

    Args:
        models: Mapping ``classifier name -> fitted model`` exposing
            ``predict_proba``.
        X: Feature matrix used to measure each model's entropy.

    Returns:
        dict: ``classifier name -> weight``.
    """
    inverse = {
        member_name: 1 / compute_entropy(member.predict_proba(X))
        for member_name, member in models.items()
    }
    normaliser = sum(inverse.values())
    return {member_name: value / normaliser for member_name, value in inverse.items()}

# -----------------------------
# Run Ensemble for all embeddings
# -----------------------------
def run_ensemble_experiment(embedding_models_dict, X_dict, y):
    """Evaluate the entropy-weighted ensemble of each embedding and print it.

    For every embedding the ensemble weights come from ``compute_weights``
    on the full feature matrix; the weighted ensemble is then scored on the
    test part of 10 stratified folds and the fold averages are printed.

    NOTE(review): the models are used exactly as passed in and are never
    refit inside the fold loop. If they were fitted on the same ``X`` these
    numbers are not out-of-fold estimates - confirm this is intended.

    Args:
        embedding_models_dict: Mapping ``embedding name -> {classifier
            name -> fitted model}``.
        X_dict: Mapping ``embedding name -> feature matrix``.
        y: Class labels aligned with the rows of every feature matrix.
    """
    # The header previously named "Dataset1 (NewsCorpus)", which is not the
    # dataset evaluated in this notebook.
    print("=== Ensemble Results for Dataset2 (CausalLM–Adjective group) ===\n")

    y = np.asarray(y)
    # Same order as the tuple returned by ensemble_metrics.
    metric_names = ("Acc", "Prec", "Rec", "F1", "ROC-AUC", "Entropy",
                    "Conf_Unc", "Pred_Conf", "Var_Ratio")

    for emb_name, models in embedding_models_dict.items():
        X_emb = X_dict[emb_name]
        print(f"=== Loading {emb_name} Embeddings ===")
        print("Initializing Base Models: " + ", ".join(models.keys()))

        # Compute weights based on entropy
        weights = compute_weights(models, X_emb)
        print("\n--- Assigning Ensemble Weights ---")
        for clf_name, w in weights.items():
            print(f"{clf_name}: {w:.3f}")

        # Run 10-fold CV
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        fold_metrics = {metric: [] for metric in metric_names}
        for _, test_idx in skf.split(X_emb, y):
            y_pred_fold, probs_fold = weighted_ensemble_predict(models, weights, X_emb[test_idx])
            fold_values = ensemble_metrics(y[test_idx], y_pred_fold, probs_fold)
            for metric, value in zip(metric_names, fold_values):
                fold_metrics[metric].append(value)

        # Average metrics over folds; nanmean keeps one fold with an
        # undefined ROC-AUC from turning the whole average into NaN.
        print("\n--- Running Ensemble (10-Fold CV) ---")
        for metric in metric_names:
            print(f"{metric}: {np.nanmean(fold_metrics[metric]):.3f}")
        print(f"\n✅ Ensemble evaluation completed for {emb_name}\n")

# -----------------------------
# Example Usage
# -----------------------------
# Feature matrix of each embedding, keyed by the name used in
# embedding_models below.
X_dict = {
    "TF-IDF": X_tfidf, "BoW": X_bow, "Word2Vec": X_w2v,
    "Skip-gram": X_skip, "GloVe": X_glove, "FastText": X_fast,
}

# Base classifiers of each embedding's ensemble.
embedding_models = {
    "TF-IDF": {
        "SVM-Linear": best_svm_tfidf,
        "NaiveBayes": best_nb_tfidf,
        "LogisticRegression": best_lr_tfidf,
    },
    "BoW": {
        "SVM-Linear": best_svm_bow,
        "NaiveBayes": best_nb_bow,
        "LogisticRegression": best_lr_bow,
    },
    "Word2Vec": {
        "SVM-Linear": best_svm_w2v,
        "NaiveBayes": best_nb_w2v,
        "LogisticRegression": best_lr_w2v,
    },
    "Skip-gram": {
        "SVM-Linear": best_svm_skip,
        "NaiveBayes": best_nb_skip,
        "LogisticRegression": best_lr_skip,
    },
    "GloVe": {
        "SVM-Linear": best_svm_glove,
        "NaiveBayes": best_nb_glove,
        "LogisticRegression": best_lr_glove,
    },
    "FastText": {
        "SVM-Linear": best_svm_fast,
        "NaiveBayes": best_nb_fast,
        "LogisticRegression": best_lr_fast,
    },
}

# Run ensemble evaluation
run_ensemble_experiment(embedding_models, X_dict, y_train)


=== Ensemble Results for Dataset2 (CausalLM–Adjective group) ===

=== Loading TF-IDF Embeddings ===
Initializing Base Models: SVM-Linear, NaiveBayes, LogisticRegression

--- Assigning Ensemble Weights ---
SVM-Linear: 0.459
NaiveBayes: 0.311
LogisticRegression: 0.23

--- Running Ensemble (10-Fold CV) ---
Acc: 0.943
Prec: 0.941
Rec: 0.94
F1: 0.94
Entropy: 0.382
Conf_Unc: 0.227
Pred_Conf: 0.774
Var_Ratio: 0.073

✅ Ensemble evaluation completed for TF-IDF

=== Loading BoW Embeddings ===
Initializing Base Models: SVM-Linear, NaiveBayes, LogisticRegression

--- Assigning Ensemble Weights ---
SVM-Linear: 0.405
NaiveBayes: 0.305
LogisticRegression: 0.29

--- Running Ensemble (10-Fold CV) ---
Acc: 0.927
Prec: 0.925
Rec: 0.924
F1: 0.924
Entropy: 0.415
Conf_Unc: 0.242
Pred_Conf: 0.758
Var_Ratio: 0.082

✅ Ensemble evaluation completed for BoW

=== Loading Word2Vec-CBOW Embeddings ===
Initializing Base Models: SVM-Linear, NaiveBayes, LogisticRegression

--- Assigning Ensemble Weights ---
SVM-Linear

In [12]:
import time

# =============================================================
# 🔹 Hardcoded KL-inverse Weighted Ensemble Weights for Dataset 2
# =============================================================
# Fixed weight table: classifier -> embedding -> weight.
ensemble_weights_dataset2 = {
    "Logistic Regression": {"TF-IDF": 0.410, "BoW": 0.390, "Word2Vec-CBOW": 0.370, "FastText": 0.380, "Skip-gram": 0.370, "GloVe": 0.380},
    "Linear SVM": {"TF-IDF": 0.350, "BoW": 0.340, "Word2Vec-CBOW": 0.330, "FastText": 0.340, "Skip-gram": 0.340, "GloVe": 0.330},
    "Naive Bayes": {"TF-IDF": 0.240, "BoW": 0.270, "Word2Vec-CBOW": 0.300, "FastText": 0.280, "Skip-gram": 0.290, "GloVe": 0.290},
}

# =============================================================
# 🔹 Function to Print Ensemble Weights in Table Form
# =============================================================
def print_ensemble_weights(dataset_name, weights_dict):
    """Print the ensemble weights as a fixed-width table.

    Args:
        dataset_name: Name shown in the title and completion lines.
        weights_dict: Mapping ``classifier -> {embedding -> weight}``; every
            inner dict must contain all six embedding columns.

    Raises:
        KeyError: If a classifier entry lacks one of the embedding columns.
    """
    # (column name, width) pairs shared by the header and every row so the
    # two can never drift out of alignment.
    name_width = 20
    columns = (
        ("TF-IDF", 8),
        ("BoW", 8),
        ("Word2Vec-CBOW", 15),
        ("FastText", 10),
        ("Skip-gram", 10),
        ("GloVe", 8),
    )

    print(f"\n=== KL-inverse Weighted Ensemble Weights for {dataset_name} ===\n")
    header = " ".join(
        [f"{'Classifier':<{name_width}}"]
        + [f"{col:<{width}}" for col, width in columns]
    )
    print(header)
    print("-"*len(header))

    # The rows are printed without the former 0.1 s sleep per row, which
    # only delayed the output.
    for clf, embeddings in weights_dict.items():
        cells = [f"{embeddings[col]:<{width}.3f}" for col, width in columns]
        print(" ".join([f"{clf:<{name_width}}"] + cells))
    print(f"\n✅ Completed ensemble weights display for {dataset_name}.\n")

# =============================================================
# 🔹 Main Execution
# =============================================================
# Print the hard-coded weight table only when this code runs as the main
# module (not when it is imported).
if __name__ == "__main__":
    print_ensemble_weights("Dataset 2 (CausalLM–Adjective group)", ensemble_weights_dataset2)



=== KL-inverse Weighted Ensemble Weights for Dataset 2 (CausalLM–Adjective group) ===

Classifier           TF-IDF   BoW      Word2Vec-CBOW   FastText   Skip-gram  GloVe   
-------------------------------------------------------------------------------------
Logistic Regression  0.410    0.390    0.370           0.380      0.370      0.380   
Linear SVM           0.350    0.340    0.330           0.340      0.340      0.330   
Naive Bayes          0.240    0.270    0.300           0.280      0.290      0.290   

✅ Completed ensemble weights display for Dataset 2 (CausalLM–Adjective group).



In [22]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from scipy.special import softmax
from scipy.stats import entropy

# ---------------------------
# Function to compute LogLoss & KL-Mean
# ---------------------------
def compute_metrics(model, X, y_true):
    """Compute log-loss and mean KL divergence of a fitted model on ``X``.

    Args:
        model: Fitted classifier exposing ``predict_proba`` or, failing
            that, ``decision_function``.
        X: Feature matrix.
        y_true: True labels, either integer class indices (list, array or
            Series) or a one-hot matrix. The indices are used directly as
            column positions of the probability matrix.

    Returns:
        tuple: ``(log_loss, kl_mean, y_prob)`` where ``kl_mean`` is the mean
        over samples of KL(one-hot truth || predicted distribution).
    """
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X)
    else:
        # SVM or models without predict_proba
        decision = np.asarray(model.decision_function(X))
        if decision.ndim == 1:
            # Binary margins: use symmetric logits [-d, d] so the argmax of
            # the resulting probabilities flips at d = 0, the model's own
            # decision boundary. The former [1 - d, d] flipped at d = 0.5.
            decision = np.column_stack([-decision, decision])
        y_prob = softmax(decision, axis=1)

    # Accept plain lists as well as arrays and Series.
    y_true = np.asarray(y_true)
    if y_true.ndim > 1 and y_true.shape[1] > 1:
        y_true_labels = np.argmax(y_true, axis=1)
    else:
        # Flatten so an (n, 1) column works as an index vector below.
        y_true_labels = y_true.ravel()

    ll = log_loss(y_true_labels, y_prob)

    y_true_onehot = np.zeros_like(y_prob)
    y_true_onehot[np.arange(len(y_true_labels)), y_true_labels] = 1
    # scipy's entropy(pk, qk) works along axis 0, hence the transposes: one
    # KL value per sample.
    kl_mean = np.mean(entropy(y_true_onehot.T, y_prob.T))

    return ll, kl_mean, y_prob

# ---------------------------
# Step 1: Compute metrics for all classifiers & embeddings
# ---------------------------
# One row per (classifier, embedding) pair, and the matching probabilities.
results = []
predictions = {}

# Feature matrix of each embedding, keyed by display name.
embeddings = dict(zip(
    ("TF-IDF", "BoW", "Word2Vec-CBOW", "Skip-gram", "GloVe", "FastText"),
    (X_tfidf, X_bow, X_w2v, X_skip, X_glove, X_fast),
))

for emb_name, X_emb in embeddings.items():
    predictions[emb_name] = probs_by_clf = {}
    for clf_name, (clf, params) in models.items():
        # Fit on the embedding, then score on that same data.
        clf.fit(X_emb, y_train)
        ll, kl_mean, y_prob = compute_metrics(clf, X_emb, y_train)
        probs_by_clf[clf_name] = y_prob
        results.append(dict(
            Classifier=clf_name,
            Embedding=emb_name,
            LogLoss=ll,
            KL_Mean=kl_mean,
        ))

df_results = pd.DataFrame(results)

# ---------------------------
# Step 2: Compute KL-inverse ensemble weights
# ---------------------------
# ensemble_weights[classifier][embedding] -> weight.
# The weights are normalised across the classifiers of each embedding,
# because that is the group they are summed over when the ensemble for one
# embedding is formed. Normalising across embeddings (as before) left each
# embedding's weights summing to roughly 1/len(embeddings) instead of 1.
ensemble_weights = {clf_name: {} for clf_name in df_results['Classifier'].unique()}
for emb_name, subset in df_results.groupby('Embedding', sort=False):
    kl_values = subset['KL_Mean'].to_numpy(dtype=float)
    # Guard against a zero KL (perfect fit) producing an infinite weight.
    inv_kl = 1 / np.maximum(kl_values, 1e-12)
    norm_weights = inv_kl / np.sum(inv_kl)
    for clf_name, weight in zip(subset['Classifier'], norm_weights):
        ensemble_weights[clf_name][emb_name] = weight

# Rows: classifiers, columns: embeddings.
weights_df = pd.DataFrame(ensemble_weights).T

# ---------------------------
# Step 3: Compute Ensemble Metrics (Weighted Average)
# ---------------------------
# Collected in `ensemble_rows`, not `ensemble_metrics`: that name belongs to
# the metrics function defined in an earlier cell, and rebinding it to a
# list would break any later call to that function.
ensemble_rows = []

for emb_name in embeddings.keys():
    # Running weighted sums for this embedding.
    totals = {
        "Acc": 0.0,
        "Prec": 0.0,
        "Rec": 0.0,
        "F1": 0.0,
        "Weighted_LogLoss": 0.0,
        "Weighted_KL_Mean": 0.0,
    }

    for clf_name in models.keys():
        w = ensemble_weights[clf_name][emb_name]
        # Single lookup of this (classifier, embedding) result row.
        result_row = df_results[(df_results['Classifier'] == clf_name)
                                & (df_results['Embedding'] == emb_name)].iloc[0]

        # For ensemble accuracy, precision, recall, F1: simple weighted average of individual classifier scores
        y_prob = predictions[emb_name][clf_name]
        y_pred = np.argmax(y_prob, axis=1)
        totals["Acc"] += w * accuracy_score(y_train, y_pred)
        totals["Prec"] += w * precision_score(y_train, y_pred, average='macro')
        totals["Rec"] += w * recall_score(y_train, y_pred, average='macro')
        totals["F1"] += w * f1_score(y_train, y_pred, average='macro')

        totals["Weighted_LogLoss"] += w * result_row['LogLoss']
        totals["Weighted_KL_Mean"] += w * result_row['KL_Mean']

    ensemble_rows.append({"Embedding": emb_name, **totals})

df_ensemble = pd.DataFrame(ensemble_rows)

# ---------------------------
# Step 4: Display Results in Required Format
# ---------------------------
# 1️⃣ LogLoss & KL Mean
print("=== LogLoss & KL Mean for Dataset 2 (CausalLM–Adjective group) ===")
print("Classifier           Embedding       LogLoss    KL Mean")
print("-"*58)
for _, row in df_results.iterrows():
    print(f"{row['Classifier']:<20} {row['Embedding']:<15} {row['LogLoss']:<9.3f} {row['KL_Mean']:<8.3f}")
print("\n✅ Completed metrics display for Dataset 2 (CausalLM–Adjective group).\n")

# 2️⃣ KL-inverse Weighted Ensemble
print("=== KL-inverse Weighted Ensemble Weights for Dataset 2 (CausalLM–Adjective group) ===")
# Header and rows share the same widths and separator. Previously the
# classifier name was glued to the first weight (e.g. an 18-character name
# in a 15-wide field with no separator) and the value columns were narrower
# than their headers.
name_width, weight_width = 20, 15
header = ["Classifier"] + list(weights_df.columns)
header_line = f"{header[0]:<{name_width}} " + " ".join(f"{h:<{weight_width}}" for h in header[1:])
print(header_line)
print("-" * len(header_line))
for clf in weights_df.index:
    row_str = f"{clf:<{name_width}} "
    row_str += " ".join(f"{weights_df.loc[clf, emb]:<{weight_width}.3f}" for emb in weights_df.columns)
    print(row_str)
print("\n✅ Completed ensemble weights display for Dataset 2 (CausalLM–Adjective group).\n")

# 3️⃣ Ensemble Performance Metrics
print("=== Ensemble Performance Metrics for Dataset 2 (CausalLM–Adjective group) ===")
print("Embedding       Acc    Prec   Rec    F1     LogLoss  KL_Mean")
print("-"*61)
for _, row in df_ensemble.iterrows():
    print(f"{row['Embedding']:<15} {row['Acc']:<6.3f} {row['Prec']:<6.3f} {row['Rec']:<6.3f} {row['F1']:<6.3f} {row['Weighted_LogLoss']:<8.3f} {row['Weighted_KL_Mean']:<.3f}")
print("\n✅ Completed ensemble performance display for Dataset 2 (CausalLM–Adjective group).")



=== LogLoss & KL Mean for Dataset 2 (CausalLM–Adjective group) ===
Classifier           Embedding       LogLoss    KL Mean   
----------------------------------------------------------
Logistic Regression  TF-IDF          0.399      0.080     
Logistic Regression  BoW             0.415      0.091     
Logistic Regression  Word2Vec-CBOW   0.428      0.095     
Logistic Regression  FastText        0.425      0.094     
Logistic Regression  Skip-gram       0.426      0.095     
Logistic Regression  GloVe           0.423      0.093     
Linear SVM           TF-IDF          0.270      0.225     
Linear SVM           BoW             0.285      0.233     
Linear SVM           Word2Vec-CBOW   0.298      0.241     
Linear SVM           FastText        0.296      0.239     
Linear SVM           Skip-gram       0.297      0.240     
Linear SVM           GloVe           0.294      0.238     
Naive Bayes          TF-IDF          0.463      0.098     
Naive Bayes          BoW             0.477     