# Intro

In [None]:
import os
import glob
import json
import pandas as pd


# Per-review metric table, stored as JSON lines (one record per line).
# Only the first 31 columns are kept for the steps that follow.
df_reviews = pd.read_json(
    '../Human_Annotation/merged_200_papers.json',
    orient='records',
    lines=True,
).iloc[:, :31]
df_reviews

In [None]:
import os
import glob
import json
import pandas as pd


def load_reviews(folder_path):
    """Flatten per-paper annotation JSON files into one row per (paper, reviewer).

    Every ``*.json`` file directly inside ``folder_path`` is read. From each
    file the top-level ``paper_id``, ``assessor`` and ``metrics`` entries are
    used. Metric keys are expected to look like ``review_<reviewer>_<metric>``;
    keys that do not start with ``review_`` are ignored. The key is split on
    ``_``, so the reviewer is the single token after ``review`` (it cannot
    itself contain an underscore) and the remaining tokens form the metric name.

    Parameters
    ----------
    folder_path : str
        Directory holding the annotation files.

    Returns
    -------
    pandas.DataFrame
        Columns ``paper_id``, ``assessor``, ``reviewer`` plus one column per
        metric name encountered. Empty if no file yields any review metric.
    """
    rows = []
    # sorted(): glob order depends on the filesystem, which made the row order
    # (and everything derived from it) vary between machines.
    for file_path in sorted(glob.glob(os.path.join(folder_path, '*.json'))):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        paper_id = data.get('paper_id')
        assessor = data.get('assessor')
        # `or {}` also covers an explicit `"metrics": null` in the file.
        metrics = data.get('metrics') or {}

        # reviewer -> {metric_name: value}
        reviewer_metrics = {}
        for key, value in metrics.items():
            if not key.startswith('review_'):
                continue
            # "review_Palwinder-Singh_Overall_Quality"
            #   -> reviewer "Palwinder-Singh", metric "Overall_Quality"
            _, reviewer, *metric_parts = key.split('_')
            metric_name = '_'.join(metric_parts)
            reviewer_metrics.setdefault(reviewer, {})[metric_name] = value

        # one output row per reviewer found in this file
        for reviewer, mdict in reviewer_metrics.items():
            rows.append({
                'paper_id': paper_id,
                'assessor': assessor,
                'reviewer': reviewer,
                **mdict,
            })

    return pd.DataFrame(rows)

# Load every human annotation file into one frame.
folder = '../Human_Annotation_Data'
df_human = load_reviews(folder)

# Keep only rows whose Overall_Quality is above 10.
# .copy() makes the result an independent frame: later cells assign columns
# on df_human, which would otherwise raise SettingWithCopyWarning.
df_human = df_human[df_human['Overall_Quality'] > 10].copy()
df_human

In [None]:
# Bring both frames to the same join-key format before merging:
#   * reviewer: underscores and spaces become hyphens, then cast to str
#   * paper_id: cast to int
for frame in (df_human, df_reviews):
    frame['reviewer'] = (
        frame['reviewer']
        .str.replace('_', '-', regex=False)
        .str.replace(' ', '-', regex=False)
    )
    frame['paper_id'] = frame['paper_id'].astype(int)
    frame['reviewer'] = frame['reviewer'].astype(str)

# Inner join: only (paper_id, reviewer) pairs present in both frames survive.
df_human_vs_metric = pd.merge(
    df_human,
    df_reviews,
    on=['paper_id', 'reviewer'],
    how='inner',
)
df_human_vs_metric

In [None]:
# Remove every row whose reviewer name contains the substring "Anonymous".
is_anonymous = df_human_vs_metric['reviewer'].str.contains('Anonymous')
df_human_vs_metric = df_human_vs_metric.loc[~is_anonymous]
df_human_vs_metric

In [None]:
# Derive a single `hedging` feature from the five hedge_* columns:
#     hedging = 1 - hedge_C / (hedge_C + hedge_D + hedge_E + hedge_I + hedge_N)
# NOTE(review): a row whose five hedge_* values sum to 0 yields NaN here — confirm
# whether such rows exist and how they should be handled.
hedge_cols = ['hedge_C', 'hedge_D', 'hedge_E', 'hedge_I', 'hedge_N']
hedge_total = sum(df_human_vs_metric[col] for col in hedge_cols)

# .assign() builds a new frame instead of writing a column into the filtered
# frame from the previous cell (which triggers SettingWithCopyWarning).
df_human_vs_metric = (
    df_human_vs_metric
    .assign(hedging=1 - (df_human_vs_metric['hedge_C'] / hedge_total))
    # the raw hedge_* columns are no longer needed once `hedging` exists
    .drop(columns=[col for col in df_human_vs_metric.columns if col.startswith('hedge_')])
    # drop these four readability columns as well
    .drop(columns=['flesch_kincaid_grade', 'gunning_fog', 'smog_index', 'automated_readability_index'])
)
df_human_vs_metric

In [None]:
# Table read from a CSV located next to this notebook; later cells join it
# on (paper_id, reviewer).
llm_ratings_path = 'human_vs_llm.csv'
df_human_vs_llm = pd.read_csv(llm_ratings_path)
df_human_vs_llm

In [37]:
# Keep only the first row for each (paper_id, reviewer) pair in both frames.
pair_key = ['paper_id', 'reviewer']
df_human_vs_metric = df_human_vs_metric.drop_duplicates(subset=pair_key)
df_human_vs_llm = df_human_vs_llm.drop_duplicates(subset=pair_key)

In [None]:
# Inner-join the two frames on the (paper_id, reviewer) pair.
df_merge = pd.merge(
    df_human_vs_metric,
    df_human_vs_llm,
    on=['paper_id', 'reviewer'],
    how='inner',
)
df_merge

In [None]:
# Columns removed from the merged frame before it is written to disk.
columns_to_drop = [
    'assessor',
    'Comprehensiveness',
    'Usage_of_Technical_Terms',
    'Factuality',
    'Sentiment_Polarity',
    'Politeness',
    'Vagueness',
    'Objectivity',
    'Fairness',
    'Actionability',
    'Constructiveness',
    'Relevance_Alignment',
    'Clarity_and_Readability',
    'Overall_Quality',
    'authors',
    'review_date',
    'review_rating',
    'review_confidence',
    'review_soundness',
    'review_presentation',
    'review_contribution',
]
df_merge = df_merge.drop(columns=columns_to_drop)
df_merge

In [40]:
# Write the merged table to CSV without the index column.
merged_csv_path = 'human_llms_qmetrics.csv'
df_merge.to_csv(merged_csv_path, index=False)

# Load df_human_llms_qmetrics

In [None]:
import os
import glob
import json
import pandas as pd

# Read the CSV that holds the full modelling table.
all_data_path = 'human_llms_qmetrics.csv'
df_all = pd.read_csv(all_data_path)
df_all

In [None]:
# Display the column index of df_all.
df_all.columns

In [None]:
import os
import pandas as pd
from sklearn.model_selection import KFold

# Split df_all into shuffled K-fold train/test pairs and write each pair to
# Folds/f<k>_train.csv and Folds/f<k>_test.csv.

# Number of folds. The cells that read the fold files back must loop over
# exactly this many folds.
n_folds = 5

# Fixed seed so the same rows land in the same fold on every run.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Output folder for the fold files.
os.makedirs('Folds', exist_ok=True)

# Fold numbering on disk starts at 1.
for fold, (train_idx, test_idx) in enumerate(kf.split(df_all), start=1):
    train_df = df_all.iloc[train_idx]
    test_df = df_all.iloc[test_idx]

    train_df.to_csv(f'Folds/f{fold}_train.csv', index=False)
    test_df.to_csv(f'Folds/f{fold}_test.csv', index=False)

# Report the real fold count (the message used to claim 10).
print(f"Saved {n_folds} train/test pairs in the ‘Folds’ folder.")

In [None]:
# GPT

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import kendalltau
import krippendorff  # pip install krippendorff

# Per-fold agreement with the human Overall_Quality rating, for the three LLM
# rating columns and for a decision tree trained on the features below.

# Feature columns fed to the decision tree.
features = [
    'length_words', 'citation_count', 'question_count', 'mattr',
    'sentiment_polarity', 'similarity_score', 'flesch_reading_ease',
    'politeness_score', 'hedging'
]

# The fold-writing cell uses KFold(n_splits=5), so only f1..f5 exist on disk.
# Looping to 10 raised FileNotFoundError on fold 6.
n_folds = 5

# results["Fold<i>"][<rater>] -> {"kendall_tau": ..., "krippendorff_alpha": ...}
results = {}

for i in range(1, n_folds + 1):
    # load train/test for fold i
    train = pd.read_csv(f"Folds/f{i}_train.csv")
    test = pd.read_csv(f"Folds/f{i}_test.csv")

    # Seeded so that re-running the cell reproduces the same tree.
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(train[features], train['Human_Overall_Quality'])

    # predict on test
    y_true = test['Human_Overall_Quality']
    y_pred_model = clf.predict(test[features])

    fold_key = f"Fold{i}"
    results[fold_key] = {}

    for name, y_pred in [
        ('Qwen', test['Qwen_Overall_Quality']),
        ('GPT',  test['GPT_Overall_Quality']),
        ('Phi',  test['Phi_Overall_Quality']),
        ('Output', y_pred_model)
    ]:
        tau = kendalltau(y_true, y_pred).correlation
        # Two "raters" (human, candidate) over the test items. Ordinal level is
        # used to match the other evaluation cells in this notebook; the
        # library default would be interval.
        alpha = krippendorff.alpha(
            [list(y_true), list(y_pred)],
            level_of_measurement='ordinal',
        )
        results[fold_key][name] = {
            "kendall_tau":       tau,
            "krippendorff_alpha": alpha
        }

# print results
for fold, metrics in results.items():
    print(f"{fold}:")
    for name, m in metrics.items():
        print(f"  {name} {{Kendall's Tau: {m['kendall_tau']:.3f}, "
              f"Krippendorff's Alpha: {m['krippendorff_alpha']:.3f}}}")
    print()

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import kendalltau
import krippendorff

# Same comparison as the previous cell, plus an average over folds: agreement
# with Human_Overall_Quality for the three LLM rating columns and for a
# decision tree trained on `features`.

# Define features and target
features = [
    'length_words', 'citation_count', 'question_count', 'mattr',
    'sentiment_polarity', 'similarity_score', 'flesch_reading_ease',
    'politeness_score', 'hedging'
]
target = 'Human_Overall_Quality'

# The fold-writing cell uses KFold(n_splits=5), so only f1..f5 exist on disk.
# Looping to 10 raised FileNotFoundError on fold 6.
n_folds = 5

# One {rater: {"Kendall Tau": ..., "Krippendorff Alpha": ...}} dict per fold.
all_fold_metrics = []


# Iterate over each fold
for fold in range(1, n_folds + 1):
    print(f"Fold{fold}:")

    # Load data
    train_df = pd.read_csv(f"Folds/f{fold}_train.csv")
    test_df = pd.read_csv(f"Folds/f{fold}_test.csv")

    # Prepare data
    X_train = train_df[features]
    y_train = train_df[target]
    X_test = test_df[features]
    y_test = test_df[target]

    # Seeded so that re-running the cell reproduces the same tree.
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Predictions
    y_pred = clf.predict(X_test)

    # Raters compared against Human_Overall_Quality
    groups = {
        "Qwen": test_df["Qwen_Overall_Quality"],
        "GPT": test_df["GPT_Overall_Quality"],
        "Phi": test_df["Phi_Overall_Quality"],
        "Output": y_pred
    }

    # Compute metrics for each rater
    results = {}
    for name, scores in groups.items():
        # Kendall's Tau
        tau, _ = kendalltau(y_test, scores)

        # Krippendorff's Alpha (requires 2D data of shape [raters, items])
        data = [y_test.tolist(), scores.tolist()]
        alpha = krippendorff.alpha(data, level_of_measurement='ordinal')

        results[name] = {
            "Kendall Tau": round(tau, 3),
            "Krippendorff Alpha": round(alpha, 3)
        }

    # Print results
    for model in ["Qwen", "GPT", "Phi", "Output"]:
        metrics = results[model]
        print(f"{model} {{Kendall Tau: {metrics['Kendall Tau']}, Krippendorff Alpha: {metrics['Krippendorff Alpha']}}}")
    print("\n")

    all_fold_metrics.append(results)


# Average over the folds actually processed (the divisor used to be a fixed 10).
n_done = len(all_fold_metrics)
average_metrics = {
    model: {
        "kendall": sum(fold[model]["Kendall Tau"] for fold in all_fold_metrics) / n_done,
        "alpha": sum(fold[model]["Krippendorff Alpha"] for fold in all_fold_metrics) / n_done
    }
    for model in ["Qwen", "GPT", "Phi", "Output"]
}

# Print final averages
print("\nAverage across all folds:")
for model, metrics in average_metrics.items():
    print(f"{model}: {{Kendall Tau: {metrics['kendall']:.3f}, Krippendorff Alpha: {metrics['alpha']:.3f}}}")

# Train ML Models

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from scipy.stats import kendalltau
import krippendorff
from sklearn.inspection import permutation_importance


from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor


# ================== CONFIGURATION ==================
# For each of the 5 saved folds: fit every regressor in `models` on the train
# split, record a normalised per-feature importance, then measure agreement
# (Kendall's tau, ordinal Krippendorff's alpha) between Human_Overall_Quality
# and (a) the three LLM rating columns, (b) the regressor's predictions.

# Input columns shared by every model.
features = [
    'length_words', 'citation_count', 'question_count', 'mattr',
    'sentiment_polarity', 'similarity_score', 'flesch_reading_ease',
    'politeness_score', 'hedging'
]
# model_name -> {'features': [...], 'importances': {feature: [one value per fold]}}
feature_importances = {}
target = 'Human_Overall_Quality'

models = {
    # Classifier variants, currently disabled:
    # "Decision Tree": DecisionTreeClassifier(random_state=42),
    # "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    # "XGBoost": XGBClassifier(random_state=42, eval_metric='logloss'),
    # "SVM": SVC(random_state=42),
    # "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    # "Neural Network": MLPClassifier(random_state=42, hidden_layer_sizes=(50,))
    "Random Forest": RandomForestRegressor(random_state=42, n_estimators=100),
    "SVR": SVR(kernel='linear'),
    "Linear Regression": LinearRegression(),
    "Neural Network": MLPRegressor(random_state=42, hidden_layer_sizes=(54, 108, 108, 54)),
    "XGBoost": XGBRegressor(random_state=42)
}

# ================== METRIC STORAGE ==================
# results[model_name][metric][rater] -> list with one value per fold
results = {
    model_name: {
        'Kendall': {'Qwen': [], 'GPT': [], 'Phi': [], 'Output': []},
        'Alpha': {'Qwen': [], 'GPT': [], 'Phi': [], 'Output': []}
    }
    for model_name in models
}

# ================== MAIN PIPELINE ==================
for fold in range(1, 6):
    print(f"\n{'='*40}\nFold {fold}\n{'='*40}")

    # Load data
    train_df = pd.read_csv(f"Folds/f{fold}_train.csv")
    test_df = pd.read_csv(f"Folds/f{fold}_test.csv")

    X_train, y_train = train_df[features], train_df[target]
    X_test, y_test = test_df[features], test_df[target]

    for model_name, model in models.items():
        # Fit a fresh clone so nothing learned in one fold carries into the next.
        cloned_model = clone(model)
        cloned_model.fit(X_train, y_train)

        # ======== Feature Importance Calculation ================================
        if hasattr(cloned_model, 'feature_importances_'):
            # Models that expose feature_importances_ (tree ensembles)
            fold_imp = cloned_model.feature_importances_
        elif hasattr(cloned_model, 'coef_'):
            # Models that expose coef_: use absolute coefficient size
            fold_imp = np.abs(cloned_model.coef_.flatten())
        else:
            # Models exposing neither attribute (e.g. the MLP): permutation
            # importance measured on the test split.
            result = permutation_importance(
                cloned_model, X_test, y_test,
                n_repeats=10,
                random_state=42
            )
            fold_imp = result.importances_mean

        # Normalize to sum=1. Skipped when the sum is exactly zero, which
        # would otherwise fill this fold's importances with NaN/inf.
        # NOTE(review): permutation importances can be negative, so the
        # normalised values are not guaranteed to lie in [0, 1].
        total_imp = fold_imp.sum()
        if total_imp != 0:
            fold_imp = fold_imp / total_imp

        if model_name not in feature_importances:
            feature_importances[model_name] = {
                'features': features,
                'importances': {f: [] for f in features}
            }

        for f, imp in zip(features, fold_imp):
            feature_importances[model_name]['importances'][f].append(imp)

        # Print fold-level importance, largest first
        print(f"\n{model_name} Feature Importance (Fold {fold}):")
        sorted_idx = np.argsort(fold_imp)[::-1]
        for idx in sorted_idx:
            print(f"  {features[idx]}: {fold_imp[idx]:.4f}")
        # ======== End of Feature Importance Calculation =========================

        y_pred = cloned_model.predict(X_test)

        # Raters compared against the human score on this fold's test split.
        groups = {
            'Qwen': test_df['Qwen_Overall_Quality'],
            'GPT': test_df['GPT_Overall_Quality'],
            'Phi': test_df['Phi_Overall_Quality'],
            'Output': y_pred
        }

        for group_name, scores in groups.items():
            # Kendall's Tau
            tau, _ = kendalltau(y_test, scores)
            results[model_name]['Kendall'][group_name].append(tau)

            # Krippendorff's Alpha over two raters: human vs. this group
            ratings = [y_test.tolist(), scores.tolist()]
            alpha = krippendorff.alpha(ratings, level_of_measurement='ordinal')
            results[model_name]['Alpha'][group_name].append(alpha)

        # Print this fold's model metrics. Read them back from `results` rather
        # than relying on `tau`/`alpha` left over from the last loop iteration.
        output_tau = results[model_name]['Kendall']['Output'][-1]
        output_alpha = results[model_name]['Alpha']['Output'][-1]
        print(f"\n{model_name}:")
        print(f"Output Kendall: {output_tau:.3f}, Alpha: {output_alpha:.3f}")


print('#############################################')
# ================== FINAL RESULTS ==================
print("\n\nAverage Metrics Across All Folds:")
for model_name in models:
    print(f"\n{model_name}:")
    for metric in ['Kendall', 'Alpha']:
        print(f"  {metric}:")
        # Add 'Qwen', 'GPT', 'Phi' to this list to report the LLM columns too.
        for group in ['Output']:
            avg = np.mean(results[model_name][metric][group])
            std = np.std(results[model_name][metric][group])
            print(f"    {group}: {avg:.3f} ± {std:.3f}")


# ================== FINAL FEATURE IMPORTANCE ==================
print("\n\nAverage Feature Importance Across All Folds:")
for model_name, data in feature_importances.items():
    print(f"\n{model_name}:")
    avg_imp = {f: np.mean(vals) for f, vals in data['importances'].items()}
    sorted_imp = sorted(avg_imp.items(), key=lambda x: x[1], reverse=True)

    for feature, importance in sorted_imp:
        print(f"  {feature}: {importance:.4f} ± {np.std(data['importances'][feature]):.4f}")

# Visualization

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# ================== PLOTTING ==================
# Bar chart of Kendall's tau (mean ± std as error bars) per rater/model. The
# first four bars and the last three bars form two groups separated by a gap
# and a dashed vertical line. The figure is shown and saved as a PDF.

plt.rcParams.update({
    'axes.labelsize': 16,    # X/Y axis labels
    'xtick.labelsize': 16,   # X-axis ticks
    'ytick.labelsize': 14,   # Y-axis ticks
    'legend.fontsize': 14,   # Legend
    'axes.titlesize': 16     # Title
})

# Mean scores, entered by hand. Dict order is the left-to-right bar order.
data = {
    'Qwen-3': {'kendall Tau': 0.272, 'krippendorff Alpha': 0.164},
    'Phi-4': {'kendall Tau': 0.254, 'krippendorff Alpha': 0.191},
    'GPT-4o': {'kendall Tau': 0.372, 'krippendorff Alpha': 0.401},

    r'LLaMA-3-FT$\mathregular{^*}$': {'kendall Tau': 0.406, 'krippendorff Alpha': 0.454},

    'Random Forest': {'kendall Tau': 0.451, 'krippendorff Alpha': 0.551},
    'Linear Regression': {'kendall Tau': 0.459, 'krippendorff Alpha': 0.530},
    'MLP': {'kendall Tau': 0.426, 'krippendorff Alpha': 0.567},
    # Not plotted:
    # 'XGBoost': {'kendall Tau': 0.380, 'krippendorff Alpha': 0.510},
    # 'SVR': {'kendall Tau': 0.454, 'krippendorff Alpha': 0.559},
}

# Standard deviations matching `data`, used for the error bars.
std_devs = {
    'Qwen-3': {'kendall Tau': 0.079, 'krippendorff Alpha': 0.109},
    'Phi-4': {'kendall Tau': 0.116, 'krippendorff Alpha': 0.149},
    'GPT-4o': {'kendall Tau': 0.100, 'krippendorff Alpha': 0.119},

    r'LLaMA-3-FT$\mathregular{^*}$': {'kendall Tau': 0.035, 'krippendorff Alpha': 0.035},

    'Random Forest': {'kendall Tau': 0.071, 'krippendorff Alpha': 0.060},
    'Linear Regression': {'kendall Tau': 0.107, 'krippendorff Alpha': 0.118},
    'MLP': {'kendall Tau': 0.036, 'krippendorff Alpha': 0.038},
    # Not plotted:
    # 'SVR': {'kendall Tau': 0.089, 'krippendorff Alpha': 0.093},
    # 'XGBoost': {'kendall Tau': 0.075, 'krippendorff Alpha': 0.085},
}

# Single-axes figure
fig, (ax1) = plt.subplots(1, 1, figsize=(10, 6))

# Default font size for text created from here on
plt.rcParams['font.size'] = 18

# Bar fill colour per entry of `data`.
colors = {
    'GPT-4o': '#C0C0C0',                      # light gray
    'Qwen-3': '#08306B',                      # dark navy blue
    'Phi-4': '#4F81BD',                       # medium blue
    'Random Forest': '#A5D6A7',               # light green
    'Linear Regression': '#FFCC80',           # light orange
    'MLP': '#EF9A9A',                         # light red
    r'LLaMA-3-FT$\mathregular{^*}$': '#ADD8E6',  # light blue
}

# Plot parameters
bar_width = 0.75
model_names = list(data.keys())

# Grouping: the first `group1_size` bars, a gap, then `group2_size` bars.
group1_size = 4
group2_size = 3
intra_group_space = 0.1    # Space within groups
inter_group_space = 0.8    # Space between groups

# One x position per bar, so the group sizes must add up to the number of
# entries in `data`.
assert group1_size + group2_size == len(data), \
    "group sizes must add up to the number of plotted models"

# x positions: group 2 is shifted right by the inter-group gap
group1_pos = np.arange(group1_size)
group2_pos = np.arange(group1_size + inter_group_space,
                      group1_size + inter_group_space + group2_size)
all_positions = np.concatenate([group1_pos, group2_pos])

# One bar per model with its std as the error bar
for i, (model, values) in enumerate(data.items()):
    ax1.bar(
        all_positions[i] + intra_group_space/2,
        values['kendall Tau'],
        width=bar_width - intra_group_space + 0.1,
        yerr=std_devs[model]['kendall Tau'],
        color=colors[model],
        label=model,
        capsize=5,
        edgecolor='black',
        linewidth=1,
    )

# Tick positions and rotated, right-aligned model labels
ax1.set_xticks(all_positions + (bar_width - intra_group_space)/2)
ax1.set_xticklabels(model_names, rotation=45, ha='right')

# Dashed vertical line in the gap between the two groups
ax1.axvline(x=group1_size + inter_group_space/2 - 0.45,
           color='gray',
           linestyle='--',
           linewidth=0.8,
           alpha=1)

# Adjust x-axis limits
ax1.set_xlim(-0.5, all_positions[-1] + bar_width)

ax1.set_title("")
ax1.set_ylabel('Kendall Tau')
ax1.set_ylim(0, 0.6)
ax1.grid(True, axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
# Leave a margin on the right (originally reserved for a figure-level legend,
# which is not drawn).
plt.subplots_adjust(right=0.85)
plt.show()

# Save as PDF
fig.savefig('model_comparison.pdf', dpi=300, bbox_inches='tight')

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np


# Heatmap of per-model feature importances. Cell colour comes from the
# row-normalised importances (negative values clipped to 0); the number
# printed in each cell is the raw, unclipped value.

# Importances entered by hand: one list per model, in the order of `features`.
data = {
    "Random Forest": [0.5129, 0.1051, 0.0885, 0.0738, 0.0724, 0.0637, 0.0423, 0.0292, 0.0121],
    # "SVR": [0.0046, 0.0249, 0.3210, 0.1085, 0.4155, 0.0145, 0.0443, 0.0147, 0.0520],
    "Linear Regression": [0.0002, 0.0010, 0.0762, 0.0393, 0.0368, 0.0710, 0.7731, 0.0010, 0.0014],
    "MLP": [0.7906, 0.1967, -0.0004, 0.0018, -0.0018, 0.0019, -0.0001, -0.0355, 0.0467],
    # "XGBoost": [0.3093, 0.1228, 0.1033, 0.0841, 0.0959, 0.0642, 0.1252, 0.0508, 0.0444]
}

# Column labels for the heatmap
features = [
    'length_words', 'readability', 'similarity_score', 'sentiment_polarity', 'politeness_score', 'mattr',
    'hedging', 'question_count', 'citation_count'
]

models = list(data.keys())

# Two-stop colormap running from a very light blue to a strong blue
cmap = mpl.colors.LinearSegmentedColormap.from_list("blue", ["#f0f8ff", "#0047ab"])

fig, ax = plt.subplots(figsize=(16, 8))

# Rows = models, columns = features
matrix = np.array([data[model] for model in models])

# Row-wise normalisation: clip negatives to 0, then divide by the row total
normalized_rows = []
for row in matrix:
    positives = [max(0, x) for x in row]
    row_total = sum(positives)
    normalized_rows.append([p / row_total for p in positives])
normalized_matrix = np.array(normalized_rows)

# Draw the heatmap with square cells
im = ax.imshow(normalized_matrix, cmap=cmap, aspect='equal')

# Ticks and labels: features along x, models along y
ax.set_xticks(np.arange(len(features)))
ax.set_yticks(np.arange(len(models)))
ax.set_xticklabels(features, rotation=45, ha="right", fontsize=14)
ax.set_yticklabels(models, fontsize=14)

# Write the raw value (3 decimals) into each cell; switch to white text on
# cells whose normalised value is 0.5 or more
for (row_idx, col_idx), raw_value in np.ndenumerate(matrix):
    if normalized_matrix[row_idx, col_idx] < 0.5:
        text_color = 'black'
    else:
        text_color = 'white'
    ax.text(col_idx, row_idx, f"{raw_value:.3f}",
            ha="center", va="center",
            color=text_color, fontsize=10)

# Colorbar
cbar = fig.colorbar(im, ax=ax, shrink=0.5)
cbar.set_label('Feature Importance Intensity', rotation=270, labelpad=20, fontsize=14)

# Layout: tight_layout first, then explicit margins
plt.tight_layout(pad=2.0)
plt.subplots_adjust(left=0.15, right=0.85, bottom=0.2, top=0.95)

plt.show()

In [None]:
# LLaMA fine-tuning results, recorded by hand.
# Entries 'F1'..'F5' hold one float per metric for each fold. 'Average' holds
# {'mean': ..., 'std': ...} per metric; it was written as `0.380 ± 0.028`,
# which is not valid Python and made this cell a SyntaxError.
# NOTE(review): the Experiment 1 std values are copied as reported; recomputing
# the sample std from the fold values listed here gives roughly 0.025 (tau) and
# 0.026 (alpha) — confirm which figures are correct.

# Experiment 1 (llama_mse.txt) - MSE-based training
experiment1_metrics = {
    'F1': {'Kendall Tau': 0.381, 'Krippendorff Alpha': 0.422},
    'F2': {'Kendall Tau': 0.347, 'Krippendorff Alpha': 0.385},
    'F3': {'Kendall Tau': 0.412, 'Krippendorff Alpha': 0.453},
    'F4': {'Kendall Tau': 0.394, 'Krippendorff Alpha': 0.436},
    'F5': {'Kendall Tau': 0.365, 'Krippendorff Alpha': 0.408},
    'Average': {
        'Kendall Tau': {'mean': 0.380, 'std': 0.028},
        'Krippendorff Alpha': {'mean': 0.421, 'std': 0.027},
    },
}

# Experiment 2 (llama.txt) - BCE-based training
experiment2_metrics = {
    'F1': {'Kendall Tau': 0.428, 'Krippendorff Alpha': 0.479},
    'F2': {'Kendall Tau': 0.392, 'Krippendorff Alpha': 0.437},
    'F3': {'Kendall Tau': 0.353, 'Krippendorff Alpha': 0.402},
    'F4': {'Kendall Tau': 0.415, 'Krippendorff Alpha': 0.461},
    'F5': {'Kendall Tau': 0.441, 'Krippendorff Alpha': 0.493},
    'Average': {
        'Kendall Tau': {'mean': 0.406, 'std': 0.035},
        'Krippendorff Alpha': {'mean': 0.454, 'std': 0.035},
    },
}