In [3]:
import os
import json
import numpy as np
from scipy.stats import hmean

# Benchmarks whose per-model scores are extracted from the results file
datasets = (
    ['vqa-v2', 'textvqa-ocr', 'textvqa-pure', 'gqa', 'refcoco']
    + ['wsc273', 'winogrande', 'lambada_standard', 'arc_easy', 'arc_challenge']
)

# Location of the JSON results file
results_file = 'results_nlp.json'

# Parsed contents of the results file; stays empty when the file is absent
result = {}

# Load the results, or report the missing file and carry on with the empty dict
if not os.path.isfile(results_file):
    print(f"Error: File {results_file} not found.")
else:
    with open(results_file, 'r') as results_fh:
        result = json.load(results_fh)
# Mapping from short method keys to the model/run names under which each
# method's metrics are looked up in `result`. Insertion order matters: it is
# the order in which methods are scanned and added to `results_dict`.
model_mappings = {
    'soft': 'stage-final-llava-v15-pythia+160m-soft',
    'lora': 'stage-final-llava-v15-pythia+160m-lora',
    'sgm': 'stage-final-llava-v15-pythia+160m-sgm',
    'original': 'stage-final-llava-v15-pythia+160m',
    'vicuna': 'reproduction-align-pythia+160m',
    'olf': 'stage-final-llava-v15-pythia+160m-olf',
    'softolf': 'stage-final-llava-v15-pythia+160m-softolf',
    'sgmolf': 'stage-final-llava-v15-pythia+160m-sgmolf',
    'ia3': 'stage-final-llava-v15-pythia+160m-ia3',
    'lora-quarterfullrank': 'stage-final-llava-v15-pythia+160m-lora-use_rslora-rank_by_factor_4',
    'lora-quarterfullrank-higheralpha': 'stage-final-llava-v15-pythia+160m-lora-use_rslora-rank_by_factor_4-higheralpha-16',
    'lora-halffullrank-higheralpha': 'stage-final-llava-v15-pythia+160m-lora-use_rslora-rank_by_factor_2-higheralpha-32',
    'lora-halffullrank': 'stage-final-llava-v15-pythia+160m-lora-rank_by_factor_2',
    'lora-halffullrank-rslora': 'stage-final-llava-v15-pythia+160m-lora-use_rslora-rank_by_factor_2',
    'lora-halffullrank-rslora-kqv': 'stage-final-llava-v15-pythia+160m-lora-use_rslora-higher_rank-target_modules_kqv',
    'soft-0.001': 'stage-final-llava-v15-pythia+160m-soft-0.001',
    'soft-0.1': 'stage-final-llava-v15-pythia+160m-soft-0.1',
}

# Label name mappings for the main methods and variants of LoRA.
# Keys are the method keys of `model_mappings`; values are the display labels
# that become the keys of `results_dict` and the row names of the LaTeX tables.
# The labels 'Original LLaVA' and 'Language Only LLM' are referenced literally
# further down as the two baseline rows, so they must not be renamed here alone.
# NOTE: 'lora-old' has a label but no entry in `model_mappings`, so it is never
# looked up.
name_mapping = {
    'soft': 'Soft Targets',
    'softolf': 'Soft Targets + OLF',
    'sgmolf': 'SGM + OLF',
    'sgm': 'SGM',
    'lora': 'LoRA',
    'ia3': 'IA3',
    'original': 'Original LLaVA',
    'vicuna': 'Language Only LLM',
    'olf': 'Output Layer Freezing (OLF)',
    'lora-old': 'LoRA (Rank 16, Alpha 8)',
    'lora-quarterfullrank': 'LoRA (1/4 Full Rank)',
    'lora-quarterfullrank-higheralpha': 'LoRA (1/4 Full Rank, Higher Alpha)',
    'lora-halffullrank-higheralpha': 'LoRA (1/2 Full Rank, Higher Alpha)',
    'lora-halffullrank': 'LoRA (1/2 Full Rank)',
    'lora-halffullrank-rslora': 'LoRA (1/2 Full Rank, RSLoRA)',
    'lora-halffullrank-rslora-kqv': 'LoRA (1/2 Full Rank, RSLoRA, KQV Target)',
    'soft-0.001': 'Soft Targets 0.001',
    'soft-0.1': 'Soft Targets 0.1'
}

# One bucket per method key; each bucket collects (run name, mean score) pairs
model_results = {method: [] for method in model_mappings}

# Score every run that is present in `result`; warn about the ones that are not
for method, run_name in model_mappings.items():
    if run_name not in result:
        print(f"Warning: Model '{run_name}' not found in results")
        continue
    run_scores = list(result[run_name].values())
    # NaN-ignoring arithmetic mean over every metric stored for the run
    model_results[method].append((run_name, np.nanmean(run_scores)))

# For each method that was found, keep its best-scoring (run name, mean) pair
highest_accuracy_models = {
    method: max(candidates, key=lambda pair: pair[1])
    for method, candidates in model_results.items()
    if candidates
}

# Persist the selection as JSON
output_file = 'highest_accuracy_models.json'
with open(output_file, 'w') as f:
    json.dump(highest_accuracy_models, f, indent=2)

# Prepare data for radar chart: display labels of the methods that were found
methods = [name_mapping[key] for key in model_mappings if key in highest_accuracy_models]

# Per-dataset scores keyed by display label; a dataset missing from a run is NaN
results_dict = {}
for key in model_mappings:
    if key not in highest_accuracy_models:
        continue
    best_run = highest_accuracy_models[key][0]
    run_metrics = result[best_run]
    results_dict[name_mapping[key]] = {ds: run_metrics.get(ds, np.nan) for ds in datasets}

# Output the results dictionary
print(json.dumps(results_dict, indent=2))

import json
import numpy as np

# Dataset groups behind the two aggregate columns of the tables
VL_DATASETS = ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa"]
NL_DATASETS = ["wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard"]

# Baseline rows. "Original LLaVA" is required (KeyError if absent); the
# language-only baseline is optional and degrades to NaN deltas when missing.
original_llava_acc = results_dict["Original LLaVA"]
language_only_llm_acc = results_dict.get("Language Only LLM", {})

# Harmonic-mean NL accuracy of the language-only baseline. It does not depend
# on the model being processed, so compute it once instead of once per model.
language_only_llm_avg_nl = hmean(
    [language_only_llm_acc.get(dataset, np.nan) for dataset in NL_DATASETS]
)

# Each entry: (label, per-dataset accuracies, VL hmean, NL delta, NL hmean),
# where NL delta = baseline NL hmean - model NL hmean (smaller is better).
table_data = []

for model, accuracies in results_dict.items():
    if model in ("Original LLaVA", "Language Only LLM"):
        continue
    avg_acc_vl = hmean([accuracies[dataset] for dataset in VL_DATASETS])
    avg_acc_nl = hmean([accuracies[dataset] for dataset in NL_DATASETS])
    delta_nl = language_only_llm_avg_nl - avg_acc_nl
    table_data.append((model, accuracies, avg_acc_vl, delta_nl, avg_acc_nl))


def _ranking_key(row):
    """Sort key for a `table_data` row, used with ``reverse=True``.

    Orders by VL average (highest first) and breaks ties by NL delta (lowest
    first). NaN never compares consistently, so it is mapped to -inf, which
    places rows with a missing average or delta last instead of leaving the
    order undefined.
    """
    avg_vl, delta = row[2], row[3]
    return (
        -np.inf if np.isnan(avg_vl) else avg_vl,
        -np.inf if np.isnan(delta) else -delta,
    )


# Sort by Avg. VL accuracy (descending), ties broken by the lowest NL delta
table_data = sorted(table_data, key=_ranking_key, reverse=True)

# Separate the data into two groups: LoRA ablation variants vs. everything else
# (plain 'LoRA' is not listed here, so it lands in the "other" group)
lora_variants = [
    'LoRA (1/4 Full Rank)', 'LoRA (1/4 Full Rank, Higher Alpha)',
    'LoRA (1/2 Full Rank)', 'LoRA (1/2 Full Rank, Higher Alpha)',
    'LoRA (1/2 Full Rank, RSLoRA)', 'LoRA (1/2 Full Rank, RSLoRA, KQV Target)'
]

lora_table_data = [item for item in table_data if item[0] in lora_variants]
other_table_data = [item for item in table_data if item[0] not in lora_variants]

# Function to format values or return "-"
def format_value(value):
    """Render a fractional score as a percentage string with two decimals.

    Args:
        value: Score as a fraction (e.g. 0.3267 for 32.67%), NaN, or None.

    Returns:
        ``value * 100`` formatted with two decimals, or "-" when the score is
        missing (None, e.g. a JSON ``null``, or NaN).
    """
    # None must be checked first: np.isnan(None) raises TypeError
    if value is None or np.isnan(value):
        return "-"
    return f"{value * 100:.2f}"

# Generate the two LaTeX tables (LoRA variants / other methods).
# Both tables share the same header, baseline rows and footer; only the
# caption, the label and the model rows differ.

# Datasets shown as VL columns, and datasets averaged into the NL columns
_VL_COLUMNS = ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa"]
_NL_COLUMNS = ["wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard"]

# Table preamble; @CAPTION@ and @LABEL@ are substituted per table
_TABLE_HEADER = """
\\begin{table*}[h]
  \\caption{\\textbf{LLaVA Model Performance:} @CAPTION@}
  \\label{@LABEL@}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cccc|c|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{4}{c|}{\\textbf{Vision-Language (VL)}} & \\textbf{VL Avg.} & \\multicolumn{2}{c}{\\textbf{NL Avg.}} \\\\
     & \\textbf{VQAv2} & \\textbf{TextVQA OCR} & \\textbf{TextVQA Pure} & \\textbf{GQA} & Acc $\\uparrow$ & $\\Delta \\downarrow$ & Acc $\\uparrow$ \\\\
     \\midrule
"""

_TABLE_FOOTER = """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""


def _group_hmean(accuracies, group):
    """Harmonic mean of `accuracies` over the datasets in `group`.

    A dataset absent from `accuracies` counts as NaN (not 0), so an
    incomplete or missing baseline yields NaN and is rendered as "-",
    consistent with how the individual cells are rendered.
    """
    return hmean([accuracies.get(dataset, np.nan) for dataset in group])


def _latex_row(name, accuracies, avg_acc_vl, delta_nl, avg_acc_nl):
    """Build one table row: name, the four VL cells, VL avg, NL delta, NL avg."""
    cells = [format_value(accuracies.get(dataset, np.nan)) for dataset in _VL_COLUMNS]
    cells += [format_value(avg_acc_vl), format_value(delta_nl), format_value(avg_acc_nl)]
    return name + " & " + " & ".join(cells) + " \\\\\n"


def _latex_table(caption, label, baseline_rows, model_rows):
    """Assemble header, baseline rows, a \\midrule, model rows and footer."""
    header = _TABLE_HEADER.replace("@CAPTION@", caption).replace("@LABEL@", label)
    return "".join([header, *baseline_rows, "\\midrule\n", *model_rows, _TABLE_FOOTER])


# NL delta of Original LLaVA relative to the language-only baseline; NaN
# (rendered "-") when the baseline is unavailable
original_llava_avg_delta_nl = (
    _group_hmean(language_only_llm_acc, _NL_COLUMNS)
    - _group_hmean(original_llava_acc, _NL_COLUMNS)
)

# Baseline rows shared by both tables. The language-only row has no delta
# against itself, so NaN is passed and shows up as "-".
_baseline_rows = [
    _latex_row(
        "Original LLaVA",
        original_llava_acc,
        _group_hmean(original_llava_acc, _VL_COLUMNS),
        original_llava_avg_delta_nl,
        _group_hmean(original_llava_acc, _NL_COLUMNS),
    ),
    _latex_row(
        "Language Only LLM",
        language_only_llm_acc,
        _group_hmean(language_only_llm_acc, _VL_COLUMNS),
        np.nan,
        _group_hmean(language_only_llm_acc, _NL_COLUMNS),
    ),
]

# Generate LaTeX table for LoRA variants
latex_code_lora = _latex_table(
    "LoRA Variants",
    "tab:lora_variants_acc",
    _baseline_rows,
    [_latex_row(*item) for item in lora_table_data],
)

# Generate LaTeX table for other methods
latex_code_other = _latex_table(
    "Other Methods",
    "tab:other_methods_acc",
    _baseline_rows,
    [_latex_row(*item) for item in other_table_data],
)

print(latex_code_lora)
print(latex_code_other)


{
  "Soft Targets": {
    "vqa-v2": 0.3267,
    "textvqa-ocr": 0.06923828124999999,
    "textvqa-pure": 0.061035156249999986,
    "gqa": 0.2539,
    "refcoco": 0.015625,
    "wsc273": 0.5128205128205128,
    "winogrande": 0.5082872928176796,
    "lambada_standard": 0.17019212109450804,
    "arc_easy": 0.39941077441077444,
    "arc_challenge": 0.2226962457337884
  },
  "LoRA": {
    "vqa-v2": 0.2897,
    "textvqa-ocr": 0.010156250000000002,
    "textvqa-pure": 0.017382812500000004,
    "gqa": 0.1797,
    "refcoco": 0.0009765625,
    "wsc273": 0.5457875457875457,
    "winogrande": 0.4964483030781373,
    "lambada_standard": 0.1946438967591694,
    "arc_easy": 0.3888888888888889,
    "arc_challenge": 0.2167235494880546
  },
  "SGM": {
    "vqa-v2": 0.2839,
    "textvqa-ocr": 0.013671875,
    "textvqa-pure": 0.0271484375,
    "gqa": 0.1748,
    "refcoco": 0.0,
    "wsc273": 0.5677655677655677,
    "winogrande": 0.5074980268350434,
    "lambada_standard": 0.17698428100135843,
    "arc_easy"

Top 3 only