# Pythia410M Epochs Analysis

In [1]:
import os
import json
import numpy as np
from scipy.stats import hmean

# Benchmark keys extracted for every model later in this cell.
datasets = [
    'vqa-v2',
    'textvqa-ocr',
    'textvqa-pure',
    'gqa',
    'refcoco',
    'wsc273',
    'winogrande',
    'lambada_standard',
    'arc_easy',
    'arc_challenge',
]

# Location of the JSON file holding the evaluation results.
results_file = 'results_nlp.json'

# Default to an empty mapping; it is only replaced when the file exists.
result = {}
if not os.path.isfile(results_file):
    print(f"Error: File {results_file} not found.")
else:
    with open(results_file, 'r') as results_handle:
        result = json.load(results_handle)

# Maps each short method key to the model identifier that is looked up as a
# top-level key of the loaded results (`result`). Identifiers that are absent
# from `result` only trigger a warning and are left out of the tables.
model_mappings = {
    'vicuna' : 'reproduction-align-pythia+410m',
    'original' : 'stage-final-llava-v15-pythia+410m',
    'original+2epochs': 'stage-final-llava-v15-pythia+410m-epochs-2',
    'original+3epochs': 'stage-final-llava-v15-pythia+410m-epochs-3',
    'soft': 'stage-final-llava-v15-pythia+410m-soft',
    'soft+2epochs': 'stage-final-llava-v15-pythia+410m-soft-epochs-2',
    'soft+3epochs': 'stage-final-llava-v15-pythia+410m-soft-epochs-3',
    'oolf': 'stage-final-llava-v15-pythia+410m-oolf',
    'sgm-oolf': 'stage-final-llava-v15-pythia+410m-sgm-oolf',
    'sgm': 'stage-final-llava-v15-pythia+410m-sgm'
}

# Maps the same method keys to display labels. The labels become the keys of
# `results_dict` and the row labels of the non-baseline LaTeX rows.
# Every key of `model_mappings` needs an entry here: labels are fetched with
# `name_mapping[...]`, which raises KeyError for a missing key.
# The labels 'Naive FT (410M)' and 'Language Only LLM' are referenced by name
# when the baselines are looked up, so they must not be changed in isolation.
name_mapping = {
    'vicuna' : 'Language Only LLM',
    'original': 'Naive FT (410M)',
    'original+2epochs': 'Naive FT (410M, +2 Epochs)',
    'original+3epochs': 'Naive FT (410M, +3 Epochs)',
    'soft': 'Soft Targets (410M)',
    'soft+2epochs': 'Soft Targets (410M, +2 Epochs)',
    'soft+3epochs': 'Soft Targets (410M, +3 Epochs)',
    'oolf': 'Corrected OLF LLaVA (410M)',
    'sgm-oolf': 'SGM + Corrected OLF (410M)',
    'sgm': 'SGM (410M)'
}

# One candidate list per method key. Each list ends up with at most one
# (model_name, mean_accuracy) pair, since every key maps to a single model.
model_results = {method: [] for method in model_mappings}

for method, model_name in model_mappings.items():
    if model_name not in result:
        print(f"Warning: Model '{model_name}' not found in results")
        continue
    # NaN-ignoring mean over every metric stored for this model.
    per_metric = list(result[model_name].values())
    model_results[method].append((model_name, np.nanmean(per_metric)))

# Keep the best-scoring candidate for every method that has one.
highest_accuracy_models = {
    method: max(candidates, key=lambda pair: pair[1])
    for method, candidates in model_results.items()
    if candidates
}

# Persist the selection as JSON.
output_file = 'highest_accuracy_models.json'
with open(output_file, 'w') as out_handle:
    json.dump(highest_accuracy_models, out_handle, indent=2)

# Display labels of the methods that were found, in `model_mappings` order.
methods = [
    name_mapping[method]
    for method in model_mappings
    if method in highest_accuracy_models
]

# Per-label accuracies restricted to `datasets`; absent datasets become NaN.
results_dict = {}
for method in model_mappings:
    if method not in highest_accuracy_models:
        continue
    model_name = highest_accuracy_models[method][0]
    model_metrics = result[model_name]
    results_dict[name_mapping[method]] = {
        dataset: model_metrics.get(dataset, np.nan) for dataset in datasets
    }

# Show the collected accuracies.
print(json.dumps(results_dict, indent=2))

import json
import numpy as np

# Build one table row per non-baseline model:
#   (label, accuracies, VL harmonic mean, NL delta, NL harmonic mean)
# where the NL delta is the language-only NL mean minus the model's NL mean.

# Dataset groups behind the two aggregate columns.
_VL_DATASETS = ("vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa")
_NL_DATASETS = ("wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard")


def _group_hmean(accuracies, dataset_names):
    """Return the harmonic mean of `accuracies` over `dataset_names`.

    A missing or NaN entry makes the aggregate NaN explicitly, instead of
    relying on how the installed `hmean` treats NaN input.
    """
    values = [accuracies.get(name, np.nan) for name in dataset_names]
    if any(np.isnan(value) for value in values):
        return np.nan
    return hmean(values)


def _nan_last(value):
    """Sort-key component ordering by `value` with NaN below every number.

    Comparisons involving NaN are always False, so raw NaN keys would leave
    the sort order undefined.
    """
    return (0, 0.0) if np.isnan(value) else (1, value)


original_llava_acc = results_dict["Naive FT (410M)"]
language_only_llm_acc = results_dict.get("Language Only LLM", {})

# The language-only baseline is the same for every row, so compute it once.
_baseline_nl = _group_hmean(language_only_llm_acc, _NL_DATASETS)

table_data = []

for model, accuracies in results_dict.items():
    # The two baselines get dedicated rows when the tables are rendered.
    if model in ("Naive FT (410M)", "Language Only LLM"):
        continue
    avg_acc_vl = _group_hmean(accuracies, _VL_DATASETS)
    avg_acc_nl = _group_hmean(accuracies, _NL_DATASETS)
    delta_nl = _baseline_nl - avg_acc_nl
    table_data.append((model, accuracies, avg_acc_vl, delta_nl, avg_acc_nl))

# Sort by VL average (highest first); ties are broken by the smallest NL
# delta. Rows whose aggregate is NaN are placed last.
table_data = sorted(
    table_data,
    key=lambda row: (_nan_last(row[2]), _nan_last(-row[3])),
    reverse=True,
)

# Separate the data into two groups by exact label match.
# NOTE(review): none of the labels in this cell's name_mapping appear in
# this list, so lora_table_data stays empty here - confirm this is intended.
lora_variants = [
    'LoRA (1/4 Full Rank)', 'LoRA (1/4 Full Rank, Higher Alpha)',
    'LoRA (1/2 Full Rank)', 'LoRA (1/2 Full Rank, Higher Alpha)',
    'LoRA (1/2 Full Rank, RSLoRA)', 'LoRA (1/2 Full Rank, RSLoRA, KQV Target)'
]

lora_table_data = [item for item in table_data if item[0] in lora_variants]
other_table_data = [item for item in table_data if item[0] not in lora_variants]

def format_value(value):
    """Render a fractional score as a percentage with two decimals.

    NaN is rendered as "-" so missing results show up as empty table cells.
    """
    if np.isnan(value):
        return "-"
    return f"{value * 100:.2f}"

# Generate the LaTeX tables for the LoRA variants and for all other methods.
# Both tables share the column layout, the two baseline rows and the footer;
# only the caption, the label and the model rows differ.

# Datasets behind the per-dataset VL columns and the VL / NL aggregates.
_VL_COLUMNS = ("vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa")
_NL_COLUMNS = ("wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard")


def _nan_aware_hmean(accuracies, dataset_names):
    """Return the harmonic mean of `accuracies` over `dataset_names`.

    Missing or NaN entries yield NaN, which `format_value` renders as "-".
    Using NaN (rather than 0) for missing entries keeps the aggregate cells
    consistent with the per-dataset cells, which also default to NaN.
    """
    values = [accuracies.get(name, np.nan) for name in dataset_names]
    if any(np.isnan(value) for value in values):
        return np.nan
    return hmean(values)


def _latex_row(label, accuracies, acc_vl, delta_text, acc_nl):
    """Return one table line: label, four VL cells, VL avg, NL delta, NL avg.

    `delta_text` is passed pre-formatted so a row can show a literal "-".
    """
    cells = [label]
    cells.extend(format_value(accuracies.get(name, np.nan)) for name in _VL_COLUMNS)
    cells.extend([format_value(acc_vl), delta_text, format_value(acc_nl)])
    return " & ".join(cells) + " \\\\\n"


latex_code_lora = """
\\begin{table*}[h]
  \\caption{\\textbf{LLaVA Model Performance:} LoRA Variants}
  \\label{tab:lora_variants_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cccc|c|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{4}{c|}{\\textbf{Vision-Language (VL)}} & \\textbf{VL Avg.} & \\multicolumn{2}{c}{\\textbf{NL Avg.}} \\\\
     & \\textbf{VQAv2} & \\textbf{TextVQA OCR} & \\textbf{TextVQA Pure} & \\textbf{GQA} & Acc $\\uparrow$ & $\\Delta \\downarrow$ & Acc $\\uparrow$ \\\\
     \\midrule
"""

# Language-only NL mean minus the naive fine-tuning NL mean. This is NaN
# (shown as "-") when the language-only results are unavailable, instead of
# a misleading negative number computed from zeros.
original_llava_avg_delta_nl = (
    _nan_aware_hmean(language_only_llm_acc, _NL_COLUMNS)
    - _nan_aware_hmean(original_llava_acc, _NL_COLUMNS)
)

# The two baseline rows plus the rule that separates them from model rows.
_baseline_rows = (
    _latex_row(
        "Naive FT",
        original_llava_acc,
        _nan_aware_hmean(original_llava_acc, _VL_COLUMNS),
        format_value(original_llava_avg_delta_nl),
        _nan_aware_hmean(original_llava_acc, _NL_COLUMNS),
    )
    + _latex_row(
        "Language Only LLM",
        language_only_llm_acc,
        _nan_aware_hmean(language_only_llm_acc, _VL_COLUMNS),
        "-",  # the language-only model is the reference, so it has no delta
        _nan_aware_hmean(language_only_llm_acc, _NL_COLUMNS),
    )
    + "\\midrule\n"
)

_table_footer = """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

latex_code_lora += _baseline_rows
latex_code_lora += "".join(
    _latex_row(model, accuracies, avg_acc_vl, format_value(delta_nl), avg_acc_nl)
    for model, accuracies, avg_acc_vl, delta_nl, avg_acc_nl in lora_table_data
)
latex_code_lora += _table_footer

# Generate LaTeX table for other methods
latex_code_other = """
\\begin{table*}[h]
  \\caption{\\textbf{LLaVA Model Performance:} Other Methods}
  \\label{tab:other_methods_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cccc|c|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{4}{c|}{\\textbf{Vision-Language (VL)}} & \\textbf{VL Avg.} & \\multicolumn{2}{c}{\\textbf{NL Avg.}} \\\\
     & \\textbf{VQAv2} & \\textbf{TextVQA OCR} & \\textbf{TextVQA Pure} & \\textbf{GQA} & Acc $\\uparrow$ & $\\Delta \\downarrow$ & Acc $\\uparrow$ \\\\
     \\midrule
"""

latex_code_other += _baseline_rows
latex_code_other += "".join(
    _latex_row(model, accuracies, avg_acc_vl, format_value(delta_nl), avg_acc_nl)
    for model, accuracies, avg_acc_vl, delta_nl, avg_acc_nl in other_table_data
)
latex_code_other += _table_footer

print(latex_code_lora)
print(latex_code_other)


{
  "Language Only LLM": {
    "vqa-v2": 0.001,
    "textvqa-ocr": 0.001953125,
    "textvqa-pure": 0.0,
    "gqa": 0.002,
    "refcoco": 0.0,
    "wsc273": 0.6446886446886447,
    "winogrande": 0.5343330702446725,
    "lambada_standard": 0.39355715117407336,
    "arc_easy": 0.5147306397306397,
    "arc_challenge": 0.20648464163822525
  },
  "Naive FT (410M)": {
    "vqa-v2": 0.5488000000000001,
    "textvqa-ocr": 0.24384765625000018,
    "textvqa-pure": 0.20781250000000018,
    "gqa": 0.4102,
    "refcoco": 0.052734375,
    "wsc273": 0.6043956043956044,
    "winogrande": 0.5295974743488555,
    "lambada_standard": 0.3073937512128857,
    "arc_easy": 0.4978956228956229,
    "arc_challenge": 0.2295221843003413
  },
  "Naive FT (410M, +2 Epochs)": {
    "vqa-v2": 0.025699999999999997,
    "textvqa-ocr": 0.003125,
    "textvqa-pure": 0.0421875,
    "gqa": 0.0039000000000000003,
    "refcoco": 0.0341796875,
    "wsc273": 0.6190476190476191,
    "winogrande": 0.5445935280189423,
    "lambad

# Pythia1B Epochs Analysis

In [5]:
import os
import json
import numpy as np
from scipy.stats import hmean

# Benchmark keys extracted for every model later in this cell.
datasets = [
    'vqa-v2',
    'textvqa-ocr',
    'textvqa-pure',
    'gqa',
    'refcoco',
    'wsc273',
    'winogrande',
    'lambada_standard',
    'arc_easy',
    'arc_challenge',
]

# Location of the JSON file holding the evaluation results.
results_file = 'results_nlp.json'

# Default to an empty mapping; it is only replaced when the file exists.
result = {}
if not os.path.isfile(results_file):
    print(f"Error: File {results_file} not found.")
else:
    with open(results_file, 'r') as results_handle:
        result = json.load(results_handle)

# Maps each short method key to the model identifier that is looked up as a
# top-level key of the loaded results (`result`). Identifiers that are absent
# from `result` only trigger a warning and are left out of the tables.
model_mappings = {
    'vicuna' : 'reproduction-align-pythia+1b',
    'original' : 'stage-final-llava-v15-pythia+1b',
    'original+2epochs': 'stage-final-llava-v15-pythia+1b-epochs-2',
    'original+3epochs': 'stage-final-llava-v15-pythia+1b-epochs-3',
    'soft': 'stage-final-llava-v15-pythia+1b-soft',
    'soft+2epochs': 'stage-final-llava-v15-pythia+1b-soft-epochs-2',
    'soft+3epochs': 'stage-final-llava-v15-pythia+1b-soft-epochs-3',
    'sgm+oolf': 'stage-final-llava-v15-pythia+1b-sgm-oolf',
    'track_plasticity': 'stage-final-llava-v15-pythia+1b-lora-track-plasticity',
    'track_plasticity_constant_lr_with_warmup': 'stage-final-llava-v15-pythia+1b-lora-track-plasticity-constant_lr_with_warmup'
}

# Maps the same method keys to display labels. The labels become the keys of
# `results_dict` and the row labels of the non-baseline LaTeX rows.
# Every key of `model_mappings` needs an entry here: labels are fetched with
# `name_mapping[...]`, which raises KeyError for a missing key.
# The labels 'Naive FT (1B)' and 'Language Only LLM' are referenced by name
# when the baselines are looked up, so they must not be changed in isolation.
name_mapping = {
    'vicuna' : 'Language Only LLM',
    'original': 'Naive FT (1B)',
    'original+2epochs': 'Naive FT (1B, +2 Epochs)',
    'original+3epochs': 'Naive FT (1B, +3 Epochs)',
    'soft': 'Soft Targets (1B)',
    'soft+2epochs': 'Soft Targets (1B, +2 Epochs)',
    'soft+3epochs': 'Soft Targets (1B, +3 Epochs)',
    'sgm+oolf': 'SGM + Corrected OLF (1B)',
    'track_plasticity': 'LoRA (1B), Track Plasticity, Cosine LR',
    'track_plasticity_constant_lr_with_warmup': 'LoRA (1B), Track Plasticity, Constant LR, Warmup'
}

# One candidate list per method key. Each list ends up with at most one
# (model_name, mean_accuracy) pair, since every key maps to a single model.
model_results = {method: [] for method in model_mappings}

for method, model_name in model_mappings.items():
    if model_name not in result:
        print(f"Warning: Model '{model_name}' not found in results")
        continue
    # NaN-ignoring mean over every metric stored for this model.
    per_metric = list(result[model_name].values())
    model_results[method].append((model_name, np.nanmean(per_metric)))

# Keep the best-scoring candidate for every method that has one.
highest_accuracy_models = {
    method: max(candidates, key=lambda pair: pair[1])
    for method, candidates in model_results.items()
    if candidates
}

# Persist the selection as JSON.
output_file = 'highest_accuracy_models.json'
with open(output_file, 'w') as out_handle:
    json.dump(highest_accuracy_models, out_handle, indent=2)

# Display labels of the methods that were found, in `model_mappings` order.
methods = [
    name_mapping[method]
    for method in model_mappings
    if method in highest_accuracy_models
]

# Per-label accuracies restricted to `datasets`; absent datasets become NaN.
results_dict = {}
for method in model_mappings:
    if method not in highest_accuracy_models:
        continue
    model_name = highest_accuracy_models[method][0]
    model_metrics = result[model_name]
    results_dict[name_mapping[method]] = {
        dataset: model_metrics.get(dataset, np.nan) for dataset in datasets
    }

# Show the collected accuracies.
print(json.dumps(results_dict, indent=2))

import json
import numpy as np

# Build one table row per non-baseline model:
#   (label, accuracies, VL harmonic mean, NL delta, NL harmonic mean)
# where the NL delta is the language-only NL mean minus the model's NL mean.

# Dataset groups behind the two aggregate columns.
_VL_DATASETS = ("vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa")
_NL_DATASETS = ("wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard")


def _group_hmean(accuracies, dataset_names):
    """Return the harmonic mean of `accuracies` over `dataset_names`.

    A missing or NaN entry makes the aggregate NaN explicitly, instead of
    relying on how the installed `hmean` treats NaN input.
    """
    values = [accuracies.get(name, np.nan) for name in dataset_names]
    if any(np.isnan(value) for value in values):
        return np.nan
    return hmean(values)


def _nan_last(value):
    """Sort-key component ordering by `value` with NaN below every number.

    Comparisons involving NaN are always False, so raw NaN keys would leave
    the sort order undefined.
    """
    return (0, 0.0) if np.isnan(value) else (1, value)


original_llava_acc = results_dict["Naive FT (1B)"]
language_only_llm_acc = results_dict.get("Language Only LLM", {})

# The language-only baseline is the same for every row, so compute it once.
_baseline_nl = _group_hmean(language_only_llm_acc, _NL_DATASETS)

table_data = []

for model, accuracies in results_dict.items():
    # The two baselines get dedicated rows when the tables are rendered.
    if model in ("Naive FT (1B)", "Language Only LLM"):
        continue
    avg_acc_vl = _group_hmean(accuracies, _VL_DATASETS)
    avg_acc_nl = _group_hmean(accuracies, _NL_DATASETS)
    delta_nl = _baseline_nl - avg_acc_nl
    table_data.append((model, accuracies, avg_acc_vl, delta_nl, avg_acc_nl))

# Sort by VL average (highest first); ties are broken by the smallest NL
# delta. Rows whose aggregate is NaN are placed last.
table_data = sorted(
    table_data,
    key=lambda row: (_nan_last(row[2]), _nan_last(-row[3])),
    reverse=True,
)

# Separate the data into two groups by exact label match.
# NOTE(review): the LoRA labels in this cell's name_mapping ('LoRA (1B), ...')
# do not appear in this list, so they are reported under "Other Methods" and
# lora_table_data stays empty - confirm this is intended.
lora_variants = [
    'LoRA (1/4 Full Rank)', 'LoRA (1/4 Full Rank, Higher Alpha)',
    'LoRA (1/2 Full Rank)', 'LoRA (1/2 Full Rank, Higher Alpha)',
    'LoRA (1/2 Full Rank, RSLoRA)', 'LoRA (1/2 Full Rank, RSLoRA, KQV Target)'
]

lora_table_data = [item for item in table_data if item[0] in lora_variants]
other_table_data = [item for item in table_data if item[0] not in lora_variants]

def format_value(value):
    """Render a fractional score as a percentage with two decimals.

    NaN is rendered as "-" so missing results show up as empty table cells.
    """
    if np.isnan(value):
        return "-"
    return f"{value * 100:.2f}"

# Generate the LaTeX tables for the LoRA variants and for all other methods.
# Both tables share the column layout, the two baseline rows and the footer;
# only the caption, the label and the model rows differ.

# Datasets behind the per-dataset VL columns and the VL / NL aggregates.
_VL_COLUMNS = ("vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa")
_NL_COLUMNS = ("wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard")


def _nan_aware_hmean(accuracies, dataset_names):
    """Return the harmonic mean of `accuracies` over `dataset_names`.

    Missing or NaN entries yield NaN, which `format_value` renders as "-".
    Using NaN (rather than 0) for missing entries keeps the aggregate cells
    consistent with the per-dataset cells, which also default to NaN.
    """
    values = [accuracies.get(name, np.nan) for name in dataset_names]
    if any(np.isnan(value) for value in values):
        return np.nan
    return hmean(values)


def _latex_row(label, accuracies, acc_vl, delta_text, acc_nl):
    """Return one table line: label, four VL cells, VL avg, NL delta, NL avg.

    `delta_text` is passed pre-formatted so a row can show a literal "-".
    """
    cells = [label]
    cells.extend(format_value(accuracies.get(name, np.nan)) for name in _VL_COLUMNS)
    cells.extend([format_value(acc_vl), delta_text, format_value(acc_nl)])
    return " & ".join(cells) + " \\\\\n"


latex_code_lora = """
\\begin{table*}[h]
  \\caption{\\textbf{LLaVA Model Performance:} LoRA Variants}
  \\label{tab:lora_variants_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cccc|c|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{4}{c|}{\\textbf{Vision-Language (VL)}} & \\textbf{VL Avg.} & \\multicolumn{2}{c}{\\textbf{NL Avg.}} \\\\
     & \\textbf{VQAv2} & \\textbf{TextVQA OCR} & \\textbf{TextVQA Pure} & \\textbf{GQA} & Acc $\\uparrow$ & $\\Delta \\downarrow$ & Acc $\\uparrow$ \\\\
     \\midrule
"""

# Language-only NL mean minus the naive fine-tuning NL mean. This is NaN
# (shown as "-") when the language-only results are unavailable, instead of
# a misleading negative number computed from zeros.
original_llava_avg_delta_nl = (
    _nan_aware_hmean(language_only_llm_acc, _NL_COLUMNS)
    - _nan_aware_hmean(original_llava_acc, _NL_COLUMNS)
)

# The two baseline rows plus the rule that separates them from model rows.
_baseline_rows = (
    _latex_row(
        "Naive FT",
        original_llava_acc,
        _nan_aware_hmean(original_llava_acc, _VL_COLUMNS),
        format_value(original_llava_avg_delta_nl),
        _nan_aware_hmean(original_llava_acc, _NL_COLUMNS),
    )
    + _latex_row(
        "Language Only LLM",
        language_only_llm_acc,
        _nan_aware_hmean(language_only_llm_acc, _VL_COLUMNS),
        "-",  # the language-only model is the reference, so it has no delta
        _nan_aware_hmean(language_only_llm_acc, _NL_COLUMNS),
    )
    + "\\midrule\n"
)

_table_footer = """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

latex_code_lora += _baseline_rows
latex_code_lora += "".join(
    _latex_row(model, accuracies, avg_acc_vl, format_value(delta_nl), avg_acc_nl)
    for model, accuracies, avg_acc_vl, delta_nl, avg_acc_nl in lora_table_data
)
latex_code_lora += _table_footer

# Generate LaTeX table for other methods
latex_code_other = """
\\begin{table*}[h]
  \\caption{\\textbf{LLaVA Model Performance:} Other Methods}
  \\label{tab:other_methods_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cccc|c|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{4}{c|}{\\textbf{Vision-Language (VL)}} & \\textbf{VL Avg.} & \\multicolumn{2}{c}{\\textbf{NL Avg.}} \\\\
     & \\textbf{VQAv2} & \\textbf{TextVQA OCR} & \\textbf{TextVQA Pure} & \\textbf{GQA} & Acc $\\uparrow$ & $\\Delta \\downarrow$ & Acc $\\uparrow$ \\\\
     \\midrule
"""

latex_code_other += _baseline_rows
latex_code_other += "".join(
    _latex_row(model, accuracies, avg_acc_vl, format_value(delta_nl), avg_acc_nl)
    for model, accuracies, avg_acc_vl, delta_nl, avg_acc_nl in other_table_data
)
latex_code_other += _table_footer

print(latex_code_lora)
print(latex_code_other)


{
  "Language Only LLM": {
    "vqa-v2": 0.044000000000000004,
    "textvqa-ocr": 0.00615234375,
    "textvqa-pure": 0.04140625,
    "gqa": 0.0127,
    "refcoco": 0.0,
    "wsc273": 0.673992673992674,
    "winogrande": 0.5280189423835833,
    "lambada_standard": 0.4395497768290316,
    "arc_easy": 0.5845959595959596,
    "arc_challenge": 0.24488054607508533
  },
  "Naive FT (1B)": {
    "vqa-v2": 0.6443000000000001,
    "textvqa-ocr": 0.3836914062500002,
    "textvqa-pure": 0.3547851562499999,
    "gqa": 0.45409999999999995,
    "refcoco": 0.078125,
    "wsc273": 0.6556776556776557,
    "winogrande": 0.5351223362273086,
    "lambada_standard": 0.390064040364836,
    "arc_easy": 0.5715488215488216,
    "arc_challenge": 0.27047781569965873
  },
  "Naive FT (1B, +2 Epochs)": {
    "vqa-v2": 0.664,
    "textvqa-ocr": 0.38593750000000027,
    "textvqa-pure": 0.36591796875000004,
    "gqa": 0.4805,
    "refcoco": 0.1220703125,
    "wsc273": 0.663003663003663,
    "winogrande": 0.537490134175

## 410M Schedule Free Analysis

In [2]:
import os
import json
import numpy as np
from scipy.stats import hmean

# Benchmark keys extracted for every model later in this cell.
datasets = [
    'vqa-v2',
    'textvqa-ocr',
    'textvqa-pure',
    'gqa',
    'refcoco',
    'wsc273',
    'winogrande',
    'lambada_standard',
    'arc_easy',
    'arc_challenge',
]

# Location of the JSON file holding the evaluation results.
results_file = 'results_nlp.json'

# Default to an empty mapping; it is only replaced when the file exists.
result = {}
if not os.path.isfile(results_file):
    print(f"Error: File {results_file} not found.")
else:
    with open(results_file, 'r') as results_handle:
        result = json.load(results_handle)

# Maps each short method key to the model identifier that is looked up as a
# top-level key of the loaded results (`result`). Identifiers that are absent
# from `result` only trigger a warning and are left out of the tables.
model_mappings = {
    'vicuna' : 'reproduction-align-pythia+410m',
    'original' : 'stage-final-llava-v15-pythia+410m',
    'original+2epochs': 'stage-final-llava-v15-pythia+410m-epochs-2',
    'original+3epochs': 'stage-final-llava-v15-pythia+410m-epochs-3',
    'soft': 'stage-final-llava-v15-pythia+410m-soft',
    'soft+2epochs': 'stage-final-llava-v15-pythia+410m-soft-epochs-2',
    'soft+3epochs': 'stage-final-llava-v15-pythia+410m-soft-epochs-3',
    'oolf': 'stage-final-llava-v15-pythia+410m-oolf',
    'sgm-oolf': 'stage-final-llava-v15-pythia+410m-sgm-oolf',
    'sgm': 'stage-final-llava-v15-pythia+410m-sgm',
    'merging_per_epoch_1_epoch_2': 'stage-final-llava-v15-pythia+410m-merging_per_epoch-1-epochs-2',
    'merging_per_epoch_2_epoch_1': 'stage-final-llava-v15-pythia+410m-merging_per_epoch-2-epochs-1',
    'merging_per_epoch_2_epoch_2': 'stage-final-llava-v15-pythia+410m-merging_per_epoch-2-epochs-2',
    'schedule_free_epoch_1': 'stage-final-llava-v15-pythia+410m-schedule-free-epochs-1',
    'schedule_free_epoch_2': 'stage-final-llava-v15-pythia+410m-schedule-free-epochs-2',
    'schedule_free_epoch_3': 'stage-final-llava-v15-pythia+410m-schedule-free-epochs-3',
    'schedule_free_epoch_4': 'stage-final-llava-v15-pythia+410m-schedule-free-epochs-4',
    'infinite_rsqrt_schedule_1': 'stage-final-llava-v15-pythia+410m-infinite_rsqrt_schedule-epochs-1',
    'infinite_rsqrt_schedule_2': 'stage-final-llava-v15-pythia+410m-infinite_rsqrt_schedule-epochs-2',
    'infinite_rsqrt_schedule_3': 'stage-final-llava-v15-pythia+410m-infinite_rsqrt_schedule-epochs-3',
    'lora_track_plasticity': 'stage-final-llava-v15-pythia+410m-lora-track-plasticity',
    'lora_track_plasticity_constant_lr': 'stage-final-llava-v15-pythia+410m-lora-track-plasticity-constant_lr',
    'lora_track_plasticity_constant_lr_with_warmup': 'stage-final-llava-v15-pythia+410m-lora-track-plasticity-constant_lr_with_warmup',
    'adalora': 'stage-final-llava-v15-pythia+410m-adalora-constant_lr_warmup'
}

# Maps the same method keys to display labels. The labels become the keys of
# `results_dict` and the row labels of the non-baseline LaTeX rows.
# Every key of `model_mappings` needs an entry here: labels are fetched with
# `name_mapping[...]`, which raises KeyError for a missing key.
# The labels 'Naive FT (410M)' and 'Language Only LLM' are referenced by name
# when the baselines are looked up, so they must not be changed in isolation.
name_mapping = {
    'vicuna' : 'Language Only LLM',
    'original': 'Naive FT (410M)',
    'original+2epochs': 'Naive FT (410M, +2 Epochs)',
    'original+3epochs': 'Naive FT (410M, +3 Epochs)',
    'soft': 'Soft Targets (410M)',
    'soft+2epochs': 'Soft Targets (410M, +2 Epochs)',
    'soft+3epochs': 'Soft Targets (410M, +3 Epochs)',
    'oolf': 'Corrected OLF LLaVA (410M)',
    'sgm-oolf': 'SGM + Corrected OLF (410M)',
    'sgm': 'SGM (410M)',
    'merging_per_epoch_1_epoch_2': 'Naive FT (410M, +2 Epochs, Merge - 5198 Steps)',
    'merging_per_epoch_2_epoch_1': 'Naive FT (410M, +1 Epoch, Merge - 2599 Steps)',
    'merging_per_epoch_2_epoch_2': 'Naive FT (410M, +2 Epochs, Merge - 2599 Steps)',
    'schedule_free_epoch_1': 'Naive FT (410M, +1 Epoch) Sch. Free',
    'schedule_free_epoch_2': 'Naive FT (410M, +2 Epochs) Sch. Free',
    'schedule_free_epoch_3': 'Naive FT (410M, +3 Epochs) Sch. Free',
    'schedule_free_epoch_4': 'Naive FT (410M, +4 Epochs) Sch. Free',
    'infinite_rsqrt_schedule_1': 'Naive FT (410M, +1 Epoch) Inf. RSqrt',
    'infinite_rsqrt_schedule_2': 'Naive FT (410M, +2 Epochs) Inf. RSqrt',
    'infinite_rsqrt_schedule_3': 'Naive FT (410M, +3 Epochs) Inf. RSqrt',
    'lora_track_plasticity': 'LoRA (410M) Track Plasticity',
    'lora_track_plasticity_constant_lr': 'LoRA (410M) Track Plasticity, Const. LR',
    'lora_track_plasticity_constant_lr_with_warmup': 'LoRA (410M) Track Plasticity, Const. LR, Warmup',
    'adalora': 'AdaLoRA (410M) Const. LR, Warmup'
}

# One candidate list per method key. Each list ends up with at most one
# (model_name, mean_accuracy) pair, since every key maps to a single model.
model_results = {method: [] for method in model_mappings}

for method, model_name in model_mappings.items():
    if model_name not in result:
        print(f"Warning: Model '{model_name}' not found in results")
        continue
    # NaN-ignoring mean over every metric stored for this model.
    per_metric = list(result[model_name].values())
    model_results[method].append((model_name, np.nanmean(per_metric)))

# Keep the best-scoring candidate for every method that has one.
highest_accuracy_models = {
    method: max(candidates, key=lambda pair: pair[1])
    for method, candidates in model_results.items()
    if candidates
}

# Persist the selection as JSON.
output_file = 'highest_accuracy_models.json'
with open(output_file, 'w') as out_handle:
    json.dump(highest_accuracy_models, out_handle, indent=2)

# Display labels of the methods that were found, in `model_mappings` order.
methods = [
    name_mapping[method]
    for method in model_mappings
    if method in highest_accuracy_models
]

# Per-label accuracies restricted to `datasets`; absent datasets become NaN.
results_dict = {}
for method in model_mappings:
    if method not in highest_accuracy_models:
        continue
    model_name = highest_accuracy_models[method][0]
    model_metrics = result[model_name]
    results_dict[name_mapping[method]] = {
        dataset: model_metrics.get(dataset, np.nan) for dataset in datasets
    }

# Show the collected accuracies.
print(json.dumps(results_dict, indent=2))

import json
import numpy as np

# Build one table row per non-baseline model:
#   (label, accuracies, VL harmonic mean, NL delta, NL harmonic mean)
# where the NL delta is the language-only NL mean minus the model's NL mean.

# Dataset groups behind the two aggregate columns.
_VL_DATASETS = ("vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa")
_NL_DATASETS = ("wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard")


def _group_hmean(accuracies, dataset_names):
    """Return the harmonic mean of `accuracies` over `dataset_names`.

    A missing or NaN entry makes the aggregate NaN explicitly, instead of
    relying on how the installed `hmean` treats NaN input.
    """
    values = [accuracies.get(name, np.nan) for name in dataset_names]
    if any(np.isnan(value) for value in values):
        return np.nan
    return hmean(values)


def _nan_last(value):
    """Sort-key component ordering by `value` with NaN below every number.

    Comparisons involving NaN are always False, so raw NaN keys would leave
    the sort order undefined.
    """
    return (0, 0.0) if np.isnan(value) else (1, value)


original_llava_acc = results_dict["Naive FT (410M)"]
language_only_llm_acc = results_dict.get("Language Only LLM", {})

# The language-only baseline is the same for every row, so compute it once.
_baseline_nl = _group_hmean(language_only_llm_acc, _NL_DATASETS)

table_data = []

for model, accuracies in results_dict.items():
    # The two baselines get dedicated rows when the tables are rendered.
    if model in ("Naive FT (410M)", "Language Only LLM"):
        continue
    avg_acc_vl = _group_hmean(accuracies, _VL_DATASETS)
    avg_acc_nl = _group_hmean(accuracies, _NL_DATASETS)
    delta_nl = _baseline_nl - avg_acc_nl
    table_data.append((model, accuracies, avg_acc_vl, delta_nl, avg_acc_nl))

# Sort by VL average (highest first); ties are broken by the smallest NL
# delta. Rows whose aggregate is NaN are placed last.
table_data = sorted(
    table_data,
    key=lambda row: (_nan_last(row[2]), _nan_last(-row[3])),
    reverse=True,
)

# Separate the data into two groups by exact label match.
# NOTE(review): the LoRA / AdaLoRA labels in this cell's name_mapping
# ('LoRA (410M) Track Plasticity', ...) do not appear in this list, so they
# are reported under "Other Methods" and lora_table_data stays empty -
# confirm this is intended.
lora_variants = [
    'LoRA (1/4 Full Rank)', 'LoRA (1/4 Full Rank, Higher Alpha)',
    'LoRA (1/2 Full Rank)', 'LoRA (1/2 Full Rank, Higher Alpha)',
    'LoRA (1/2 Full Rank, RSLoRA)', 'LoRA (1/2 Full Rank, RSLoRA, KQV Target)'
]

lora_table_data = [item for item in table_data if item[0] in lora_variants]
other_table_data = [item for item in table_data if item[0] not in lora_variants]

def format_value(value):
    """Render a fractional score as a percentage with two decimals.

    NaN is rendered as "-" so missing results show up as empty table cells.
    """
    if np.isnan(value):
        return "-"
    return f"{value * 100:.2f}"

# Build the LaTeX table for the LoRA variants: fixed preamble, the two
# baseline rows, a rule, then one row per entry of lora_table_data.
latex_code_lora = """
\\begin{table*}[h]
  \\caption{\\textbf{LLaVA Model Performance:} LoRA Variants}
  \\label{tab:lora_variants_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cccc|c|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{4}{c|}{\\textbf{Vision-Language (VL)}} & \\textbf{VL Avg.} & \\multicolumn{2}{c}{\\textbf{NL Avg.}} \\\\
     & \\textbf{VQAv2} & \\textbf{TextVQA OCR} & \\textbf{TextVQA Pure} & \\textbf{GQA} & Acc $\\uparrow$ & $\\Delta \\downarrow$ & Acc $\\uparrow$ \\\\
     \\midrule
"""

# Dataset keys behind the VL columns and the NL averages.
vl_datasets = ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa"]
nl_datasets = ["wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard"]

# Harmonic-mean NL accuracy of both baselines; their difference is the NL
# delta reported for the naive fine-tune row.
llm_nl_hmean = hmean([language_only_llm_acc.get(key, 0) for key in nl_datasets])
naive_nl_hmean = hmean([original_llava_acc[key] for key in nl_datasets])
original_llava_avg_delta_nl = llm_nl_hmean - naive_nl_hmean

# Baseline row 1: naive fine-tune.
naive_cells = [format_value(original_llava_acc[key]) for key in vl_datasets]
naive_cells += [
    format_value(hmean([original_llava_acc[key] for key in vl_datasets])),
    format_value(original_llava_avg_delta_nl),
    format_value(naive_nl_hmean),
]
latex_code_lora += "Naive FT & " + " & ".join(naive_cells) + " \\\\\n"

# Baseline row 2: language-only LLM. Its delta column is a literal "-".
llm_cells = [format_value(language_only_llm_acc.get(key, np.nan)) for key in vl_datasets]
llm_cells += [
    format_value(hmean([language_only_llm_acc.get(key, 0) for key in vl_datasets])),
    "-",
    format_value(llm_nl_hmean),
]
latex_code_lora += "Language Only LLM & " + " & ".join(llm_cells) + " \\\\\n"

latex_code_lora += "\\midrule\n"

# One row per LoRA variant, in the order of lora_table_data.
for model, accuracies, avg_acc_vl, delta_nl, avg_acc_nl in lora_table_data:
    row_cells = [model]
    row_cells.extend(format_value(accuracies[key]) for key in vl_datasets)
    row_cells.extend(format_value(stat) for stat in (avg_acc_vl, delta_nl, avg_acc_nl))
    latex_code_lora += " & ".join(row_cells) + " \\\\\n"

latex_code_lora += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

# Generate LaTeX table for other methods: fixed preamble, the two baseline
# rows, a rule, then one row per entry of other_table_data.
# Fix: the tabular header previously read "\begin{tabular{l|...}" (missing
# the closing brace after "tabular"), which does not compile in LaTeX.
latex_code_other = """
\\begin{table*}[h]
  \\caption{\\textbf{LLaVA Model Performance:} Other Methods}
  \\label{tab:other_methods_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cccc|c|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{4}{c|}{\\textbf{Vision-Language (VL)}} & \\textbf{VL Avg.} & \\multicolumn{2}{c}{\\textbf{NL Avg.}} \\\\
     & \\textbf{VQAv2} & \\textbf{TextVQA OCR} & \\textbf{TextVQA Pure} & \\textbf{GQA} & Acc $\\uparrow$ & $\\Delta \\downarrow$ & Acc $\\uparrow$ \\\\
     \\midrule
"""

# Dataset keys behind the VL columns and the NL averages.
vl_datasets = ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa"]
nl_datasets = ["wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard"]

# Baseline row 1: naive fine-tune. original_llava_avg_delta_nl is computed
# earlier in this cell, while building the LoRA table.
naive_cells = [format_value(original_llava_acc[key]) for key in vl_datasets]
naive_cells += [
    format_value(hmean([original_llava_acc[key] for key in vl_datasets])),
    format_value(original_llava_avg_delta_nl),
    format_value(hmean([original_llava_acc[key] for key in nl_datasets])),
]
latex_code_other += "Naive FT & " + " & ".join(naive_cells) + " \\\\\n"

# Baseline row 2: language-only LLM. Its delta column is a literal "-".
llm_cells = [format_value(language_only_llm_acc.get(key, np.nan)) for key in vl_datasets]
llm_cells += [
    format_value(hmean([language_only_llm_acc.get(key, 0) for key in vl_datasets])),
    "-",
    format_value(hmean([language_only_llm_acc.get(key, 0) for key in nl_datasets])),
]
latex_code_other += "Language Only LLM & " + " & ".join(llm_cells) + " \\\\\n"

latex_code_other += "\\midrule\n"

# One row per remaining method, in the order of other_table_data.
for model, accuracies, avg_acc_vl, delta_nl, avg_acc_nl in other_table_data:
    row_cells = [model]
    row_cells.extend(format_value(accuracies[key]) for key in vl_datasets)
    row_cells.extend(format_value(stat) for stat in (avg_acc_vl, delta_nl, avg_acc_nl))
    latex_code_other += " & ".join(row_cells) + " \\\\\n"

latex_code_other += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

# Emit both tables.
print(latex_code_lora)
print(latex_code_other)


{
  "Language Only LLM": {
    "vqa-v2": NaN,
    "textvqa-ocr": NaN,
    "textvqa-pure": NaN,
    "gqa": NaN,
    "refcoco": NaN,
    "wsc273": 0.6446886446886447,
    "winogrande": 0.5343330702446725,
    "lambada_standard": 0.39355715117407336,
    "arc_easy": 0.5147306397306397,
    "arc_challenge": 0.20648464163822525
  },
  "Naive FT (410M)": {
    "vqa-v2": 0.5488000000000001,
    "textvqa-ocr": 0.24384765625000018,
    "textvqa-pure": 0.20781250000000018,
    "gqa": 0.4102,
    "refcoco": 0.052734375,
    "wsc273": 0.6043956043956044,
    "winogrande": 0.5295974743488555,
    "lambada_standard": 0.3073937512128857,
    "arc_easy": 0.4978956228956229,
    "arc_challenge": 0.2295221843003413
  },
  "Naive FT (410M, +2 Epochs)": {
    "vqa-v2": 0.025699999999999997,
    "textvqa-ocr": 0.003125,
    "textvqa-pure": 0.0421875,
    "gqa": 0.0039000000000000003,
    "refcoco": 0.0341796875,
    "wsc273": 0.6190476190476191,
    "winogrande": 0.5445935280189423,
    "lambada_standard":

# Pythia 160M LoRA Merging 

In [1]:
import os
import json
import numpy as np
from scipy.stats import hmean

# Datasets reported per model.
datasets = [
    'vqa-v2',
    'textvqa-ocr',
    'textvqa-pure',
    'gqa',
    'refcoco',
    'wsc273',
    'winogrande',
    'lambada_standard',
    'arc_easy',
    'arc_challenge',
]

# Results loaded from disk; stays empty when the file is missing.
result = {}

# Path to the JSON results file
results_file = 'results_nlp.json'

# Load the results, or report the missing file and carry on with {}.
if not os.path.isfile(results_file):
    print(f"Error: File {results_file} not found.")
else:
    with open(results_file, 'r') as f:
        result = json.load(f)

# Method key -> model name as it appears in the results file.
model_mappings = {
    'vicuna': 'reproduction-align-pythia+160m',
    'original': 'stage-final-llava-v15-pythia+160m',
    'soft': 'stage-final-llava-v15-pythia+160m-soft',
    'oolf': 'stage-final-llava-v15-pythia+160m-oolf',
    'sgm-oolf': 'stage-final-llava-v15-pythia+160m-sgm-oolf',
    'sgm': 'stage-final-llava-v15-pythia+160m-sgm',
    'lora': 'stage-final-llava-v15-pythia+160m-lora',
    # Periodic-merging runs.
    'merging_per_epoch_1_epoch_2': 'stage-final-llava-v15-pythia+160m-merging_per_epoch-1-epochs-2',
    'merging_per_epoch_2_epoch_1': 'stage-final-llava-v15-pythia+160m-merging_per_epoch-2-epochs-1',
    'merging_per_epoch_2_epoch_2': 'stage-final-llava-v15-pythia+160m-merging_per_epoch-2-epochs-2',
    'merging_per_epoch_4_epoch_1': 'stage-final-llava-v15-pythia+160m-merging_per_epoch-4-epochs-1',
    'merging_per_epoch_8_epoch_1': 'stage-final-llava-v15-pythia+160m-merging_per_epoch-8-epochs-1',
    'merging_per_epoch_16_epoch_1': 'stage-final-llava-v15-pythia+160m-merging_per_epoch-16-epochs-1',
    'merging_per_epoch_32_epoch_1': 'stage-final-llava-v15-pythia+160m-merging_per_epoch-32-epochs-1',
    'merging_per_epoch_64_epoch_1': 'stage-final-llava-v15-pythia+160m-merging_per_epoch-64-epochs-1',
    'merging_per_epoch_128_epoch_1': 'stage-final-llava-v15-pythia+160m-merging_per_epoch-128-epochs-1',
    # LoRA runs merged after a fixed number of steps.
    'lora_after_warmup_1040': 'stage-final-llava-v15-pythia+160m-merges_after_steps-1040-lora_after_warmup',
    'lora_after_warmup_2600': 'stage-final-llava-v15-pythia+160m-merges_after_steps-2600-lora_after_warmup',
    'lora_after_warmup_650': 'stage-final-llava-v15-pythia+160m-merges_after_steps-650-lora_after_warmup',
    'lora_after_warmup_400': 'stage-final-llava-v15-pythia+160m-merges_after_steps-400-lora_after_warmup',
    'lora_after_warmup_200': 'stage-final-llava-v15-pythia+160m-merges_after_steps-200-lora_after_warmup-rerun',
    # Remaining LoRA / AdaLoRA runs.
    'lora_track_plasiticty': 'stage-final-llava-v15-pythia+160m-lora-track-plasticity',
    'lora_track_plasticity_constant_lr_with_warmup': 'stage-final-llava-v15-pythia+160m-lora-track-plasticity-constant_lr_with_warmup',
    'adalora': 'stage-final-llava-v15-pythia+160m-adalora-constant_lr_warmup',
    'lora-rerun': 'stage-final-llava-v15-pythia+160m-lora-track-plasticity-logpl-constant_lr_with_warmup',
}

# Label name mappings for the main methods and variants
def get_merge_label(base_label, merge_per_epoch, steps_per_epoch=5198):
    """Return ``base_label`` suffixed with the number of steps between merges.

    Args:
        base_label: Display label to extend.
        merge_per_epoch: Divisor applied to ``steps_per_epoch``; must be
            non-zero.
        steps_per_epoch: Total being divided. Defaults to 5198, the value
            that used to be hard-coded here (presumably the number of
            training steps in one epoch -- confirm against the training
            configuration).

    Returns:
        ``"<base_label>, Merge - <N> Steps"`` where ``N`` is
        ``steps_per_epoch / merge_per_epoch`` truncated to an integer.

    Raises:
        ZeroDivisionError: If ``merge_per_epoch`` is 0.
    """
    steps_per_merge = int(steps_per_epoch / merge_per_epoch)
    return f'{base_label}, Merge - {steps_per_merge} Steps'

# Base labels shared by the periodic-merging entries below.
one_epoch_label = 'Naive FT (160M, +1 Epoch)'
two_epochs_label = 'Naive FT (160M, +2 Epochs)'

# Method key -> display label used in the printed results and LaTeX tables.
name_mapping = {
    # Baselines and main methods.
    'vicuna': 'Language Only LLM',
    'original': 'Naive FT (160M)',
    'lora': 'LoRA (160M)',
    'soft': 'Soft Targets (160M)',
    'oolf': 'Corrected OLF LLaVA (160M)',
    'sgm-oolf': 'SGM + Corrected OLF (160M)',
    'sgm': 'SGM (160M)',
    # Periodic-merging runs; the label suffix is derived by get_merge_label.
    'merging_per_epoch_1_epoch_2': get_merge_label(two_epochs_label, 1),
    'merging_per_epoch_2_epoch_1': get_merge_label(one_epoch_label, 2),
    'merging_per_epoch_2_epoch_2': get_merge_label(two_epochs_label, 2),
    'merging_per_epoch_4_epoch_1': get_merge_label(one_epoch_label, 4),
    'merging_per_epoch_8_epoch_1': get_merge_label(one_epoch_label, 8),
    'merging_per_epoch_16_epoch_1': get_merge_label(one_epoch_label, 16),
    'merging_per_epoch_32_epoch_1': get_merge_label(one_epoch_label, 32),
    'merging_per_epoch_64_epoch_1': get_merge_label(one_epoch_label, 64),
    'merging_per_epoch_128_epoch_1': get_merge_label(one_epoch_label, 128),
    # Schedule-free labels.
    'schedule_free_epoch_1': 'Naive FT (160M, +1 Epoch) Sch. Free',
    'schedule_free_epoch_2': 'Naive FT (160M, +2 Epochs) Sch. Free',
    'schedule_free_epoch_3': 'Naive FT (160M, +3 Epochs) Sch. Free',
    'schedule_free_epoch_4': 'Naive FT (160M, +4 Epochs) Sch. Free',
    # LoRA runs merged after a fixed number of steps.
    'lora_after_warmup_1040': 'LoRA (160M) RSLoRA, Merge - 1040 Steps',
    'lora_after_warmup_2600': 'LoRA (160M) RSLoRA, Merge - 2600 Steps',
    'lora_after_warmup_650': 'LoRA (160M) RSLoRA, Merge - 650 Steps',
    'lora_after_warmup_400': 'LoRA (160M) RSLoRA, Merge - 400 Steps',
    'lora_after_warmup_200': 'LoRA (160M) RSLoRA, Merge - 200 Steps',
    # Remaining LoRA / AdaLoRA runs.
    'lora_track_plasiticty': 'LoRA (160M) LoRA, Track Plasticity',
    'lora_track_plasticity_constant_lr_with_warmup': 'LoRA (160M) LoRA, Track Plasticity, Constant LR with Warmup',
    'adalora': 'AdaLoRA (160M)',
    'lora-rerun': 'LoRA (160M)*',
}

# Per method key, the (model name, mean accuracy) pairs found in the results.
model_results = {method_key: [] for method_key in model_mappings}

# Record each mapped model that exists in the results; warn about the rest.
for method, model_name in model_mappings.items():
    if model_name not in result:
        print(f"Warning: Model '{model_name}' not found in results")
        continue
    metrics = result[model_name]
    accuracies = list(metrics.values())
    # NaN-aware mean over all of the model's reported metrics.
    avg_accuracy = np.nanmean(accuracies)
    model_results[method].append((model_name, avg_accuracy))

# Keep, for every method with at least one entry, the pair with the highest
# mean accuracy.
highest_accuracy_models = {
    method_key: max(candidates, key=lambda pair: pair[1])
    for method_key, candidates in model_results.items()
    if candidates
}

# Persist the selection next to the notebook.
output_file = 'highest_accuracy_models.json'
with open(output_file, 'w') as f:
    json.dump(highest_accuracy_models, f, indent=2)

# Prepare data for radar chart.
# Display labels of every method that has a selected model, in the order the
# methods appear in model_mappings.
methods = [name_mapping[key] for key in model_mappings if key in highest_accuracy_models]

# Map each display label to its per-dataset accuracies; a dataset missing
# from a model's metrics is recorded as NaN.
results_dict = {}
for model in model_mappings:
    if model not in highest_accuracy_models:
        continue
    model_name, _ = highest_accuracy_models[model]
    metrics = result[model_name]
    results_dict[name_mapping[model]] = {
        dataset: metrics.get(dataset, np.nan) for dataset in datasets
    }

# Dump the assembled dictionary for inspection.
print(json.dumps(results_dict, indent=2))

import json
import numpy as np

# Baseline rows: the naive fine-tune must be present; the language-only LLM
# is optional and falls back to an empty dict.
original_llava_acc = results_dict["Naive FT (160M)"]
language_only_llm_acc = results_dict.get("Language Only LLM", {})

# Dataset keys behind the vision-language (VL) and natural-language (NL) averages.
vl_datasets = ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa"]
nl_datasets = ["wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard"]
baseline_labels = ["Naive FT (160M)", "Language Only LLM"]

# One tuple per non-baseline model:
# (label, per-dataset accuracies, VL harmonic mean, NL delta, NL harmonic mean).
table_data = []
for model, accuracies in results_dict.items():
    if model not in baseline_labels:
        avg_acc_vl = hmean([accuracies[key] for key in vl_datasets])
        avg_acc_nl = hmean([accuracies[key] for key in nl_datasets])
        # NL delta = language-only NL harmonic mean minus this model's.
        llm_nl_hmean = hmean([language_only_llm_acc.get(key, np.nan) for key in nl_datasets])
        delta_nl = llm_nl_hmean - avg_acc_nl
        table_data.append((model, accuracies, avg_acc_vl, delta_nl, avg_acc_nl))

# reverse=True on (VL avg, -delta): highest VL average first, ties broken by
# the smallest NL delta.
table_data.sort(key=lambda row: (row[2], -row[3]), reverse=True)

def format_value(value):
    """Return *value* scaled by 100 with two decimals, or "-" when it is NaN."""
    if np.isnan(value):
        return "-"
    return f"{value * 100:.2f}"

# Build the LaTeX table for all methods: fixed preamble, the two baseline
# rows, a rule, then one row per entry of table_data.
latex_code_other = """
\\begin{table*}[h]
  \\caption{\\textbf{LLaVA Model Performance:} Other Methods}
  \\label{tab:other_methods_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cccc|c|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{4}{c|}{\\textbf{Vision-Language (VL)}} & \\textbf{VL Avg.} & \\multicolumn{2}{c}{\\textbf{NL Avg.}} \\\\
     & \\textbf{VQAv2} & \\textbf{TextVQA OCR} & \\textbf{TextVQA Pure} & \\textbf{GQA} & Acc $\\uparrow$ & $\\Delta \\downarrow$ & Acc $\\uparrow$ \\\\
     \\midrule
"""

# Dataset keys behind the VL columns and the NL averages.
vl_datasets = ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa"]
nl_datasets = ["wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard"]

# Harmonic-mean NL accuracy of both baselines; their difference is the NL
# delta reported for the naive fine-tune row.
llm_nl_hmean = hmean([language_only_llm_acc.get(key, 0) for key in nl_datasets])
naive_nl_hmean = hmean([original_llava_acc[key] for key in nl_datasets])
original_llava_avg_delta_nl = llm_nl_hmean - naive_nl_hmean

# Baseline row 1: naive fine-tune.
naive_cells = [format_value(original_llava_acc[key]) for key in vl_datasets]
naive_cells += [
    format_value(hmean([original_llava_acc[key] for key in vl_datasets])),
    format_value(original_llava_avg_delta_nl),
    format_value(naive_nl_hmean),
]
latex_code_other += "Naive FT & " + " & ".join(naive_cells) + " \\\\\n"

# Baseline row 2: language-only LLM. Its delta column is a literal "-".
llm_cells = [format_value(language_only_llm_acc.get(key, np.nan)) for key in vl_datasets]
llm_cells += [
    format_value(hmean([language_only_llm_acc.get(key, 0) for key in vl_datasets])),
    "-",
    format_value(llm_nl_hmean),
]
latex_code_other += "Language Only LLM & " + " & ".join(llm_cells) + " \\\\\n"

latex_code_other += "\\midrule\n"

# One row per remaining method, in the order of table_data.
for model, accuracies, avg_acc_vl, delta_nl, avg_acc_nl in table_data:
    row_cells = [model]
    row_cells.extend(format_value(accuracies[key]) for key in vl_datasets)
    row_cells.extend(format_value(stat) for stat in (avg_acc_vl, delta_nl, avg_acc_nl))
    latex_code_other += " & ".join(row_cells) + " \\\\\n"

latex_code_other += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

# Emit the table.
print(latex_code_other)


{
  "Language Only LLM": {
    "vqa-v2": NaN,
    "textvqa-ocr": NaN,
    "textvqa-pure": NaN,
    "gqa": NaN,
    "refcoco": NaN,
    "wsc273": 0.5677655677655677,
    "winogrande": 0.4956590370955012,
    "lambada_standard": 0.2334562390840287,
    "arc_easy": 0.44234006734006737,
    "arc_challenge": 0.19965870307167236
  },
  "Naive FT (160M)": {
    "vqa-v2": 0.3032,
    "textvqa-ocr": 0.0240234375,
    "textvqa-pure": 0.038281249999999996,
    "gqa": 0.2217,
    "refcoco": 0.00390625,
    "wsc273": 0.5347985347985348,
    "winogrande": 0.5193370165745856,
    "lambada_standard": 0.11313797787696488,
    "arc_easy": 0.390993265993266,
    "arc_challenge": 0.20051194539249148
  },
  "Soft Targets (160M)": {
    "vqa-v2": 0.3267,
    "textvqa-ocr": 0.06923828124999999,
    "textvqa-pure": 0.061035156249999986,
    "gqa": 0.2539,
    "refcoco": 0.015625,
    "wsc273": 0.5128205128205128,
    "winogrande": 0.5082872928176796,
    "lambada_standard": 0.17019212109450804,
    "arc_easy"