In [5]:

import os
import json
import numpy as np

# Datasets of interest
datasets = [
    'vqa-v2',
    'textvqa-ocr',
    'textvqa-pure',
    'gqa',
    'refcoco',
    'wsc273',
    'winogrande',
    'lambada_standard',
    'arc_easy',
    'arc_challenge',
]

# Location of the JSON results file
results_file = 'results_nlp.json'

# Result dictionary; it stays empty when the results file is missing
results_nlp = {}

# Report a missing file, otherwise parse the JSON results into results_nlp
if not os.path.isfile(results_file):
    print(f"Error: File {results_file} not found.")
else:
    with open(results_file, 'r') as f:
        results_nlp = json.load(f)


In [11]:
import json
import numpy as np

# Read the raw text of the JSON results file and parse it into results_dict
with open('results_nlp.json') as f:
    results_dict = json.loads(f.read())

# Models to report, keyed by the model's entry name in the results file.
# Each value is (baseline entry name, display label used in the table).
# NOTE(review): the key containing "instruct" carries the label without
# "Instruct" and the key without it carries the "Instruct" label -- confirm
# the two Pythia labels are not swapped.
models_of_interest = {
    "stage-final-llava-v15-pythia+1p4b": (
        "reproduction-align-pythia+1p4b",
        "LLaVA + Pythia Instruct (1.4B)",
    ),
    "stage-final-llava-v15-pythia+1p4b-instruct-old": (
        "reproduction-align-pythia+1p4b-instruct-old",
        "LLaVA + Pythia (1.4B)",
    ),
    "reproduction-llava-v15+7b+stage-finetune+x7": (
        "reproduction-llava-v15+7b+stage-align+x7",
        "LLaVA + LLaMA2 Instruct (7B)",
    ),
    "reproduction-llama2": (
        "vila_base_llm",
        "LLaVA + LLaMA2 Base (7B)",
    ),
}

# Function to format values or return "-"
def format_value(value):
    """Render a fractional score as a percentage string with one decimal place.

    Args:
        value: A number (e.g. 0.759), NaN, or None. None is accepted because
            a JSON ``null`` is decoded to None by ``json.load``.

    Returns:
        ``value * 100`` formatted with one decimal (e.g. "75.9"), or "-" when
        the value is missing (None or NaN).
    """
    # Check None first: np.isnan(None) raises TypeError.
    if value is None or np.isnan(value):
        return "-"
    return "{:.1f}".format(value * 100)

# Prepare the data for the LaTeX tables.
# Each row is a tuple:
#   (label, accuracies, avg_acc_vl, avg_delta_nlu, avg_acc_nlu, delta_nlg, avg_acc_nlg)
vl_datasets = ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa"]
nlu_datasets = ["wsc273", "winogrande", "arc_easy", "arc_challenge"]

table_data = []

for model, (baseline, label) in models_of_interest.items():
    accuracies = results_dict[model]
    baseline_accuracies = results_dict[baseline]

    # Mean accuracy over the vision-language datasets
    avg_acc_vl = sum(accuracies[dataset] for dataset in vl_datasets) / len(vl_datasets)

    # Per-dataset change relative to the baseline entry.
    # NOTE(review): a dataset absent from the baseline entry is counted as a
    # baseline score of 0, so its delta equals the model's raw accuracy.
    nlu_deltas = {
        dataset: accuracies[dataset] - baseline_accuracies.get(dataset, 0)
        for dataset in nlu_datasets
    }
    avg_delta_nlu = sum(nlu_deltas.values()) / len(nlu_datasets)
    avg_acc_nlu = sum(accuracies[dataset] for dataset in nlu_datasets) / len(nlu_datasets)

    # The NLG columns come from the single lambada_standard dataset
    delta_nlg = accuracies["lambada_standard"] - baseline_accuracies.get("lambada_standard", 0)
    avg_acc_nlg = accuracies["lambada_standard"]

    table_data.append((label, accuracies, avg_acc_vl, avg_delta_nlu, avg_acc_nlu, delta_nlg, avg_acc_nlg))

# Sort the data by Avg. VL Accuracy (highest first); ties are broken by the
# highest NLG Delta. Both key components are left un-negated because
# reverse=True already makes every component descending -- negating the delta
# would put the lowest delta first among ties.
table_data = sorted(table_data, key=lambda x: (x[2], x[5]), reverse=True)

# Generate LaTeX table: start with the opening boilerplate and the two header rows
latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{LLaVA Model Performance}}
  \\label{tab:model_performance}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cccc|c|cc|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{4}{c|}{\\textbf{Vision-Language (VL)}} & \\textbf{VL Avg.} & \\multicolumn{2}{c|}{\\textbf{NLU Avg.}} & \\multicolumn{2}{c}{\\textbf{NLG Avg.}} \\\\
     & \\textbf{VQAv2} & \\textbf{TextVQA OCR} & \\textbf{TextVQA Pure} & \\textbf{GQA} & Acc $\\uparrow$ & $\\Delta \\uparrow$ & Acc $\\uparrow$ & $\\Delta \\uparrow$ & Acc $\\uparrow$ \\\\
     \\midrule
"""

# One row per model: the label, the four per-dataset VL accuracies, then the
# five aggregate columns, joined with " & " and ended with a LaTeX row break.
vl_columns = ("vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa")
body_rows = []
for label, accuracies, avg_acc_vl, avg_delta_nlu, avg_acc_nlu, delta_nlg, avg_acc_nlg in table_data:
    cells = [str(label)]
    cells += [format_value(accuracies[name]) for name in vl_columns]
    cells += [
        format_value(aggregate)
        for aggregate in (avg_acc_vl, avg_delta_nlu, avg_acc_nlu, delta_nlg, avg_acc_nlg)
    ]
    body_rows.append(" & ".join(cells) + " \\\\\n")
latex_code += "".join(body_rows)

# Close the tabular, the resizebox and the table environment
latex_code += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)


\begin{table*}[h]
  \caption{\textbf{LLaVA Model Performance}}
  \label{tab:model_performance}
  \centering
  \resizebox{\linewidth}{!}{
    \begin{tabular}{l|cccc|c|cc|cc}
     \toprule
     \textbf{Model} & \multicolumn{4}{c|}{\textbf{Vision-Language (VL)}} & \textbf{VL Avg.} & \multicolumn{2}{c|}{\textbf{NLU Avg.}} & \multicolumn{2}{c}{\textbf{NLG Avg.}} \\
     & \textbf{VQAv2} & \textbf{TextVQA OCR} & \textbf{TextVQA Pure} & \textbf{GQA} & Acc $\uparrow$ & $\Delta \uparrow$ & Acc $\uparrow$ & $\Delta \uparrow$ & Acc $\uparrow$ \\
     \midrule
LLaVA + LLaMA2 Base (7B) & 75.9 & 55.2 & 45.4 & 60.2 & 59.2 & 2.7 & 70.0 & 0.4 & 68.7 \\
LLaVA + LLaMA2 Instruct (7B) & 74.5 & 56.3 & 45.9 & 56.2 & 58.2 & 0.3 & 68.8 & -2.0 & 62.3 \\
LLaVA + Pythia Instruct (1.4B) & 66.2 & 38.5 & 35.5 & 46.1 & 46.6 & -1.1 & 53.0 & -8.1 & 40.9 \\
LLaVA + Pythia (1.4B) & 64.0 & 39.8 & 34.4 & 44.5 & 45.7 & -4.6 & 49.5 & -12.6 & 36.3 \\

     \bottomrule
    \end{tabular}
  }
\end{table*}

