In [1]:
import os
# Step up one directory; presumably so the relative paths used below
# (e.g. 'results_nlp.json') resolve against the parent folder -- confirm.
os.chdir(os.pardir)

In [2]:

import json
import numpy as np
from scipy.stats import hmean

# Floor value substituted for unusable scores (see replace_none_and_zeros)
EPSILON = 0.001

# Row labels for the training stages, in training order
stages = [
    "Instruct (2)",
    "VQA (3)",
    "OCR (4)",
    "Ref (5)",
]

# Vision-language benchmark keys
datasets = [
    "vqa-v2",
    "textvqa-ocr",
    "textvqa-pure",
    "gqa",
    "refcoco",
]

# Language-only (NLU/NLG) benchmark keys
nlu_nlg_datasets = [
    "wsc273",
    "winogrande",
    "lambada_standard",
    "arc_easy",
    "arc_challenge",
]

# VL benchmarks averaged at each stage index; every entry extends the previous one
vl_evaluate_sequence = [
    ["vqa-v2"],
    ["vqa-v2", "gqa"],
    ["vqa-v2", "gqa", "textvqa-ocr", "textvqa-pure"],
    ["vqa-v2", "gqa", "textvqa-ocr", "textvqa-pure", "refcoco"],
]

# Mitigation method -> run IDs of its four stage checkpoints.
# Every run ID has the form "cl-<task>-stage-<n>-pythia+410m<suffix>". The
# suffix of the stage-0 run can differ from the later stages: the rehearsal
# variants list the stage-0 run of their non-rehearsal counterpart.
cl_runs = {
    method: [
        f"cl-{task}-stage-{idx}-pythia+410m{stage0_suffix if idx == 0 else later_suffix}"
        for idx, task in enumerate(("instruct", "vqa", "ocr", "ref"))
    ]
    for method, stage0_suffix, later_suffix in (
        ("naive-ft", "", ""),
        ("soft", "-soft", "-soft"),
        ("lora", "-lora", "-lora"),
        ("sgm", "-sgm", "-sgm"),
        ("rehearsal1", "", "-rehearsal1"),
        ("sgm-rehearsal1", "-sgm", "-sgm-rehearsal1"),
    )
}

# Run ID whose scores serve as the reference point
baseline_run_id = "reproduction-align-pythia+410m"

# Parse the stored evaluation results; the top level is looked up by run ID below
with open('results_nlp.json', 'r') as file:
    result = json.loads(file.read())

# Every later comparison is made against the baseline run, so stop right away
# when its entry is absent (or present but empty)
baseline_results = result.get(baseline_run_id)
if not baseline_results:
    raise ValueError(f"Baseline run ID '{baseline_run_id}' not found in results.")

# Sanitize a sequence of scores: None, NaN and exact zeros become a small positive value
def replace_none_and_zeros(arr, epsilon=EPSILON):
    """Return a new list with every unusable entry of ``arr`` replaced by ``epsilon``.

    An entry is unusable when it is ``None``, NaN, or equal to zero; all other
    entries are passed through unchanged and in their original order.
    """
    cleaned = []
    for value in arr:
        # The None test comes first so np.isnan is never called on None
        unusable = value is None or np.isnan(value) or value == 0
        cleaned.append(epsilon if unusable else value)
    return cleaned

# Calculate performance changes and averages for each CL run.
#
# Two sign conventions are used below:
#   * per-dataset change ('stage_<i>')        = current - baseline
#     (positive: the run scores higher than the baseline)
#   * aggregated VL / NL change ('stage_<i>_avg') = hmean(baseline) - hmean(current)
#     (positive: the run scores lower than the baseline)
# Aggregates are harmonic means over scores cleaned by replace_none_and_zeros,
# so a missing, None, NaN or zero score is counted as its epsilon rather than
# being dropped from the mean.
cl_performance_change = {}
cl_performance = {}


def _score_or_nan(scores, dataset):
    """Return ``scores[dataset]``, mapping a missing key or a None value to NaN."""
    value = scores.get(dataset)
    return np.nan if value is None else value


# The NL benchmark list is the same at every stage, so the baseline mean is computed once.
baseline_nl_hmean = hmean(replace_none_and_zeros(
    [baseline_results.get(dataset, np.nan) for dataset in nlu_nlg_datasets]
))

for model_name, run_ids in cl_runs.items():
    changes = {}
    performances = {}

    for i, run_id in enumerate(run_ids):
        current_results = result.get(run_id)
        if not current_results:
            print(f"Run '{run_id}' missing for model '{model_name}'")
            break

        # None-safe subtraction: a None score yields NaN instead of raising TypeError,
        # matching how the averaged scores below already tolerate None.
        changes[f'stage_{i}'] = {
            dataset: _score_or_nan(current_results, dataset) - _score_or_nan(baseline_results, dataset)
            for dataset in baseline_results
        }
        performances[f'stage_{i}'] = current_results

        # Vision-language aggregate over the benchmarks listed for this stage index
        vl_datasets = vl_evaluate_sequence[i]
        vl_scores = replace_none_and_zeros([current_results.get(d, np.nan) for d in vl_datasets])
        baseline_vl_scores = replace_none_and_zeros([baseline_results.get(d, np.nan) for d in vl_datasets])
        avg_acc_vl = hmean(vl_scores)
        avg_delta_vl = hmean(baseline_vl_scores) - avg_acc_vl

        # Language-only aggregate over the fixed NLU/NLG benchmark list
        nl_scores = replace_none_and_zeros([current_results.get(d, np.nan) for d in nlu_nlg_datasets])
        avg_acc_nl = hmean(nl_scores)
        avg_delta_nl = baseline_nl_hmean - avg_acc_nl

        changes[f'stage_{i}_avg'] = {'VL': avg_delta_vl, 'NL': avg_delta_nl}
        performances[f'stage_{i}_avg'] = {'VL': avg_acc_vl, 'NL': avg_acc_nl}
        print(f"Model: {model_name}, Stage: {i}, VL: {avg_acc_vl}, NL: {avg_acc_nl}")
    else:
        # Runs only when the stage loop finished without `break`, i.e. no run was
        # missing; methods with a missing run are left out of both dictionaries.
        cl_performance_change[model_name] = changes
        cl_performance[model_name] = performances

# Write both result dictionaries to disk as indented JSON, changes first
for out_path, payload in (
    ('cl_performance_change.json', cl_performance_change),
    ('cl_performance.json', cl_performance),
):
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=2)

# Generate the LaTeX table
#
# Display name for each mitigation method key. Entries containing LaTeX escapes
# are raw strings: the values are unchanged, but the literals no longer rely on
# Python passing unknown escape sequences (\(, \%, \)) through, which triggers a
# warning on current interpreters.
name_mapping = {
    'olf': 'OLF',
    'sgm': 'mSGM',
    'sgm-rehearsal1': r'mSGM + Rehearsal \((1\%)\)',
    'sgm-olf': 'mSGM + OLF',
    'rehearsal1': r'Rehearsal \((1\%)\)',
    'rehearsal10': r'Rehearsal \((10\%)\)',
    'rehearsal20': r'Rehearsal \((20\%)\)',
    'lora': 'LoRA',
    'naive-ft': 'Original LLaVA',
    'soft': 'Soft Targets (ST)',
    'ia3': 'IA3'
}

latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{Model Performance:} Task-wise Accuracies and Forgetting of Each Mitigation Method across VL and NL tasks}
  \\label{tab:vl_nl_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cc|cc|cc|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{2}{c|}{\\textbf{Task 0 (Instruct)}} & \\multicolumn{2}{c|}{\\textbf{Task 1 (VQA)}} & \\multicolumn{2}{c|}{\\textbf{Task 2 (OCR)}} & \\multicolumn{2}{c}{\\textbf{Task 3 (Ref)}} \\\\
     & \\textbf{VL} & \\textbf{NL} & \\textbf{VL} & \\textbf{NL} & \\textbf{VL} & \\textbf{NL} & \\textbf{VL} & \\textbf{NL} \\\\
     & \\textbf{(A $\\uparrow$)} & \\textbf{($\\Delta \\downarrow$)} & \\textbf{(A $\\uparrow$)} & \\textbf{($\\Delta \\downarrow$)} & \\textbf{(A $\\uparrow$)} & \\textbf{($\\Delta \\downarrow$)} & \\textbf{(A $\\uparrow$)} & \\textbf{($\\Delta \\downarrow$)} \\\\
     \\midrule
"""


def _summary_row(model_name):
    """Return one table row for ``model_name``, terminated by a LaTeX row break and newline.

    For each of the four stages the row holds the aggregated VL score from
    ``cl_performance`` followed by the aggregated NL change from
    ``cl_performance_change``, both multiplied by 100 and shown with two decimals.

    Raises:
        KeyError: if the method is absent from ``name_mapping``,
            ``cl_performance`` or ``cl_performance_change``.
    """
    accuracies = cl_performance[model_name]
    deltas = cl_performance_change[model_name]
    cells = [name_mapping[model_name]]
    for stage in range(4):  # the table header above defines exactly four task column pairs
        key = f'stage_{stage}_avg'
        cells.append(f"{accuracies[key]['VL'] * 100:.2f}")
        cells.append(f"{deltas[key]['NL'] * 100:.2f}")
    return " & ".join(cells) + " \\\\\n"


# Include the Naive-FT benchmark first, separated from the other methods by a rule
latex_code += _summary_row('naive-ft')
latex_code += "\\midrule\n"

# Include the rest of the models (only those that have a display name)
for model_name in cl_performance:
    if model_name not in name_mapping or model_name == 'naive-ft':
        continue
    latex_code += _summary_row(model_name)

latex_code += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)


Model: naive-ft, Stage: 0, VL: 0.001, NL: 0.3863352100999474
Model: naive-ft, Stage: 1, VL: 0.4421524886376234, NL: 0.2700812156708484
Model: naive-ft, Stage: 2, VL: 0.16669767843708932, NL: 0.34263702858390443
Model: naive-ft, Stage: 3, VL: 0.004805223973421436, NL: 0.3151680192897782
Model: soft, Stage: 0, VL: 0.0045000000000000005, NL: 0.38408061658239784
Model: soft, Stage: 1, VL: 0.0015918367346938777, NL: 0.2454425568378775
Model: soft, Stage: 2, VL: 0.10230418779859914, NL: 0.33810199662515095
Model: soft, Stage: 3, VL: 0.003102369722228088, NL: 0.2831706057243446
Model: lora, Stage: 0, VL: 0.002, NL: 0.3784273481481115
Model: lora, Stage: 1, VL: 0.374638336115034, NL: 0.3799173833943919
Model: lora, Stage: 2, VL: 0.14028927544647746, NL: 0.367197878990521
Model: lora, Stage: 3, VL: 0.09589530345069068, NL: 0.3485350680819869
Model: sgm, Stage: 0, VL: 0.001, NL: 0.3810485695829846
Model: sgm, Stage: 1, VL: 0.363138591322978, NL: 0.37644060500130067
Model: sgm, Stage: 2, VL: 0.11

In [3]:
# --- INSERT *AFTER* the cl_performance and cl_performance_change are computed ---

# List of all VL evaluation tasks (in evaluation order)
vl_tasks = vl_evaluate_sequence[-1]  # ['vqa-v2', 'gqa', 'textvqa-ocr', 'textvqa-pure', 'refcoco']

# Human-readable column labels, numbered to agree with the stage names in
# `stages` ("VQA (3)", "OCR (4)", "Ref (5)") so that the rows and columns of a
# table use the same numbering.
task_labels = {
    'vqa-v2':       'VQA (3)',
    'gqa':          'GQA (3)',
    'textvqa-ocr':  'OCR (4)',
    'textvqa-pure': 'OCR-only (4)',
    'refcoco':      'Ref (5)'
}

# Identical for every method, so built once
cols = " & ".join([task_labels[d] for d in vl_tasks])
col_spec = 'c' * len(vl_tasks)

# Generate one LaTeX table per method
for model_key, perf in cl_performance.items():
    display_name = name_mapping.get(model_key, model_key)
    print(r"\begin{table*}[h]")
    print(rf"  \caption{{\textbf{{{display_name}}}: Accuracy on each VL task after training up to stage j}}")
    print(rf"  \label{{tab:acc_{model_key}}}")
    # column headers: trained stage -> evaluated task
    print(rf"  \begin{{tabular}}{{l|{col_spec}}}")
    print(r"    \toprule")
    print(rf"    \textbf{{Trained on}} & {cols} \\")
    print(r"    \midrule")
    # one row per training stage
    for i, stage_name in enumerate(stages):
        row = [stage_name]
        results = perf[f"stage_{i}"]
        # accuracy for each eval task, shown as a percentage
        for d in vl_tasks:
            acc = results.get(d)
            # A missing key, a None value and NaN are all rendered as "no result";
            # testing None first keeps np.isnan from raising TypeError.
            if acc is None or np.isnan(acc):
                row.append("---")
            else:
                row.append(f"{acc * 100:.2f}")
        print("    " + " & ".join(row) + r" \\")
    print(r"    \bottomrule")
    print(r"  \end{tabular}")
    print(r"\end{table*}")
    print("\n")

\begin{table*}[h]
  \caption{\textbf{Original LLaVA}: Accuracy on each VL task after training up to stage j}
  \label{tab:acc_naive-ft}
  \begin{tabular}{l|ccccc}
    \toprule
    \textbf{Trained on} & VQA (1) & GQA (1) & OCR (2) & OCR-only (2) & Ref (3) \\
    \midrule
    Instruct (0) & --- & --- & --- & --- & --- \\
    VQA (1) & 51.44 & 38.77 & 9.84 & 10.78 & 0.00 \\
    OCR (2) & 45.13 & 30.47 & 9.86 & 11.96 & 0.00 \\
    Ref (3) & 46.84 & 30.57 & 3.90 & 10.57 & 0.00 \\
    \bottomrule
  \end{tabular}
\end{table*}


\begin{table*}[h]
  \caption{\textbf{Soft Targets (ST)}: Accuracy on each VL task after training up to stage j}
  \label{tab:acc_soft}
  \begin{tabular}{l|ccccc}
    \toprule
    \textbf{Trained on} & VQA (1) & GQA (1) & OCR (2) & OCR-only (2) & Ref (3) \\
    \midrule
    Instruct (0) & 0.45 & 0.00 & 1.55 & 1.15 & 0.00 \\
    VQA (1) & 0.39 & 0.00 & 0.31 & 0.41 & 0.00 \\
    OCR (2) & 35.64 & 26.07 & 8.03 & 5.00 & 0.00 \\
    Ref (3) & 4.92 & 2.15 & 0.21 & 1.26 & 0.00

In [4]:
# --- INSERT *AFTER* the cl_performance and cl_performance_change are computed ---

import numpy as np

# Evaluation tasks used as table columns: the last entry of the sequence
vl_tasks = vl_evaluate_sequence[-1]

# Column title printed for each evaluation task
task_labels = {
    'vqa-v2': 'VQA (3)',
    'gqa': 'GQA (3)',
    'textvqa-ocr': 'OCR (4)',
    'textvqa-pure': 'OCR-only (4)',
    'refcoco': 'Ref (5)',
}

# Pieces that are the same in every method's subtable
column_spec = "c" * len(vl_tasks)
headers = " & ".join(rf"\textbf{{{task_labels[task]}}}" for task in vl_tasks)

# Open the outer table; a one-column tabular stacks the subtables vertically
for opening_line in (
    r"\begin{table*}[h]",
    r"  \centering",
    r"  \caption{\textbf{VL Accuracy After Continual Training}}",
    r"  \label{tab:vl_acc_subtables}",
    r"  \begin{tabular}{c}",
    r"    \toprule",
):
    print(opening_line)

for model_key, perf in cl_performance.items():
    # Fall back to the raw key when no display name is registered
    display_name = name_mapping.get(model_key, model_key)

    # Subtable preamble and two-line header (row axis: trained on, column axis: evaluated)
    print(r"    %----------------------------")
    print(r"    \begin{subtable}{\linewidth}")
    print(r"      \centering")
    print(rf"      \caption{{{display_name}}}")
    print(rf"      \begin{{tabular}}{{l|{column_spec}}}")
    print(r"        \toprule")
    print(
        r"        \multirow{2}{*}{\textbf{Trained on} $\downarrow$} & \multicolumn{"
        + str(len(vl_tasks))
        + r"}{c}{\textbf{Evaluated} $\rightarrow$} \\"
    )
    print(r"        & " + headers + r" \\")
    print(r"        \midrule")

    # One row per training stage; scores are shown as percentages, NaN as '---'
    for stage_index, stage_name in enumerate(stages):
        stage_scores = perf[f"stage_{stage_index}"]
        cells = [stage_name]
        for task in vl_tasks:
            score = stage_scores.get(task, np.nan)
            if np.isnan(score):
                cells.append("---")
            else:
                cells.append(f"{score * 100:.2f}")
        print("        " + " & ".join(cells) + r" \\")

    print(r"        \bottomrule")
    print(r"      \end{tabular}")
    print(r"    \end{subtable}\\[1em]")

# Close the outer table
for closing_line in (r"    \bottomrule", r"  \end{tabular}", r"\end{table*}"):
    print(closing_line)

\begin{table*}[h]
  \centering
  \caption{\textbf{VL Accuracy After Continual Training}}
  \label{tab:vl_acc_subtables}
  \begin{tabular}{c}
    \toprule
    %----------------------------
    \begin{subtable}{\linewidth}
      \centering
      \caption{Original LLaVA}
      \begin{tabular}{l|ccccc}
        \toprule
        \multirow{2}{*}{\textbf{Trained on} $\downarrow$} & \multicolumn{5}{c}{\textbf{Evaluated} $\rightarrow$} \\
        & \textbf{VQA (1)} & \textbf{GQA (1)} & \textbf{OCR (2)} & \textbf{OCR-only (2)} & \textbf{Ref (3)} \\
        \midrule
        Instruct (0) & --- & --- & --- & --- & --- \\
        VQA (1) & 51.44 & 38.77 & 9.84 & 10.78 & 0.00 \\
        OCR (2) & 45.13 & 30.47 & 9.86 & 11.96 & 0.00 \\
        Ref (3) & 46.84 & 30.57 & 3.90 & 10.57 & 0.00 \\
        \bottomrule
      \end{tabular}
    \end{subtable}\\[1em]
    %----------------------------
    \begin{subtable}{\linewidth}
      \centering
      \caption{Soft Targets (ST)}
      \begin{tabular}{l|ccccc}

In [3]:
# --- INSERT *AFTER* the cl_performance and cl_performance_change are computed ---

import numpy as np

# Evaluation tasks used as table columns: the last entry of the sequence
vl_tasks = vl_evaluate_sequence[-1]
# Column title printed for each evaluation task
task_labels = {
    'vqa-v2':      'VQAv2 (3)',
    'gqa':         'GQA (3)',
    'textvqa-ocr': 'TextVQA-OCR (4)',
    'textvqa-pure':'TextVQA-Pure (4)',
    'refcoco':     'RefCOCO (5)'
}

# Two-line column header plus its closing rule; printed in both the first-page
# head and the continuation-page head, so it is built once.
headers = " & ".join([r"\textbf{" + task_labels[d] + "}" for d in vl_tasks])
column_header_lines = [
    r"\multirow{2}{*}{\textbf{Trained on} $\downarrow$} & "
    + rf"\multicolumn{{{len(vl_tasks)}}}{{c}}{{\textbf{{Evaluated}} $\rightarrow$}} \\",
    r" & " + headers + r" \\",
    r"\midrule",
]

# Begin longtable
print(r"\begin{longtable}{l|" + "c"*len(vl_tasks) + r"}")
print(r"\caption{\textbf{VL Accuracy After Continual Training}}\label{tab:vl_acc_long} \\")
print(r"\toprule")
for header_line in column_header_lines:
    print(header_line)
print(r"\endfirsthead")

# header for subsequent pages
print(r"\caption[]{\textbf{(continued)} VL Accuracy After Continual Training} \\")
print(r"\toprule")
for header_line in column_header_lines:
    print(header_line)
print(r"\endhead")

print(r"\bottomrule")
print(r"\endfoot")

# Print a block for each method
for block_index, (model_key, perf) in enumerate(cl_performance.items()):
    # Fall back to the raw key when no display name is registered
    name = name_mapping.get(model_key, model_key)
    # model separator; the first block follows the head's own \midrule directly,
    # so emitting another one there would draw a doubled rule
    if block_index > 0:
        print(r"\midrule")
    print(r"\multicolumn{" + str(len(vl_tasks)+1) + r"}{l}{\textbf{" + name + r"}} \\")
    print(r"\midrule")
    # rows for stages
    for i, stage in enumerate(stages):
        row = [stage]
        results = perf[f"stage_{i}"]
        for d in vl_tasks:
            acc = results.get(d)
            # Missing key, None and NaN all mean "no result"; testing None first
            # keeps np.isnan from raising TypeError.
            missing = acc is None or np.isnan(acc)
            row.append("---" if missing else f"{acc*100:.2f}")
        print(" & ".join(row) + r" \\")
# End longtable
print(r"\end{longtable}")

\begin{longtable}{l|ccccc}
\caption{\textbf{VL Accuracy After Continual Training}}\label{tab:vl_acc_long} \\
\toprule
\multirow{2}{*}{\textbf{Trained on} $\downarrow$} & \multicolumn{5}{c}{\textbf{Evaluated} $\rightarrow$} \\
 & \textbf{VQAv2 (3)} & \textbf{GQA (3)} & \textbf{TextVQA-OCR (4)} & \textbf{TextVQA-Pure (4)} & \textbf{RefCOCO (5)} \\
\midrule
\endfirsthead
\caption[]{\textbf{(continued)} VL Accuracy After Continual Training} \\
\toprule
\multirow{2}{*}{\textbf{Trained on} $\downarrow$} & \multicolumn{5}{c}{\textbf{Evaluated} $\rightarrow$} \\
 & \textbf{VQAv2 (3)} & \textbf{GQA (3)} & \textbf{TextVQA-OCR (4)} & \textbf{TextVQA-Pure (4)} & \textbf{RefCOCO (5)} \\
\midrule
\endhead
\bottomrule
\endfoot
\midrule
\multicolumn{6}{l}{\textbf{Original LLaVA}} \\
\midrule
Instruct (2) & --- & --- & --- & --- & --- \\
VQA (3) & 51.44 & 38.77 & 9.84 & 10.78 & 0.00 \\
OCR (4) & 45.13 & 30.47 & 9.86 & 11.96 & 0.00 \\
Ref (5) & 46.84 & 30.57 & 3.90 & 10.57 & 0.00 \\
\midrule
\multicolum