In [3]:
import json

# Define the input and output file names
input_file = 'results_nlp.json'
output_file = 'results_CL.json'

# Load the JSON data from the input file
with open(input_file, 'r') as file:
    data = json.load(file)

# Filter out keys that start with 'cl-'
filtered_data = {key: value for key, value in data.items() if key.startswith('cl-') or key == 'reproduction-align-pythia+410m'}

# Write the filtered data to the output file
with open(output_file, 'w') as file:
    json.dump(filtered_data, file, indent=4)

print(f"Filtered data has been written to {output_file}")

Filtered data has been written to results_CL.json


In [6]:
import json
import numpy as np

# Define the dataset stages and the corresponding labels
stages = ["Instruct (0)", "VQA (1)", "OCR (2)", "Ref (3)"]
datasets = ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa", "refcoco"]
nlu_nlg_datasets = ["wsc273", "winogrande", "lambada_standard", "arc_easy", "arc_challenge"]
vl_datasets = datasets

# Define the mitigation methods and their sequence of model names
cl_runs = {
    "naive-ft": [
        "cl-instruct-stage-0-pythia+410m",
        "cl-vqa-stage-1-pythia+410m",
        "cl-ocr-stage-2-pythia+410m",
        "cl-ref-stage-3-pythia+410m"
    ],
    "sgm": [
        "cl-instruct-stage-0-pythia+410m-sgm",
        "cl-vqa-stage-1-pythia+410m-sgm",
        "cl-ocr-stage-2-pythia+410m-sgm",
        "cl-ref-stage-3-pythia+410m-sgm"
    ],
    "sgm-olf": [
        "cl-instruct-stage-0-pythia+410m-sgm-olf",
        "cl-vqa-stage-1-pythia+410m-sgm-olf",
        "cl-ocr-stage-2-pythia+410m-sgm-olf",
        "cl-ref-stage-3-pythia+410m-sgm-olf"
    ],
    "soft": [
        "cl-instruct-stage-0-pythia+410m-soft",
        "cl-vqa-stage-1-pythia+410m-soft",
        "cl-ocr-stage-2-pythia+410m-soft",
        "cl-ref-stage-3-pythia+410m-soft"
    ],
    "ia3": [
        "cl-instruct-stage-0-pythia+410m-ia3",
        "cl-vqa-stage-1-pythia+410m-ia3",
        "cl-ocr-stage-2-pythia+410m-ia3",
        "cl-ref-stage-3-pythia+410m-ia3"
    ],
    "lora": [
        "cl-instruct-stage-0-pythia+410m-lora",
        "cl-vqa-stage-1-pythia+410m-lora",
        "cl-ocr-stage-2-pythia+410m-lora",
        "cl-ref-stage-3-pythia+410m-lora"
    ],
    "rehearsal1": [
        "cl-instruct-stage-0-pythia+410m",
        "cl-vqa-stage-1-pythia+410m-rehearsal1",
        "cl-ocr-stage-2-pythia+410m-rehearsal1",
        "cl-ref-stage-3-pythia+410m-rehearsal1"
    ],
    "sgm-rehearsal1": [
        "cl-instruct-stage-0-pythia+410m-sgm",
        "cl-vqa-stage-1-pythia+410m-sgm-rehearsal1",
        "cl-ocr-stage-2-pythia+410m-sgm-rehearsal1",
        "cl-ref-stage-3-pythia+410m-sgm-rehearsal1"
    ]
}

# Baseline run_id
baseline_run_id = "reproduction-align-pythia+410m"

# Load the JSON data from the results file
with open('results_CL.json', 'r') as file:
    result = json.load(file)

# Check for the existence of baseline results
baseline_results = result.get(baseline_run_id)
if not baseline_results:
    raise ValueError(f"Baseline run ID '{baseline_run_id}' not found in results.")

# Calculate performance changes and averages for each CL run
cl_performance_change = {}
cl_performance = {}

for model_name, run_ids in cl_runs.items():
    changes = {}
    performances = {}
    missing_data = False

    for i, run_id in enumerate(run_ids):
        current_results = result.get(run_id)
        if not current_results:
            missing_data = True
            print(f"Run '{run_id}' missing for model '{model_name}'")
            break
        
        change = {dataset: current_results.get(dataset, np.nan) - baseline_results.get(dataset, np.nan) for dataset in baseline_results.keys()}
        changes[f'stage_{i}'] = change
        performances[f'stage_{i}'] = current_results
        
        avg_delta_vl = np.nanmean([change[dataset] for dataset in vl_datasets])
        avg_acc_vl = np.nanmean([current_results.get(dataset, np.nan) for dataset in vl_datasets])
        avg_delta_nlu = np.nanmean([change[dataset] for dataset in nlu_nlg_datasets if dataset != "lambada_standard"])
        avg_acc_nlu = np.nanmean([current_results.get(dataset, np.nan) for dataset in nlu_nlg_datasets if dataset != "lambada_standard"])
        avg_delta_nlg = change["lambada_standard"]
        avg_acc_nlg = current_results.get("lambada_standard", np.nan)
        
        changes[f'stage_{i}_avg'] = {'VL': avg_delta_vl, 'NLU': avg_delta_nlu, 'NLG': avg_delta_nlg}
        performances[f'stage_{i}_avg'] = {'VL': avg_acc_vl, 'NLU': avg_acc_nlu, 'NLG': avg_acc_nlg}
    
    if not missing_data:
        cl_performance_change[model_name] = changes
        cl_performance[model_name] = performances

# Save the performance changes and averages to JSON files
with open('cl_performance_change.json', 'w') as f:
    json.dump(cl_performance_change, f, indent=2)

with open('cl_performance.json', 'w') as f:
    json.dump(cl_performance, f, indent=2)

# Generate the LaTeX table
name_mapping = {
    'sgm': 'SGM',
    'sgm-rehearsal1': 'SGM Rehearsal',
    'sgm-olf': 'SGM OLF',
    'rehearsal1': 'Rehearsal1',
    'lora': 'LoRA',
    'naive-ft': 'Naive FT',
    'soft': 'Soft',
    'ia3': 'IA3'
}

latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{Model Performance:} Task-wise Accuracies and Forgetting of Each Mitigation Method across VL and NLU/NLG tasks}
  \\label{tab:vl_nlu_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{8}{c|}{\\textbf{Vision-Language (VL)}} & \\multicolumn{8}{c}{\\textbf{NLU/NLG}} \\\\
     & \\multicolumn{2}{c|}{\\textbf{Instruct (0)}} & \\multicolumn{2}{c|}{\\textbf{VQA (1)}} & \\multicolumn{2}{c|}{\\textbf{OCR (2)}} & \\multicolumn{2}{c|}{\\textbf{Ref (3)}} & \\multicolumn{2}{c|}{\\textbf{Instruct (0)}} & \\multicolumn{2}{c|}{\\textbf{VQA (1)}} & \\multicolumn{2}{c|}{\\textbf{OCR (2)}} & \\multicolumn{2}{c}{\\textbf{Ref (3)}} \\\\
     & \\textbf{A} & \\textbf{$\\Delta$} & \\textbf{A} & \\textbf{$\\Delta$} & \\textbf{A} & \\textbf{$\\Delta$} & \\textbf{A} & \\textbf{$\\Delta$} & \\textbf{A} & \\textbf{$\\Delta$} & \\textbf{A} & \\textbf{$\\Delta$} & \\textbf{A} & \\textbf{$\\Delta$} & \\textbf{A} & \\textbf{$\\Delta$} \\\\
     \\midrule
"""

# Include the Naive-FT benchmark first
model_name = 'naive-ft'
tasks = cl_performance[model_name]

model_results = (
    name_mapping[model_name],
    tasks['stage_0_avg']['VL'] * 100, cl_performance_change[model_name]['stage_0_avg']['VL'] * 100,
    tasks['stage_1_avg']['VL'] * 100, cl_performance_change[model_name]['stage_1_avg']['VL'] * 100,
    tasks['stage_2_avg']['VL'] * 100, cl_performance_change[model_name]['stage_2_avg']['VL'] * 100,
    tasks['stage_3_avg']['VL'] * 100, cl_performance_change[model_name]['stage_3_avg']['VL'] * 100,
    tasks['stage_0_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLU'] * 100,
    tasks['stage_1_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLU'] * 100,
    tasks['stage_2_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLU'] * 100,
    tasks['stage_3_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLU'] * 100
)

latex_code += "{model} & {t0_vl_a:.1f} & {t0_vl_d:.1f} & {t1_vl_a:.1f} & {t1_vl_d:.1f} & {t2_vl_a:.1f} & {t2_vl_d:.1f} & {t3_vl_a:.1f} & {t3_vl_d:.1f} & {t0_nlu_a:.1f} & {t0_nlu_d:.1f} & {t1_nlu_a:.1f} & {t1_nlu_d:.1f} & {t2_nlu_a:.1f} & {t2_nlu_d:.1f} & {t3_nlu_a:.1f} & {t3_nlu_d:.1f} \\\\\n".format(
    model=model_results[0],
    t0_vl_a=model_results[1], t0_vl_d=model_results[2],
    t1_vl_a=model_results[3], t1_vl_d=model_results[4],
    t2_vl_a=model_results[5], t2_vl_d=model_results[6],
    t3_vl_a=model_results[7], t3_vl_d=model_results[8],
    t0_nlu_a=model_results[9], t0_nlu_d=model_results[10],
    t1_nlu_a=model_results[11], t1_nlu_d=model_results[12],
    t2_nlu_a=model_results[13], t2_nlu_d=model_results[14],
    t3_nlu_a=model_results[15], t3_nlu_d=model_results[16]
)

latex_code += "\\midrule\n"

# Include the rest of the models
for model_name, tasks in cl_performance.items():
    if model_name not in name_mapping or model_name == 'naive-ft':
        continue

    model_results = (
        name_mapping[model_name],
        tasks['stage_0_avg']['VL'] * 100, cl_performance_change[model_name]['stage_0_avg']['VL'] * 100,
        tasks['stage_1_avg']['VL'] * 100, cl_performance_change[model_name]['stage_1_avg']['VL'] * 100,
        tasks['stage_2_avg']['VL'] * 100, cl_performance_change[model_name]['stage_2_avg']['VL'] * 100,
        tasks['stage_3_avg']['VL'] * 100, cl_performance_change[model_name]['stage_3_avg']['VL'] * 100,
        tasks['stage_0_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLU'] * 100,
        tasks['stage_1_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLU'] * 100,
        tasks['stage_2_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLU'] * 100,
        tasks['stage_3_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLU'] * 100
    )
    
    latex_code += "{model} & {t0_vl_a:.1f} & {t0_vl_d:.1f} & {t1_vl_a:.1f} & {t1_vl_d:.1f} & {t2_vl_a:.1f} & {t2_vl_d:.1f} & {t3_vl_a:.1f} & {t3_vl_d:.1f} & {t0_nlu_a:.1f} & {t0_nlu_d:.1f} & {t1_nlu_a:.1f} & {t1_nlu_d:.1f} & {t2_nlu_a:.1f} & {t2_nlu_d:.1f} & {t3_nlu_a:.1f} & {t3_nlu_d:.1f} \\\\\n".format(
        model=model_results[0],
        t0_vl_a=model_results[1], t0_vl_d=model_results[2],
        t1_vl_a=model_results[3], t1_vl_d=model_results[4],
        t2_vl_a=model_results[5], t2_vl_d=model_results[6],
        t3_vl_a=model_results[7], t3_vl_d=model_results[8],
        t0_nlu_a=model_results[9], t0_nlu_d=model_results[10],
        t1_nlu_a=model_results[11], t1_nlu_d=model_results[12],
        t2_nlu_a=model_results[13], t2_nlu_d=model_results[14],
        t3_nlu_a=model_results[15], t3_nlu_d=model_results[16]
    )

latex_code += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)



\begin{table*}[h]
  \caption{\textbf{Model Performance:} Task-wise Accuracies and Forgetting of Each Mitigation Method across VL and NLU/NLG tasks}
  \label{tab:vl_nlu_acc}
  \centering
  \resizebox{\linewidth}{!}{
    \begin{tabular}{l|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc}
     \toprule
     \textbf{Model} & \multicolumn{8}{c|}{\textbf{Vision-Language (VL)}} & \multicolumn{8}{c}{\textbf{NLU/NLG}} \\
     & \multicolumn{2}{c|}{\textbf{Instruct (0)}} & \multicolumn{2}{c|}{\textbf{VQA (1)}} & \multicolumn{2}{c|}{\textbf{OCR (2)}} & \multicolumn{2}{c|}{\textbf{Ref (3)}} & \multicolumn{2}{c|}{\textbf{Instruct (0)}} & \multicolumn{2}{c|}{\textbf{VQA (1)}} & \multicolumn{2}{c|}{\textbf{OCR (2)}} & \multicolumn{2}{c}{\textbf{Ref (3)}} \\
     & \textbf{A} & \textbf{$\Delta$} & \textbf{A} & \textbf{$\Delta$} & \textbf{A} & \textbf{$\Delta$} & \textbf{A} & \textbf{$\Delta$} & \textbf{A} & \textbf{$\Delta$} & \textbf{A} & \textbf{$\Delta$} & \textbf{A} & \textbf{$\Delta$} & \textbf{A} & \textbf{

In [11]:
import json
import numpy as np

# Define the dataset stages and the corresponding labels
stages = ["Instruct (0)", "VQA (1)", "OCR (2)", "Ref (3)"]
datasets = ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa", "refcoco"]
nlu_nlg_datasets = ["wsc273", "winogrande", "lambada_standard", "arc_easy", "arc_challenge"]
vl_datasets = datasets

# Define the mitigation methods and their sequence of model names
cl_runs = {
    "naive-ft": [
        "cl-instruct-stage-0-pythia+410m",
        "cl-vqa-stage-1-pythia+410m",
        "cl-ocr-stage-2-pythia+410m",
        "cl-ref-stage-3-pythia+410m"
    ],
    "sgm": [
        "cl-instruct-stage-0-pythia+410m-sgm",
        "cl-vqa-stage-1-pythia+410m-sgm",
        "cl-ocr-stage-2-pythia+410m-sgm",
        "cl-ref-stage-3-pythia+410m-sgm"
    ],
    "sgm-olf": [
        "cl-instruct-stage-0-pythia+410m-sgm-olf",
        "cl-vqa-stage-1-pythia+410m-sgm-olf",
        "cl-ocr-stage-2-pythia+410m-sgm-olf",
        "cl-ref-stage-3-pythia+410m-sgm-olf"
    ],
    "soft": [
        "cl-instruct-stage-0-pythia+410m-soft",
        "cl-vqa-stage-1-pythia+410m-soft",
        "cl-ocr-stage-2-pythia+410m-soft",
        "cl-ref-stage-3-pythia+410m-soft"
    ],
    "ia3": [
        "cl-instruct-stage-0-pythia+410m-ia3",
        "cl-vqa-stage-1-pythia+410m-ia3",
        "cl-ocr-stage-2-pythia+410m-ia3",
        "cl-ref-stage-3-pythia+410m-ia3"
    ],
    "lora": [
        "cl-instruct-stage-0-pythia+410m-lora",
        "cl-vqa-stage-1-pythia+410m-lora",
        "cl-ocr-stage-2-pythia+410m-lora",
        "cl-ref-stage-3-pythia+410m-lora"
    ],
    "rehearsal1": [
        "cl-instruct-stage-0-pythia+410m",
        "cl-vqa-stage-1-pythia+410m-rehearsal1",
        "cl-ocr-stage-2-pythia+410m-rehearsal1",
        "cl-ref-stage-3-pythia+410m-rehearsal1"
    ],
    "sgm-rehearsal1": [
        "cl-instruct-stage-0-pythia+410m-sgm",
        "cl-vqa-stage-1-pythia+410m-sgm-rehearsal1",
        "cl-ocr-stage-2-pythia+410m-sgm-rehearsal1",
        "cl-ref-stage-3-pythia+410m-sgm-rehearsal1"
    ]
}

# Baseline run_id
baseline_run_id = "reproduction-align-pythia+410m"

# Load the JSON data from the results file
with open('results_CL.json', 'r') as file:
    result = json.load(file)

# Check for the existence of baseline results
baseline_results = result.get(baseline_run_id)
if not baseline_results:
    raise ValueError(f"Baseline run ID '{baseline_run_id}' not found in results.")

# Calculate performance changes and averages for each CL run
cl_performance_change = {}
cl_performance = {}

for model_name, run_ids in cl_runs.items():
    changes = {}
    performances = {}
    missing_data = False

    for i, run_id in enumerate(run_ids):
        current_results = result.get(run_id)
        if not current_results:
            missing_data = True
            print(f"Run '{run_id}' missing for model '{model_name}'")
            break
        
        change = {dataset: current_results.get(dataset, np.nan) - baseline_results.get(dataset, np.nan) for dataset in baseline_results.keys()}
        changes[f'stage_{i}'] = change
        performances[f'stage_{i}'] = current_results
        
        avg_delta_vl = np.nanmean([change[dataset] for dataset in vl_datasets])
        avg_acc_vl = np.nanmean([current_results.get(dataset, np.nan) for dataset in vl_datasets])
        avg_delta_nlu = np.nanmean([change[dataset] for dataset in nlu_nlg_datasets if dataset != "lambada_standard"])
        avg_acc_nlu = np.nanmean([current_results.get(dataset, np.nan) for dataset in nlu_nlg_datasets if dataset != "lambada_standard"])
        avg_delta_nlg = change["lambada_standard"]
        avg_acc_nlg = current_results.get("lambada_standard", np.nan)
        
        changes[f'stage_{i}_avg'] = {'VL': avg_delta_vl, 'NLU': avg_delta_nlu, 'NLG': avg_delta_nlg}
        performances[f'stage_{i}_avg'] = {'VL': avg_acc_vl, 'NLU': avg_acc_nlu, 'NLG': avg_acc_nlg}
    
    if not missing_data:
        cl_performance_change[model_name] = changes
        cl_performance[model_name] = performances

# Save the performance changes and averages to JSON files
with open('cl_performance_change.json', 'w') as f:
    json.dump(cl_performance_change, f, indent=2)

with open('cl_performance.json', 'w') as f:
    json.dump(cl_performance, f, indent=2)

# Generate the LaTeX table
name_mapping = {
    'sgm': 'SGM',
    'sgm-rehearsal1': 'SGM Rehearsal',
    'sgm-olf': 'SGM OLF',
    'rehearsal1': 'Rehearsal1',
    'lora': 'LoRA',
    'naive-ft': 'Naive FT',
    'soft': 'Soft',
    'ia3': 'IA3'
}

latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{Model Performance:} Task-wise Accuracies and Forgetting of Each Mitigation Method across VL and NLU/NLG tasks}
  \\label{tab:vl_nlu_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{6}{c|}{\\textbf{Task 0 (Instruct)}} & \\multicolumn{6}{c|}{\\textbf{Task 1 (VQA)}} & \\multicolumn{6}{c|}{\\textbf{Task 2 (OCR)}} & \\multicolumn{6}{c|}{\\textbf{Task 3 (Ref)}} \\\\
     & \\multicolumn{2}{c|}{\\textbf{VL}} & \\multicolumn{2}{c|}{\\textbf{NLU}} & \\multicolumn{2}{c|}{\\textbf{NLG}} & \\multicolumn{2}{c|}{\\textbf{VL}} & \\multicolumn{2}{c|}{\\textbf{NLU}} & \\multicolumn{2}{c|}{\\textbf{NLG}} & \\multicolumn{2}{c|}{\\textbf{VL}} & \\multicolumn{2}{c|}{\\textbf{NLU}} & \\multicolumn{2}{c|}{\\textbf{NLG}} & \\multicolumn{2}{c|}{\\textbf{VL}} & \\multicolumn{2}{c|}{\\textbf{NLU}} & \\multicolumn{2}{c|}{\\textbf{NLG}} \\\\
     & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$}\\\\
     \\midrule
"""

# Include the Naive-FT benchmark first
model_name = 'naive-ft'
tasks = cl_performance[model_name]

model_results = (
    name_mapping[model_name],
    tasks['stage_0_avg']['VL'] * 100, cl_performance_change[model_name]['stage_0_avg']['VL'] * 100,
    tasks['stage_0_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLU'] * 100,
    tasks['stage_0_avg']['NLG'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLG'] * 100,
    tasks['stage_1_avg']['VL'] * 100, cl_performance_change[model_name]['stage_1_avg']['VL'] * 100,
    tasks['stage_1_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLU'] * 100,
    tasks['stage_1_avg']['NLG'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLG'] * 100,
    tasks['stage_2_avg']['VL'] * 100, cl_performance_change[model_name]['stage_2_avg']['VL'] * 100,
    tasks['stage_2_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLU'] * 100,
    tasks['stage_2_avg']['NLG'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLG'] * 100,
    tasks['stage_3_avg']['VL'] * 100, cl_performance_change[model_name]['stage_3_avg']['VL'] * 100,
    tasks['stage_3_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLU'] * 100,
    tasks['stage_3_avg']['NLG'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLG'] * 100
)

latex_code += "{model} & {t0_vl_a:.1f} & {t0_vl_d:.1f} & {t0_nlu_a:.1f} & {t0_nlu_d:.1f} & {t0_nlg_a:.1f} & {t0_nlg_d:.1f} & {t1_vl_a:.1f} & {t1_vl_d:.1f} & {t1_nlu_a:.1f} & {t1_nlu_d:.1f} & {t1_nlg_a:.1f} & {t1_nlg_d:.1f} & {t2_vl_a:.1f} & {t2_vl_d:.1f} & {t2_nlu_a:.1f} & {t2_nlu_d:.1f} & {t2_nlg_a:.1f} & {t2_nlg_d:.1f} & {t3_vl_a:.1f} & {t3_vl_d:.1f} & {t3_nlu_a:.1f} & {t3_nlu_d:.1f} & {t3_nlg_a:.1f} & {t3_nlg_d:.1f} \\\\\n".format(
    model=model_results[0],
    t0_vl_a=model_results[1], t0_vl_d=model_results[2],
    t0_nlu_a=model_results[3], t0_nlu_d=model_results[4],
    t0_nlg_a=model_results[5], t0_nlg_d=model_results[6],
    t1_vl_a=model_results[7], t1_vl_d=model_results[8],
    t1_nlu_a=model_results[9], t1_nlu_d=model_results[10],
    t1_nlg_a=model_results[11], t1_nlg_d=model_results[12],
    t2_vl_a=model_results[13], t2_vl_d=model_results[14],
    t2_nlu_a=model_results[15], t2_nlu_d=model_results[16],
    t2_nlg_a=model_results[17], t2_nlg_d=model_results[18],
    t3_vl_a=model_results[19], t3_vl_d=model_results[20],
    t3_nlu_a=model_results[21], t3_nlu_d=model_results[22],
    t3_nlg_a=model_results[23], t3_nlg_d=model_results[24]
)

latex_code += "\\midrule\n"

# Include the rest of the models
for model_name, tasks in cl_performance.items():
    if model_name not in name_mapping or model_name == 'naive-ft':
        continue

    model_results = (
        name_mapping[model_name],
        tasks['stage_0_avg']['VL'] * 100, cl_performance_change[model_name]['stage_0_avg']['VL'] * 100,
        tasks['stage_0_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLU'] * 100,
        tasks['stage_0_avg']['NLG'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLG'] * 100,
        tasks['stage_1_avg']['VL'] * 100, cl_performance_change[model_name]['stage_1_avg']['VL'] * 100,
        tasks['stage_1_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLU'] * 100,
        tasks['stage_1_avg']['NLG'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLG'] * 100,
        tasks['stage_2_avg']['VL'] * 100, cl_performance_change[model_name]['stage_2_avg']['VL'] * 100,
        tasks['stage_2_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLU'] * 100,
        tasks['stage_2_avg']['NLG'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLG'] * 100,
        tasks['stage_3_avg']['VL'] * 100, cl_performance_change[model_name]['stage_3_avg']['VL'] * 100,
        tasks['stage_3_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLU'] * 100,
        tasks['stage_3_avg']['NLG'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLG'] * 100
    )
    
    latex_code += "{model} & {t0_vl_a:.1f} & {t0_vl_d:.1f} & {t0_nlu_a:.1f} & {t0_nlu_d:.1f} & {t0_nlg_a:.1f} & {t0_nlg_d:.1f} & {t1_vl_a:.1f} & {t1_vl_d:.1f} & {t1_nlu_a:.1f} & {t1_nlu_d:.1f} & {t1_nlg_a:.1f} & {t1_nlg_d:.1f} & {t2_vl_a:.1f} & {t2_vl_d:.1f} & {t2_nlu_a:.1f} & {t2_nlu_d:.1f} & {t2_nlg_a:.1f} & {t2_nlg_d:.1f} & {t3_vl_a:.1f} & {t3_vl_d:.1f} & {t3_nlu_a:.1f} & {t3_nlu_d:.1f} & {t3_nlg_a:.1f} & {t3_nlg_d:.1f} \\\\\n".format(
        model=model_results[0],
        t0_vl_a=model_results[1], t0_vl_d=model_results[2],
        t0_nlu_a=model_results[3], t0_nlu_d=model_results[4],
        t0_nlg_a=model_results[5], t0_nlg_d=model_results[6],
        t1_vl_a=model_results[7], t1_vl_d=model_results[8],
        t1_nlu_a=model_results[9], t1_nlu_d=model_results[10],
        t1_nlg_a=model_results[11], t1_nlg_d=model_results[12],
        t2_vl_a=model_results[13], t2_vl_d=model_results[14],
        t2_nlu_a=model_results[15], t2_nlu_d=model_results[16],
        t2_nlg_a=model_results[17], t2_nlg_d=model_results[18],
        t3_vl_a=model_results[19], t3_vl_d=model_results[20],
        t3_nlu_a=model_results[21], t3_nlu_d=model_results[22],
        t3_nlg_a=model_results[23], t3_nlg_d=model_results[24]
    )

latex_code += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)



\begin{table*}[h]
  \caption{\textbf{Model Performance:} Task-wise Accuracies and Forgetting of Each Mitigation Method across VL and NLU/NLG tasks}
  \label{tab:vl_nlu_acc}
  \centering
  \resizebox{\linewidth}{!}{
    \begin{tabular}{l|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc|cc}
     \toprule
     \textbf{Model} & \multicolumn{6}{c|}{\textbf{Task 0 (Instruct)}} & \multicolumn{6}{c|}{\textbf{Task 1 (VQA)}} & \multicolumn{6}{c|}{\textbf{Task 2 (OCR)}} & \multicolumn{6}{c|}{\textbf{Task 3 (Ref)}} \\
     & \multicolumn{2}{c|}{\textbf{VL}} & \multicolumn{2}{c|}{\textbf{NLU}} & \multicolumn{2}{c|}{\textbf{NLG}} & \multicolumn{2}{c|}{\textbf{VL}} & \multicolumn{2}{c|}{\textbf{NLU}} & \multicolumn{2}{c|}{\textbf{NLG}} & \multicolumn{2}{c|}{\textbf{VL}} & \multicolumn{2}{c|}{\textbf{NLU}} & \multicolumn{2}{c|}{\textbf{NLG}} & \multicolumn{2}{c|}{\textbf{VL}} & \multicolumn{2}{c|}{\textbf{NLU}} & \multicolumn{2}{c|}{\textbf{NLG}} \\
     & \textbf{A $\uparrow$} & 

In [18]:
import json
import numpy as np

# Define the dataset stages and the corresponding labels
stages = ["Instruct (0)", "VQA (1)", "OCR (2)", "Ref (3)"]
datasets = ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa", "refcoco"]
nlu_nlg_datasets = ["wsc273", "winogrande", "lambada_standard", "arc_easy", "arc_challenge"]
vl_datasets = datasets

# Define the mitigation methods and their sequence of model names
cl_runs = {
    "naive-ft": [
        "cl-instruct-stage-0-pythia+410m",
        "cl-vqa-stage-1-pythia+410m",
        "cl-ocr-stage-2-pythia+410m",
        "cl-ref-stage-3-pythia+410m"
    ],
    "olf": [
        "cl-instruct-stage-0-pythia+410m-olf",
        "cl-vqa-stage-1-pythia+410m-olf",
        "cl-ocr-stage-2-pythia+410m-olf",
        "cl-ref-stage-3-pythia+410m-olf"
    ],
    "soft": [
        "cl-instruct-stage-0-pythia+410m-soft",
        "cl-vqa-stage-1-pythia+410m-soft",
        "cl-ocr-stage-2-pythia+410m-soft",
        "cl-ref-stage-3-pythia+410m-soft"
    ],
    "ia3": [
        "cl-instruct-stage-0-pythia+410m-ia3",
        "cl-vqa-stage-1-pythia+410m-ia3",
        "cl-ocr-stage-2-pythia+410m-ia3",
        "cl-ref-stage-3-pythia+410m-ia3"
    ],
    "lora": [
        "cl-instruct-stage-0-pythia+410m-lora",
        "cl-vqa-stage-1-pythia+410m-lora",
        "cl-ocr-stage-2-pythia+410m-lora",
        "cl-ref-stage-3-pythia+410m-lora"
    ],
    "sgm": [
        "cl-instruct-stage-0-pythia+410m-sgm",
        "cl-vqa-stage-1-pythia+410m-sgm",
        "cl-ocr-stage-2-pythia+410m-sgm",
        "cl-ref-stage-3-pythia+410m-sgm"
    ],
    "sgm-olf": [
        "cl-instruct-stage-0-pythia+410m-sgm-olf",
        "cl-vqa-stage-1-pythia+410m-sgm-olf",
        "cl-ocr-stage-2-pythia+410m-sgm-olf",
        "cl-ref-stage-3-pythia+410m-sgm-olf"
    ],
    "rehearsal1": [
        "cl-instruct-stage-0-pythia+410m",
        "cl-vqa-stage-1-pythia+410m-rehearsal1",
        "cl-ocr-stage-2-pythia+410m-rehearsal1",
        "cl-ref-stage-3-pythia+410m-rehearsal1"
    ],
    "sgm-rehearsal1": [
        "cl-instruct-stage-0-pythia+410m-sgm",
        "cl-vqa-stage-1-pythia+410m-sgm-rehearsal1",
        "cl-ocr-stage-2-pythia+410m-sgm-rehearsal1",
        "cl-ref-stage-3-pythia+410m-sgm-rehearsal1"
    ],
    
}

# Baseline run_id
baseline_run_id = "reproduction-align-pythia+410m"

# Load the JSON data from the results file
with open('results_CL.json', 'r') as file:
    result = json.load(file)

# Check for the existence of baseline results
baseline_results = result.get(baseline_run_id)
if not baseline_results:
    raise ValueError(f"Baseline run ID '{baseline_run_id}' not found in results.")

# Calculate performance changes and averages for each CL run
cl_performance_change = {}
cl_performance = {}

for model_name, run_ids in cl_runs.items():
    changes = {}
    performances = {}
    missing_data = False

    for i, run_id in enumerate(run_ids):
        current_results = result.get(run_id)
        if not current_results:
            missing_data = True
            print(f"Run '{run_id}' missing for model '{model_name}'")
            break
        
        change = {dataset: current_results.get(dataset, np.nan) - baseline_results.get(dataset, np.nan) for dataset in baseline_results.keys()}
        changes[f'stage_{i}'] = change
        performances[f'stage_{i}'] = current_results
        
        avg_delta_vl = np.nanmean([change[dataset] for dataset in vl_datasets])
        avg_acc_vl = np.nanmean([current_results.get(dataset, np.nan) for dataset in vl_datasets])
        avg_delta_nlu = np.nanmean([change[dataset] for dataset in nlu_nlg_datasets if dataset != "lambada_standard"])
        avg_acc_nlu = np.nanmean([current_results.get(dataset, np.nan) for dataset in nlu_nlg_datasets if dataset != "lambada_standard"])
        avg_delta_nlg = change["lambada_standard"]
        avg_acc_nlg = current_results.get("lambada_standard", np.nan)
        
        changes[f'stage_{i}_avg'] = {'VL': avg_delta_vl, 'NLU': avg_delta_nlu, 'NLG': avg_delta_nlg}
        performances[f'stage_{i}_avg'] = {'VL': avg_acc_vl, 'NLU': avg_acc_nlu, 'NLG': avg_acc_nlg}
    
    if not missing_data:
        cl_performance_change[model_name] = changes
        cl_performance[model_name] = performances

# Save the performance changes and averages to JSON files
with open('cl_performance_change.json', 'w') as f:
    json.dump(cl_performance_change, f, indent=2)

with open('cl_performance.json', 'w') as f:
    json.dump(cl_performance, f, indent=2)

# Generate the LaTeX table
name_mapping = {
    'olf': 'OLF',
    'sgm': 'mSGM',
    'sgm-rehearsal1': 'mSGM + Rehearsal \((1\%)\)',
    'sgm-olf': 'mSGM + OLF',
    'rehearsal1': 'Rehearsal \((1\%)\)',
    'lora': 'LoRA',
    'naive-ft': 'Original LLaVA',
    'soft': 'Soft Targets (ST)',
    'ia3': 'IA3'
}

latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{Model Performance:} Task-wise Accuracies and Forgetting of Each Mitigation Method across VL and NLU/NLG tasks}
  \\label{tab:vl_nlu_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|ccc|ccc|ccc|ccc}
     \\toprule
     \\textbf{Model} & \\multicolumn{3}{c|}{\\textbf{Task 0 (Instruct)}} & \\multicolumn{3}{c|}{\\textbf{Task 1 (VQA)}} & \\multicolumn{3}{c|}{\\textbf{Task 2 (OCR)}} & \\multicolumn{3}{c}{\\textbf{Task 3 (Ref)}} \\\\
     & \\textbf{VL} & \\textbf{NLU} & \\textbf{NLG} & \\textbf{VL} & \\textbf{NLU} & \\textbf{NLG} & \\textbf{VL} & \\textbf{NLU} & \\textbf{NLG} & \\textbf{VL} & \\textbf{NLU} & \\textbf{NLG} \\\\
     & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{A $\\uparrow$} & \\textbf{$\\Delta \\uparrow$} & \\textbf{$\\Delta \\uparrow$} \\\\
     \\midrule
"""

# Include the Naive-FT benchmark first
model_name = 'naive-ft'
tasks = cl_performance[model_name]

model_results = (
    name_mapping[model_name],
    tasks['stage_0_avg']['VL'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLG'] * 100,
    tasks['stage_1_avg']['VL'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLG'] * 100,
    tasks['stage_2_avg']['VL'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLG'] * 100,
    tasks['stage_3_avg']['VL'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLG'] * 100
)

latex_code += "{model} & {t0_vl_a:.1f} & {t0_nlu_d:.1f} & {t0_nlg_d:.1f} & {t1_vl_a:.1f} & {t1_nlu_d:.1f} & {t1_nlg_d:.1f} & {t2_vl_a:.1f} & {t2_nlu_d:.1f} & {t2_nlg_d:.1f} & {t3_vl_a:.1f} & {t3_nlu_d:.1f} & {t3_nlg_d:.1f} \\\\\n".format(
    model=model_results[0],
    t0_vl_a=model_results[1], t0_nlu_d=model_results[2], t0_nlg_d=model_results[3],
    t1_vl_a=model_results[4], t1_nlu_d=model_results[5], t1_nlg_d=model_results[6],
    t2_vl_a=model_results[7], t2_nlu_d=model_results[8], t2_nlg_d=model_results[9],
    t3_vl_a=model_results[10], t3_nlu_d=model_results[11], t3_nlg_d=model_results[12]
)

latex_code += "\\midrule\n"

# Include the rest of the models
for model_name, tasks in cl_performance.items():
    if model_name not in name_mapping or model_name == 'naive-ft':
        continue

    model_results = (
        name_mapping[model_name],
        tasks['stage_0_avg']['VL'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLG'] * 100,
        tasks['stage_1_avg']['VL'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLG'] * 100,
        tasks['stage_2_avg']['VL'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLG'] * 100,
        tasks['stage_3_avg']['VL'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLG'] * 100
    )
    
    latex_code += "{model} & {t0_vl_a:.1f} & {t0_nlu_d:.1f} & {t0_nlg_d:.1f} & {t1_vl_a:.1f} & {t1_nlu_d:.1f} & {t1_nlg_d:.1f} & {t2_vl_a:.1f} & {t2_nlu_d:.1f} & {t2_nlg_d:.1f} & {t3_vl_a:.1f} & {t3_nlu_d:.1f} & {t3_nlg_d:.1f} \\\\\n".format(
        model=model_results[0],
        t0_vl_a=model_results[1], t0_nlu_d=model_results[2], t0_nlg_d=model_results[3],
        t1_vl_a=model_results[4], t1_nlu_d=model_results[5], t1_nlg_d=model_results[6],
        t2_vl_a=model_results[7], t2_nlu_d=model_results[8], t2_nlg_d=model_results[9],
        t3_vl_a=model_results[10], t3_nlu_d=model_results[11], t3_nlg_d=model_results[12]
    )

latex_code += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)



\begin{table*}[h]
  \caption{\textbf{Model Performance:} Task-wise Accuracies and Forgetting of Each Mitigation Method across VL and NLU/NLG tasks}
  \label{tab:vl_nlu_acc}
  \centering
  \resizebox{\linewidth}{!}{
    \begin{tabular}{l|ccc|ccc|ccc|ccc}
     \toprule
     \textbf{Model} & \multicolumn{3}{c|}{\textbf{Task 0 (Instruct)}} & \multicolumn{3}{c|}{\textbf{Task 1 (VQA)}} & \multicolumn{3}{c|}{\textbf{Task 2 (OCR)}} & \multicolumn{3}{c}{\textbf{Task 3 (Ref)}} \\
     & \textbf{VL} & \textbf{NLU} & \textbf{NLG} & \textbf{VL} & \textbf{NLU} & \textbf{NLG} & \textbf{VL} & \textbf{NLU} & \textbf{NLG} & \textbf{VL} & \textbf{NLU} & \textbf{NLG} \\
     & \textbf{A $\uparrow$} & \textbf{$\Delta \uparrow$} & \textbf{$\Delta \uparrow$} & \textbf{A $\uparrow$} & \textbf{$\Delta \uparrow$} & \textbf{$\Delta \uparrow$} & \textbf{A $\uparrow$} & \textbf{$\Delta \uparrow$} & \textbf{$\Delta \uparrow$} & \textbf{A $\uparrow$} & \textbf{$\Delta \uparrow$} & \textbf{$\Delta \uparrow$} \\
    

In [19]:
import json
import numpy as np

# Define the dataset stages and the corresponding labels
stages = ["Instruct (0)", "VQA (1)", "OCR (2)", "Ref (3)"]
datasets = ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa", "refcoco"]
nlu_nlg_datasets = ["wsc273", "winogrande", "lambada_standard", "arc_easy", "arc_challenge"]
vl_datasets = datasets

# Define the mitigation methods and their sequence of model names
cl_runs = {
    "naive-ft": [
        "cl-instruct-stage-0-pythia+410m",
        "cl-vqa-stage-1-pythia+410m",
        "cl-ocr-stage-2-pythia+410m",
        "cl-ref-stage-3-pythia+410m"
    ],
    "olf": [
        "cl-instruct-stage-0-pythia+410m-olf",
        "cl-vqa-stage-1-pythia+410m-olf",
        "cl-ocr-stage-2-pythia+410m-olf",
        "cl-ref-stage-3-pythia+410m-olf"
    ],
    "soft": [
        "cl-instruct-stage-0-pythia+410m-soft",
        "cl-vqa-stage-1-pythia+410m-soft",
        "cl-ocr-stage-2-pythia+410m-soft",
        "cl-ref-stage-3-pythia+410m-soft"
    ],
    "ia3": [
        "cl-instruct-stage-0-pythia+410m-ia3",
        "cl-vqa-stage-1-pythia+410m-ia3",
        "cl-ocr-stage-2-pythia+410m-ia3",
        "cl-ref-stage-3-pythia+410m-ia3"
    ],
    "lora": [
        "cl-instruct-stage-0-pythia+410m-lora",
        "cl-vqa-stage-1-pythia+410m-lora",
        "cl-ocr-stage-2-pythia+410m-lora",
        "cl-ref-stage-3-pythia+410m-lora"
    ],
    "sgm": [
        "cl-instruct-stage-0-pythia+410m-sgm",
        "cl-vqa-stage-1-pythia+410m-sgm",
        "cl-ocr-stage-2-pythia+410m-sgm",
        "cl-ref-stage-3-pythia+410m-sgm"
    ],
    "sgm-olf": [
        "cl-instruct-stage-0-pythia+410m-sgm-olf",
        "cl-vqa-stage-1-pythia+410m-sgm-olf",
        "cl-ocr-stage-2-pythia+410m-sgm-olf",
        "cl-ref-stage-3-pythia+410m-sgm-olf"
    ],
    "rehearsal1": [
        "cl-instruct-stage-0-pythia+410m",
        "cl-vqa-stage-1-pythia+410m-rehearsal1",
        "cl-ocr-stage-2-pythia+410m-rehearsal1",
        "cl-ref-stage-3-pythia+410m-rehearsal1"
    ],
    "sgm-rehearsal1": [
        "cl-instruct-stage-0-pythia+410m-sgm",
        "cl-vqa-stage-1-pythia+410m-sgm-rehearsal1",
        "cl-ocr-stage-2-pythia+410m-sgm-rehearsal1",
        "cl-ref-stage-3-pythia+410m-sgm-rehearsal1"
    ],
}

# Baseline run_id
baseline_run_id = "reproduction-align-pythia+410m"

# Load the JSON data from the results file
with open('results_CL.json', 'r') as file:
    result = json.load(file)

# Check for the existence of baseline results
baseline_results = result.get(baseline_run_id)
if not baseline_results:
    raise ValueError(f"Baseline run ID '{baseline_run_id}' not found in results.")

# Calculate performance changes and averages for each CL run
cl_performance_change = {}
cl_performance = {}

for model_name, run_ids in cl_runs.items():
    changes = {}
    performances = {}
    missing_data = False

    for i, run_id in enumerate(run_ids):
        current_results = result.get(run_id)
        if not current_results:
            missing_data = True
            print(f"Run '{run_id}' missing for model '{model_name}'")
            break
        
        change = {dataset: current_results.get(dataset, np.nan) - baseline_results.get(dataset, np.nan) for dataset in baseline_results.keys()}
        changes[f'stage_{i}'] = change
        performances[f'stage_{i}'] = current_results
        
        avg_delta_vl = np.nanmean([change[dataset] for dataset in vl_datasets])
        avg_acc_vl = np.nanmean([current_results.get(dataset, np.nan) for dataset in vl_datasets])
        avg_delta_nlu = np.nanmean([change[dataset] for dataset in nlu_nlg_datasets if dataset != "lambada_standard"])
        avg_acc_nlu = np.nanmean([current_results.get(dataset, np.nan) for dataset in nlu_nlg_datasets if dataset != "lambada_standard"])
        avg_delta_nlg = change["lambada_standard"]
        avg_acc_nlg = current_results.get("lambada_standard", np.nan)
        
        changes[f'stage_{i}_avg'] = {'VL': avg_delta_vl, 'NLU': avg_delta_nlu, 'NLG': avg_delta_nlg}
        performances[f'stage_{i}_avg'] = {'VL': avg_acc_vl, 'NLU': avg_acc_nlu, 'NLG': avg_acc_nlg}
    
    if not missing_data:
        cl_performance_change[model_name] = changes
        cl_performance[model_name] = performances

# Save the performance changes and averages to JSON files
with open('cl_performance_change.json', 'w') as f:
    json.dump(cl_performance_change, f, indent=2)

with open('cl_performance.json', 'w') as f:
    json.dump(cl_performance, f, indent=2)

# Generate the LaTeX table
name_mapping = {
    'olf': 'OLF',
    'sgm': 'mSGM',
    'sgm-rehearsal1': 'mSGM + Rehearsal \((1\%)\)',
    'sgm-olf': 'mSGM + OLF',
    'rehearsal1': 'Rehearsal \((1\%)\)',
    'lora': 'LoRA',
    'naive-ft': 'Original LLaVA',
    'soft': 'Soft Targets (ST)',
    'ia3': 'IA3'
}

latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{Model Performance:} Task-wise Accuracies and Forgetting of Each Mitigation Method across VL and NLU/NLG tasks}
  \\label{tab:vl_nlu_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|ccc|ccc|ccc|ccc|ccc}
     \\toprule
     \\textbf{Model} & \\multicolumn{3}{c|}{\\textbf{Task 0 (Instruct)}} & \\multicolumn{3}{c|}{\\textbf{Task 1 (VQA)}} & \\multicolumn{3}{c|}{\\textbf{Task 2 (OCR)}} & \\multicolumn{3}{c|}{\\textbf{Task 3 (Ref)}} & \\multicolumn{3}{c}{\\textbf{Avg.}} \\\\
     & \\textbf{VL} & \\textbf{NLU} & \\textbf{NLG} & \\textbf{VL} & \\textbf{NLU} & \\textbf{NLG} & \\textbf{VL} & \\textbf{NLU} & \\textbf{NLG} & \\textbf{VL} & \\textbf{NLU} & \\textbf{NLG} & \\textbf{VL} & \\textbf{NLU} & \\textbf{NLG} \\\\
     & \\textbf{(A $\\uparrow$)} & \\textbf{($\\Delta \\uparrow$)} & \\textbf{($\\Delta \\uparrow$)} & \\textbf{(A $\\uparrow$)} & \\textbf{($\\Delta \\uparrow$)} & \\textbf{($\\Delta \\uparrow$)} & \\textbf{(A $\\uparrow$)} & \\textbf{($\\Delta \\uparrow$)} & \\textbf{($\\Delta \\uparrow$)} & \\textbf{(A $\\uparrow$)} & \\textbf{($\\Delta \\uparrow$)} & \\textbf{($\\Delta \\uparrow$)} & \\textbf{(A $\\uparrow$)} & \\textbf{($\\Delta \\uparrow$)} & \\textbf{($\\Delta \\uparrow$)} \\\\
     \\midrule
"""

# Include the Naive-FT benchmark first
model_name = 'naive-ft'
tasks = cl_performance[model_name]

# Calculate the average values across all tasks
avg_vl_a = np.mean([tasks[f'stage_{i}_avg']['VL'] for i in range(4)]) * 100
avg_nlu_d = np.mean([cl_performance_change[model_name][f'stage_{i}_avg']['NLU'] for i in range(4)]) * 100
avg_nlg_d = np.mean([cl_performance_change[model_name][f'stage_{i}_avg']['NLG'] for i in range(4)]) * 100

model_results = (
    name_mapping[model_name],
    tasks['stage_0_avg']['VL'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLG'] * 100,
    tasks['stage_1_avg']['VL'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLG'] * 100,
    tasks['stage_2_avg']['VL'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLG'] * 100,
    tasks['stage_3_avg']['VL'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLG'] * 100,
    avg_vl_a, avg_nlu_d, avg_nlg_d
)

latex_code += "{model} & {t0_vl_a:.1f} & {t0_nlu_d:.1f} & {t0_nlg_d:.1f} & {t1_vl_a:.1f} & {t1_nlu_d:.1f} & {t1_nlg_d:.1f} & {t2_vl_a:.1f} & {t2_nlu_d:.1f} & {t2_nlg_d:.1f} & {t3_vl_a:.1f} & {t3_nlu_d:.1f} & {t3_nlg_d:.1f} & {avg_vl_a:.1f} & {avg_nlu_d:.1f} & {avg_nlg_d:.1f} \\\\\n".format(
    model=model_results[0],
    t0_vl_a=model_results[1], t0_nlu_d=model_results[2], t0_nlg_d=model_results[3],
    t1_vl_a=model_results[4], t1_nlu_d=model_results[5], t1_nlg_d=model_results[6],
    t2_vl_a=model_results[7], t2_nlu_d=model_results[8], t2_nlg_d=model_results[9],
    t3_vl_a=model_results[10], t3_nlu_d=model_results[11], t3_nlg_d=model_results[12],
    avg_vl_a=model_results[13], avg_nlu_d=model_results[14], avg_nlg_d=model_results[15]
)

latex_code += "\\midrule\n"

# Include the rest of the models
for model_name, tasks in cl_performance.items():
    if model_name not in name_mapping or model_name == 'naive-ft':
        continue

    # Calculate the average values across all tasks
    avg_vl_a = np.mean([tasks[f'stage_{i}_avg']['VL'] for i in range(4)]) * 100
    avg_nlu_d = np.mean([cl_performance_change[model_name][f'stage_{i}_avg']['NLU'] for i in range(4)]) * 100
    avg_nlg_d = np.mean([cl_performance_change[model_name][f'stage_{i}_avg']['NLG'] for i in range(4)]) * 100

    model_results = (
        name_mapping[model_name],
        tasks['stage_0_avg']['VL'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_0_avg']['NLG'] * 100,
        tasks['stage_1_avg']['VL'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_1_avg']['NLG'] * 100,
        tasks['stage_2_avg']['VL'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_2_avg']['NLG'] * 100,
        tasks['stage_3_avg']['VL'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLU'] * 100, cl_performance_change[model_name]['stage_3_avg']['NLG'] * 100,
        avg_vl_a, avg_nlu_d, avg_nlg_d
    )
    
    latex_code += "{model} & {t0_vl_a:.1f} & {t0_nlu_d:.1f} & {t0_nlg_d:.1f} & {t1_vl_a:.1f} & {t1_nlu_d:.1f} & {t1_nlg_d:.1f} & {t2_vl_a:.1f} & {t2_nlu_d:.1f} & {t2_nlg_d:.1f} & {t3_vl_a:.1f} & {t3_nlu_d:.1f} & {t3_nlg_d:.1f} & {avg_vl_a:.1f} & {avg_nlu_d:.1f} & {avg_nlg_d:.1f} \\\\\n".format(
        model=model_results[0],
        t0_vl_a=model_results[1], t0_nlu_d=model_results[2], t0_nlg_d=model_results[3],
        t1_vl_a=model_results[4], t1_nlu_d=model_results[5], t1_nlg_d=model_results[6],
        t2_vl_a=model_results[7], t2_nlu_d=model_results[8], t2_nlg_d=model_results[9],
        t3_vl_a=model_results[10], t3_nlu_d=model_results[11], t3_nlg_d=model_results[12],
        avg_vl_a=model_results[13], avg_nlu_d=model_results[14], avg_nlg_d=model_results[15]
    )

latex_code += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)



\begin{table*}[h]
  \caption{\textbf{Model Performance:} Task-wise Accuracies and Forgetting of Each Mitigation Method across VL and NLU/NLG tasks}
  \label{tab:vl_nlu_acc}
  \centering
  \resizebox{\linewidth}{!}{
    \begin{tabular}{l|ccc|ccc|ccc|ccc|ccc}
     \toprule
     \textbf{Model} & \multicolumn{3}{c|}{\textbf{Task 0 (Instruct)}} & \multicolumn{3}{c|}{\textbf{Task 1 (VQA)}} & \multicolumn{3}{c|}{\textbf{Task 2 (OCR)}} & \multicolumn{3}{c|}{\textbf{Task 3 (Ref)}} & \multicolumn{3}{c}{\textbf{Avg.}} \\
     & \textbf{VL} & \textbf{NLU} & \textbf{NLG} & \textbf{VL} & \textbf{NLU} & \textbf{NLG} & \textbf{VL} & \textbf{NLU} & \textbf{NLG} & \textbf{VL} & \textbf{NLU} & \textbf{NLG} & \textbf{VL} & \textbf{NLU} & \textbf{NLG} \\
     & \textbf{(A $\uparrow$)} & \textbf{($\Delta \uparrow$)} & \textbf{($\Delta \uparrow$)} & \textbf{(A $\uparrow$)} & \textbf{($\Delta \uparrow$)} & \textbf{($\Delta \uparrow$)} & \textbf{(A $\uparrow$)} & \textbf{($\Delta \uparrow$)} & \textbf{($\Delt