In [1]:
import json

# Define the input and output file names
input_file = 'results_nlp.json'
output_file = 'results_CL.json'

# Load the JSON data from the input file
with open(input_file, 'r') as file:
    data = json.load(file)

# Filter out keys that start with 'cl-'
filtered_data = {key: value for key, value in data.items() if key.startswith('cl-') or key == 'reproduction-align-pythia+410m'}

# Write the filtered data to the output file
with open(output_file, 'w') as file:
    json.dump(filtered_data, file, indent=4)

print(f"Filtered data has been written to {output_file}")

Filtered data has been written to results_CL.json


In [2]:
import json
import numpy as np

# Define the dataset stages and the corresponding labels
stages = ["Instruct (0)", "VQA (1)", "OCR (2)", "Ref (3)"]
datasets = ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa", "refcoco"]
nlu_nlg_datasets = ["wsc273", "winogrande", "lambada_standard", "arc_easy", "arc_challenge"]
vl_datasets = datasets

# Define the mitigation methods and their sequence of model names
cl_runs = {
    "naive-ft": [
        "cl-instruct-stage-0-pythia+410m",
        "cl-vqa-stage-1-pythia+410m",
        "cl-ocr-stage-2-pythia+410m",
        "cl-ref-stage-3-pythia+410m"
    ],
    "sgm": [
        "cl-instruct-stage-0-pythia+410m-sgm",
        "cl-vqa-stage-1-pythia+410m-sgm",
        "cl-ocr-stage-2-pythia+410m-sgm",
        "cl-ref-stage-3-pythia+410m-sgm"
    ],
    "sgm-olf": [
        "cl-instruct-stage-0-pythia+410m-sgm-olf",
        "cl-vqa-stage-1-pythia+410m-sgm-olf",
        "cl-ocr-stage-2-pythia+410m-sgm-olf",
        "cl-ref-stage-3-pythia+410m-sgm-olf"
    ],
    "soft": [
        "cl-instruct-stage-0-pythia+410m-soft",
        "cl-vqa-stage-1-pythia+410m-soft",
        "cl-ocr-stage-2-pythia+410m-soft",
        "cl-ref-stage-3-pythia+410m-soft"
    ],
    "ia3": [
        "cl-instruct-stage-0-pythia+410m-ia3",
        "cl-vqa-stage-1-pythia+410m-ia3",
        "cl-ocr-stage-2-pythia+410m-ia3",
        "cl-ref-stage-3-pythia+410m-ia3"
    ],
    "lora": [
        "cl-instruct-stage-0-pythia+410m-lora",
        "cl-vqa-stage-1-pythia+410m-lora",
        "cl-ocr-stage-2-pythia+410m-lora",
        "cl-ref-stage-3-pythia+410m-lora"
    ],
    "rehearsal1": [
        "cl-instruct-stage-0-pythia+410m",
        "cl-vqa-stage-1-pythia+410m-rehearsal1",
        "cl-ocr-stage-2-pythia+410m-rehearsal1",
        "cl-ref-stage-3-pythia+410m-rehearsal1"
    ],
    "sgm-rehearsal1": [
        "cl-instruct-stage-0-pythia+410m-sgm",
        "cl-vqa-stage-1-pythia+410m-sgm-rehearsal1",
        "cl-ocr-stage-2-pythia+410m-sgm-rehearsal1",
        "cl-ref-stage-3-pythia+410m-sgm-rehearsal1"
    ]
}

# Baseline run_id
baseline_run_id = "reproduction-align-pythia+410m"

# Load the JSON data from the results file
with open('results_CL.json', 'r') as file:
    result = json.load(file)

# Check for the existence of baseline results
baseline_results = result.get(baseline_run_id)
if not baseline_results:
    raise ValueError(f"Baseline run ID '{baseline_run_id}' not found in results.")

# Calculate performance changes and averages for each CL run
cl_performance_change = {}
cl_performance = {}

for model_name, run_ids in cl_runs.items():
    changes = {}
    performances = {}
    missing_data = False

    for i, run_id in enumerate(run_ids):
        current_results = result.get(run_id)
        if not current_results:
            missing_data = True
            print(f"Run '{run_id}' missing for model '{model_name}'")
            break
        
        change = {dataset: current_results.get(dataset, np.nan) - baseline_results.get(dataset, np.nan) for dataset in baseline_results.keys()}
        changes[f'stage_{i}'] = change
        performances[f'stage_{i}'] = current_results
        
        avg_delta_vl = np.nanmean([change[dataset] for dataset in vl_datasets])
        avg_acc_vl = np.nanmean([current_results.get(dataset, np.nan) for dataset in vl_datasets])
        avg_delta_nlu = np.nanmean([change[dataset] for dataset in nlu_nlg_datasets if dataset != "lambada_standard"])
        avg_acc_nlu = np.nanmean([current_results.get(dataset, np.nan) for dataset in nlu_nlg_datasets if dataset != "lambada_standard"])
        avg_delta_nlg = change["lambada_standard"]
        avg_acc_nlg = current_results.get("lambada_standard", np.nan)
        
        changes[f'stage_{i}_avg'] = {'VL': avg_delta_vl, 'NLU': avg_delta_nlu, 'NLG': avg_delta_nlg}
        performances[f'stage_{i}_avg'] = {'VL': avg_acc_vl, 'NLU': avg_acc_nlu, 'NLG': avg_acc_nlg}
    
    if not missing_data:
        cl_performance_change[model_name] = changes
        cl_performance[model_name] = performances

# Save the performance changes and averages to JSON files
with open('cl_performance_change.json', 'w') as f:
    json.dump(cl_performance_change, f, indent=2)

with open('cl_performance.json', 'w') as f:
    json.dump(cl_performance, f, indent=2)

# Generate the LaTeX table
name_mapping = {
    'sgm': 'SGM',
    'sgm-rehearsal1': 'SGM Rehearsal',
    'sgm-olf': 'SGM OLF',
    'rehearsal1': 'Rehearsal1',
    'lora': 'LoRA',
    'naive-ft': 'Naive FT',
    'soft': 'Soft',
    'ia3': 'IA3'
}

latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{Model Performance:} Task-wise Accuracies and Forgetting of Each Mitigation Method across VL and NLU/NLG tasks}
  \\label{tab:vl_nlu_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cc|cc|cc|cc|cc|cc|cc|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{6}{c|}{\\textbf{Vision-Language (VL)}} & \\multicolumn{6}{c}{\\textbf{NLU/NLG}} \\\\
     & \\multicolumn{2}{c|}{\\textbf{Instruct (0)}} & \\multicolumn{2}{c|}{\\textbf{VQA (1)}} & \\multicolumn{2}{c|}{\\textbf{OCR (2)}} & \\multicolumn{2}{c|}{\\textbf{Ref (3)}} & \\multicolumn{2}{c|}{\\textbf{Instruct (0)}} & \\multicolumn{2}{c|}{\\textbf{VQA (1)}} & \\multicolumn{2}{c}{\\textbf{OCR (2)}} \\\\
     & \\textbf{A} & \\textbf{$\\Delta$} & \\textbf{A} & \\textbf{$\\Delta$} & \\textbf{A} & \\textbf{$\\Delta$} & \\textbf{A} & \\textbf{$\\Delta$} & \\textbf{A} & \\textbf{$\\Delta$} & \\textbf{A} & \\textbf{$\\Delta$} \\\\
     \\midrule
"""

# Include the Naive-FT benchmark first
model_name = 'naive-ft'
tasks = cl_performance[model_name]

for stage_idx in range(4):
    task_perf = tasks.get(f'stage_{stage_idx}_avg', {'VL': np.nan, 'NLU': np.nan, 'NLG': np.nan})
    task_forget = cl_performance_change[model_name].get(f'stage_{stage_idx}_avg', {'VL': np.nan, 'NLU': np.nan, 'NLG': np.nan})

    model_results = (
        name_mapping[model_name],
        task_perf['VL'] * 100, task_forget['VL'] * 100,
        task_perf['NLU'] * 100, task_forget['NLU'] * 100,
        task_perf['NLG'] * 100, task_forget['NLG'] * 100
    )
    latex_code += "{model} & {t1_vl_a:.1f} & {t1_vl_d:.1f} & {t2_vl_a:.1f} & {t2_vl_d:.1f} & {t3_vl_a:.1f} & {t3_vl_d:.1f} & {t4_vl_a:.1f} & {t4_vl_d:.1f} & {t1_nlu_a:.1f} & {t1_nlu_d:.1f} & {t2_nlu_a:.1f} & {t2_nlu_d:.1f} & {t3_nlu_a:.1f} & {t3_nlu_d:.1f} & {t4_nlu_a:.1f} & {t4_nlu_d:.1f} \\\\\n".format(
        model=model_results[0],
        t1_vl_a=model_results[1], t1_vl_d=model_results[2],
        t2_vl_a=model_results[3], t2_vl_d=model_results[4],
        t3_vl_a=model_results[5], t3_vl_d=model_results[6],
        t4_vl_a=model_results[7], t4_vl_d=model_results[8],
        t1_nlu_a=model_results[9], t1_nlu_d=model_results[10],
        t2_nlu_a=model_results[11], t2_nlu_d=model_results[12],
        t3_nlu_a=model_results[13], t3_nlu_d=model_results[14],
        t4_nlu_a=model_results[15], t4_nlu_d=model_results[16]
    )

latex_code += "\\midrule\n"

# Include the rest of the models
for model_name, tasks in cl_performance.items():
    if model_name not in name_mapping or model_name == 'naive-ft':
        continue

    for stage_idx in range(4):
        task_perf = tasks.get(f'stage_{stage_idx}_avg', {'VL': np.nan, 'NLU': np.nan, 'NLG': np.nan})
        task_forget = cl_performance_change[model_name].get(f'stage_{stage_idx}_avg', {'VL': np.nan, 'NLU': np.nan, 'NLG': np.nan})

        model_results = (
            name_mapping[model_name],
            task_perf['VL'] * 100, task_forget['VL'] * 100,
            task_perf['NLU'] * 100, task_forget['NLU'] * 100,
            task_perf['NLG'] * 100, task_forget['NLG'] * 100
        )
        latex_code += "{model} & {t1_vl_a:.1f} & {t1_vl_d:.1f} & {t2_vl_a:.1f} & {t2_vl_d:.1f} & {t3_vl_a:.1f} & {t3_vl_d:.1f} & {t4_vl_a:.1f} & {t4_vl_d:.1f} & {t1_nlu_a:.1f} & {t1_nlu_d:.1f} & {t2_nlu_a:.1f} & {t2_nlu_d:.1f} & {t3_nlu_a:.1f} & {t3_nlu_d:.1f} & {t4_nlu_a:.1f} & {t4_nlu_d:.1f} \\\\\n".format(
            model=model_results[0],
            t1_vl_a=model_results[1], t1_vl_d=model_results[2],
            t2_vl_a=model_results[3], t2_vl_d=model_results[4],
            t3_vl_a=model_results[5], t3_vl_d=model_results[6],
            t4_vl_a=model_results[7], t4_vl_d=model_results[8],
            t1_nlu_a=model_results[9], t1_nlu_d=model_results[10],
            t2_nlu_a=model_results[11], t2_nlu_d=model_results[12],
            t3_nlu_a=model_results[13], t3_nlu_d=model_results[14],
            t4_nlu_a=model_results[15], t4_nlu_d=model_results[16]
        )

latex_code += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)


Run 'cl-instruct-stage-0-pythia+410m-lora' missing for model 'lora'
Run 'cl-vqa-stage-1-pythia+410m-sgm-rehearsal1' missing for model 'sgm-rehearsal1'


IndexError: tuple index out of range