In [3]:
import os
import json
import numpy as np
from scipy.stats import hmean

# Floor value substituted for missing, NaN or zero scores (default of
# replace_none_and_zeros)
EPSILON = 0.001

# Display labels for the four training stages, in training order
stages = [
    "Instruct (0)",
    "VQA (1)",
    "OCR (2)",
    "Ref (3)",
]

# Vision-language dataset keys
datasets = [
    "vqa-v2",
    "textvqa-ocr",
    "textvqa-pure",
    "gqa",
    "refcoco",
]

# Language-only dataset keys, averaged into the "NL" numbers
nlu_nlg_datasets = [
    "wsc273",
    "winogrande",
    "lambada_standard",
    "arc_easy",
    "arc_challenge",
]

# Vision-language datasets averaged into the "VL" number after each stage;
# entry i is used for the i-th run of every method
vl_evaluate_sequence = [
    ["vqa-v2"],
    ["vqa-v2", "gqa"],
    ["vqa-v2", "gqa", "textvqa-ocr", "textvqa-pure"],
    ["vqa-v2", "gqa", "textvqa-ocr", "textvqa-pure", "refcoco"],
]

# Mitigation methods and the run ID of each of their four training stages.
# Every ID has the form "cl-<task>-stage-<idx>-pythia+410m<suffix>"; a method
# is described by the suffix of its stage-0 run and the suffix shared by its
# later stages (the rehearsal variants start from a differently named stage-0
# run than their later stages).
_STAGE_TASKS = ("instruct", "vqa", "ocr", "ref")

_SUFFIXES_BY_METHOD = {
    # method: (stage-0 suffix, suffix of stages 1-3)
    "naive-ft": ("", ""),
    "olf": ("-olf", "-olf"),
    "soft": ("-soft", "-soft"),
    "ia3": ("-ia3", "-ia3"),
    "lora": ("-lora", "-lora"),
    "sgm": ("-sgm", "-sgm"),
    "sgm-olf": ("-sgm-olf", "-sgm-olf"),
    "rehearsal1": ("", "-rehearsal1"),
    "rehearsal10": ("", "-rehearsal10"),
    "sgm-rehearsal1": ("-sgm", "-sgm-rehearsal1"),
}

# Dict insertion order is kept, so methods iterate in the order listed above.
cl_runs = {
    method: [
        f"cl-{task}-stage-{idx}-pythia+410m{first_suffix if idx == 0 else later_suffix}"
        for idx, task in enumerate(_STAGE_TASKS)
    ]
    for method, (first_suffix, later_suffix) in _SUFFIXES_BY_METHOD.items()
}

# Run ID whose scores serve as the reference point for every delta
baseline_run_id = "reproduction-align-pythia+410m"

# Load the JSON data from the results file. The encoding is passed explicitly
# so that parsing does not depend on the platform's default locale encoding.
with open('results_nlp.json', 'r', encoding='utf-8') as file:
    result = json.load(file)

# Stop early when the baseline entry is absent or empty: all performance
# changes computed later are measured against it.
baseline_results = result.get(baseline_run_id)
if not baseline_results:
    raise ValueError(f"Baseline run ID '{baseline_run_id}' not found in results.")

# Sanitise a list of scores before it is averaged with hmean
def replace_none_and_zeros(arr, epsilon=EPSILON):
    """Return a new list in which unusable scores are swapped for *epsilon*.

    An entry counts as unusable when it is ``None``, NaN, or equal to zero;
    every other entry is passed through unchanged and in its original
    position.
    """
    cleaned = []
    for value in arr:
        # `is None` is tested first so np.isnan never receives None.
        unusable = value is None or np.isnan(value) or value == 0
        cleaned.append(epsilon if unusable else value)
    return cleaned

# Calculate performance changes and averages for each CL run.
# Uses `result`, `baseline_results`, `cl_runs`, `vl_evaluate_sequence`,
# `nlu_nlg_datasets` and `replace_none_and_zeros` defined earlier.
cl_performance_change = {}
cl_performance = {}


def _score_or_nan(results, dataset):
    """Return ``results[dataset]``, mapping a missing key or a None value to NaN.

    A stored None would otherwise make the subtraction below raise TypeError.
    """
    value = results.get(dataset)
    return np.nan if value is None else value


for model_name, run_ids in cl_runs.items():
    changes = {}
    performances = {}
    missing_data = False

    for i, run_id in enumerate(run_ids):
        current_results = result.get(run_id)
        if not current_results:
            # One missing stage invalidates the whole method: stop and skip it.
            missing_data = True
            print(f"Run '{run_id}' missing for model '{model_name}'")
            break

        # Per-dataset change relative to the baseline (current - baseline).
        changes[f'stage_{i}'] = {
            dataset: _score_or_nan(current_results, dataset) - _score_or_nan(baseline_results, dataset)
            for dataset in baseline_results
        }
        performances[f'stage_{i}'] = current_results

        # Harmonic mean over the VL datasets assigned to stage i.
        vl_scores = replace_none_and_zeros([current_results.get(dataset, np.nan) for dataset in vl_evaluate_sequence[i]])
        baseline_vl_scores = replace_none_and_zeros([baseline_results.get(dataset, np.nan) for dataset in vl_evaluate_sequence[i]])
        avg_acc_vl = hmean(vl_scores)
        # Averaged deltas are baseline - current, so a positive value is a drop.
        avg_delta_vl = hmean(baseline_vl_scores) - avg_acc_vl

        # Harmonic mean over the language-only datasets (same set at every stage).
        nl_scores = replace_none_and_zeros([current_results.get(dataset, np.nan) for dataset in nlu_nlg_datasets])
        baseline_nl_scores = replace_none_and_zeros([baseline_results.get(dataset, np.nan) for dataset in nlu_nlg_datasets])
        avg_acc_nl = hmean(nl_scores)
        avg_delta_nl = hmean(baseline_nl_scores) - avg_acc_nl

        changes[f'stage_{i}_avg'] = {'VL': avg_delta_vl, 'NL': avg_delta_nl}
        performances[f'stage_{i}_avg'] = {'VL': avg_acc_vl, 'NL': avg_acc_nl}
        print(f"Model: {model_name}, Stage: {i}, VL: {avg_acc_vl}, NL: {avg_acc_nl}")

    # Only methods with results for every stage are kept.
    if not missing_data:
        cl_performance_change[model_name] = changes
        cl_performance[model_name] = performances

# Write both result dictionaries to disk as indented JSON, deltas first.
for out_path, payload in (
    ('cl_performance_change.json', cl_performance_change),
    ('cl_performance.json', cl_performance),
):
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=2)

# Generate the LaTeX table
# Display name of each mitigation method in the table. Labels containing LaTeX
# markup are raw strings: "\(", "\%" and "\)" are not valid Python escape
# sequences and trigger a warning in ordinary string literals.
name_mapping = {
    'olf': 'OLF',
    'sgm': 'mSGM',
    'sgm-rehearsal1': r'mSGM + Rehearsal \((1\%)\)',
    'sgm-olf': 'mSGM + OLF',
    'rehearsal1': r'Rehearsal \((1\%)\)',
    'rehearsal10': r'Rehearsal \((10\%)\)',
    'lora': 'LoRA',
    'naive-ft': 'Original LLaVA',
    'soft': 'Soft Targets (ST)',
    'ia3': 'IA3',
}

# Fixed table preamble and three header rows for the four tasks. Each task has
# a VL column (accuracy) and an NL column (delta). A raw string keeps the
# LaTeX backslashes literal.
latex_code = r"""
\begin{table*}[h]
  \caption{\textbf{Model Performance:} Task-wise Accuracies and Forgetting of Each Mitigation Method across VL and NL tasks}
  \label{tab:vl_nl_acc}
  \centering
  \resizebox{\linewidth}{!}{
    \begin{tabular}{l|cc|cc|cc|cc}
     \toprule
     \textbf{Model} & \multicolumn{2}{c|}{\textbf{Task 0 (Instruct)}} & \multicolumn{2}{c|}{\textbf{Task 1 (VQA)}} & \multicolumn{2}{c|}{\textbf{Task 2 (OCR)}} & \multicolumn{2}{c}{\textbf{Task 3 (Ref)}} \\
     & \textbf{VL} & \textbf{NL} & \textbf{VL} & \textbf{NL} & \textbf{VL} & \textbf{NL} & \textbf{VL} & \textbf{NL} \\
     & \textbf{(A $\uparrow$)} & \textbf{($\Delta \downarrow$)} & \textbf{(A $\uparrow$)} & \textbf{($\Delta \downarrow$)} & \textbf{(A $\uparrow$)} & \textbf{($\Delta \downarrow$)} & \textbf{(A $\uparrow$)} & \textbf{($\Delta \downarrow$)} \\
     \midrule
"""

# Table body: the naive fine-tuning row first, a rule, then every other
# method in `cl_performance` order, followed by the closing markup.
def _format_table_row(model_name, num_stages=4):
    """Return one LaTeX table row for *model_name*.

    For each stage the row holds the average VL accuracy (from
    `cl_performance`) followed by the average NL delta (from
    `cl_performance_change`), both multiplied by 100 and printed with two
    decimals. *num_stages* defaults to 4 to match the hard-coded header.
    """
    accuracies = cl_performance[model_name]
    deltas = cl_performance_change[model_name]
    cells = [name_mapping[model_name]]
    for stage_idx in range(num_stages):
        key = f'stage_{stage_idx}_avg'
        cells.append(f"{accuracies[key]['VL'] * 100:.2f}")
        cells.append(f"{deltas[key]['NL'] * 100:.2f}")
    return " & ".join(cells) + " \\\\\n"


# Include the Naive-FT benchmark first
reference_model = 'naive-ft'
if reference_model not in cl_performance:
    # A method is absent when one of its runs was missing from the results;
    # say so explicitly instead of failing with a bare key lookup.
    raise KeyError(
        f"No complete results for '{reference_model}'; cannot build the reference row."
    )
latex_code += _format_table_row(reference_model)

latex_code += "\\midrule\n"

# Include the rest of the models (only those that have a display name)
for model_name in cl_performance:
    if model_name not in name_mapping or model_name == reference_model:
        continue
    latex_code += _format_table_row(model_name)

latex_code += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)


Model: naive-ft, Stage: 0, VL: 0.0037, NL: 0.3863352100999474
Model: naive-ft, Stage: 1, VL: 0.4421524886376234, NL: 0.2700812156708484
Model: naive-ft, Stage: 2, VL: 0.16669767843708932, NL: 0.34263702858390443
Model: naive-ft, Stage: 3, VL: 0.004805223973421436, NL: 0.3151680192897782
Model: olf, Stage: 0, VL: 0.0040999999999999995, NL: 0.38621147563923447
Model: olf, Stage: 1, VL: 0.43272009065155803, NL: 0.3355705798232125
Model: olf, Stage: 2, VL: 0.10596761881622202, NL: 0.35619713904716904
Model: olf, Stage: 3, VL: 0.020523550104608348, NL: 0.3279029484150912
Model: soft, Stage: 0, VL: 0.0045000000000000005, NL: 0.38408061658239784
Model: soft, Stage: 1, VL: 0.0015918367346938777, NL: 0.2454425568378775
Model: soft, Stage: 2, VL: 0.10230418779859914, NL: 0.33810199662515095
Model: soft, Stage: 3, VL: 0.003102369722228088, NL: 0.2831706057243446
Model: ia3, Stage: 0, VL: 0.001, NL: 0.40600564872147704
Model: ia3, Stage: 1, VL: 0.24948089844499902, NL: 0.3957752637623065
Model: ia

In [23]:
# Configuration for the Instruct -> OCR -> Ref -> VQA task order.
# Stage labels used as column headers of the generated table
stages = [
    "Task 2 (Instruct)",
    "Task 3 (OCR)",
    "Task 4 (Ref)",
    "Task 5 (VQA)",
]

# Vision-language datasets averaged after each stage (entry i <-> stage i)
vl_evaluate_sequence = [
    ["gqa"],
    ["gqa", "textvqa-ocr", "textvqa-pure"],
    ["refcoco", "gqa", "textvqa-ocr", "textvqa-pure"],
    ["vqa-v2", "gqa", "textvqa-ocr", "textvqa-pure", "refcoco"],
]

# Run IDs per mitigation method, one per stage
cl_runs = {
    "naive-ft": [
        "cl-instruct-stage-0-pythia+160m-iorv",
        "cl-ocr-stage-1-pythia+160m-iorv",
        "cl-ref-stage-2-pythia+160m-iorv",
        "cl-vqa-stage-3-pythia+160m-iorv",
    ],
    "sgm-rehearsal1": [
        # NOTE(review): this is the only ID in the cell without the "-iorv"
        # suffix -- confirm that is intentional.
        "cl-instruct-stage-0-pythia+160m-sgm",
        "cl-ocr-stage-1-pythia+160m-sgm-rehearsal1-iorv",
        "cl-ref-stage-2-pythia+160m-sgm-rehearsal1-iorv",
        "cl-vqa-stage-3-pythia+160m-sgm-rehearsal1-iorv",
    ],
}


In [25]:
# Configuration for the VQA -> OCR -> Instruct -> Ref task order.
# Stage labels used as column headers of the generated table
stages = [
    "Task 2 (VQA)",
    "Task 3 (OCR)",
    "Task 4 (Instruct)",
    "Task 5 (Ref)",
]

# Vision-language datasets averaged after each stage (entry i <-> stage i)
vl_evaluate_sequence = [
    ["vqa-v2"],
    ["vqa-v2", "textvqa-ocr", "textvqa-pure"],
    ["vqa-v2", "gqa", "textvqa-ocr", "textvqa-pure"],
    ["vqa-v2", "gqa", "textvqa-ocr", "textvqa-pure", "refcoco"],
]

# Run IDs per mitigation method, one per stage
cl_runs = {
    "naive-ft": [
        "cl-vqa-stage-0-pythia+160m-voir",
        "cl-ocr-stage-1-pythia+160m-voir",
        "cl-instruct-stage-2-pythia+160m-voir",
        "cl-ref-stage-3-pythia+160m-voir",
    ],
    "sgm-rehearsal1": [
        "cl-vqa-stage-0-pythia+160m-sgm-voir",
        "cl-ocr-stage-1-pythia+160m-sgm-rehearsal1-voir",
        "cl-instruct-stage-2-pythia+160m-sgm-rehearsal1-voir",
        "cl-ref-stage-3-pythia+160m-sgm-rehearsal1-voir",
    ],
}


In [1]:
import os
import json
import numpy as np
from scipy.stats import hmean

# Floor value substituted for missing, NaN or zero scores (default of
# replace_none_and_zeros)
EPSILON = 0.001

# Vision-language dataset keys
datasets = [
    "vqa-v2",
    "textvqa-ocr",
    "textvqa-pure",
    "gqa",
    "refcoco",
]

# Language-only dataset keys, averaged into the "NL" numbers
nlu_nlg_datasets = [
    "wsc273",
    "winogrande",
    "lambada_standard",
    "arc_easy",
    "arc_challenge",
]


# Run ID whose scores serve as the reference point for every delta
baseline_run_id = "reproduction-align-pythia+160m"

# Load the JSON data from the results file. The encoding is passed explicitly
# so that parsing does not depend on the platform's default locale encoding.
with open('results_nlp.json', 'r', encoding='utf-8') as file:
    result = json.load(file)

# Stop early when the baseline entry is absent or empty: all performance
# changes computed later are measured against it.
baseline_results = result.get(baseline_run_id)
if not baseline_results:
    raise ValueError(f"Baseline run ID '{baseline_run_id}' not found in results.")

# Sanitise a list of scores before it is averaged with hmean
def replace_none_and_zeros(arr, epsilon=EPSILON):
    """Return a new list in which unusable scores are swapped for *epsilon*.

    An entry counts as unusable when it is ``None``, NaN, or equal to zero;
    every other entry is passed through unchanged and in its original
    position.
    """
    cleaned = []
    for value in arr:
        # `is None` is tested first so np.isnan never receives None.
        unusable = value is None or np.isnan(value) or value == 0
        cleaned.append(epsilon if unusable else value)
    return cleaned

# Calculate performance changes and averages for each CL run.
# `cl_runs` and `vl_evaluate_sequence` are not defined in this cell: one of
# the configuration cells that assigns them must be executed first, otherwise
# the loop below fails with NameError.
cl_performance_change = {}
cl_performance = {}


def _score_or_nan(results, dataset):
    """Return ``results[dataset]``, mapping a missing key or a None value to NaN.

    A stored None would otherwise make the subtraction below raise TypeError.
    """
    value = results.get(dataset)
    return np.nan if value is None else value


for model_name, run_ids in cl_runs.items():
    changes = {}
    performances = {}
    missing_data = False

    for i, run_id in enumerate(run_ids):
        current_results = result.get(run_id)
        if not current_results:
            # One missing stage invalidates the whole method: stop and skip it.
            missing_data = True
            print(f"Run '{run_id}' missing for model '{model_name}'")
            break

        # Per-dataset change relative to the baseline (current - baseline).
        changes[f'stage_{i}'] = {
            dataset: _score_or_nan(current_results, dataset) - _score_or_nan(baseline_results, dataset)
            for dataset in baseline_results
        }
        performances[f'stage_{i}'] = current_results

        # Harmonic mean over the VL datasets assigned to stage i.
        vl_scores = replace_none_and_zeros([current_results.get(dataset, np.nan) for dataset in vl_evaluate_sequence[i]])
        baseline_vl_scores = replace_none_and_zeros([baseline_results.get(dataset, np.nan) for dataset in vl_evaluate_sequence[i]])
        avg_acc_vl = hmean(vl_scores)
        # Averaged deltas are baseline - current, so a positive value is a drop.
        avg_delta_vl = hmean(baseline_vl_scores) - avg_acc_vl

        # Harmonic mean over the language-only datasets (same set at every stage).
        nl_scores = replace_none_and_zeros([current_results.get(dataset, np.nan) for dataset in nlu_nlg_datasets])
        baseline_nl_scores = replace_none_and_zeros([baseline_results.get(dataset, np.nan) for dataset in nlu_nlg_datasets])
        avg_acc_nl = hmean(nl_scores)
        avg_delta_nl = hmean(baseline_nl_scores) - avg_acc_nl

        changes[f'stage_{i}_avg'] = {'VL': avg_delta_vl, 'NL': avg_delta_nl}
        performances[f'stage_{i}_avg'] = {'VL': avg_acc_vl, 'NL': avg_acc_nl}
        print(f"Model: {model_name}, Stage: {i}, VL: {avg_acc_vl}, NL: {avg_acc_nl}")

    # Only methods with results for every stage are kept.
    if not missing_data:
        cl_performance_change[model_name] = changes
        cl_performance[model_name] = performances

# Write both result dictionaries to disk as indented JSON, deltas first.
for out_path, payload in (
    ('cl_performance_change.json', cl_performance_change),
    ('cl_performance.json', cl_performance),
):
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=2)

# Generate the LaTeX table
# Display name of each mitigation method in the table. The label containing
# LaTeX markup is a raw string: "\(", "\%" and "\)" are not valid Python
# escape sequences and trigger a warning in ordinary string literals.
name_mapping = {
    'sgm-rehearsal1': r'mSGM + Rehearsal \((1\%)\)',
    'naive-ft': 'Original LLaVA',
}

# Table preamble. The tabular column spec is derived from the number of
# stages (one "|cc" pair per stage) so it always matches the header built
# below.
latex_code = (
    """
\\begin{table*}[h]
  \\caption{\\textbf{Model Performance:} Task-wise Accuracies and Forgetting of Each Mitigation Method across VL and NL tasks}
  \\label{tab:vl_nl_acc}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l"""
    + "|cc" * len(stages)
    + """}
     \\toprule
     \\textbf{Model}"""
)

# Add multicolumn headers based on stages. Every group but the last gets a
# trailing vertical rule ("c|"). Stripping '|' from the end of the string
# afterwards cannot work, because the header text ends in "}}".
last_stage_idx = len(stages) - 1
for stage_idx, stage in enumerate(stages):
    column_spec = "c" if stage_idx == last_stage_idx else "c|"
    latex_code += " & \\multicolumn{2}{" + column_spec + "}{\\textbf{" + stage + "}}"

latex_code += " \\\\\n"

# Add subheaders for VL and NL metrics (one VL/NL pair per stage)
latex_code += "     & \\textbf{VL (A $\\uparrow$)} & \\textbf{NL ($\\Delta \\downarrow$)}" * len(stages)
latex_code += " \\\\\n     \\midrule\n"
# Table body: the naive fine-tuning row first, a rule, then every other
# method in `cl_performance` order, followed by the closing markup.
def _format_table_row(model_name, num_stages):
    """Return one LaTeX table row for *model_name*.

    For each stage the row holds the average VL accuracy (from
    `cl_performance`) followed by the average NL delta (from
    `cl_performance_change`), both multiplied by 100 and printed with two
    decimals.
    """
    accuracies = cl_performance[model_name]
    deltas = cl_performance_change[model_name]
    cells = [name_mapping[model_name]]
    for stage_idx in range(num_stages):
        key = f'stage_{stage_idx}_avg'
        cells.append(f"{accuracies[key]['VL'] * 100:.2f}")
        cells.append(f"{deltas[key]['NL'] * 100:.2f}")
    return " & ".join(cells) + " \\\\\n"


# The header above is generated from `stages`, so each row needs one VL/NL
# pair per stage label.
num_table_stages = len(stages)

# Include the Naive-FT benchmark first
reference_model = 'naive-ft'
if reference_model not in cl_performance:
    # A method is absent when one of its runs was missing from the results;
    # say so explicitly instead of failing with a bare key lookup.
    raise KeyError(
        f"No complete results for '{reference_model}'; cannot build the reference row."
    )
latex_code += _format_table_row(reference_model, num_table_stages)

latex_code += "\\midrule\n"

# Include the rest of the models (only those that have a display name)
for model_name in cl_performance:
    if model_name not in name_mapping or model_name == reference_model:
        continue
    latex_code += _format_table_row(model_name, num_table_stages)

latex_code += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)


NameError: name 'cl_runs' is not defined