In [3]:
import json
import numpy as np
from scipy.stats import hmean
# Load the benchmark results from the JSON file.
# The code below indexes the loaded object as results[run_name][dataset_name].
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open('results_nlp.json', 'r', encoding='utf-8') as f:
    results = json.load(f)
# Small positive constant intended as a stand-in for zero scores
# (nothing in the visible code references it)
EPSILON = 1e-3

# Benchmark names, grouped by the capability they evaluate
nlu_datasets = [
    "wsc273",
    "winogrande",
    "arc_easy",
    "arc_challenge",
]
nlg_datasets = [
    "lambada_standard",
]
vl_datasets = [
    "vqa-v2",
    "textvqa-ocr",
    "textvqa-pure",
    "gqa",
]

# One row per model pair: (scale key, LLaVA run name, base-LLM run name).
# Keeping both run names on the same row guarantees that the two lookup
# tables built below share exactly the same scale keys.
_run_pairs = {
    'Pythia': [
        (0.16, "stage-final-llava-v15-pythia+160m", "reproduction-align-pythia+160m"),
        (0.41, "stage-final-llava-v15-pythia+410m", "reproduction-align-pythia+410m"),
        (1.0, "stage-final-llava-v15-pythia+1b", "reproduction-align-pythia+1b"),
        (1.4, "stage-final-llava-v15-pythia+1p4b", "reproduction-align-pythia+1p4b"),
        (2.8, "stage-final-llava-v15-pythia+2p8b", "reproduction-align-pythia+2p8b"),
        # Pythia 1.4B Instruct
        ('1.4-instruct', "stage-final-llava-v15-pythia+1p4b-instruct", "reproduction-align-pythia+1p4b-instruct"),
    ],
    'Phi2': [
        ('phi2+3b', "stage-final-llava-v15-phi2+3b-repeat", "reproduction-align-phi2+3b"),
    ],
    'Vicuna1.5': [
        ('7b', "reproduction-llava-v15+7b+stage-finetune+x7", "reproduction-llava-v15+7b+stage-align+x7"),
    ],
    'LLaMA2': [
        ('7b', "reproduction-llama2", "vila_base_llm"),
    ],
}

# Expand the rows into the nested {family: {role: {scale: run_name}}} lookup
model_mappings = {
    family: {
        'llava_original': {scale: llava_run for scale, llava_run, _ in pairs},
        'base_llm': {scale: base_run for scale, _, base_run in pairs},
    }
    for family, pairs in _run_pairs.items()
}

# Function to calculate NLU and NLG forgetting using harmonic mean
def calculate_forgetting_and_performance(model_mappings, results, *,
                                         nlu=None, nlg=None, vl=None):
    """Compute NLU/NLG forgetting and VL performance for every model pair.

    Each scale listed under ``'llava_original'`` is paired with the run of the
    same scale under ``'base_llm'``. Scores are multiplied by 100 and
    aggregated per dataset group with the harmonic mean.

    Args:
        model_mappings: ``{family: {'llava_original': {scale: run_name},
            'base_llm': {scale: run_name}}}``.
        results: ``{run_name: {dataset_name: score}}``.
        nlu: Optional list of NLU dataset names; defaults to the module-level
            ``nlu_datasets``.
        nlg: Optional list of NLG dataset names; defaults to the module-level
            ``nlg_datasets``.
        vl: Optional list of vision-language dataset names; defaults to the
            module-level ``vl_datasets``.

    Returns:
        A list of ``(family, scale, nlu_forgetting, nlg_forgetting,
        avg_vl_performance)`` tuples. Forgetting is ``base - llava``, so a
        positive value means the LLaVA run scores lower than its base LLM.
        The VL value is the harmonic mean of the LLaVA run's VL scores. Any
        value is NaN when no usable score exists for it. Pairs for which
        either run is absent from ``results`` are skipped.

    Raises:
        KeyError: If a scale under ``'llava_original'`` has no counterpart
            under ``'base_llm'``.
    """
    # Resolve the defaults at call time so the module-level lists stay the
    # single source of truth for callers that pass nothing.
    nlu = nlu_datasets if nlu is None else nlu
    nlg = nlg_datasets if nlg is None else nlg
    vl = vl_datasets if vl is None else vl

    def percent_scores(run_name, datasets):
        """Return {dataset: score * 100} for datasets with a non-NaN score."""
        run_results = results[run_name]
        scaled = {name: run_results.get(name, np.nan) * 100 for name in datasets}
        return {name: value for name, value in scaled.items() if not np.isnan(value)}

    def forgetting(base_model, llava_model, datasets):
        """Return hmean(base) - hmean(llava) over the datasets both runs have."""
        base = percent_scores(base_model, datasets)
        llava = percent_scores(llava_model, datasets)
        # Aggregate both runs over the same datasets: a score that is missing
        # for only one run would otherwise skew the difference.
        shared = [name for name in datasets if name in base and name in llava]
        if not shared:
            return np.nan
        return hmean([base[name] for name in shared]) - hmean([llava[name] for name in shared])

    scores = []
    for model_type, mappings in model_mappings.items():
        for scale, llava_model in mappings['llava_original'].items():
            base_model = mappings['base_llm'][scale]
            if llava_model not in results or base_model not in results:
                continue

            vl_scores = list(percent_scores(llava_model, vl).values())
            avg_vl_performance = hmean(vl_scores) if vl_scores else np.nan

            scores.append((
                model_type,
                scale,
                forgetting(base_model, llava_model, nlu),
                forgetting(base_model, llava_model, nlg),
                avg_vl_performance,
            ))
    return scores

# Compute (family, scale, NLU forgetting, NLG forgetting, VL performance)
# for every model pair
scores = calculate_forgetting_and_performance(model_mappings, results)

# Display names for the scale keys of the Phi2, Vicuna1.5 and LLaMA2 families
scale_rename = {
    'phi2+3b': '3.0',
    '7b': '7.0'
}

# Swap in the display scale for those families; other rows pass through as-is
scores = [
    (
        model_type,
        scale_rename.get(scale, scale) if model_type in ('Phi2', 'Vicuna1.5', 'LLaMA2') else scale,
        nlu_forgetting,
        nlg_forgetting,
        vl_performance,
    )
    for model_type, scale, nlu_forgetting, nlg_forgetting, vl_performance in scores
]

# Generate the LaTeX table
latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{Model Performance:} Avg. NLU and NLG Deltas and VL Performance of Each Mitigation Method}
  \\label{tab:nlu_nlg_vl_performance}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|c|c|c}
     \\toprule
     \\textbf{Model} & \\textbf{Avg. NLU $\\Delta \\downarrow$} & \\textbf{Avg. NLG $\\delta \\downarrow$} & \\textbf{VL Performance (A $\\uparrow$)} \\\\
     \\midrule
"""

# One table row per model; the instruct variant is labelled "1.4B (Inst.)"
table_rows = []
for model_type, scale, nlu_forgetting, nlg_forgetting, vl_performance in scores:
    if scale == '1.4-instruct':
        model_label = f"{model_type} 1.4B (Inst.)"
    else:
        model_label = f"{model_type} {scale}B"
    table_rows.append(
        f"{model_label} & {nlu_forgetting:.2f} & {nlg_forgetting:.2f} & {vl_performance:.2f} \\\\\n"
    )
latex_code += "".join(table_rows)

# The footer starts directly with \bottomrule: an empty line inside tabular is
# a paragraph break, which opens a new row and breaks the rule that follows.
latex_code += """     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)


\begin{table*}[h]
  \caption{\textbf{Model Performance:} Avg. NLU and NLG Deltas and VL Performance of Each Mitigation Method}
  \label{tab:nlu_nlg_vl_performance}
  \centering
  \resizebox{\linewidth}{!}{
    \begin{tabular}{l|c|c|c}
     \toprule
     \textbf{Model} & \textbf{Avg. NLU $\Delta \downarrow$} & \textbf{Avg. NLG $\delta \downarrow$} & \textbf{VL Performance (A $\uparrow$)} \\
     \midrule
Pythia 0.16B & 0.94 & 12.01 & 5.29 \\
Pythia 0.41B & -1.19 & 8.62 & 30.37 \\
Pythia 1.0B & -1.63 & 4.95 & 43.58 \\
Pythia 1.4B & 0.55 & 8.07 & 43.97 \\
Pythia 2.8B & 1.74 & 9.18 & 44.20 \\
Pythia 1.4B (Inst.) & -1.20 & -1.01 & 43.93 \\
Phi2 3.0B & 2.60 & 4.39 & 25.33 \\
Vicuna1.5 7.0B & -0.98 & 2.04 & 56.55 \\
LLaMA2 7.0B & -2.15 & -0.43 & 57.22 \\

     \bottomrule
    \end{tabular}
  }
\end{table*}



In [5]:
import json
import numpy as np
from scipy.stats import hmean

# Small positive constant intended as a stand-in for zero scores
# (nothing in the visible code references it)
EPSILON = 1e-3

# Benchmark names, grouped by the capability they evaluate
nlu_datasets = [
    "wsc273",
    "winogrande",
    "arc_easy",
    "arc_challenge",
]
nlg_datasets = [
    "lambada_standard",
]
vl_datasets = [
    "vqa-v2",
    "textvqa-ocr",
    "textvqa-pure",
    "gqa",
]

# One row per model pair: (scale key, LLaVA run name, base-LLM run name).
# Keeping both run names on the same row guarantees that the two lookup
# tables built below share exactly the same scale keys.
_run_pairs = {
    'Pythia': [
        (0.16, "stage-final-llava-v15-pythia+160m", "reproduction-align-pythia+160m"),
        (0.41, "stage-final-llava-v15-pythia+410m", "reproduction-align-pythia+410m"),
        (1.0, "stage-final-llava-v15-pythia+1b", "reproduction-align-pythia+1b"),
        (1.4, "stage-final-llava-v15-pythia+1p4b", "reproduction-align-pythia+1p4b"),
        (2.8, "stage-final-llava-v15-pythia+2p8b", "reproduction-align-pythia+2p8b"),
        # Pythia 1.4B Instruct
        ('1.4-instruct', "stage-final-llava-v15-pythia+1p4b-instruct", "reproduction-align-pythia+1p4b-instruct"),
    ],
    'Phi2': [
        ('phi2+3b', "stage-final-llava-v15-phi2+3b-repeat", "reproduction-align-phi2+3b"),
    ],
    'Vicuna1.5': [
        ('7b', "reproduction-llava-v15+7b+stage-finetune+x7", "reproduction-llava-v15+7b+stage-align+x7"),
    ],
    'LLaMA2': [
        ('7b', "reproduction-llama2", "vila_base_llm"),
    ],
}

# Expand the rows into the nested {family: {role: {scale: run_name}}} lookup
model_mappings = {
    family: {
        'llava_original': {scale: llava_run for scale, llava_run, _ in pairs},
        'base_llm': {scale: base_run for scale, _, base_run in pairs},
    }
    for family, pairs in _run_pairs.items()
}

# Function to calculate NLU and NLG forgetting using harmonic mean
def calculate_forgetting_and_performance(model_mappings, results, *,
                                         nlu=None, nlg=None):
    """Compute the NLU and NLG score change of every LLaVA run vs. its base LLM.

    Each scale listed under ``'llava_original'`` is paired with the run of the
    same scale under ``'base_llm'``. Scores are multiplied by 100 and
    aggregated per dataset group with the harmonic mean.

    Args:
        model_mappings: ``{family: {'llava_original': {scale: run_name},
            'base_llm': {scale: run_name}}}``.
        results: ``{run_name: {dataset_name: score}}``.
        nlu: Optional list of NLU dataset names; defaults to the module-level
            ``nlu_datasets``.
        nlg: Optional list of NLG dataset names; defaults to the module-level
            ``nlg_datasets``.

    Returns:
        A list of ``(family, scale, nlu_forgetting, nlg_forgetting)`` tuples.
        Both values are ``llava - base``, so a negative value means the LLaVA
        run scores lower than its base LLM. A value is NaN when the two runs
        share no usable score. Pairs for which either run is absent from
        ``results`` are skipped.

    Raises:
        KeyError: If a scale under ``'llava_original'`` has no counterpart
            under ``'base_llm'``.
    """
    # Resolve the defaults at call time so the module-level lists stay the
    # single source of truth for callers that pass nothing.
    nlu = nlu_datasets if nlu is None else nlu
    nlg = nlg_datasets if nlg is None else nlg

    def percent_scores(run_name, datasets):
        """Return {dataset: score * 100} for datasets with a non-NaN score."""
        run_results = results[run_name]
        scaled = {name: run_results.get(name, np.nan) * 100 for name in datasets}
        return {name: value for name, value in scaled.items() if not np.isnan(value)}

    def score_change(llava_model, base_model, datasets):
        """Return hmean(llava) - hmean(base) over the datasets both runs have."""
        llava = percent_scores(llava_model, datasets)
        base = percent_scores(base_model, datasets)
        # Aggregate both runs over the same datasets: a score that is missing
        # for only one run would otherwise skew the difference.
        shared = [name for name in datasets if name in llava and name in base]
        if not shared:
            return np.nan
        return hmean([llava[name] for name in shared]) - hmean([base[name] for name in shared])

    scores = []
    for model_type, mappings in model_mappings.items():
        for scale, llava_model in mappings['llava_original'].items():
            base_model = mappings['base_llm'][scale]
            if llava_model not in results or base_model not in results:
                continue
            scores.append((
                model_type,
                scale,
                score_change(llava_model, base_model, nlu),
                score_change(llava_model, base_model, nlg),
            ))
    return scores

# Compute (family, scale, NLU change, NLG change) for every model pair
scores = calculate_forgetting_and_performance(model_mappings, results)

# Display names for the scale keys of the Phi2, Vicuna1.5 and LLaMA2 families
scale_rename = {
    'phi2+3b': '3.0',
    '7b': '7.0'
}

# Swap in the display scale for those families and flip the sign of both
# deltas: the helper returns llava - base, whereas the table reports
# base - llava (a positive number is a drop relative to the base LLM).
scores = [
    (
        model_type,
        scale_rename.get(scale, scale) if model_type in ('Phi2', 'Vicuna1.5', 'LLaMA2') else scale,
        -nlu_forgetting,
        -nlg_forgetting,
    )
    for model_type, scale, nlu_forgetting, nlg_forgetting in scores
]

# Generate the LaTeX table
latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{Model Performance:} Avg. NLU and NLG Deltas of Each Mitigation Method}
  \\label{tab:nlu_nlg_performance}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|c|c|c}
     \\toprule
     \\textbf{Model Family} & \\textbf{Model Scale} & \\textbf{Avg. NLU $\\Delta \\downarrow$} & \\textbf{Avg. NLG $\\delta \\downarrow$} \\\\
     \\midrule
"""

# Number of rows per family, counted once up front for the \multirow spans
family_sizes = {}
for model_type, *_ in scores:
    family_sizes[model_type] = family_sizes.get(model_type, 0) + 1

table_lines = []
current_family = None
for model_type, scale, nlu_forgetting, nlg_forgetting in scores:
    if model_type != current_family:
        # A new family starts: separate it from the previous one and open
        # its \multirow label cell.
        if current_family is not None:
            table_lines.append("\\midrule\n")
        current_family = model_type
        table_lines.append(f"\\multirow{{{family_sizes[model_type]}}}{{*}}{{{model_type}}} \n")

    scale_display = "1.4B (Inst.)" if scale == '1.4-instruct' else f"{scale}B"
    table_lines.append(f"& {scale_display} & {nlu_forgetting:.2f} & {nlg_forgetting:.2f} \\\\\n")
latex_code += "".join(table_lines)

# The footer starts directly with \bottomrule: an empty line inside tabular is
# a paragraph break, which opens a new row and breaks the rule that follows.
latex_code += """     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)


\begin{table*}[h]
  \caption{\textbf{Model Performance:} Avg. NLU and NLG Deltas of Each Mitigation Method}
  \label{tab:nlu_nlg_performance}
  \centering
  \resizebox{\linewidth}{!}{
    \begin{tabular}{l|c|c|c}
     \toprule
     \textbf{Model Family} & \textbf{Model Scale} & \textbf{Avg. NLU $\Delta \downarrow$} & \textbf{Avg. NLG $\delta \downarrow$} \\
     \midrule
\multirow{6}{*}{Pythia} 
& 0.16B & 0.94 & 12.01 \\
& 0.41B & -1.19 & 8.62 \\
& 1.0B & -1.63 & 4.95 \\
& 1.4B & 0.55 & 8.07 \\
& 2.8B & 1.74 & 9.18 \\
& 1.4B (Inst.) & -1.20 & -1.01 \\
\midrule
\multirow{1}{*}{Phi2} 
& 3.0B & 2.60 & 4.39 \\
\midrule
\multirow{1}{*}{Vicuna1.5} 
& 7.0B & -0.98 & 2.04 \\
\midrule
\multirow{1}{*}{LLaMA2} 
& 7.0B & -2.15 & -0.43 \\

     \bottomrule
    \end{tabular}
  }
\end{table*}



In [8]:
import json
import numpy as np
from scipy.stats import hmean

# Small positive constant intended as a stand-in for zero scores
# (nothing in the visible code references it)
EPSILON = 1e-3

# Benchmark names, grouped by the capability they evaluate
nlu_datasets = [
    "wsc273",
    "winogrande",
    "arc_easy",
    "arc_challenge",
]
nlg_datasets = [
    "lambada_standard",
]
vl_datasets = [
    "vqa-v2",
    "textvqa-ocr",
    "textvqa-pure",
    "gqa",
]

# One row per model pair: (scale key, LLaVA run name, base-LLM run name).
# Keeping both run names on the same row guarantees that the two lookup
# tables built below share exactly the same scale keys.
_run_pairs = {
    'Pythia': [
        (0.16, "stage-final-llava-v15-pythia+160m", "reproduction-align-pythia+160m"),
        (0.41, "stage-final-llava-v15-pythia+410m", "reproduction-align-pythia+410m"),
        (1.0, "stage-final-llava-v15-pythia+1b", "reproduction-align-pythia+1b"),
        (1.4, "stage-final-llava-v15-pythia+1p4b", "reproduction-align-pythia+1p4b"),
        (2.8, "stage-final-llava-v15-pythia+2p8b", "reproduction-align-pythia+2p8b"),
        # Pythia 1.4B Instruct
        ('1.4-instruct', "stage-final-llava-v15-pythia+1p4b-instruct", "reproduction-align-pythia+1p4b-instruct"),
    ],
    'Phi2': [
        ('phi2+3b', "stage-final-llava-v15-phi2+3b-repeat", "reproduction-align-phi2+3b"),
    ],
    'Vicuna1.5': [
        ('7b', "reproduction-llava-v15+7b+stage-finetune+x7", "reproduction-llava-v15+7b+stage-align+x7"),
    ],
    'LLaMA2': [
        ('7b', "reproduction-llama2", "vila_base_llm"),
    ],
}

# Expand the rows into the nested {family: {role: {scale: run_name}}} lookup
model_mappings = {
    family: {
        'llava_original': {scale: llava_run for scale, llava_run, _ in pairs},
        'base_llm': {scale: base_run for scale, _, base_run in pairs},
    }
    for family, pairs in _run_pairs.items()
}

# Function to calculate NLU and NLG forgetting using harmonic mean
def calculate_forgetting_and_performance(model_mappings, results, *,
                                         nlu=None, nlg=None):
    """Compute the NLU and NLG score change of every LLaVA run vs. its base LLM.

    Each scale listed under ``'llava_original'`` is paired with the run of the
    same scale under ``'base_llm'``. Scores are multiplied by 100 and
    aggregated per dataset group with the harmonic mean.

    Args:
        model_mappings: ``{family: {'llava_original': {scale: run_name},
            'base_llm': {scale: run_name}}}``.
        results: ``{run_name: {dataset_name: score}}``.
        nlu: Optional list of NLU dataset names; defaults to the module-level
            ``nlu_datasets``.
        nlg: Optional list of NLG dataset names; defaults to the module-level
            ``nlg_datasets``.

    Returns:
        A list of ``(family, scale, nlu_forgetting, nlg_forgetting)`` tuples.
        Both values are ``llava - base``, so a negative value means the LLaVA
        run scores lower than its base LLM. A value is NaN when the two runs
        share no usable score. Pairs for which either run is absent from
        ``results`` are skipped.

    Raises:
        KeyError: If a scale under ``'llava_original'`` has no counterpart
            under ``'base_llm'``.
    """
    # Resolve the defaults at call time so the module-level lists stay the
    # single source of truth for callers that pass nothing.
    nlu = nlu_datasets if nlu is None else nlu
    nlg = nlg_datasets if nlg is None else nlg

    def percent_scores(run_name, datasets):
        """Return {dataset: score * 100} for datasets with a non-NaN score."""
        run_results = results[run_name]
        scaled = {name: run_results.get(name, np.nan) * 100 for name in datasets}
        return {name: value for name, value in scaled.items() if not np.isnan(value)}

    def score_change(llava_model, base_model, datasets):
        """Return hmean(llava) - hmean(base) over the datasets both runs have."""
        llava = percent_scores(llava_model, datasets)
        base = percent_scores(base_model, datasets)
        # Aggregate both runs over the same datasets: a score that is missing
        # for only one run would otherwise skew the difference.
        shared = [name for name in datasets if name in llava and name in base]
        if not shared:
            return np.nan
        return hmean([llava[name] for name in shared]) - hmean([base[name] for name in shared])

    scores = []
    for model_type, mappings in model_mappings.items():
        for scale, llava_model in mappings['llava_original'].items():
            base_model = mappings['base_llm'][scale]
            if llava_model not in results or base_model not in results:
                continue
            scores.append((
                model_type,
                scale,
                score_change(llava_model, base_model, nlu),
                score_change(llava_model, base_model, nlg),
            ))
    return scores

# Compute (family, scale, NLU change, NLG change) for every model pair
scores = calculate_forgetting_and_performance(model_mappings, results)

# Display names for the scale keys of the Phi2, Vicuna1.5 and LLaMA2 families
scale_rename = {
    'phi2+3b': '3.0',
    '7b': '7.0'
}

# Swap in the display scale for those families and flip the sign of both
# deltas: the helper returns llava - base, whereas the table reports
# base - llava (a positive number is a drop relative to the base LLM).
scores = [
    (
        model_type,
        scale_rename.get(scale, scale) if model_type in ('Phi2', 'Vicuna1.5', 'LLaMA2') else scale,
        -nlu_forgetting,
        -nlg_forgetting,
    )
    for model_type, scale, nlu_forgetting, nlg_forgetting in scores
]

# Generate the LaTeX table.
# The column spec has four entries to match the four header cells
# (family, scale, NLU delta, NLG delta).
latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{Model Performance:} Avg. NLU and NLG Deltas of Each Mitigation Method}
  \\label{tab:nlu_nlg_performance}
  \\centering
  \\begin{tabular}{llcc}
    \\toprule
    \\textbf{Model Family} & \\textbf{Model Scale} & \\textbf{Avg. NLU $\\Delta \\downarrow$} & \\textbf{Avg. NLG $\\delta \\downarrow$} \\\\
    \\midrule
"""

# Number of rows per family, counted once up front for the \multirow spans
family_sizes = {}
for model_type, *_ in scores:
    family_sizes[model_type] = family_sizes.get(model_type, 0) + 1

table_lines = []
current_family = None
for model_type, scale, nlu_forgetting, nlg_forgetting in scores:
    if model_type != current_family:
        # A new family starts: separate it from the previous one and open
        # its \multirow label cell.
        if current_family is not None:
            table_lines.append("    \\midrule\n")
        current_family = model_type
        table_lines.append(f"    \\multirow{{{family_sizes[model_type]}}}{{*}}{{{model_type}}} \n")

    scale_display = "1.4B (Inst.)" if scale == '1.4-instruct' else f"{scale}B"
    table_lines.append(f"    & {scale_display} & {nlu_forgetting:.2f} & {nlg_forgetting:.2f} \\\\\n")
latex_code += "".join(table_lines)

# The footer starts directly with \bottomrule: an empty line inside tabular is
# a paragraph break, which opens a new row and breaks the rule that follows.
latex_code += """    \\bottomrule
  \\end{tabular}
\\end{table*}
"""

print(latex_code)


\begin{table*}[h]
  \caption{\textbf{Model Performance:} Avg. NLU and NLG Deltas of Each Mitigation Method}
  \label{tab:nlu_nlg_performance}
  \centering
  \begin{tabular}{llccc}
    \toprule
    \textbf{Model Family} & \textbf{Model Scale} & \textbf{Avg. NLU $\Delta \downarrow$} & \textbf{Avg. NLG $\delta \downarrow$} \\
    \midrule
    \multirow{6}{*}{Pythia} 
    & 0.16B & 0.94 & 12.01 \\
    & 0.41B & -1.19 & 8.62 \\
    & 1.0B & -1.63 & 4.95 \\
    & 1.4B & 0.55 & 8.07 \\
    & 2.8B & 1.74 & 9.18 \\
    & 1.4B (Inst.) & -1.20 & -1.01 \\
    \midrule
    \multirow{1}{*}{Phi2} 
    & 3.0B & 2.60 & 4.39 \\
    \midrule
    \multirow{1}{*}{Vicuna1.5} 
    & 7.0B & -0.98 & 2.04 \\
    \midrule
    \multirow{1}{*}{LLaMA2} 
    & 7.0B & -2.15 & -0.43 \\

    \bottomrule
  \end{tabular}
\end{table*}



In [11]:
import json
import numpy as np
from scipy.stats import hmean

# Small positive constant intended as a stand-in for zero scores
# (nothing in the visible code references it)
EPSILON = 1e-3

# Benchmark names, grouped by the capability they evaluate
nlu_datasets = [
    "wsc273",
    "winogrande",
    "arc_easy",
    "arc_challenge",
]
nlg_datasets = [
    "lambada_standard",
]
# Every language benchmark, NLU first and NLG after
all_nlp_datasets = [*nlu_datasets, *nlg_datasets]

# One row per model pair: (scale key, LLaVA run name, base-LLM run name).
# Keeping both run names on the same row guarantees that the two lookup
# tables built below share exactly the same scale keys.
_run_pairs = {
    'Pythia': [
        (0.16, "stage-final-llava-v15-pythia+160m", "reproduction-align-pythia+160m"),
        (0.41, "stage-final-llava-v15-pythia+410m", "reproduction-align-pythia+410m"),
        (1.0, "stage-final-llava-v15-pythia+1b", "reproduction-align-pythia+1b"),
        (1.4, "stage-final-llava-v15-pythia+1p4b", "reproduction-align-pythia+1p4b"),
        (2.8, "stage-final-llava-v15-pythia+2p8b", "reproduction-align-pythia+2p8b"),
        # Pythia 1.4B Instruct
        ('1.4-instruct', "stage-final-llava-v15-pythia+1p4b-instruct", "reproduction-align-pythia+1p4b-instruct"),
    ],
    'Phi2': [
        ('phi2+3b', "stage-final-llava-v15-phi2+3b-repeat", "reproduction-align-phi2+3b"),
    ],
    'Vicuna1.5': [
        ('7b', "reproduction-llava-v15+7b+stage-finetune+x7", "reproduction-llava-v15+7b+stage-align+x7"),
    ],
    'LLaMA2': [
        ('7b', "reproduction-llama2", "vila_base_llm"),
    ],
}

# Expand the rows into the nested {family: {role: {scale: run_name}}} lookup
model_mappings = {
    family: {
        'llava_original': {scale: llava_run for scale, llava_run, _ in pairs},
        'base_llm': {scale: base_run for scale, _, base_run in pairs},
    }
    for family, pairs in _run_pairs.items()
}

# Function to calculate deltas for each dataset
def calculate_deltas(model_mappings, results, *, datasets=None):
    """Compute the per-dataset score drop of every LLaVA run vs. its base LLM.

    Each scale listed under ``'llava_original'`` is paired with the run of the
    same scale under ``'base_llm'``.

    Args:
        model_mappings: ``{family: {'llava_original': {scale: run_name},
            'base_llm': {scale: run_name}}}``.
        results: ``{run_name: {dataset_name: score}}``.
        datasets: Optional list of dataset names to report; defaults to the
            module-level ``all_nlp_datasets``.

    Returns:
        A list of ``(family, scale, deltas)`` tuples, where ``deltas`` maps
        each dataset name to ``base * 100 - llava * 100``. A positive delta
        means the LLaVA run scores lower than its base LLM; the delta is NaN
        when either run has no score for that dataset. Pairs for which either
        run is absent from ``results`` are skipped.

    Raises:
        KeyError: If a scale under ``'llava_original'`` has no counterpart
            under ``'base_llm'``.
    """
    # Resolve the default at call time so the module-level list stays the
    # single source of truth for callers that pass nothing.
    datasets = all_nlp_datasets if datasets is None else datasets

    scores = []
    for model_type, mappings in model_mappings.items():
        for scale, llava_model in mappings['llava_original'].items():
            base_model = mappings['base_llm'][scale]
            if llava_model not in results or base_model not in results:
                continue
            base_scores = results[base_model]
            llava_scores = results[llava_model]
            deltas = {
                dataset: (base_scores.get(dataset, np.nan) * 100
                          - llava_scores.get(dataset, np.nan) * 100)
                for dataset in datasets
            }
            scores.append((model_type, scale, deltas))
    return scores

# Compute (family, scale, {dataset: delta}) for every model pair
scores = calculate_deltas(model_mappings, results)

# Display names for the scale keys of the Phi2, Vicuna1.5 and LLaMA2 families
scale_rename = {
    'phi2+3b': '3.0',
    '7b': '7.0'
}

# Swap in the display scale for those families; other rows pass through as-is
scores = [
    (
        model_type,
        scale_rename.get(scale, scale) if model_type in ('Phi2', 'Vicuna1.5', 'LLaMA2') else scale,
        deltas,
    )
    for model_type, scale, deltas in scores
]

# Generate the LaTeX table.
# The column spec has seven entries to match the seven header cells
# (family, scale, and one column per dataset).
latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{Model Performance:} Deltas of Each Mitigation Method for NLU and NLG Datasets}
  \\label{tab:nlu_nlg_performance}
  \\centering
  \\begin{tabular}{llccccc}
    \\toprule
    \\textbf{Model Family} & \\textbf{Model Scale} & \\textbf{WSC273 $\\Delta \\downarrow$} & \\textbf{Winogrande $\\Delta \\downarrow$} & \\textbf{ARC Easy $\\Delta \\downarrow$} & \\textbf{ARC Challenge $\\Delta \\downarrow$} & \\textbf{Lambada $\\delta \\downarrow$} \\\\
    \\midrule
"""

# Number of rows per family, counted once up front for the \multirow spans
family_sizes = {}
for model_type, *_ in scores:
    family_sizes[model_type] = family_sizes.get(model_type, 0) + 1

table_lines = []
current_family = None
for model_type, scale, deltas in scores:
    if model_type != current_family:
        # A new family starts: separate it from the previous one and open
        # its \multirow label cell.
        if current_family is not None:
            table_lines.append("    \\midrule\n")
        current_family = model_type
        table_lines.append(f"    \\multirow{{{family_sizes[model_type]}}}{{*}}{{{model_type}}} \n")

    scale_display = "1.4B (Inst.)" if scale == '1.4-instruct' else f"{scale}B"
    # One cell per dataset, in the same order as the header columns
    delta_cells = "".join(f" & {deltas[dataset]:.2f} " for dataset in all_nlp_datasets)
    table_lines.append(f"    & {scale_display} " + delta_cells + " \\\\\n")
latex_code += "".join(table_lines)

# The footer starts directly with \bottomrule: an empty line inside tabular is
# a paragraph break, which opens a new row and breaks the rule that follows.
latex_code += """    \\bottomrule
  \\end{tabular}
\\end{table*}
"""

print(latex_code)


\begin{table*}[h]
  \caption{\textbf{Model Performance:} Deltas of Each Mitigation Method for NLU and NLG Datasets}
  \label{tab:nlu_nlg_performance}
  \centering
  \begin{tabular}{llcccccc}
    \toprule
    \textbf{Model Family} & \textbf{Model Scale} & \textbf{WSC273 $\Delta \downarrow$} & \textbf{Winogrande $\Delta \downarrow$} & \textbf{ARC Easy $\Delta \downarrow$} & \textbf{ARC Challenge $\Delta \downarrow$} & \textbf{Lambada $\delta \downarrow$} \\
    \midrule
    \multirow{6}{*}{Pythia} 
    & 0.16B  & 3.30  & -2.29  & 5.13  & -0.09  & 12.01  \\
    & 0.41B  & 4.03  & 0.47  & 1.68  & -2.30  & 8.62  \\
    & 1.0B  & 1.83  & -0.71  & 1.30  & -2.56  & 4.95  \\
    & 1.4B  & 3.30  & 0.32  & 0.76  & 0.00  & 8.07  \\
    & 2.8B  & 2.93  & -1.10  & 3.91  & 1.37  & 9.18  \\
    & 1.4B (Inst.)  & -5.49  & -3.63  & -3.70  & 1.11  & -1.01  \\
    \midrule
    \multirow{1}{*}{Phi2} 
    & 3.0B  & -1.47  & 0.16  & 2.95  & 4.86  & 4.39  \\
    \midrule
    \multirow{1}{*}{Vicuna1.5} 
    &