In [3]:

import os
import json
import numpy as np

# Define the datasets of interest
datasets = [
    'vqa-v2', 'textvqa-ocr', 'textvqa-pure', 'gqa', 'refcoco',
    'wsc273', 'winogrande', 'lambada_standard', 'arc_easy', 'arc_challenge'
]

# Define the result dictionary
results_nlp = {}

# Path to the JSON results file
results_file = 'results_nlp.json'

# Read the JSON results file
if os.path.isfile(results_file):
    with open(results_file, 'r') as f:
        results_nlp = json.load(f)
else:
    print(f"Error: File {results_file} not found.")


In [11]:
import json
import numpy as np

# Load the results from the JSON file
with open('results_nlp.json') as f:
    results_dict = json.load(f)

# Define the models of interest and their corresponding baselines with labels
models_of_interest = {
    "stage-final-llava-v15-pythia+1p4b": ("reproduction-align-pythia+1p4b", "LLaVA + Pythia Instruct (1.4B)"),
    "stage-final-llava-v15-pythia+1p4b-instruct-old": ("reproduction-align-pythia+1p4b-instruct-old", "LLaVA + Pythia (1.4B)"),
    "reproduction-llava-v15+7b+stage-finetune+x7": ("reproduction-llava-v15+7b+stage-align+x7", "LLaVA + LLaMA2 Instruct (7B)"),
    "reproduction-llama2": ("vila_base_llm", "LLaVA + LLaMA2 Base (7B)")
}

# Function to format values or return "-"
def format_value(value):
    return "{:.1f}".format(value * 100) if not np.isnan(value) else "-"

# Prepare the data for the LaTeX tables
table_data = []

for model, (baseline, label) in models_of_interest.items():
    accuracies = results_dict[model]
    baseline_accuracies = results_dict[baseline]
    
    avg_acc_vl = sum(accuracies[dataset] for dataset in ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa"]) / 4
    
    nlu_deltas = {dataset: accuracies[dataset] - baseline_accuracies.get(dataset, 0) for dataset in ["wsc273", "winogrande", "arc_easy", "arc_challenge"]}
    avg_delta_nlu = sum(nlu_deltas[dataset] for dataset in nlu_deltas) / 4
    avg_acc_nlu = sum(accuracies[dataset] for dataset in nlu_deltas) / 4
    
    delta_nlg = accuracies["lambada_standard"] - baseline_accuracies.get("lambada_standard", 0)
    avg_acc_nlg = accuracies["lambada_standard"]
    
    table_data.append((label, accuracies, avg_acc_vl, avg_delta_nlu, avg_acc_nlu, delta_nlg, avg_acc_nlg))

# Sort the data by Avg. VL Accuracy and highest NLG Delta
table_data = sorted(table_data, key=lambda x: (x[2], -x[5]), reverse=True)

# Generate LaTeX table
latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{LLaVA Model Performance}}
  \\label{tab:model_performance}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cccc|c|cc|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{4}{c|}{\\textbf{Vision-Language (VL)}} & \\textbf{VL Avg.} & \\multicolumn{2}{c|}{\\textbf{NLU Avg.}} & \\multicolumn{2}{c}{\\textbf{NLG Avg.}} \\\\
     & \\textbf{VQAv2} & \\textbf{TextVQA OCR} & \\textbf{TextVQA Pure} & \\textbf{GQA} & Acc $\\uparrow$ & $\\Delta \\uparrow$ & Acc $\\uparrow$ & $\\Delta \\uparrow$ & Acc $\\uparrow$ \\\\
     \\midrule
"""

for label, accuracies, avg_acc_vl, avg_delta_nlu, avg_acc_nlu, delta_nlg, avg_acc_nlg in table_data:
    latex_code += "{label} & {vqa_v2} & {textvqa_ocr} & {textvqa_pure} & {gqa} & {avg_acc_vl} & {delta_nlu} & {avg_acc_nlu} & {delta_nlg} & {avg_acc_nlg} \\\\\n".format(
        label=label,
        vqa_v2=format_value(accuracies["vqa-v2"]),
        textvqa_ocr=format_value(accuracies["textvqa-ocr"]),
        textvqa_pure=format_value(accuracies["textvqa-pure"]),
        gqa=format_value(accuracies["gqa"]),
        avg_acc_vl=format_value(avg_acc_vl),
        delta_nlu=format_value(avg_delta_nlu),
        avg_acc_nlu=format_value(avg_acc_nlu),
        delta_nlg=format_value(delta_nlg),
        avg_acc_nlg=format_value(avg_acc_nlg)
    )

latex_code += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)


\begin{table*}[h]
  \caption{\textbf{LLaVA Model Performance}}
  \label{tab:model_performance}
  \centering
  \resizebox{\linewidth}{!}{
    \begin{tabular}{l|cccc|c|cc|cc}
     \toprule
     \textbf{Model} & \multicolumn{4}{c|}{\textbf{Vision-Language (VL)}} & \textbf{VL Avg.} & \multicolumn{2}{c|}{\textbf{NLU Avg.}} & \multicolumn{2}{c}{\textbf{NLG Avg.}} \\
     & \textbf{VQAv2} & \textbf{TextVQA OCR} & \textbf{TextVQA Pure} & \textbf{GQA} & Acc $\uparrow$ & $\Delta \uparrow$ & Acc $\uparrow$ & $\Delta \uparrow$ & Acc $\uparrow$ \\
     \midrule
LLaVA + LLaMA2 Base (7B) & 75.9 & 55.2 & 45.4 & 60.2 & 59.2 & 2.7 & 70.0 & 0.4 & 68.7 \\
LLaVA + LLaMA2 Instruct (7B) & 74.5 & 56.3 & 45.9 & 56.2 & 58.2 & 0.3 & 68.8 & -2.0 & 62.3 \\
LLaVA + Pythia Instruct (1.4B) & 66.2 & 38.5 & 35.5 & 46.1 & 46.6 & -1.1 & 53.0 & -8.1 & 40.9 \\
LLaVA + Pythia (1.4B) & 64.0 & 39.8 & 34.4 & 44.5 & 45.7 & -4.6 & 49.5 & -12.6 & 36.3 \\

     \bottomrule
    \end{tabular}
  }
\end{table*}



In [2]:
import json
import numpy as np

# Load the results from the JSON file
with open('results_nlp.json') as f:
    results_dict = json.load(f)

# Define the models of interest and their corresponding baselines with labels
models_of_interest = {
    "stage-final-llava-v15-pythia+1p4b": ("reproduction-align-pythia+1p4b", "LLaVA + Pythia Instruct (1.4B)"),
    "stage-final-llava-v15-pythia+1p4b-instruct": ("reproduction-align-pythia+1p4b-instruct", "LLaVA + Pythia (1.4B)"),
    "reproduction-llava-v15+7b+stage-finetune+x7": ("reproduction-llava-v15+7b+stage-align+x7", "LLaVA + LLaMA2 Instruct (7B)"),
    "reproduction-llama2": ("vila_base_llm", "LLaVA + LLaMA2 Base (7B)")
}

# Function to format values or return "-"
def format_value(value):
    return "{:.1f}".format(value * 100) if not np.isnan(value) else "-"

# Prepare the data for the LaTeX tables
table_data = []

for model, (baseline, label) in models_of_interest.items():
    accuracies = results_dict[model]
    baseline_accuracies = results_dict[baseline]
    
    avg_acc_vl = sum(accuracies[dataset] for dataset in ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa"]) / 4
    
    nlu_deltas = {dataset: accuracies[dataset] - baseline_accuracies.get(dataset, 0) for dataset in ["wsc273", "winogrande", "arc_easy", "arc_challenge"]}
    avg_delta_nlu = sum(nlu_deltas[dataset] for dataset in nlu_deltas) / 4
    avg_acc_nlu = sum(accuracies[dataset] for dataset in nlu_deltas) / 4
    
    delta_nlg = accuracies["lambada_standard"] - baseline_accuracies.get("lambada_standard", 0)
    avg_acc_nlg = accuracies["lambada_standard"]
    
    table_data.append((label, accuracies, avg_acc_vl, avg_delta_nlu, avg_acc_nlu, delta_nlg, avg_acc_nlg))

# Sort the data by Avg. VL Accuracy and highest NLG Delta
table_data = sorted(table_data, key=lambda x: (x[2], -x[5]), reverse=True)

# Generate LaTeX table
latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{LLaVA Model Performance}}
  \\label{tab:model_performance}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|cccc|c|cc|cc}
     \\toprule
     \\textbf{Model} & \\multicolumn{4}{c|}{\\textbf{Vision-Language (VL)}} & \\textbf{VL Avg.} & \\multicolumn{2}{c|}{\\textbf{NLU Avg.}} & \\multicolumn{2}{c}{\\textbf{NLG Avg.}} \\\\
     & \\textbf{VQAv2} & \\textbf{TextVQA OCR} & \\textbf{TextVQA Pure} & \\textbf{GQA} & Acc $\\uparrow$ & $\\Delta \\uparrow$ & Acc $\\uparrow$ & $\\Delta \\uparrow$ & Acc $\\uparrow$ \\\\
     \\midrule
"""

for label, accuracies, avg_acc_vl, avg_delta_nlu, avg_acc_nlu, delta_nlg, avg_acc_nlg in table_data:
    latex_code += "{label} & {vqa_v2} & {textvqa_ocr} & {textvqa_pure} & {gqa} & {avg_acc_vl} & {delta_nlu} & {avg_acc_nlu} & {delta_nlg} & {avg_acc_nlg} \\\\\n".format(
        label=label,
        vqa_v2=format_value(accuracies["vqa-v2"]),
        textvqa_ocr=format_value(accuracies["textvqa-ocr"]),
        textvqa_pure=format_value(accuracies["textvqa-pure"]),
        gqa=format_value(accuracies["gqa"]),
        avg_acc_vl=format_value(avg_acc_vl),
        delta_nlu=format_value(avg_delta_nlu),
        avg_acc_nlu=format_value(avg_acc_nlu),
        delta_nlg=format_value(delta_nlg),
        avg_acc_nlg=format_value(avg_acc_nlg)
    )

latex_code += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)


\begin{table*}[h]
  \caption{\textbf{LLaVA Model Performance}}
  \label{tab:model_performance}
  \centering
  \resizebox{\linewidth}{!}{
    \begin{tabular}{l|cccc|c|cc|cc}
     \toprule
     \textbf{Model} & \multicolumn{4}{c|}{\textbf{Vision-Language (VL)}} & \textbf{VL Avg.} & \multicolumn{2}{c|}{\textbf{NLU Avg.}} & \multicolumn{2}{c}{\textbf{NLG Avg.}} \\
     & \textbf{VQAv2} & \textbf{TextVQA OCR} & \textbf{TextVQA Pure} & \textbf{GQA} & Acc $\uparrow$ & $\Delta \uparrow$ & Acc $\uparrow$ & $\Delta \uparrow$ & Acc $\uparrow$ \\
     \midrule
LLaVA + LLaMA2 Base (7B) & 75.9 & 55.2 & 45.4 & 60.2 & 59.2 & 2.7 & 70.0 & 0.4 & 68.7 \\
LLaVA + LLaMA2 Instruct (7B) & 74.5 & 56.3 & 45.9 & 56.2 & 58.2 & 0.3 & 68.8 & -2.0 & 62.3 \\
LLaVA + Pythia (1.4B) & 66.5 & 39.1 & 34.3 & 46.9 & 46.7 & 2.9 & 49.1 & 1.0 & 34.8 \\
LLaVA + Pythia Instruct (1.4B) & 66.2 & 38.5 & 35.5 & 46.1 & 46.6 & -1.1 & 53.0 & -8.1 & 40.9 \\

     \bottomrule
    \end{tabular}
  }
\end{table*}



In [1]:
import json
import numpy as np
from scipy.stats import hmean
from collections import defaultdict

# Load the results from the JSON file
with open('results_nlp.json') as f:
    results_dict = json.load(f)

# Define the models of interest and their corresponding baselines with labels
models_of_interest = {
    "stage-final-llava-v15-pythia+1p4b": ("reproduction-align-pythia+1p4b", "LLaVA + Pythia (1.4B)"),
    "stage-final-llava-v15-pythia+1p4b-instruct": ("reproduction-align-pythia+1p4b-instruct", "LLaVA + Pythia Instruct (1.4B)"),
    "reproduction-llava-v15+7b+stage-finetune+x7": ("reproduction-llava-v15+7b+stage-align+x7", "LLaVA + LLaMA2 Instruct (7B)"),
    "reproduction-llama2": ("vila_base_llm", "LLaVA + LLaMA2 Base (7B)")
}

# Function to format values or return "-"
def format_value(value):
    return "{:.2f}".format(value * 100) if not np.isnan(value) else "-"

# Function to check if a model is instruction fine-tuned
def is_instruction_fine_tuned(label):
    return "Instruct" in label

# Prepare the data for the LaTeX tables
table_data = []

for model, (baseline, label) in models_of_interest.items():
    accuracies = results_dict[model]
    baseline_accuracies = results_dict[baseline]
    
    avg_acc_vl = hmean([accuracies[dataset] for dataset in ["vqa-v2", "textvqa-ocr", "textvqa-pure", "gqa"]])
    avg_acc_nl = hmean([accuracies[dataset] for dataset in ["wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard"]])
    avg_acc_nlu = hmean([accuracies[dataset] for dataset in ["wsc273", "winogrande", "arc_easy", "arc_challenge"]])
    avg_acc_nlg = accuracies["lambada_standard"]
    baseline_avg_nlu = hmean([baseline_accuracies[dataset] for dataset in ["wsc273", "winogrande", "arc_easy", "arc_challenge"]])
    baseline_avg_nlg = baseline_accuracies["lambada_standard"]
    baseline_avg_acc_nl = hmean([baseline_accuracies[dataset] for dataset in ["wsc273", "winogrande", "arc_easy", "arc_challenge", "lambada_standard"]])
    delta_nl = baseline_avg_acc_nl - avg_acc_nl
    delta_nlu = baseline_avg_nlu - avg_acc_nlu
    delta_nlg = baseline_avg_nlg - avg_acc_nlg
    print(f"Model {label}: Avg. VL = {avg_acc_vl*100:.2f}, Avg. NL = {avg_acc_nl*100:.2f}, Delta NL = {delta_nl*100:.2f}, Delta NLU = {delta_nlu*100:.2f}, Delta NLG = {delta_nlg*100:.2f}")
    
    table_data.append((label, is_instruction_fine_tuned(label), avg_acc_vl, avg_acc_nl, delta_nl))

# Sort the data by Avg. VL Accuracy and highest NL Delta
table_data = sorted(table_data, key=lambda x: (x[2], -x[4]), reverse=True)

# Generate LaTeX table
latex_code = """
\\begin{table*}[h]
  \\caption{\\textbf{LLaVA Model Performance}}
  \\label{tab:model_performance}
  \\centering
  \\resizebox{\\linewidth}{!}{
    \\begin{tabular}{l|c|c|cc}
     \\toprule
     \\textbf{Model} & \\textbf{Instr.} & \\multicolumn{1}{c|}{\\textbf{VL Avg.}} & \\multicolumn{2}{c}{\\textbf{NL Avg.}} \\\\
     & & \\textbf{Acc $\\uparrow$} & \\textbf{Acc $\\uparrow$} & \\textbf{Delta $\\uparrow$} \\\\
     \\midrule
"""

for label, instr, avg_acc_vl, avg_acc_nl, delta_nl in table_data:
    latex_code += "{label} & {instr} & {avg_acc_vl} & {avg_acc_nl} & {delta_nl} \\\\\n".format(
        label=label,
        instr="\\ding{51}" if instr else "\\ding{55}",
        avg_acc_vl=format_value(avg_acc_vl),
        avg_acc_nl=format_value(avg_acc_nl),
        delta_nl=format_value(delta_nl)
    )

latex_code += """
     \\bottomrule
    \\end{tabular}
  }
\\end{table*}
"""

print(latex_code)

Model LLaVA + Pythia (1.4B): Avg. VL = 43.97, Avg. NL = 45.51, Delta NL = 2.18, Delta NLU = 0.55, Delta NLG = 8.07
Model LLaVA + Pythia Instruct (1.4B): Avg. VL = 43.93, Avg. NL = 41.37, Delta NL = -1.16, Delta NLU = -1.20, Delta NLG = -1.01
Model LLaVA + LLaMA2 Instruct (7B): Avg. VL = 56.55, Avg. NL = 64.44, Delta NL = -0.36, Delta NLU = -0.98, Delta NLG = 2.04
Model LLaVA + LLaMA2 Base (7B): Avg. VL = 57.22, Avg. NL = 66.23, Delta NL = -1.84, Delta NLU = -2.15, Delta NLG = -0.43

\begin{table*}[h]
  \caption{\textbf{LLaVA Model Performance}}
  \label{tab:model_performance}
  \centering
  \resizebox{\linewidth}{!}{
    \begin{tabular}{l|c|c|cc}
     \toprule
     \textbf{Model} & \textbf{Instr.} & \multicolumn{1}{c|}{\textbf{VL Avg.}} & \multicolumn{2}{c}{\textbf{NL Avg.}} \\
     & & \textbf{Acc $\uparrow$} & \textbf{Acc $\uparrow$} & \textbf{Delta $\uparrow$} \\
     \midrule
LLaVA + LLaMA2 Base (7B) & \ding{55} & 57.22 & 66.23 & -1.84 \\
LLaVA + LLaMA2 Instruct (7B) & \ding{51} & 

In [8]:
# Load a ViT-L/14@336px model
import torch
from transformers import ViTFeatureExtractor, ViTForImageClassification

# Load the ViT model. It needs to be the ViT-L/14@336px model: clip-vit-l-336px
model = ViTForImageClassification.from_pretrained("openai/clip-vit-large-patch14-336")
feature_extractor = ViTFeatureExtractor.from_pretrained("openai/clip-vit-large-patch14-336")





You are using a model of type clip to instantiate a model of type vit. This is not supported for all configurations of models and can yield errors.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at openai/clip-vit-large-patch14-336 and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.cls_token', 'embeddings.patch_embeddings.projection.bias', 'embeddings.patch_embeddings.projection.weight', 'embeddings.position_embeddings', 'encoder.layer.0.attention.attention.key.bias', 'encoder.layer.0.attention.attention.key.weight', 'encoder.layer.0.attention.attention.query.bias', 'encoder.layer.0.attention.attention.query.weight', 'encoder.layer.0.attention.attention.value.bias', 'encoder.layer.0.attention.attention.value.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.l

In [10]:
model

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7