In [5]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from utils.load_env_vars import load_env

load_env()

from torch.utils.data import DataLoader
from data.dataloader import CyListSceneUnderstandingDataset, CyListVQADataset
from judge.judge_utils import query_sglang_llama3, query_sglang_llama3_batched
import json 
import re

from tqdm import tqdm


-----------------------------
Environment variables loaded:
HF_HOME /pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/hfcache
HUGGINGFACE_HUB_CACHE /pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/hfcache
WANDB_PROJECT llava-changes
TOKENIZERS_PARALLELISM true
-----------------------------


# load samples to check against the judge

In [8]:
# load txt file

with open('/pfss/mlde/workspaces/mlde_wsp_Multimodal_on_42/CycliST/Cyclist/assets/prompts/vqa_prompt_counting.txt', 'r') as f:
    lines = f.readlines()
    judge_prompt = ''.join(lines)
    print(judge_prompt)

Hey, can you judge this prediction for my visual question answering dataset? The question is about counting objects with certain properties and the answer is a numbers. 

Tell me if the model predicted the correct number. Give a score of 1 if the model predicted the groundtruth and 0 if the number is not the same as the groundtruth.


return a json object with the following format:
{{
    "prediction": "There is one cycling object.",
    "groundtruth": "1",
    "score": 1
    "type": "TP"
}}

Prediction:
{}

GT:
{}



In [4]:
import pandas as pd

# load csv file with

csv_paths =[ "/pfss/mlde/workspaces/mlde_wsp_Multimodal_on_42/CycliST/Cyclist/output/eval/unicycle_count_answer.csv"]

for csv_path in csv_paths:
    print("Evaluating", csv_path)
    # load csv
    df = pd.read_csv(csv_path, sep=";", names=['query', 'answer','pred'])
    #display(df.head())

    types = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
    score = 0
    evaluated = 0


    for idx, row in enumerate(df.iterrows()):
        if idx > 4:
            break
        # question = row[1]
        pred = row[1][1]
        answer = row[1][2]
        print("PRED",pred)
        print("ANSWER",answer)

        
        response = query_sglang_llama3(pred, answer, judge_prompt=judge_prompt)
        json_response = re.search(r'{(.|\r?\n)*}', response).group()
        json_response = json.loads(json_response)
        print(json_response)
        if json_response['prediction'] == "false" and json_response['groundtruth'] == "false":
            types["TN"] += 1
        elif json_response['prediction'] == "true" and json_response['groundtruth'] == "true":
            types["TP"] += 1
        elif json_response['prediction'] == "true" and json_response['groundtruth'] == "false":
            types["FP"] += 1
        elif json_response['prediction'] == "false" and json_response['groundtruth'] == "true":
            types["FN"] += 1
        else:
            print("ERROR")
        #types[json_response['type']] += 1
        #if str(json_response['score']) == "1":
        #    score += 1
        evaluated += 1
        print("---------------------------------")

    print("Accuracy", score/evaluated)
    print(types)

Evaluating /pfss/mlde/workspaces/mlde_wsp_Multimodal_on_42/CycliST/Cyclist/output/eval/unicycle_count_answer.csv
PRED One object, the golden sphere, is changing its xy coordinates.
ANSWER 1
Hey, can you judge this prediction for my visual question answering dataset? The Answers can be given as a whole sentence or simple yes/no or attributes like 'color' or 'shape' or 'size' etc.

Tell me if the have a true positive, false positive, true negative or false negative. 
A true positive is when the prediction is true and the ground truth is true, a false positive is when the prediction is true and the ground truth is false, a true negative is when the prediction is false and the ground truth is false and a false negative is when the prediction is false and the GT is true.
Put this into the json as shown above. Use the following format for this: true positive, false positive, true negative, false negative.

some examples are:
the model says 'red' and the ground truth is 'red' -> TP
the model 

KeyboardInterrupt: 

{'FP': 0, 'TP': 0, 'FN': 2, 'TN': 3}

# judge one sample at a time

In [6]:
import pandas as pd

# load csv file with

csv_paths =[ "/pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/data/answers/llava_ov_unicycle_fps_8/enlarge_low_answers.csv",
            "/pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/data/answers/llava_ov_unicycle_fps_8/linear_low_answers.csv",
            "/pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/data/answers/llava_ov_unicycle_fps_8/orbit_low_answers.csv",
            "/pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/data/answers/llava_ov_unicycle_fps_8/recolor_low_answers.csv",
            "/pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/data/answers/llava_ov_unicycle_fps_8/rotation_low_answers.csv"]

for csv_path in csv_paths:
    print("Evaluating", csv_path)
    # load csv
    df = pd.read_csv(csv_path, sep=";", names=['query', 'answer','pred'])
    #display(df.head())

    types = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
    score = 0
    evaluated = 0


    for idx, row in enumerate(tqdm(df.iterrows(), total=len(df))):
        if idx > 50:
            break
        # question = row[1]
        pred = row[1][1]
        answer = row[1][2]

        #print(pred, answer)
        
        response = query_sglang_llama3(pred, answer)
        json_response = re.search(r'{(.|\r?\n)*}', response).group()
        json_response = json.loads(json_response)
        # print(json_response['score'], json_response['type'])
        types[json_response['type']] += 1
        if str(json_response['score']) == "1":
            score += 1
        evaluated += 1

    print("Accuracy", score/evaluated)
    print(types)
                        

Evaluating /pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/data/answers/llava_ov_unicycle_fps_8/enlarge_low_answers.csv


  pred = row[1][1]
  answer = row[1][2]
  4%|▍         | 31/824 [02:25<1:02:01,  4.69s/it]


Accuracy 0.4838709677419355
{'FP': 3, 'TP': 15, 'FN': 13, 'TN': 0}
Evaluating /pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/data/answers/llava_ov_unicycle_fps_8/linear_low_answers.csv


  pred = row[1][1]
  answer = row[1][2]
  3%|▎         | 31/961 [02:32<1:16:21,  4.93s/it]


Accuracy 0.8064516129032258
{'FP': 2, 'TP': 25, 'FN': 4, 'TN': 0}
Evaluating /pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/data/answers/llava_ov_unicycle_fps_8/orbit_low_answers.csv


  pred = row[1][1]
  answer = row[1][2]
  3%|▎         | 31/1114 [02:27<1:25:52,  4.76s/it]


Accuracy 0.5806451612903226
{'FP': 1, 'TP': 18, 'FN': 12, 'TN': 0}
Evaluating /pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/data/answers/llava_ov_unicycle_fps_8/recolor_low_answers.csv


  pred = row[1][1]
  answer = row[1][2]
  3%|▎         | 31/1063 [02:05<1:09:26,  4.04s/it]


Accuracy 0.6451612903225806
{'FP': 2, 'TP': 20, 'FN': 9, 'TN': 0}
Evaluating /pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/data/answers/llava_ov_unicycle_fps_8/rotation_low_answers.csv


  pred = row[1][1]
  answer = row[1][2]
  3%|▎         | 31/975 [02:21<1:11:40,  4.56s/it]

Accuracy 0.7419354838709677
{'FP': 2, 'TP': 23, 'FN': 6, 'TN': 0}





# batched processing

In [8]:
import openai
client = openai.Client(
    base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")



judge_prompt= """
Hey, can you judge this prediction for my visual question answering dataset? The Answers can be given as a whole sentence or simple yes/no or attributes like 'color' or 'shape' or 'size' etc.

all forms of false like 'no', 'false', '0' should be mapped into false
all forms of true like 'yes', 'true', '1' are the same and should be mapped into false

Can you  map me the prediction and answer into the following format:
{{
    "prediction": "<true, false>",
    "groundtruth": "<true, false>"
}}

Prediction:
{}

GT:
{}
"""


In [24]:
import pandas as pd
import numpy as np
from openai import OpenAI
import time



# load csv file with
base_path = "/pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/data/answers/"
cycles = ["enlarge", "linear", "orbit", "recolor", "rotation"]
clutter = ["low"]


last_idx =0
results_all_cycles = []
for cyc in cycles:
    for clut in clutter:
        csv_path = base_path + f"llava_ov_unicycle_fps_8/{cyc}_{clut}_answers.csv"
        client = OpenAI(base_url=f"http://127.0.0.1:30000/v1", api_key="None")


        print("Evaluating", csv_path)
        # load csv
        df = pd.read_csv(csv_path, sep=";", names=['query', 'answer','pred'])
        #display(df.head())

        types = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
        score = 0
        evaluated = 0

        
        #create batch json to send to llm
        batch_size = len(df)
        for chunk in np.array_split(df, int(len(df)/batch_size)):

            pred = chunk['pred']
            answer = chunk['answer']
            
            requests = []
            for idx, (p, a) in enumerate(zip(pred, answer)):
                requests.append(
                    {
                        "custom_id": "request-{}".format(idx+ last_idx),
                        "method": "POST",
                        "url": "/chat/completions",
                        "body": {
                            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
                            "messages": [
                                {"role": "system", "content": "You are a helpful AI assistant helping me score the predictions of a model."},
                                {"role": "user",
                                "content": judge_prompt.format(p, a)},
                            ],
                            "max_tokens": 200,
                        },
                    }
                )

            input_file_path = "batch_requests_{}_{}.jsonl".format(cyc, clut)

            # Write the requests to a file
            with open(input_file_path, "w") as f:
                for req in requests:
                    f.write(json.dumps(req) + "\n")
                    
            # Send the file to OpenAI
            with open(input_file_path, "rb") as f:
                file_response = client.files.create(file=f, purpose="batch")
                
            batch_response = client.batches.create(
            input_file_id=file_response.id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            )

            print(f"Batch job created with ID: {batch_response.id}")
            
            # check if batch was completed
            while batch_response.status not in ["completed", "failed", "cancelled"]:
                time.sleep(3)
                print(f"Batch job status: {batch_response.status}...trying again in 5 seconds...")
                batch_response = client.batches.retrieve(batch_response.id)

            # if batch was completed, get the results
            if batch_response.status == "completed":
                print("Batch job completed successfully!")
                print(f"Request counts: {batch_response.request_counts}")

                print("loading data from file", batch_response.output_file_id)
                result_file_id = batch_response.output_file_id
                file_response = client.files.content(result_file_id)
                result_content = file_response.read().decode("utf-8")

                results = [
                    json.loads(line) for line in result_content.split("\n") if line.strip() != ""
                ]
                results_all_cycles.append(results)

                # for result in results:
                #     print(f"Request {result['custom_id']}:")
                #     # print(f"Response: {result['response']}")
                #     print(result['response']['body']['choices']['message']['content'])

                print("Cleaning up files...")
                # Only delete the result file ID since file_response is just content
                client.files.delete(result_file_id)
            else:
                print(f"Batch job failed with status: {batch_response.status}")
                if hasattr(batch_response, "errors"):
                    print(f"Errors: {batch_response.errors}")
            last_idx += idx
        

Evaluating /pfss/mlde/workspaces/mlde_wsp_PI_Kersting/LLaVA-cake/Cyclist/data/answers/llava_ov_unicycle_fps_8/enlarge_low_answers.csv
Batch job created with ID: batch_7d99c823-5584-4a65-b2f6-bfeb10f15912
Batch job status: validating...trying again in 5 seconds...
Batch job status: in_progress...trying again in 5 seconds...
Batch job status: in_progress...trying again in 5 seconds...
Batch job status: in_progress...trying again in 5 seconds...
Batch job status: in_progress...trying again in 5 seconds...
Batch job status: in_progress...trying again in 5 seconds...
Batch job status: in_progress...trying again in 5 seconds...
Batch job status: in_progress...trying again in 5 seconds...
Batch job status: in_progress...trying again in 5 seconds...
Batch job status: in_progress...trying again in 5 seconds...
Batch job status: in_progress...trying again in 5 seconds...
Batch job status: in_progress...trying again in 5 seconds...
Batch job status: in_progress...trying again in 5 seconds...
Batc

In [39]:
for results, cycle in zip(results_all_cycles, cycles):
    score = 0
    evaluated = 0
    types = {"FP": 0, "TP": 0, "FN": 0, "TN": 0}
    for result in results:
        # print(f"Request {result['custom_id']}:")
        # print(f"Response: {result['response']}")
        response = result['response']['body']['choices']['message']['content']

        try:
            #parse json from response
            # json_response = re.search(r'{(.|\r?\n)*}', response).group()
            # json_response = json.loads(json_response)
            # # print(json_response['score'], json_response['type'])
            # types[json_response['type']] += 1
            # if str(json_response['score']) == "1":
            #     score += 1
            # evaluated += 1
            
            
            json_response = re.search(r'{(.|\r?\n)*}', response).group()
            json_response = json.loads(json_response)
            if json_response['prediction'] == "false" and json_response['groundtruth'] == "false":
                types["TN"] += 1
                score += 1
            elif json_response['prediction'] == "true" and json_response['groundtruth'] == "true":
                types["TP"] += 1
                score += 1
            elif json_response['prediction'] == "true" and json_response['groundtruth'] == "false":
                types["FP"] += 1
            elif json_response['prediction'] == "false" and json_response['groundtruth'] == "true":
                types["FN"] += 1
            else:
                continue
                #print("ERROR")
            #types[json_response['type']] += 1
            #if str(json_response['score']) == "1":
            #    score += 1
            evaluated += 1
            # print("---------------------------------")

        except:
            continue
    print("--------")
    print("Cycle", cycle)
    print(types)
    precision = types["TP"] / (types["TP"] + types["FP"])
    recall = types["TP"] / (types["TP"] + types["FN"])
    f1 = 2 * (precision * recall) / (precision + recall)
    print("Accuracy", score/evaluated)
    print("Precision", precision)
    print("Recall", recall)
    print("F1", f1)
    

--------
Cycle enlarge
{'FP': 192, 'TP': 214, 'FN': 33, 'TN': 380}
Accuracy 0.7252747252747253
Precision 0.5270935960591133
Recall 0.8663967611336032
F1 0.6554364471669218
--------
Cycle linear
{'FP': 205, 'TP': 234, 'FN': 52, 'TN': 419}
Accuracy 0.7175824175824176
Precision 0.5330296127562643
Recall 0.8181818181818182
F1 0.6455172413793104
--------
Cycle orbit
{'FP': 163, 'TP': 405, 'FN': 54, 'TN': 428}
Accuracy 0.7933333333333333
Precision 0.7130281690140845
Recall 0.8823529411764706
F1 0.7887049659201557
--------
Cycle recolor
{'FP': 184, 'TP': 374, 'FN': 67, 'TN': 427}
Accuracy 0.7614068441064639
Precision 0.6702508960573477
Recall 0.8480725623582767
F1 0.7487487487487489
--------
Cycle rotation
{'FP': 194, 'TP': 289, 'FN': 29, 'TN': 459}
Accuracy 0.7703398558187435
Precision 0.598343685300207
Recall 0.9088050314465409
F1 0.7215980024968789


In [32]:
len(results_all_cycles[0])

824