LLAVA case study

In [2]:
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import torch
import pandas as pd
from PIL import Image
from torch.amp import autocast

device = "cpu"
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-1.5-7b-hf").to(device)

  from .autonotebook import tqdm as notebook_tqdm
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
Loading checkpoint shards: 100%|██████████| 3/3 [01:24<00:00, 28.18s/it]


load dataset

In [3]:
train_dataset = pd.read_csv("generated_img_dataset/train.csv")
valid_dataset = pd.read_csv("generated_img_dataset/val.csv")
test_dataset = pd.read_csv("generated_img_dataset/test.csv")

In [6]:
test_dataset

Unnamed: 0,EM,EN,unicode,label,strategy,image
0,👔📈,big business,U+1F454 U+1F4C8,1,1,./generated_img_dataset/test_google/0.png
1,🏢🤑🤑,big business,U+1F3E2 U+1F911 U+1F911,1,1,./generated_img_dataset/test_google/1.png
2,👨‍💻🤝,big business,U+1F468 U+200D U+1F4BB U+1F91D,1,1,./generated_img_dataset/test_google/2.png
3,🏢🧑‍🤝‍🧑🧑‍🤝‍🧑🧑‍🤝‍🧑,big business,U+1F3E2 U+1F9D1 U+200D U+1F91D U+200D U+1F9D1 ...,1,1,./generated_img_dataset/test_google/3.png
4,👩‍💻🤑,big business,U+1F469 U+200D U+1F4BB U+1F911,1,1,./generated_img_dataset/test_google/4.png
...,...,...,...,...,...,...
488,👍👣,effective entrance,U+1F44D U+1F463,0,6,./generated_img_dataset/test_google/513.png
489,👏🪜,effective entrance,U+1F44F U+1FA9C,0,6,./generated_img_dataset/test_google/514.png
490,😤🗣️💬,effective entrance,U+1F624 U+1F5E3 U+FE0F U+1F4AC,0,6,./generated_img_dataset/test_google/515.png
491,💨🤬,effective entrance,U+1F4A8 U+1F92C,0,6,./generated_img_dataset/test_google/516.png


In [None]:
import torch
from PIL import Image
from tqdm.auto import tqdm

# --- Configuration ---
BATCH_SIZE = 8 # <<< START SMALL (e.g., 2 or 4) and increase if possible

# Helper function to yield batches from the DataFrame
def generate_batches(df, batch_size):
    for i in range(0, len(df), batch_size):
        yield df.iloc[i:min(i + batch_size, len(df))]

def batch_zero_shot_predict(batch_samples):
    """
    Given a batch of samples (a slice of your DataFrame),
    construct conversations, generate prompts, process images and text
    through the model to produce zero-shot predictions for the batch.
    Returns a tuple (list_of_predictions, list_of_generated_texts).
    """
    batch_prompts_structured = []
    batch_raw_images = []
    batch_indices_processed = [] # Keep track of which samples were successful

    # 1. Prepare batch data (prompts and images)
    for index, sample in batch_samples.iterrows():
        # Construct prompt message
        prompt_message = f"Does this emoji sequence mean '{sample['EN']}'? Answer yes or no."

        # Build the conversation structure for the processor template
        # Note: We still build one structure per sample before applying the template
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_message},
                    {"type": "image"},
                ],
            },
        ]
        batch_prompts_structured.append(conversation)

        # Load the image
        try:
            raw_image = Image.open(sample['image']).convert("RGB")
            batch_raw_images.append(raw_image)
            batch_indices_processed.append(index) # Store original index
        except Exception as e:
            print(f"Warning: Error loading image {sample['image']} for index {index}: {e}. Skipping this sample.")
            # Append placeholders to keep lists aligned for now, will filter later
            # Or handle differently if your processor/model is sensitive to None
            batch_raw_images.append(None) # Placeholder

    # Filter out samples where image loading failed
    valid_indices = [i for i, img in enumerate(batch_raw_images) if img is not None]
    if not valid_indices:
        print("Warning: No valid images found in this batch. Skipping.")
        # Return empty lists matching the expected output structure
        return [0] * len(batch_samples), ["<IMAGE_LOAD_ERROR>"] * len(batch_samples)

    # Keep only valid images and their corresponding structured prompts
    final_batch_raw_images = [batch_raw_images[i] for i in valid_indices]
    final_batch_prompts_structured = [batch_prompts_structured[i] for i in valid_indices]
    original_indices_for_valid = [batch_samples.index[i] for i in valid_indices] # Get original df index

    # 2. Apply chat template and tokenize prompts
    # We need to apply the template individually then tokenize as a batch
    # because apply_chat_template usually works on a single conversation.
    batch_final_prompts_text = []
    for conv in final_batch_prompts_structured:
         # Use add_generation_prompt=True for the model's turn
        prompt_text = processor.apply_chat_template(conv, add_generation_prompt=True)
        batch_final_prompts_text.append(prompt_text)

    # 3. Process the batch of images and text prompts
    # Use padding=True for batching text inputs
    inputs = processor(
        images=final_batch_raw_images,
        text=batch_final_prompts_text,
        return_tensors="pt",
        padding=True, # Crucial for batching text
        truncation=True # Good practice
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # 4. Generate outputs from the model for the batch
    generated_ids = None
    try:
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=50)
    except Exception as e:
         print(f"Error during model generation for a batch (possible OOM): {e}")
         # Handle error: return default predictions for this batch
         # Create placeholder results aligned with the original batch size
         batch_predictions = [0] * len(batch_samples)
         batch_generated_texts = ["<GENERATION_ERROR>"] * len(batch_samples)
         return batch_predictions, batch_generated_texts

    # Clean up GPU memory if applicable
    # del inputs
    # if torch.cuda.is_available():
    #     torch.cuda.empty_cache()

    # 5. Decode the generated tokens to text for the batch
    # Use batch_decode for efficiency
    batch_decoded_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # 6. Interpret results and map back to original batch structure
    batch_predictions_map = {}
    batch_generated_texts_map = {}
    for i, decoded_text in enumerate(batch_decoded_texts):
        cleaned_text = decoded_text.strip().lower()
        # The LLaVA output might include the prompt, need to isolate the answer
        # Find the last occurrence of the assistant's turn marker if apply_chat_template added one
        # Or simply check the end of the string if the template is simple
        # Example check (adapt based on your model's exact output format):
        answer_part = cleaned_text.split("assistant:")[-1] # separator

        prediction = 1 if "yes" in answer_part else 0
        original_index = original_indices_for_valid[i] # Map back using original df index
        batch_predictions_map[original_index] = prediction
        batch_generated_texts_map[original_index] = cleaned_text # Store full generated text

    # Create final lists in the original batch order, filling in defaults for skipped items
    
    final_predictions = []
    final_gen_texts = []
    for index in batch_samples.index:
        final_predictions.append(batch_predictions_map.get(index, 0)) # Default 0 if skipped/error
        final_gen_texts.append(batch_generated_texts_map.get(index, "<SKIPPED_OR_ERROR>"))

    return final_predictions, final_gen_texts

# --- Main prediction loop using batches ---
predictions = []
generated_texts = []
true_labels = test_dataset["label"].tolist()

# Create the batch generator with a progress bar
batch_generator = generate_batches(test_dataset, BATCH_SIZE)
num_batches = (len(test_dataset) + BATCH_SIZE - 1) // BATCH_SIZE

print(f"Starting prediction with batch size {BATCH_SIZE} on device {device}...")
for batch_df in tqdm(batch_generator, total=num_batches, desc="Processing Batches"):
    batch_preds, batch_gen_texts = batch_zero_shot_predict(batch_df)
    predictions.extend(batch_preds)
    generated_texts.extend(batch_gen_texts)

    # Optional: Print progress for the first item in the batch
    first_idx = batch_df.index[0]
    print(f"Batch starting row {first_idx}: EN = {batch_df.iloc[0]['EN']} -> Prediction: {batch_preds[0]}, Generated: {batch_gen_texts[0][:100]}...") # Print truncated generated text

print("Finished prediction.")

# from sklearn.metrics import accuracy_score, f1_score
# print(f"Accuracy: {accuracy_score(true_labels, predictions)}")
# print(f"F1 Macro: {f1_score(true_labels, predictions, average='macro')}")


Starting prediction with batch size 8 on device cpu...


Processing Batches:   0%|          | 0/62 [00:00<?, ?it/s]Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Processing Batches:   2%|▏         | 1/62 [11:29<11:40:53, 689.40s/it]

Batch starting row 0: EN = big business -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big business'? answer yes or no. assistant: yes....


Processing Batches:   3%|▎         | 2/62 [19:14<9:17:37, 557.62s/it] 

Batch starting row 8: EN = big expenditure -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big expenditure'? answer yes or no. assistant: yes....


Processing Batches:   5%|▍         | 3/62 [27:00<8:26:50, 515.44s/it]

Batch starting row 16: EN = big voice -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big voice'? answer yes or no. assistant: yes....


Processing Batches:   6%|▋         | 4/62 [34:42<7:57:56, 494.43s/it]

Batch starting row 24: EN = big group -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big group'? answer yes or no. assistant: yes....


Processing Batches:   8%|▊         | 5/62 [42:27<7:39:52, 484.08s/it]

Batch starting row 32: EN = big man -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big man'? answer yes or no. assistant: no....


Processing Batches:  10%|▉         | 6/62 [50:34<7:32:35, 484.92s/it]

Batch starting row 40: EN = big city -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big city'? answer yes or no. assistant: yes...


Processing Batches:  11%|█▏        | 7/62 [58:27<7:20:48, 480.89s/it]

Batch starting row 48: EN = big tipper -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big tipper'? answer yes or no. assistant: no....


Processing Batches:  13%|█▎        | 8/62 [1:06:13<7:08:39, 476.29s/it]

Batch starting row 56: EN = big day -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big day'? answer yes or no. assistant: yes....


Processing Batches:  15%|█▍        | 9/62 [1:13:57<6:57:21, 472.48s/it]

Batch starting row 64: EN = hot doll -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot doll'? answer yes or no. assistant: no....


Processing Batches:  16%|█▌        | 10/62 [1:21:53<6:50:18, 473.44s/it]

Batch starting row 72: EN = hot water -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot water'? answer yes or no. assistant: yes....


Processing Batches:  18%|█▊        | 11/62 [1:29:54<6:44:34, 475.97s/it]

Batch starting row 80: EN = hot stove -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot stove'? answer yes or no. assistant: yes....


Processing Batches:  19%|█▉        | 12/62 [1:37:40<6:33:56, 472.72s/it]

Batch starting row 88: EN = hot topic -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot topic'? answer yes or no. assistant: no...


Processing Batches:  21%|██        | 13/62 [1:45:36<6:26:50, 473.68s/it]

Batch starting row 96: EN = hot merchandise -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot merchandise'? answer yes or no. assistant: yes....


Processing Batches:  23%|██▎       | 14/62 [1:53:32<6:19:35, 474.49s/it]

Batch starting row 104: EN = hot argument -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot argument'? answer yes or no. assistant: no....


Processing Batches:  24%|██▍       | 15/62 [2:01:37<6:14:04, 477.55s/it]

Batch starting row 112: EN = hot forehead -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot forehead'? answer yes or no. assistant: no....


Processing Batches:  26%|██▌       | 16/62 [2:09:28<6:04:35, 475.55s/it]

Batch starting row 120: EN = full game -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'full game'? answer yes or no. assistant: no...


Processing Batches:  27%|██▋       | 17/62 [2:17:26<5:57:23, 476.52s/it]

Batch starting row 128: EN = full auditorium -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'full auditorium'? answer yes or no. assistant: no....


Processing Batches:  29%|██▉       | 18/62 [2:25:09<5:46:27, 472.45s/it]

Batch starting row 136: EN = full attention -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'full attention'? answer yes or no. assistant: no....


Processing Batches:  31%|███       | 19/62 [2:32:50<5:36:05, 468.96s/it]

Batch starting row 144: EN = full glass -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'full glass'? answer yes or no. assistant: yes....


Processing Batches:  32%|███▏      | 20/62 [2:42:28<5:51:14, 501.78s/it]

Batch starting row 152: EN = little boy -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'little boy'? answer yes or no. assistant: yes....


Processing Batches:  34%|███▍      | 21/62 [2:52:29<6:03:12, 531.52s/it]

Batch starting row 160: EN = little man -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'little man'? answer yes or no. assistant: no....


Processing Batches:  35%|███▌      | 22/62 [3:02:30<6:08:07, 552.19s/it]

Batch starting row 168: EN = little house -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'little house'? answer yes or no. assistant: yes....


Processing Batches:  37%|███▋      | 23/62 [3:12:34<6:09:09, 567.93s/it]

Batch starting row 176: EN = thin oil -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'thin oil'? answer yes or no. assistant: no....


Processing Batches:  39%|███▊      | 24/62 [3:22:25<6:04:05, 574.87s/it]

Batch starting row 184: EN = thin soup -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'thin soup'? answer yes or no. assistant: no....


Processing Batches:  40%|████      | 25/62 [3:32:32<6:00:25, 584.46s/it]

Batch starting row 192: EN = thin air -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'thin air'? answer yes or no. assistant: yes....


Processing Batches:  42%|████▏     | 26/62 [3:50:35<7:20:28, 734.12s/it]

Batch starting row 200: EN = thin line -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'thin line'? answer yes or no. assistant: no....


Processing Batches:  44%|████▎     | 27/62 [3:58:23<6:21:33, 654.10s/it]

Batch starting row 208: EN = ineffectual ruler -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'ineffectual ruler'? answer yes or no. assistant: yes....


Processing Batches:  45%|████▌     | 28/62 [4:06:21<5:40:47, 601.41s/it]

Batch starting row 216: EN = ineffectual therapy -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'ineffectual therapy'? answer yes or no. assistant: no....


Processing Batches:  47%|████▋     | 29/62 [4:14:16<5:09:49, 563.33s/it]

Batch starting row 224: EN = effective step -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'effective step'? answer yes or no. assistant: no....


Processing Batches:  48%|████▊     | 30/62 [4:22:08<4:45:50, 535.94s/it]

Batch starting row 232: EN = effective reprimand -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'effective reprimand'? answer yes or no. assistant: no....


Processing Batches:  50%|█████     | 31/62 [4:29:56<4:26:25, 515.65s/it]

Batch starting row 240: EN = effective entrance -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'effective entrance'? answer yes or no. assistant: no....


Processing Batches:  52%|█████▏    | 32/62 [4:37:59<4:12:50, 505.70s/it]

Batch starting row 248: EN = big business -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big business'? answer yes or no. assistant: no...


Processing Batches:  53%|█████▎    | 33/62 [4:45:59<4:00:46, 498.16s/it]

Batch starting row 256: EN = big expenditure -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big expenditure'? answer yes or no. assistant: no....


Processing Batches:  55%|█████▍    | 34/62 [4:53:46<3:48:04, 488.72s/it]

Batch starting row 264: EN = big voice -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big voice'? answer yes or no. assistant: no....


Processing Batches:  56%|█████▋    | 35/62 [5:01:26<3:35:59, 479.99s/it]

Batch starting row 272: EN = big group -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big group'? answer yes or no. assistant: no....


Processing Batches:  58%|█████▊    | 36/62 [5:08:59<3:24:36, 472.17s/it]

Batch starting row 280: EN = big man -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big man'? answer yes or no. assistant: no....


Processing Batches:  60%|█████▉    | 37/62 [5:16:58<3:17:29, 473.98s/it]

Batch starting row 288: EN = big city -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big city'? answer yes or no. assistant: yes....


Processing Batches:  61%|██████▏   | 38/62 [5:25:16<3:12:29, 481.22s/it]

Batch starting row 296: EN = big tipper -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big tipper'? answer yes or no. assistant: no....


Processing Batches:  63%|██████▎   | 39/62 [5:32:49<3:01:15, 472.83s/it]

Batch starting row 304: EN = big day -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'big day'? answer yes or no. assistant: no....


Processing Batches:  65%|██████▍   | 40/62 [5:40:30<2:52:06, 469.39s/it]

Batch starting row 312: EN = hot doll -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot doll'? answer yes or no. assistant: no....


Processing Batches:  66%|██████▌   | 41/62 [5:48:16<2:43:52, 468.23s/it]

Batch starting row 320: EN = hot water -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot water'? answer yes or no. assistant: no....


Processing Batches:  68%|██████▊   | 42/62 [5:55:53<2:35:00, 465.01s/it]

Batch starting row 328: EN = hot topic -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot topic'? answer yes or no. assistant: no...


Processing Batches:  69%|██████▉   | 43/62 [6:03:33<2:26:42, 463.30s/it]

Batch starting row 336: EN = hot merchandise -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot merchandise'? answer yes or no. assistant: no....


Processing Batches:  71%|███████   | 44/62 [6:10:55<2:17:06, 457.05s/it]

Batch starting row 344: EN = hot temper -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot temper'? answer yes or no. assistant: no....


Processing Batches:  73%|███████▎  | 45/62 [6:18:44<2:10:30, 460.61s/it]

Batch starting row 352: EN = hot argument -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot argument'? answer yes or no. assistant: no....


Processing Batches:  74%|███████▍  | 46/62 [6:26:20<2:02:28, 459.26s/it]

Batch starting row 360: EN = hot forehead -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'hot forehead'? answer yes or no. assistant: no....


Processing Batches:  76%|███████▌  | 47/62 [6:34:09<1:55:30, 462.04s/it]

Batch starting row 368: EN = full game -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'full game'? answer yes or no. assistant: no....


Processing Batches:  77%|███████▋  | 48/62 [6:42:13<1:49:23, 468.80s/it]

Batch starting row 376: EN = full auditorium -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'full auditorium'? answer yes or no. assistant: no....


Processing Batches:  79%|███████▉  | 49/62 [6:49:58<1:41:17, 467.47s/it]

Batch starting row 384: EN = full attention -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'full attention'? answer yes or no. assistant: no....


Processing Batches:  81%|████████  | 50/62 [6:57:54<1:33:59, 469.98s/it]

Batch starting row 392: EN = full life -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'full life'? answer yes or no. assistant: no....


Processing Batches:  82%|████████▏ | 51/62 [7:05:33<1:25:34, 466.79s/it]

Batch starting row 400: EN = little boy -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'little boy'? answer yes or no. assistant: no....


Processing Batches:  84%|████████▍ | 52/62 [7:13:09<1:17:14, 463.45s/it]

Batch starting row 408: EN = little man -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'little man'? answer yes or no. assistant: yes....


Processing Batches:  85%|████████▌ | 53/62 [7:20:55<1:09:40, 464.47s/it]

Batch starting row 416: EN = little house -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'little house'? answer yes or no. assistant: no....


Processing Batches:  87%|████████▋ | 54/62 [7:28:40<1:01:55, 464.40s/it]

Batch starting row 424: EN = thin oil -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'thin oil'? answer yes or no. assistant: no....


Processing Batches:  89%|████████▊ | 55/62 [7:38:39<58:53, 504.77s/it]  

Batch starting row 432: EN = thin soup -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'thin soup'? answer yes or no. assistant: no....


Processing Batches:  90%|█████████ | 56/62 [7:51:15<58:02, 580.39s/it]

Batch starting row 440: EN = thin air -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'thin air'? answer yes or no. assistant: no....


Processing Batches:  92%|█████████▏| 57/62 [8:04:01<52:59, 635.90s/it]

Batch starting row 448: EN = thin line -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'thin line'? answer yes or no. assistant: no...


Processing Batches:  94%|█████████▎| 58/62 [8:16:13<44:19, 664.89s/it]

Batch starting row 456: EN = ineffectual ruler -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'ineffectual ruler'? answer yes or no. assistant: no....


Processing Batches:  95%|█████████▌| 59/62 [8:28:24<34:13, 684.50s/it]

Batch starting row 464: EN = ineffectual therapy -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'ineffectual therapy'? answer yes or no. assistant: no....


Processing Batches:  97%|█████████▋| 60/62 [8:40:15<23:04, 692.46s/it]

Batch starting row 472: EN = effective step -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'effective step'? answer yes or no. assistant: no...


Processing Batches:  98%|█████████▊| 61/62 [8:52:56<11:53, 713.11s/it]

Batch starting row 480: EN = effective reprimand -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'effective reprimand'? answer yes or no. assistant: no....


Processing Batches: 100%|██████████| 62/62 [8:59:10<00:00, 521.79s/it]

Batch starting row 488: EN = effective entrance -> Prediction: 1, Generated: user:  
does this emoji sequence mean 'effective entrance'? answer yes or no. assistant: no....
Finished prediction.
Accuracy: 0.4949290060851927
F1 Macro: 0.3310719131614654





save predictions

In [11]:
generated_texts

["user:  \ndoes this emoji sequence mean 'big business'? answer yes or no. assistant: yes.",
 "user:  \ndoes this emoji sequence mean 'big business'? answer yes or no. assistant: yes.",
 "user:  \ndoes this emoji sequence mean 'big business'? answer yes or no. assistant: no.",
 "user:  \ndoes this emoji sequence mean 'big business'? answer yes or no. assistant: no.",
 "user:  \ndoes this emoji sequence mean 'big business'? answer yes or no. assistant: no.",
 "user:  \ndoes this emoji sequence mean 'big business'? answer yes or no. assistant: yes.",
 "user:  \ndoes this emoji sequence mean 'big business'? answer yes or no. assistant: no.",
 "user:  \ndoes this emoji sequence mean 'big business'? answer yes or no. assistant: no.",
 "user:  \ndoes this emoji sequence mean 'big expenditure'? answer yes or no. assistant: yes.",
 "user:  \ndoes this emoji sequence mean 'big expenditure'? answer yes or no. assistant: yes.",
 "user:  \ndoes this emoji sequence mean 'big expenditure'? answer ye

In [15]:
cleaned_generated_texts = [text.replace('\n', '') for text in generated_texts]
cleaned_generated_texts

["user:  does this emoji sequence mean 'big business'? answer yes or no. assistant: yes.",
 "user:  does this emoji sequence mean 'big business'? answer yes or no. assistant: yes.",
 "user:  does this emoji sequence mean 'big business'? answer yes or no. assistant: no.",
 "user:  does this emoji sequence mean 'big business'? answer yes or no. assistant: no.",
 "user:  does this emoji sequence mean 'big business'? answer yes or no. assistant: no.",
 "user:  does this emoji sequence mean 'big business'? answer yes or no. assistant: yes.",
 "user:  does this emoji sequence mean 'big business'? answer yes or no. assistant: no.",
 "user:  does this emoji sequence mean 'big business'? answer yes or no. assistant: no.",
 "user:  does this emoji sequence mean 'big expenditure'? answer yes or no. assistant: yes.",
 "user:  does this emoji sequence mean 'big expenditure'? answer yes or no. assistant: yes.",
 "user:  does this emoji sequence mean 'big expenditure'? answer yes or no. assistant: ye

In [None]:
test_dataset['predicted_label'] = predictions
test_dataset['generated_text'] = cleaned_generated_texts
test_dataset.to_csv("results.csv", index=False)

In [32]:
import pandas as pd

cleaned_generated_texts = pd.read_csv("results.csv")
cleaned_generated_texts["generated_text"][0]

"user:  does this emoji sequence mean 'big business'? answer yes or no. assistant: yes."

In [33]:
predictions2 = []
for text in cleaned_generated_texts["generated_text"]:
    parts = text.split("assistant:")
    assistant_response = parts[-1].strip() if len(parts) > 1 else text.strip()
    prediction = 1 if "yes" in assistant_response.lower() else 0

    print(parts, prediction)
    predictions2.append(prediction)

["user:  does this emoji sequence mean 'big business'? answer yes or no. ", ' yes.'] 1
["user:  does this emoji sequence mean 'big business'? answer yes or no. ", ' yes.'] 1
["user:  does this emoji sequence mean 'big business'? answer yes or no. ", ' no.'] 0
["user:  does this emoji sequence mean 'big business'? answer yes or no. ", ' no.'] 0
["user:  does this emoji sequence mean 'big business'? answer yes or no. ", ' no.'] 0
["user:  does this emoji sequence mean 'big business'? answer yes or no. ", ' yes.'] 1
["user:  does this emoji sequence mean 'big business'? answer yes or no. ", ' no.'] 0
["user:  does this emoji sequence mean 'big business'? answer yes or no. ", ' no.'] 0
["user:  does this emoji sequence mean 'big expenditure'? answer yes or no. ", ' yes.'] 1
["user:  does this emoji sequence mean 'big expenditure'? answer yes or no. ", ' yes.'] 1
["user:  does this emoji sequence mean 'big expenditure'? answer yes or no. ", ' yes.'] 1
["user:  does this emoji sequence mean 

In [34]:
from sklearn.metrics import accuracy_score, f1_score
true_labels = cleaned_generated_texts["label"]
print(f"Accuracy: {accuracy_score(true_labels, predictions2)}")
print(f"F1 Macro: {f1_score(true_labels, predictions2, average='macro')}")

Accuracy: 0.6734279918864098
F1 Macro: 0.6447824783283882


In [35]:
test_dataset = pd.read_csv("generated_img_dataset/test.csv")


In [36]:
test_dataset['predicted_label'] = predictions2
test_dataset['generated_text'] = cleaned_generated_texts["generated_text"]
test_dataset.to_csv("results2.csv", index=False)