## Experiment 2 LLaMA3.2: Image-to-Text Matching

Set-up

In [None]:
from huggingface_hub import login
import torch

login()

HF_TOKEN = "your_huggingface_token_here"  # Replace with your Hugging Face token

# This will be removed in the final version - LLaMA requires access from Meta
hf_key = HF_TOKEN

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import torch.nn.functional as F
import random
import pandas as pd
import os
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration
from tqdm import tqdm

Label selection

In [3]:
possible_labels_s_r = [
    "looloo", "loolah", "looloh", "loonoo", "loonah", "loonoh", "loomoo", "loomah", "loomoh",
    "lahloo", "lahlah", "lahloh", "lahnoo", "lahnah", "lahnoh", "lahmoo", "lahmah", "lahmoh",
    "lohloo", "lohlah", "lohloh", "lohnoo", "lohnah", "lohnoh", "lohmoo", "lohmah", "lohmoh",
    "nooloo", "noolah", "nooloh", "noonoo", "noonah", "noonoh", "noomoo", "noomah", "noomoh",
    "nahloo", "nahlah", "nahloh", "nahnoo", "nahnah", "nahnoh", "nahmoo", "nahmah", "nahmoh",
    "nohloo", "nohlah", "nohloh", "nohnoo", "nohnah", "nohnoh", "nohmoo", "nohmah", "nohmoh",
    "mooloo", "moolah", "mooloh", "moonoo", "moonah", "moonoh", "moomoo", "moomah", "moomoh",
    "mahloo", "mahlah", "mahloh", "mahnoo", "mahnah", "mahnoh", "mahmoo", "mahmah", "mahmoh",
    "mohloo", "mohlah", "mohloh", "mohnoo", "mohnah", "mohnoh", "mohmoo", "mohmah", "mohmoh"
]

possible_labels_p_nr = [
    "teetee", "teetuh", "teetay", "teekee", "teekuh", "teekay", "teepee", "teepuh", "teepay",
    "tuhtee", "tuhtuh", "tuhtay", "tuhkee", "tuhkuh", "tuhkay", "tuhpee", "tuhpuh", "tuhpay",
    "taytee", "taytuh", "taytay", "taykee", "taykuh", "taykay", "taypee", "taypuh", "taypay",
    "keetee", "keetuh", "keetay", "keekee", "keekuh", "keekay", "keepee", "keepuh", "keepay",
    "kuhtee", "kuhtuh", "kuhtay", "kuhkee", "kuhkuh", "kuhkay", "kuhpee", "kuhpuh", "kuhpay",
    "kaytee", "kaytuh", "kaytay", "kaykee", "kaykuh", "kaykay", "kaypee", "kaypuh", "kaypay",
    "peetee", "peetuh", "peetay", "peekee", "peekuh", "peekay", "peepee", "peepuh", "peepay",
    "puhtee", "puhtuh", "puhtay", "puhkee", "puhkuh", "puhkay", "puhpee", "puhpuh", "puhpay",
    "paytee", "paytuh", "paytay", "paykee", "paykuh", "paykay", "paypee", "paypuh", "paypay"
]

possible_labels_s_r = random.sample(possible_labels_s_r, 10)
possible_labels_p_nr = random.sample(possible_labels_p_nr, 10)

final_list = possible_labels_s_r + possible_labels_p_nr

print(final_list)

['mahnoo', 'lahlah', 'loonah', 'moomoo', 'nooloo', 'noonoh', 'moolah', 'lahmoo', 'lohmah', 'mahlah', 'tuhkuh', 'kuhkuh', 'teekuh', 'paytee', 'taytuh', 'peepay', 'puhkuh', 'tuhkay', 'puhpee', 'taypuh']


In [None]:
class ImageTextMatcher:
    def __init__(self, image_folder="images"):
        """
        Initialize the analyzer with the path to the image folder.
        
        Args:
            image_folder: Path to the folder containing images
        """
        self.image_folder = image_folder
        self.results = []
        self.models = {}
        
    def load_model(self, model_name):
        """
        Load a multimodal model.
        
        Args:
            model_name: Name of the model to load ('llama')
        """
        if model_name == 'llama':
            processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")
            model = MllamaForConditionalGeneration.from_pretrained(
                "meta-llama/Llama-3.2-11B-Vision-Instruct",
                torch_dtype=torch.float16,
                device_map="auto"
            )
            self.models['llama'] = {'model': model, 'processor': processor}
        else:
            raise ValueError(f"Unsupported model: {model_name}, Model not Found.")

    def classify_image(self, image_path, possible_labels):
        """
        Classify the image based on possible labels and calculate confidence score.
        
        Args:
            image_path: Path to the image file
            possible_labels: List of possible labels (S-R or P-NR)
            
        Returns:
            Predicted label and confidence score
        """
        model_info = self.models['llama']
        model = model_info['model']
        processor = model_info['processor']

        image = Image.open(image_path).convert('RGB')
        
        # Prompt setup
        messages = [
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": f"You are given an image for which you need to assign a label. Use one of the following labels: {possible_labels}. Only respond with the label."}
            ]}
        ]

        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

        inputs = processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt"
        ).to(model.device)
        
        # Generate classification
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=5,
                output_scores=True,
                return_dict_in_generate=True,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                top_k=40
            )

            generated_token_ids = outputs.sequences[0][inputs['input_ids'].size(1):] # Only new tokens
            scores = outputs.scores # Logits for each new token

            token_probs = []
            for i, token_id in enumerate(generated_token_ids):
                logits = scores[i] # Logits for i-th token
                probs = F.softmax(logits, dim=-1)
                token_prob = probs[0, token_id]
                token_probs.append(token_prob.item())

            # Final predicted text (after decoding tokens)
            predicted_text = processor.tokenizer.decode(generated_token_ids, skip_special_tokens=True).strip()

            # Confidence score (mean of token probabilities)
            confidence_score = sum(token_probs) / len(token_probs)

            print(f"Predicted text: {predicted_text}")
            print(f"Token probabilities: {[round(p, 4) for p in token_probs]}")
            print(f"Mean confidence: {confidence_score:.4f}")

            return predicted_text, confidence_score

    def prepare_dataset_for_classification(self, image_paths=None):
        """
        Prepare dataset for image classification.
        
        Args:
            image_paths: List of paths to images (if None, scan the image folder)
            
        Returns:
            List of image paths
        """
        # If image paths are not provided, scan the image folder
        if image_paths is None:
            image_paths = []
            for filename in os.listdir(self.image_folder):
                if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
                    image_paths.append(os.path.join(self.image_folder, filename))
        
        return image_paths
    
    def classify_dataset(self, model_name, image_paths, possible_labels_s_r, possible_labels_p_nr):
        """
        Classify a dataset of images using S-R and P-NR labels and calculate confidence scores.
        
        Args:
            model_name: Name of the model to use
            image_paths: List of image paths to classify
            possible_labels_s_r: Labels for sonorant+rounded pseudowords
            possible_labels_p_nr: Labels for plosief+non-rounded pseudowords
            
        Returns:
            DataFrame with classification results
        """
        if model_name not in self.models:
            self.load_model(model_name)
        
        classification_results = []

        # Classify images in batches of 4
        batch_size = 4
        for i in range(0, len(image_paths), batch_size):
            batch = image_paths[i:i+batch_size]
            
            for image_path in tqdm(batch, desc=f"Classifying images {i}-{i+len(batch)}"):
                print("batch number: ", i)
                try:
                    # First classify with S-R labels
                    predicted_class_s_r, score_s_r = self.classify_image(image_path, possible_labels_s_r)
                    
                    # Then classify with P-NR labels
                    predicted_class_p_nr, score_p_nr = self.classify_image(image_path, possible_labels_p_nr)

                    filename = os.path.basename(image_path)
                    image_type = 'Unknown'
                    if 'curved' in filename.lower():
                        image_type = 'Curved'
                    elif 'jagged' in filename.lower():
                        image_type = 'Jagged'
                                
                    # Store result with confidence scores
                    classification_results.append({
                        'image_path': image_path,
                        'image_filename': os.path.basename(image_path),
                        'image_type': image_type,
                        'predicted_class_s_r': predicted_class_s_r,
                        'score_s_r': score_s_r,
                        'predicted_class_p_nr': predicted_class_p_nr,
                        'score_p_nr': score_p_nr
                    })
                    
                except Exception as e:
                    print(f"Error classifying {image_path}: {str(e)}")
                    continue

            # Clear CUDA cache between batches
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        return pd.DataFrame(classification_results)

    def analyze_classification_results(self, results_df):
        """
        Analyze the classification results.
        
        Args:
            results_df: DataFrame with classification results
            
        Returns:
            DataFrame with classification metrics
        """
        # Extract and analyze the scores for S-R and P-NR words
        curved_scores_s_r = results_df[results_df['image_type'] == 'Curved']['score_s_r']
        jagged_scores_s_r = results_df[results_df['image_type'] == 'Jagged']['score_s_r']
        
        curved_scores_p_nr = results_df[results_df['image_type'] == 'Curved']['score_p_nr']
        jagged_scores_p_nr = results_df[results_df['image_type'] == 'Jagged']['score_p_nr']
        
        # Compare average scores
        analysis_results = {
            'avg_score_s_r_curved': curved_scores_s_r.mean(),
            'avg_score_s_r_jagged': jagged_scores_s_r.mean(),
            'avg_score_p_nr_curved': curved_scores_p_nr.mean(),
            'avg_score_p_nr_jagged': jagged_scores_p_nr.mean()
        }

        return pd.DataFrame([analysis_results])

if __name__ == "__main__":
    # Initialize
    matcher = ImageTextMatcher(image_folder="images")
    image_paths = matcher.prepare_dataset_for_classification()
    classification_results = matcher.classify_dataset("llama", image_paths, possible_labels_s_r, possible_labels_p_nr)


    ## Change folder to [1,2,3]
    # Save results
    classification_results.to_csv("10/image_classifications.csv", index=False)
    
    # Save metrics
    classification_metrics = matcher.analyze_classification_results(classification_results)
    classification_metrics.to_csv("10/classification_metrics.csv", index=False)
    
    print("Classification complete. Results saved to CSV files.")

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Classifying images 0-4:   0%|          | 0/4 [00:00<?, ?it/s]

batch number:  0
Predicted text: moolah.
Token probabilities: [0.7212, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9442


Classifying images 0-4:  25%|██▌       | 1/4 [00:33<01:39, 33.33s/it]

Predicted text: teekuh.
Token probabilities: [0.1089, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.8218
batch number:  0
Predicted text: moolah.
Token probabilities: [0.6409, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9282


Classifying images 0-4:  50%|█████     | 2/4 [01:04<01:04, 32.07s/it]

Predicted text: tuhkuh.
Token probabilities: [0.4362, 0.2681, 0.636, 1.0, 1.0]
Mean confidence: 0.6681
batch number:  0
Predicted text: moolah.
Token probabilities: [0.5437, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9087


Classifying images 0-4:  75%|███████▌  | 3/4 [01:35<00:31, 31.75s/it]

Predicted text: peepay.
Token probabilities: [0.1229, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.8246
batch number:  0
Predicted text: moolah.
Token probabilities: [0.6408, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9282


Classifying images 0-4: 100%|██████████| 4/4 [02:07<00:00, 31.79s/it]


Predicted text: teekuh.
Token probabilities: [0.0842, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.8168


Classifying images 4-8:   0%|          | 0/4 [00:00<?, ?it/s]

batch number:  4
Predicted text: moolah.
Token probabilities: [0.7227, 1.0, 1.0, 0.8721, 1.0]
Mean confidence: 0.9190


Classifying images 4-8:  25%|██▌       | 1/4 [00:32<01:38, 32.81s/it]

Predicted text: taytuh
Token probabilities: [0.6467, 0.8536, 0.636, 1.0, 0.3486]
Mean confidence: 0.6970
batch number:  4
Predicted text: moolah.
Token probabilities: [0.6132, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9226


Classifying images 4-8:  50%|█████     | 2/4 [01:04<01:04, 32.03s/it]

Predicted text: taypuh.
Token probabilities: [0.5205, 0.8644, 0.5556, 1.0, 1.0]
Mean confidence: 0.7881
batch number:  4
Predicted text: moolah.
Token probabilities: [0.6064, 1.0, 1.0, 0.7655, 1.0]
Mean confidence: 0.8744


Classifying images 4-8:  75%|███████▌  | 3/4 [01:35<00:31, 31.76s/it]

Predicted text: taytuh.
Token probabilities: [0.5263, 0.7491, 0.572, 1.0, 0.8885]
Mean confidence: 0.7472
batch number:  4
Predicted text: noonoh.
Token probabilities: [0.2078, 1.0, 1.0, 1.0]
Mean confidence: 0.8019


Classifying images 4-8: 100%|██████████| 4/4 [02:04<00:00, 31.15s/it]


Predicted text: taytuh.
Token probabilities: [0.552, 0.5501, 0.3141, 1.0, 1.0]
Mean confidence: 0.6832


Classifying images 8-12:   0%|          | 0/4 [00:00<?, ?it/s]

batch number:  8
Predicted text: moolah.
Token probabilities: [0.5608, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9122


Classifying images 8-12:  25%|██▌       | 1/4 [00:33<01:39, 33.12s/it]

Predicted text: peepay.
Token probabilities: [0.1206, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.8241
batch number:  8
Predicted text: moolah.
Token probabilities: [0.7276, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9455


Classifying images 8-12:  50%|█████     | 2/4 [01:04<01:03, 31.93s/it]

Predicted text: taytuh.
Token probabilities: [0.6001, 0.6763, 0.3386, 1.0, 1.0]
Mean confidence: 0.7230
batch number:  8
Predicted text: moolah.
Token probabilities: [0.7517, 0.9031, 1.0, 1.0, 1.0]
Mean confidence: 0.9310


Classifying images 8-12:  75%|███████▌  | 3/4 [01:35<00:31, 31.56s/it]

Predicted text: taytuh.
Token probabilities: [0.5495, 0.6763, 0.3486, 1.0, 1.0]
Mean confidence: 0.7149
batch number:  8
Predicted text: nooloo.
Token probabilities: [0.0824, 1.0, 1.0, 0.4389, 1.0]
Mean confidence: 0.7043


Classifying images 8-12: 100%|██████████| 4/4 [02:06<00:00, 31.65s/it]


Predicted text: puhpee.
Token probabilities: [0.1315, 1.0, 0.2725, 0.3692, 1.0]
Mean confidence: 0.5546


Classifying images 12-16:   0%|          | 0/4 [00:00<?, ?it/s]

batch number:  12
Predicted text: noonoh.
Token probabilities: [0.0835, 1.0, 1.0, 1.0]
Mean confidence: 0.7709


Classifying images 12-16:  25%|██▌       | 1/4 [00:33<01:40, 33.58s/it]

Predicted text: teekuh.
Token probabilities: [0.1175, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.8235
batch number:  12
Predicted text: moolah.
Token probabilities: [0.6602, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9320


Classifying images 12-16:  50%|█████     | 2/4 [01:21<01:24, 42.25s/it]

Predicted text: taypuh.
Token probabilities: [0.5346, 0.7449, 0.5223, 1.0, 1.0]
Mean confidence: 0.7604
batch number:  12
Predicted text: moomoo.
Token probabilities: [0.7151, 0.1733, 1.0, 1.0, 1.0]
Mean confidence: 0.7777


Classifying images 12-16:  75%|███████▌  | 3/4 [02:16<00:48, 48.09s/it]

Predicted text: puhkuh.
Token probabilities: [0.2825, 1.0, 0.8067, 1.0, 1.0]
Mean confidence: 0.8178
batch number:  12
Predicted text: moolah.
Token probabilities: [0.6844, 1.0, 1.0, 0.884, 1.0]
Mean confidence: 0.9137


Classifying images 12-16: 100%|██████████| 4/4 [03:10<00:00, 47.57s/it]


Predicted text: taytuh.
Token probabilities: [0.6162, 0.8169, 0.5883, 1.0, 0.539]
Mean confidence: 0.7121


Classifying images 16-20:   0%|          | 0/4 [00:00<?, ?it/s]

batch number:  16
Predicted text: moolah.
Token probabilities: [0.6883, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9377


Classifying images 16-20:  25%|██▌       | 1/4 [00:32<01:37, 32.36s/it]

Predicted text: tuhkay.
Token probabilities: [0.5099, 0.1492, 0.3386, 1.0, 1.0]
Mean confidence: 0.5995
batch number:  16
Predicted text: moolah.
Token probabilities: [0.7491, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9498


Classifying images 16-20:  50%|█████     | 2/4 [01:03<01:03, 31.73s/it]

Predicted text: taytuh.
Token probabilities: [0.5326, 0.7849, 0.4833, 1.0, 1.0]
Mean confidence: 0.7602
batch number:  16
Predicted text: moolah.
Token probabilities: [0.7239, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9448


Classifying images 16-20:  75%|███████▌  | 3/4 [01:35<00:31, 31.59s/it]

Predicted text: taytuh.
Token probabilities: [0.5548, 0.8235, 0.4444, 1.0, 1.0]
Mean confidence: 0.7645
batch number:  16
Predicted text: moolah.
Token probabilities: [0.6774, 0.9011, 1.0, 1.0, 1.0]
Mean confidence: 0.9157


Classifying images 16-20: 100%|██████████| 4/4 [02:06<00:00, 31.62s/it]


Predicted text: peepay.
Token probabilities: [0.1157, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.8231


Classifying images 20-24:   0%|          | 0/4 [00:00<?, ?it/s]

batch number:  20
Predicted text: lohmah.
Token probabilities: [0.0441, 0.5311, 1.0, 1.0, 1.0]
Mean confidence: 0.7150


Classifying images 20-24:  25%|██▌       | 1/4 [00:32<01:38, 32.88s/it]

Predicted text: taypuh.
Token probabilities: [0.4687, 0.796, 0.5056, 1.0, 1.0]
Mean confidence: 0.7541
batch number:  20
Predicted text: nooloo.
Token probabilities: [0.1149, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.8230


Classifying images 20-24:  50%|█████     | 2/4 [01:02<01:01, 30.81s/it]

Predicted text: paytee.
Token probabilities: [0.0846, 1.0, 0.8863, 1.0]
Mean confidence: 0.7427
batch number:  20
Predicted text: moolah
Token probabilities: [0.5946, 1.0, 1.0, 0.116]
Mean confidence: 0.6776


Classifying images 20-24:  75%|███████▌  | 3/4 [01:29<00:29, 29.32s/it]

Predicted text: teekuh
Token probabilities: [0.1164, 1.0, 1.0, 0.2467]
Mean confidence: 0.5908
batch number:  20
Predicted text: moolah.
Token probabilities: [0.8165, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9633


Classifying images 20-24: 100%|██████████| 4/4 [02:02<00:00, 30.72s/it]


Predicted text: puhkuh.
Token probabilities: [0.2639, 1.0, 0.7275, 1.0, 1.0]
Mean confidence: 0.7983


Classifying images 24-28:   0%|          | 0/4 [00:00<?, ?it/s]

batch number:  24
Predicted text: moolah.
Token probabilities: [0.7521, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9504


Classifying images 24-28:  25%|██▌       | 1/4 [00:39<01:57, 39.10s/it]

Predicted text: taypuh.
Token probabilities: [0.5399, 0.8536, 0.5112, 1.0, 1.0]
Mean confidence: 0.7809
batch number:  24
Predicted text: loonah.
Token probabilities: [0.0709, 1.0, 1.0, 1.0]
Mean confidence: 0.7677


Classifying images 24-28:  50%|█████     | 2/4 [01:42<01:46, 53.33s/it]

Predicted text: taypuh.
Token probabilities: [0.4734, 0.8421, 0.5, 1.0, 1.0]
Mean confidence: 0.7631
batch number:  24
Predicted text: moolah.
Token probabilities: [0.732, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9464


Classifying images 24-28:  75%|███████▌  | 3/4 [02:47<00:58, 58.52s/it]

Predicted text: peepay.
Token probabilities: [0.0921, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.8184
batch number:  24
Predicted text: moolah.
Token probabilities: [0.7007, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9401


Classifying images 24-28: 100%|██████████| 4/4 [03:57<00:00, 59.39s/it]


Predicted text: taypuh.
Token probabilities: [0.5016, 0.8361, 0.5112, 1.0, 1.0]
Mean confidence: 0.7698


Classifying images 28-32:   0%|          | 0/4 [00:00<?, ?it/s]

batch number:  28
Predicted text: moolah.
Token probabilities: [0.7176, 0.9031, 1.0, 1.0, 1.0]
Mean confidence: 0.9241


Classifying images 28-32:  25%|██▌       | 1/4 [00:32<01:37, 32.62s/it]

Predicted text: puhkuh.
Token probabilities: [0.1982, 1.0, 0.895, 1.0, 1.0]
Mean confidence: 0.8186
batch number:  28
Predicted text: nooloo.
Token probabilities: [0.0709, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.8142


Classifying images 28-32:  50%|█████     | 2/4 [01:03<01:03, 31.78s/it]

Predicted text: teekuh.
Token probabilities: [0.0896, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.8179
batch number:  28
Predicted text: moolah.
Token probabilities: [0.7701, 1.0, 1.0, 1.0, 1.0]
Mean confidence: 0.9540


Classifying images 28-32:  75%|███████▌  | 3/4 [01:34<00:31, 31.26s/it]

Predicted text: tuhkuh.
Token probabilities: [0.6274, 0.5611, 0.884, 1.0, 1.0]
Mean confidence: 0.8145
batch number:  28
Predicted text: loonah.
Token probabilities: [0.155, 1.0, 0.833, 1.0]
Mean confidence: 0.7470


Classifying images 28-32: 100%|██████████| 4/4 [02:02<00:00, 30.65s/it]


Predicted text: peepay.
Token probabilities: [0.1037, 1.0, 1.0, 0.8991, 1.0]
Mean confidence: 0.8006


Classifying images 32-34:   0%|          | 0/2 [00:00<?, ?it/s]

batch number:  32
Predicted text: loonah.
Token probabilities: [0.103, 1.0, 1.0, 1.0]
Mean confidence: 0.7757


Classifying images 32-34:  50%|█████     | 1/2 [00:28<00:28, 28.73s/it]

Predicted text: puhkuh.
Token probabilities: [0.1921, 1.0, 0.8299, 1.0, 1.0]
Mean confidence: 0.8044
batch number:  32
Predicted text: loonah.
Token probabilities: [0.1393, 1.0, 0.8135, 1.0]
Mean confidence: 0.7382


Classifying images 32-34: 100%|██████████| 2/2 [00:56<00:00, 28.16s/it]

Predicted text: tuhkuh.
Token probabilities: [0.5955, 0.4117, 0.7887, 1.0, 1.0]
Mean confidence: 0.7592
Classification complete. Results saved to CSV files.



