## Experiment 2 Molmo: Image-to-Text Matching

Set-up

In [None]:
import os

import torch
from huggingface_hub import login

# Read the Hugging Face token from the environment instead of pasting it into
# the notebook, so a real credential is never saved or committed with the file.
# Set it before launching Jupyter, e.g. `export HF_TOKEN=hf_...`.
HF_TOKEN = os.environ.get("HF_TOKEN")

# With a token this authenticates non-interactively; when HF_TOKEN is unset
# (None) login() falls back to its interactive prompt, like a bare login() call.
login(token=HF_TOKEN)

# Alias of HF_TOKEN. The original note marks it for removal in the final version.
hf_key = HF_TOKEN

# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Release cached GPU memory left over from earlier runs in this kernel.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import torch.nn.functional as F
import random
import numpy as np
import os
from PIL import Image
from tqdm import tqdm
import pandas as pd
from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig

Label selection

In [3]:
# Every pseudoword is two three-letter syllables glued together, so each
# 81-word pool is the ordered Cartesian product of nine syllables with
# themselves (first syllable varies slowest, second syllable fastest).

# Sonorant + rounded pool: consonants l/n/m combined with vowels oo/ah/oh.
sonorant_rounded_syllables = [
    consonant + vowel
    for consonant in ("l", "n", "m")
    for vowel in ("oo", "ah", "oh")
]
possible_labels_s_r = [
    first + second
    for first in sonorant_rounded_syllables
    for second in sonorant_rounded_syllables
]

# Plosive + non-rounded pool: consonants t/k/p combined with vowels ee/uh/ay.
plosive_unrounded_syllables = [
    consonant + vowel
    for consonant in ("t", "k", "p")
    for vowel in ("ee", "uh", "ay")
]
possible_labels_p_nr = [
    first + second
    for first in plosive_unrounded_syllables
    for second in plosive_unrounded_syllables
]

# Keep only 10 randomly drawn labels per pool (no repeats). The names are
# rebound on purpose: later cells use the sampled lists, not the full pools.
possible_labels_s_r = random.sample(possible_labels_s_r, k=10)
possible_labels_p_nr = random.sample(possible_labels_p_nr, k=10)

# All 20 candidate labels for this run: S-R words first, then P-NR words.
final_list = [*possible_labels_s_r, *possible_labels_p_nr]

print(final_list)

['mohnah', 'nohmah', 'mohnoo', 'lohloh', 'looloo', 'mahnoh', 'moonoh', 'lohloo', 'nohnah', 'moomoo', 'puhkee', 'teepay', 'tuhkay', 'teekuh', 'tuhpuh', 'peekuh', 'peepay', 'paypay', 'kuhkuh', 'taypay']


In [None]:
class ImageTextMatcher:
    """
    Label images with pseudowords using the Molmo vision-language model.

    Typical flow: collect image paths with
    `prepare_dataset_for_classification`, label every image against two label
    sets with `classify_dataset`, then summarise the confidence scores per
    image type with `analyze_classification_results`.
    """

    # File extensions (compared case-insensitively) that count as images.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    # Column order of the table returned by classify_dataset. Declared once so
    # an empty run still yields a frame with the expected columns.
    RESULT_COLUMNS = [
        'image_path',
        'image_filename',
        'image_type',
        'predicted_class_s_r',
        'score_s_r',
        'predicted_class_p_nr',
        'score_p_nr',
    ]

    def __init__(self, image_folder="images"):
        """
        Initialize the analyzer with the path to the image folder.

        Args:
            image_folder: Path to the folder containing images
        """
        self.image_folder = image_folder
        self.results = []
        # Maps model name -> {'model': ..., 'processor': ...}; filled by load_model.
        self.models = {}

    def load_model(self, model_name):
        """
        Load a supported model and its processor and cache them in self.models.

        Args:
            model_name: Name of the model to load; only 'molmo' is supported

        Raises:
            ValueError: If model_name is not a supported model
        """
        if model_name == 'molmo':
            checkpoint = 'allenai/Molmo-7B-D-0924'
            processor = AutoProcessor.from_pretrained(
                checkpoint,
                trust_remote_code=True,
                torch_dtype=torch.float16,
                device_map='auto')
            model = AutoModelForCausalLM.from_pretrained(
                checkpoint,
                trust_remote_code=True,
                torch_dtype=torch.float16,
                device_map='auto')
            self.models['molmo'] = {'model': model, 'processor': processor}
        else:
            raise ValueError(f"Unsupported model: {model_name}, Model not Found.")

    def classify_image(self, image_path, possible_labels):
        """
        Ask Molmo to pick one label for an image and score its confidence.

        The 'molmo' entry must already be present in self.models (see
        load_model); classify_dataset takes care of that.

        Args:
            image_path: Path to the image file
            possible_labels: Candidate labels; they are interpolated into the
                prompt as-is

        Returns:
            Tuple (predicted_text, confidence_score): the decoded generated
            text and the mean probability of the generated tokens (NaN if no
            token was generated)
        """
        model_info = self.models['molmo']
        model = model_info['model']
        processor = model_info['processor']

        # convert() returns an independent RGB copy, so the file handle can be
        # closed as soon as the with-block ends.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # Prompt setup
        prompt = (
            f"You are given an image for which you need to assign a label. Use one of the following labels: {possible_labels}. Only respond with the label."
        )

        inputs = processor.process(
            images=[image],
            text=prompt
        )

        # Move every tensor to the model's device and add a batch dimension of 1.
        inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

        # Generate with output scores to compute probabilities
        with torch.no_grad():
            # NOTE(review): CUDA autocast targets float16/bfloat16 per the
            # PyTorch docs, so dtype=torch.float32 is presumably a no-op here;
            # left unchanged to keep the generated outputs identical - confirm.
            with torch.autocast(device_type="cuda", enabled=True, dtype=torch.float32):
                outputs = model.generate_from_batch(
                    inputs,
                    GenerationConfig(max_new_tokens=5, stop_strings="<|endoftext|>"),
                    tokenizer=processor.tokenizer,
                    output_scores=True,
                    return_dict_in_generate=True,
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    top_k=40
                )

                # Drop the prompt tokens; keep only the newly generated ones.
                prompt_length = inputs['input_ids'].size(1)
                generated_token_ids = outputs.sequences[0][prompt_length:]

                # One score tensor per generated token.
                # NOTE(review): if generate_from_batch forwards to Hugging Face
                # generate(), these are the *processed* scores (after
                # temperature/top-k/top-p), so the probabilities below describe
                # the sampling distribution rather than the raw model
                # distribution - TODO confirm.
                scores = outputs.scores

                token_probs = []
                for i, token_id in enumerate(generated_token_ids):
                    probs = F.softmax(scores[i], dim=-1)
                    # Batch index 0: only one image is processed per call.
                    token_probs.append(probs[0, token_id].item())

                # Final predicted text (after decoding tokens)
                predicted_text = processor.tokenizer.decode(generated_token_ids, skip_special_tokens=True).strip()

                # Confidence score (mean of token probabilities). NaN rather
                # than a ZeroDivisionError when nothing was generated; pandas
                # skips NaN when averaging.
                if token_probs:
                    confidence_score = sum(token_probs) / len(token_probs)
                else:
                    confidence_score = float('nan')

                print(f"Predicted text: {predicted_text}")
                print(f"Token probabilities: {[round(p, 4) for p in token_probs]}")
                print(f"Mean confidence: {confidence_score:.4f}")

                return predicted_text, confidence_score

    def prepare_dataset_for_classification(self, image_paths=None):
        """
        Prepare dataset for image classification.

        Args:
            image_paths: List of paths to images (if None, scan the image folder)

        Returns:
            List of image paths. Paths passed in are returned unchanged; paths
            found by scanning are sorted by filename so runs are reproducible.
        """
        # If image paths are not provided, scan the image folder
        if image_paths is None:
            # os.listdir returns entries in arbitrary order; sort for a stable
            # processing (and CSV row) order.
            image_paths = [
                os.path.join(self.image_folder, filename)
                for filename in sorted(os.listdir(self.image_folder))
                if filename.lower().endswith(self.IMAGE_EXTENSIONS)
            ]

        return image_paths

    def classify_dataset(self, model_name, image_paths, possible_labels_s_r, possible_labels_p_nr):
        """
        Classify a dataset of images using S-R and P-NR labels and calculate confidence scores.

        Each image is classified twice, once per label set. Images whose
        classification raises are reported and skipped.

        Args:
            model_name: Name of the model to use
            image_paths: List of image paths to classify
            possible_labels_s_r: Labels for sonorant+rounded pseudowords
            possible_labels_p_nr: Labels for plosive+non-rounded pseudowords

        Returns:
            DataFrame with one row per successfully classified image and the
            columns listed in RESULT_COLUMNS (also when there are no rows)
        """
        if model_name not in self.models:
            self.load_model(model_name)

        classification_results = []

        # Images are still classified one at a time; the groups of 4 only set
        # how often the progress bar restarts and the CUDA cache is cleared.
        batch_size = 4
        for i in range(0, len(image_paths), batch_size):
            batch = image_paths[i:i+batch_size]

            for image_path in tqdm(batch, desc=f"Classifying images {i}-{i+len(batch)}"):
                try:
                    # First classify with S-R labels
                    predicted_class_s_r, score_s_r = self.classify_image(image_path, possible_labels_s_r)

                    # Then classify with P-NR labels
                    predicted_class_p_nr, score_p_nr = self.classify_image(image_path, possible_labels_p_nr)

                    # The image type is inferred from the filename.
                    filename = os.path.basename(image_path)
                    lowered = filename.lower()
                    if 'curved' in lowered:
                        image_type = 'Curved'
                    elif 'jagged' in lowered:
                        image_type = 'Jagged'
                    else:
                        image_type = 'Unknown'

                    # Store result with confidence scores
                    classification_results.append({
                        'image_path': image_path,
                        'image_filename': filename,
                        'image_type': image_type,
                        'predicted_class_s_r': predicted_class_s_r,
                        'score_s_r': score_s_r,
                        'predicted_class_p_nr': predicted_class_p_nr,
                        'score_p_nr': score_p_nr
                    })

                except Exception as e:
                    # Best effort: report the failure and move on to the next image.
                    print(f"Error classifying {image_path}: {str(e)}")
                    continue

            # Clear CUDA cache between batches
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        # Explicit columns keep the schema stable even when no image succeeded.
        return pd.DataFrame(classification_results, columns=self.RESULT_COLUMNS)

    def analyze_classification_results(self, results_df):
        """
        Analyze the classification results.

        Computes the mean confidence score for each combination of label set
        (S-R, P-NR) and image type (Curved, Jagged).

        Args:
            results_df: DataFrame with classification results

        Returns:
            One-row DataFrame with the columns avg_score_s_r_curved,
            avg_score_s_r_jagged, avg_score_p_nr_curved and
            avg_score_p_nr_jagged. A value is NaN when there are no matching
            rows or the required columns are missing (e.g. an empty frame).
        """
        required_columns = {'image_type', 'score_s_r', 'score_p_nr'}
        has_required_columns = required_columns.issubset(results_df.columns)

        # Compare average scores
        analysis_results = {}
        for label_set in ('s_r', 'p_nr'):
            for image_type in ('Curved', 'Jagged'):
                key = f"avg_score_{label_set}_{image_type.lower()}"
                average = float('nan')
                if has_required_columns:
                    matching_rows = results_df['image_type'] == image_type
                    scores = results_df.loc[matching_rows, f"score_{label_set}"]
                    if len(scores) > 0:
                        average = scores.mean()
                analysis_results[key] = average

        return pd.DataFrame([analysis_results])

if __name__ == "__main__":
    # Folder that receives this run's CSV files. Per the original note, change
    # it for each run (e.g. "1", "2", "3").
    output_dir = "10"

    # Create the folder before the long classification run, so a missing
    # directory cannot make to_csv fail after all the work is done.
    os.makedirs(output_dir, exist_ok=True)

    # Initialize
    matcher = ImageTextMatcher(image_folder="images")
    image_paths = matcher.prepare_dataset_for_classification()
    classification_results = matcher.classify_dataset("molmo", image_paths, possible_labels_s_r, possible_labels_p_nr)

    # Save results
    classification_results.to_csv(os.path.join(output_dir, "image_classifications.csv"), index=False)

    # Save metrics
    classification_metrics = matcher.analyze_classification_results(classification_results)
    classification_metrics.to_csv(os.path.join(output_dir, "classification_metrics.csv"), index=False)

    print("Classification complete. Results saved to CSV files.")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Classifying images 0-4:   0%|          | 0/4 [00:00<?, ?it/s]

Predicted text: mohnah
Token probabilities: [0.7078, 1.0, 1.0, 1.0]
Mean confidence: 0.9269


Classifying images 0-4:  25%|██▌       | 1/4 [00:46<02:20, 46.77s/it]

Predicted text: tuhkay
Token probabilities: [0.5307, 1.0, 1.0, 1.0]
Mean confidence: 0.8827
Predicted text: looloo
Token probabilities: [0.0708, 1.0, 1.0, 1.0]
Mean confidence: 0.7677


Classifying images 0-4:  50%|█████     | 2/4 [01:20<01:18, 39.15s/it]

Predicted text: teepay
Token probabilities: [0.6442, 1.0, 1.0, 1.0]
Mean confidence: 0.9110
Predicted text: mohnah
Token probabilities: [0.6504, 1.0, 1.0, 1.0]
Mean confidence: 0.9126


Classifying images 0-4:  75%|███████▌  | 3/4 [01:56<00:37, 37.54s/it]

Predicted text: teepay
Token probabilities: [0.5127, 1.0, 1.0, 1.0]
Mean confidence: 0.8782
Predicted text: mahnoh
Token probabilities: [0.2341, 1.0, 1.0, 1.0]
Mean confidence: 0.8085


Classifying images 0-4: 100%|██████████| 4/4 [02:36<00:00, 39.07s/it]


Predicted text: teepay
Token probabilities: [0.6092, 1.0, 1.0, 1.0]
Mean confidence: 0.9023


Classifying images 4-8:   0%|          | 0/4 [00:00<?, ?it/s]

Predicted text: mahnoh
Token probabilities: [0.1209, 1.0, 1.0, 1.0]
Mean confidence: 0.7802


Classifying images 4-8:  25%|██▌       | 1/4 [00:35<01:46, 35.65s/it]

Predicted text: tuhkay
Token probabilities: [0.5014, 1.0, 1.0, 1.0]
Mean confidence: 0.8754
Predicted text: mohnah
Token probabilities: [0.6716, 1.0, 1.0, 1.0]
Mean confidence: 0.9179


Classifying images 4-8:  50%|█████     | 2/4 [01:13<01:14, 37.00s/it]

Predicted text: teepay
Token probabilities: [0.5649, 1.0, 1.0, 1.0]
Mean confidence: 0.8912
Predicted text: looloo
Token probabilities: [0.1323, 1.0, 1.0, 1.0]
Mean confidence: 0.7831


Classifying images 4-8:  75%|███████▌  | 3/4 [01:52<00:38, 38.03s/it]

Predicted text: teepay
Token probabilities: [0.5985, 1.0, 1.0, 1.0]
Mean confidence: 0.8996
Predicted text: mahnoh
Token probabilities: [0.1966, 1.0, 1.0, 1.0]
Mean confidence: 0.7991


Classifying images 4-8: 100%|██████████| 4/4 [02:26<00:00, 36.51s/it]


Predicted text: tuhkay
Token probabilities: [0.3253, 1.0, 1.0, 1.0]
Mean confidence: 0.8313


Classifying images 8-12:   0%|          | 0/4 [00:00<?, ?it/s]

Predicted text: mohnah
Token probabilities: [0.7944, 1.0, 1.0, 1.0]
Mean confidence: 0.9486


Classifying images 8-12:  25%|██▌       | 1/4 [00:32<01:38, 32.96s/it]

Predicted text: tuhkay
Token probabilities: [0.452, 1.0, 1.0, 1.0]
Mean confidence: 0.8630
Predicted text: mohnah
Token probabilities: [0.6504, 1.0, 1.0, 1.0]
Mean confidence: 0.9126


Classifying images 8-12:  50%|█████     | 2/4 [01:06<01:06, 33.04s/it]

Predicted text: tuhkay
Token probabilities: [0.4256, 1.0, 1.0, 1.0]
Mean confidence: 0.8564
Predicted text: mohnah
Token probabilities: [0.7396, 1.0, 1.0, 1.0]
Mean confidence: 0.9349


Classifying images 8-12:  75%|███████▌  | 3/4 [01:50<00:38, 38.20s/it]

Predicted text: tuhkay
Token probabilities: [0.0789, 1.0, 0.5399, 1.0, 1.0]
Mean confidence: 0.7237
Predicted text: mohnah
Token probabilities: [0.5822, 1.0, 1.0, 1.0]
Mean confidence: 0.8955


Classifying images 8-12: 100%|██████████| 4/4 [02:30<00:00, 37.55s/it]


Predicted text: teepay
Token probabilities: [0.5352, 1.0, 1.0, 1.0]
Mean confidence: 0.8838


Classifying images 12-16:   0%|          | 0/4 [00:00<?, ?it/s]

Predicted text: looloo
Token probabilities: [0.1074, 1.0, 1.0, 1.0]
Mean confidence: 0.7768


Classifying images 12-16:  25%|██▌       | 1/4 [00:38<01:56, 38.90s/it]

Predicted text: tuhkay
Token probabilities: [0.4548, 1.0, 1.0, 1.0]
Mean confidence: 0.8637
Predicted text: mohnah
Token probabilities: [0.8104, 1.0, 1.0, 1.0]
Mean confidence: 0.9526


Classifying images 12-16:  50%|█████     | 2/4 [01:16<01:16, 38.40s/it]

Predicted text: teepay
Token probabilities: [0.706, 1.0, 1.0, 1.0]
Mean confidence: 0.9265
Predicted text: mohnah
Token probabilities: [0.6826, 1.0, 1.0, 1.0]
Mean confidence: 0.9207


Classifying images 12-16:  75%|███████▌  | 3/4 [01:53<00:37, 37.40s/it]

Predicted text: teepay
Token probabilities: [0.6483, 1.0, 1.0, 1.0]
Mean confidence: 0.9121
Predicted text: mahnoh
Token probabilities: [0.0808, 1.0, 1.0, 1.0]
Mean confidence: 0.7702


Classifying images 12-16: 100%|██████████| 4/4 [02:29<00:00, 37.49s/it]


Predicted text: teepay
Token probabilities: [0.5514, 1.0, 1.0, 1.0]
Mean confidence: 0.8879


Classifying images 16-20:   0%|          | 0/4 [00:00<?, ?it/s]

Predicted text: mohnah
Token probabilities: [0.7389, 1.0, 1.0, 1.0]
Mean confidence: 0.9347


Classifying images 16-20:  25%|██▌       | 1/4 [00:36<01:49, 36.46s/it]

Predicted text: tuhkay
Token probabilities: [0.3548, 1.0, 1.0, 1.0]
Mean confidence: 0.8387
Predicted text: mohnah
Token probabilities: [0.7922, 1.0, 1.0, 1.0]
Mean confidence: 0.9480


Classifying images 16-20:  50%|█████     | 2/4 [01:13<01:13, 36.90s/it]

Predicted text: tuhkay
Token probabilities: [0.5017, 1.0, 1.0, 1.0]
Mean confidence: 0.8754
Predicted text: mahnoh
Token probabilities: [0.1782, 1.0, 1.0, 1.0]
Mean confidence: 0.7946


Classifying images 16-20:  75%|███████▌  | 3/4 [01:47<00:35, 35.52s/it]

Predicted text: teepay
Token probabilities: [0.6198, 1.0, 1.0, 1.0]
Mean confidence: 0.9050
Predicted text: mahnoh
Token probabilities: [0.243, 1.0, 1.0, 1.0]
Mean confidence: 0.8108


Classifying images 16-20: 100%|██████████| 4/4 [02:21<00:00, 35.43s/it]


Predicted text: teepay
Token probabilities: [0.6282, 1.0, 1.0, 1.0]
Mean confidence: 0.9070


Classifying images 20-24:   0%|          | 0/4 [00:00<?, ?it/s]

Predicted text: mahnoh
Token probabilities: [0.2663, 1.0, 1.0, 1.0]
Mean confidence: 0.8166


Classifying images 20-24:  25%|██▌       | 1/4 [00:36<01:49, 36.63s/it]

Predicted text: teepay
Token probabilities: [0.6856, 1.0, 1.0, 1.0]
Mean confidence: 0.9214
Predicted text: mohnah
Token probabilities: [0.805, 1.0, 1.0, 1.0]
Mean confidence: 0.9513


Classifying images 20-24:  50%|█████     | 2/4 [01:14<01:14, 37.18s/it]

Predicted text: teepay
Token probabilities: [0.4568, 1.0, 1.0, 1.0]
Mean confidence: 0.8642
Predicted text: mohnah
Token probabilities: [0.8343, 1.0, 1.0, 1.0]
Mean confidence: 0.9586


Classifying images 20-24:  75%|███████▌  | 3/4 [01:46<00:35, 35.02s/it]

Predicted text: tuhkay
Token probabilities: [0.3936, 1.0, 1.0, 1.0]
Mean confidence: 0.8484
Predicted text: mohnah
Token probabilities: [0.7358, 1.0, 1.0, 1.0]
Mean confidence: 0.9339


Classifying images 20-24: 100%|██████████| 4/4 [02:21<00:00, 35.39s/it]


Predicted text: tuhkay
Token probabilities: [0.3776, 1.0, 1.0, 1.0]
Mean confidence: 0.8444


Classifying images 24-28:   0%|          | 0/4 [00:00<?, ?it/s]

Predicted text: mohnah
Token probabilities: [0.7158, 1.0, 1.0, 1.0]
Mean confidence: 0.9290


Classifying images 24-28:  25%|██▌       | 1/4 [00:35<01:46, 35.56s/it]

Predicted text: teepay
Token probabilities: [0.6114, 1.0, 1.0, 1.0]
Mean confidence: 0.9028
Predicted text: mohnah
Token probabilities: [0.6666, 1.0, 1.0, 1.0]
Mean confidence: 0.9167


Classifying images 24-28:  50%|█████     | 2/4 [01:14<01:14, 37.48s/it]

Predicted text: teepay
Token probabilities: [0.5831, 1.0, 1.0, 1.0]
Mean confidence: 0.8958
Predicted text: mohnah
Token probabilities: [0.7211, 0.8945, 1.0, 1.0]
Mean confidence: 0.9039


Classifying images 24-28:  75%|███████▌  | 3/4 [01:55<00:39, 39.07s/it]

Predicted text: teepay
Token probabilities: [0.7043, 1.0, 1.0, 1.0]
Mean confidence: 0.9261
Predicted text: mohnah
Token probabilities: [0.681, 1.0, 1.0, 1.0]
Mean confidence: 0.9202


Classifying images 24-28: 100%|██████████| 4/4 [02:34<00:00, 38.54s/it]


Predicted text: teepay
Token probabilities: [0.6472, 1.0, 1.0, 1.0]
Mean confidence: 0.9118


Classifying images 28-32:   0%|          | 0/4 [00:00<?, ?it/s]

Predicted text: mohnah
Token probabilities: [0.7563, 1.0, 1.0, 1.0]
Mean confidence: 0.9391


Classifying images 28-32:  25%|██▌       | 1/4 [00:34<01:44, 34.93s/it]

Predicted text: teepay
Token probabilities: [0.6383, 1.0, 1.0, 1.0]
Mean confidence: 0.9096
Predicted text: mohnah
Token probabilities: [0.8376, 1.0, 1.0, 1.0]
Mean confidence: 0.9594


Classifying images 28-32:  50%|█████     | 2/4 [01:09<01:09, 34.93s/it]

Predicted text: tuhkay
Token probabilities: [0.2899, 1.0, 1.0, 1.0]
Mean confidence: 0.8225
Predicted text: looloo
Token probabilities: [0.1107, 1.0, 1.0, 1.0]
Mean confidence: 0.7777


Classifying images 28-32:  75%|███████▌  | 3/4 [01:43<00:34, 34.25s/it]

Predicted text: tuhkay
Token probabilities: [0.4578, 1.0, 1.0, 1.0]
Mean confidence: 0.8645
Predicted text: looloo
Token probabilities: [0.0954, 1.0, 1.0, 1.0]
Mean confidence: 0.7738


Classifying images 28-32: 100%|██████████| 4/4 [02:19<00:00, 34.86s/it]


Predicted text: tuhkay
Token probabilities: [0.4737, 1.0, 1.0, 1.0]
Mean confidence: 0.8684


Classifying images 32-34:   0%|          | 0/2 [00:00<?, ?it/s]

Predicted text: mohnah
Token probabilities: [0.7995, 1.0, 1.0, 1.0]
Mean confidence: 0.9499


Classifying images 32-34:  50%|█████     | 1/2 [00:36<00:36, 36.87s/it]

Predicted text: tuhkay
Token probabilities: [0.291, 1.0, 1.0, 1.0]
Mean confidence: 0.8227
Predicted text: nohmah
Token probabilities: [0.075, 0.4448, 1.0, 1.0]
Mean confidence: 0.6300


Classifying images 32-34: 100%|██████████| 2/2 [01:13<00:00, 36.99s/it]

Predicted text: teepay
Token probabilities: [0.6511, 1.0, 1.0, 1.0]
Mean confidence: 0.9128
Classification complete. Results saved to CSV files.



