## Import Libraries

In [2]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer, util
from PIL import Image
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import torch
import os
from glob import glob
import torch
import numpy as np
from torchvision import transforms
from sklearn.preprocessing import LabelEncoder
from zero_dce import (
    Trainer, plot_result
)
from open_clip import create_model_from_pretrained, get_tokenizer
from torchvision import transforms


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the Apple DFN2B CLIP checkpoint from the Hugging Face hub. The call
# returns the model together with the `preprocess` callable that is later
# applied to every input image.
model, preprocess = create_model_from_pretrained('hf-hub:apple/DFN2B-CLIP-ViT-L-14')
model = model.to(device)
model = model.eval()  # inference only
tokenizer = get_tokenizer('ViT-L-14')

In [4]:
def enhance_images_with_zero_dce(full_image_paths, model_weights_path, alpha=0.4):
    """Enhance images with Zero-DCE and save blended copies to disk.

    Every image is passed through the Zero-DCE trainer. The enhanced result
    is blended with the original as ``alpha * enhanced + (1 - alpha) * original``,
    clipped to ``[0, 1]``, converted to 8-bit and saved.

    The output path is the (normalised) input path with ``"tpc-imgs"``
    replaced by ``"enhanced-imgs"``. If the input path does not contain
    ``"tpc-imgs"``, the copy is written to an ``enhanced-imgs`` sub-directory
    next to the source file so that the source image is never overwritten.

    Args:
        full_image_paths: Iterable of paths to the images to enhance.
        model_weights_path: Path to the pretrained Zero-DCE weights.
        alpha: Weight of the enhanced image in the blend (default ``0.4``).

    Returns:
        list[str]: Paths of the written images, in input order.
    """
    trainer = Trainer()
    trainer.build_model(pretrain_weights=model_weights_path)

    enhanced_paths = []

    for path in full_image_paths:
        path = os.path.normpath(path)
        # NOTE(review): assumes infer_gpu returns (PIL image, float array in
        # [0, 1]) with matching height/width -- confirm against zero_dce.
        image, enhanced = trainer.infer_gpu(path, image_resize_factor=1)

        enhanced_toned = alpha * enhanced + (1 - alpha) * np.asarray(image) / 255.0
        enhanced_toned = np.clip(enhanced_toned, 0, 1)

        enhanced_path = path.replace("tpc-imgs", "enhanced-imgs")
        if enhanced_path == path:
            # Nothing was replaced: saving here would clobber the source image.
            enhanced_path = os.path.join(
                os.path.dirname(path), "enhanced-imgs", os.path.basename(path)
            )

        output_dir = os.path.dirname(enhanced_path)
        if output_dir:  # os.makedirs('') raises for bare file names
            os.makedirs(output_dir, exist_ok=True)
        Image.fromarray((enhanced_toned * 255).astype(np.uint8)).save(enhanced_path)

        enhanced_paths.append(enhanced_path)

    return enhanced_paths

## Dataset Loading and Text Encoding

In [5]:
# Cell 2: Load prompts and unique labels from JSON
def load_dataset(json_path, path_prefix='C:\\College\\Semester 4\\Research\\'):
    """Load image paths, labels and per-label descriptions from a metadata JSON.

    The JSON file must contain a list of objects with the keys
    ``image_path``, ``description`` and ``slang``.

    Args:
        json_path: Path to the metadata JSON file (read as UTF-8).
        path_prefix: String prepended verbatim to every ``image_path``. It is
            concatenated, not joined, so it must end with a path separator.

    Returns:
        tuple: ``(image_paths, slangs, unique_slangs, unique_descriptions)``
        where ``image_paths`` and ``slangs`` have one item per JSON entry, and
        ``unique_slangs`` / ``unique_descriptions`` are index-aligned lists
        with one item per distinct slang, in order of first appearance. When
        a slang occurs with several descriptions, the last one wins.
    """
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)

    image_paths = [path_prefix + entry['image_path'] for entry in data]
    descriptions = [entry['description'] for entry in data]
    slangs = [entry['slang'] for entry in data]

    # dict keeps the first-seen key order and the last-seen value per key.
    label_to_desc = dict(zip(slangs, descriptions))

    unique_slangs = list(label_to_desc.keys())
    unique_descriptions = list(label_to_desc.values())

    return image_paths, slangs, unique_slangs, unique_descriptions

In [6]:
# Cell 3: Encode descriptions into text embeddings
def encode_text_prompts(model, descriptions):
    """Return unit-length text embeddings for ``descriptions``.

    The descriptions are tokenised with the notebook-level ``tokenizer``,
    moved to the notebook-level ``device`` and passed to
    ``model.encode_text`` with gradients disabled. Each embedding is then
    divided in place by its norm along the last dimension.
    """
    tokens_on_device = tokenizer(descriptions).to(device)
    with torch.no_grad():
        embeddings = model.encode_text(tokens_on_device)
        lengths = embeddings.norm(dim=-1, keepdim=True)
        embeddings.div_(lengths)  # in-place, same as `/=`
    return embeddings

## Inference

In [7]:
# Cell 4: Run inference on images, returning predicted slang labels
def run_inference_batch(image_paths, model, text_embeddings, unique_slangs, batch_size=5, top_k=3):
    """Rank candidate labels for every image by similarity to the text prompts.

    Images are loaded and preprocessed with the notebook-level ``preprocess``
    callable, embedded in batches with ``model.encode_image``, normalised,
    and compared with ``text_embeddings`` through a matrix product.

    Args:
        image_paths: Sequence of image file paths.
        model: Model exposing ``encode_image``.
        text_embeddings: Text embeddings, one row per entry of
            ``unique_slangs`` (expected to be normalised already).
        unique_slangs: Labels index-aligned with the rows of
            ``text_embeddings``.
        batch_size: Number of images embedded per forward pass.
        top_k: Number of best-matching labels returned per image. It is
            clamped to the number of rows in ``text_embeddings``.

    Returns:
        list[list]: For each image, its ``top_k`` labels ordered from most to
        least similar.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # topk raises when k exceeds the number of candidates, so clamp it.
    k = min(top_k, text_embeddings.shape[0])
    predictions = []

    for start in range(0, len(image_paths), batch_size):
        batch_tensors = []
        for img_path in image_paths[start:start + batch_size]:
            # The context manager closes the file handle as soon as the
            # pixels have been converted, instead of waiting for GC.
            with Image.open(img_path) as img:
                batch_tensors.append(preprocess(img.convert('RGB')))

        imgs_tensor = torch.stack(batch_tensors, dim=0).to(device)
        with torch.no_grad():
            img_embs = model.encode_image(imgs_tensor)
            img_embs /= img_embs.norm(dim=-1, keepdim=True)
            cos_sim = img_embs @ text_embeddings.T

        _, top_idxs = cos_sim.topk(k=k, dim=1)
        for ranked_indices in top_idxs.tolist():
            predictions.append([unique_slangs[idx] for idx in ranked_indices])

    return predictions

## Evaluation

In [8]:
# Cell 5: Evaluate predictions with accuracy, precision, recall, and confusion matrix
# NOTE(review): TOP_K is not referenced by the code visible in this notebook
# section; kept in case later cells rely on it.
TOP_K = 3


def evaluate(true_slangs, predicted_descriptions, desc_to_slang, unique_slangs, max_k=4):
    """Print top-k accuracy, per-label precision/recall/F1 and a confusion matrix.

    For each ``k`` from 1 up to ``max_k``, an image counts as correctly
    classified when its true label is among its first ``k`` ranked
    predictions; otherwise its top-1 prediction is used as the predicted
    label. ``k`` never exceeds the number of ranked predictions actually
    available per image, so a "Top-k" report is never a mislabelled copy of
    a smaller one.

    Args:
        true_slangs: Ground-truth label for every image.
        predicted_descriptions: For every image, its ranked list of predicted
            descriptions (best first).
        desc_to_slang: Mapping from description to label.
        unique_slangs: Label order used for the per-label metrics and for the
            rows/columns of the confusion matrix.
        max_k: Largest ``k`` to report.

    Raises:
        ValueError: If ``true_slangs`` and ``predicted_descriptions`` differ
            in length.
    """
    if len(true_slangs) != len(predicted_descriptions):
        raise ValueError(
            "true_slangs and predicted_descriptions must have the same length"
        )

    available_k = min((len(preds) for preds in predicted_descriptions), default=0)
    effective_max_k = min(max_k, available_k)
    if effective_max_k < max_k:
        print(
            f"[WARN] Only {available_k} ranked prediction(s) per image are available; "
            f"reporting up to Top-{effective_max_k} instead of Top-{max_k}."
        )

    # Translate descriptions to labels once instead of once per k.
    ranked_slangs = [
        [desc_to_slang[desc] for desc in preds[:effective_max_k]]
        for preds in predicted_descriptions
    ]

    for k in range(1, effective_max_k + 1):
        # For k == 1 this reduces to "take the top-1 prediction".
        preds_slangs = [
            true_label if true_label in ranked[:k] else ranked[0]
            for true_label, ranked in zip(true_slangs, ranked_slangs)
        ]

        accuracy = accuracy_score(true_slangs, preds_slangs)
        precision = precision_score(true_slangs, preds_slangs, labels=unique_slangs, average=None, zero_division=0)
        recall = recall_score(true_slangs, preds_slangs, labels=unique_slangs, average=None, zero_division=0)
        f1 = f1_score(true_slangs, preds_slangs, labels=unique_slangs, average=None, zero_division=0)
        cm = confusion_matrix(true_slangs, preds_slangs, labels=unique_slangs)

        print(f"\nTop-{k} Evaluation:")
        print(f"Accuracy: {accuracy*100:.2f}%\n")
        for i, label in enumerate(unique_slangs):
            print(f"Label: {label}")
            print(f"  Precision: {precision[i]:.3f}")
            print(f"  Recall:    {recall[i]:.3f}")
            print(f"  F1 Score:  {f1[i]:.3f}\n")
        print("Confusion Matrix:")
        print(cm)
        print("-" * 40)


In [10]:
# Cell 6: Main script - run inference, evaluate
if __name__ == "__main__":
    # Inputs: metadata file, dataset root and Zero-DCE checkpoint.
    json_path = "metadata_unbalanced.json"
    base_dir = "C:/College/Semester 4/Research"
    model_weights_path = r"C:/College/Semester 4/Research/Zero-DCE/pretrained-models/model200_dark_faces.pth"

    image_paths, true_slangs, unique_slangs, unique_descriptions = load_dataset(json_path)
    # load_dataset already prefixes every path with an absolute Windows
    # location, so on Windows this join returns the paths unchanged.
    image_paths = list(map(lambda img_path: os.path.join(base_dir, img_path), image_paths))

    # Only the metadata file whose name contains "unbalanced" is routed
    # through Zero-DCE; the enhanced copies replace the original paths.
    if json_path.lower().find("unbalanced") != -1:
        print("[INFO] Unbalanced dataset detected — applying Zero-DCE enhancement...")
        image_paths = enhance_images_with_zero_dce(image_paths, model_weights_path)

    # Inference returns descriptions, so keep a reverse map back to labels.
    desc_to_slang = dict(zip(unique_descriptions, unique_slangs))
    text_emb = encode_text_prompts(model, unique_descriptions)

    print("[INFO] Running inference...")
    predicted_descriptions = run_inference_batch(
        image_paths,
        model,
        text_emb,
        unique_descriptions,
        batch_size=5,
    )

    print("[INFO] Evaluating predictions...")
    evaluate(true_slangs, predicted_descriptions, desc_to_slang, unique_slangs)

[INFO] Unbalanced dataset detected — applying Zero-DCE enhancement...
[INFO] Running inference...
[INFO] Evaluating predictions...

Top-1 Evaluation:
Accuracy: 56.03%

Label: Boxelder
  Precision: 0.917
  Recall:    0.786
  F1 Score:  0.846

Label: Fragrant Sumac
  Precision: 0.267
  Recall:    0.300
  F1 Score:  0.282

Label:  Poison Oak
  Precision: 0.462
  Recall:    0.480
  F1 Score:  0.471

Label:  Poison Ivy
  Precision: 0.574
  Recall:    0.780
  F1 Score:  0.661

Label: Poison Sumac
  Precision: 0.710
  Recall:    0.440
  F1 Score:  0.543

Confusion Matrix:
[[33  2  2  4  1]
 [ 1 12 23  3  1]
 [ 0  5 24 19  2]
 [ 2  2  2 39  5]
 [ 0 24  1  3 22]]
----------------------------------------

Top-2 Evaluation:
Accuracy: 81.90%

Label: Boxelder
  Precision: 0.974
  Recall:    0.881
  F1 Score:  0.925

Label: Fragrant Sumac
  Precision: 0.828
  Recall:    0.600
  F1 Score:  0.696

Label:  Poison Oak
  Precision: 0.704
  Recall:    0.760
  F1 Score:  0.731

Label:  Poison Ivy
  Precisi