# SETUP
Before running anything:
1. Get openAI API key and store as "OPENAI_API_KEY" in .env file (create in working directory)
2. Get Kaggle API json file (should be called Kaggle Settings.json) and store in ".kaggle" folder (create in working directory)

Running code below will:
1. Download 2gb Dermnet dataset
2. Create a json file called "diagnosis_mapping.json" that contains each image path and diagnosis
3. Create a persistent vectorDB called " using chroma
4. Store image and metadata embeddings in IRISVectorDB


#### Just run this first, dont need to edit anything

In [1]:
import os
import getpass
import json
import re
from dotenv import load_dotenv
from PIL import Image

load_dotenv(override=True)

if not os.environ.get("OPENAI_API_KEY"): 
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [2]:
# # Download the dataset from Kaggle (ensure kagglehub is installed and configured)
# import kagglehub
# dataset_path = kagglehub.dataset_download("shubhamgoel27/dermnet")
# print("Dataset downloaded to:", dataset_path)

dataset_path = "../data/dermnet_data"

In [3]:
# --- Create Documents from Image Files ---
def extract_diagnosis(filename):
    """
    Extracts a diagnosis string from a filename by removing leading numbers/special characters
    and capitalizing the remaining words.
    """
    name = os.path.splitext(filename)[0]
    cleaned = re.sub(r'^[\d_\-]+', '', name)
    parts = re.split(r'[\d_\-]+', cleaned)
    diagnosis = ' '.join(part.strip().capitalize() for part in parts if part.strip())
    return diagnosis

def create_documents_from_images(root_dir):
    """
    Walk through the dataset directory and create a Document for each image.
    The document's page_content is set to the image file path and its metadata contains the diagnosis.
    """
    from langchain.docstore.document import Document
    docs = []
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                full_path = os.path.join(root, file)
                diagnosis = extract_diagnosis(file)
                doc = Document(page_content=full_path, metadata={"diagnosis": diagnosis, "path": full_path})
                docs.append(doc)
    return docs

docs = create_documents_from_images(dataset_path)
print(f"Created {len(docs)} image documents.")

Created 5 image documents.


In [4]:
import time
from langchain_experimental.open_clip import OpenCLIPEmbeddings
from langchain_iris import IRISVector
import os

# Initialize the image embedding function with your chosen model and checkpoint.
multimodal_ef = OpenCLIPEmbeddings(model_name="ViT-g-14", checkpoint="laion2b_s34b_b88k")

# Define your IRIS connection parameters.
username = 'demo'
password = 'demo'
hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
port = '1972'
namespace = 'USER'
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"

# Choose a collection name (avoid periods as it becomes a SQL table name).
COLLECTION_NAME = "dermnet_multimodal"

start = time.time()

# Create or update the IRISVector persistent vector store with the small batch of image documents.
db = IRISVector.from_documents(
    embedding=multimodal_ef,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

elapsed = time.time() - start
print("Vector store created with image embeddings for the small batch.")
ids = db.get().get("ids", [])
print(f"Number of docs in vector store: {len(ids)}")
print(f"Time taken: {elapsed:.2f} seconds")

# def batch(iterable, n=1):
#     """Yield successive n-sized batches from iterable."""
#     for i in range(0, len(iterable), n):
#         yield iterable[i:i + n]

# # Process docs in batches of 1000
# batch_size = 1000

# # For the first batch, create the store.
# for i, batch_docs in enumerate(batch(docs, batch_size)):
#     print(f"Processing batch {i+1} with {len(batch_docs)} documents...")
#     if i == 0:
#         # Create the store for the first batch.
#         db = IRISVector.from_documents(
#             embedding=multimodal_ef,
#             documents=batch_docs,
#             collection_name=COLLECTION_NAME,
#             connection_string=CONNECTION_STRING,
#         )
#     else:
#         # For subsequent batches, add documents to the existing store.
#         db.add_documents(batch_docs)
#     print(f"Batch {i+1} processed.")

# # After processing, you can check the total number of documents.
# elapsed = time.time() - start
# ids = db.get().get("ids", [])
# print(f"Total number of docs in vector store: {len(ids)}")
# print(f"Time taken: {elapsed:.2f} seconds")

Vector store created with image embeddings for the small batch.
Number of docs in vector store: 5
Time taken: 5.67 seconds


In [5]:
# --- Query the Vector Store Using an Image ---
# Update this path to point to your query image.
query_image_path = "../data/test.jpg"
# Embed the query image. Note: embed_image expects a list of image URIs.
query_embedding = multimodal_ef.embed_image([query_image_path])[0]

# Perform a similarity search using the query image's embedding vector.
results = db.similarity_search_by_vector(query_embedding, k=3)

print("Top similar results:")
for result in results:
    diagnosis = result.metadata.get("diagnosis", "N/A")
    path = result.metadata.get("path", "N/A")
    print(f"Diagnosis: {diagnosis} | Path: {path}")

Top similar results:
Diagnosis: Vitiligo | Path: ../data/dermnet_data\Vitiligo\vitiligo-0003.jpg
Diagnosis: Allergic Contact Dermatitis | Path: ../data/dermnet_data\Eczema\allergic-contact-dermatitis-0003.jpg
Diagnosis: Zoster | Path: ../data/dermnet_data\Shingles\zoster-13.jpg


In [1]:
# ## REQUERY
# from langchain_experimental.open_clip import OpenCLIPEmbeddings
# from langchain_iris import IRISVector
# import os

# # Initialize the embedding function
# multimodal_ef = OpenCLIPEmbeddings(model_name="ViT-g-14", checkpoint="laion2b_s34b_b88k")

# # Set up your connection parameters again
# username = 'demo'
# password = 'demo'
# hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
# port = '1972'
# namespace = 'USER'
# CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"
# COLLECTION_NAME = "dermnet_multimodal"

# # Load the persistent vector store
# db = IRISVector.from_documents(
#     embedding=multimodal_ef,
#     documents=[],  # Passing an empty list so that it doesn't re-embed
#     collection_name=COLLECTION_NAME,
#     connection_string=CONNECTION_STRING,
# )

# # --- Query the Vector Store Using an Image ---
# query_image_path = "../data/test.jpg"
# query_embedding = multimodal_ef.embed_image([query_image_path])[0]

# results = db.similarity_search_by_vector(query_embedding, k=3)

# print("Top similar results:")
# for result in results:
#     diagnosis = result.metadata.get("diagnosis", "N/A")
#     path = result.metadata.get("path", "N/A")
#     print(f"Diagnosis: {diagnosis} | Path: {path}")


Top similar results:
Diagnosis: Rhinophyma | Path: C:\Users\tanhu\.cache\kagglehub\datasets\shubhamgoel27\dermnet\versions\1\test\Acne and Rosacea Photos\07Rhinophyma1.jpg
Diagnosis: Rhinophyma | Path: C:\Users\tanhu\.cache\kagglehub\datasets\shubhamgoel27\dermnet\versions\1\test\Acne and Rosacea Photos\07Rhinophyma1.jpg
Diagnosis: Rhinophyma | Path: C:\Users\tanhu\.cache\kagglehub\datasets\shubhamgoel27\dermnet\versions\1\test\Acne and Rosacea Photos\07Rhinophyma1.jpg
