# SETUP
Before running anything:
1. Get an OpenAI API key and store it as "OPENAI_API_KEY" in a .env file (create the file in the working directory)
2. Get the Kaggle API JSON file (should be called Kaggle Settings.json) and store it in a ".kaggle" folder (create the folder in the working directory)

Running code below will:
1. Download the 2 GB Dermnet dataset
2. Create a JSON file called "diagnosis_mapping.json" that contains each image path and diagnosis
3. Create a persistent vector store collection called "dermnet_collection" using IRISVector
4. Store the diagnosis text embeddings in the vector store, with each image path and diagnosis kept as metadata


#### Just run this first; you don't need to edit anything

In [None]:
import os
import getpass
import json
import re
from dotenv import load_dotenv

# Read variables from the .env file; override=True lets them replace values
# that are already set in the process environment.
load_dotenv(override=True)

# No key configured (missing or empty)? Ask for it interactively instead.
if os.getenv("OPENAI_API_KEY", "") == "":
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [5]:
# Download the dataset from Kaggle (ensure kagglehub is installed and configured)
import kagglehub
# dataset_path is the local directory kagglehub reports for the "shubhamgoel27/dermnet"
# dataset; later cells walk this directory to find the image files.
dataset_path = kagglehub.dataset_download("shubhamgoel27/dermnet")
print("Dataset downloaded to:", dataset_path)

Dataset downloaded to: C:\Users\tanhu\.cache\kagglehub\datasets\shubhamgoel27\dermnet\versions\1


In [6]:
# Step 1: Generate Diagnosis Mapping JSON

def extract_diagnosis(filename):
    """
    Turn an image filename into a human-readable diagnosis label.

    The extension is dropped, any leading run of digits/underscores/hyphens is
    removed, and the rest is split on runs of digits/underscores/hyphens. Each
    non-blank piece is stripped and capitalized (first letter upper-case, the
    rest lower-case), then the pieces are joined with single spaces.

    Example: "acne-scar-2.jpg" -> "Acne Scar". A name made only of digits and
    separators yields an empty string.
    """
    stem, _extension = os.path.splitext(filename)
    without_prefix = re.sub(r'^[\d_\-]+', '', stem)

    words = []
    for token in re.split(r'[\d_\-]+', without_prefix):
        token = token.strip()
        if token:
            words.append(token.capitalize())
    return ' '.join(words)

def download_and_generate_json(root_dir, output_file="diagnosis_mapping.json"):
    """
    Scan ``root_dir`` recursively and write a JSON list mapping image paths to diagnoses.

    Despite its name, this function does not download anything: it only walks
    files that already exist under ``root_dir``.

    Args:
        root_dir: Directory to search for ``.png``, ``.jpg`` and ``.jpeg`` files
            (extension match is case-insensitive).
        output_file: Path of the JSON file to write. Defaults to
            "diagnosis_mapping.json" in the current working directory.

    Returns:
        The ``output_file`` path that was written. The file holds a list of
        ``{"path": ..., "diagnosis": ...}`` objects, one per image found.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    data = []
    # Recursively walk through dataset directories
    for root, dirs, files in os.walk(root_dir):
        # Sorting dirs in place steers os.walk's traversal, and sorting files
        # fixes the order within a directory, so the JSON is identical across
        # runs instead of depending on filesystem listing order.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(image_extensions):
                data.append({
                    "path": os.path.join(root, file),
                    "diagnosis": extract_diagnosis(file)
                })
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)
    print(f"Diagnosis mapping saved to {output_file}")
    return output_file

# Build the mapping from the downloaded dataset; json_path is the path of the JSON file that was written.
json_path = download_and_generate_json(dataset_path)

Diagnosis mapping saved to diagnosis_mapping.json


In [7]:
# Step 2: Create Document objects from the JSON data

from langchain.docstore.document import Document

def create_documents_from_json(json_path):
    """
    Load the diagnosis mapping JSON and wrap each entry in a Document.

    Args:
        json_path: Path to a JSON file containing a list of entries, each with
            at least a 'diagnosis' key.

    Returns:
        A list with one Document per entry, in file order.
    """
    with open(json_path, 'r') as handle:
        records = json.load(handle)
    # The diagnosis text is what gets embedded; the whole entry (including the
    # image path) is attached as metadata.
    return [
        Document(page_content=record['diagnosis'], metadata=record)
        for record in records
    ]

# Load the mapping file into Document objects and report how many were created.
docs = create_documents_from_json(json_path)
print(f"Created {len(docs)} documents.")

Created 19559 documents.


In [8]:
# Step 3: Set up IRISVector with OpenAI Embeddings

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_iris import IRISVector

embeddings = OpenAIEmbeddings()

# IRIS connection parameters. Each one can be overridden through an environment
# variable (for example in the .env file) so real credentials never need to be
# written into the notebook; the fallbacks are the demo values used previously.
username = os.getenv('IRIS_USERNAME', 'demo')
password = os.getenv('IRIS_PASSWORD', 'demo')
hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
port = os.getenv('IRIS_PORT', '1972')
namespace = os.getenv('IRIS_NAMESPACE', 'USER')
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"

# Choose a collection name (avoid dots since under the hood it becomes a SQL table)
COLLECTION_NAME = "dermnet_collection"

# Create (or update) the IRISVector persistent store from the documents.
# NOTE(review): every run of this cell embeds all documents again; whether
# re-running adds duplicate rows to the collection depends on IRISVector - confirm.
db = IRISVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)
# Show where we connected, with the password masked so the secret is not
# captured in the saved notebook output.
print("Connection String:", f"iris://{username}:***@{hostname}:{port}/{namespace}")

# Verify the number of documents stored
ids = db.get().get("ids", [])
print(f"Number of docs in vector store: {len(ids)}")

  embeddings = OpenAIEmbeddings()


Number of docs in vector store: 19559


In [9]:
# Step 4: Query the vector store

# IRISVector is searched with a plain text string; here we look up the three
# stored entries most similar to "Acne".
query_text = "Acne"
results = db.similarity_search(query_text, k=3)

print("Top similar results:")
for result in results:
    # Each hit is a Document; its metadata carries the original image path and
    # diagnosis, with "N/A" shown when a key is absent.
    metadata = result.metadata
    diagnosis, path = (metadata.get(key, "N/A") for key in ("diagnosis", "path"))
    print(f"Diagnosis: {diagnosis} | Path: {path}")

Top similar results:
Diagnosis: Acne | Path: C:\Users\tanhu\.cache\kagglehub\datasets\shubhamgoel27\dermnet\versions\1\train\Psoriasis pictures Lichen Planus and related diseases\07acne06270532.jpg
Diagnosis: Acne | Path: C:\Users\tanhu\.cache\kagglehub\datasets\shubhamgoel27\dermnet\versions\1\train\Acne and Rosacea Photos\07Acne081101.jpg
Diagnosis: Acne Scar | Path: C:\Users\tanhu\.cache\kagglehub\datasets\shubhamgoel27\dermnet\versions\1\test\Acne and Rosacea Photos\acne-scar-2.jpg


### Edit code below and run

In [None]:
# DB_path = "dermnetVectorDB"
# data_path = "./diagnosis_mapping.json"

# create_and_add_to_DB(DB_path, data_path)

### To check if it works, run a vector search query, make edits accordingly

In [None]:
# multimodal_ef = OpenCLIPEmbeddingFunction() # multimodal embedding function
# image_loader = ImageLoader() # multimodal data loader
# # client = chromadb.Client() # non-persistent DB
# client = chromadb.PersistentClient(path=DB_path) # edit to your own path

# multimodalDB = client.get_or_create_collection(name="multimodalDB", embedding_function= multimodal_ef, data_loader=image_loader) # multimodal collection

# # Query/search n most similar items
# results = multimodalDB.query(
#     query_uris=["/Users/shinherng/Downloads/skinCond9.jpg"], # edit to your own test image
#     n_results=3
# )

# results["metadatas"][0][0]["Diagnosis"] # to access top 1 similar diagnosis