# SETUP
Before running anything:
1. Get an OpenAI API key and store it as "OPENAI_API_KEY" in a .env file (create the file in the working directory)
2. Get the Kaggle API json file (should be called Kaggle Settings.json) and store it in a ".kaggle" folder (create the folder in the working directory)

Running code below will:
1. Download 2gb Dermnet dataset
2. Create a json file called "diagnosis_mapping.json" that contains each image path and diagnosis
3. Create a persistent vectorDB called "dermnetVectorDB" using Chroma
4. Store image and metadata embeddings in vectorDB


#### Just run this first; you don't need to edit anything

In [None]:
import os
import json
import re
import kagglehub
import chromadb
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from chromadb.utils.data_loaders import ImageLoader

def extract_diagnosis(filename):
    """Derive a human-readable diagnosis label from an image file name.

    The extension is dropped, any leading run of digits, underscores or
    hyphens is removed, and the remainder is split on runs of those same
    characters. Each non-blank piece is stripped and passed through
    ``str.capitalize`` (first letter upper-case, the rest lower-case), then
    the pieces are joined with single spaces.

    Args:
        filename: File name (not a full path) such as ``"acne-cystic-12.jpg"``.

    Returns:
        The label, e.g. ``"Acne Cystic"``; an empty string when nothing but
        digits/separators remains.
    """
    stem, _extension = os.path.splitext(filename)

    # Drop numbering / separators that prefix the actual diagnosis text.
    without_prefix = re.sub(r'^[\d_\-]+', '', stem)

    words = []
    for chunk in re.split(r'[\d_\-]+', without_prefix):
        token = chunk.strip()
        if token:  # skip empty pieces left by trailing separators
            words.append(token.capitalize())

    return ' '.join(words)

def create_and_add_to_DB(DB_path, data_path, batch_size=64):
    """Embed every image listed in a JSON mapping into a persistent Chroma collection.

    Opens (or creates) a persistent Chroma database at ``DB_path``, gets or
    creates the collection named ``"multimodalDB"`` configured with the
    OpenCLIP embedding function and the image data loader, and adds one
    record per entry of the JSON file. Each record uses the image path as
    both its id and its URI, and stores ``{"Diagnosis", "Path"}`` metadata.

    Records are added in batches rather than one call per image, so the
    embedding function and the database write run once per batch.

    Args:
        DB_path: Directory of the persistent Chroma database.
        data_path: Path to a JSON file holding a list of objects with
            ``"path"`` and ``"diagnosis"`` keys.
        batch_size: Number of records sent per ``add`` call. Paths are used
            as ids, so they are assumed to be unique within the mapping.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    multimodal_ef = OpenCLIPEmbeddingFunction()  # multimodal embedding function
    image_loader = ImageLoader()  # loads the images referenced by the URIs
    client = chromadb.PersistentClient(path=DB_path)  # persistent (on-disk) DB

    multimodalDB = client.get_or_create_collection(
        name="multimodalDB",
        embedding_function=multimodal_ef,
        data_loader=image_loader,
    )

    # Load the image-path -> diagnosis mapping.
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Slice the mapping into fixed-size batches; an empty mapping adds nothing.
    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        paths = [entry['path'] for entry in batch]
        multimodalDB.add(
            metadatas=[
                {"Diagnosis": entry['diagnosis'], "Path": entry['path']}
                for entry in batch
            ],
            uris=paths,
            ids=paths,
        )
    print("All images and data embedded into vectorDB ")

def download_and_generate_json():
    """Download the Dermnet dataset and write an image-path/diagnosis mapping.

    Fetches the ``shubhamgoel27/dermnet`` dataset through kagglehub, walks the
    returned directory, and for every ``.png`` / ``.jpg`` / ``.jpeg`` file
    (case-insensitive) records its full path together with the diagnosis
    derived from the file name by ``extract_diagnosis``. The records are
    written as a JSON list to ``diagnosis_mapping.json`` (relative path).
    """
    # First run: let kagglehub download the dataset and report where it is.
    root_dir = kagglehub.dataset_download("shubhamgoel27/dermnet")
    print("root_dir (change root_dir in commented code above for yourself):", root_dir)
    # Not the first run? Replace the download above with your own local copy, e.g.:
    # root_dir = "/Users/shinherng/.cache/kagglehub/datasets/shubhamgoel27/dermnet/versions/1/train"

    image_suffixes = ('.png', '.jpg', '.jpeg')
    records = [
        {
            "path": os.path.join(folder, image_name),
            "diagnosis": extract_diagnosis(image_name),
        }
        for folder, _subdirs, image_names in os.walk(root_dir)
        for image_name in image_names
        if image_name.lower().endswith(image_suffixes)
    ]

    with open("diagnosis_mapping.json", 'w') as out:
        json.dump(records, out, indent=2)


# Run the download and mapping step now; it writes "diagnosis_mapping.json"
# (a relative path, so it lands in the current working directory).
download_and_generate_json()

### Edit code below and run

In [None]:
# Directory of the persistent Chroma DB; also reused by the query cell below.
DB_path = "dermnetVectorDB" # edit if you want
# JSON mapping of image paths to diagnoses (the file written by
# download_and_generate_json is named "diagnosis_mapping.json").
data_path = "/Users/shinherng/Documents/GitHub/dermacare/diagnosis_mapping.json" # EDIT THIS TO YOUR OWN PATH

# Embed every image listed in the mapping and store it in the collection.
create_and_add_to_DB(DB_path, data_path)

### To check that it works, run a vector search query (edit the paths below to your own)

In [None]:
# Sanity check: reopen the collection and look up the stored images most
# similar to a test image.
multimodal_ef = OpenCLIPEmbeddingFunction() # multimodal embedding function
image_loader = ImageLoader() # multimodal data loader
# client = chromadb.Client() # non-persistent DB
client = chromadb.PersistentClient(path=DB_path) # DB_path comes from the cell above; edit to your own path

# Same collection name and configuration as used in create_and_add_to_DB.
multimodalDB = client.get_or_create_collection(name="multimodalDB", embedding_function= multimodal_ef, data_loader=image_loader) # multimodal collection

# Query/search n most similar items
results = multimodalDB.query(
    query_uris=["/Users/shinherng/Downloads/skinCond9.jpg"], # edit to your own test image
    n_results=3
)

# First [0] selects the (only) query image, second [0] its first result;
# presumably results are ordered most-similar first — confirm against Chroma docs.
results["metadatas"][0][0]["Diagnosis"] # to access top 1 similar diagnosis