In [3]:
# %% [markdown]
# # 01 – Data acquisition 📦  (v2)
#
# 0. Imports & paths
import os, json, zipfile, shutil, subprocess, time, re, requests, itertools
from pathlib import Path
from typing import List
from tqdm import tqdm


# Data roots used throughout the notebook: raw downloads, bucketed images,
# text tables and medication dumps.
RAW = Path("/mnt/ssd1/saumia/data/raw")
IMG = Path("/mnt/ssd1/saumia/data/images")
TEXT = Path("/mnt/ssd1/saumia/data/text")
MEDS = Path("/mnt/ssd1/saumia/data/meds")

# Create each root (and any missing parents); already-existing folders are fine.
for _data_dir in (RAW, IMG, TEXT, MEDS):
    _data_dir.mkdir(parents=True, exist_ok=True)

# File suffixes (compared lower-cased) that count as images.
IMG_EXT = {".png", ".jpg", ".jpeg", ".bmp"}

def unzip_here(z: Path):
    """Extract the ZIP archive ``z`` into ``RAW`` and then delete the archive.

    If opening or extracting the archive raises, the exception propagates and
    the archive file is left in place.
    """
    archive = zipfile.ZipFile(z)
    try:
        archive.extractall(RAW)
    finally:
        # Always release the file handle, mirroring a ``with`` block.
        archive.close()
    z.unlink()

import subprocess, zipfile
from pathlib import Path

def kaggle_dl(slug: str) -> None:
    """Download a Kaggle dataset into ``RAW`` and unpack it there.

    Runs ``kaggle datasets download -d <slug> -p RAW --force``. If a file named
    ``<dataset>.zip`` exists in ``RAW`` afterwards it is extracted into ``RAW``
    and then deleted; the archive is deleted even when it turns out not to be
    a valid ZIP. If no such file exists the download is assumed to have been a
    direct (non-archive) file and nothing more is done.

    Args:
        slug: Dataset identifier of the form ``"owner/dataset"``.

    Raises:
        ValueError: If ``slug`` is not exactly ``owner/dataset`` with both
            parts non-empty (checked before any download is attempted).
        subprocess.CalledProcessError: If the ``kaggle`` CLI exits non-zero.
    """
    # Validate up front so a typo fails with a readable message instead of
    # "not enough values to unpack" or an opaque CLI error.
    parts = slug.split("/")
    if len(parts) != 2 or not all(parts):
        raise ValueError(
            f"Kaggle slug must look like 'owner/dataset', got {slug!r}"
        )
    zip_name = f"{parts[1]}.zip"
    target = RAW / zip_name

    print(f"◼ Kaggle {slug}")
    subprocess.run(
        ["kaggle", "datasets", "download", "-d", slug, "-p", str(RAW), "--force"],
        check=True,
    )

    if not target.exists():
        print(f"  ↳ No {zip_name} found; assuming direct file download")
        return

    # Only unzip if a proper ZIP was downloaded
    try:
        with zipfile.ZipFile(target, "r") as zf:
            print(f"  ↳ Extracting {zip_name} …")
            zf.extractall(RAW)
    except zipfile.BadZipFile:
        print(f"⚠ Warning: {zip_name} is not a valid ZIP, skipping unzip")
    finally:
        # The downloaded archive is removed whether or not extraction worked.
        target.unlink()


def move_imgs(folder: Path):
    """Move every image file under ``folder`` into ``IMG/<class>/``.

    The class name is the image's immediate parent directory name, lower-cased
    and with spaces replaced by underscores. An image is any regular file
    whose lower-cased suffix is in ``IMG_EXT``. Directories are created and
    files moved through ``sudo``; a file already present under the same name
    in the destination is overwritten by ``mv``.

    Args:
        folder: Directory that is searched recursively.

    Raises:
        subprocess.CalledProcessError: If a ``sudo mkdir`` / ``sudo mv`` fails.
    """
    # Snapshot the matches first so files are not moved out of the tree while
    # it is still being walked, and skip directories that merely have an
    # image-like suffix.
    images = [
        p for p in folder.rglob("*")
        if p.is_file() and p.suffix.lower() in IMG_EXT
    ]

    created = set()  # class directories already created during this call
    for p in images:
        cls = p.parent.name.replace(" ", "_").lower()
        dst = IMG / cls
        if dst not in created:
            # One mkdir per class instead of one per image.
            subprocess.run(["sudo", "mkdir", "-p", str(dst)], check=True)
            created.add(dst)
        subprocess.run(["sudo", "mv", str(p), str(dst / p.name)], check=True)

In [21]:
# 1 – Kaggle datasets (images + tables)
# Each entry is an "owner/dataset" slug handed to kaggle_dl() in order.
KAGGLE_LIST = [
    "shubhamgoel27/dermnet",
    "ismailpromus/skin-diseases-image-dataset",
    "subirbiswas19/skin-disease-dataset",
    "henriqueolivoantonio/allergy-degrees",
    "boltcutters/food-allergens-and-allergies",
    "niyarrbarman/symptom2disease",
]
for slug in KAGGLE_LIST:
    kaggle_dl(slug)

# bucket images → images/
# Only folders that actually exist under RAW are processed.
for src in map(RAW.joinpath, (
    "dermnet",                # shubhamgoel27
    "Skin diseases",          # ismailpromus
    "skin disease dataset",   # subirbiswas
)):
    if not src.exists():
        continue
    move_imgs(src)

# copy any CSV/JSON tables → text/
# Files found anywhere below RAW land flat in TEXT under their own file name.
for p in RAW.rglob("*"):
    if p.suffix.lower() not in {".csv", ".json"}:
        continue
    subprocess.run(["sudo", "cp", str(p), str(TEXT / p.name)], check=True)

◼ Kaggle shubhamgoel27/dermnet


Dataset URL: https://www.kaggle.com/datasets/shubhamgoel27/dermnet
License(s): Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)
Downloading dermnet.zip to ../mnt/ssd1/saumia/data/raw


100%|██████████████████████████████████████| 1.72G/1.72G [00:01<00:00, 992MB/s]



  ↳ Extracting dermnet.zip …
◼ Kaggle ismailpromus/skin-diseases-image-dataset
Dataset URL: https://www.kaggle.com/datasets/ismailpromus/skin-diseases-image-dataset
License(s): copyright-authors
Downloading skin-diseases-image-dataset.zip to ../mnt/ssd1/saumia/data/raw


100%|██████████████████████████████████████| 5.19G/5.19G [00:05<00:00, 940MB/s]



  ↳ Extracting skin-diseases-image-dataset.zip …
◼ Kaggle subirbiswas19/skin-disease-dataset
Dataset URL: https://www.kaggle.com/datasets/subirbiswas19/skin-disease-dataset
License(s): CC0-1.0
Downloading skin-disease-dataset.zip to ../mnt/ssd1/saumia/data/raw

  ↳ Extracting skin-disease-dataset.zip …


100%|██████████████████████████████████████| 17.3M/17.3M [00:00<00:00, 848MB/s]


◼ Kaggle henriqueolivoantonio/allergy-degrees
Dataset URL: https://www.kaggle.com/datasets/henriqueolivoantonio/allergy-degrees
License(s): ODbL-1.0
Downloading allergy-degrees.zip to ../mnt/ssd1/saumia/data/raw

  ↳ Extracting allergy-degrees.zip …
◼ Kaggle boltcutters/food-allergens-and-allergies


100%|██████████████████████████████████████| 13.6M/13.6M [00:00<00:00, 820MB/s]


Dataset URL: https://www.kaggle.com/datasets/boltcutters/food-allergens-and-allergies
License(s): copyright-authors
Downloading food-allergens-and-allergies.zip to ../mnt/ssd1/saumia/data/raw

  ↳ Extracting food-allergens-and-allergies.zip …
◼ Kaggle niyarrbarman/symptom2disease


100%|█████████████████████████████████████| 2.17k/2.17k [00:00<00:00, 2.32MB/s]


Dataset URL: https://www.kaggle.com/datasets/niyarrbarman/symptom2disease
License(s): CC0-1.0
Downloading symptom2disease.zip to ../mnt/ssd1/saumia/data/raw

  ↳ Extracting symptom2disease.zip …


100%|██████████████████████████████████████| 43.6k/43.6k [00:00<00:00, 103MB/s]


In [None]:
# %% [markdown]
# ### Fetch Symptom2Disease from Kaggle

# %%
import subprocess, zipfile, shutil
from pathlib import Path

# Working folders; TEXT is created if missing (its parent must already exist).
RAW = Path("/mnt/ssd1/saumia/data/raw")
TEXT = Path("/mnt/ssd1/saumia/data/text")
TEXT.mkdir(exist_ok=True)

# Kaggle identifier of the dataset to fetch
slug = "niyarrbarman/symptom2disease"
print(f"➜ Kaggle {slug}")

# Download into RAW with the Kaggle CLI; a non-zero exit status raises.
subprocess.run(
    ["kaggle", "datasets", "download", "-d", slug, "-p", str(RAW)],
    check=True,
)

# Unpack every matching archive into RAW and delete it once extracted.
for z in RAW.glob("symptom2disease*.zip"):
    with zipfile.ZipFile(z) as zf:
        zf.extractall(RAW)
    z.unlink()

# Locate the CSV. The pattern leaves out the leading "S" and "D", so it
# accepts either capitalisation of those two letters (it is not a fully
# case-insensitive match).
csv_candidates = [*RAW.glob("*ymptom2*isease*.csv")]
if len(csv_candidates) == 0:
    raise FileNotFoundError("❌ No Symptom2Disease CSV file found in RAW folder.")

# First match wins when several files fit the pattern.
csv = csv_candidates[0]

# Remove any previous copy before copying; per the original note this avoids
# an overwrite problem on case-insensitive file systems.
target = TEXT / "symptom2disease.csv"
if target.exists():
    target.unlink()

subprocess.run(["sudo", "cp", str(csv), str(target)], check=True)
print(f"✅ {csv.name} copied to data/text/symptom2disease.csv")


➜ Kaggle niyarrbarman/symptom2disease


Dataset URL: https://www.kaggle.com/datasets/niyarrbarman/symptom2disease
License(s): CC0-1.0
Downloading symptom2disease.zip to /mnt/ssd1/saumia/data/raw

✅ Symptom2Disease.csv copied to data/text/symptom2disease.csv


100%|█████████████████████████████████████| 43.6k/43.6k [00:00<00:00, 99.4MB/s]


In [43]:
# 4 – ISIC Archive (skin-lesion images)
# Credentials are read from the environment; without both, the step is skipped.
ISIC_API = "https://isic-archive.com/api/v2"
ISIC_TIMEOUT = 60  # seconds per HTTP request, so a stalled connection cannot hang the cell

user = os.getenv("ISIC_USERNAME")
pwd = os.getenv("ISIC_PASSWORD")
if user and pwd:
    print("◼ ISIC Archive login…")
    sess = requests.Session()
    r = sess.post(
        f"{ISIC_API}/login",
        json={"username": user, "password": pwd},
        timeout=ISIC_TIMEOUT,
    )
    r.raise_for_status()
    token = r.json()["authToken"]["token"]

    HDR = {"Authorization": f"token {token}"}
    # Ask for up to 1000 image records, sorted by name.
    params = {"limit": 1000, "offset": 0, "sort": "name", "sortdir": 1}
    listing = sess.get(
        f"{ISIC_API}/image", headers=HDR, params=params, timeout=ISIC_TIMEOUT
    )
    # Fail here with the HTTP error instead of iterating over an error payload.
    listing.raise_for_status()
    items = listing.json()

    print(f"  ↳ downloading {len(items)} thumbnails …")
    failed = 0
    for it in tqdm(items):
        url = it['_links']['thumbnail']  # original note: 224×224 JPEG
        resp = sess.get(url, headers=HDR, timeout=ISIC_TIMEOUT)
        if not resp.ok:
            # Do not save an HTTP error body under a .jpg name.
            failed += 1
            continue
        img = resp.content

        # The label may be absent or null; fall back to "unknown" instead of
        # raising KeyError/TypeError halfway through the download.
        clinical = (it.get('meta') or {}).get('clinical') or {}
        label = clinical.get('benign_malignant') or "unknown"
        # Keep the class a single path component: a "/" or other unusual
        # character in the label would otherwise break the mkdir below.
        cls = "isic_" + re.sub(r"[^\w.-]+", "_", str(label))

        (IMG/cls).mkdir(parents=True, exist_ok=True)
        with open(IMG/cls/f"{it['_id']}.jpg", "wb") as f:
            f.write(img)
    if failed:
        print(f"⚠ {failed} thumbnails could not be downloaded and were skipped")
else:
    print("⚠ Skip ISIC (set ISIC_USERNAME/PASSWORD env-vars to enable)")

⚠ Skip ISIC (set ISIC_USERNAME/PASSWORD env-vars to enable)


In [46]:
from pathlib import Path
import shutil, hashlib, cv2

RAW      = Path("/mnt/ssd1/saumia/data/raw")
IMG_DIR  = Path("/mnt/ssd1/saumia/data/images")
IMG_EXT  = {".jpg", ".jpeg", ".png", ".bmp"}
seen     = set()   # SHA-1 digests of decoded pixel data already copied
taken    = set()   # destination paths already written during this run
made     = set()   # class directories already created during this run

# Source folders to scan
parts = [
    RAW / "mendeley_images",
    RAW / "mendeley_images1",
    RAW / "mendeley_images2",
]

for part in parts:
    print(f"🔍 Scanning {part.name} ...")
    for p in part.rglob("*"):
        if p.suffix.lower() not in IMG_EXT:
            continue
        img = cv2.imread(str(p))
        # Skip files OpenCV cannot decode and images whose shorter side is < 50 px.
        if img is None or min(img.shape[:2]) < 50:
            continue
        # De-duplicate on decoded pixel content rather than on file name.
        h = hashlib.sha1(img).hexdigest()
        if h in seen:
            continue
        seen.add(h)
        cls = p.parent.name.replace(" ", "_").lower()
        dst = IMG_DIR / cls
        if dst not in made:
            # One mkdir per class directory instead of one per image.
            subprocess.run(["sudo", "mkdir", "-p", str(dst)], check=True)
            made.add(dst)
        out = dst / p.name
        if out in taken:
            # A different image with the same file name was already copied to
            # this class in this run; add a hash fragment so it is kept
            # instead of silently overwritten by cp.
            out = dst / f"{p.stem}_{h[:8]}{p.suffix}"
        taken.add(out)
        subprocess.run(["sudo", "cp", str(p), str(out)], check=True)

print("✅ Done: All Mendeley images copied to data/images/")


🔍 Scanning mendeley_images ...


🔍 Scanning mendeley_images1 ...
🔍 Scanning mendeley_images2 ...
✅ Done: All Mendeley images copied to data/images/


In [None]:
# 6 – openFDA drug-label dumps (medication look-ups)
# Fetches PAGES pages of 1000 records each and stores every page as
# MEDS/drug_label_NN.json. Pages whose file already exists are not re-downloaded.
FDA = "https://api.fda.gov/drug/label.json"
PAGES = 5
for skip in range(0, PAGES*1000, 1000):
    fname = MEDS/f"drug_label_{skip//1000:02d}.json"
    if fname.exists():
        continue
    print(f"◼ openFDA page {skip//1000}")
    r = requests.get(FDA, params={"limit":1000, "skip":skip}, timeout=60)
    r.raise_for_status()
    # The file is written through `sudo tee`. tee also copies its input to
    # stdout, so send that to DEVNULL; otherwise every page of JSON is echoed
    # in full alongside the progress messages.
    subprocess.run(
        ["sudo", "tee", str(fname)],
        input=r.text.encode("utf-8"),
        stdout=subprocess.DEVNULL,
        check=True,
    )
    time.sleep(.3)  # short pause between requests

In [13]:
import pandas as pd
from pathlib import Path
import shutil


# Normalise every spreadsheet/CSV sitting directly in RAW and store the result
# as CSV in TEXT; JSON files are copied across unchanged.
for f in RAW.iterdir():
    ext = f.suffix.lower()
    if ext in (".xlsx", ".xls", ".xlsm", ".csv"):
        print(f"→ Loading {f.name}")
        if ext == ".csv":
            df = pd.read_csv(f)
        else:
            # openpyxl only handles the newer formats; for legacy .xls let
            # pandas select the engine itself.
            df = pd.read_excel(f, engine=None if ext == ".xls" else "openpyxl")
        # snake_case the headers; str() guards against non-string column labels.
        df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
        df = df.dropna(how="all")  # drop rows that are entirely empty
        out = TEXT / f"{f.stem.lower().replace(' ', '_')}.csv"
        # Write the CLEANED frame. Previously the untouched source file was
        # copied here, so the renamed columns and dropped rows were lost and
        # Excel workbooks ended up as binary data under a .csv name.
        # Going through `sudo tee` keeps the elevated write used elsewhere in
        # this notebook; tee's stdout echo is discarded.
        subprocess.run(
            ["sudo", "tee", str(out)],
            input=df.to_csv(index=False).encode("utf-8"),
            stdout=subprocess.DEVNULL,
            check=True,
        )
        print(f"   Wrote {out.name} ({len(df):,} rows)")

    elif ext == ".json":
        dest = TEXT / f.name
        shutil.copy2(f, dest)
        print(f"   Copied {f.name}")


→ Loading Symptom2Disease.csv
   Wrote symptom2disease.csv (1,200 rows)
→ Loading FoodData.csv
   Wrote fooddata.csv (184 rows)
