In [20]:
# =====================================================
# 📌 Milestone 1: Data Collection & Preprocessing
# Project: AI-Powered Enhanced EHR Imaging & Documentation System
# =====================================================

# -------------------------------
# Step 0: Install Required Libraries
# -------------------------------
!pip install -q kaggle pandas numpy pillow opencv-python pdfplumber pydicom


In [21]:
# -------------------------------
# Step 1: Upload Kaggle API Key
# -------------------------------
from google.colab import files
uploaded = files.upload()  # Upload your kaggle.json here

# Move it to correct folder
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [65]:
# -------------------------------
# Step 2: Create Project Folders
# -------------------------------
import os
folders = [
    "data/raw/images",
    "data/raw/notes",
    "data/images_processed",
    "data/ehr_notes_processed",
    "data"
]
for f in folders:
    os.makedirs(f, exist_ok=True)

In [66]:
# -------------------------------
# Step 3: Download Datasets from Kaggl
# -------------------------------

# 3a) COVID-19 Radiography Database (X-ray)
!kaggle datasets download -d tawsifurrahman/covid19-radiography-database -p data/raw/images/
!unzip -q data/raw/images/covid19-radiography-database.zip -d data/raw/images/
!rm data/raw/images/covid19-radiography-database.zip

# 3b) Medical Transcriptions (EHR notes)
!kaggle datasets download -d tboyle10/medicaltranscriptions -p data/raw/notes/
!unzip -q data/raw/notes/medicaltranscriptions.zip -d data/raw/notes/
!rm data/raw/notes/medicaltranscriptions.zip

# Verify downloads
print("Sample images:", os.listdir("data/raw/images/")[:5])
print("Sample EHR notes:", os.listdir("data/raw/notes/")[:5])

Dataset URL: https://www.kaggle.com/datasets/tawsifurrahman/covid19-radiography-database
License(s): copyright-authors
User cancelled operation
unzip:  cannot find or open data/raw/images/covid19-radiography-database.zip, data/raw/images/covid19-radiography-database.zip.zip or data/raw/images/covid19-radiography-database.zip.ZIP.
rm: cannot remove 'data/raw/images/covid19-radiography-database.zip': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/tboyle10/medicaltranscriptions
License(s): CC0-1.0
Downloading medicaltranscriptions.zip to data/raw/notes
  0% 0.00/4.85M [00:00<?, ?B/s]
100% 4.85M/4.85M [00:00<00:00, 687MB/s]
replace data/raw/notes/mtsamples.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: Sample images: ['COVID-19_Radiography_Dataset']
Sample EHR notes: ['mtsamples.csv']


In [67]:
# -------------------------------
# Step 4: Process Medical Images
# -------------------------------
from PIL import Image
import pydicom
import glob
import os

SRC_IMG = "data/raw/images/"
DST_IMG = "data/images_processed/"
TARGET = (256, 256)

os.makedirs(DST_IMG, exist_ok=True)

# Get all image files recursively
images = glob.glob(SRC_IMG + "**/*.png", recursive=True)
images += glob.glob(SRC_IMG + "**/*.jpg", recursive=True)
images += glob.glob(SRC_IMG + "**/*.jpeg", recursive=True)
images += glob.glob(SRC_IMG + "**/*.dcm", recursive=True)

print("Total images found:", len(images))

# Process and save
for path in images:
    try:
        fn = os.path.basename(path)
        if fn.lower().endswith('.dcm'):
            import pydicom
            ds = pydicom.dcmread(path)
            arr = ds.pixel_array.astype(float)
            arr = 255*(arr - arr.min())/(arr.max() - arr.min())
            img = Image.fromarray(arr.astype('uint8')).convert('L')
        else:
            img = Image.open(path).convert('L')
        img = img.resize(TARGET)
        outname = os.path.splitext(fn)[0] + '.png'
        img.save(os.path.join(DST_IMG, outname))
    except Exception as e:
        print("Skipping", path, "due to", e)

print("Processed images saved to:", DST_IMG)
print("Total processed images:", len(os.listdir(DST_IMG)))

Total images found: 42330
Processed images saved to: data/images_processed/
Total processed images: 21165


In [68]:
# -------------------------------
# Step 5: Extract & Clean EHR Notes
# -------------------------------
import pandas as pd
import os

SRC_NOTES = "data/raw/notes/mtsamples.csv"
DST_NOTES = "data/ehr_notes_processed/"
os.makedirs(DST_NOTES, exist_ok=True)

# Read CSV
df = pd.read_csv(SRC_NOTES)
print("Total records in CSV:", len(df))
print("Columns:", df.columns)

# Use 'transcription' column (or the relevant one) to create text files
for i, row in df.iterrows():
    text = str(row['transcription'])  # adjust column name if different
    pid = f"{i+1:04d}"
    outfn = os.path.join(DST_NOTES, f"note_{pid}.txt")
    with open(outfn, 'w', encoding='utf-8') as f:
        f.write(text)

print("Processed EHR notes saved to:", DST_NOTES)
print("Total processed notes:", len(os.listdir(DST_NOTES)))


Total records in CSV: 4999
Columns: Index(['Unnamed: 0', 'description', 'medical_specialty', 'sample_name',
       'transcription', 'keywords'],
      dtype='object')
Processed EHR notes saved to: data/ehr_notes_processed/
Total processed notes: 4999


In [70]:
# -------------------------------
# Step 6: Create Mapping CSV
# -------------------------------
import glob
import pandas as pd
import os

DST_IMG = 'data/images_processed/'
DST_NOTES = 'data/ehr_notes_processed/'
os.makedirs('data', exist_ok=True)

# Get all processed images and notes
images = sorted(glob.glob(DST_IMG + '*.png'))
notes = sorted(glob.glob(DST_NOTES + '*.txt'))

print("Total images:", len(images))
print("Total notes:", len(notes))

if len(images) == 0 or len(notes) == 0:
    print("No images or notes found. CSV will be empty.")
else:
    rows = []
    for i, img in enumerate(images[:len(notes)]):  # truncate to smaller count
        pid = f"{i+1:04d}"
        rows.append({
            'file_id': pid,
            'image_path': img,
            'note_path': notes[i],
            'diagnosis':'',
            'icd10':''
        })

    mapping_df = pd.DataFrame(rows)
    mapping_df.to_csv('data/mapping.csv', index=False)
    print("mapping.csv created with", len(mapping_df), "records")


Total images: 21165
Total notes: 4999
mapping.csv created with 4999 records


In [71]:
# -------------------------------
# Step 7: ICD-10 Lookup Table
# -------------------------------
import pandas as pd

# ICD-10 lookup table
lookup_data = {
    "condition_keyword": ["pneumonia", "hypertension", "diabetes"],
    "icd10_code": ["J18.9", "I10", "E11.9"],
    "icd10_description": [
        "Pneumonia, unspecified",
        "Essential (primary) hypertension",
        "Type 2 diabetes mellitus without complications"
    ]
}
lookup_df = pd.DataFrame(lookup_data)
lookup_df.to_csv('data/icd_lookup.csv', index=False)
print("ICD lookup table saved.")

# Function to suggest ICD codes from note text
def suggest_icd(note_text):
    """
    Returns all matching ICD-10 codes for a given note text.
    If no match, returns 'UNKNOWN'.
    """
    t = note_text.lower()
    codes = []
    for _, r in lookup_df.iterrows():
        if r['condition_keyword'].lower() in t:
            codes.append(r['icd10_code'])
    return ", ".join(codes) if codes else 'UNKNOWN'

# Example test
sample_note = "Patient has hypertension and diabetes."
predicted_icd = suggest_icd(sample_note)
print("Sample Note:", sample_note)
print("Predicted ICD-10:", predicted_icd)


ICD lookup table saved.
Sample Note: Patient has hypertension and diabetes.
Predicted ICD-10: I10, E11.9


In [72]:
# -------------------------------
# Step 8: Sanity Checks & ICD Auto-Fill (Fixed)
# -------------------------------
import pandas as pd
import os
import re

mapping_path = 'data/mapping.csv'

# Check if mapping.csv exists and is not empty
if not os.path.exists(mapping_path) or os.path.getsize(mapping_path) == 0:
    print("mapping.csv is missing or empty. Please run previous steps.")
else:
    m = pd.read_csv(mapping_path)
    print("Total records in mapping.csv:", len(m))

    # -------------------------------
    # Check for missing image files
    # -------------------------------
    missing_images = m[~m['image_path'].apply(os.path.exists)]
    print("Missing images:", len(missing_images))
    if len(missing_images) > 0:
        print(missing_images[['file_id','image_path']].head())

    # -------------------------------
    # Check for missing note files
    # -------------------------------
    missing_notes = m[~m['note_path'].apply(os.path.exists)]
    print("Missing notes:", len(missing_notes))
    if len(missing_notes) > 0:
        print(missing_notes[['file_id','note_path']].head())

    # -------------------------------
    # Ensure icd10 column is string type
    # -------------------------------
    m['icd10'] = m['icd10'].astype(str)

    # -------------------------------
    # Fill NaN or UNKNOWN ICD codes using lookup table
    # -------------------------------
    def suggest_icd(note_text):
        t = note_text.lower()
        codes = []
        for _, r in lookup_df.iterrows():
            if r['condition_keyword'].lower() in t:
                codes.append(r['icd10_code'])
        return ", ".join(codes) if codes else 'UNKNOWN'

    for i, row in m.iterrows():
        icd_value = row['icd10'].strip()
        if icd_value == '' or icd_value.upper() in ['NAN','UNKNOWN']:
            try:
                with open(row['note_path'], 'r', encoding='utf-8') as f:
                    text = f.read()
                m.at[i, 'icd10'] = suggest_icd(text)
            except:
                m.at[i, 'icd10'] = 'UNKNOWN'

    # -------------------------------
    # Check ICD-10 code format
    # -------------------------------
    icd_pattern = re.compile(r'^[A-Z][0-9]{2}(?:\.[0-9A-Za-z]{1,4})?$')

    def is_bad_icd(x):
        x_str = str(x).strip()
        if x_str == '' or x_str.upper() == 'UNKNOWN':
            return False
        return not bool(icd_pattern.match(x_str))

    bad_icd = m[m['icd10'].apply(is_bad_icd)]
    print("Bad ICD codes:", len(bad_icd))
    if len(bad_icd) > 0:
        print(bad_icd[['file_id','icd10']].head())

    # -------------------------------
    # Save updated mapping CSV
    # -------------------------------
    m.to_csv(mapping_path, index=False)
    print("Sanity checks completed and mapping.csv updated with ICD codes.")


Total records in mapping.csv: 4999
Missing images: 0
Missing notes: 0
Bad ICD codes: 388
    file_id       icd10
1         2  I10, E11.9
2         3  I10, E11.9
13       14  I10, E11.9
19       20  I10, E11.9
67       68  I10, E11.9
Sanity checks completed and mapping.csv updated with ICD codes.


In [73]:

# -------------------------------
# ✅ Milestone 1 Completed
# -------------------------------
print("""
Milestone 1 Complete ✅:
- Datasets downloaded and cleaned
- Images processed (PNG, 256x256)
- EHR notes extracted (TXT)
- Mapping CSV created
- ICD lookup table ready
- Sanity checks done
""")


Milestone 1 Complete ✅:
- Datasets downloaded and cleaned
- Images processed (PNG, 256x256)
- EHR notes extracted (TXT)
- Mapping CSV created
- ICD lookup table ready
- Sanity checks done

