In [None]:
%%writefile readme.md

# MedSynth – Privacy-Preserving Synthetic EHR Generator

A privacy-aware synthetic Electronic Health Record (EHR) generator with built-in **Membership Inference Defense Metrics**.

Built for healthcare AI teams, researchers, and privacy engineers who need realistic patient data — without exposing real patient records.

---

## Features

 **Generate Synthetic EHR Data**
- Realistic fields: age, diagnosis, medication, readmission, visit duration
- Values are sampled at random for each field; when you upload a real dataset, numeric ranges and category values are taken from its columns

 **Privacy Defense Metrics**
- Detects **Membership Inference Risk**
- Uses a shadow classifier to test if synthetic data leaks real patient info
- Flags high-risk generation patterns

**Data Utility Preservation**
- Measures RMSE between real and synthetic distributions
- Ensures synthetic data remains useful for downstream ML

 **Exportable Outputs**
- Download as CSV or JSON
- Ready for use

Writing readme.md


In [None]:
%%writefile requirements.txt
pandas
numpy
matplotlib
scikit-learn
streamlit
fpdf
transformers
# Needed by app.py: the Gradio UI and the Excel engines passed to pd.read_excel.
gradio
openpyxl
xlrd

Overwriting requirements.txt


In [None]:
!pip install -r requirements.txt

In [None]:
# Mount Google Drive under /content/drive (Colab only); a later cell copies the
# project files into a folder on the mounted drive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
%%writefile ehr_schema.py

import pandas as pd
import numpy as np


# Schema of the toy EHR table. Each entry maps a column name to a generator
# that takes the number of rows ``n`` and returns a pandas Series of that
# length. ``n`` defaults to 100 so the previous zero-argument call style
# (``ehr_fields["age"]()``) keeps working.
# All values are drawn uniformly at random from numpy's global RNG; they are
# placeholders, not samples from any clinical distribution.

ehr_fields = {
    # Sequential identifiers starting at 10000.
    "patient_id": lambda n=100: pd.Series(range(10000, 10000 + n)),
    # Uniform ages in [18, 90], rounded to whole years.
    "age": lambda n=100: pd.Series(np.rint(np.random.uniform(18, 90, size=n)).astype(int)),
    "gender": lambda n=100: pd.Series(np.random.choice(["M", "F", "O"], size=n)),
    "race": lambda n=100: pd.Series(np.random.choice(["White", "Black", "Asian", "Hispanic"], size=n)),
    "ethnicity": lambda n=100: pd.Series(np.random.choice(["Non-Hispanic", "Hispanic"], size=n)),
    "diagnosis": lambda n=100: pd.Series(np.random.choice([
        "Hypertension", "Diabetes", "Asthma", "Heart Failure", "Obesity"
    ], size=n)),
    "medication": lambda n=100: pd.Series(np.random.choice([
        "Metformin", "Lisinopril", "Albuterol", "Insulin", "Atorvastatin"
    ], size=n)),
    # randint's upper bound is exclusive, so values fall in [5, 179].
    "visit_duration": lambda n=100: pd.Series(np.random.randint(5, 180, size=n)),
    "readmitted": lambda n=100: pd.Series(np.random.choice(["Yes", "No"], size=n)),
}


def generate_synthetic_ehr(num_records=100):
    """Generate a DataFrame of randomly filled EHR rows following ``ehr_fields``.

    Args:
        num_records: Number of rows to generate. Any non-negative size is
            honoured; previously the result was silently capped at 100 rows
            because every generator produced exactly 100 values.

    Returns:
        pandas.DataFrame with one column per key of ``ehr_fields`` and
        ``num_records`` rows.

    Raises:
        ValueError: If ``num_records`` is negative.
    """
    if num_records < 0:
        raise ValueError("num_records must be non-negative")

    # Each generator is asked for exactly the requested number of rows.
    return pd.DataFrame(
        {field: generator(num_records) for field, generator in ehr_fields.items()}
    )

Writing ehr_schema.py


In [None]:
from ehr_schema import generate_synthetic_ehr
import numpy as np

# Generate a small demo table of randomly filled patient records and save it as CSV.
# ehr_schema draws from numpy's global RNG, so seed it here: Restart & Run All
# then reproduces the same file and the same preview printed below.
np.random.seed(42)

ehr_data = generate_synthetic_ehr(num_records=100)
ehr_data.to_csv("synthetic_ehr.csv", index=False)

print("Synthetic EHR generated.")
print(ehr_data.head())

Synthetic EHR generated.
   patient_id  age gender      race ethnicity      diagnosis    medication  \
0       10000   27      O  Hispanic  Hispanic   Hypertension  Atorvastatin   
1       10001   62      M     White  Hispanic         Asthma     Albuterol   
2       10002   74      O     White  Hispanic        Obesity  Atorvastatin   
3       10003   76      F     Asian  Hispanic         Asthma     Albuterol   
4       10004   42      M     White  Hispanic  Heart Failure     Metformin   

   visit_duration readmitted  
0              15         No  
1              98         No  
2             153         No  
3             125        Yes  
4             108        Yes  


In [None]:
%%writefile privacy_metrics.py

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

def check_membership_inference_risk(real_data, synthetic_data, target_col="diagnosis"):
    """
    Score leakage risk by training a classifier to tell real rows from synthetic rows.

    A random half of ``real_data`` (label 1) is pooled with all of
    ``synthetic_data`` (label 0). A random forest is fit on 70% of the pool and
    its accuracy is measured on the remaining 30%.

    NOTE(review): the label is "row is real", so this measures how
    distinguishable synthetic rows are from real ones. That is only a proxy for
    membership inference — confirm it is the intended metric.

    Args:
        real_data: DataFrame of real records. Not modified.
        synthetic_data: DataFrame of synthetic records. Not modified.
        target_col: Column excluded from the features (ignored if absent).

    Returns:
        bool: True when held-out accuracy is below 0.6 (treated as low risk),
        False otherwise. The accuracy is also printed.
    """
    # This module has no top-level pandas import; bring it in here so pd.concat
    # and pd.get_dummies do not fail with NameError.
    import pandas as pd

    # Label copies instead of writing a 'membership' column into the caller's frames.
    real_labeled = real_data.assign(membership=1)
    synthetic_labeled = synthetic_data.assign(membership=0)

    # Fixed seeds make the reported accuracy reproducible between runs.
    combined = pd.concat(
        [real_labeled.sample(frac=0.5, random_state=42), synthetic_labeled],
        ignore_index=True,
    )

    X = combined.drop(columns=['membership', target_col], errors='ignore')
    y = combined['membership']

    # One-hot encode any non-numeric feature columns.
    X = pd.get_dummies(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    model = RandomForestClassifier(n_estimators=50, random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)

    print(f"Membership Inference Accuracy: {acc:.2f}")
    # Below 60% accuracy the classifier is close to guessing.
    return bool(acc < 0.6)

Writing privacy_metrics.py


In [None]:
%%writefile app.py

import gradio as gr
import pandas as pd
import numpy as np
import os
from io import BytesIO

# Dynamically generate synthetic data to match ANY input schema
def generate_synthetic_like(real_df, num_records=100):
    """Generate random data with the same columns as ``real_df``.

    Each column is synthesised independently from its non-null values:

    * no non-null values  -> the constant string "Unknown"
    * boolean             -> values resampled from the observed booleans
    * integer             -> uniform integers in [min, max]
    * other numeric       -> uniform floats in [min, max)
    * anything else       -> observed values, converted to str, resampled
                             with replacement

    Args:
        real_df: Source DataFrame whose columns are imitated.
        num_records: Number of synthetic rows to produce.

    Returns:
        pandas.DataFrame with ``num_records`` rows and the columns of
        ``real_df`` in the same order.
    """
    generated = {}

    for col in real_df.columns:
        observed = real_df[col].dropna()

        if observed.empty:
            generated[col] = ["Unknown"] * num_records
        elif pd.api.types.is_bool_dtype(observed):
            # Booleans also count as "numeric" in pandas; handle them first so
            # they stay boolean instead of becoming uniform floats in [0, 1).
            generated[col] = np.random.choice(
                observed.astype(bool).to_numpy(), size=num_records
            )
        elif pd.api.types.is_integer_dtype(observed):
            low, high = int(observed.min()), int(observed.max())
            # randint's upper bound is exclusive, hence high + 1.
            generated[col] = np.random.randint(low, high + 1, size=num_records)
        elif pd.api.types.is_numeric_dtype(observed):
            low, high = float(observed.min()), float(observed.max())
            generated[col] = np.random.uniform(low, high, size=num_records)
        else:
            # Categorical or string: resample the observed values, which
            # preserves their relative frequencies.
            values = observed.astype(str).tolist()
            generated[col] = np.random.choice(values, size=num_records)

    # Build the frame once rather than inserting one column at a time.
    return pd.DataFrame(generated)

# Membership Inference Risk Detection
def _encode_features(features, max_categories=10):
    """Return an all-numeric, NaN-free design matrix for ``features``.

    Numeric columns are cast to float and missing values replaced by the
    column median. Every other column is converted to str, limited to its
    ``max_categories`` most frequent values (the rest become "__other__"),
    and one-hot encoded.
    """
    prepared = {}
    for col in features.columns:
        series = features[col]
        if pd.api.types.is_numeric_dtype(series):
            numeric = series.astype(float)
            prepared[col] = numeric.fillna(numeric.median())
        else:
            text = series.astype(str)
            # value_counts() sorts by frequency, so the head holds the most common levels.
            keep = text.value_counts().index[:max_categories]
            prepared[col] = text.where(text.isin(keep), "__other__")

    encoded = pd.get_dummies(pd.DataFrame(prepared, index=features.index))
    # A numeric column with no values at all has no median; zero-fill what is left.
    return encoded.fillna(0)


def check_membership_inference_risk(real_data, synthetic_data, target_col=None):
    """Score leakage risk by training a classifier to tell real rows from synthetic rows.

    A seeded random half of ``real_data`` (label 1) is pooled with all of
    ``synthetic_data`` (label 0). A random forest is fit on 70% of the pool and
    scored on the remaining 30%.

    NOTE(review): the label is "row is real", so this measures how
    distinguishable synthetic rows are from real ones. That is only a proxy for
    membership inference — confirm it is the intended metric.

    Args:
        real_data: DataFrame of real records. Not modified.
        synthetic_data: DataFrame of synthetic records. Not modified.
        target_col: Column excluded from the features. When None, the first
            column that is not named id/patient_id/record_id and has at least
            one non-null value is used, falling back to the first column.

    Returns:
        bool: True when held-out accuracy is below 0.6 (treated as safe).
        False otherwise, including when there are no usable features or when
        any error occurs (the error is printed and the result defaults to unsafe).
    """
    try:
        if target_col is None:
            id_like = {'id', 'patient_id', 'record_id'}
            for col in real_data.columns:
                # str() so non-string column labels (e.g. integers) do not raise.
                if str(col).lower() not in id_like and real_data[col].notna().any():
                    target_col = col
                    break
            if target_col is None:
                target_col = real_data.columns[0]

        # assign() returns new frames, so the caller's data is left untouched.
        real_labeled = real_data.assign(membership=1)
        synthetic_labeled = synthetic_data.assign(membership=0)

        # Combine half real + all synthetic
        combined = pd.concat([
            real_labeled.sample(frac=0.5, random_state=42),
            synthetic_labeled
        ], ignore_index=True)

        # Drop target and membership
        features = combined.drop(columns=[target_col, 'membership'], errors='ignore')
        y = combined['membership']

        if features.shape[1] == 0:
            return False  # Not enough features

        # pd.get_dummies has no max_categories option (the previous call raised
        # TypeError on every run), so the cardinality cap is applied by hand.
        X = _encode_features(features, max_categories=10)

        if X.empty or len(X.columns) == 0:
            return False  # Not enough features

        # Imported here, as before, so importing this module does not require scikit-learn.
        from sklearn.model_selection import train_test_split
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.metrics import accuracy_score

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        model = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
        model.fit(X_train, y_train)
        acc = accuracy_score(y_test, model.predict(X_test))

        print(f"Membership Inference Attack Accuracy: {acc:.3f}")
        return bool(acc < 0.6)  # Safe if below 60%

    except Exception as e:
        # Deliberate best effort: the UI must always get an answer, and the
        # conservative answer on failure is "unsafe".
        print(f"Privacy check error: {e}")
        return False  # Default to unsafe on error

# Universal file loader with auto-detection
def load_data(file_path):
    """Load a tabular file into a DataFrame, detecting Excel vs. CSV from its content.

    The first bytes of the file decide the reader: the OLE2 signature selects
    legacy .xls (xlrd), the ZIP signature selects .xlsx (openpyxl), and
    anything else is read as CSV.

    CSV text is decoded as UTF-8 first, then Windows-1252, then Latin-1.
    Windows-1252 has to come before Latin-1 because Latin-1 accepts every byte
    sequence and would otherwise mask it (turning e.g. "€" into a control
    character).

    Args:
        file_path: Path of the file to read.

    Returns:
        pandas.DataFrame with the file contents.

    Raises:
        ValueError: If no encoding can decode the file. CSV problems that are
            not encoding-related (empty file, malformed rows) propagate as
            pandas' own errors, which are ValueError subclasses, instead of
            being reported as a decoding failure.
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'rb') as f:
        header = f.read(16)

    # Detect Excel files by magic bytes
    if header.startswith(b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'):  # Older .xls
        return pd.read_excel(file_path, engine='xlrd')
    if header.startswith(b'PK\x03\x04'):  # .xlsx or .zip-based
        return pd.read_excel(file_path, engine='openpyxl')

    # Try CSV with fallback encodings, strictest first.
    encodings = ['utf-8', 'cp1252', 'latin1']
    last_error = None
    for enc in encodings:
        try:
            return pd.read_csv(file_path, encoding=enc)
        except UnicodeDecodeError as err:
            # Only a decoding failure justifies retrying with another encoding.
            last_error = err
    raise ValueError(f"Could not decode file with any encoding: {encodings}") from last_error

# Main processing function
def process_ehr(file):
    """Handle one upload: load it, synthesise data, score privacy risk, export files.

    Args:
        file: The value Gradio passes for the upload component — either an
            object exposing the path as ``.name`` or a plain path string
            (NOTE(review): which one depends on the Gradio version/config;
            both are accepted). None when nothing was uploaded.

    Returns:
        tuple ``(synthetic_df, status_message, json_path, csv_path)``. On any
        failure the DataFrame is empty, the message describes the problem and
        both paths are None; exceptions are never propagated to the UI.
    """
    try:
        if file is None:
            return pd.DataFrame(), "Please upload a file.", None, None

        # Load data (auto-detect format & encoding)
        file_path = getattr(file, "name", file)
        real_df = load_data(file_path)

        if real_df.empty:
            return pd.DataFrame(), "Uploaded file is empty.", None, None

        # Generate synthetic data with same schema
        synthetic_df = generate_synthetic_like(real_df, num_records=100)

        # Check privacy risk
        is_safe = check_membership_inference_risk(real_df, synthetic_df)
        risk_msg = " Synthetic EHR shows low membership inference risk." if is_safe else "⚠️ High risk: Synthetic data may leak sensitive info."

        # Save outputs in a fresh directory per request. With fixed names in the
        # working directory, two users of the shared app could overwrite, and
        # then download, each other's files.
        # NOTE(review): these directories are not cleaned up here.
        import tempfile
        out_dir = tempfile.mkdtemp(prefix="medsynth_")
        csv_path = os.path.join(out_dir, "synthetic_ehr.csv")
        json_path = os.path.join(out_dir, "synthetic_ehr.json")
        synthetic_df.to_csv(csv_path, index=False)
        synthetic_df.to_json(json_path, orient="records", indent=2)

        return synthetic_df, risk_msg, json_path, csv_path

    except Exception as e:
        # The UI callback must not raise; surface the error as the status text.
        return pd.DataFrame(), f" Error processing file: {str(e)}", None, None

# Gradio Interface
# Layout, top to bottom: intro text, upload + trigger button, synthetic data
# preview, privacy status, and the two download slots.
with gr.Blocks(theme=gr.themes.Soft(), title="MedSynth – Universal Synthetic EHR") as demo:
    gr.Markdown("""
    #  MedSynth  Universal Privacy-Preserving Synthetic Data Generator
    Upload **any tabular EHR or patient dataset** (CSV, Excel) — we'll generate realistic synthetic data and check for privacy leaks.
    """)

    # Input row: the file picker is restricted to the extensions listed in file_types.
    with gr.Row():
        upload = gr.File(label=" Upload Real EHR (CSV or Excel)", file_types=[".csv", ".xls", ".xlsx"])
        btn = gr.Button(" Generate Synthetic Data", variant="primary")

    # Read-only table showing the generated rows.
    with gr.Row():
        output_df = gr.Dataframe(label=" Synthetic Data Sample", interactive=False)

    # Status message produced by the privacy check (or an error message).
    with gr.Row():
        risk_text = gr.Textbox(label=" Privacy Risk Status")

    # Download slots for the exported files.
    with gr.Row():
        json_out = gr.File(label=" Download JSON")
        csv_out = gr.File(label=" Download CSV")

    # Event handler: the order of `outputs` must match the 4-tuple returned by
    # process_ehr (dataframe, status text, JSON path, CSV path).
    btn.click(
        fn=process_ehr,
        inputs=[upload],
        outputs=[output_df, risk_text, json_out, csv_out]
    )

# Launch only when executed as a script, not when imported.
# NOTE(review): share=True publishes the app on a public gradio.live URL; confirm
# that is acceptable for an app that accepts uploads of real patient data.
if __name__ == "__main__":
    demo.launch(share=True)

Overwriting app.py


In [None]:
import shutil
import os

# Back up the generated CSV and the project modules to the mounted Google Drive.
backup_dir = "/content/drive/MyDrive/MedSynth_Project"
os.makedirs(backup_dir, exist_ok=True)

for filename in ("synthetic_ehr.csv", "ehr_schema.py", "privacy_metrics.py", "app.py"):
    # Each file keeps its name inside the backup folder.
    shutil.copy(filename, os.path.join(backup_dir, filename))

print("All things are safe")

All things are safe


In [None]:
# Start the Gradio app as a script. The server keeps running until the cell is
# interrupted.
!python app.py

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://5883f179c132eb19ea.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
  return pd.read_csv(file_path, encoding=enc)
Privacy check error: get_dummies() got an unexpected keyword argument 'max_categories'
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://5883f179c132eb19ea.gradio.live
