In [31]:
# Install SDV with multi-table support
!pip install sdv
!pip install sdv[multi-table]

import pandas as pd
import numpy as np
from faker import Faker
import random
from sdv.metadata import MultiTableMetadata
from sdv.multi_table import HMASynthesizer 





[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\sneha\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\sneha\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [32]:



# Initialize Faker and seed every random source used by the table builders.
# Faker keeps its own generator, so seeding NumPy alone leaves the
# fake.catch_phrase() descriptions different on every run.
fake = Faker()
Faker.seed(42)
np.random.seed(42)

# Row counts and shared category lists for the fabricated source tables.
n_patients, n_providers, n_procedures, n_claims = 100, 20, 50, 500
cities = ['Newark', 'Jersey City', 'Clifton', 'Englewood']
specialties = ['Cardiology', 'Oncology', 'General Practice', 'Pediatrics']
diagnoses = ['D001', 'D002', 'D003', 'D004', 'D005']

# NOTE: every np.random call below is issued in the same order as the columns
# appear, so the globally seeded NumPy stream is consumed deterministically.

# Patients: ids P0..P99 with randomly drawn gender, age (18-89) and city.
patients = pd.DataFrame(dict(
    patient_id=[f'P{n}' for n in range(n_patients)],
    gender=np.random.choice(['M', 'F'], size=n_patients),
    age=np.random.randint(18, 90, size=n_patients),
    location=np.random.choice(cities, size=n_patients),
))

# Providers: ids D0..D19 with a specialty, a city and 1-39 years of experience.
providers = pd.DataFrame(dict(
    provider_id=[f'D{n}' for n in range(n_providers)],
    specialty=np.random.choice(specialties, size=n_providers),
    location=np.random.choice(cities, size=n_providers),
    years_of_experience=np.random.randint(1, 40, size=n_providers),
))

# Procedures: codes PC0..PC49, a Faker catch phrase as free-text description,
# and a typical cost rounded to cents.
procedure_descriptions = [fake.catch_phrase() for _ in range(n_procedures)]
procedure_costs = np.round(np.random.uniform(100, 10000, size=n_procedures), 2)
procedures = pd.DataFrame({
    'procedure_code': [f'PC{n}' for n in range(n_procedures)],
    'description': procedure_descriptions,
    'typical_cost': procedure_costs,
})

# Claims: ids C0..C499; each claim references an existing patient, provider
# and procedure by sampling from the parent tables' key columns.
claim_patient_ids = np.random.choice(patients['patient_id'], size=n_claims)
claim_provider_ids = np.random.choice(providers['provider_id'], size=n_claims)
claim_procedure_codes = np.random.choice(procedures['procedure_code'], size=n_claims)
claim_diagnosis_codes = np.random.choice(diagnoses, size=n_claims)
claim_amounts = np.round(np.random.uniform(100, 15000, size=n_claims), 2)
# Day offsets 0..364 added to 2024-01-01 give the claim dates.
claim_day_offsets = np.random.randint(0, 365, size=n_claims)
claims = pd.DataFrame({
    'claim_id': [f'C{n}' for n in range(n_claims)],
    'patient_id': claim_patient_ids,
    'provider_id': claim_provider_ids,
    'procedure_code': claim_procedure_codes,
    'diagnosis_code': claim_diagnosis_codes,
    'claim_amount': claim_amounts,
    'claim_date': pd.to_datetime('2024-01-01') + pd.to_timedelta(claim_day_offsets, unit='D'),
})




# Bundle the four source tables under the table names used for the metadata
# and the synthesizer.
real_data = {
    'patients': patients,
    'providers': providers,
    'procedures': procedures,
    'claims': claims
}

# Cast every object-dtype column to str so all text columns hold a consistent
# type. The cast is applied in place on each table.
for table in real_data.values():
    object_columns = table.select_dtypes(include=['object']).columns
    for column in object_columns:
        table[column] = table[column].astype(str)

# Auto-detect the multi-table schema from the in-memory tables.
# NOTE(review): the recorded run emitted a deprecation warning for
# MultiTableMetadata recommending SDV's newer `Metadata` class.
metadata = MultiTableMetadata()
metadata.detect_from_dataframes(data=real_data)

# Declare each key column as an `id` with the regex its values must follow,
# so parent keys and the matching child column share one format.
id_formats = [
    ('procedures', 'procedure_code', r'PC\d+'),
    ('claims', 'procedure_code', r'PC\d+'),
    ('claims', 'claim_id', r'C\d+'),
    ('patients', 'patient_id', r'P\d+'),
    ('providers', 'provider_id', r'D\d+'),
]
for table_name, column_name, pattern in id_formats:
    metadata.update_column(table_name, column_name, sdtype='id', regex_format=pattern)

# Check the metadata itself first, then check the real tables against it.
metadata.validate()
metadata.validate_data(real_data)

# Fit an HMASynthesizer on the real tables, then draw one synthetic dataset
# using sample()'s default arguments.
synth = HMASynthesizer(metadata)
synth.fit(real_data)
synthetic_data = synth.sample()

# Inspect the procedure codes in the parent table and in the child table that
# references them.
for table_name in ('procedures', 'claims'):
    print(synthetic_data[table_name]['procedure_code'].head())


The 'MultiTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.

Preprocess Tables: 100%|██████████| 4/4 [00:00<00:00,  7.25it/s]



Learning relationships:


(1/3) Tables 'patients' and 'claims' ('patient_id'): 100%|██████████| 100/100 [00:07<00:00, 13.35it/s]
(2/3) Tables 'providers' and 'claims' ('provider_id'): 100%|██████████| 20/20 [00:01<00:00, 13.13it/s]
(3/3) Tables 'procedures' and 'claims' ('procedure_code'): 100%|██████████| 50/50 [00:04<00:00, 11.86it/s]





Modeling Tables: 100%|██████████| 4/4 [00:00<00:00,  9.07it/s]


0    PC34
1    PC06
2    PC11
3    PC39
4    PC21
Name: procedure_code, dtype: object
0    PC31
1    PC31
2    PC20
3    PC20
4    PC20
Name: procedure_code, dtype: object


In [33]:
# Preview the first five rows of the synthetic patients table.
synthetic_data['patients'].head(n=5)

Unnamed: 0,patient_id,gender,age,location
0,P06,F,37,Englewood
1,P85,M,77,Newark
2,P23,F,49,Newark
3,P4,M,36,Jersey City
4,P71,M,56,Jersey City


In [34]:
# Preview the first five rows of the synthetic providers table.
synthetic_data['providers'].head(n=5)

Unnamed: 0,provider_id,specialty,location,years_of_experience
0,D2,Oncology,Englewood,20
1,D00,Cardiology,Englewood,31
2,D0,General Practice,Englewood,15
3,D9,Cardiology,Clifton,24
4,D03,Oncology,Jersey City,30


In [35]:
# Preview the first five rows of the synthetic procedures table.
synthetic_data['procedures'].head(n=5)

Unnamed: 0,procedure_code,description,typical_cost
0,PC34,Function-based maximized leverage,7548.75
1,PC06,Self-enabling leadingedge neural-net,8409.8
2,PC11,Multi-channeled 3rdgeneration framework,5376.84
3,PC39,Stand-alone zero-defect approach,9736.63
4,PC21,Progressive 24/7 moderator,7524.22


In [36]:
# Preview the first five rows of the synthetic claims table.
synthetic_data['claims'].head(n=5)

Unnamed: 0,claim_id,patient_id,provider_id,procedure_code,diagnosis_code,claim_amount,claim_date
0,C2,P06,D04,PC31,D002,147.95,2024-08-10
1,C4,P06,D07,PC31,D002,147.95,2024-10-07
2,C0,P06,D3,PC20,D002,147.95,2024-06-15
3,C1,P06,D2,PC20,D002,147.95,2024-07-23
4,C5,P06,D04,PC20,D002,147.95,2024-10-07


In [38]:
# Export every table in synthetic_data to "synthetic_<table name>.csv"
# (row index omitted). This single loop already covers all tables.
# The loop variables are named neutrally so they cannot rebind the
# `procedures` DataFrame defined earlier in the notebook to a string.
for table_name, table_df in synthetic_data.items():
    table_df.to_csv(f"synthetic_{table_name}.csv", index=False)


In [39]:
# Export every table in synthetic_data to "synthetic_<table name>.csv"
# (row index omitted). This single loop already covers all tables.
# The loop variables are named neutrally so they cannot rebind the
# `providers` DataFrame defined earlier in the notebook to a string.
for table_name, table_df in synthetic_data.items():
    table_df.to_csv(f"synthetic_{table_name}.csv", index=False)


In [40]:
# Export every table in synthetic_data to "synthetic_<table name>.csv"
# (row index omitted). This single loop already covers all tables.
# The loop variables are named neutrally so they cannot rebind the
# `claims` DataFrame defined earlier in the notebook to a string.
for table_name, table_df in synthetic_data.items():
    table_df.to_csv(f"synthetic_{table_name}.csv", index=False)


In [41]:
# Export every table in synthetic_data to "synthetic_<table name>.csv"
# (row index omitted). This single loop already covers all tables.
# The loop variables are named neutrally so they cannot rebind the
# `patients` DataFrame defined earlier in the notebook to a string.
for table_name, table_df in synthetic_data.items():
    table_df.to_csv(f"synthetic_{table_name}.csv", index=False)
