### Omics

In [13]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def generate_simplified_synthetic_data(n_patients=20, n_codes=10, n_proteins=50, reference_date=None):
    """Generate a small synthetic mother/child cohort and write it to CSV files.

    Builds five EHR tables (conditions, drugs, procedures, observations,
    measurements) with 1-4 rows per mother, and a proteomics table with three
    timepoints (T1, T2, T3) per mother. All tables share the same
    mother/child ID pairs. EHR event dates fall 0-279 days before the child's
    birth date; proteomics sample dates fall 5-279 days before it.

    Args:
        n_patients: Number of mother/child pairs.
        n_codes: Concept IDs are drawn uniformly from 1..n_codes.
        n_proteins: Number of ``Protein_<k>`` columns per proteomics sample.
        reference_date: ``datetime`` that birth dates are counted back from.
            Defaults to ``datetime.now()``; pass a fixed value to make the
            written files identical across runs.

    Returns:
        None. Writes eight CSV files below the current working directory
        (creating ``./raw_data/EHR`` and ``./processed_data`` when missing)
        and prints the row count of each table.
    """
    import os  # local import: the cell's import list does not include os

    np.random.seed(42)

    # Resolve "now" once so every patient is anchored to the same instant and
    # callers can pin it for reproducible output.
    if reference_date is None:
        reference_date = datetime.now()

    # Generate patient IDs
    mom_person_ids = np.arange(1000000, 1000000 + n_patients)
    child_person_ids = np.arange(10000000, 10000000 + n_patients)

    # Generate birth dates for children
    child_birth_dates = [
        reference_date - timedelta(days=int(np.random.randint(0, 280)))
        for _ in range(n_patients)
    ]

    def generate_ehr_data(concept_prefix, datetime_prefix):
        """Return one EHR table with 1-4 randomly dated, randomly coded rows per mother."""
        data = []
        for mom_id, child_id, child_birth_date in zip(mom_person_ids, child_person_ids, child_birth_dates):
            for _ in range(np.random.randint(1, 5)):  # 1-4 records per patient
                date = child_birth_date - timedelta(days=int(np.random.randint(0, 280)))
                code = np.random.randint(1, n_codes + 1)

                data.append({
                    'mom_person_id': int(mom_id),
                    'child_person_id': int(child_id),
                    f'{concept_prefix}_concept_id': int(code),
                    f'{datetime_prefix}_DATETIME': date,
                    'child_birth_date': child_birth_date
                })

        return pd.DataFrame(data)

    # Generate EHR data for each modality
    conditions_df = generate_ehr_data('condition', 'condition_start')
    drugs_df = generate_ehr_data('drug', 'drug_exposure_start')
    procedures_df = generate_ehr_data('procedure', 'procedure')
    observations_df = generate_ehr_data('observation', 'observation')
    measurements_df = generate_ehr_data('measurement', 'measurement')
    measurements_df['value_as_number'] = np.random.uniform(0, 100, len(measurements_df))

    def generate_proteomics_data():
        """Return three proteomics samples (T1-T3) per mother with N(0, 1) protein values."""
        data = []
        for mom_id, child_id, birth_date in zip(mom_person_ids, child_person_ids, child_birth_dates):
            for timepoint in ['T1', 'T2', 'T3']:
                dos = int(np.random.randint(5, 280))
                sample_date = birth_date - timedelta(days=dos)

                proteins = np.random.normal(0, 1, n_proteins)

                data.append({
                    'maternal_person_id': int(mom_id),
                    'child_person_id': int(child_id),
                    'Timepoint': timepoint,
                    'DOS': dos,
                    'sample_date': sample_date,
                    'child_birth_date': birth_date,
                    **{f'Protein_{k}': protein for k, protein in enumerate(proteins)}
                })

        df = pd.DataFrame(data)
        df['sample_ID'] = df['maternal_person_id'].astype(str) + '_' + df['child_person_id'].astype(str) + '_' + df['Timepoint']
        return df

    proteomics_df = generate_proteomics_data()

    # Generate OOL EHR features (list of unique mom_person_ids)
    ool_ehr_features_df = pd.DataFrame({'mom_person_id': mom_person_ids})

    # Generate sampleID indices
    sampleid_indices_df = pd.DataFrame({'0': proteomics_df['sample_ID']})

    def save_csv(frame, path, index):
        """Write ``frame`` to ``path``, creating the parent directory first."""
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        frame.to_csv(path, index=index)

    # Save all generated data to CSV files
    save_csv(conditions_df, './raw_data/EHR/EHR_cohort_conditions.csv', False)
    save_csv(drugs_df, './raw_data/EHR/EHR_cohort_drugs.csv', False)
    save_csv(procedures_df, './raw_data/EHR/EHR_cohort_procedures.csv', False)
    save_csv(observations_df, './raw_data/EHR/EHR_cohort_observations.csv', False)
    save_csv(measurements_df, './raw_data/EHR/EHR_cohort_measurements.csv', False)
    save_csv(proteomics_df, './processed_data/ool_proteomics_omop_id.csv', False)
    save_csv(ool_ehr_features_df, './ool_EHR_features.csv', False)
    save_csv(sampleid_indices_df, './processed_data/sampleID_indices.csv', False)

    print("Simplified synthetic data has been generated with consistent IDs across all datasets.")
    print("Total records generated:")
    print(f"Conditions: {len(conditions_df)}")
    print(f"Drugs: {len(drugs_df)}")
    print(f"Procedures: {len(procedures_df)}")
    print(f"Observations: {len(observations_df)}")
    print(f"Measurements: {len(measurements_df)}")
    print(f"Proteomics samples: {len(proteomics_df)}")

# Build and write the 20-patient synthetic cohort (arguments spelled out; they equal the defaults).
generate_simplified_synthetic_data(n_patients=20, n_codes=10, n_proteins=50)

Simplified synthetic data has been generated with consistent IDs across all datasets.
Total records generated:
Conditions: 46
Drugs: 41
Procedures: 55
Observations: 57
Measurements: 46
Proteomics samples: 60


In [16]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def generate_simplified_synthetic_data(n_patients=250, n_codes=10, n_proteins=50, reference_date=None):
    """Generate the larger synthetic EHR cohort and write it to ``full_EHR_cohort_*`` CSV files.

    Builds five EHR tables (conditions, drugs, procedures, observations,
    measurements) with 1-4 rows per mother; event dates fall 0-279 days before
    the child's birth date. A proteomics table (three timepoints per mother),
    the OOL feature list and the sample-ID index are also built, but in this
    variant they are only counted/kept in memory and not written to disk.

    Args:
        n_patients: Number of mother/child pairs.
        n_codes: Concept IDs are drawn uniformly from 1..n_codes.
        n_proteins: Number of ``Protein_<k>`` columns per proteomics sample.
        reference_date: ``datetime`` that birth dates are counted back from.
            Defaults to ``datetime.now()``; pass a fixed value to make the
            written files identical across runs.

    Returns:
        None. Writes five CSV files under ``./raw_data/EHR`` (created when
        missing) and prints the row count of each table.
    """
    import os  # local import: the cell's import list does not include os

    np.random.seed(42)

    # Resolve "now" once so every patient is anchored to the same instant and
    # callers can pin it for reproducible output.
    if reference_date is None:
        reference_date = datetime.now()

    # Generate patient IDs
    mom_person_ids = np.arange(1000000, 1000000 + n_patients)
    child_person_ids = np.arange(10000000, 10000000 + n_patients)

    # Generate birth dates for children
    child_birth_dates = [
        reference_date - timedelta(days=int(np.random.randint(0, 280)))
        for _ in range(n_patients)
    ]

    def generate_ehr_data(concept_prefix, datetime_prefix):
        """Return one EHR table with 1-4 randomly dated, randomly coded rows per mother."""
        data = []
        for mom_id, child_id, child_birth_date in zip(mom_person_ids, child_person_ids, child_birth_dates):
            for _ in range(np.random.randint(1, 5)):  # 1-4 records per patient
                date = child_birth_date - timedelta(days=int(np.random.randint(0, 280)))
                code = np.random.randint(1, n_codes + 1)

                data.append({
                    'mom_person_id': int(mom_id),
                    'child_person_id': int(child_id),
                    f'{concept_prefix}_concept_id': int(code),
                    f'{datetime_prefix}_DATETIME': date,
                    'child_birth_date': child_birth_date
                })

        return pd.DataFrame(data)

    # Generate EHR data for each modality
    conditions_df = generate_ehr_data('condition', 'condition_start')
    drugs_df = generate_ehr_data('drug', 'drug_exposure_start')
    procedures_df = generate_ehr_data('procedure', 'procedure')
    observations_df = generate_ehr_data('observation', 'observation')
    measurements_df = generate_ehr_data('measurement', 'measurement')
    measurements_df['value_as_number'] = np.random.uniform(0, 100, len(measurements_df))

    def generate_proteomics_data():
        """Return three proteomics samples (T1-T3) per mother with N(0, 1) protein values."""
        data = []
        for mom_id, child_id, birth_date in zip(mom_person_ids, child_person_ids, child_birth_dates):
            for timepoint in ['T1', 'T2', 'T3']:
                dos = int(np.random.randint(5, 280))
                sample_date = birth_date - timedelta(days=dos)

                proteins = np.random.normal(0, 1, n_proteins)

                data.append({
                    'maternal_person_id': int(mom_id),
                    'child_person_id': int(child_id),
                    'Timepoint': timepoint,
                    'DOS': dos,
                    'sample_date': sample_date,
                    'child_birth_date': birth_date,
                    **{f'Protein_{k}': protein for k, protein in enumerate(proteins)}
                })

        df = pd.DataFrame(data)
        df['sample_ID'] = df['maternal_person_id'].astype(str) + '_' + df['child_person_id'].astype(str) + '_' + df['Timepoint']
        return df

    proteomics_df = generate_proteomics_data()

    # Generate OOL EHR features (list of unique mom_person_ids)
    ool_ehr_features_df = pd.DataFrame({'mom_person_id': mom_person_ids})

    # Generate sampleID indices
    sampleid_indices_df = pd.DataFrame({'0': proteomics_df['sample_ID']})

    def save_csv(frame, path, index):
        """Write ``frame`` to ``path``, creating the parent directory first."""
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        frame.to_csv(path, index=index)

    # Save all generated data to CSV files
    save_csv(conditions_df, './raw_data/EHR/full_EHR_cohort_conditions.csv', False)
    save_csv(drugs_df, './raw_data/EHR/full_EHR_cohort_drugs.csv', False)
    save_csv(procedures_df, './raw_data/EHR/full_EHR_cohort_procedures.csv', False)
    save_csv(observations_df, './raw_data/EHR/full_EHR_cohort_observations.csv', False)
    save_csv(measurements_df, './raw_data/EHR/full_EHR_cohort_measurements.csv', False)
    # NOTE(review): the three writes below were already disabled in the original;
    # presumably so this cell does not overwrite files produced by another cell - confirm.
    #proteomics_df.to_csv('./processed_data/ool_proteomics_omop_id.csv', index=False)
    #ool_ehr_features_df.to_csv('./ool_EHR_features.csv', index=False)
    #sampleid_indices_df.to_csv('./processed_data/sampleID_indices.csv', index=False)

    print("Simplified synthetic data has been generated with consistent IDs across all datasets.")
    print("Total records generated:")
    print(f"Conditions: {len(conditions_df)}")
    print(f"Drugs: {len(drugs_df)}")
    print(f"Procedures: {len(procedures_df)}")
    print(f"Observations: {len(observations_df)}")
    print(f"Measurements: {len(measurements_df)}")
    print(f"Proteomics samples: {len(proteomics_df)}")

# Build and write the 250-patient synthetic cohort (arguments spelled out; they equal the defaults).
generate_simplified_synthetic_data(n_patients=250, n_codes=10, n_proteins=50)

Simplified synthetic data has been generated with consistent IDs across all datasets.
Total records generated:
Conditions: 592
Drugs: 634
Procedures: 608
Observations: 600
Measurements: 622
Proteomics samples: 750


### PT

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def generate_comprehensive_synthetic_data(n_patients, n_codes, n_proteins, reference_date=None):
    """Generate synthetic EHR data whose codes depend on days-to-birth, and write it to CSV.

    For every mother, 50-99 distinct visit dates are drawn from the 280 days
    before the child's birth. On each visit date the emitted concept codes
    depend on ``dos`` (days from the visit to the birth date):

    * codes 1-150: trimester-specific, each of the trimester's 50 codes is
      emitted with probability 0.3;
    * codes 151-170: each emitted with probability ``(1 - dos/280)**2``;
    * codes 171-190: each emitted with probability ``(1 - dos/50)**3``
      (only positive when ``dos < 50``);
    * codes 191..n_codes[modality]: with probability 0.05, two distinct
      noise codes are added.

    A proteomics table is also built (three timepoints per mother; the first
    ten proteins are noisy functions of DOS), but in this variant it is only
    counted and not written to disk.

    Args:
        n_patients: Number of mother/child pairs.
        n_codes: Dict with keys ``'conditions'``, ``'drugs'``, ``'procedures'``,
            ``'observations'``, ``'measurements'`` giving the highest concept
            code per modality. Values below 192 simply disable the noise codes.
        n_proteins: Number of ``Protein_<k>`` columns. Values below 10 keep
            only the first ``n_proteins`` DOS-dependent proteins.
        reference_date: ``datetime`` that birth dates are counted back from.
            Defaults to ``datetime.now()``; pass a fixed value to make the
            written files identical across runs.

    Returns:
        None. Writes five CSV files (with the row index) under
        ``./raw_data/EHR`` (created when missing) and prints a summary.
    """
    import os  # local import: the cell's import list does not include os

    np.random.seed(42)

    # Resolve "now" once so every patient is anchored to the same instant and
    # callers can pin it for reproducible output.
    if reference_date is None:
        reference_date = datetime.now()

    # Generate patient IDs
    mom_person_ids = np.arange(1000000, 1000000 + n_patients)
    child_person_ids = np.arange(10000000, 10000000 + n_patients)

    # Generate birth dates for children
    child_birth_dates = [
        reference_date - timedelta(days=int(np.random.randint(0, 1000)))
        for _ in range(n_patients)
    ]

    def generate_distinct_dates(start_date, end_date, n_dates):
        """Return ``n_dates`` distinct, sorted dates in ``[start_date, end_date)`` at whole-day offsets."""
        date_range = (end_date - start_date).days
        distinct_days = np.random.choice(range(date_range), n_dates, replace=False)
        return sorted(start_date + timedelta(days=int(x)) for x in distinct_days)

    # Generate distinct dates for each patient
    patient_dates = {}
    for mom_id, child_birth_date in zip(mom_person_ids, child_birth_dates):
        pregnancy_start = child_birth_date - timedelta(days=280)  # Full pregnancy duration
        n_distinct_dates = np.random.randint(50, 100)
        patient_dates[mom_id] = generate_distinct_dates(pregnancy_start, child_birth_date, n_distinct_dates)

    # Code ranges planted with a DOS-dependent signal.
    trimester_features = {
        1: list(range(1, 51)),    # First trimester
        2: list(range(51, 101)),  # Second trimester
        3: list(range(101, 151))  # Third trimester
    }
    continuous_features = list(range(151, 171))
    labor_features = list(range(171, 191))

    def generate_ehr_data(concept_prefix, datetime_prefix, n_codes_modality):
        """Return one EHR table with one row per (mother, visit date, emitted code)."""
        data = []
        # Codes above the planted ranges; empty when the modality has no spare codes.
        noise_pool = np.arange(191, n_codes_modality + 1)

        for mom_id, child_id, child_birth_date in zip(mom_person_ids, child_person_ids, child_birth_dates):
            for date in patient_dates[mom_id]:
                dos = (child_birth_date - date).days
                # dos runs 1..280, so 3 - dos // 93 would be 0 for dos 279-280;
                # clamp so those earliest days count as first trimester.
                trimester = min(3, max(1, 3 - dos // 93))

                # Trimester-specific features
                included_codes = [
                    code for code in trimester_features[trimester]
                    if np.random.random() > 0.7
                ]

                # Continuous features: probability increases as pregnancy progresses
                for code in continuous_features:
                    if np.random.random() < (1 - dos/280)**2:
                        included_codes.append(code)

                # Labor-approaching features: probability rises sharply near birth
                for code in labor_features:
                    if np.random.random() < (1 - dos/50)**3:
                        included_codes.append(code)

                # Minimal noise: 5% chance of two random codes, when at least two exist
                if np.random.random() < 0.05 and len(noise_pool) >= 2:
                    included_codes.extend(np.random.choice(noise_pool, 2, replace=False))

                for code in included_codes:
                    data.append({
                        'mom_person_id': int(mom_id),
                        'child_person_id': int(child_id),
                        f'{concept_prefix}_concept_id': int(code),
                        f'{datetime_prefix}_DATETIME': date,
                        'child_birth_date': child_birth_date,
                        'dos': dos
                    })

        return pd.DataFrame(data)

    # Generate EHR data for each modality
    conditions_df = generate_ehr_data('condition', 'condition_start', n_codes['conditions'])
    conditions_df['condition_source_value'] = [f'condition_{i}' for i in range(len(conditions_df))]

    drugs_df = generate_ehr_data('drug', 'drug_exposure_start', n_codes['drugs'])
    procedures_df = generate_ehr_data('procedure', 'procedure', n_codes['procedures'])
    observations_df = generate_ehr_data('observation', 'observation', n_codes['observations'])

    measurements_df = generate_ehr_data('measurement', 'measurement', n_codes['measurements'])
    measurements_df['value_as_number'] = np.random.uniform(0, 100, len(measurements_df))

    # DOS-dependent functions for the first proteins (each adds N(0, 0.5) noise).
    # Defined before the generator that uses them so the cell reads top-down.
    protein_functions = [
        lambda dos: np.sin(dos / 40) * 5 + np.random.normal(0, 0.5),
        lambda dos: np.cos(dos / 30) * 4 + np.random.normal(0, 0.5),
        lambda dos: np.log(dos + 1) * 2 + np.random.normal(0, 0.5),
        lambda dos: np.sqrt(dos) / 2 + np.random.normal(0, 0.5),
        lambda dos: dos / 50 + np.random.normal(0, 0.5),
        lambda dos: np.exp(-dos / 100) * 10 + np.random.normal(0, 0.5),
        lambda dos: np.tanh(dos / 50) * 3 + np.random.normal(0, 0.5),
        lambda dos: (dos % 14) / 7 + np.random.normal(0, 0.5),
        lambda dos: np.sin(dos / 20) * np.cos(dos / 40) * 6 + np.random.normal(0, 0.5),
        lambda dos: 1 / (1 + np.exp(-dos / 50)) * 5 + np.random.normal(0, 0.5)
    ]

    def generate_processed_ool_proteomics(mom_person_ids, child_person_ids, child_birth_dates, n_proteins):
        """Return three proteomics samples per mother; leading proteins follow ``protein_functions``."""
        # Split the requested protein count into DOS-dependent and pure-noise columns;
        # clamping keeps n_proteins < len(protein_functions) from requesting a negative size.
        n_signal = min(max(n_proteins, 0), len(protein_functions))
        n_noise = max(n_proteins - len(protein_functions), 0)

        data = []
        for mom_id, child_id, birth_date in zip(mom_person_ids, child_person_ids, child_birth_dates):
            # Generate 3 timepoints for each patient
            for timepoint, dos_range in [('T1', (180, 240)), ('T2', (90, 150)), ('T3', (5, 60))]:
                dos = int(np.random.randint(*dos_range))
                sample_date = birth_date - timedelta(days=dos)

                proteins = [f(dos) for f in protein_functions[:n_signal]] + list(np.random.normal(0, 1, n_noise))

                data.append({
                    'maternal_person_id': int(mom_id),
                    'child_person_id': int(child_id),
                    'Timepoint': timepoint,
                    'DOS': dos,
                    'SampleID': f'Sample_{mom_id}_{timepoint}',
                    'ID': f'ID_{mom_id}_{timepoint}',
                    'EGA': f'EGA_{mom_id}_{timepoint}',
                    'sample_date': sample_date,
                    'child_birth_date': birth_date,
                    **{f'Protein_{k}': protein for k, protein in enumerate(proteins)}
                })

        df = pd.DataFrame(data)
        df['sample_ID'] = df['maternal_person_id'].astype(str) + '_' + df['child_person_id'].astype(str) + '_' + df['Timepoint']
        df['mom_person_id'] = df['maternal_person_id']
        df['DOS_sampling_time'] = df['DOS']
        return df

    proteomics_df = generate_processed_ool_proteomics(mom_person_ids, child_person_ids, child_birth_dates, n_proteins)

    # Generate OOL EHR features (list of unique mom_person_ids)
    ool_ehr_features_df = pd.DataFrame({'mom_person_id': mom_person_ids})

    # Generate sampleID indices
    sampleid_indices_df = pd.DataFrame({'0': proteomics_df['sample_ID']})

    def save_csv(frame, path, index):
        """Write ``frame`` to ``path``, creating the parent directory first."""
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        frame.to_csv(path, index=index)

    # Save all generated data to CSV files
    save_csv(conditions_df, './raw_data/EHR/full_EHR_cohort_conditions.csv', True)
    save_csv(drugs_df, './raw_data/EHR/full_EHR_cohort_drugs.csv', True)
    save_csv(procedures_df, './raw_data/EHR/full_EHR_cohort_procedures.csv', True)
    save_csv(observations_df, './raw_data/EHR/full_EHR_cohort_observations.csv', True)
    save_csv(measurements_df, './raw_data/EHR/full_EHR_cohort_measurements.csv', True)
    # NOTE(review): the three writes below were already disabled in the original;
    # presumably so this cell does not overwrite files produced by another cell - confirm.
    #proteomics_df.to_csv('./processed_data/ool_proteomics_omop_id.csv', index=True)
    #ool_ehr_features_df.to_csv('./ool_EHR_features.csv', index=False)
    #sampleid_indices_df.to_csv('./processed_data/sampleID_indices.csv', index=True)

    print("All synthetic data has been generated with consistent IDs across all datasets.")
    print("Total records generated:")
    print(f"Conditions: {len(conditions_df)}")
    print(f"Drugs: {len(drugs_df)}")
    print(f"Procedures: {len(procedures_df)}")
    print(f"Observations: {len(observations_df)}")
    print(f"Measurements: {len(measurements_df)}")
    print(f"Proteomics samples: {len(proteomics_df)}")

    print("\nVariables that are a function of DOS:")
    print("Proteins: Protein_0 to Protein_9")
    print("EHR data:")
    print("  - Trimester-specific features (codes 1-150)")
    print("  - Continuous features (codes 151-170)")
    print("  - Labor-approaching features (codes 171-190)")

# Example usage: 100 patients, 50 proteins, and the same code-space size for every modality.
n_patients = 100
n_codes = dict.fromkeys(
    ['conditions', 'drugs', 'procedures', 'observations', 'measurements'],
    200,
)
n_proteins = 50

generate_comprehensive_synthetic_data(
    n_patients=n_patients,
    n_codes=n_codes,
    n_proteins=n_proteins,
)

## Omics

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def generate_comprehensive_synthetic_data(n_patients, n_records_per_modality, n_codes, n_proteins, reference_date=None):
    """Generate a synthetic EHR + proteomics cohort with uniform random codes and write it to CSV.

    For every mother, 5-50 distinct visit dates are drawn from the 280 days
    before the child's birth; each modality emits 1-5 uniformly random concept
    codes per visit date. The proteomics table has three samples per mother
    (T3: 3-30, T2: 31-60, T1: 61-100 days before birth) with N(0, 1) protein
    values.

    Args:
        n_patients: Number of mother/child pairs.
        n_records_per_modality: Not used by this function; kept so existing
            callers keep working.
        n_codes: Dict with keys ``'conditions'``, ``'drugs'``, ``'procedures'``,
            ``'observations'``, ``'measurements'``; concept IDs are drawn
            from 1..n_codes[modality].
        n_proteins: Number of ``Protein_<k>`` columns per proteomics sample.
        reference_date: ``datetime`` that birth dates are counted back from.
            Defaults to ``datetime.now()``; pass a fixed value to make the
            written files identical across runs.

    Returns:
        None. Writes eight CSV files below the current working directory
        (creating ``./raw_data/EHR`` and ``./processed_data`` when missing)
        and prints the row count of each table.
    """
    import os  # local import: the cell's import list does not include os

    np.random.seed(42)

    # Resolve "now" once so every patient is anchored to the same instant and
    # callers can pin it for reproducible output.
    if reference_date is None:
        reference_date = datetime.now()

    # Generate patient IDs
    mom_person_ids = np.arange(1000000, 1000000 + n_patients)
    child_person_ids = np.arange(10000000, 10000000 + n_patients)

    # Generate birth dates for children
    child_birth_dates = [
        reference_date - timedelta(days=int(np.random.randint(0, 1000)))
        for _ in range(n_patients)
    ]

    def generate_distinct_dates(start_date, end_date, n_dates):
        """Return ``n_dates`` distinct, sorted dates in ``[start_date, end_date)`` at whole-day offsets."""
        date_range = (end_date - start_date).days
        distinct_days = np.random.choice(range(date_range), n_dates, replace=False)
        return sorted(start_date + timedelta(days=int(x)) for x in distinct_days)

    # Generate distinct dates for each patient
    patient_dates = {}
    for mom_id, child_birth_date in zip(mom_person_ids, child_birth_dates):
        pregnancy_start = child_birth_date - timedelta(days=280)
        n_distinct_dates = np.random.randint(5, 51)
        patient_dates[mom_id] = generate_distinct_dates(pregnancy_start, child_birth_date, n_distinct_dates)

    def generate_ehr_data(concept_prefix, datetime_prefix, n_codes_modality):
        """Return one EHR table with 1-5 random codes per (mother, visit date)."""
        data = []
        for mom_id, child_id, child_birth_date in zip(mom_person_ids, child_person_ids, child_birth_dates):
            for date in patient_dates[mom_id]:
                # Generate 1 to 5 events for each distinct date
                n_events = np.random.randint(1, 6)
                for _ in range(n_events):
                    data.append({
                        'mom_person_id': int(mom_id),
                        'child_person_id': int(child_id),
                        f'{concept_prefix}_concept_id': int(np.random.randint(1, n_codes_modality + 1)),
                        f'{datetime_prefix}_DATETIME': date,
                        'child_birth_date': child_birth_date
                    })

        return pd.DataFrame(data)

    # Generate EHR data for each modality
    conditions_df = generate_ehr_data('condition', 'condition_start', n_codes['conditions'])
    conditions_df['condition_source_value'] = [f'condition_{i}' for i in range(len(conditions_df))]

    drugs_df = generate_ehr_data('drug', 'drug_exposure_start', n_codes['drugs'])
    procedures_df = generate_ehr_data('procedure', 'procedure', n_codes['procedures'])
    observations_df = generate_ehr_data('observation', 'observation', n_codes['observations'])

    measurements_df = generate_ehr_data('measurement', 'measurement', n_codes['measurements'])
    measurements_df['value_as_number'] = np.random.uniform(0, 100, len(measurements_df))

    def generate_processed_ool_proteomics(mom_person_ids, child_person_ids, child_birth_dates, n_proteins):
        """Return three proteomics samples (T3, T2, T1) per mother with N(0, 1) protein values."""
        data = []
        for mom_id, child_id, birth_date in zip(mom_person_ids, child_person_ids, child_birth_dates):
            # T3: 3-30 days before birth
            t3_dos = int(np.random.randint(3, 31))
            # T2: 31-60 days before birth
            t2_dos = int(np.random.randint(31, 61))
            # T1: 61-100 days before birth
            t1_dos = int(np.random.randint(61, 101))

            for timepoint, dos in zip(['T3', 'T2', 'T1'], [t3_dos, t2_dos, t1_dos]):
                sample_date = birth_date - timedelta(days=dos)
                data.append({
                    'maternal_person_id': int(mom_id),
                    'child_person_id': int(child_id),
                    'Timepoint': timepoint,
                    'DOS': dos,
                    'SampleID': f'Sample_{mom_id}_{timepoint}',
                    'ID': f'ID_{mom_id}_{timepoint}',
                    'EGA': f'EGA_{mom_id}_{timepoint}',
                    'sample_date': sample_date,
                    'child_birth_date': birth_date
                })

        df = pd.DataFrame(data)

        # Generate all protein data at once
        protein_data = np.random.normal(0, 1, (len(df), n_proteins))
        protein_df = pd.DataFrame(protein_data, columns=[f'Protein_{k}' for k in range(n_proteins)])

        # Concatenate the original dataframe with the protein data
        df = pd.concat([df, protein_df], axis=1)

        df['sample_ID'] = df['maternal_person_id'].astype(str) + '_' + df['child_person_id'].astype(str) + '_' + df['Timepoint']
        df['mom_person_id'] = df['maternal_person_id']
        df['DOS_sampling_time'] = df['DOS']
        return df

    proteomics_df = generate_processed_ool_proteomics(mom_person_ids, child_person_ids, child_birth_dates, n_proteins)

    # Generate OOL EHR features (list of unique mom_person_ids)
    ool_ehr_features_df = pd.DataFrame({'mom_person_id': mom_person_ids})

    # Generate sampleID indices
    sampleid_indices_df = pd.DataFrame({'0': proteomics_df['sample_ID']})

    def save_csv(frame, path, index):
        """Write ``frame`` to ``path``, creating the parent directory first."""
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        frame.to_csv(path, index=index)

    # Save all generated data to CSV files
    save_csv(conditions_df, './raw_data/EHR/EHR_cohort_conditions.csv', True)
    save_csv(drugs_df, './raw_data/EHR/EHR_cohort_drugs.csv', True)
    save_csv(procedures_df, './raw_data/EHR/EHR_cohort_procedures.csv', True)
    save_csv(observations_df, './raw_data/EHR/EHR_cohort_observations.csv', True)
    save_csv(measurements_df, './raw_data/EHR/EHR_cohort_measurements.csv', True)
    save_csv(proteomics_df, './processed_data/ool_proteomics_omop_id.csv', True)
    save_csv(ool_ehr_features_df, './ool_EHR_features.csv', False)
    save_csv(sampleid_indices_df, './processed_data/sampleID_indices.csv', True)

    print("All synthetic data has been generated with consistent IDs across all datasets.")
    print("Total records generated:")
    print(f"Conditions: {len(conditions_df)}")
    print(f"Drugs: {len(drugs_df)}")
    print(f"Procedures: {len(procedures_df)}")
    print(f"Observations: {len(observations_df)}")
    print(f"Measurements: {len(measurements_df)}")
    print(f"Proteomics samples: {len(proteomics_df)}")

# Example usage: 100 patients, 500 proteins, and the same code-space size for every modality.
n_patients = 100
n_records_per_modality = 1000
n_codes = dict.fromkeys(
    ['conditions', 'drugs', 'procedures', 'observations', 'measurements'],
    200,
)
n_proteins = 500

generate_comprehensive_synthetic_data(
    n_patients=n_patients,
    n_records_per_modality=n_records_per_modality,
    n_codes=n_codes,
    n_proteins=n_proteins,
)

## EHR Cohort

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def generate_comprehensive_synthetic_data(n_patients, n_records_per_modality, n_codes, n_proteins, reference_date=None):
    """Generate the full synthetic EHR cohort with uniform random codes and write it to CSV.

    For every mother, 5-50 distinct visit dates are drawn from the 280 days
    before the child's birth; each modality emits 1-5 uniformly random concept
    codes per visit date. A proteomics table (three samples per mother), the
    OOL feature list and the sample-ID index are also built, but in this
    variant they are only counted/kept in memory and not written to disk.

    Args:
        n_patients: Number of mother/child pairs.
        n_records_per_modality: Not used by this function; kept so existing
            callers keep working.
        n_codes: Dict with keys ``'conditions'``, ``'drugs'``, ``'procedures'``,
            ``'observations'``, ``'measurements'``; concept IDs are drawn
            from 1..n_codes[modality].
        n_proteins: Number of ``Protein_<k>`` columns per proteomics sample.
        reference_date: ``datetime`` that birth dates are counted back from.
            Defaults to ``datetime.now()``; pass a fixed value to make the
            written files identical across runs.

    Returns:
        None. Writes five ``full_EHR_cohort_*`` CSV files (with the row index)
        under ``./raw_data/EHR`` (created when missing) and prints the row
        count of each table.
    """
    import os  # local import: the cell's import list does not include os

    np.random.seed(42)

    # Resolve "now" once so every patient is anchored to the same instant and
    # callers can pin it for reproducible output.
    if reference_date is None:
        reference_date = datetime.now()

    # Generate patient IDs
    mom_person_ids = np.arange(1000000, 1000000 + n_patients)
    child_person_ids = np.arange(10000000, 10000000 + n_patients)

    # Generate birth dates for children
    child_birth_dates = [
        reference_date - timedelta(days=int(np.random.randint(0, 1000)))
        for _ in range(n_patients)
    ]

    def generate_distinct_dates(start_date, end_date, n_dates):
        """Return ``n_dates`` distinct, sorted dates in ``[start_date, end_date)`` at whole-day offsets."""
        date_range = (end_date - start_date).days
        distinct_days = np.random.choice(range(date_range), n_dates, replace=False)
        return sorted(start_date + timedelta(days=int(x)) for x in distinct_days)

    # Generate distinct dates for each patient
    patient_dates = {}
    for mom_id, child_birth_date in zip(mom_person_ids, child_birth_dates):
        pregnancy_start = child_birth_date - timedelta(days=280)
        n_distinct_dates = np.random.randint(5, 51)
        patient_dates[mom_id] = generate_distinct_dates(pregnancy_start, child_birth_date, n_distinct_dates)

    def generate_ehr_data(concept_prefix, datetime_prefix, n_codes_modality):
        """Return one EHR table with 1-5 random codes per (mother, visit date)."""
        data = []
        for mom_id, child_id, child_birth_date in zip(mom_person_ids, child_person_ids, child_birth_dates):
            for date in patient_dates[mom_id]:
                # Generate 1 to 5 events for each distinct date
                n_events = np.random.randint(1, 6)
                for _ in range(n_events):
                    data.append({
                        'mom_person_id': int(mom_id),
                        'child_person_id': int(child_id),
                        f'{concept_prefix}_concept_id': int(np.random.randint(1, n_codes_modality + 1)),
                        f'{datetime_prefix}_DATETIME': date,
                        'child_birth_date': child_birth_date
                    })

        return pd.DataFrame(data)

    # Generate EHR data for each modality
    conditions_df = generate_ehr_data('condition', 'condition_start', n_codes['conditions'])
    conditions_df['condition_source_value'] = [f'condition_{i}' for i in range(len(conditions_df))]

    drugs_df = generate_ehr_data('drug', 'drug_exposure_start', n_codes['drugs'])
    procedures_df = generate_ehr_data('procedure', 'procedure', n_codes['procedures'])
    observations_df = generate_ehr_data('observation', 'observation', n_codes['observations'])

    measurements_df = generate_ehr_data('measurement', 'measurement', n_codes['measurements'])
    measurements_df['value_as_number'] = np.random.uniform(0, 100, len(measurements_df))

    def generate_processed_ool_proteomics(mom_person_ids, child_person_ids, child_birth_dates, n_proteins):
        """Return three proteomics samples (T3, T2, T1) per mother with N(0, 1) protein values."""
        data = []
        for mom_id, child_id, birth_date in zip(mom_person_ids, child_person_ids, child_birth_dates):
            # T3: 3-30 days before birth
            t3_dos = int(np.random.randint(3, 31))
            # T2: 31-60 days before birth
            t2_dos = int(np.random.randint(31, 61))
            # T1: 61-100 days before birth
            t1_dos = int(np.random.randint(61, 101))

            for timepoint, dos in zip(['T3', 'T2', 'T1'], [t3_dos, t2_dos, t1_dos]):
                sample_date = birth_date - timedelta(days=dos)
                data.append({
                    'maternal_person_id': int(mom_id),
                    'child_person_id': int(child_id),
                    'Timepoint': timepoint,
                    'DOS': dos,
                    'SampleID': f'Sample_{mom_id}_{timepoint}',
                    'ID': f'ID_{mom_id}_{timepoint}',
                    'EGA': f'EGA_{mom_id}_{timepoint}',
                    'sample_date': sample_date,
                    'child_birth_date': birth_date
                })

        df = pd.DataFrame(data)

        # Generate all protein data at once
        protein_data = np.random.normal(0, 1, (len(df), n_proteins))
        protein_df = pd.DataFrame(protein_data, columns=[f'Protein_{k}' for k in range(n_proteins)])

        # Concatenate the original dataframe with the protein data
        df = pd.concat([df, protein_df], axis=1)

        df['sample_ID'] = df['maternal_person_id'].astype(str) + '_' + df['child_person_id'].astype(str) + '_' + df['Timepoint']
        df['mom_person_id'] = df['maternal_person_id']
        df['DOS_sampling_time'] = df['DOS']
        return df

    proteomics_df = generate_processed_ool_proteomics(mom_person_ids, child_person_ids, child_birth_dates, n_proteins)

    # Generate OOL EHR features (list of unique mom_person_ids)
    ool_ehr_features_df = pd.DataFrame({'mom_person_id': mom_person_ids})

    # Generate sampleID indices
    sampleid_indices_df = pd.DataFrame({'0': proteomics_df['sample_ID']})

    def save_csv(frame, path, index):
        """Write ``frame`` to ``path``, creating the parent directory first."""
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        frame.to_csv(path, index=index)

    # Save all generated data to CSV files
    save_csv(conditions_df, './raw_data/EHR/full_EHR_cohort_conditions.csv', True)
    save_csv(drugs_df, './raw_data/EHR/full_EHR_cohort_drugs.csv', True)
    save_csv(procedures_df, './raw_data/EHR/full_EHR_cohort_procedures.csv', True)
    save_csv(observations_df, './raw_data/EHR/full_EHR_cohort_observations.csv', True)
    save_csv(measurements_df, './raw_data/EHR/full_EHR_cohort_measurements.csv', True)
    # NOTE(review): the three writes below were already disabled in the original;
    # presumably so this cell does not overwrite files produced by another cell - confirm.
    #proteomics_df.to_csv('./processed_data/ool_proteomics_omop_id.csv', index=True)
    #ool_ehr_features_df.to_csv('./ool_EHR_features.csv', index=False)
    #sampleid_indices_df.to_csv('./processed_data/sampleID_indices.csv', index=True)

    print("All synthetic data has been generated with consistent IDs across all datasets.")
    print("Total records generated:")
    print(f"Conditions: {len(conditions_df)}")
    print(f"Drugs: {len(drugs_df)}")
    print(f"Procedures: {len(procedures_df)}")
    print(f"Observations: {len(observations_df)}")
    print(f"Measurements: {len(measurements_df)}")
    print(f"Proteomics samples: {len(proteomics_df)}")

# Example usage: 10,000 patients, 500 proteins, and the same code-space size for every modality.
n_patients = 10000
n_records_per_modality = 100000
n_codes = dict.fromkeys(
    ['conditions', 'drugs', 'procedures', 'observations', 'measurements'],
    400,
)
n_proteins = 500

generate_comprehensive_synthetic_data(
    n_patients=n_patients,
    n_records_per_modality=n_records_per_modality,
    n_codes=n_codes,
    n_proteins=n_proteins,
)