In [2]:
import numpy as np
import pandas as pd

### SYNTHETIC DATSET ###
np.random.seed(6)

n_samples = 1000

# Category 1: Genetic factors
G1 = np.random.normal(0, 1, n_samples)
G2 = np.random.normal(0, 1, n_samples)
G3 = np.random.normal(0, 1, n_samples)

# Category 2: Environmental exposures (binary and continuous)
E1 = np.random.normal(0, 1, n_samples)              
E2 = np.random.binomial(n=1, p=0.5, size=n_samples)

# Category 3: Nutritional
N1 = np.random.normal(0, 1, n_samples)            
N2 = np.random.normal(0, 1, n_samples)

# Category 4: Metabolic
M1 = np.random.normal(0, 1, n_samples)
M2 = np.random.normal(0, 1, n_samples)

# synthetic dataset
y = (  
    0.5 * G3 +
    0.7 * E2 +
    0.2 * N1 +
    0.3 * M1 +
    + 1.8 * G1 * E1 #interaction between 2 features
    + 1.8 * G2 * N2 * M2 # higher order interaction
)

# Clip target to simulate z-score range
#y = np.clip(y, -3, 3)

# Assemble into a DataFrame
synthetic_df = pd.DataFrame({
    'G1': G1,
    'G2': G2,
    'G3': G3,
    'E1': E1,
    'E2': E2,
    'N1': N1,
    'N2': N2,
    'M1': M1,
    'M2': M2,
    'y': y
})

# Preview
print(synthetic_df.head())


         G1        G2        G3        E1  E2        N1        N2        M1  \
0 -0.311784 -1.418699 -1.260993  0.132408   0  2.049664 -0.338455 -0.426517   
1  0.729004 -0.160942 -1.024372 -1.651252   1  1.050320 -0.894394  0.097983   
2  0.217821  0.213931 -0.182401 -1.499079   1  0.489765 -1.668413  2.362459   
3 -0.899092  1.281692 -1.209049  0.127766   0 -0.492195  1.919627  0.323312   
4 -2.486781 -2.382922 -1.086511  0.249325   0 -1.144469  0.575439 -0.101861   

         M2         y  
0 -0.549591 -0.897839  
1 -1.542525 -2.139181  
2  0.806702  0.309457  
3  0.130645 -0.234160  
4  0.217153 -2.454715  


In [None]:
# Replicating Synthetic Data Generation based on Diagnostic Criteria

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility
np.random.seed(8)

# Number of samples
n_samples = 1000

# Duplicate each subtype by n (replicates each row in the dictionary by n): 20



# Generate features
G1 = np.random.normal(0, 1, n_samples)



In [1]:
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

n = 1000
p_bio1 = 0.60  # prevalence of biomarker A
p_bio2 = 0.30  # prevalence of biomarker B

bioA = rng.binomial(1, p_bio1, size=n)
bioB = rng.binomial(1, p_bio2, size=n)

df = pd.DataFrame({"biomarker_A": bioA, "biomarker_B": bioB})
df["group"] = (
    df["biomarker_A"].map({0:"A-",1:"A+"}) + "/" +
    df["biomarker_B"].map({0:"B-",1:"B+"})
)

print(df.head())
print("\nCounts:\n", df["group"].value_counts())
print("\nPrevalences observed:",
      df["biomarker_A"].mean(), df["biomarker_B"].mean())


   biomarker_A  biomarker_B  group
0            0            1  A-/B+
1            1            0  A+/B-
2            0            0  A-/B-
3            0            1  A-/B+
4            1            1  A+/B+

Counts:
 group
A+/B-    37
A-/B-    32
A+/B+    23
A-/B+     8
Name: count, dtype: int64

Prevalences observed: 0.6 0.31


In [68]:
import pandas as pd
import numpy as np

# Read CSV
df = pd.read_csv("ccmd_filled_10.23.2025.csv")
df = df.drop(columns = "Category")

# Set a seed for reproducibility
rng = np.random.default_rng(10)

# Create an empty DataFrame to hold synthetic patient data
synthetic_df = pd.DataFrame(columns=df.columns)

In [None]:
# Get all biomarkers available
biomarker_cols = []
for col in df.columns:
    if col not in ["Subtype"]:
        biomarker_cols.append(col)

for index, row in df.iterrows():
    # Select the specific subtype you are on now
    subtype = row["Subtype"]
    # desired population size - change for how many samples needed for each subtype
    n = 100

    # Create an empty dictionary to hold all biomarkers and their associated percent population
    biomarker_data = {}
    for biomarker in biomarker_cols:
        # identify population prevalance of each biomarker
        prob_biomarker = row[biomarker]/100
        # Create a unique key value for the dictionary
        key = (f"{subtype} , {biomarker}")
        # When working with binomial generator, p must be less than 1 or greater than 0 (no Na)
        # Adding it to the biomarker data dicitionary as well
        biomarker_data[key] = rng.binomial(1,prob_biomarker, size=n)

print(biomarker_data)




{'Urothelial Carcinoma,Cystic': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'Urothelial Carcinoma,Tubular': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'Urothelial Carcinoma,Tubulocystic': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      

In [67]:
if prob_biomarker < 0 or prob_biomarker > 1:
    print(f"Warning: {biomarker} has invalid probability {prob_biomarker}")

In [None]:
# This is just kept for history, might need to go back to this version... 
for index, row in df.iterrows():
    # Select the specific subtype you are on now
    subtype = row["Subtype"]
    # desired population size - change for how many samples needed for each subtype
    n = 100
    
    # Get all biomarkers available
    #biomarker_cols = [col for col in df.columns if col not in ["category","subtype"]]
    biomarker_cols = []
    for col in df.columns:
        if col not in ["category","subtype"]:
            biomarker_cols.append(col)
    
    # Create a dict of random draws per biomarker
    data = {"subtype": [subtype]*n}
    
    for biomarker in biomarker_cols:
        p = float(row[biomarker])
        data[biomarker] = rng.binomial(1, p, size=n)
    
    subtype_df = pd.DataFrame(data)
    synthetic_df.append(subtype_df)

# Combine all subtypes into one final dataset
population_df = pd.concat(synthetic_df, ignore_index=True)

# summary
print(population_df.head())
print("\nPrevalence check by subtype:")
print(population_df.groupby("subtype").mean(numeric_only=True))

In [39]:
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

n = 100
p_bio1 = 0.60  # prevalence of biomarker A
p_bio2 = 0.30  # prevalence of biomarker B

bioA = rng.binomial(1, p_bio1, size=n)
bioB = rng.binomial(1, p_bio2, size=n)

print(bioA)

df = pd.DataFrame({"biomarker_A": bioA, "biomarker_B": bioB})
#print(df.head(20))


'''df["group"] = (
    df["biomarker_A"].map({0:"A-",1:"A+"}) + "/" +
    df["biomarker_B"].map({0:"B-",1:"B+"})
)

print(df.head())
print("\nCounts:\n", df["group"].value_counts())
print("\nPrevalences observed:",
      df["biomarker_A"].mean(), df["biomarker_B"].mean())'''


[0 1 0 0 1 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 1 1 1
 1 1 0 1 0 0 1 0 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 0 1
 1 1 1 0 1 0 0 1 0 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0]


'df["group"] = (\n    df["biomarker_A"].map({0:"A-",1:"A+"}) + "/" +\n    df["biomarker_B"].map({0:"B-",1:"B+"})\n)\n\nprint(df.head())\nprint("\nCounts:\n", df["group"].value_counts())\nprint("\nPrevalences observed:",\n      df["biomarker_A"].mean(), df["biomarker_B"].mean())'