In [None]:
import pandas as pd
import numpy as np

# Load the prevalence table and drop the "Category" column, which is not simulated.
# Each remaining non-"Subtype" column is treated as a biomarker whose value is the
# percentage of that subtype's population showing the biomarker.
df = pd.read_csv("ccmd_filled_10.23.2025.csv").drop(columns="Category")

# Seeded generator so the synthetic dataset is reproducible
rng = np.random.default_rng(10)

# Desired population size - change for how many samples are needed for each subtype
n = 100

# Every column except the subtype label is a biomarker
biomarker_cols = [col for col in df.columns if col != "Subtype"]

# One frame per subtype, concatenated once after the loop. Growing a DataFrame with
# pd.concat on every iteration is quadratic, and starting from an empty frame
# degrades the column dtypes.
subtype_frames = []

for index, row in df.iterrows():
    # The subtype this row of prevalences describes
    subtype = row["Subtype"]

    # Maps biomarker name -> n Bernoulli draws (1 = present, 0 = absent)
    biomarker_data = {}
    for biomarker in biomarker_cols:
        # Convert the percent prevalence to a probability
        prob_biomarker = row[biomarker] / 100
        # The binomial generator needs 0 <= p <= 1 and no NaN. The negated range
        # test also catches NaN (every comparison with NaN is False), so the error
        # names the offending cell instead of surfacing as a bare numpy ValueError.
        if not (0 <= prob_biomarker <= 1):
            raise ValueError(
                f"Biomarker {biomarker!r} for subtype {subtype!r} has invalid "
                f"probability {prob_biomarker}"
            )
        # Draw order (row by row, biomarker by biomarker) is kept so the seeded
        # output stays the same.
        biomarker_data[biomarker] = rng.binomial(1, prob_biomarker, size=n)

    # Turn the draws into a frame and label every synthetic patient with the subtype
    subtype_df = pd.DataFrame(biomarker_data)
    subtype_df["Subtype"] = [subtype] * n
    subtype_frames.append(subtype_df)

if subtype_frames:
    # Single concat, then restore the column order of the source table
    synthetic_df = pd.concat(subtype_frames, ignore_index=True)[list(df.columns)]
else:
    # No subtypes in the source table: keep an empty frame with the expected columns
    synthetic_df = pd.DataFrame(columns=df.columns)

# NOTE(review): the row index is written as an unnamed first column; pass
# index=False here if downstream readers do not expect it.
synthetic_df.to_csv("synthetic_dataset_10.23.2025.csv")


In [96]:
# Show the generated synthetic dataset as this cell's output
synthetic_df

Unnamed: 0,Subtype,Cystic,Tubular,Tubulocystic,Cribriform/Adenocarcinoma-like,Sieve-like with crystals,Solid/Nested,Papillary/Tubulopapillary,Infiltrative,Clear,...,Clear granular cytoplasm,Plant-like cell membrane,Basophilic stippling,Thick fibromuscular stroma,Desmoplastic stroma,Foamy macrophages,Mucin,Calcium oxalate crystals,End-stage renal disease background,True Necrosis
0,Clear cell RCC,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,Clear cell RCC,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Clear cell RCC,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,Clear cell RCC,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,Clear cell RCC,1,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2795,Urothelial Carcinoma,0,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,1
2796,Urothelial Carcinoma,0,0,0,1,0,1,1,1,0,...,0,0,0,0,1,0,0,0,0,1
2797,Urothelial Carcinoma,0,0,0,1,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,1
2798,Urothelial Carcinoma,0,0,0,1,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,1


In [67]:
# Debug check on the most recent probability computed by the generation loop.
# It relies on `biomarker` and `prob_biomarker` still being defined in the kernel.
# Written as a negated range test so that NaN, which fails every comparison,
# is reported as invalid too (the old `< 0 or > 1` form silently accepted it).
if not (0 <= prob_biomarker <= 1):
    print(f"Warning: {biomarker} has invalid probability {prob_biomarker}")

In [None]:
# This is just kept for history, might need to go back to this version...
# For every row of `df`, draws `n` synthetic patients per biomarker and stacks
# all subtypes into `population_df`. Uses `df` and `rng` from the kernel.

# Desired population size - change for how many samples are needed for each subtype
n = 100

# Get all biomarkers available. The comparison is case-insensitive because the
# label column is read as "Subtype" below; a lowercase-only exclusion list lets
# that column through and float() then fails on the subtype name.
biomarker_cols = [
    col for col in df.columns if col.lower() not in ("category", "subtype")
]

# Accumulate per-subtype frames in a list. `synthetic_df` is a DataFrame elsewhere
# in this notebook, so it cannot serve as the accumulator: DataFrame.append was
# removed in pandas 2.0 and pd.concat rejects a bare DataFrame.
subtype_frames = []

for index, row in df.iterrows():
    # Select the specific subtype you are on now
    subtype = row["Subtype"]

    # Create a dict of random draws per biomarker
    data = {"subtype": [subtype] * n}

    for biomarker in biomarker_cols:
        # NOTE(review): this version uses the cell value directly as a probability,
        # while the current generation cell divides by 100 - confirm which scale
        # the input table uses before reviving this cell.
        p = float(row[biomarker])
        data[biomarker] = rng.binomial(1, p, size=n)

    subtype_frames.append(pd.DataFrame(data))

# Combine all subtypes into one final dataset
if subtype_frames:
    population_df = pd.concat(subtype_frames, ignore_index=True)
else:
    # No rows in df: keep an empty frame with the expected columns
    population_df = pd.DataFrame(columns=["subtype", *biomarker_cols])

# summary
print(population_df.head())
print("\nPrevalence check by subtype:")
print(population_df.groupby("subtype").mean(numeric_only=True))

In [None]:
import numpy as np
import pandas as pd

# Minimal demo: simulate two independent binary biomarkers for one population
rng = np.random.default_rng(42)

n = 100        # population size
p_bio1 = 0.60  # prevalence of biomarker A
p_bio2 = 0.30  # prevalence of biomarker B

# Bernoulli draws (1 = present). A is drawn before B so the seeded stream is stable.
bioA = rng.binomial(1, p_bio1, size=n)
bioB = rng.binomial(1, p_bio2, size=n)

print(bioA)

df = pd.DataFrame({"biomarker_A": bioA, "biomarker_B": bioB})

# Label each patient with its combined status, e.g. "A+/B-"
a_label = df["biomarker_A"].map({0: "A-", 1: "A+"})
b_label = df["biomarker_B"].map({0: "B-", 1: "B+"})
df["group"] = a_label.str.cat(b_label, sep="/")

print(df.head())
print("\nCounts:\n", df["group"].value_counts())
print("\nPrevalences observed:",
      df["biomarker_A"].mean(), df["biomarker_B"].mean())



[0 1 0 0 1 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 1 1 1
 1 1 0 1 0 0 1 0 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 0 1
 1 1 1 0 1 0 0 1 0 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0]


'df["group"] = (\n    df["biomarker_A"].map({0:"A-",1:"A+"}) + "/" +\n    df["biomarker_B"].map({0:"B-",1:"B+"})\n)\n\nprint(df.head())\nprint("\nCounts:\n", df["group"].value_counts())\nprint("\nPrevalences observed:",\n      df["biomarker_A"].mean(), df["biomarker_B"].mean())'

In [None]:
# Research for synthetic data generation 
import numpy as np
import pandas as pd

### SYNTHETIC DATASET ###
# Legacy global seed; every draw below consumes the same global stream, so the
# order of the draws determines the values.
np.random.seed(6)

n_samples = 1000


def _draw_standard_normal():
    """Return n_samples draws from N(0, 1) using the global NumPy stream."""
    return np.random.normal(0, 1, n_samples)


# Category 1: Genetic factors
G1, G2, G3 = (_draw_standard_normal() for _ in range(3))

# Category 2: Environmental exposures (one continuous, one binary)
E1 = _draw_standard_normal()
E2 = np.random.binomial(n=1, p=0.5, size=n_samples)

# Category 3: Nutritional
N1, N2 = (_draw_standard_normal() for _ in range(2))

# Category 4: Metabolic
M1, M2 = (_draw_standard_normal() for _ in range(2))

# Target: additive main effects plus two interaction terms
main_effects = 0.5 * G3 + 0.7 * E2 + 0.2 * N1 + 0.3 * M1
pairwise_interaction = 1.8 * G1 * E1         # interaction between 2 features
higher_order_interaction = 1.8 * G2 * N2 * M2  # higher order interaction
y = main_effects + pairwise_interaction + higher_order_interaction

# Optional: clip target to simulate z-score range
#y = np.clip(y, -3, 3)

# Assemble features and target into a DataFrame
synthetic_df = pd.DataFrame(
    dict(G1=G1, G2=G2, G3=G3, E1=E1, E2=E2, N1=N1, N2=N2, M1=M1, M2=M2, y=y)
)

# Preview
print(synthetic_df.head())