# Generate Synthetic Data

Since we don't have access to the actual dataset used in the journal article, we generate a synthetic dataset that replicates the variables it mentions.

In [17]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so every run yields the same synthetic cohort.
# NOTE: all draws below consume this global stream in sequence, so reordering
# any of the np.random calls changes the generated data.
np.random.seed(42)

# Cohort size
n_patients = 400

# Enrollment month per patient: integers 0-11
enrollment_time = np.random.randint(0, 12, size=n_patients)

# Candidate treatment times in months: 3, 6, ..., 24
time_points = np.arange(3, 25, 3)
# Only the interior candidates (6-21 months) are ever assigned below; the
# first and last entries of time_points are excluded from sampling.
eligible_treatment_times = time_points[1:-1]

# Baseline scores, one row per patient; columns are pain, urgency, frequency (0-9)
baseline_symptoms = np.random.randint(0, 10, size=(n_patients, 3))

# Everyone starts untreated: indicator 0.0 and treatment time np.inf.
# The indicator is a float array, so the resulting column holds 0.0 / 1.0.
treated_indicator = np.zeros(n_patients, dtype=float)
treatment_time = np.full(n_patients, np.inf)

# Per-patient treatment decision: draw an individual probability in
# [0.1, 0.5), then a single Bernoulli trial with that probability.
for i in range(n_patients):
    treat_prob = np.random.uniform(0.1, 0.5)
    treated = np.random.binomial(1, treat_prob)
    treated_indicator[i] = treated

    # Untreated patients keep treatment_time == np.inf and draw nothing more
    if not treated:
        continue
    treatment_time[i] = np.random.choice(eligible_treatment_times)

# Scores at treatment: baseline shifted by an integer in [-2, 2], kept on the
# 0-9 scale. Generated for every patient, regardless of treatment status.
treatment_shift = np.random.randint(-2, 3, size=(n_patients, 3))
treatment_symptoms = (baseline_symptoms + treatment_shift).clip(0, 9)

# 3-month follow-up: treatment-time scores plus an integer in [0, 2], clipped to 0-9
increase_3m = np.random.randint(0, 3, size=(n_patients, 3))
symptoms_3m = (treatment_symptoms + increase_3m).clip(0, 9)

# 6-month follow-up: 3-month scores minus 0 or 1, clipped to 0-9
decrease_6m = np.random.randint(0, 2, size=(n_patients, 3))
symptoms_6m = (symptoms_3m - decrease_6m).clip(0, 9)

# Gender code per patient (0 = Female, 1 = Male)
gender = np.random.choice([0, 1], size=n_patients)

# Assemble one row per patient; symptom arrays are split column-wise in the
# order pain, urgency, frequency.
patient_columns = {
    'Patient_ID': np.arange(1, n_patients + 1),
    'Enrollment_Time': enrollment_time,
    'Treatment_Time': treatment_time,
    'Treated': treated_indicator,
    'base_pain': baseline_symptoms[:, 0],
    'base_urgency': baseline_symptoms[:, 1],
    'base_frequency': baseline_symptoms[:, 2],
    'treatment_pain': treatment_symptoms[:, 0],
    'treatment_urgency': treatment_symptoms[:, 1],
    'treatment_frequency': treatment_symptoms[:, 2],
    'Pain_3M': symptoms_3m[:, 0],
    'Urgency_3M': symptoms_3m[:, 1],
    'Frequency_3M': symptoms_3m[:, 2],
    'Pain_6M': symptoms_6m[:, 0],
    'Urgency_6M': symptoms_6m[:, 1],
    'Frequency_6M': symptoms_6m[:, 2],
    'Gender': gender,
}
df = pd.DataFrame(patient_columns)

df.head()


Unnamed: 0,Patient_ID,Enrollment_Time,Treatment_Time,Treated,base_pain,base_urgency,base_frequency,treatment_pain,treatment_urgency,treatment_frequency,Pain_3M,Urgency_3M,Frequency_3M,Pain_6M,Urgency_6M,Frequency_6M,Gender
0,1,6,6.0,1.0,0,8,5,0,9,7,2,9,7,2,9,7,0
1,2,3,inf,0.0,6,9,6,8,7,5,8,8,6,7,8,5,0
2,3,10,12.0,1.0,9,2,1,8,3,2,8,4,2,7,4,1,1
3,4,7,inf,0.0,8,7,9,9,6,9,9,7,9,8,7,9,0
4,5,4,12.0,1.0,6,8,3,8,8,3,8,9,3,8,8,2,0


In [18]:
# Tally how many patients fall under each value of the Treated column
df.loc[:, "Treated"].value_counts()

Treated
0.0    270
1.0    130
Name: count, dtype: int64