In [2]:
import pandas as pd
import numpy as np

# --- Configuration ---
# Fixed seed so the synthetic dataset is identical on every run.
SEED = 42
np.random.seed(SEED)
num_records = 100

# Thresholds for the synthetic "high risk" rule applied further down.
HIGH_RISK_AGE = 55             # rule applies to Age strictly above this
HIGH_RISK_CHOLESTEROL = 240    # rule applies to Cholesterol strictly above this
HIGH_RISK_MIN_OLDPEAK = 2.0    # floor enforced on Oldpeak for high-risk rows

# --- Random patient features ---
# NOTE: under the fixed seed, the generated values depend on the order of the
# draws below; reordering the keys changes the dataset.
# np.random.randint excludes its upper bound (e.g. Age spans 29..77), and
# np.random.uniform samples from the half-open interval [low, high).
data = {
    'Age': np.random.randint(29, 78, size=num_records),
    'Sex': np.random.choice([0, 1], size=num_records), # 0: Female, 1: Male
    'ChestPainType': np.random.choice([0, 1, 2, 3], size=num_records),
    'RestingBP': np.random.randint(94, 200, size=num_records),
    'Cholesterol': np.random.randint(126, 564, size=num_records),
    # Weighted draw: 1 is sampled with probability 0.2.
    'FastingBS': np.random.choice([0, 1], size=num_records, p=[0.8, 0.2]),
    'RestingECG': np.random.choice([0, 1, 2], size=num_records),
    'MaxHR': np.random.randint(71, 202, size=num_records),
    'ExerciseAngina': np.random.choice([0, 1], size=num_records),
    # Rounded to one decimal place.
    'Oldpeak': np.round(np.random.uniform(0.0, 6.2, size=num_records), 1),
    'ST_Slope': np.random.choice([0, 1, 2], size=num_records)
}

df_large = pd.DataFrame(data)

# --- REFINED HIGH RISK LOGIC ---
# Rows with Age > HIGH_RISK_AGE AND Cholesterol > HIGH_RISK_CHOLESTEROL get
# markers typical of heart disease written over their random values.
mask = (df_large['Age'] > HIGH_RISK_AGE) & (df_large['Cholesterol'] > HIGH_RISK_CHOLESTEROL)
df_large.loc[mask, 'ST_Slope'] = 2            # Downsloping
df_large.loc[mask, 'ExerciseAngina'] = 1     # Yes
# Vectorized floor: values below the minimum are raised to it, others are kept.
df_large.loc[mask, 'Oldpeak'] = df_large.loc[mask, 'Oldpeak'].clip(lower=HIGH_RISK_MIN_OLDPEAK)

# Sanity checks: fail loudly if the frame or the override is not as intended.
assert len(df_large) == num_records
assert df_large.loc[mask, 'ST_Slope'].eq(2).all()
assert df_large.loc[mask, 'ExerciseAngina'].eq(1).all()
assert df_large.loc[mask, 'Oldpeak'].ge(HIGH_RISK_MIN_OLDPEAK).all()

# Save to CSV (relative path; index column is not written)
file_name = 'test_patients_100.csv'
df_large.to_csv(file_name, index=False)

print(f"✅ Created '{file_name}' with {num_records} records.")
print("\n--- First 5 Records ---")
print(df_large.head())

✅ Created 'test_patients_100.csv' with 100 records.

--- First 5 Records ---
   Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0   67    0              2        129          442          0           2   
1   57    1              3        112          429          0           1   
2   43    1              0        183          272          1           0   
3   71    1              2        160          129          0           0   
4   36    0              1        112          160          1           2   

   MaxHR  ExerciseAngina  Oldpeak  ST_Slope  
0    196               1      5.7         2  
1    128               1      2.0         2  
2    131               1      1.7         1  
3    197               0      5.0         1  
4    175               0      4.6         1  
