In [2]:
import pandas as pd

In [3]:
import numpy as np
import pandas as pd

np.random.seed(42)

ROWS = 12000

data = {
    # Demographics
    "age": np.random.randint(25, 85, ROWS),
    "sex": np.random.binomial(1, 0.55, ROWS),  # 55% male
    
    # Clinical measurements
    "cholesterol": np.random.normal(240, 50, ROWS),
    "resting_bp": np.random.normal(135, 20, ROWS),
    "max_heart_rate": np.random.normal(150, 25, ROWS),
    
    # Blood indicators
    "fasting_blood_sugar": np.random.binomial(1, 0.25, ROWS),
    "blood_sugar_level": np.random.normal(110, 30, ROWS),
    
    # Symptoms
    "exercise_angina": np.random.binomial(1, 0.3, ROWS),
    "oldpeak": np.abs(np.random.normal(1.5, 1.2, ROWS)),
    
    # Lifestyle / history
    "smoking": np.random.binomial(1, 0.35, ROWS),
    "obesity": np.random.binomial(1, 0.4, ROWS),
}

df = pd.DataFrame(data)

# --- Introduce medical realism ---
df["cholesterol"] = df["cholesterol"].clip(120, 450)
df["resting_bp"] = df["resting_bp"].clip(80, 200)
df["max_heart_rate"] = df["max_heart_rate"].clip(60, 210)
df["blood_sugar_level"] = df["blood_sugar_level"].clip(60, 300)
df["oldpeak"] = df["oldpeak"].clip(0, 6)

# --- Disease probability logic (REALISTIC) ---
risk_score = (
    0.03 * df["age"] +
    0.02 * df["cholesterol"] +
    0.025 * df["resting_bp"] +
    0.04 * df["fasting_blood_sugar"] * 100 +
    0.05 * df["exercise_angina"] * 100 +
    0.04 * df["smoking"] * 100 +
    0.04 * df["obesity"] * 100 +
    0.03 * df["oldpeak"] * 10 -
    0.02 * df["max_heart_rate"]
)

probability = 1 / (1 + np.exp(-0.03 * (risk_score - risk_score.mean())))

df["target"] = (probability > 0.65).astype(int)

# --- Enforce class imbalance (industry realistic) ---
# ~80% healthy, ~20% disease
df.loc[df.sample(frac=0.25, random_state=42).index, "target"] = 0

# --- Inject missing values (noise) ---
for col in ["cholesterol", "resting_bp", "blood_sugar_level"]:
    df.loc[df.sample(frac=0.07, random_state=42).index, col] = np.nan

# --- Inject impossible values (dirty hospital data) ---
df.loc[df.sample(frac=0.01, random_state=42).index, "cholesterol"] = 0
df.loc[df.sample(frac=0.01, random_state=24).index, "resting_bp"] = 0

print(df.head())
print("\nShape:", df.shape)
print("\nTarget distribution:\n", df["target"].value_counts(normalize=True))




risk = (
    (df["cholesterol"] > 240).astype(int) +
    (df["resting_bp"] > 140).astype(int) +
    df["exercise_angina"] +
    df["smoking"] +
    df["obesity"] +
    (df["age"] > 55).astype(int)
)

df["target"] = (risk >= 3).astype(int)


   age  sex  cholesterol  resting_bp  max_heart_rate  fasting_blood_sugar  \
0   63    1   158.236706  171.826261      181.703149                    0   
1   76    1   307.802317  143.901204      126.261537                    0   
2   53    1   292.632469  159.715218      150.652781                    0   
3   39    1   216.365781  147.477747      191.136913                    0   
4   67    0   253.091766  126.739457      140.254895                    0   

   blood_sugar_level  exercise_angina   oldpeak  smoking  obesity  target  
0         177.554990                0  2.217908        1        1       0  
1          83.504048                1  1.480675        0        0       0  
2         112.736859                0  0.072005        1        0       0  
3         149.440470                0  2.377410        0        0       0  
4         109.788175                1  2.550232        0        0       0  

Shape: (12000, 12)

Target distribution:
 target
0    1.0
Name: proportion, dtyp

In [4]:
df

Unnamed: 0,age,sex,cholesterol,resting_bp,max_heart_rate,fasting_blood_sugar,blood_sugar_level,exercise_angina,oldpeak,smoking,obesity,target
0,63,1,158.236706,171.826261,181.703149,0,177.554990,0,2.217908,1,1,1
1,76,1,307.802317,143.901204,126.261537,0,83.504048,1,1.480675,0,0,1
2,53,1,292.632469,159.715218,150.652781,0,112.736859,0,0.072005,1,0,1
3,39,1,216.365781,147.477747,191.136913,0,149.440470,0,2.377410,0,0,0
4,67,0,253.091766,126.739457,140.254895,0,109.788175,1,2.550232,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
11995,57,1,,,166.054175,0,,0,0.757018,1,1,1
11996,70,0,178.429561,123.167288,149.343325,1,107.229654,0,2.060941,0,0,0
11997,41,1,218.337595,152.408872,169.674321,0,150.730573,0,0.663092,1,0,0
11998,66,1,,,182.172033,0,,0,0.956816,1,0,0


np.int64(828)

INFO REGARDING COLUMNS AND THE PROBLEMS OUT THERE 

In [None]:
#  Normal range of Cholestrol in adults is less than 200 mg/dL.
# and of under 18 is less than 170 mg/dL.
                        Cholestrol  : 
                        1) MISSING VALUES = 720 
                        2)also some of entries are 0 which is impossible in real life.
                        
 
 
 
 
 #the blood moves across the body insides vessels as they are moving they exert pressure on the walls of the vessels this pressure is called blood pressure.
 # resting bp is the bp when the person is at rest like hasen't done any kind of exercisw or some sort of activities 
 # normal resting bp of an adult is less than 120 ,
#  the warning zone(elevated) bp is between 120-129,
# high bp stage 1 is between 130-139,
# high bp stage 2 is 140 or higher,
                        RESTING_BP :
                        1) some values are 0 which is impossible in real life.
                        2) MISSING VALUES = 828     
                        
                        
                        
                        
                        
                        
                        
                        
# the maximum numner of pumping your heart can do in a minute during exercise or some sort of fast  activities  is called max heart rate.

            #             MAX_HEART_RATE basicallyt denote the limit of heart rate for a specific age group like the  heart of a person at this age cannot beats more than this limit.
                        
                     MAX_HEART_RATE:
                            so far so good
                            
                            
                            
                            









#fasting blood sugar means the amount   of glucose present in your blood after not eating for at least 8 hours.












                            
                            
                            
                            
                            

In [13]:
df['resting_bp'].isnull().sum()

np.int64(828)

In [26]:
df['max_heart_rate'].isnull().sum()

np.int64(0)

In [28]:
df.groupby('age')['max_heart_rate'].mean().head(3)

Unnamed: 0_level_0,max_heart_rate
age,Unnamed: 1_level_1
25,151.00052
26,155.199743
27,147.240147


In [23]:
a=76
b=220
print(b-a)

144


In [24]:
df['max_heart_rate']

Unnamed: 0,max_heart_rate
0,181.703149
1,126.261537
2,150.652781
3,191.136913
4,140.254895
...,...
11995,166.054175
11996,149.343325
11997,169.674321
11998,182.172033


In [29]:
df.head(3)

Unnamed: 0,age,sex,cholesterol,resting_bp,max_heart_rate,fasting_blood_sugar,blood_sugar_level,exercise_angina,oldpeak,smoking,obesity,target
0,63,1,158.236706,171.826261,181.703149,0,177.55499,0,2.217908,1,1,1
1,76,1,307.802317,143.901204,126.261537,0,83.504048,1,1.480675,0,0,1
2,53,1,292.632469,159.715218,150.652781,0,112.736859,0,0.072005,1,0,1
