In [None]:
!pip install pandas

In [2]:
import pandas as pd

## Prepare data

In [3]:
# Dataset: https://www.kaggle.com/datasets/aasheesh200/framingham-heart-study-dataset
# With the Kaggle CLI: kaggle datasets download -d aasheesh200/framingham-heart-study-dataset
# The CSV is expected at ./data/framingham.csv

# Read the raw table, discard the columns this analysis does not use,
# then keep only the rows without missing values
df = (
    pd.read_csv("./data/framingham.csv")
    .drop(columns=["education", "currentSmoker", "BPMeds", "diabetes", "diaBP", "BMI"])
    .dropna()
)

df.head()

Unnamed: 0,male,age,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,heartRate,glucose,TenYearCHD
0,1,39,0.0,0,0,195.0,106.0,80.0,77.0,0
1,0,46,0.0,0,0,250.0,121.0,95.0,76.0,0
2,1,48,20.0,0,0,245.0,127.5,75.0,70.0,0
3,0,61,30.0,0,1,225.0,150.0,65.0,103.0,1
4,0,46,23.0,0,0,285.0,130.0,85.0,85.0,0


In [4]:
# Show summary statistics (count, mean, std, min, quartiles, max) for every column
df.describe()

Unnamed: 0,male,age,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,heartRate,glucose,TenYearCHD
count,3816.0,3816.0,3816.0,3816.0,3816.0,3816.0,3816.0,3816.0,3816.0,3816.0
mean,0.442872,49.632075,8.986635,0.006289,0.313679,236.920597,132.460168,75.708857,81.918763,0.154612
std,0.496791,8.603992,11.92269,0.079066,0.464048,44.737876,22.161911,11.924352,24.017451,0.361582
min,0.0,32.0,0.0,0.0,0.0,107.0,83.5,44.0,40.0,0.0
25%,0.0,42.0,0.0,0.0,0.0,206.0,117.0,68.0,71.0,0.0
50%,0.0,49.0,0.0,0.0,0.0,234.0,128.0,75.0,78.0,0.0
75%,1.0,56.25,20.0,0.0,1.0,264.0,144.0,82.0,87.0,0.0
max,1.0,70.0,70.0,1.0,1.0,696.0,295.0,143.0,394.0,1.0


In [5]:
# Group all patients by their coronary heart disease (CHD) status after 10 years,
# then count the non-null values per column in each class to see the class balance
grouped = df.groupby('TenYearCHD')
grouped.count()

Unnamed: 0_level_0,male,age,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,heartRate,glucose
TenYearCHD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3226,3226,3226,3226,3226,3226,3226,3226,3226
1,590,590,590,590,590,590,590,590,590


In [6]:
# Build a balanced subset: 500 randomly sampled patients from each class (CHD = 0/1).
# The groups are iterated explicitly instead of going through `grouped.apply`, because
# apply's treatment of the grouping column changed across pandas releases (operating on
# it is deprecated in 2.2), and the `TenYearCHD` column must remain in the result.
# Each group is sampled with the same seed and re-indexed from 0; the group label
# becomes the outer index level named "TenYearCHD", matching the layout apply produced.
data = pd.concat(
    {
        chd: patients.sample(n=500, random_state=73).reset_index(drop=True)
        for chd, patients in grouped
    },
    names=["TenYearCHD"],
)
data.describe()


Unnamed: 0,male,age,cigsPerDay,prevalentStroke,prevalentHyp,totChol,sysBP,heartRate,glucose,TenYearCHD
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.491,51.419,9.771,0.011,0.392,241.994,137.1045,75.839,84.39,0.5
std,0.500169,8.693504,12.476547,0.104355,0.488441,46.622201,24.123687,12.310102,30.33447,0.50025
min,0.0,34.0,0.0,0.0,0.0,107.0,83.5,45.0,40.0,0.0
25%,0.0,44.0,0.0,0.0,0.0,210.0,120.0,67.0,72.0,0.0
50%,0.0,51.0,1.0,0.0,0.0,238.0,133.0,75.0,78.0,0.5
75%,1.0,59.0,20.0,0.0,1.0,270.0,149.125,82.0,88.0,1.0
max,1.0,70.0,60.0,1.0,1.0,600.0,248.0,140.0,394.0,1.0


In [12]:
# Standardize the features and prune the irrelevant column


def _zscore(column):
    """Centre a column on its mean and divide by its standard deviation."""
    return (column - column.mean()) / column.std()


# Flatten the index once so features and labels share the same 0..n-1 row labels
flat = data.reset_index(drop=True)

# Standardize every feature column; the label is excluded and re-attached unscaled
ndata = flat.drop(columns="TenYearCHD").apply(_zscore, axis=0)
ndata["TenYearCHD"] = flat["TenYearCHD"]

# Shuffle all rows reproducibly, then drop the column that is not used downstream
ndata = ndata.sample(len(ndata), random_state=42).drop(columns="prevalentStroke")

ndata

Unnamed: 0,male,age,cigsPerDay,prevalentHyp,totChol,sysBP,heartRate,glucose,TenYearCHD
521,1.017656,1.217116,0.018354,-0.802553,-1.823037,-0.128691,2.368867,-0.276583,1
737,1.017656,-1.083453,1.621362,1.244777,0.600701,-0.315230,0.500483,0.119007,1
740,1.017656,-0.393282,0.018354,-0.802553,1.415763,-0.833393,-0.880496,-1.001831,1
660,1.017656,0.757002,0.819858,-0.802553,-0.900730,-0.045785,-0.068155,-0.375480,1
411,1.017656,-0.738367,0.819858,-0.802553,-0.643342,-0.791939,-0.799262,-0.045822,0
...,...,...,...,...,...,...,...,...,...
106,-0.981668,-1.313509,0.819858,-0.802553,0.643599,-0.584674,0.338015,-0.705138,0
270,1.017656,-0.738367,-0.783149,-0.802553,-0.836383,-1.227196,-0.068155,-0.045822,0
860,1.017656,-0.393282,1.621362,1.244777,-0.578995,-0.170144,-0.068155,-1.133694,1
435,-0.981668,0.757002,-0.542698,-0.802553,0.000129,-0.584674,-0.149389,-0.408446,0


In [14]:
# Persist the prepared table for the next notebook; the file name records the row count.
# No `index` argument is passed, so the (shuffled) row index is written as the first column.
out_path = f"data/myTenYearCHD_n{len(ndata)}.csv"
ndata.to_csv(out_path)