In [6]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd



In [7]:
df = pd.read_csv(r"C:\Users\shiha\Downloads\cardio_train (1).csv", sep=";")
df.head()


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [8]:
print(df.shape) 

(70000, 13)


So there is 70,000 data entries 13 features.

In [9]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB
None


In [10]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [6]:
df['age_years'] = (df['age'] / 365).round()


In [7]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50.0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55.0
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,52.0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48.0
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,48.0


Lets understand the features/columns:
1. id = an identifier for each record
2. age = age of the patient in days
3. gender = gender of the patient 
4. height = height of the patients in cm
5. weight = weight of the patient in kg
6. ap_hi = Systolic blood pressure (upper number)
7. ap_lo = Diastolic blood pressure (lower number)
8. cholesterol = cholesterol level of the patient, 1 is normal, 2 is above normal and 3 is well above normal indicating high.
9. gluc = glucose level of patient. 1 is normal, 2 is above normal and 3 is well above normal indicating high.
10. smoke = wheteher the patient smokes or not. 0 means no and 1 means yes.
11. alco = Whether the patient drinks alcohol or not. 0 means no and 1 means yes.
12. active = Whether the patient is physically active or not. 0 means no and 1 means yes.
13. cardio = Target variable: whether the patient has cardiovascular disease or not. 0 means no disease 1 means Disease patient. 
14. age_years = age column feature engineered to years for better interpretability. 

In [8]:
print(df.describe()) 


                 id           age        gender        height        weight  \
count  70000.000000  70000.000000  70000.000000  70000.000000  70000.000000   
mean   49972.419900  19468.865814      1.349571    164.359229     74.205690   
std    28851.302323   2467.251667      0.476838      8.210126     14.395757   
min        0.000000  10798.000000      1.000000     55.000000     10.000000   
25%    25006.750000  17664.000000      1.000000    159.000000     65.000000   
50%    50001.500000  19703.000000      1.000000    165.000000     72.000000   
75%    74889.250000  21327.000000      2.000000    170.000000     82.000000   
max    99999.000000  23713.000000      2.000000    250.000000    200.000000   

              ap_hi         ap_lo   cholesterol          gluc         smoke  \
count  70000.000000  70000.000000  70000.000000  70000.000000  70000.000000   
mean     128.817286     96.630414      1.366871      1.226457      0.088129   
std      154.011419    188.472530      0.680250    

### Key Insights: 

1. Data quality issue: unrealistic height, weight & blood pressure seem to have some extreme values. Maybe outliers. We need to clean them.

2. Lifestyle distribution: most are non-smokers, non-drinkers, but active.

3. About half of the patients have cardiovascular disease.


In [9]:
df.isnull().sum()


id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
age_years      0
dtype: int64

No missing values found. 

In [34]:
df.groupby('gender')['cardio'].value_counts()


gender  cardio
1       0         22914
        1         22616
2       1         12363
        0         12107
Name: count, dtype: int64

So for females almost a 50-50 split can be seen for cardiac disease and no cardiac disease.  Same goes for men.

In [10]:
df.groupby('smoke')['cardio'].value_counts()


smoke  cardio
0      1         32050
       0         31781
1      0          3240
       1          2929
Name: count, dtype: int64

Weirdly the cases seem to be quite close for the smokers and non smokers cuz smoking is a major reason for heart disease,  and we normally expect smokers are more prone to heart disease compared to non smokers.

In [11]:
df.groupby(['gender', 'smoke'])['cardio'].value_counts()


gender  smoke  cardio
1       0      0         22489
               1         22228
        1      0           425
               1           388
2       0      1          9822
               0          9292
        1      0          2815
               1          2541
Name: count, dtype: int64

Both genders hover around 50% cardio rate whether smoking or not.

Female smokers are rare in this dataset so results aren’t that reliable based on this factor.

For males, surprisingly, non-smokers actually show slightly higher cardio rates than smokers.

In [12]:
df.groupby(pd.cut(df['age_years'], [20,30,40,50,60,70]))['cardio'].mean()


  df.groupby(pd.cut(df['age_years'], [20,30,40,50,60,70]))['cardio'].mean()


age_years
(20, 30]    0.000000
(30, 40]    0.229977
(40, 50]    0.391714
(50, 60]    0.530873
(60, 70]    0.690369
Name: cardio, dtype: float64

I need to find how many people are there in a certain age range. 