In [None]:
# Install the third-party packages this notebook imports (pandas, kagglehub); -q keeps pip's output quiet.
# NOTE(review): no versions are pinned, so a fresh install may pull different releases — consider pinning.
%pip install -q pandas kagglehub

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
from pathlib import Path
import kagglehub

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Keep the dataset location in one place so paths are cleaner and easier to build from DATA_PATH.
# Note: kagglehub.dataset_download downloads the dataset once if it does not exist yet and loads it locally from then on.

DATASET = "neurocipher/heartdisease"  # Dataset handle passed to kagglehub
DATA_PATH = Path(kagglehub.dataset_download(DATASET))  # Wrapped in Path so files can be joined with `/`

In [None]:
# Read the CSV file from the downloaded dataset directory into a DataFrame
file_path = DATA_PATH / "Heart_Disease_Prediction.csv"
initial_df = pd.read_csv(file_path)  # Source frame that the following cells filter and inspect
initial_df  # Last expression of the cell -> rendered as the cell's output

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,1,3,172,199,1,0,162,0,0.5,1,0,7,Absence
266,44,1,2,120,263,0,0,173,0,0.0,1,0,7,Absence
267,56,0,2,140,294,0,2,153,0,1.3,2,0,3,Absence
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6,Absence


In [None]:
# Its good praxis to create new dataframes instead of writing back into them
# Male = 1, Female = 0
male_df = initial_df[initial_df['Sex'] == 1]
male_df

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
5,65,1,4,120,177,0,0,140,0,0.4,1,0,7,Absence
6,56,1,3,130,256,1,2,142,1,0.6,2,1,6,Presence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,48,1,2,110,229,0,0,168,0,1.0,3,0,7,Presence
265,52,1,3,172,199,1,0,162,0,0.5,1,0,7,Absence
266,44,1,2,120,263,0,0,173,0,0.0,1,0,7,Absence
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6,Absence


In [None]:
# Count how often each chest pain type occurs among young males (Sex == 1 and Age < 40).
# Conditions are combined with & ; each one needs its own () because & binds tighter than == and <
young_males_chest_pain_types_count = (
    initial_df[(initial_df['Sex'] == 1) & (initial_df['Age'] < 40)]  # Boolean mask built from two columns (pd.Series) -> Returns a pd.DataFrame
    .groupby('Chest pain type')                                      # Creates a DataFrameGroupBy
    .size()                                                          # Counts number of rows in each group -> Returns a pd.Series indexed by group
    .reset_index(name='count')                                       # Makes it a pd.DataFrame: the index becomes a column and the values are named 'count'
    .sort_values(                                                    # Most frequent first; ties are broken by chest pain type so the
        by=['count', 'Chest pain type'],                             # row order is deterministic (sorting on 'count' alone leaves the
        ascending=[False, True],                                     # order of equal counts up to the sort algorithm)
    )
    .reset_index(drop=True)                                          #! Different from .reset_index(name='count'): this one acts on a pd.DataFrame and just renumbers the rows (the other one acted on a pd.Series)
)

young_males_chest_pain_types_count

Unnamed: 0,Chest pain type,count
0,4,3
1,1,2
2,3,2
3,2,1


### Pandas Functions

In [None]:
# Display data: quick ways to inspect a DataFrame
display(initial_df.head(3))    # First 3 rows
display(initial_df.tail(3))    # Last 3 rows
display(initial_df.sample(3))  # 3 randomly picked rows (no random_state is set, so they change on every run)
initial_df.info()              # Prints its own summary and returns None -> wrapping it in display() would also render "None"
display(initial_df.dtypes)     # Dtype of every column -> pd.Series
display(initial_df.size)       # Total number of elements (rows * columns)

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence


Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
267,56,0,2,140,294,0,2,153,0,1.3,2,0,3,Absence
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6,Absence
269,67,1,4,160,286,0,2,108,1,1.5,2,3,3,Presence


Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
76,45,1,4,104,208,0,2,148,1,3.0,2,0,3,Absence
125,54,0,3,160,201,0,0,163,0,0.0,1,1,3,Absence
113,54,0,3,135,304,1,0,170,0,0.0,1,0,3,Absence


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      270 non-null    int64  
 1   Sex                      270 non-null    int64  
 2   Chest pain type          270 non-null    int64  
 3   BP                       270 non-null    int64  
 4   Cholesterol              270 non-null    int64  
 5   FBS over 120             270 non-null    int64  
 6   EKG results              270 non-null    int64  
 7   Max HR                   270 non-null    int64  
 8   Exercise angina          270 non-null    int64  
 9   ST depression            270 non-null    float64
 10  Slope of ST              270 non-null    int64  
 11  Number of vessels fluro  270 non-null    int64  
 12  Thallium                 270 non-null    int64  
 13  Heart Disease            270 non-null    object 
dtypes: float64(1), int64(12), 

None

Age                          int64
Sex                          int64
Chest pain type              int64
BP                           int64
Cholesterol                  int64
FBS over 120                 int64
EKG results                  int64
Max HR                       int64
Exercise angina              int64
ST depression              float64
Slope of ST                  int64
Number of vessels fluro      int64
Thallium                     int64
Heart Disease               object
dtype: object