# What's this notebook about? 
Don't forget to leave notes as you move through the notebook. Your collaborators will appreciate it. 

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt

In [3]:
# Raise pandas' display limits (rows, columns, console width) to 5000 each
# so frames shown in this notebook are not truncated.
pd.options.display.max_rows = 5000
pd.options.display.max_columns = 5000
pd.options.display.width = 5000

#### What data do we have here? 

In [4]:
# Read the heart dataset from the CSV in the current working directory.
df = pd.read_csv(filepath_or_buffer="heart.csv")

In [5]:
# Preview the first five rows to see which columns are available.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [6]:
# Number of rows in the loaded frame.
df.shape[0]

1025

#### Line charts work best with time series data. 
Because different patients have different cholesterol levels, the line jumps back and forth for each case along the x-axis (age). See below for a different chart type.  

In [7]:
# Line chart of cholesterol against age, drawn straight from the row-level data.
alt.Chart(df).mark_line().encode(
    x=alt.X("age"),
    y=alt.Y("chol"),
)

#### What were you curious about here? 

In [8]:
# Rows whose age is strictly below 60.
under_60 = df.loc[df["age"].lt(60)]

In [9]:
# Peek at the first five rows of the under-60 subset.
under_60.iloc[:5]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
5,58,0,0,100,248,0,0,122,0,1.0,1,0,2,1
6,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
7,55,1,0,160,289,0,0,145,1,0.8,1,1,3,0


#### And here? 

In [10]:
# Rows whose age is strictly above 60.
# NOTE(review): `under_60` uses `< 60` and this uses `> 60`, so rows with
# age == 60 land in neither subset — confirm that gap is intended.
over_60 = df.loc[df["age"].gt(60)]

In [11]:
# Peek at the first five rows of the over-60 subset.
over_60.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
10,71,0,0,112,149,0,1,125,0,1.6,1,0,2,1
21,67,0,0,106,223,0,1,142,0,0.3,2,2,2,1


### Chart it

Since you're plotting the relationship between two variables — cholesterol and age — a [scatterplot](https://altair-viz.github.io/gallery/scatter_tooltips.html) might work better here. I'm assuming these are patient-level observations? If so, you might also want to aggregate, calculating the average cholesterol by age and sex, to avoid some of the noise. 

In [27]:
# Mean cholesterol for every (age, sex) combination, with the grouping keys
# kept as ordinary columns rather than an index.
grouped_by_age_sex = df.groupby(["age", "sex"], as_index=False)["chol"].mean()

In [31]:
# `sex` is a categorical code, not a quantity: cast it to string so it is
# handled as a category downstream.
# NOTE(review): the earlier note here labelled the codes men(0), women(1);
# the public UCI heart-disease data dictionary lists 1 = male, 0 = female —
# confirm against the source of heart.csv before labelling the legend.
grouped_by_age_sex = grouped_by_age_sex.astype({"sex": str})

In [29]:
# Scatterplot of mean cholesterol by age, one point per (age, sex) group.
# Encoding types are spelled out (":Q" quantitative, ":N" nominal) so `sex`
# is always coloured as discrete categories, whether or not the column has
# been cast to string beforehand — the chart no longer depends on the order
# in which the surrounding cells were executed.
alt.Chart(grouped_by_age_sex).mark_circle().encode(
    x=alt.X("age:Q", title="Age"),
    # The y value is an average per group, so say so on the axis.
    y=alt.Y("chol:Q", title="Mean cholesterol"),
    color=alt.Color("sex:N", title="Sex"),
    # Hovering a point shows the exact values behind it.
    tooltip=["age:Q", "sex:N", "chol:Q"],
)