In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw stroke dataset (CSV expected in the working directory)
# and preview the first three rows.
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1


In [3]:
# Show column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


## Nulls, N/A, other
- id - all unique values, no nulls, convert to object dtype **Done**
- gender - drop value: "Other" (one count) **Done**
- age - no nulls, consider additional age ranges column instead (low-age has decimal values) **Done**
- hypertension - no nulls, convert to object dtype **Done**
- heart_disease - no nulls, convert to object dtype **Done**
- ever_married - no nulls, Yes/No, encode to new column with 0/1 or True/False (handled with modeling_df)
- work_type - no nulls, uses strings to describe, use one-hot encoding into each new column (handled w modeling_df)
- Residence_type - one-hot encode into is_Urban (handled w modeling_df)
- avg_glucose_level - no nulls, continuous values **Done**
- bmi - handle nulls by imputing the mean for each age category and gender, normal distribution **Done**
    * use pd.cut to create five-year bins: (0, 5], (5, 10], (10, 15], (15, 20], ...
    * use existing gender column as well
- smoking_status - no nulls, one-hot encode (handled w modeling_df)
- stroke - no nulls, convert to object dtype **Done**

In [4]:
# Remove the single record whose gender is 'Other' (row label 3116 in the raw file).
# Filtering by value instead of by a hard-coded row label keeps this cell idempotent:
# after the index is reset, re-running `drop(3116)` would silently delete a different,
# valid row.
df = df[df['gender'] != 'Other'].reset_index(drop=True)

In [5]:
# Treat the identifier and the 0/1 flag columns as categorical (object dtype)
# rather than as numeric quantities.
for flag_col in ('id', 'hypertension', 'heart_disease', 'stroke'):
    df[flag_col] = df[flag_col].astype('object')

In [6]:
# Bin edges every five years from 0 to 100 inclusive; pd.cut yields
# right-closed intervals such as (65, 70].
five_year_cutpoints = list(range(0, 101, 5))
age_intervals = pd.cut(df['age'], bins=five_year_cutpoints)
# Store the interval labels as strings so they can be reformatted afterwards.
df['age_range'] = age_intervals.astype('string')

In [7]:
# Turn interval labels such as '(65, 70]' into '65-70'.
# Strip only the bracket characters instead of slicing off the first and last
# character: slicing is not idempotent, so re-running the cell would corrupt
# already-clean labels ('65-70' -> '5-7').
df['age_range'] = (
    df['age_range']
    .str.strip('(]')
    .str.replace(', ', '-', regex=False)
    .astype('object')
)

In [8]:
# Preview the frame to check the new age_range column.
df.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_range
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,65-70
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1,60-65
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,75-80


In [9]:
# List the distinct gender values present in the frame.
df.gender.unique()

array(['Male', 'Female'], dtype=object)

In [10]:
# Mean BMI per (age_range, gender) group; the result is a Series with a
# two-level index that serves as the lookup table for imputing missing BMI.
bmi_by_group = df.groupby(['age_range', 'gender'])['bmi']
grouped = bmi_by_group.mean()

In [11]:
# Spot-check one entry: mean BMI for males in the 65-70 age range.
grouped['65-70']['Male']

30.56969696969697

In [12]:
# Impute missing BMI with the mean BMI of the row's (age_range, gender) group.
# `transform('mean')` returns the group mean aligned to every row, so `fillna`
# replaces only the missing entries. Unlike a row-wise `apply`, this is vectorized
# and safe to re-run when no NaNs remain (the cell becomes a no-op).
group_mean_bmi = df.groupby(['age_range', 'gender'])['bmi'].transform('mean')
df['bmi'] = df['bmi'].fillna(group_mean_bmi)

In [13]:
# Lower-case the one column whose name does not match the others,
# then re-check dtypes and null counts after cleaning.
column_renames = {'Residence_type': 'residence_type'}
df = df.rename(columns=column_renames)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5109 non-null   object 
 1   gender             5109 non-null   object 
 2   age                5109 non-null   float64
 3   hypertension       5109 non-null   object 
 4   heart_disease      5109 non-null   object 
 5   ever_married       5109 non-null   object 
 6   work_type          5109 non-null   object 
 7   residence_type     5109 non-null   object 
 8   avg_glucose_level  5109 non-null   float64
 9   bmi                5109 non-null   float64
 10  smoking_status     5109 non-null   object 
 11  stroke             5109 non-null   object 
 12  age_range          5109 non-null   object 
dtypes: float64(3), object(10)
memory usage: 519.0+ KB


In [14]:
# Persist the cleaned frame. The original target '.csv' had an empty file stem
# (a hidden dotfile on Unix), so a descriptive name is used instead.
# index=False avoids writing the RangeIndex as a spurious unnamed first column.
# NOTE(review): if another notebook reads '.csv', update that path as well.
df.to_csv('healthcare-dataset-stroke-data-clean.csv', index=False)

In [15]:
# One-hot encode the object-dtype columns for modeling, dropping the first level
# of each to avoid redundant indicator columns.
# `id` was cast to object dtype earlier, so it must be excluded here: otherwise
# get_dummies would expand the row identifier into one indicator column per record.
# NOTE(review): `stroke` is also object dtype, so the target is encoded too
# (presumably as `stroke_1`) - confirm that downstream modeling code expects this.
modeling_df = pd.get_dummies(df.drop(columns='id'), drop_first=True)