# Exploratory Data Analysis (EDA) of the Diabetes Health Indicators Dataset

In [1]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as pt
import seaborn as sb

In [2]:
# Function to save figures
import os
from pathlib import Path
from matplotlib import pyplot as plt

# Output folder for saved figures, expressed relative to the current working
# directory. Create it (and any missing parents) up front; an already-existing
# folder is not treated as an error.
IMAGES_PATH = Path().joinpath("Images/EDA")
IMAGES_PATH.mkdir(exist_ok=True, parents=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current pyplot figure as ``<fig_id>.<fig_extension>`` under IMAGES_PATH.

    Args:
        fig_id: Base name of the output file, without the extension.
        tight_layout: When truthy, ``plt.tight_layout()`` is called before saving.
        fig_extension: File extension; also forwarded to ``savefig`` as ``format``.
        resolution: Forwarded to ``savefig`` as ``dpi``.
    """
    file_name = f"{fig_id}.{fig_extension}"
    destination = IMAGES_PATH.joinpath(file_name)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [3]:
# Read the csv file into the DataFrame `df` used by every cell below.
# The file name is a relative path, so the CSV has to be present in the
# notebook's working directory for this cell to run.
df = pd.read_csv("diabetes_binary_5050split_health_indicators_BRFSS2015 - diabetes_binary_5050split_health_indicators_BRFSS2015.csv")

In [4]:
# Structural overview: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Diabetes_binary       70692 non-null  int64
 1   HighBP                70692 non-null  int64
 2   HighChol              70692 non-null  int64
 3   CholCheck             70692 non-null  int64
 4   BMI                   70692 non-null  int64
 5   Smoker                70692 non-null  int64
 6   Stroke                70692 non-null  int64
 7   HeartDiseaseorAttack  70692 non-null  int64
 8   PhysActivity          70692 non-null  int64
 9   Fruits                70692 non-null  int64
 10  Veggies               70692 non-null  int64
 11  HvyAlcoholConsump     70692 non-null  int64
 12  AnyHealthcare         70692 non-null  int64
 13  NoDocbcCost           70692 non-null  int64
 14  GenHlth               70692 non-null  int64
 15  MentHlth              70692 non-null  int64
 16  Phys

In [5]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,...,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0
mean,0.5,0.563458,0.525703,0.975259,29.856985,0.475273,0.062171,0.14781,0.703036,0.611795,...,0.95496,0.093914,2.837082,3.752037,5.810417,0.25273,0.456997,8.584055,4.920953,5.698311
std,0.500004,0.49596,0.499342,0.155336,7.113954,0.499392,0.241468,0.354914,0.456924,0.487345,...,0.207394,0.291712,1.113565,8.155627,10.062261,0.434581,0.498151,2.852153,1.029081,2.175196
min,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,1.0,25.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,7.0,4.0,4.0
50%,0.5,1.0,1.0,1.0,29.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,0.0,9.0,5.0,6.0
75%,1.0,1.0,1.0,1.0,33.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,4.0,2.0,6.0,1.0,1.0,11.0,6.0,8.0
max,1.0,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0


In [6]:
# Preview the first five rows of the dataset.
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,0,1,26,0,0,0,1,0,...,1,0,3,5,30,0,1,4,6,8
1,0,1,1,1,26,1,1,0,0,1,...,1,0,3,0,0,0,1,12,6,8
2,0,0,0,1,26,0,0,0,1,1,...,1,0,1,0,10,0,1,13,6,8
3,0,1,1,1,28,1,0,0,1,1,...,1,0,3,0,3,0,1,11,6,8
4,0,0,0,1,29,1,0,0,1,1,...,1,0,2,0,0,0,0,8,5,8


In [7]:
# Preview the last five rows of the dataset.
df.tail()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
70687,1,0,1,1,37,0,0,0,0,0,...,1,0,4,0,0,0,0,6,4,1
70688,1,0,1,1,29,1,0,1,0,1,...,1,0,2,0,0,1,1,10,3,6
70689,1,1,1,1,25,0,0,1,0,1,...,1,0,5,15,0,1,0,13,6,4
70690,1,1,1,1,18,0,0,0,0,0,...,1,0,4,0,0,1,0,11,2,4
70691,1,1,1,1,25,0,0,1,1,1,...,1,0,2,0,0,0,0,9,6,2


# Value Counts

In [8]:
# Class-balance check: number of rows for each Diabetes_binary value
# (value_counts lists the most frequent value first).
df["Diabetes_binary"].value_counts()

Diabetes_binary
0    35346
1    35346
Name: count, dtype: int64

In [9]:
# Number of rows for each HighBP flag value.
# NOTE(review): presumably 1 = yes and 0 = no — confirm against the dataset codebook.
df["HighBP"].value_counts()

HighBP
1    39832
0    30860
Name: count, dtype: int64

In [10]:
# Number of rows for each Smoker flag value.
# NOTE(review): presumably 1 = yes and 0 = no — confirm against the dataset codebook.
df["Smoker"].value_counts()

Smoker
0    37094
1    33598
Name: count, dtype: int64

In [11]:
# Number of rows for each Stroke flag value.
# NOTE(review): presumably 1 = yes and 0 = no — confirm against the dataset codebook.
df["Stroke"].value_counts()

Stroke
0    66297
1     4395
Name: count, dtype: int64

In [12]:
# Number of rows for each HeartDiseaseorAttack flag value.
# NOTE(review): presumably 1 = yes and 0 = no — confirm against the dataset codebook.
df["HeartDiseaseorAttack"].value_counts()

HeartDiseaseorAttack
0    60243
1    10449
Name: count, dtype: int64

In [13]:
# Number of rows for each Education code, most frequent first.
# NOTE(review): the integer codes are not defined in this notebook — look up
# their meaning in the dataset codebook before interpreting them.
df["Education"].value_counts()

Education
6    26020
5    20030
4    19473
3     3447
2     1647
1       75
Name: count, dtype: int64

In [15]:
# Number of rows for each Income code, most frequent first.
# NOTE(review): the integer codes are not defined in this notebook — look up
# their meaning in the dataset codebook before interpreting them.
df["Income"].value_counts()

Income
8    20646
7    11425
6    10287
5     8010
4     6658
3     5557
2     4498
1     3611
Name: count, dtype: int64