# **Objective**

This EDA project is a data analysis of a healthcare dataset focused on identifying key factors associated with diabetes.

The goal is to discover patterns and insights to better understand the characteristics of the diabetic population within the dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset. The path is relative, so the CSV has to sit in the notebook's
# working directory.
df = pd.read_csv('healthcare_dataset1_biased.csv')

In [None]:
# Preview the first rows to see the columns and the value formats.
df.head()

In [None]:
# (number of rows, number of columns) of the raw frame.
df.shape

In [None]:
# Column dtypes as inferred by read_csv.
df.dtypes

In [None]:
# Give the weight column a descriptive name (W_kg -> Weight_kg).
# Later cells refer to the column as 'Weight_kg'.
df = df.rename(columns={'W_kg': 'Weight_kg'})

# Dealing with missing values

In [3]:
# Count missing values per column to see which columns need imputation.
# NOTE(review): the saved output below still lists `W_kg`, so it appears to have been
# produced before the rename cell was run — re-run top to bottom to refresh it.
df.isnull().sum()

Age                   3023
W_kg                     0
Height_cm                0
BMI                   1511
Continent              500
Blood_Pressure           0
Cholesterol_Level        0
Diabetic                 0
Smoker                   0
Exercise_Frequency       0
Annual_Income_USD        0
dtype: int64

In [None]:
# Relative frequency of each observed age (value_counts skips NaN by default).
# Used as the sampling distribution when imputing the missing ages.
AgeProp = df['Age'].value_counts(normalize=True)
AgeProp

In [None]:
# Draw one replacement age per missing row from the empirical age distribution.
# A locally seeded Generator makes the imputation reproducible: the unseeded
# np.random.choice produced a different dataset (and different plots) on every
# Restart & Run All.
n_missing_age = int(df['Age'].isnull().sum())
age_rng = np.random.default_rng(42)
synthAge = age_rng.choice(AgeProp.index.to_numpy(), size=n_missing_age, p=AgeProp.to_numpy())

In [None]:
# Write the sampled ages into exactly the rows where Age is missing.
df.loc[df['Age'].isnull(), 'Age'] = synthAge

In [None]:
# Sanity check: number of ages still missing after the fill.
df['Age'].isnull().sum()

In [None]:
# dtype before the cast.
df['Age'].dtype

In [None]:
# Store ages as integers; astype(int) would raise if any NaN were left.
df['Age'] = df['Age'].astype(int)

In [None]:
# dtype after the cast.
df['Age'].dtype

In [None]:
df['BMI'].describe()

In [None]:
medianBMI = df['BMI'].median()
medianBMI = round(medianBMI, 2)
medianBMI

In [None]:
df['BMI']=df['BMI'].fillna(medianBMI)

In [None]:
df['BMI'].isnull().sum()

In [None]:
df['BMI'].describe()

In [None]:
# Distinct continent labels (unique() also lists NaN when values are missing).
df['Continent'].unique()

In [None]:
# Most frequent continent label(s).
df['Continent'].mode()

In [None]:
# Record count per continent (NaN excluded by default).
df['Continent'].value_counts()

In [None]:
# Forward-fill each gap from the previous row, then back-fill so that a gap at the very
# top of the frame (which has no previous row to copy from) does not stay NaN.
df['Continent'] = df['Continent'].ffill().bfill()

In [None]:
# Number of continent values still missing after the fill.
df['Continent'].isnull().sum()

In [None]:
# Missing-value count per column after all imputations.
df.isnull().sum()

In [None]:
# Summary statistics of the numeric columns after cleaning.
df.describe()

In [None]:
# dtypes after cleaning (Age was cast to int above).
df.dtypes

In [None]:
# Column names, for reference when building the lists below.
df.columns

In [None]:
# Columns treated as categorical in the univariate analysis.
categorical_columns = ['Continent','Cholesterol_Level', 'Diabetic', 'Smoker', 'Exercise_Frequency']

In [None]:
# Columns treated as numeric. 'Weight_kg' only exists once the rename cell has run.
numeric_columns = ['Age', 'Height_cm', 'Weight_kg', 'BMI', 'Blood_Pressure', 'Annual_Income_USD']

# Univariate Analysis on Categorical Columns

In [None]:
# Continent

sns.countplot(df['Continent'], color = 'brown')
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()

In [None]:
# Cholesterol_Level

sns.countplot(df['Cholesterol_Level'], color = 'brown')
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()

In [None]:
colestrol_lev = df['Cholesterol_Level'].value_counts(normalize=True)
colestrol_lev

In [None]:
plt.pie(colestrol_lev, labels=colestrol_lev.index, autopct='%1.1f%%')
plt.title('Cholesterol Level in our dataset')
plt.gcf().set_facecolor('lightgrey')
plt.show()

In [None]:
sns.countplot(df['Diabetic'], color = 'brown')
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()

In [None]:
sns.countplot(df['Smoker'], color = 'brown')
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()
plt.show()

In [None]:
sns.countplot(df['Exercise_Frequency'], color = 'brown')
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()

# Univariate Analysis on Numeric Columns

In [None]:
numeric_columns

In [None]:
# Age

sns.histplot(df['Age'], color = 'brown', kde = True, bins=10)
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()

In [None]:
# Height

sns.histplot(df['Height_cm'], color = 'brown', kde=True)
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()

In [None]:
# Weight_kg
sns.histplot(df['Weight_kg'], color = 'brown', kde=True)
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()

In [None]:
# BMI

sns.histplot(df['BMI'], color = 'brown', kde=True, bins=15)
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()

In [None]:
# Blood_Pressure
sns.histplot(df['Blood_Pressure'], color = 'brown', kde=True)
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()
#

In [None]:
# Annual_Income_USD
sns.kdeplot(df['Annual_Income_USD'], color = 'brown')
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()
#

# Outliers


In [None]:
numeric_columns

In [None]:
for col in numeric_columns:
    plt.boxplot(
        df[col],
        boxprops=dict(color='brown'),
        medianprops=dict(color='brown'),
        flierprops=dict(color='brown', marker='o'),  # valid marker
        showmeans=True
    )
    plt.title(col, fontsize=20)
    plt.gcf().set_facecolor('lightgrey')
    plt.gca().set_facecolor('lightblue')
    plt.tight_layout()
    plt.show()


In [None]:
# Quick look at the cleaned frame before it is split by diabetic status.
df.head(2)

### Create a new DataFrame for analysing only diabetic patients

In [None]:
# Records whose Diabetic flag is 'Yes', re-indexed from 0.
# This subset feeds the diabetic-only plots further down.

diabeticDF = df.loc[df['Diabetic'] == 'Yes'].reset_index(drop=True)
diabeticDF.head(2)

In [None]:
sns.countplot(diabeticDF['Cholesterol_Level'], color = 'brown')
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()

In [None]:
# people having Diabetic and relationship with Smoking habit

sns.countplot(diabeticDF['Smoker'], color = 'brown')
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()

In [None]:
sns.countplot(x='Diabetic', hue='Smoker', data=df)
plt.gcf().set_facecolor('lightgrey')
plt.gca().set_facecolor('lightblue')
plt.show()


### Ratio of diabetic people among smokers versus among non-smokers

In [None]:
# Number of records that are both diabetic and smokers.
diabetic_smokers = df[(df['Diabetic'] == 'Yes') & (df['Smoker'] == 'Yes')].shape[0]

# Number of smokers overall (the denominator).
all_smokers = df[df['Smoker'] == 'Yes'].shape[0]

# Share of diabetics among smokers. Guard against an empty smoker group so the cell
# does not raise ZeroDivisionError — the same guard the non-smoker ratio cell uses.
ratio_diabetic_smokers = diabetic_smokers / all_smokers if all_smokers > 0 else 0


print(f"Ratio of diabetic people among smokers: {ratio_diabetic_smokers:.2f}")

In [None]:
# Share of diabetics among non-smokers; falls back to 0 when there are no non-smokers.
all_non_smokers = len(df[df['Smoker'] == 'No'])
diabetic_non_smokers = len(df[(df['Diabetic'] == 'Yes') & (df['Smoker'] == 'No')])

if all_non_smokers > 0:
    ratio_diabetic_non_smokers = diabetic_non_smokers / all_non_smokers
else:
    ratio_diabetic_non_smokers = 0
print(f"Ratio of diabetic people among non-smokers: {ratio_diabetic_non_smokers:.2f}")

In [None]:
# Exercise frequency within the diabetic subset.
# Name the counted column with `x=`. In seaborn >= 0.12 the first positional parameter
# of countplot is `data`, so do not rely on how a bare positional Series is interpreted.
ax = sns.countplot(data=diabeticDF, x='Exercise_Frequency', color='brown')
ax.figure.set_facecolor('lightgrey')
ax.set_facecolor('lightblue')
plt.show()

In [None]:
# Records whose Diabetic flag is 'No', re-indexed from 0.
# Used for the diabetic vs non-diabetic comparisons that follow.

NondiabeticDF = df.loc[df['Diabetic'] == 'No'].reset_index(drop=True)
NondiabeticDF.head(2)

In [None]:
# Blood-pressure distribution: diabetic (left) vs non-diabetic (right).
# Explicit axes replace the plt.subplot state machine; the y label was misspelled
# "Freequency" in the rendered figure.
fig, (ax_diab, ax_non) = plt.subplots(1, 2)

sns.histplot(x=diabeticDF['Blood_Pressure'], color='green', label='Diabetic', bins=10, ax=ax_diab)
ax_diab.set_xlabel("BP")
ax_diab.set_ylabel("Frequency")
ax_diab.set_facecolor('lightblue')
ax_diab.legend()

sns.histplot(x=NondiabeticDF['Blood_Pressure'], color='brown', label='Non-diabetic', bins=10, ax=ax_non)
ax_non.set_xlabel("BP")
ax_non.set_ylabel("Frequency")
ax_non.set_facecolor('lightblue')
ax_non.legend()

fig.tight_layout()
plt.show()

High Blood Pressure (Hypertension):

Stage 1: 130-139/80-89 mm Hg.

Stage 2: 140/90 mm Hg or higher.

Low Blood Pressure: systolic pressure below 90 mm Hg.

# Relationship between Age and Diabetes

In [None]:
# Non-diabetic subset (Diabetic == 'No'), re-indexed from 0, for the age comparison below.
# NOTE(review): this rebuilds the same frame as the earlier NondiabeticDF cell.

NondiabeticDF = df.loc[df['Diabetic'] == 'No'].reset_index(drop=True)
NondiabeticDF.head(2)

In [None]:
# Age distribution of non-diabetic (top) vs diabetic (bottom) records.
# sharex puts both panels on one age axis so the two shapes can be compared directly.
# The y label was misspelled "Freequency" in the rendered figure.
fig, (ax_non, ax_diab) = plt.subplots(2, 1, figsize=(15, 7), sharex=True)
fig.set_facecolor('lightgrey')

sns.histplot(x=NondiabeticDF['Age'], color='brown', label='Non-diabetic', bins=20, ax=ax_non)
ax_non.set_xlabel("Age")
ax_non.set_ylabel("Frequency")
ax_non.set_facecolor('lightblue')
ax_non.legend()

sns.histplot(x=diabeticDF['Age'], color='green', label='Diabetic', bins=20, ax=ax_diab)
ax_diab.set_xlabel("Age")
ax_diab.set_ylabel("Frequency")
ax_diab.set_facecolor('lightblue')
ax_diab.legend()

plt.show()


In [None]:
# Blood-pressure distribution of non-diabetic (top) vs diabetic (bottom) records.
# sharex puts both panels on one BP axis so the two shapes can be compared directly.
# The y label was misspelled "Freequency" in the rendered figure.
fig, (ax_non, ax_diab) = plt.subplots(2, 1, figsize=(15, 7), sharex=True)
fig.set_facecolor('lightgrey')

sns.histplot(x=NondiabeticDF['Blood_Pressure'], color='brown', label='Non-diabetic', bins=20, ax=ax_non)
ax_non.set_xlabel("BP")
ax_non.set_ylabel("Frequency")
ax_non.set_facecolor('lightblue')
ax_non.legend()

sns.histplot(x=diabeticDF['Blood_Pressure'], color='green', label='Diabetic', bins=20, ax=ax_diab)
ax_diab.set_xlabel("BP")
ax_diab.set_ylabel("Frequency")
ax_diab.set_facecolor('lightblue')
ax_diab.legend()

plt.show()


In [None]:

# Three stacked views of the diabetic subset:
#   1. annual income against age, coloured by cholesterol level
#   2. record count per cholesterol level
#   3. distribution of annual income
fig, (ax_scatter, ax_chol, ax_income) = plt.subplots(3, 1, figsize=(12, 10))
fig.set_facecolor('lightblue')

# The x label and the figure title both say "Age", but the original cell plotted
# Blood_Pressure on x. Plot Age so that data, label and title agree.
sns.scatterplot(x=diabeticDF['Age'], y=diabeticDF['Annual_Income_USD'], hue=diabeticDF['Cholesterol_Level'], ax=ax_scatter)
ax_scatter.set_xlabel("Age")
ax_scatter.set_ylabel("Annual Income USD")
ax_scatter.set_facecolor('lightgrey')
ax_scatter.legend()

sns.countplot(x=diabeticDF['Cholesterol_Level'], hue=diabeticDF['Cholesterol_Level'], ax=ax_chol)
ax_chol.set_facecolor('lightgrey')

sns.histplot(x=diabeticDF['Annual_Income_USD'], ax=ax_income)
ax_income.set_facecolor('lightgrey')


fig.suptitle('Diabetic People Income levels, Age, Cholesterol Levels', fontsize=20)
plt.show()


In [None]:
smokers = df.loc[(df['Diabetic']=='Yes') & (df['Smoker']=='Yes')]
smokers.reset_index(drop=True, inplace=True)
smokers = smokers.sort_values(by='Age')
smokers.reset_index(drop=True, inplace=True)
smokers.head(2)

In [None]:
sns.histplot(smokers['Age'], bins=15)
plt.show()

# Bivariate Analysis

In [None]:
# BMI and Blood_Pressure

sns.scatterplot(x=smokers['BMI'], y=smokers['Blood_Pressure'])
plt.xlabel('BMI')
plt.ylabel('Blood Pressure')
plt.show()

In [None]:
sns.scatterplot(x=diabeticDF['BMI'], y=diabeticDF['Blood_Pressure'])
plt.xlabel('BMI')
plt.ylabel('Blood Pressure')
plt.show()

In [None]:
sns.scatterplot(x=df['BMI'], y=df['Blood_Pressure'])
plt.xlabel('BMI')
plt.ylabel('Blood Pressure')
plt.show()

In [None]:
diabeticDF.head()

In [None]:
diabeticDF.shape

In [None]:
for col in categorical_columns:
  # print(col)
  print(diabeticDF[col].value_counts())
  print()


In [None]:
for col in numeric_columns:
  sns.histplot(diabeticDF[col], kde=True)
  plt.show()

# **Multivariate Analysis**

In [None]:
diabeticDF.to_csv('diabeticDF.csv', index=False)

In [None]:
# 1. Correlation Heatmap
plt.figure(figsize=(10,6))
sns.heatmap(diabeticDF.corr(numeric_only=True), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap of Numeric Features")
plt.show()

In [None]:
# 2. Pairplot for selected features

sns.pairplot(diabeticDF[numeric_columns])
plt.show()

# **Key Findings**

Diabetic individuals in the dataset are more likely to have high cholesterol.


The proportion of diabetics is markedly higher among smokers (79%) than among non-smokers (46%).

Diabetic patients, on average, have higher blood pressure than non-diabetic individuals.

The dataset shows a wide distribution of Age, Height, Weight, and BMI, with outliers present in BMI, Blood Pressure, and Annual Income.



A majority of the overall population (59.5%) in the dataset has a normal cholesterol level, while 30.4% have high and 10.1% have very high cholesterol.

# **Summary and Conclusion**

The EDA highlights strong associations of both smoking and cholesterol level with diabetes.

In this dataset, smokers are diabetic far more often than non-smokers, and diabetic patients frequently have high cholesterol.

The analysis also suggests a correlation between diabetes and elevated blood pressure.