# 📊 Exploratory Data Analysis (EDA)

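This notebook performs exploratory data analysis (EDA) on the UCI Diabetes dataset: it checks key columns for placeholder zeros, visualizes missingness, feature distributions, correlations, and the outcome balance, and saves a median-imputed copy of the data.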

In [None]:
# Step 1: Import libraries
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
sns.set(style='whitegrid')


In [None]:
# Step 2: Load dataset
# Read the CSV straight from its remote location into a DataFrame (needs network access).
url = 'https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first rows as the cell output.
df.head()


In [None]:
# Step 3: Basic overview
# Report the frame's dimensions and column names, then show summary statistics.
n_rows, n_columns = df.shape
column_names = list(df.columns)
print("Shape:", (n_rows, n_columns))
print("Columns:", column_names)

# Summary statistics are the cell's rich output.
df.describe()


In [None]:
# Step 4: Check for missing or zero values in important columns
# Count, per listed column, how many entries are exactly 0.
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zero_counts = df[cols_with_zeros].eq(0).sum()
for col, n_zeros in zero_counts.items():
    print(f"{col}: {n_zeros} zeros")


In [None]:
# Step 5: Replace zeroes with NaN for visualization
# Work on a copy so the raw frame `df` stays untouched; in the listed columns
# every 0 is converted to NaN so it is treated as missing from here on.
df_clean = df.copy()
df_clean[cols_with_zeros] = df_clean[cols_with_zeros].replace(0, np.nan)

# Visualize missingness.
# `msno` comes from the notebook's import cell rather than a mid-notebook import,
# so all dependencies are visible (and fail fast) at the top of the notebook.
msno.matrix(df_clean)
plt.show()  # render the figure and suppress the Axes repr in the cell output


In [None]:
# Step 6: Visualize distributions
# One histogram (with KDE overlay) per column in `cols_with_zeros`, drawn from
# `df_clean` with NaNs dropped. The grid size is derived from the number of
# columns instead of being hard-coded, and panels that receive no plot are
# hidden so no empty axes are displayed.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(cols_with_zeros) // n_grid_cols))  # ceiling division
fig, axs = plt.subplots(
    n_grid_rows,
    n_grid_cols,
    figsize=(5 * n_grid_cols, 5 * n_grid_rows),  # (15, 10) for the 2x3 case
    squeeze=False,  # always get a 2-D array of axes, even with a single row
)
flat_axes = axs.ravel()  # row-major order: left-to-right, top-to-bottom

for ax, col in zip(flat_axes, cols_with_zeros):
    sns.histplot(df_clean[col].dropna(), kde=True, ax=ax)
    ax.set_title(col)

# Hide leftover grid cells (e.g. the 6th panel when plotting 5 columns).
for ax in flat_axes[len(cols_with_zeros):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Step 7: Correlation heatmap
# Correlate `df_clean` rather than the raw `df`: in `df_clean` the placeholder
# zeros of `cols_with_zeros` have been replaced by NaN, and DataFrame.corr()
# excludes NaN values pairwise. Using `df` would treat those zeros as real
# measurements and bias the coefficients.
corr_matrix = df_clean.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Step 8: Outcome distribution
# Bar chart of how many rows fall into each value of the 'Outcome' column.
fig, ax = plt.subplots()
sns.countplot(data=df, x='Outcome', ax=ax)
ax.set_title('Diabetes Outcome Distribution')
plt.show()


In [None]:
# Step 9: Save cleaned data
# Fill every remaining NaN with its column's median, then write the result to CSV.
# NOTE: this mutates `df_clean` in place, so any earlier cell re-run after this
# one will see the imputed values instead of NaN.
column_medians = df_clean.median()
df_clean.fillna(value=column_medians, inplace=True)

output_path = 'cleaned_diabetes_data.csv'
df_clean.to_csv(output_path, index=False)  # index=False: do not write the row index
print("✅ Cleaned data saved as 'cleaned_diabetes_data.csv'")
