In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the dataset
# The separator is passed explicitly because the file is read as ';'-delimited.
df = pd.read_csv(
    filepath_or_buffer='D:\\cardio-health-predictor\\data\\cardio_train.csv',
    sep=';',
)

In [6]:
# --- 1. Initial Data Inspection ---
# Three quick looks at the freshly loaded frame: first rows, schema, null counts.
print("--- Dataset Head ---", df.head(), sep="\n")

print("\n--- Dataset Info ---")
df.info()  # info() prints its own report, so it is not wrapped in print()

# Per-column count of missing entries.
print("\n--- Missing Values ---", df.isnull().sum(), sep="\n")

--- Dataset Head ---
   id    age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  \
0   0  18393       2     168    62.0    110     80            1     1      0   
1   1  20228       1     156    85.0    140     90            3     1      0   
2   2  18857       1     165    64.0    130     70            3     1      0   
3   3  17623       2     169    82.0    150    100            1     1      0   
4   4  17474       1     156    56.0    100     60            1     1      0   

   alco  active  cardio  
0     0       1       0  
1     0       1       1  
2     0       0       1  
3     0       1       1  
4     0       0       0  

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 

In [7]:
# --- 2. Feature Engineering & Cleaning ---
# Derive 'age_years' from 'age' (given in days): divide by 365, round to the
# nearest whole number and store the result as int.
# NOTE(review): a flat 365-day year ignores leap days — confirm this is acceptable.
df['age_years'] = df['age'].div(365).round().astype(int)

In [8]:
# Create BMI feature: weight divided by the square of (height / 100).
# presumably 'weight' is in kg and 'height' in cm (hence the / 100) — verify against the data dictionary
df['bmi'] = df['weight'].div(df['height'].div(100).pow(2))

In [9]:
# Display some statistics for the new features
# describe() summarises the two engineered columns; header and table are
# emitted on consecutive lines by a single print call.
print(
    "\n--- Statistics for New Features (Age in Years & BMI) ---",
    df.loc[:, ['age_years', 'bmi']].describe(),
    sep="\n",
)


--- Statistics for New Features (Age in Years & BMI) ---
          age_years           bmi
count  70000.000000  70000.000000
mean      53.338686     27.556513
std        6.765294      6.091511
min       30.000000      3.471784
25%       48.000000     23.875115
50%       54.000000     26.374068
75%       58.000000     30.222222
max       65.000000    298.666667


In [10]:
# --- 3. Visualization ---
# Progress message (preceded by a blank line) announcing the plotting step.
print("\nGenerating visualizations...")


Generating visualizations...


In [11]:
# Age distribution vs. Cardio
# Stacked histogram of 'age_years' coloured by the 'cardio' column, with KDE
# curves overlaid, drawn on an explicitly created figure/axes pair.
age_fig, age_ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df,
    x='age_years',
    hue='cardio',
    multiple='stack',
    kde=True,
    palette='viridis',
    ax=age_ax,
)
age_ax.set_title('Age Distribution by Cardiovascular Disease Outcome')
age_ax.set_xlabel('Age (Years)')
age_ax.set_ylabel('Count')
# Save the figure to disk, then close it.
age_fig.savefig('D:\\cardio-health-predictor\\visuals\\age_distribution.png')
plt.close(age_fig)

In [12]:
# Correlation Matrix Heatmap
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
# Leave out the original 'age' (in days) so the matrix is not cluttered by it.
# NOTE(review): every other column, including 'id', still takes part in the
# correlation — confirm that is intended.
corr_matrix = df.drop(columns='age').corr()
sns.heatmap(
    corr_matrix,
    annot=True,       # write each coefficient inside its cell
    cmap='coolwarm',
    fmt=".2f",        # two decimals per annotation
    linewidths=.5,
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix of Health Features')
# Save the figure to disk, then close it.
corr_fig.savefig('D:\\cardio-health-predictor\\visuals\\correlation_matrix.png')
plt.close(corr_fig)

In [14]:
# Plotting distributions of key features
# 2x3 grid of panels: age / gender / height on the top row,
# weight / cholesterol / glucose on the bottom row.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Distribution of Key Health Metrics', fontsize=16)

sns.histplot(ax=axes[0, 0], data=df, x='age_years', hue='cardio', multiple='stack', kde=True)
axes[0, 0].set_title('Age Distribution by Cardio Outcome')

sns.countplot(ax=axes[0, 1], data=df, x='gender', hue='cardio')
axes[0, 1].set_title('Gender Distribution by Cardio Outcome')
# Fix the tick positions before relabelling them: calling set_xticklabels()
# on ticks that are not fixed makes matplotlib emit a UserWarning
# ("set_ticklabels() should only be used with a fixed number of ticks").
# countplot draws the categories at positions 0, 1, ... in order.
# NOTE(review): assumes 'gender' has exactly two codes and that the lower
# code is Female, as the original labels imply — confirm against the data.
axes[0, 1].set_xticks([0, 1])
axes[0, 1].set_xticklabels(['Female', 'Male'])


sns.histplot(ax=axes[0, 2], data=df, x='height', kde=True)
axes[0, 2].set_title('Height Distribution')

sns.histplot(ax=axes[1, 0], data=df, x='weight', kde=True)
axes[1, 0].set_title('Weight Distribution')

sns.countplot(ax=axes[1, 1], data=df, x='cholesterol', hue='cardio')
axes[1, 1].set_title('Cholesterol Levels by Cardio Outcome')

sns.countplot(ax=axes[1, 2], data=df, x='gluc', hue='cardio')
axes[1, 2].set_title('Glucose Levels by Cardio Outcome')

# The rect leaves headroom at the top so the suptitle does not overlap the panels.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig('D:\\cardio-health-predictor\\visuals\\health_metrics_distributions.png')
plt.close()


  axes[0, 1].set_xticklabels(['Female', 'Male'])


In [15]:
# Closing status message for the EDA cells.
print("EDA complete. Visualizations saved to the 'visuals' folder.")

EDA complete. Visualizations saved to the 'visuals' folder.
