In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the semicolon-delimited dataset file and preview its first rows
df = pd.read_csv("cardio_train.csv", delimiter=";")
df.head()

## Note that age is stored in days, so we convert it to years.

In [None]:
# Express age in years: divide the day count by 365, then round to the nearest whole number
df['age_years'] = df['age'].div(365).round()


In [None]:
# Preview the first rows to confirm the new age_years column is present
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
df.describe()

## Now let's create a BMI column, which might come in handy later.

In [None]:
# BMI = weight / height^2; height is divided by 100 first (cm -> m)
height_in_metres = df['height'].div(100)
df['BMI'] = df['weight'].div(height_in_metres.pow(2))
df.head()


## Let's add another column that categorizes the BMI value. This makes it clear which weight class each person falls into.

In [None]:
# WHO adult BMI classes are left-closed intervals:
#   Underweight: BMI < 18.5        Normal:  18.5 <= BMI < 25
#   Overweight:  25 <= BMI < 30    Obese:   BMI >= 30
bins = [0, 18.5, 25, 30, float('inf')]
labels = [1, 2, 3, 4] # 'Underweight':1, 'Normal': 2, 'Overweight': 3, 'Obese': 4

# right=False makes every bin [lower, upper). With pd.cut's default
# (right=True) a BMI of exactly 18.5, 25 or 30 would be assigned to the
# class below the one it belongs to.
# NOTE: .astype(int) raises if any BMI falls outside the bins (NaN category).
df['BMI_category_num'] = pd.cut(df['BMI'], bins=bins, labels=labels, right=False).astype(int)


In [None]:
# Preview the first rows, now including the BMI and BMI_category_num columns
df.head()

## We encode the BMI category as numbers (1, 2, 3, 4). This makes it easier for the model to train effectively, because some ML models work better with numeric encoding.

In [None]:
# Column names, dtypes and non-null counts
df.info()


## Now let's focus on cleaning the dataset; this is crucial for the later steps in our project.

### Let's start by checking whether there are any missing values.

In [None]:
# Number of missing entries in each column
df.isna().sum()

## It looks like there aren't any missing values, but we still need to look out for abnormal values or outliers, which would harm our model's accuracy.

In [None]:
# Work on an independent deep copy so the original frame stays untouched
df_clean = df.copy(deep=True)
df_clean.head()

### Let's look at age

Reference: World Health Organization (WHO). "Adolescent health." Defines adulthood starting from 18 years.

Rationale: We exclude <18 (children) and >100 (extreme outliers).

In [None]:


# --- AGE CLEANING (18–100 years) ---
ages = df_clean['age_years']
print("Min age:", ages.min())
print("Max age:", ages.max())

# Rows whose age lies outside the accepted range
abnormal_age = df_clean[ages.lt(18) | ages.gt(100)]
print("Abnormal ages found:", len(abnormal_age))

# Keep only rows inside the range (both bounds inclusive)
df_clean = df_clean[ages.between(18, 100)]
print("Remaining rows after age cleaning:", len(df_clean))


## Let's look at height now.

Reference: Centers for Disease Control and Prevention (CDC), Anthropometric Reference Data for U.S. Adults, 2015–2018.

Adult height typically ranges ~140–200 cm, but rare cases can extend beyond.

Rationale: We used 80–250 cm as a broad but realistic filter to exclude data entry errors.





In [None]:
# --- HEIGHT CLEANING (80–250 cm) ---
heights = df_clean['height']
print("Min height:", heights.min())
print("Max height:", heights.max())

# Rows whose height lies outside the accepted range
abnormal_height = df_clean[heights.lt(80) | heights.gt(250)]
print("Abnormal heights found:", len(abnormal_height))

# Keep only rows inside the range (both bounds inclusive)
df_clean = df_clean[heights.between(80, 250)]
print("Remaining rows after height cleaning:", len(df_clean))


## Let's look at weight now.

Reference: CDC Anthropometric Reference Data 2015–2018.

Adult weight distribution: ~45–120 kg for most adults, but can extend higher.

Rationale: We set 40–200 kg as plausible to allow for heavier individuals while filtering errors.

In [None]:

# --- WEIGHT CLEANING (40–200 kg) ---
print("Min weight:", df_clean['weight'].min())
print("Max weight:", df_clean['weight'].max())

# Count abnormal weights
abnormal_weight = df_clean[(df_clean['weight'] < 40) | (df_clean['weight'] > 200)]
print("Abnormal weights found:", abnormal_weight.shape[0])

# Drop them.
# .copy() turns the filtered result into an independent frame, so the column
# assignments made on df_clean in later cells do not raise
# SettingWithCopyWarning.
df_clean = df_clean[(df_clean['weight'] >= 40) & (df_clean['weight'] <= 200)].copy()
print("Remaining rows after weight cleaning:", df_clean.shape[0])


## Let's look at blood pressure

Reference: American Heart Association (AHA) – 2017 Guidelines.

Physiological bounds: 60–240 systolic (ap_hi), 40–130 diastolic (ap_lo).

In [None]:
# Collect the rows with a negative reading in either blood-pressure column
# (ap_hi = systolic, ap_lo = diastolic), then report and show each set
low_ap_hi = df_clean.loc[df_clean['ap_hi'].lt(0)]
low_ap_lo = df_clean.loc[df_clean['ap_lo'].lt(0)]

print(f"Number of records with ap_hi < 0: {len(low_ap_hi) }")
display(low_ap_hi)

print(f"Number of records with ap_lo < 0: {len(low_ap_lo) }")
display(low_ap_lo)


In [None]:
# Replace negative readings with their absolute value in both pressure columns
for bp_column in ('ap_hi', 'ap_lo'):
    df_clean[bp_column] = df_clean[bp_column].abs()

In [None]:
# Inspect rows whose diastolic pressure (ap_lo) exceeds 200
high_ap_lo = df_clean.loc[df_clean['ap_lo'].gt(200)]
print(f"Number of records with ap_lo > 200: {len(high_ap_lo)}")
display(high_ap_lo)

In [None]:
# Plausible blood-pressure limits used when filtering rows
# (systolic: 60-240, diastolic: 40-130)
MIN_SYSTOLIC, MAX_SYSTOLIC = 60, 240
MIN_DIASTOLIC, MAX_DIASTOLIC = 40, 130


In [None]:
# CORRECT THE COMMON DATA ENTRY ERROR: Missing decimal in Diastolic
# Any diastolic value in [400, 1300] that is an exact multiple of 100 is
# assumed to be ten times too large and is divided by 10.
suspicious_dia_range_lower = (400, 1300)
dia_floor, dia_ceiling = suspicious_dia_range_lower

diastolic = df_clean['ap_lo']
dia_error_mask = diastolic.between(dia_floor, dia_ceiling) & diastolic.mod(100).eq(0)

# Snapshot of the column before it is modified
before_corr = diastolic.copy()
df_clean.loc[dia_error_mask, 'ap_lo'] = diastolic[dia_error_mask] / 10
print(f"Corrected {dia_error_mask.sum()} diastolic values by dividing by 10.")


In [None]:
# CORRECT THE COMMON DATA ENTRY ERROR: Missing decimal in Systolic
# We assume any systolic value between 600 and 2400 with trailing '00' is meant to be divided by 10.
suspicious_sys_range = (600, 2400)

# Rows whose systolic reading (ap_hi) is in the suspicious range and is an
# exact multiple of 100.
sys_error_mask = (
    (df_clean['ap_hi'] >= suspicious_sys_range[0]) &
    (df_clean['ap_hi'] <= suspicious_sys_range[1]) &
    (df_clean['ap_hi'] % 100 == 0)
)

# Snapshot of the column before it is modified
before_corr = df_clean['ap_hi'].copy()
df_clean.loc[sys_error_mask, 'ap_hi'] = df_clean.loc[sys_error_mask, 'ap_hi'] / 10
# This cell corrects systolic readings, so the message must say so
print(f"Corrected {sys_error_mask.sum()} systolic values by dividing by 10.")

In [None]:
# True for rows whose systolic and diastolic readings both lie inside the
# configured limits (bounds inclusive)
valid_data_mask = (
    df_clean['ap_hi'].between(MIN_SYSTOLIC, MAX_SYSTOLIC)
    & df_clean['ap_lo'].between(MIN_DIASTOLIC, MAX_DIASTOLIC)
)


In [None]:
# Keep only the rows flagged as valid, replacing df_clean with the result
before_rows = len(df_clean)
df_clean = df_clean.loc[valid_data_mask].copy()
after_rows = len(df_clean)

print(f"Rows kept after BP bounds: {after_rows} (dropped {before_rows - after_rows})")
display(df_clean.describe())


## Let's look at BMI

Reference: World Health Organization (WHO) BMI classification.

Rationale: Plausible BMI for adults is generally 10–60. Values outside are considered unrealistic or erroneous.

In [None]:
# --- BMI CHECK (accepted range 10–60) ---
bmi_values = df_clean['BMI']
print("Min BMI:", bmi_values.min())
print("Max BMI:", bmi_values.max())

# Rows whose BMI lies outside the accepted range
abnormal_bmi = df_clean[bmi_values.lt(10) | bmi_values.gt(60)]
print("Abnormal BMI values found:", len(abnormal_bmi))


In [None]:
# Row count before the filter
before_rows = len(df_clean)

# Keep rows with 10 <= BMI <= 60 and discard the rest
df_clean = df_clean.loc[df_clean['BMI'].between(10, 60)].copy()

# Row count after the filter
after_rows = len(df_clean)

print(f"Rows kept after BMI cleaning: {after_rows} (dropped {before_rows - after_rows})")


## We have done most of the cleaning; we just need to check whether there are any duplicate rows.

In [None]:
# Count duplicate rows in the cleaned dataframe.
# 'id' is left out of the comparison: it is presumably a per-record identifier
# (TODO confirm), in which case comparing it would make every row distinct and
# the check could never report a duplicate.
# This cell only reports; df_clean.drop_duplicates(subset=comparison_columns)
# would remove the repeated rows.
comparison_columns = [col for col in df_clean.columns if col != 'id']
duplicate_rows = df_clean.duplicated(subset=comparison_columns).sum()
print("Duplicate rows found:", duplicate_rows)


In [None]:
# Column names, dtypes and non-null counts of the cleaned frame
df_clean.info()

## Let's scale our data for better readability; our model also needs it so that it can train effectively.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Leave out the columns that are not scaled: 'id' and the raw 'age' column
df_to_scale = df_clean.drop(['id', 'age'], axis=1)

# Fit the scaler on every remaining column, then transform those same columns
# NOTE(review): this includes the 'cardio' target and is fitted on the whole
# frame — confirm this is intended before any train/test split.
scaler = MinMaxScaler().fit(df_to_scale)
scaled_data = scaler.transform(df_to_scale)

# Rebuild a DataFrame carrying the original column names and row index
df_scaled = pd.DataFrame(scaled_data, index=df_to_scale.index, columns=df_to_scale.columns)

print("Scaled dataframe shape:", df_scaled.shape)
display(df_scaled)


In [None]:
# Write the scaled frame to disk, leaving the row index out of the file
df_scaled.to_csv(path_or_buf='cardio_train_clean_scaled.csv', index=False)

In [None]:
# DATA PREPARATION

print("1. CHECKING DATA TYPES")
print("-" * 30)

# Display current data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("DataFrame Info:")
df_scaled.info()

# Identify numerical columns only
numerical_columns = df_scaled.select_dtypes(include=['number']).columns
print(f"\n Numerical columns selected: {len(numerical_columns)}")
print(f"Features: {list(numerical_columns)}")

# Create numerical-only dataframe
numerical_df = df_scaled[numerical_columns]
print(f"Working dataset: {numerical_df.shape[0]} rows, {numerical_df.shape[1]} columns")

In [None]:
# CORRELATION CALCULATION

print("\n2. CALCULATING CORRELATION MATRIX")
print("-" * 30)

# Pairwise Pearson correlation between all numerical columns
correlation_matrix = numerical_df.corr(method='pearson')
print(f"Correlation matrix shape: {correlation_matrix.shape}")

# Print the matrix rounded to three decimals
print("\nCorrelation Matrix Values:")
print(correlation_matrix.round(decimals=3))

In [None]:
# HEATMAP VISUALIZATION

print("\n3. CREATING CORRELATION HEATMAP")
print("-" * 30)

# Appearance options passed to seaborn's heatmap
heatmap_options = dict(
    annot=True,                # write the correlation value in each cell
    cmap='coolwarm',           # blue-white-red colour scheme
    center=0,                  # centre the colour map at 0
    fmt='.2f',                 # two decimal places
    linewidths=0.5,            # thin lines between cells
    square=True,               # square cells
    cbar_kws={"shrink": 0.8},  # colour-bar size
)

# Draw the heatmap on a new 14x12 figure
plt.figure(figsize=(14, 12))
heatmap = sns.heatmap(correlation_matrix, **heatmap_options)

# Title and tick-label orientation
plt.title(
    'Feature Correlation Heatmap\nCardiovascular Dataset',
    fontsize=16,
    fontweight='bold',
    pad=20,
)
plt.xticks(rotation=45, ha='right')  # slanted x-axis labels
plt.yticks(rotation=0)               # horizontal y-axis labels

plt.tight_layout()
plt.show()

In [None]:
# CORRELATION ANALYSIS

print("\n4. CORRELATION ANALYSIS & INSIGHTS")
print("-" * 30)

# Keep only the cells strictly above the diagonal of the correlation matrix.
# This removes self-correlations by position (instead of testing for the value
# 1.0, which would also discard genuinely perfect correlations and can miss a
# diagonal that is not exactly 1.0) and keeps each feature pair once instead
# of listing both (A, B) and (B, A).
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
corr_pairs = correlation_matrix.where(upper_triangle).unstack().dropna()

# Sort by absolute value to find strongest correlations
corr_pairs_abs = corr_pairs.abs().sort_values(ascending=False)

print("STRONGEST CORRELATIONS:")

# Display top 10 strongest correlations (positive and negative)
print("\nTop 10 Strongest Correlations (Absolute Value):")
for rank, (pair, _) in enumerate(corr_pairs_abs.head(10).items(), start=1):
    # Look the signed value back up; corr_pairs_abs only holds magnitudes
    actual_value = corr_pairs[pair]
    strength = "STRONG" if abs(actual_value) > 0.5 else "MODERATE" if abs(actual_value) > 0.3 else "WEAK"
    direction = "↑ POSITIVE" if actual_value > 0 else "↓ NEGATIVE"
    print(f"  {rank:2d}. {pair[0]:15} ↔ {pair[1]:15} : {actual_value:6.3f} ({direction} - {strength})")

In [None]:
# TARGET VARIABLE ANALYSIS

print("\n5. TARGET VARIABLE CORRELATION (cardio)")
print("-" * 30)

if 'cardio' in correlation_matrix.columns:
    # Correlation of every column with the target, ordered by strength
    # (absolute value) so strong negative correlations are listed next to
    # strong positive ones rather than at the bottom.
    target_correlations = correlation_matrix['cardio']
    target_correlations = target_correlations.reindex(
        target_correlations.abs().sort_values(ascending=False).index
    )

    print("Features Most Correlated with Cardiovascular Disease:")
    print("-" * 50)

    for feature, corr_value in target_correlations.items():
        if feature == 'cardio':  # Exclude self-correlation
            continue

        magnitude = abs(corr_value)
        if magnitude > 0.1:
            # Two tiers above the 0.1 cut-off; anything at or below it is
            # reported as low impact in the else branch.
            importance = "HIGH" if magnitude > 0.2 else "MEDIUM"
            direction = "increases risk" if corr_value > 0 else "decreases risk"
            print(f"  • {feature:20} : {corr_value:6.3f} ({importance} - {direction})")
        else:
            print(f"  • {feature:20} : {corr_value:6.3f} (LOW impact)")