In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Load the dataset (the file is semicolon-delimited, hence sep=';')
df = pd.read_csv('cardio_train.csv', sep=';')

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nColumn information:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()
print("\nBasic statistics:")
print(df.describe())

In [None]:
# Data-quality overview: null entries, repeated rows, column cardinality.

# Null entries counted per column
null_counts = df.isnull().sum()
print("Missing values:")
print(null_counts)

# Rows that exactly repeat an earlier row
n_duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {n_duplicates}")

# Number of distinct values held by each column
print("\nUnique values per column:")
for column_name, n_unique in df.nunique().items():
    print(f"{column_name}: {n_unique} unique values")

In [None]:
# Report the observed min/max of each measurement to spot outliers and
# unrealistic values. Each entry is (heading to print, columns to report).
range_sections = [
    ("Blood pressure values:", ['ap_hi', 'ap_lo']),
    ("\nHeight values:", ['height']),
    ("\nWeight values:", ['weight']),
]
for heading, column_names in range_sections:
    print(heading)
    for name in column_names:
        lowest, highest = df[name].min(), df[name].max()
        print(f"{name} min: {lowest}, max: {highest}")

# Biologically impossible readings: ap_hi recorded below ap_lo
inverted_pressure = df['ap_hi'] < df['ap_lo']
print(f"\nRows with ap_hi < ap_lo: {inverted_pressure.sum()}")

In [None]:
# Build a cleaned frame from the raw data; `df` itself is left untouched.

# Inclusive plausibility limits per measurement column.
valid_ranges = {
    'ap_hi': (60, 250),
    'ap_lo': (40, 150),
    'height': (140, 210),
    'weight': (40, 200),
}

# Combine every rule into one boolean mask so the frame is filtered once.
plausible = pd.Series(True, index=df.index)
for column, (lower, upper) in valid_ranges.items():
    plausible &= df[column].between(lower, upper)
# ap_hi must not be lower than ap_lo
plausible &= df['ap_hi'] >= df['ap_lo']

# .copy() makes df_clean an independent frame, so adding the bmi column
# below cannot raise pandas' SettingWithCopyWarning.
df_clean = df.loc[plausible].copy()

# Calculate BMI and remove unrealistic BMI values.
# height is divided by 100 before squaring (presumably cm -> m; weight
# presumably in kg -- confirm against the dataset description).
df_clean['bmi'] = df_clean['weight'] / ((df_clean['height'] / 100) ** 2)
df_clean = df_clean[df_clean['bmi'].between(15, 50)].copy()

# Guard the percentage against an empty input frame.
retained_pct = (len(df_clean) / len(df)) * 100 if len(df) else 0.0
print(f"Rows after cleaning: {len(df_clean)} ({retained_pct:.1f}% of original data)")

In [None]:
# Summary statistics of the cleaned frame. Left as the cell's last expression
# (not wrapped in print) so the notebook renders it as a rich table.
df_clean.describe()

In [None]:
# Prepare features and target
features = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
           'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'bmi']
X = df_clean[features]
y = df_clean['cardio']

# Feature importance using correlation.
# The matrix is limited to the model features plus the target so that any
# other column in df_clean (e.g. a row identifier, if present) cannot show up
# in the ranking.
correlation_matrix = df_clean[features + ['cardio']].corr()
target_correlations = (
    correlation_matrix['cardio']
    .drop('cardio')  # the target's correlation with itself (1.0) is not informative
    .abs()
    .sort_values(ascending=False)
)

print("Correlation with target variable (cardio):")
print(target_correlations)

# Feature importance using ANOVA F-value.
# k='all' keeps every feature: the selector is used only for its scores.
selector = SelectKBest(score_func=f_classif, k='all')
selector.fit(X, y)

# scores_ / pvalues_ follow the column order of X, i.e. the order of `features`
feature_scores = pd.DataFrame({
    'feature': features,
    'f_score': selector.scores_,
    'p_value': selector.pvalues_
}).sort_values('f_score', ascending=False)

print("\nANOVA F-scores:")
print(feature_scores)

In [None]:
# Set up the plotting style
plt.style.use('default')
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Key Features for Heart Disease Prediction', fontsize=16, fontweight='bold')

# Box plots: distribution of a numeric feature for each value of cardio.
# Each entry is (axes, column, title, y-axis label).
box_panels = [
    (axes[0, 0], 'ap_hi', 'Systolic BP vs Heart Disease', 'Systolic BP'),
    (axes[0, 2], 'age', 'Age vs Heart Disease', 'Age (days)'),
    (axes[1, 0], 'bmi', 'BMI vs Heart Disease', 'BMI'),
]
for ax, column, title, ylabel in box_panels:
    sns.boxplot(data=df_clean, x='cardio', y=column, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Heart Disease')
    ax.set_ylabel(ylabel)

# Count plots: number of rows per feature level, split by cardio.
# Each entry is (axes, column, title, x-axis label).
count_panels = [
    (axes[0, 1], 'cholesterol', 'Cholesterol Level vs Heart Disease', 'Cholesterol Level'),
    (axes[1, 1], 'gluc', 'Glucose Level vs Heart Disease', 'Glucose Level'),
    (axes[1, 2], 'active', 'Physical Activity vs Heart Disease', 'Physically Active'),
]
for ax, column, title, xlabel in count_panels:
    # hue_order pins which hue level is drawn first so the legend labels below
    # cannot be silently swapped (assumes cardio is coded 0 = no disease,
    # 1 = disease -- confirm against the dataset description).
    sns.countplot(data=df_clean, x=column, hue='cardio', hue_order=[0, 1], ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    # Pass the handles explicitly: a labels-only legend() call pairs labels
    # with artists purely by draw order, which is easy to get wrong.
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(handles, ['No Disease', 'Disease'])

plt.tight_layout()
plt.show()

# Correlation heatmap of the model features and the target
plt.figure(figsize=(10, 6))
corr_matrix = df_clean[features + ['cardio']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Rank features by importance.
# Both inputs are extracted as plain arrays in `features` order so the frame
# is built purely positionally, instead of mixing lists with a label-indexed
# Series and relying on implicit index alignment.
abs_correlation = correlation_matrix.loc[features, 'cardio'].abs().to_numpy()
f_scores = feature_scores.set_index('feature').loc[features, 'f_score'].to_numpy()

feature_importance = pd.DataFrame({
    'feature': features,
    'correlation': abs_correlation,
    'f_score': f_scores,
})

# Scale each component by its own maximum before summing so that both
# contribute on the same 0..1 scale; adding the raw |correlation| to a
# max-normalised F-score would let the F-score dominate the sum.
feature_importance['combined_score'] = (
    feature_importance['correlation'] / feature_importance['correlation'].max() +
    feature_importance['f_score'] / feature_importance['f_score'].max()
)

feature_importance = feature_importance.sort_values('combined_score', ascending=False)

print("="*60)
print("FEATURE IMPORTANCE RANKING FOR HEART DISEASE PREDICTION")
print("="*60)
print("\nTop 5 Most Important Features:")
print(feature_importance.head(5))

print("\n" + "="*60)
print("RECOMMENDED FEATURES FOR MODEL BUILDING:")
print("="*60)
# itertuples avoids building a Series per row, which iterrows does
for rank, row in enumerate(feature_importance.head(5).itertuples(index=False), 1):
    print(f"{rank}. {row.feature} (Score: {row.combined_score:.3f})")

print(f"\nDataset size after cleaning: {len(df_clean)} samples")
print(f"Features available: {len(features)}")
print("Target variable distribution:")
# normalize=True reports class shares rather than raw counts
print(df_clean['cardio'].value_counts(normalize=True))