# Cardiovascular Disease Risk Analysis

## 1. Introduction
This notebook analyzes the risk factors associated with Cardiovascular Disease (CVD) using machine learning techniques. We use the `cardio_train.csv` dataset to train and compare several classification models.

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier, export_text

# Ensure plots display in the notebook
%matplotlib inline

## 2. Data Loading and Preprocessing
We load the data, remove duplicates, and perform necessary cleaning steps like handling outliers in blood pressure and height/weight data.

In [None]:
def load_and_preprocess(filepath):
    """Read the semicolon-separated cardio CSV and return a cleaned DataFrame.

    Steps, in order: drop the ``id`` column if present, add ``age_years``
    (``age`` divided by 365.25, rounded to one decimal), drop duplicate rows,
    keep rows with 60 <= ap_hi <= 240, 30 <= ap_lo <= 160 and ap_hi > ap_lo,
    then keep rows with height > 100 and weight > 30. The shape before and
    after cleaning is printed along the way.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows; original row index labels are kept (not reset).
    """
    banner = "\n--- User Requested Information ---"
    rule = "----------------------------------\n"

    print(f"Loading data from {filepath}...")
    records = pd.read_csv(filepath, sep=';')

    # Report the raw shape before anything is modified.
    print(banner)
    print(f"Data Shape BEFORE Preprocessing: {records.shape}")
    print(rule)

    # The row identifier carries no signal and would hide duplicate rows.
    if 'id' in records.columns:
        records = records.drop(columns='id')

    records['age_years'] = records['age'].div(365.25).round(1)

    n_dupes = records.duplicated().sum()
    if n_dupes > 0:
        print(f"Removing {n_dupes} duplicate rows...")
        records = records.drop_duplicates()

    # Blood pressure sanity filter: both readings in range, systolic above diastolic.
    systolic = records['ap_hi']
    diastolic = records['ap_lo']
    plausible_bp = (
        systolic.between(60, 240)
        & diastolic.between(30, 160)
        & systolic.gt(diastolic)
    )
    bp_filtered = records[plausible_bp].copy()
    n_removed = len(records) - len(bp_filtered)
    print(f"Rows after blood pressure cleaning: {len(bp_filtered)} (removed {n_removed})")

    # Body-measurement sanity filter.
    plausible_body = bp_filtered['height'].gt(100) & bp_filtered['weight'].gt(30)
    cleaned = bp_filtered[plausible_body]

    # Report the shape once all filters have been applied.
    print(banner)
    print(f"Final Data Shape AFTER Preprocessing: {cleaned.shape}")
    print(rule)

    return cleaned

# Execution
# The default below is a machine-specific absolute path. Set the CVD_DATA_PATH
# environment variable to point at cardio_train.csv when running elsewhere.
data_path = os.environ.get("CVD_DATA_PATH", "f:/venv/cvd_analysis/cardio_train.csv")
df = load_and_preprocess(data_path)

## 3. Exploratory Data Analysis
Here we visualize CVD case counts by smoking status and quantify the unadjusted association between smoking and CVD (prevalence and odds ratio).

In [None]:
def optimize_visuals(df):
    """Show a count plot of CVD outcome grouped by smoking status.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``smoke`` (x axis) and ``cardio`` (hue).
    """
    plt.figure(figsize=(6, 5))
    # countplot draws on the current axes and hands them back for labelling.
    axis = sns.countplot(data=df, x='smoke', hue='cardio')
    axis.set_title('CVD Cases by Smoking Status')
    axis.set_xlabel('Smoking Status (0=No, 1=Yes)')
    axis.set_ylabel('Count')
    axis.legend(title='CVD', labels=['No', 'Yes'])
    plt.show()

def analyze_smoking_effect(df):
    """Print CVD prevalence by smoking status and the unadjusted odds ratio.

    The 2x2 smoke-by-cardio table is aligned to the explicit levels 0 and 1,
    so a cohort that lacks one level (e.g. no smokers) no longer raises or
    silently reads the wrong cell. When a cell needed as a denominator is
    empty, the odds ratio is reported as undefined instead of dividing by
    zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``smoke`` and ``cardio``. Both are assumed
        to be 0/1 indicators (as the printed labels imply) - confirm against
        the dataset.

    Returns
    -------
    None
        All results are printed.
    """
    print("\n--- Analysis: Effect of Smoking on CVD ---")

    levels = [0, 1]

    # Missing groups become NaN rather than raising a KeyError.
    prevalence = df.groupby('smoke')['cardio'].mean().reindex(levels)
    print("\nCVD Prevalence:")
    print(f"Non-Smokers (0): {prevalence.loc[0]*100:.2f}%")
    print(f"Smokers (1):     {prevalence.loc[1]*100:.2f}%")

    # Force a full 2x2 layout addressed by label, not by position.
    ct = pd.crosstab(df['smoke'], df['cardio']).reindex(
        index=levels, columns=levels, fill_value=0
    )
    smoker_cvd, smoker_healthy = ct.loc[1, 1], ct.loc[1, 0]
    nonsmoker_cvd, nonsmoker_healthy = ct.loc[0, 1], ct.loc[0, 0]

    print("\nUnadjusted Odds Ratio (Smoker vs Non-Smoker):")
    # These three counts end up in a denominator of the odds ratio.
    if min(smoker_healthy, nonsmoker_healthy, nonsmoker_cvd) == 0:
        print("OR: undefined (a required cell of the smoke x cardio table is empty)")
        return

    odds_smoker = smoker_cvd / smoker_healthy
    odds_nonsmoker = nonsmoker_cvd / nonsmoker_healthy
    or_val = odds_smoker / odds_nonsmoker

    print(f"OR: {or_val:.4f}")
    if or_val > 1:
        print("Interpretation: Smokers have higher odds of CVD compared to non-smokers (in this unadjusted view).")
    else:
        print("Interpretation: Smokers have lower/equal odds of CVD compared to non-smokers (unexpected, likely due to age confounding).")

# Plot CVD counts by smoking status, then print prevalence and the unadjusted odds ratio.
optimize_visuals(df)
analyze_smoking_effect(df)

## 4. Model Training and Evaluation
We train multiple models ranging from Logistic Regression to Ensemble methods like Gradient Boosting and Voting Classifiers.

In [None]:
def train_and_evaluate(df):
    """Train several classifiers on the CVD data and compare their test accuracy.

    The frame is split 80/20 (``random_state=42``) and the features are
    standardized with a scaler fitted on the training rows only, so the
    held-out rows do not influence the scaling statistics. Each model is
    fitted and scored with accuracy. Along the way the function shows a
    logistic-regression coefficient plot, a random-forest confusion matrix,
    the first lines of the decision-tree rules, and finally a bar chart
    comparing all models.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the feature columns listed in ``features`` below and the
        target column ``cardio``.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    print("\n--- Model Training & Evaluation ---")

    features = ['age_years', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                'cholesterol', 'gluc', 'smoke', 'alco', 'active']
    target = 'cardio'

    X = df[features]
    y = df[target]

    # Split BEFORE scaling: fitting the scaler on all rows would leak test-set
    # statistics (mean/std) into training and bias the reported accuracy.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Initialize individual models
    lr = LogisticRegression(max_iter=1000)
    knn = KNeighborsClassifier(n_neighbors=5)
    svm = LinearSVC(random_state=42, dual=False)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    dt = DecisionTreeClassifier(max_depth=5, random_state=42)
    nb = GaussianNB()
    gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
    ada = AdaBoostClassifier(n_estimators=100, random_state=42)

    # Create a Voting Classifier (Ensemble)
    voting_clf = VotingClassifier(
        estimators=[('lr', lr), ('rf', rf), ('gb', gb)],
        voting='hard'
    )

    models = {
        "Logistic Regression": lr,
        "K-Nearest Neighbors": knn,
        "SVM": svm,
        "Decision Tree": dt,
        "Naive Bayes": nb,
        "Random Forest": rf,
        "Gradient Boosting": gb,
        "AdaBoost": ada,
        "Voting Classifier": voting_clf
    }

    model_performance = []

    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        print(f"{name} Accuracy: {acc:.4f}")
        # print(classification_report(y_test, preds)) # Optional: Uncomment for full report

        model_performance.append({'Model': name, 'Accuracy': acc})

        if name == "Logistic Regression":
            print("Logistic Regression Coefficients (Feature Importance):")
            # Features are standardized, so coefficient magnitudes are comparable.
            coeffs = pd.DataFrame({
                'Feature': features,
                'Coefficient': model.coef_[0]
            }).sort_values(by='Coefficient', ascending=False)

            # Show Feature Importance Plot
            plt.figure(figsize=(10, 6))
            sns.barplot(x='Coefficient', y='Feature', data=coeffs)
            plt.title('Feature Importance (Logistic Regression)')
            plt.tight_layout()
            plt.show()

        # Show Confusion Matrix for Random Forest (or best model)
        if name == "Random Forest":
            cm = confusion_matrix(y_test, preds)
            plt.figure(figsize=(6, 5))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
            plt.title(f'Confusion Matrix - {name}')
            plt.ylabel('Actual')
            plt.xlabel('Predicted')
            plt.tight_layout()
            plt.show()

        if name == "Decision Tree":
            # Export text representation of the tree rules
            tree_rules = export_text(model, feature_names=features)
            print("\nDecision Tree Rules (Top 5 levels):")
            print("\n".join(tree_rules.splitlines()[:20])) # Print first 20 lines

    # Generate Model Comparison Plot
    perf_df = pd.DataFrame(model_performance).sort_values(by='Accuracy', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Accuracy', y='Model', data=perf_df, palette='viridis')
    plt.title('Model Accuracy Comparison')
    plt.xlabel('Accuracy Score')
    plt.xlim(0.6, 0.8) # Zoom in for better differentiation on this dataset
    plt.tight_layout()
    plt.show()

# Fit and score every model on the preprocessed frame; results are printed and plotted.
train_and_evaluate(df)