In [0]:
#import mlflow
#mlflow.set_experiment("/Workspace/Data Science end to end Project/Heart_Disease_Experiment")

In [0]:
import pandas as pd

# Absolute workspace location of the raw heart-disease CSV
file_path = "/Workspace/Data Science end to end Project/Heart_disease.csv"

# Read the raw file into the working frame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows (shown as the cell output)
df.head()

In [0]:
# (rows, columns) of the frame as loaded, before any cleaning
df.shape

# Checking Data types of features:

In [0]:
# Per-column summary: non-null counts and dtypes
df.info()

# Checking duplicates

In [0]:
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()

In [0]:
# Dropping duplicates: keep the first occurrence of every repeated row
df = df.drop_duplicates()

In [0]:
# Shape after duplicate removal; compare with the shape shown right after loading
df.shape

# Finding Missing and Empty Values in the Object Columns:
      object_cols = df.select_dtypes(include='object').columns
      for col in object_cols:
          empty_count = df[col].isnull().sum() + (df[col].str.strip() == '').sum()
          print(f"'{col}' has {empty_count} empty/missing values")

# Identify Columns With Missing Values

In [0]:
# Count of NaN cells per column
df.isnull().sum()

In [0]:
# Share of NaN cells per column, expressed in percent
missing_percentage = df.isnull().sum().div(len(df)).mul(100)
print(missing_percentage)

In [0]:
# Rows that contain at least one NaN
df.loc[df.isnull().any(axis=1)]

# Replace Empty Strings or Empty Objects with NaN

In [0]:
import numpy as np
# Replace empty strings or whitespace-only cells with NaN so they count as missing.
# The pattern ^\s*$ also matches the empty string, so a single regex pass covers
# both cases; a second replace('') pass would find nothing left to change.
df = df.replace(r'^\s*$', np.nan, regex=True)

# Decide How to Handle Each Column

- Low missing values (<5%) → Can fill with mean/median/mode
- High missing values (>30%) → Consider dropping the column (or imputing carefully)

| Data Type / Scenario               | Impute With |
| ---------------------------------- | ----------- |
| Normally distributed, no outliers  | Mean        |
| Skewed distribution / has outliers | Median      |

**Optional – Drop Columns With Too Many Missing Values**


        threshold = 0.3  # 30% missing
        for col in df.columns:
            if df[col].isnull().sum() / len(df) > threshold:
                df.drop(col, axis=1, inplace=True)
        


In [0]:
# Names of the columns that still contain at least one missing value
Nan_Col = df.columns[df.isna().any(axis=0)].tolist()
Nan_Col

In [0]:
# Preview only the columns listed in Nan_Col
df[Nan_Col].head()

In [0]:
# Report how skewed the two columns are
for feature in ('BMI', 'PhysicalHealth'):
    print(f"Skewness of {feature}", df[feature].skew())

In [0]:
# Impute every column listed in Nan_Col.
# The filled Series is assigned back to the frame: calling fillna(inplace=True)
# on df[col] acts on an intermediate object, which pandas deprecates and which
# leaves df unchanged when Copy-on-Write is active.
for col in Nan_Col:
    if pd.api.types.is_numeric_dtype(df[col]):
        # Numeric column: fill with the median
        df[col] = df[col].fillna(df[col].median())
    else:
        # Non-numeric column: fill with the most frequent value
        df[col] = df[col].fillna(df[col].mode()[0])

In [0]:
# Shape of the subset of rows that still hold a NaN after imputation
df.loc[df.isnull().any(axis=1)].shape

In [0]:
# Columns stored as int64 or float64; the outlier scan below iterates over them
numeric_cols = df.select_dtypes(include=['int64','float64']).columns
numeric_cols

# Numerical features Outliers:

In [0]:
# IQR scan: for each numeric column keep the values outside the 1.5 * IQR fences
outliers = {}

for col in numeric_cols:
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    is_outlier = df[col].lt(lower_bound) | df[col].gt(upper_bound)
    outliers[col] = df.loc[is_outlier, col]
    print(f"{col}: {len(outliers[col])} outliers")

In [0]:
# Skewness of every column in numeric_cols (shown as the cell output)
print("Skewness of numeric columns:\n")
df.loc[:, numeric_cols].skew()

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Detect outliers using IQR ---
def detect_outliers_iqr(df, column):
    """Find the values of ``column`` that lie outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : str
        Name of the column to inspect.

    Returns
    -------
    tuple
        ``(outliers, lower, upper)``: the Series of values strictly below
        ``lower`` or strictly above ``upper`` (original index kept), followed
        by the two fence values.
    """
    values = df[column]
    q25 = values.quantile(0.25)
    q75 = values.quantile(0.75)
    spread = q75 - q25
    lower = q25 - 1.5 * spread
    upper = q75 + 1.5 * spread
    # Strict comparisons: values sitting exactly on a fence are not flagged
    outside = values.lt(lower) | values.gt(upper)
    return df.loc[outside, column], lower, upper

# --- Plot boxplot and KDE for a given column ---
def plot_outliers(df, column):
    """Print the IQR outlier summary for ``column`` and plot it.

    The fences come from ``detect_outliers_iqr``. One figure is drawn with a
    boxplot on the left and a KDE curve on the right; the KDE panel marks both
    fences with dashed red lines.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : str
        Name of the column to summarise and plot.
    """
    flagged, lower, upper = detect_outliers_iqr(df, column)

    print(f"\nFeature: {column}")
    print(f"Lower Bound: {lower:.2f}, Upper Bound: {upper:.2f}")
    print(f"Detected {len(flagged)} outliers")
    if len(flagged) > 0:
        print(f"Outlier values: {flagged.values}")

    # One figure, two panels side by side
    fig, (box_ax, kde_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: boxplot
    sns.boxplot(x=df[column], color='lightblue', ax=box_ax)
    box_ax.set_title(f"Boxplot of {column}")

    # Right panel: density curve with both fences marked
    sns.kdeplot(df[column], fill=True, color='skyblue', ax=kde_ax)
    for fence_label, fence in (('Lower Bound', lower), ('Upper Bound', upper)):
        kde_ax.axvline(fence, color='red', linestyle='--', label=fence_label)
    kde_ax.set_title(f"KDE Plot of {column}")
    kde_ax.legend()

    plt.tight_layout()
    plt.show()

# --- Draw the outlier diagnostics for every numeric column ---
numeric_cols = df.select_dtypes(include='number').columns

for col in numeric_cols:
    plot_outliers(df, column=col)

In [0]:
# Sorted distinct values of the two health columns
print("Unique values in PhysicalHealth:")
print(np.unique(df['PhysicalHealth']))

print("\nUnique values in MentalHealth:")
print(np.unique(df['MentalHealth']))

Conclusion:
1. Respondent’s physical health was not good (due to illness or injury) for the last 5 to 30 days.
2. Respondent’s mental health was not good (due to illness or injury) for the last 10 to 30 days.

In [0]:
# --- Function to fill outliers dynamically ---
def handle_outliers_dynamic(df, columns=None):
    """Replace IQR outliers with the column median or mean, chosen by skewness.

    For each selected column the 1.5 * IQR fences are taken from
    ``detect_outliers_iqr``. Values outside the fences are overwritten with
    the median when ``abs(skew) > 0.5`` and with the mean otherwise. A short
    report is printed for every processed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    columns : iterable of str, optional
        Columns to process. Defaults to ``['BMI', 'SleepTime']``, the two
        columns this function previously had hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    target_cols = ['BMI', 'SleepTime'] if columns is None else list(columns)

    # Fail before touching any data if a requested column is absent
    absent = [name for name in target_cols if name not in df.columns]
    if absent:
        raise KeyError(f"Columns not found in DataFrame: {absent}")

    for col in target_cols:
        if df[col].nunique() <= 2:
            continue  # skip binary columns

        skewness = df[col].skew()
        outliers, lower, upper = detect_outliers_iqr(df, col)

        if len(outliers) == 0:
            print(f"No outliers detected in '{col}'")
            continue

        print(f"\nFeature: {col}")
        print(f"Skewness: {skewness:.2f}")
        print(f"Lower Bound: {lower:.2f}, Upper Bound: {upper:.2f}")
        print(f"Detected {len(outliers)} outliers")

        # --- Choose filling strategy based on skewness ---
        # The fill value is computed on the column as it is now, outliers included.
        if abs(skewness) > 0.5:
            # Skewed — use median
            fill_value = df[col].median()
            method = "median"
        else:
            # Nearly normal — use mean
            fill_value = df[col].mean()
            method = "mean"

        # --- Replace outliers with chosen fill value ---
        df.loc[(df[col] < lower) | (df[col] > upper), col] = fill_value

        print(f"→ Outliers in '{col}' filled using {method} = {fill_value:.2f}")

    return df

# --- Apply dynamic outlier handling ---
# The function edits the frame it receives and returns that same object,
# so this rebinding keeps `df` pointing at the cleaned data.
df = handle_outliers_dynamic(df)


In [0]:
# One boxplot per int64/float64 column, each in its own figure
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()

# Categorical feature Outliers:

# 1. Frequency Count

In [0]:
import pandas as pd

# Frequency table for every object-typed column
categorical_cols = df.select_dtypes(include='object').columns

for col in categorical_cols:
    frequency_table = df[col].value_counts().reset_index()
    print(f"\nColumn: {col}")
    display(frequency_table)

In [0]:
# Plain list of the object-typed column names
category_cols = list(df.select_dtypes(include='object').columns)
category_cols

# Count plot distribution of categorical columns by Heart Disease.

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set a clean style
sns.set(style="whitegrid", context="talk")

# Select categorical columns and remove target column
category_cols = df.select_dtypes(include=['object']).columns.tolist()
if 'HeartDisease' in category_cols:
    category_cols.remove('HeartDisease')

# Plot for each categorical column
for col in category_cols:
    plt.figure(figsize=(12,6))

    # Countplot with HeartDisease hue.
    # The palette keys require HeartDisease to still hold 'Yes'/'No' strings.
    ax = sns.countplot(
        data=df,
        x=col,
        hue='HeartDisease',
        palette={"Yes": "red", "No": "green"},
        order=df[col].value_counts().index  # Sort bars by frequency
    )

    # Titles and labels
    plt.title(f'{col} vs Heart Disease', fontsize=18, fontweight='bold', pad=15)
    plt.xlabel(col, fontsize=14)
    plt.ylabel('Count', fontsize=14)
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='Heart Disease', loc='upper right', fontsize=12)

    # Add count labels
    for p in ax.patches:
        height = p.get_height()
        # Skip empty bars: depending on the seaborn version, a category/hue
        # combination with no rows (or a placeholder patch) can have a NaN or
        # zero height. int(NaN) raises, and a zero bar would get a stray "0".
        if not height > 0:
            continue
        ax.annotate(f'{int(height)}',
                    (p.get_x() + p.get_width() / 2., height),
                    ha='center', va='bottom',
                    fontsize=11, color='black',
                    xytext=(0, 5), textcoords='offset points')

    plt.tight_layout()
    plt.show()


        df['Race'] = df['Race'].map({'White': 0, 'Black': 1, 'Asian': 2, 'Hispanic': 3, 'Other': 4})
        df['GenHealth'] = df['GenHealth'].map({'Poor': 0, 'Fair': 1, 'Good': 2, 'Very good': 3, 'Excellent': 4})
        df['AgeCategory'] = df['AgeCategory'].map({'18-24': '<25', '25-29': '<30', '30-34': '<35', '35-39': '<40', '40-44': '<45', '45-49': '<50', '50-54': '<55', '55-59': '<60', '60-64': '<65', '65-69': '<70', '70-74': '<75', '75-79': '<80', '80 or older': '>=80'})

In [0]:
# Encode the categorical columns.
# Series.map turns every value that is missing from its mapping into NaN, so
# unmapped categories are reported below instead of disappearing silently.
yes_no_map = {'Yes': 1, 'No': 0}
column_maps = {
    'HeartDisease': yes_no_map,
    'Smoking': yes_no_map,
    'AlcoholDrinking': yes_no_map,
    'Stroke': yes_no_map,
    'DiffWalking': yes_no_map,
    'Sex': {'Male': 1, 'Female': 0},
    'Diabetic': {'Yes': 1, 'No': 0, 'No, borderline diabetes': 0, 'Yes (during pregnancy)': 1},
    'PhysicalActivity': yes_no_map,
    'Asthma': yes_no_map,
    'KidneyDisease': yes_no_map,
    'SkinCancer': yes_no_map,
    'Race': {'White': 0, 'Black': 1, 'Asian': 2, 'Hispanic': 3, 'Other': 4},
    'GenHealth': {'Poor': 0, 'Fair': 1, 'Good': 2, 'Very good': 3, 'Excellent': 4},
    'AgeCategory': {'18-24': '<25', '25-29': '<30', '30-34': '<35', '35-39': '<40', '40-44': '<45', '45-49': '<50', '50-54': '<55', '55-59': '<60', '60-64': '<65', '65-69': '<70', '70-74': '<75', '75-79': '<80', '80 or older': '>=80'},
}

for col, mapping in column_maps.items():
    present = set(df[col].dropna().unique())

    # Already encoded (e.g. the cell was run twice): mapping the encoded
    # values again would replace the whole column with NaN, so leave it alone.
    if present and present <= set(mapping.values()):
        continue

    unmapped = present - set(mapping)
    if unmapped:
        print(f"Warning: '{col}' has values with no mapping (they become NaN): {sorted(map(str, unmapped))}")

    df[col] = df[col].map(mapping)

In [0]:
# Independent copy of the current frame; later changes to df do not affect it
df_Cleaned = df.copy()

In [0]:
# Preview the first rows of the copied frame
df_Cleaned.head()

In [0]:
# Relative frequency of each HeartDisease value, in percent
df['HeartDisease'].value_counts(normalize=True).mul(100)

In [0]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors
y = df_Cleaned['HeartDisease']
X = df_Cleaned.drop(columns=['HeartDisease'])

# Hold out 20% of the rows, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [0]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# One-hot encode the remaining non-numeric training columns so the resampler
# receives numeric input
X_train_encoded = pd.get_dummies(X_train)

# Resample the training split only, with a fixed seed.
# NOTE(review): X_train_res / y_train_res are not read by the encoding cells
# that follow in this section; confirm they are consumed further down.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_encoded, y_train)

In [0]:
from sklearn.preprocessing import OneHotEncoder
# Categorical predictors to expand into indicator columns
multi_cat_cols = ['AgeCategory', 'Race', 'Diabetic', 'GenHealth']

# Learn the categories from the training split only. The first level of each
# column is dropped; categories unseen at fit time are ignored at transform time.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
encoder.fit(X_train.loc[:, multi_cat_cols])

In [0]:
# Encode the selected columns of both splits with the fitted encoder
X_train_ohe = encoder.transform(X_train[multi_cat_cols])
X_test_ohe = encoder.transform(X_test[multi_cat_cols])

# Wrap the encoded matrices as labelled frames
encoded_cols = encoder.get_feature_names_out(multi_cat_cols)
X_train_ohe_df = pd.DataFrame(X_train_ohe.toarray(), index=X_train.index, columns=encoded_cols)
X_test_ohe_df = pd.DataFrame(X_test_ohe.toarray(), index=X_test.index, columns=encoded_cols)

# Swap the raw columns for their encoded versions. Both parts get a fresh
# 0..n-1 index before being joined side by side.
train_parts = [X_train.drop(columns=multi_cat_cols), X_train_ohe_df]
test_parts = [X_test.drop(columns=multi_cat_cols), X_test_ohe_df]
X_train_final = pd.concat([part.reset_index(drop=True) for part in train_parts], axis=1)
X_test_final = pd.concat([part.reset_index(drop=True) for part in test_parts], axis=1)