In [None]:
# === SECTION 1: Scaling & Encoding ===

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Preprocess `data` in place: encode object columns, scale numeric features,
# and split the result into features (X) and target (Y).
# NOTE: this cell reassigns/mutates `data`, so re-running it without first
# reloading `data` will process already-processed columns a second time.

# 0. Clean column names (remove whitespace) BEFORE any name-based lookup, so
#    the 'HeartDisease' exclusion below cannot miss a padded header and let
#    the target slip into the scaled columns.
data.columns = data.columns.str.strip()

# 1. Identify column types
# Binary = object columns with exactly two distinct non-null values.
binary_cols = [col for col in data.columns if data[col].dtype == 'object' and data[col].nunique() == 2]
categorical_cols = [col for col in data.select_dtypes(include='object') if col not in binary_cols]
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [col for col in numeric_cols if col != 'HeartDisease']  # Exclude the target

# 2. Encode Binary Columns
# Sort the two labels so the 0/1 assignment does not depend on row order, and
# drop NaN first: `nunique()` ignores NaN but `unique()` does not, so a column
# with missing values could otherwise map NaN -> 0 and lose a real label.
for col in binary_cols:
    label_zero, label_one = sorted(data[col].dropna().unique(), key=str)
    data[col] = data[col].map({label_zero: 0, label_one: 1})

# 3. One-Hot Encode Categorical Columns
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# 4. Scale Numeric Columns
# Fit the scaler on the future training rows only, so test-set statistics do
# not leak into the features. The shuffle performed by train_test_split is
# determined by the row count and the seed, so using the same test_size /
# random_state as the Section 2 split selects the same training rows.
# Keep these two values in sync with Section 2.
scaler = StandardScaler()
if numeric_cols:
    train_positions, _ = train_test_split(list(range(len(data))), test_size=0.2, random_state=42)
    scaler.fit(data.iloc[train_positions][numeric_cols])
    data[numeric_cols] = scaler.transform(data[numeric_cols])

# 5. Convert 'bool' types (from get_dummies) to 'int'
bool_cols = data.select_dtypes(include='bool').columns.tolist()
data[bool_cols] = data[bool_cols].astype(int)

# 6. Clean column names again: get_dummies builds new names from category
#    values, which may themselves carry stray whitespace.
data.columns = data.columns.str.strip()

# 7. Separate features (X) and target (Y)
X = data.drop('HeartDisease', axis=1)
Y = data['HeartDisease']


In [None]:
# === SECTION 2: Train/Test Split ===

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each feature partition.
print("Data successfully split into:")
for split_name, split_frame in (("X_Train", X_Train), ("X_Test", X_Test)):
    print(f"{split_name} shape: {split_frame.shape}")
print("Data is now preprocessed and ready for modeling.")

In [None]:
# === SECTION 3: APPLYING 5 BALANCING TECHNIQUES & VISUALIZING BALANCING RESULTS ===


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.under_sampling import TomekLinks

import warnings

# 1. Load & Prepare Data
# Re-read the raw CSV and integer-encode every object column. The encoder is
# re-fitted per column, so after the loop `le` only holds the last column's classes.
df = pd.read_csv('heart.csv')
le = LabelEncoder()
for col in df.select_dtypes(include='object').columns:
    df[col] = le.fit_transform(df[col])

# NOTE: `X` here replaces the scaled/one-hot `X` built in Section 1, and this
# split uses its own seed (369), independent of the Section 2 split.
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=369)

# 2. Define Techniques
# "Original" (None) is the un-resampled baseline. TomekLinks is constructed
# without a random_state here; the other samplers share seed 369.
samplers = {
    "Original": None,
    "SMOTE": SMOTE(random_state=369),
    "ADASYN": ADASYN(random_state=369),
    "SMOTETomek": SMOTETomek(random_state=369),
    "Tomek Links": TomekLinks(),
    "SMOTEENN": SMOTEENN(random_state=369)
}

# 3. Process Data
# Collect one record per (technique, class) with the training-set class count
# after resampling. Only the training split is ever resampled.
stats = []
for name, sampler in samplers.items():
    if sampler is None:
        counts = y_train.value_counts()
    else:
        try:
            X_res, y_res = sampler.fit_resample(X_train, y_train)
        except Exception as exc:
            # Stay best-effort (one failing sampler must not abort the whole
            # comparison) but say so, instead of silently dropping the
            # technique from the chart. Unlike a bare `except`, this lets
            # KeyboardInterrupt / SystemExit propagate.
            warnings.warn(f"{name} failed and is omitted from the comparison: {exc!r}")
            continue
        counts = y_res.value_counts()
    # Class 0 = Normal, class 1 = Disease; a class absent after resampling counts as 0.
    for target_class in (0, 1):
        stats.append({"Technique": name, "Class": target_class, "Count": counts.get(target_class, 0)})

df_stats = pd.DataFrame(stats)

# ---------------------------------------------------------
# 4. PLOTTING
# Horizontal lollipop chart: for every balancing technique, one stem per
# class showing the number of training samples after resampling.
# ---------------------------------------------------------
from matplotlib.lines import Line2D

# Apply the theme BEFORE creating the figure so that the figure and its axes
# are both built under the same rcParams, including on a fresh kernel.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(12, 7))

# Define Positions
techniques = df_stats['Technique'].unique()
y_range = range(len(techniques))
offsets = [-0.15, 0.15]  # vertical shift of the class-0 / class-1 stems around each tick

# Colors: Turquoise (Normal) & Dark Red (Heart Disease)
colors = ["#1ABC9C", "#C0392B"]

# Horizontal gap, in data units (samples), between a marker and its count label.
label_gap = 18

# One row per technique, one column per class, for direct lookup.
counts_wide = df_stats.pivot(index='Technique', columns='Class', values='Count')

for i, tech in enumerate(techniques):
    # Class 0 (Normal) is drawn first, then class 1 (Disease).
    for target_class, (offset, color) in enumerate(zip(offsets, colors)):
        value = counts_wide.loc[tech, target_class]
        y_pos = i + offset

        ax.hlines(y=y_pos, xmin=0, xmax=value, color=color, alpha=0.9, linewidth=4.5)
        ax.plot(value, y_pos, "o", markersize=15, color=color)
        ax.text(value + label_gap, y_pos, str(value), va='center', fontsize=11, fontweight='bold', color=color)

# Styling
ax.set_yticks(list(y_range))
ax.set_yticklabels(techniques, fontsize=13, fontweight='bold')
ax.set_xlabel("Number of Samples", fontsize=13, labelpad=10, fontweight='bold')

# Clean Spines
sns.despine(ax=ax, left=True, bottom=True)
ax.grid(axis='x', linestyle='--', alpha=0.6)
ax.grid(axis='y', alpha=0)  # keep the horizontal grid lines invisible

# Manually create legend handles: marker-only entries (the white line is
# invisible on the white legend background).
legend_elements = [
    Line2D([0], [0], marker='o', color='w', label='Normal', markerfacecolor=colors[0], markersize=12),
    Line2D([0], [0], marker='o', color='w', label='Heart Disease', markerfacecolor=colors[1], markersize=12)
]

ax.legend(handles=legend_elements, loc='upper right', frameon=True, fontsize=12, facecolor='white', framealpha=1, edgecolor='black')

fig.tight_layout()
plt.show()