In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

ModuleNotFoundError: No module named 'sklearn'

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the processed Cleveland heart-disease file from the UCI repository.
# `names=columns` supplies the column names explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang',
    'oldpeak', 'slope', 'ca', 'thal', 'target'
]

try:
    heart_df = pd.read_csv(url, names=columns)
except Exception as e:
    print(f"‚ùå Error loading dataset: {e}")
    # Re-raise rather than calling exit(): inside a notebook exit() asks the
    # kernel to shut down, which discards all state and hides the traceback.
    # Raising stops "Run All" at this cell with the original error intact.
    raise
else:
    print("‚úÖ Dataset loaded successfully!")


In [None]:
# Preview the first 25 rows of the loaded frame.
heart_df.head(n=25)

In [None]:
# Cleaning: turn '?' placeholders into NaN, force 'ca' and 'thal' to numeric
# (anything unparseable becomes NaN), then fill every remaining NaN with the
# median of its column.
heart_df = heart_df.replace('?', np.nan)
heart_df = heart_df.assign(
    **{name: pd.to_numeric(heart_df[name], errors='coerce') for name in ('ca', 'thal')}
)
heart_df = heart_df.fillna(heart_df.median())

In [None]:
# Sex vs Target: one bar per sex value, coloured by target value.
sex_ax = sns.countplot(data=heart_df, x='sex', hue='target')
sex_ax.set_title('Heart Disease by Sex')
plt.show()

In [None]:
# Pairwise correlation of every column, drawn as an annotated heatmap.
corr_fig, corr_ax = plt.subplots(figsize=(18, 12))
sns.heatmap(heart_df.corr(), annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Separate the features from the label column.
X = heart_df.drop('target', axis=1)

# Collapse the label to 0/1 (any value > 0 counts as disease) so the counts
# and the plot below agree with the binary axis label. The raw 'target'
# column is not guaranteed to be 0/1 at this point
# (NOTE(review): the UCI description lists values 0-4 — confirm).
y = (heart_df['target'] > 0).astype(int)

# --- Class distribution before SMOTE ---
print("Before SMOTE:")
print(y.value_counts())

sns.countplot(x=y)
plt.title("Class Distribution Before SMOTE")
plt.xlabel("Target Class (0 = No Disease, 1 = Disease)")
plt.ylabel("Count")
plt.show()



In [None]:
# --- Features and labels ---

# One-hot encode the categorical variables. Only columns that are still
# present are encoded: this cell overwrites heart_df, so on a second run the
# original columns are gone and an unguarded call would raise KeyError.
categorical_columns = [
    col for col in ['cp', 'restecg', 'slope', 'thal', 'sex'] if col in heart_df.columns
]
if categorical_columns:
    heart_df = pd.get_dummies(heart_df, columns=categorical_columns)

feature_columns = [col for col in heart_df.columns if col != 'target']
X = heart_df[feature_columns]  # Features
y = (heart_df['target'] > 0).astype(int)  # Binary label: 1 if target > 0 else 0

# --- Apply SMOTE ---
# NOTE(review): this resamples the full dataset. X_balanced / y_balanced
# therefore contain synthetic rows, and any train/test split taken from them
# can place synthetic points in the test set.
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Count each class once and reuse the result for the printouts below.
counts_before = y.value_counts()
counts_after = y_balanced.value_counts()

# --- Class distribution after SMOTE ---
# The resampled labels are `y_balanced`; the previous code referenced an
# undefined name (`y_sm`) here and raised NameError.
print("\nAfter SMOTE:")
print(counts_after)

sns.countplot(x=y_balanced)
plt.title("Class Distribution After SMOTE")
plt.xlabel("Target Class (0 = No Disease, 1 = Disease)")
plt.ylabel("Count")
plt.show()

# --- Summary printout ---
print(f"Class 0 count before: {counts_before[0]}")
print(f"Class 1 count before: {counts_before[1]}")
print(f"Class 0 count after: {counts_after[0]}")
print(f"Class 1 count after: {counts_after[1]}")

In [None]:
# Splitting the Data into Training & Test Sets
# Split the original (un-resampled) X / y first, stratified on the label, and
# oversample the training portion only. Splitting data that has already been
# through SMOTE puts synthetic rows (interpolated using the whole dataset)
# into the test set, which leaks information and inflates the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Feature scaling: fit the scaler on the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler to scaler.pkl.
with open("scaler.pkl", mode="wb") as scaler_out:
    pickle.dump(scaler, scaler_out)

In [None]:
# Training
# XGBoost hyperparameters are collected in one dict, then the classifier is
# fitted on the scaled training features.
xgb_params = dict(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
)
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train_scaled, y_train)



In [None]:
# Predict on the scaled test split, then print the per-class report and accuracy.
y_pred_xgb = xgb_model.predict(X_test_scaled)
xgb_report = classification_report(y_test, y_pred_xgb)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

print("\nüîç XGBoost Model Performance:")
print(xgb_report)
print(f"üìä Accuracy: {xgb_accuracy:.4f}")

In [None]:
# Write the trained XGBoost model to heart_disease_model.pkl (pickle format).
with open("heart_disease_model.pkl", mode="wb") as model_out:
    pickle.dump(xgb_model, model_out)

print("\n‚úÖ Model and scaler saved successfully!")

In [None]:
# Feature Importance Analysis
# Pair each feature column name with the model's importance score and order
# the table from most to least important.
feature_importances = (
    pd.DataFrame({"Feature": X.columns, "Importance": xgb_model.feature_importances_})
    .sort_values(by="Importance", ascending=False)
)

In [None]:
# Show the ten highest-ranked rows of the importance table under a heading.
print("\nüîπ Top 10 Important Features:", feature_importances.head(10), sep="\n")



In [None]:
# Candidate classifiers for the comparison, keyed by display name.
# Insertion order determines the order in which they are trained.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=2000, random_state=42)
models["Random Forest"] = RandomForestClassifier(n_estimators=200, random_state=42)
models["AdaBoost"] = AdaBoostClassifier(n_estimators=100, random_state=42)
models["CatBoost"] = CatBoostClassifier(iterations=200, verbose=0, random_state=42)
models["XgBoost"] = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
)

In [None]:
# Fit every candidate model on the training split and collect one row of
# test-set metrics per model.
# NOTE(review): models here are fitted on the unscaled X_train / X_test,
# whereas the standalone XGBoost model above uses the scaled arrays — confirm
# this difference is intended.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Positive-class probability, when the estimator exposes predict_proba.
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None

    # Build the report once and reuse the positive-class ("1") entry for
    # precision, recall and F1.
    positive_metrics = classification_report(y_test, y_pred, output_dict=True)["1"]

    results.append({
        "Model": name,
        "Accuracy": round(acc, 4),
        # Compare against None explicitly: a plain truthiness test would
        # report a legitimate AUC of 0.0 as "-".
        "AUC": round(auc, 4) if auc is not None else "-",
        "Precision": round(positive_metrics["precision"], 4),
        "Recall": round(positive_metrics["recall"], 4),
        "F1-Score": round(positive_metrics["f1-score"], 4)
    })


In [None]:
# Turn the collected metric rows into a table, best accuracy first, and render it.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="Accuracy", ascending=False)
display(results_df)


In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of accuracy per model; the y-axis is inverted so the
# first row of results_df is drawn at the top.
cmp_fig, cmp_ax = plt.subplots(figsize=(12, 8))
cmp_ax.barh(results_df["Model"], results_df["Accuracy"], color='blue')
cmp_ax.set_xlabel("Accuracy")
cmp_ax.set_title("Model Performance Comparison")
cmp_ax.invert_yaxis()
plt.show()
