In [1]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import warnings
# Silence every Python warning for the rest of the kernel session. This keeps
# cell output tidy, but deprecation and convergence warnings raised by later
# cells will not be shown either.
warnings.filterwarnings('ignore')


In [2]:
# Step 2: Load Dataset
# The CSV is resolved relative to the notebook's working directory;
# substitute a full path here if the file lives elsewhere.
csv_path = "Heart_Disease_Prediction.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows as the cell's output
df.head(5)


Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


In [3]:
# Step 3: Dataset Summary
# Print, in order: the frame's shape, its column index, the number of nulls
# per column, and the dtype of each column.
summary_items = (
    ("Shape:", df.shape),
    ("\nColumns:\n", df.columns),
    ("\nNull Values:\n", df.isnull().sum()),
    ("\nData Types:\n", df.dtypes),
)
for label, value in summary_items:
    print(label, value)


Shape: (270, 14)

Columns:
 Index(['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium', 'Heart Disease'],
      dtype='object')

Null Values:
 Age                        0
Sex                        0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
Heart Disease              0
dtype: int64

Data Types:
 Age                          int64
Sex                          int64
Chest pain type              int64
BP                           int64
Cholesterol                  int64
FBS over 120                 int64
EKG results                  int64
Max HR                       int6

In [4]:
# Step 4: Encode Target Column if it's categorical
# Replace 'Heart Disease' with the actual target column name if different
#
# The test is "not numeric" rather than `dtype == 'object'` so that text labels
# are still encoded when pandas loads them with a dedicated string or
# categorical dtype instead of object. Once the column holds integers the
# condition is false, so re-running this cell is a no-op.
if not pd.api.types.is_numeric_dtype(df['Heart Disease']):
    # Keep the fitted encoder: target_encoder.classes_ lists the original
    # labels in the order of the integer codes assigned to them.
    target_encoder = LabelEncoder()
    df['Heart Disease'] = target_encoder.fit_transform(df['Heart Disease'])

# Check encoding result
df['Heart Disease'].value_counts()


Heart Disease
0    150
1    120
Name: count, dtype: int64

In [5]:
# Step 5: Split Data into Features and Target
target_col = 'Heart Disease'
y = df[target_col]
X = df.drop(columns=[target_col])

# Train-Test Split: hold out 20% of the rows, with a fixed seed so the
# partition is the same on every run
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

for set_name, frame in (("Training set:", X_train), ("Testing set:", X_test)):
    print(set_name, frame.shape)


Training set: (216, 13)
Testing set: (54, 13)


In [6]:
# Step 6: Train Model (Random Forest)
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score them against the true labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.4f}")


Model Accuracy: 0.7963


In [7]:
# Step 7: Classification Report and Confusion Matrix
# Both are computed from the test labels and the random forest predictions.
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", conf_matrix)


Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.84        33
           1       0.78      0.67      0.72        21

    accuracy                           0.80        54
   macro avg       0.79      0.77      0.78        54
weighted avg       0.79      0.80      0.79        54

Confusion Matrix:
 [[29  4]
 [ 7 14]]


In [9]:
# Step 8: Save the Trained Model to 'model/model.pkl'
import os

# A single path drives both the write and the confirmation message, so the
# message can never report a location other than the one actually written.
model_path = "model/model.pkl"

# Create the target folder first; open() does not create missing directories.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, "wb") as f:
    pickle.dump(model, f)

print(f"✅ Model saved successfully as {model_path}")


✅ Model saved successfully as model/model.pkl


In [10]:
# Install XGBoost if not already installed
!pip install xgboost


Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB 326.8 kB/s eta 0:07:39
   ---------------------------------------- 0.1/150.0 MB 326.1 kB/s eta 0:07:40
   ---------------------------------------- 0.1/150.0 MB 544.7 kB/s eta 0:04:36
   ---------------------------------------- 0.2/150.0 MB 827.9 kB/s eta 0:03:01
   ---------------------------------------- 0.3/150.0 MB 871.5 kB/s eta 0:02:52
   ---------------------------------------- 0.4/150.0 MB 1.2 MB/s eta 0:02:10
   ---------------------------------------- 0.5/150.0 MB 1.3 MB/s eta 0:01:59
   --

DEPRECATION: Loading egg at c:\programdata\anaconda3\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [11]:
# accuracy_score, classification_report and confusion_matrix are already
# imported in the notebook's first cell; only XGBoost is new here.
from xgboost import XGBClassifier

# Step 1: Train XGBoost Classifier
# `use_label_encoder` is intentionally not passed: it is an obsolete option
# that the installed xgboost 3.x release no longer acts on. The target column
# is already integer-encoded (0/1) by the earlier encoding step.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 2: Make predictions on the held-out test rows
y_pred_xgb = xgb_model.predict(X_test)

# Step 3: Evaluate against the true test labels
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"✅ XGBoost Accuracy: {accuracy_xgb:.4f}")

print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred_xgb))

print("\n🧾 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))


✅ XGBoost Accuracy: 0.8148

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85        33
           1       0.79      0.71      0.75        21

    accuracy                           0.81        54
   macro avg       0.81      0.80      0.80        54
weighted avg       0.81      0.81      0.81        54


🧾 Confusion Matrix:
[[29  4]
 [ 6 15]]


In [12]:
# Ensure the output folder is present (no error if it already exists)
import os

output_dir = "model"
os.makedirs(output_dir, exist_ok=True)

# Serialize the fitted XGBoost classifier to model/xgb_model.pkl
with open("model/xgb_model.pkl", "wb") as out_file:
    pickle.dump(xgb_model, out_file)

print("✅ XGBoost model saved as model/xgb_model.pkl")


✅ XGBoost model saved as model/xgb_model.pkl
