In [1]:
import numpy as np
import pandas as pd
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score

In [2]:
# Read the heart-disease records from disk. The path is relative to the
# notebook's working directory, so heart.csv must live in ../datasets/.
heart_dataset = pd.read_csv("../datasets/heart.csv")

# --- Preview: first rows of the frame (rich notebook rendering) ---
print("First 5 rows of the heart dataset:")
display(heart_dataset.head(5))

# --- Schema: column names, non-null counts and dtypes (info() prints directly) ---
print("\nDataset Information:")
heart_dataset.info()

# --- Descriptive statistics for every numeric column ---
print("\nStatistical Summary:")
display(heart_dataset.describe())

First 5 rows of the heart dataset:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1



Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB

Statistical Summary:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [3]:
# Inspect which distinct labels occur in the target column
target_classes = heart_dataset["target"].unique()
print("Unique values in target:", target_classes)

Unique values in target: [1 0]


In [4]:
# Fallback for a degenerate dataset whose target column holds fewer than two
# classes: append three hand-written records labelled with the class that is
# absent, so that the stratified split and the classifier further down see
# both classes. The feature values below are fixed literals, not derived from
# the data -- treat them as placeholders only.
present_classes = set(heart_dataset['target'].unique())
if len(present_classes) < 2:
    print("Only one target class found. Adding additional records for a balanced dataset.")
    # Label the synthetic rows with whichever class is missing. Hard-coding 0
    # would leave the dataset single-class when 0 is the class already present.
    missing_class = 1 if 0 in present_classes else 0
    additional_data = {
        "age":        [50, 55, 60],
        "sex":        [1, 0, 0],
        "cp":         [1, 2, 3],
        "trestbps":   [130, 140, 150],
        "chol":       [250, 240, 230],
        "fbs":        [0, 0, 0],
        "restecg":    [0, 1, 1],
        "thalach":    [150, 145, 160],
        "exang":      [0, 1, 0],
        "oldpeak":    [1.0, 2.0, 1.5],
        "slope":      [1, 2, 1],
        "ca":         [0, 0, 0],
        "thal":       [2, 3, 2],
        "target":     [missing_class] * 3
    }
    additional_heart_data = pd.DataFrame(additional_data)
    # ignore_index=True renumbers rows so the appended records continue the RangeIndex.
    heart_dataset = pd.concat([heart_dataset, additional_heart_data], ignore_index=True)
    print("After adding additional records, unique targets:", heart_dataset['target'].unique())

In [5]:
# The label column becomes y; every remaining column is a model feature (X).
y = heart_dataset["target"]
X = heart_dataset.drop(columns="target")

print("Features and target variable separated.")

Features and target variable separated.


In [6]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")

Training samples: 242, Testing samples: 61


In [7]:
# Standardise the features. The scaler learns its statistics from the training
# partition only and the same fitted transform is then applied to both partitions.
# NOTE: X_train / X_test are overwritten with their scaled versions here, so
# re-running this cell on its own would refit the scaler on already-scaled
# data -- re-run the split cell first.
scaler = StandardScaler().fit(X_train)
X_train, X_test = (scaler.transform(partition) for partition in (X_train, X_test))
print("Feature scaling applied successfully.")

Feature scaling applied successfully.


In [8]:
# Fit a linear-kernel support vector classifier on the scaled training data.
# fit() returns the estimator itself, so construction and training can be chained.
classifier = svm.SVC(kernel='linear').fit(X_train, y_train)
print("SVM model trained successfully.")

SVM model trained successfully.


In [9]:
# Predict on both partitions first, then score each set of predictions
# against its true labels.
y_train_pred, y_test_pred = classifier.predict(X_train), classifier.predict(X_test)

training_accuracy = accuracy_score(y_train, y_train_pred)
testing_accuracy = accuracy_score(y_test, y_test_pred)

# Report both figures; a large gap between them would hint at over-fitting.
print(f"Training Accuracy: {training_accuracy:.2f}")
print(f"Testing Accuracy: {testing_accuracy:.2f}")

Training Accuracy: 0.86
Testing Accuracy: 0.79


In [10]:
# Create a prediction function to use on new input data
def predict_heart(input_data, model=None, feature_scaler=None, feature_names=None):
    """
    Classify a single record as "High Risk" or "Low Risk".

    Parameters
    ----------
    input_data : a list (or similar structure) of raw, unscaled feature values
        (order of features: age, sex, cp, trestbps, chol, fbs, restecg,
        thalach, exang, oldpeak, slope, ca, thal)
    model : optional object exposing ``predict``; defaults to the notebook's
        trained ``classifier``.
    feature_scaler : optional object exposing ``transform``; defaults to the
        notebook's fitted ``scaler``.
    feature_names : optional sequence of column labels for ``input_data``;
        defaults to ``X.columns``.

    Returns
    -------
    str
        "High Risk" when the model predicts class 1, otherwise "Low Risk".

    The optional arguments let the same function be used with a model and
    scaler reloaded from disk instead of duplicating the logic; calling it
    with only ``input_data`` behaves exactly as before.
    """
    # Fall back to the notebook-level objects only when no override is given.
    model = classifier if model is None else model
    feature_scaler = scaler if feature_scaler is None else feature_scaler
    feature_names = X.columns if feature_names is None else feature_names

    # Wrap the record in a one-row DataFrame so the scaler receives the column labels
    input_frame = pd.DataFrame([input_data], columns=feature_names)
    input_scaled = feature_scaler.transform(input_frame)
    prediction = model.predict(input_scaled)
    return "High Risk" if prediction[0] == 1 else "Low Risk"

In [11]:
# Smoke-test the helper with the first row of the feature frame
sample_input = X.iloc[0].to_list()
sample_label = predict_heart(sample_input)
print("Sample Prediction:", sample_label)

Sample Prediction: High Risk


In [12]:
# Directory that receives the serialized artifacts; created on demand.
save_path = "../saved_models/"
os.makedirs(save_path, exist_ok=True)

# Persist the classifier and the scaler together: the scaler is needed at
# inference time to transform raw inputs the same way the training data was.
for filename, artifact in (("heart_model.sav", classifier), ("heart_scaler.sav", scaler)):
    with open(os.path.join(save_path, filename), "wb") as f:
        pickle.dump(artifact, f)

print("Heart model and scaler saved successfully.")

Heart model and scaler saved successfully.


In [13]:
# Test loading the saved model and scaler
def _load_pickled_artifact(path, success_message, failure_message):
    """
    Unpickle the object stored at ``path``.

    Prints ``success_message`` and returns the object on success. If the file
    is missing, truncated (EOFError) or not a valid pickle
    (pickle.UnpicklingError), prints ``failure_message`` and returns None.

    SECURITY: pickle.load can execute arbitrary code; only use this on files
    written by this notebook or another trusted source.
    """
    try:
        with open(path, "rb") as f:
            artifact = pickle.load(f)
    # UnpicklingError is what a corrupted file typically raises; without it the
    # "missing or corrupted" message would never be shown for corrupt data.
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        print(failure_message)
        return None
    print(success_message)
    return artifact


loaded_model = _load_pickled_artifact(
    os.path.join(save_path, "heart_model.sav"),
    "Heart model loaded successfully!",
    "Error: Heart model file is missing or corrupted.",
)

loaded_scaler = _load_pickled_artifact(
    os.path.join(save_path, "heart_scaler.sav"),
    "Scaler loaded successfully!",
    "Error: Heart scaler file is missing or corrupted.",
)

Heart model loaded successfully!
Scaler loaded successfully!


In [14]:
# Test a prediction after reloading the model
if loaded_model is None or loaded_scaler is None:
    # The loading step sets these to None on failure; report that instead of
    # failing with an AttributeError on None.
    print("Skipping reload check: the saved model or scaler could not be loaded.")
else:
    sample_input = X.iloc[5].tolist()  # Using a different record for testing
    # The scaler was fitted on a DataFrame, so pass the record with the same
    # column labels; a bare list would trigger a feature-name mismatch warning.
    sample_frame = pd.DataFrame([sample_input], columns=X.columns)
    # Scale the sample input and then predict
    sample_input_scaled = loaded_scaler.transform(sample_frame)
    prediction = loaded_model.predict(sample_input_scaled)
    result = "High Risk" if prediction[0] == 1 else "Low Risk"
    print("Prediction after reloading model:", result)

Prediction after reloading model: High Risk


