

# Importing Required Libraries

In [1]:
# Libraries for Data Visualisation
import seaborn as sns

In [2]:
# Libraries for Data Structures
import pandas as pd
import numpy as np

In [3]:
# Libraries for Creating Pipelines
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

In [4]:
# Libraries for Data Pre-processing and Processing
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split,  GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [5]:
# Libraries For Model Formation
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier


In [6]:
# Libraries for Model Evaluation
from sklearn.model_selection import  cross_val_score
from sklearn.metrics import accuracy_score, classification_report


# Dataset Customisation

In [7]:
# Read the heart-disease CSV (path is relative to the notebook) into a DataFrame
raw_data = pd.read_csv(filepath_or_buffer='../Datasets/heart_disease_uci.csv')

# Show the (rows, columns) tuple of the loaded frame
raw_data.shape

(920, 16)

## ***Dataset Column Definitions***
    1. id (Unique id for each patient)
    2. age (Age of the patient in years)
    3. dataset (place of study; renamed to `Origin` below)
    4. sex (Male/Female)
    5. cp chest pain type ([typical angina, atypical angina, non-anginal, asymptomatic])
    6. trestbps (resting blood pressure, in mm Hg on admission to the hospital)
    7. chol (serum cholesterol in mg/dl)
    8. fbs (if fasting blood sugar > 120 mg/dl)
    9. restecg (resting electrocardiographic results)
    -- Values: [normal, stt abnormality, lv hypertrophy]
    10. thalch: maximum heart rate achieved
    11. exang: exercise-induced angina (True/ False)
    12. oldpeak: ST depression induced by exercise relative to rest
    13. slope: the slope of the peak exercise ST segment
    14. ca: number of major vessels (0-3) colored by fluoroscopy
    15. thal: [normal; fixed defect; reversible defect]
    16. num: the predicted attribute


In [8]:
# Print per-column non-null counts and dtypes to see which columns have missing values
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [11]:
# Frequency of each thal category (missing entries are not counted by default)
raw_data.thal.value_counts()

thal
normal               196
reversable defect    192
fixed defect          46
Name: count, dtype: int64

In [12]:
# Descriptive replacement names, listed in the same order as the columns of raw_data
new_column_names = [
    'Id', 'Age', 'Gender', 'Origin', 'Chest_Pain', 'Resting_Blood_Pressure',
    'Cholestral', 'Fasting_Blood_Sugar', 'Resting_ECG',
    'Maximum_Heart_Rate_Achieved', 'Exercise_Induced_Angina',
    'ST_Depression_Induced', 'Peak_Exercise_Slope', 'No_of_Major_Vessels',
    'Thalassemia', 'Target_Variable',
]

# Pair every current column with its new name by position and rename in place
column_mapping = dict(zip(raw_data.columns, new_column_names))
raw_data.rename(columns=column_mapping, inplace=True)

# Feature Engineering


## Dropping Non-Required Features

In [13]:
# Remove the row identifier and the study-site column from the frame in place
raw_data.drop(columns=['Id', 'Origin'], inplace=True)

## Converting Categorical Values into Numerical Codes

In [14]:
# Encode each categorical (string) column as integer codes.
# pd.factorize numbers categories in order of first appearance and codes
# missing entries as -1.
# NOTE(review): because missing values become -1 here, the later imputation
# step will not see them as missing for these columns - confirm this is intended.
categorical_columns = ['Gender', 'Chest_Pain', 'Resting_ECG', 'Peak_Exercise_Slope', 'Thalassemia']
for column in categorical_columns:
    raw_data[column] = pd.factorize(raw_data[column])[0]

# Collapse the target values 1-4 into a single positive class (1); 0 stays 0.
# Assign the result back instead of calling replace(..., inplace=True) on
# raw_data.Target_Variable: that chained in-place call acts on a temporary
# Series and does not update raw_data under pandas Copy-on-Write.
raw_data['Target_Variable'] = raw_data['Target_Variable'].replace([1, 2, 3, 4], 1)
raw_data['Target_Variable'].value_counts()

Target_Variable
1    509
0    411
Name: count, dtype: int64

In [15]:
# Frequency of each encoded Thalassemia code after factorization
raw_data.Thalassemia.value_counts()

Thalassemia
-1    486
 1    196
 2    192
 0     46
Name: count, dtype: int64

# Datasets Pre-processing and Processing


## ***DataSet Pre-Processing***

## Dataset Processing(Train-Test Splitting)

In [None]:
# Separate labels and features
X = raw_data.drop(['Target_Variable'], axis=1)
y = raw_data['Target_Variable']

# Split FIRST, so that nothing fitted below (scaler, imputer, SMOTE) ever sees
# the held-out rows. Fitting these on the full data and splitting afterwards
# leaks test-set information into training and inflates the reported score.
# stratify=y keeps the class ratio the same in both partitions.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling: learn mean/std from the training rows only, then apply the
# same transformation to the test rows. StandardScaler ignores NaNs when
# fitting and passes them through on transform, so imputation can follow.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Impute missing values with per-feature medians learned from the training rows
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train_scaled)
X_test = imputer.transform(X_test_scaled)

# Data augmentation: oversample the minority class in the TRAINING set only.
# The test set stays untouched so it contains real patients exclusively.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_imputed, y_train_raw)

# Model Training


## Testing Out Different Models layering on top of one another

In [None]:
# Random Forest Classifier: exhaustive grid search over tree count, depth and
# minimum split size, scored with 5-fold cross-validation on the training data.
rf_model = RandomForestClassifier(random_state=42)
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
rf_grid = GridSearchCV(rf_model, rf_params, cv=5)
rf_grid.fit(X_train, y_train)

# GridSearchCV refits the best parameter combination on all of X_train by
# default (refit=True), so best_estimator_ is already trained and needs no
# further fit() call before predicting.
best_rf_model = rf_grid.best_estimator_

# Cross-validated accuracy of the selected configuration on the training data
# (cross_val_score fits clones, leaving best_rf_model itself untouched).
cv_scores = cross_val_score(best_rf_model, X_train, y_train, cv=5)
print(f"{best_rf_model} CV Accuracy: {np.mean(cv_scores) * 100:.2f}%")

# Predict on the held-out test set
y_pred = best_rf_model.predict(X_test)

print("-------------------------------------------------------")
# Test accuracy: computed on y_test, i.e. data not used for fitting
accuracy = accuracy_score(y_test, y_pred)
print(f"{best_rf_model} Test Accuracy: {accuracy * 100:.2f}%")
print("-------------------------------------------------------")

# Classification report (per-class precision, recall and F1 on the test set)
print(f"{best_rf_model} Classification Report:\n{classification_report(y_test, y_pred)}")
print("-------------------------------------------------------")

In [None]:
# Disabled: optional persistence of the tuned model to disk with pickle.
# NOTE(review): if re-enabled, open the file in a `with` block so the handle is
# closed, and only unpickle files from trusted sources.
# import pickle


# filename = "heart.sav"
# pickle.dump(best_rf_model, open(filename, "wb"))