In [10]:
import warnings
warnings.simplefilter('ignore')

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix

In [11]:
data = pd.read_csv(r'C:\Users\HP\OneDrive\Desktop\Raw_data.csv')
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [12]:
data.shape


(1025, 14)

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [14]:
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [15]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [16]:
categorical_columns = []
numerical_columns = []

for col in data.columns:
    if data[col].dtype == 'object':
        categorical_columns.append(col)
    else:
        numerical_columns.append(col)

In [17]:
print("Numerical Features: ",numerical_columns)

print("\nCategorical Features: ",categorical_columns)

Numerical Features:  ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

Categorical Features:  []


In [18]:
discrete_features=[feature for feature in numerical_columns if len(data[feature].unique())<25]
continuous_features=[feature for feature in numerical_columns if feature not in discrete_features]

In [19]:
for feature in continuous_features:
    IQR = data[feature].quantile(0.75) - data[feature].quantile(0.25)
    lower_limit = data[feature].quantile(0.25) - (IQR * 3)
    upper_limit = data[feature].quantile(0.75) + (IQR * 3)
    data.loc[data[feature] >= upper_limit, feature] = upper_limit
    data.loc[data[feature] < lower_limit, feature] = lower_limit

In [20]:
x = data.drop('target',axis=1)

y = data["target"]

In [21]:
x

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2


In [22]:
y

0       0
1       0
2       0
3       0
4       0
       ..
1020    1
1021    0
1022    0
1023    1
1024    0
Name: target, Length: 1025, dtype: int64

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state = 20)

In [24]:
print(x.shape)
print(x_train.shape)
print(x_test.shape)
print()
print(y.shape)
print(y_train.shape)
print(y_test.shape)

(1025, 13)
(820, 13)
(205, 13)

(1025,)
(820,)
(205,)


In [25]:
scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)

x_test  = scaler.transform(x_test)

In [26]:
models={
    'Logistic Regression':LogisticRegression(),
    'Naive Bayes':GaussianNB(),
    'RFC':RandomForestClassifier(n_estimators=20, max_depth=10),
    'XG Boost':XGBClassifier(learning_rate=0.01, n_estimators=25, max_depth=15,gamma=0.6, subsample=0.52,colsample_bytree=0.6,seed=27, 
                    reg_lambda=2, booster='dart', colsample_bylevel=0.6, colsample_bynode=0.5),
    'K Nearest Neighbors':KNeighborsClassifier(n_neighbors=10),
    'Decision Tree':DecisionTreeClassifier(criterion = 'entropy',random_state=0,max_depth = 6),
    'Support Vector Machine':SVC(kernel='rbf', C=2)
}

In [27]:
model_list = []
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for model_name, model in models.items():

    model.fit(x_train, y_train)
    y_pred_train = model.predict(x_train)
    y_pred = model.predict(x_test)

    # Evaluate the model
    #train_accuracy = accuracy_score(x_train,y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred)

    print(f"Model: {model_name}")
    print("Testing Accuracy: ", test_accuracy)
    print("Precision: ",precision)
    print("Recall: ",recall)
    print("F1 Score: ",f1)
    print("Confusion Matrix:\n ",confusion_mat)


    model_list.append(model_name)
    accuracy_list.append(test_accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

    print("=" * 35)

max_accuracy_index = accuracy_list.index(max(accuracy_list))
print(f"The best model based on accuracy is {model_list[max_accuracy_index]} with Testing Accuracy: {accuracy_list[max_accuracy_index]}")

Model: Logistic Regression
Testing Accuracy:  0.8
Precision:  0.773109243697479
Recall:  0.8679245283018868
F1 Score:  0.8177777777777778
Confusion Matrix:
  [[72 27]
 [14 92]]
Model: Naive Bayes
Testing Accuracy:  0.8
Precision:  0.782608695652174
Recall:  0.8490566037735849
F1 Score:  0.8144796380090499
Confusion Matrix:
  [[74 25]
 [16 90]]
Model: RFC
Testing Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1 Score:  1.0
Confusion Matrix:
  [[ 99   0]
 [  0 106]]
Model: XG Boost
Testing Accuracy:  0.8341463414634146
Precision:  0.78125
Recall:  0.9433962264150944
F1 Score:  0.8547008547008547
Confusion Matrix:
  [[ 71  28]
 [  6 100]]
Model: K Nearest Neighbors
Testing Accuracy:  0.8731707317073171
Precision:  0.8703703703703703
Recall:  0.8867924528301887
F1 Score:  0.8785046728971964
Confusion Matrix:
  [[85 14]
 [12 94]]
Model: Decision Tree
Testing Accuracy:  0.9317073170731708
Precision:  0.8898305084745762
Recall:  0.9905660377358491
F1 Score:  0.9374999999999999
Confusion Matrix:

In [28]:
data = {
    'Model': model_list,
    'Testing Accuracy': accuracy_list,
    'Precision': precision_list,
    'Recall': recall_list,
    'F1 Score': f1_list
}

df = pd.DataFrame(data)

df

Unnamed: 0,Model,Testing Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.8,0.773109,0.867925,0.817778
1,Naive Bayes,0.8,0.782609,0.849057,0.81448
2,RFC,1.0,1.0,1.0,1.0
3,XG Boost,0.834146,0.78125,0.943396,0.854701
4,K Nearest Neighbors,0.873171,0.87037,0.886792,0.878505
5,Decision Tree,0.931707,0.889831,0.990566,0.9375
6,Support Vector Machine,0.965854,0.945946,0.990566,0.967742


In [29]:
best_model = models['RFC']
pickle.dump(best_model,open('heart.pkl','wb'))

In [30]:
heart=pickle.load(open("heart.pkl","rb"))
a=heart.predict([[52,1,0,125,212,0,1,168,0,1,2,2,3]])
print(a)

[0]
