**1. Importing the Libraries**

In [2]:
import pandas as pd

**2. Importing the Dataset**

In [3]:
data = pd.read_csv('heart.csv')

**3. Taking care of missing values**

In [4]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

**4. Taking care of Duplicate values**

In [5]:
data_dup= data.duplicated().any()
data_dup

True

In [6]:
data= data.drop_duplicates()

In [7]:
data_dup= data.duplicated().any()
data_dup

False

**5. Data Processing**

In [8]:
# Partition the columns by cardinality: a column with at most 10 distinct
# values goes to cate_val, every other column goes to cont_val.
cate_val, cont_val = [], []

for column in data.columns:
    is_low_cardinality = data[column].nunique() <= 10
    (cate_val if is_low_cardinality else cont_val).append(column)

In [9]:
cate_val

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

In [10]:
cont_val

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

**6. Encoding Categorical Data**

In [11]:
data['cp'].unique()

array([0, 1, 2, 3], dtype=int64)

In [12]:
# Exclude 'sex' and 'target' from the columns that will be one-hot encoded.
# Building a filtered list (instead of calling list.remove twice) keeps this
# cell re-runnable: list.remove raises ValueError once the item is gone.
cate_val = [column for column in cate_val if column not in ('sex', 'target')]


In [13]:
data = pd.get_dummies(data,columns=cate_val,drop_first=True)

In [14]:
data.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3
0,52,1,125,212,168,1.0,0,False,False,False,...,False,False,True,False,True,False,False,False,False,True
1,53,1,140,203,155,3.1,0,False,False,False,...,True,False,False,False,False,False,False,False,False,True
2,70,1,145,174,125,2.6,0,False,False,False,...,True,False,False,False,False,False,False,False,False,True
3,61,1,148,203,161,0.0,0,False,False,False,...,False,False,True,True,False,False,False,False,False,True
4,62,0,138,294,106,1.9,0,False,False,False,...,False,True,False,False,False,True,False,False,True,False


**7. Feature Scaling**

In [15]:
data.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3
0,52,1,125,212,168,1.0,0,False,False,False,...,False,False,True,False,True,False,False,False,False,True
1,53,1,140,203,155,3.1,0,False,False,False,...,True,False,False,False,False,False,False,False,False,True
2,70,1,145,174,125,2.6,0,False,False,False,...,True,False,False,False,False,False,False,False,False,True
3,61,1,148,203,161,0.0,0,False,False,False,...,False,False,True,True,False,False,False,False,False,True
4,62,0,138,294,106,1.9,0,False,False,False,...,False,True,False,False,False,True,False,False,True,False


In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
# Standardise the continuous columns and write them back into `data`.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed later in this notebook -- confirm this is intended.
st = StandardScaler()
scaled_cont = st.fit_transform(data[cont_val])
data[cont_val] = scaled_cont

In [18]:
data.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3
0,-0.267966,1,-0.376556,-0.667728,0.806035,-0.037124,0,False,False,False,...,False,False,True,False,True,False,False,False,False,True
1,-0.15726,1,0.47891,-0.841918,0.237495,1.773958,0,False,False,False,...,True,False,False,False,False,False,False,False,False,True
2,1.724733,1,0.764066,-1.403197,-1.074521,1.342748,0,False,False,False,...,True,False,False,False,False,False,False,False,False,True
3,0.728383,1,0.935159,-0.841918,0.499898,-0.899544,0,False,False,False,...,False,False,True,True,False,False,False,False,False,True
4,0.839089,0,0.364848,0.919336,-1.905464,0.739054,0,False,False,False,...,False,True,False,False,False,True,False,False,True,False


**8. Splitting the Dataset into the Training set and Test set**

In [19]:
X = data.drop('target',axis=1)

In [20]:
y = data['target']

In [21]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state= 42)

In [23]:
y_test

245    1
349    0
135    0
389    1
66     1
      ..
402    1
123    1
739    0
274    1
256    1
Name: target, Length: 61, dtype: int64

In [24]:
data.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3
0,-0.267966,1,-0.376556,-0.667728,0.806035,-0.037124,0,False,False,False,...,False,False,True,False,True,False,False,False,False,True
1,-0.15726,1,0.47891,-0.841918,0.237495,1.773958,0,False,False,False,...,True,False,False,False,False,False,False,False,False,True
2,1.724733,1,0.764066,-1.403197,-1.074521,1.342748,0,False,False,False,...,True,False,False,False,False,False,False,False,False,True
3,0.728383,1,0.935159,-0.841918,0.499898,-0.899544,0,False,False,False,...,False,False,True,True,False,False,False,False,False,True
4,0.839089,0,0.364848,0.919336,-1.905464,0.739054,0,False,False,False,...,False,True,False,False,False,True,False,False,True,False


**9. Logistic Regression**

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
log = LogisticRegression()
log.fit(X_train,y_train)

In [27]:
y_pred1= log.predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
accuracy_score(y_test,y_pred1)

0.7868852459016393

**15. XGBoost**

In [30]:
import xgboost as xgb
from xgboost import XGBClassifier

In [31]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

In [32]:
y_pred7= xgb_model.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score

In [34]:
accuracy_score(y_test,y_pred7)

0.7868852459016393

**10. SVC (Support Vector Classifier)**

In [35]:
from sklearn import svm

In [36]:
# Import the class directly so this cell can be re-run. The previous form,
# `svm = svm.SVC()`, rebound the module name `svm` to the estimator, so a
# second execution failed with AttributeError ('SVC' object has no attribute
# 'SVC'). The variable keeps the name `svm` because later cells use it.
from sklearn.svm import SVC
svm = SVC()

In [37]:
svm.fit(X_train,y_train)

In [38]:
y_pred2= svm.predict(X_test)

In [39]:
accuracy_score(y_test,y_pred2)

0.8032786885245902

**11. K-Neighbors Classifier**

In [40]:
from sklearn.neighbors import KNeighborsClassifier

In [41]:
knn= KNeighborsClassifier()

In [42]:
knn.fit(X_train,y_train)

In [43]:
y_pred3 = knn.predict(X_test)

In [44]:
accuracy_score(y_test,y_pred3)

0.7377049180327869

In [45]:
# Test-set accuracy of a k-nearest-neighbours classifier for k = 1 .. 39.
# score[i] holds the accuracy obtained with n_neighbors = i + 1.
score = []

for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    score += [accuracy_score(y_test, y_pred)]

In [46]:
score

[0.7213114754098361,
 0.8032786885245902,
 0.7049180327868853,
 0.7049180327868853,
 0.7377049180327869,
 0.8032786885245902,
 0.7868852459016393,
 0.8032786885245902,
 0.7704918032786885,
 0.7540983606557377,
 0.7704918032786885,
 0.7540983606557377,
 0.7377049180327869,
 0.7377049180327869,
 0.7540983606557377,
 0.7704918032786885,
 0.7540983606557377,
 0.7540983606557377,
 0.7377049180327869,
 0.7540983606557377,
 0.7377049180327869,
 0.7213114754098361,
 0.7377049180327869,
 0.7377049180327869,
 0.7213114754098361,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869]

In [47]:
knn= KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,y_train)
y_pred= knn.predict(X_test)
accuracy_score(y_test,y_pred)

0.8032786885245902

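**Non-Linear ML Algorithms**

The models in this part are trained on the raw dataset: it is reloaded and de-duplicated below, without the one-hot encoding and feature scaling applied in the earlier sections.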

In [48]:
data = pd.read_csv('heart.csv')

In [49]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [50]:
data = data.drop_duplicates()

In [51]:
# `shape` is a tuple attribute, not a method; calling it raises TypeError.
data.shape

(302, 14)

In [52]:
X = data.drop('target',axis=1)
y = data['target']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state= 42)

**12. Decision Tree Classifier**

In [54]:
from sklearn.tree import DecisionTreeClassifier

In [55]:
# Seed the tree so the reported accuracy is reproducible between runs
# (same seed value as the train/test split).
dt = DecisionTreeClassifier(random_state=42)

In [56]:
dt.fit(X_train,y_train)

In [57]:
y_pred4= dt.predict(X_test)

In [58]:
accuracy_score(y_test,y_pred4)

0.7377049180327869

**13. Random Forest Classifier**

In [64]:
from sklearn.ensemble import RandomForestClassifier

In [65]:
# Hyper-parameter grid searched for the random forest.
param_grid = dict(
    n_estimators=[100, 200],
    max_features=['sqrt', 'log2', None],  # 'auto' removed
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)

In [66]:
# Seed the forest so the grid search picks the same best parameters and
# reports the same accuracy on every run (same seed as the train/test split).
rf = RandomForestClassifier(random_state=42)

In [67]:
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=3, error_score='raise')

In [69]:
# Run the grid search and report on the best model.
# The success path lives in the `else` clause of the try statement: the
# fitted attributes (best_params_, best_estimator_) only exist after a
# successful fit, so reading them after a failure would raise AttributeError
# (or silently reuse the results of an earlier successful fit).
try:
    grid_search.fit(X_train, y_train)
except Exception as e:
    print(f"An error occurred: {e}")
    print("GridSearchCV did not complete successfully.")
else:
    best_params = grid_search.best_params_
    best_rf = grid_search.best_estimator_

    print("Best Parameters: ", best_params)

    # Evaluate the best model on the test set; y_pred5 is reused by the
    # model-comparison table further down.
    y_pred5 = best_rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred5)
    print(f"Accuracy after hyperparameter tuning: {accuracy * 100:.2f}%")

    # Save the best model using joblib
    #joblib.dump(best_rf, 'best_model_joblib_heart_rf')

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best Parameters:  {'bootstrap': True, 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy after hyperparameter tuning: 85.25%


In [None]:
#rf.fit(X_train,y_train)

In [None]:
#y_pred5 = rf.predict(X_test)

In [None]:
#accuracy_score(y_test,y_pred5)

**14. Gradient Boosting Classifier**

In [70]:
from sklearn.ensemble import GradientBoostingClassifier

In [71]:
# Seed the booster so the reported accuracy is reproducible between runs
# (same seed value as the train/test split).
gbc = GradientBoostingClassifier(random_state=42)

In [72]:
gbc.fit(X_train,y_train)

In [73]:
y_pred6 = gbc.predict(X_test)

In [74]:
accuracy_score(y_test,y_pred6)

0.819672131147541

In [75]:
# Test-set predictions of every model, keyed by the label shown in the table.
# The KNN entry uses y_pred3, i.e. the classifier built with default settings,
# not the n_neighbors=2 variant evaluated afterwards.
model_predictions = {
    'LR': y_pred1,
    'SVM': y_pred2,
    'KNN': y_pred3,
    'DT': y_pred4,
    'RF': y_pred5,
    'GB': y_pred6,
    'XGB': y_pred7,
}

# One row per model: its label and its accuracy against y_test.
final_data = pd.DataFrame({
    'Models': list(model_predictions),
    'ACC': [accuracy_score(y_test, predictions)
            for predictions in model_predictions.values()],
})

In [76]:
final_data

Unnamed: 0,Models,ACC
0,LR,0.786885
1,SVM,0.803279
2,KNN,0.737705
3,DT,0.737705
4,RF,0.852459
5,GB,0.819672
6,XGB,0.786885


In [None]:
import seaborn as sns

In [None]:
!pip install seaborn

In [None]:
import seaborn as sns


In [None]:
sns.barplot(x=final_data['Models'], y=final_data['ACC'])


In [None]:
X = data.drop('target',axis=1)
y = data['target']

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Final model, fitted on the complete dataset (X, y). Seeded so that the
# model saved to disk afterwards is the same on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X,y)

**PREDICTION ON NEW DATA**

In [None]:
import pandas as pd

In [None]:
# A single example patient, one value per feature column, in the same
# column order as the training frame.
sample_patient = dict(
    age=52,
    sex=1,
    cp=0,
    trestbps=125,
    chol=212,
    fbs=0,
    restecg=1,
    thalach=168,
    exang=0,
    oldpeak=1.0,
    slope=2,
    ca=2,
    thal=3,
)

# index=[0] is required because every value is a scalar.
new_data = pd.DataFrame(sample_patient, index=[0])

In [None]:
new_data

In [None]:
# Classify the example patient and print a readable verdict
# (class 0 -> first message, any other class -> second message).
p = rf.predict(new_data)
verdict = ('No probability of having heart disease' if p[0] == 0
           else 'Do have probability of heart disease')
print(verdict)

**SAVING MODEL USING JOBLIB**

In [None]:
import joblib

In [None]:
joblib.dump(rf,'model_joblib_heart')

In [None]:
model = joblib.load('model_joblib_heart')

In [None]:
model.predict(new_data)

**GUI for our model**

In [None]:
from tkinter import *
import joblib

In [None]:
def show_entry_fields_ans():
    """Read the 13 form fields, run the saved model and show the verdict.

    Callback of the 'Predict' button. It reads the module-level Entry widgets
    e1..e13 (in feature order), loads the model stored in
    'model_joblib_heart' and writes the outcome into a single label placed
    at grid row 31 of `master`.

    Changes compared with the previous version:
    * the result label is created once and then updated, instead of a new
      Label being stacked on the same grid cell at every click;
    * a non-numeric or empty field shows a message in the window instead of
      raising ValueError inside the Tk callback;
    * the features are passed as a DataFrame with named columns rather than
      a bare list, and the prediction is read with result[0].
    """
    # Feature names in the order of the entry widgets e1..e13.
    feature_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                     'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
    entries = [e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13]

    try:
        values = [float(entry.get()) for entry in entries]
    except ValueError:
        # float() rejects empty or non-numeric text.
        message = "Please enter a numeric value in every field"
    else:
        model = joblib.load('model_joblib_heart')
        features = pd.DataFrame([values], columns=feature_names)
        result = model.predict(features)
        if result[0] == 0:
            message = "No Heart Disease"
        else:
            message = "Possibility of Heart Disease"

    # Reuse one label across clicks; it is stored on the function object so
    # that no additional module-level name is needed.
    result_label = getattr(show_entry_fields_ans, "result_label", None)
    if result_label is None:
        result_label = Label(master)
        result_label.grid(row=31)
        show_entry_fields_ans.result_label = result_label
    result_label.config(text=message)



# Build the main window of the prediction form.
master = Tk()
master.title("Heart Disease Prediction System")

# Banner across both columns. `.grid()` returns None, so `label` is None.
label = Label(master,text="Heart Disease Prediction system",
             bg="black",fg="white").grid(row=0,columnspan=2)

# Prompt texts, one per input field, in the order of rows 1..13.
field_prompts = [
    "Enter your Age",
    "Male or Female[1/0]",
    "Enter value of CP",
    "Enter value of trestbps",
    "Enter value of chol",
    "Enter value of fbs",
    "Enter value of restecg",
    "Enter value of thalach",
    "Enter value of exang",
    "Enter value of oldpeak",
    "Enter value of slope",
    "Enter value of ca",
    "Enter value of thal",
]

# Column 0: one prompt label per row.
for row, prompt in enumerate(field_prompts, start=1):
    Label(master, text=prompt).grid(row=row)

# Column 1: one entry widget per prompt. The individual names e1..e13 are
# kept because the Predict callback reads them.
entries = [Entry(master) for _ in range(len(field_prompts))]
e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13 = entries

for row, entry in enumerate(entries, start=1):
    entry.grid(row=row, column=1)

Button(master,text='Predict',command=show_entry_fields_ans).grid()

# Hand control to Tk; returns when the window is closed.
mainloop()
