# Import all the important libraries

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
# from pandas_profiling import ProfileReport 
# profile=ProfileReport(df, title="Heart Data Details", explorative=True)
# profile.to_file("heart_report.html")

In [None]:
# Load the heart-disease table from the attached Kaggle dataset into a DataFrame.
DATA_PATH = "../input/heart-disease-uci/heart.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)

**Important Information about all columns:-**


* age: The person’s age in years
* sex: The person’s sex (1 = male, 0 = female)
* cp: chest pain type

— Value 0: asymptomatic (the "silent killer")

— Value 1: atypical angina (caused when the heart muscle doesn't get an adequate supply of oxygenated blood)

— Value 2: non-anginal pain (chest pain that is not caused by heart disease or a heart attack, e.g. a problem with the esophagus such as gastroesophageal reflux disease)

— Value 3: typical angina (substernal chest pain precipitated by physical exertion or emotional stress)

* trestbps: The person’s resting blood pressure (mm Hg on admission to the hospital)
* chol: The person’s cholesterol measurement in mg/dl
* fbs: The person’s fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)
* restecg: resting electrocardiographic results

— Value 0: showing probable or definite left ventricular hypertrophy by Estes’ criteria

— Value 1: normal

— Value 2: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)

* thalach: The person’s maximum heart rate achieved
* exang: Exercise induced angina (1 = yes; 0 = no)
* oldpeak: ST depression induced by exercise relative to rest (‘ST’ relates to positions on the ECG plot. See more here)
* slope: the slope of the peak exercise ST segment — 0: downsloping; 1: flat; 2: upsloping
* ca: The number of major vessels (0–3)
* thal: A blood disorder called thalassemia

Value 0: NULL (dropped from the dataset previously)

Value 1: fixed defect (no blood flow in some part of the heart)

Value 2: normal blood flow

Value 3: reversible defect (a blood flow is observed but it is not normal)

* target: Heart disease (1 = no, 0 = yes)

In [None]:
# Dimensions of the loaded table as (number of rows, number of columns).
df.shape

In [None]:
# Visualise missing values, if any exist: every cell of the heatmap is one
# table entry, coloured by whether that entry is null.
plt.figure(figsize=(22, 10))
plt.xticks(size=20, color='grey')
plt.tick_params(size=12, color='grey')
plt.title('Finding Null Values Using Heatmap\n', color='grey', size=30)

null_mask = df.isnull()
sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='PuBu_r')

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Per-column check: True when every value in that column is truthy (non-zero).
# NOTE(review): this flags columns containing zeros, not missing data — confirm
# this is the intended check (df.info() / df.describe() may have been meant).
df.all()

In [None]:
# Class balance of the label: one bar per value of `target`.
# The column is passed by keyword because newer seaborn releases no longer
# accept it as a positional argument.
sns.countplot(x='target', data=df)

In [None]:
# One figure per feature: its distribution drawn separately for each value of
# `target`, so the two classes can be compared.
# `target` itself is skipped: split by its own value, each group is a single
# constant, which carries no information and breaks the density estimate.
for column in df.columns.drop("target"):
    g = sns.FacetGrid(df, hue="target")
    # histplot with a KDE overlay replaces the deprecated distplot;
    # stat="density" keeps the normalised y-axis that distplot produced.
    g.map(sns.histplot, column, kde=True, stat="density")
    g.add_legend()  # without a legend the two hues cannot be told apart

In [None]:
# sns.pairplot(data=df,hue="target")

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Work on a copy so the loaded table is left untouched.
data = df.copy()
X = data.iloc[:, 0:13]   # the first 13 columns are the predictors
Y = data.iloc[:, -1]     # the last column is the label

# Score every predictor against the label with the chi-squared statistic.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, Y)

# One row per predictor: its name next to its chi-squared score.
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})
print(featureScores.nlargest(12, 'Score'))   # the twelve highest-scoring predictors

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Rank predictors by the impurity-based importances of an extremely-randomised
# trees ensemble. The trees are built with random splits, so the seed is fixed
# to make the ranking identical on every run.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, Y)
print(model.feature_importances_)  # raw importances, in the column order of X

# Horizontal bar chart of every predictor's importance, labelled by column name.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
ax = feat_importances.nlargest(len(feat_importances)).plot(kind='barh')
ax.set_xlabel("importance")
ax.set_ylabel("feature")
ax.set_title("ExtraTrees feature importances")
plt.show()

In [None]:
# Pairwise correlation between all columns (pandas' default method), drawn as
# a heatmap on an explicitly sized axes.
corrmat = df.corr()
f, ax = plt.subplots(figsize=(9, 8))
heatmap_options = dict(cmap="YlGnBu", linewidths=0.1)
sns.heatmap(corrmat, ax=ax, **heatmap_options)

In [None]:
# Pairwise Kendall rank correlation between all columns, shown as a table.
df.corr(method ='kendall')

In [None]:
# col=['age','sex']#,'exang','ca',"oldpeak","thal"
# df.drop(col,axis=1,inplace=True)

In [None]:
# for i in df:
#     plt.figure()
#     df.boxplot(i)

In [None]:
# Split the table positionally: every column except the last is a feature and
# the last column is the label. Both are taken as plain NumPy arrays.
x = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so every model below is compared on the same
# split across runs; stratify=y keeps the class ratio equal in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise the features (zero mean, unit variance). The scaler is fitted on
# the training rows only, so no statistics of the held-out rows leak into
# training, and the same transform is then applied to the held-out rows.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)


# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The forest is built from random
# bootstrap samples and feature subsets, so the seed is fixed to make the
# reported scores reproducible.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(x_train, y_train)
y_pred2 = rfc.predict(x_test)

print(accuracy_score(y_test, y_pred2))   # held-out accuracy
print(rfc.score(x_test, y_test))         # same metric via the estimator API
print(rfc.score(x_train, y_train))       # training accuracy, to spot overfitting

In [None]:
# Confusion matrix of the random-forest predictions on the held-out rows
# (rows: actual class, columns: predicted class), drawn with cell counts.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred2)
sns.heatmap(data=cf_matrix, annot=True)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression. The solver's default limit of 100 iterations can stop
# before convergence, and the resulting ConvergenceWarning would be hidden by
# the notebook's global warnings filter, so the limit is raised explicitly.
LogR = LogisticRegression(max_iter=1000)
LogR.fit(x_train, y_train)
y_pred3 = LogR.predict(x_test)

print(accuracy_score(y_test, y_pred3))   # held-out accuracy
print(LogR.score(x_test, y_test))        # same metric via the estimator API
print(LogR.score(x_train, y_train))      # training accuracy, to spot overfitting

In [None]:
# Confusion matrix of the logistic-regression predictions on the held-out rows
# (rows: actual class, columns: predicted class), drawn with cell counts.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred3)
sns.heatmap(data=cf_matrix, annot=True)

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes: fit on the training rows, then predict the held-out rows.
gnb = GaussianNB()
gnb.fit(X=x_train, y=y_train)
y_pred4 = gnb.predict(x_test)

# Held-out accuracy (computed two equivalent ways), then training accuracy.
holdout_accuracy = accuracy_score(y_test, y_pred4)
print(holdout_accuracy)
print(gnb.score(x_test, y_test))
print(gnb.score(x_train, y_train))

In [None]:
# Confusion matrix of the Naive Bayes predictions on the held-out rows
# (rows: actual class, columns: predicted class), drawn with cell counts.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred4)
sns.heatmap(data=cf_matrix, annot=True)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training rows.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X=x_train, y=y_train)
y_pred5 = knn_model.predict(x_test)

# Held-out accuracy (computed two equivalent ways), then training accuracy.
holdout_accuracy = accuracy_score(y_test, y_pred5)
print(holdout_accuracy)
print(knn_model.score(x_test, y_test))
print(knn_model.score(x_train, y_train))

In [None]:
# Confusion matrix of the k-nearest-neighbours predictions on the held-out rows
# (rows: actual class, columns: predicted class), drawn with cell counts.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred5)
sns.heatmap(data=cf_matrix, annot=True)

# SVM

In [None]:
from sklearn import svm

# Linear support-vector classifier.
# dual=False selects the primal solver, which scikit-learn recommends when there
# are more samples than features; the default dual solver can hit its iteration
# limit without converging, and that warning would be hidden by the notebook's
# global warnings filter.
# NOTE(review): assumes the table has more rows than feature columns — confirm
# against df.shape.
support = svm.LinearSVC(dual=False, random_state=20)
support.fit(x_train, y_train)
y_pred6 = support.predict(x_test)

print(accuracy_score(y_test, y_pred6))    # held-out accuracy
print(support.score(x_test, y_test))      # same metric via the estimator API
print(support.score(x_train, y_train))    # training accuracy, to spot overfitting

In [None]:
# Confusion matrix of the linear SVM predictions on the held-out rows
# (rows: actual class, columns: predicted class), drawn with cell counts.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred6)
sns.heatmap(data=cf_matrix, annot=True)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree that splits on information gain (entropy), with a fixed
# seed so the fitted tree is the same on every run.
DTclassifier = DecisionTreeClassifier(random_state=0, criterion='entropy')
DTclassifier.fit(X=x_train, y=y_train)
y_pred7 = DTclassifier.predict(x_test)

# Held-out accuracy (computed two equivalent ways), then training accuracy.
holdout_accuracy = accuracy_score(y_test, y_pred7)
print(holdout_accuracy)
print(DTclassifier.score(x_test, y_test))
print(DTclassifier.score(x_train, y_train))

In [None]:
# Confusion matrix of the decision-tree predictions on the held-out rows
# (rows: actual class, columns: predicted class), drawn with cell counts.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred7)
sns.heatmap(data=cf_matrix, annot=True)

# SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent. The training rows
# are shuffled between epochs, so the seed is fixed to make the fitted model and
# its scores reproducible.
SGDModel = SGDClassifier(random_state=42)
SGDModel.fit(x_train, y_train)
y_pred8 = SGDModel.predict(x_test)

print(accuracy_score(y_test, y_pred8))     # held-out accuracy
print(SGDModel.score(x_test, y_test))      # same metric via the estimator API
print(SGDModel.score(x_train, y_train))    # training accuracy, to spot overfitting

In [None]:
# Confusion matrix of the SGD classifier predictions on the held-out rows
# (rows: actual class, columns: predicted class), drawn with cell counts.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred8)
sns.heatmap(data=cf_matrix, annot=True)

# Accuracy_score Analysis

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Compare the held-out accuracy of every fitted model in one bar chart.
# Dedicated names are used for the labels and scores so that the feature matrix
# `x` and label vector `y` (inputs to train_test_split) are not overwritten and
# the earlier cells remain re-runnable.
model_names = ["RFC", "LR", "NB", "KNN", "SVM", "DT", "SGD"]
model_predictions = [y_pred2, y_pred3, y_pred4, y_pred5, y_pred6, y_pred7, y_pred8]
accuracy_pct = [accuracy_score(y_test, pred) * 100 for pred in model_predictions]

fig = plt.figure(figsize=(10, 5))
plt.barh(model_names, accuracy_pct, color="royalblue", alpha=0.7)
plt.grid(color='#95a5a6', linestyle='--', linewidth=2, alpha=0.7)
plt.ylabel("Algorithms")
plt.xlabel("accuracy_score (%)")  # values are scaled to percentages above
plt.title("All algorithms accuracy analysis")
plt.show()

# Precision_Score Analysis

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Compare the held-out precision of every fitted model in one bar chart.
# Dedicated names are used for the labels and scores so that the feature matrix
# `x` and label vector `y` (inputs to train_test_split) are not overwritten and
# the earlier cells remain re-runnable.
model_names = ["RFC", "LR", "NB", "KNN", "SVM", "DT", "SGD"]
model_predictions = [y_pred2, y_pred3, y_pred4, y_pred5, y_pred6, y_pred7, y_pred8]
precision_pct = [precision_score(y_test, pred) * 100 for pred in model_predictions]

fig = plt.figure(figsize=(10, 5))
plt.barh(model_names, precision_pct, color="red", alpha=0.7)
plt.grid(color='#95a5a6', linestyle='--', linewidth=2, alpha=0.7)
plt.ylabel("Algorithms")
plt.xlabel("precision_score (%)")  # values are scaled to percentages above
plt.title("All algorithms Precision analysis")
plt.show()

# Recall_Score Analysis

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Compare the held-out recall of every fitted model in one bar chart.
# Dedicated names are used for the labels and scores so that the feature matrix
# `x` and label vector `y` (inputs to train_test_split) are not overwritten and
# the earlier cells remain re-runnable.
model_names = ["RFC", "LR", "NB", "KNN", "SVM", "DT", "SGD"]
model_predictions = [y_pred2, y_pred3, y_pred4, y_pred5, y_pred6, y_pred7, y_pred8]
recall_pct = [recall_score(y_test, pred) * 100 for pred in model_predictions]

fig = plt.figure(figsize=(10, 5))
plt.barh(model_names, recall_pct, color="green", alpha=0.7)
plt.grid(color='#95a5a6', linestyle='--', linewidth=2, alpha=0.7)
plt.ylabel("Algorithms")
plt.xlabel("recall_score (%)")  # values are scaled to percentages above
plt.title("All algorithms recall analysis")
plt.show()

# F1_Score Analysis

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Compare the held-out F1 score of every fitted model in one bar chart.
# Dedicated names are used for the labels and scores so that the feature matrix
# `x` and label vector `y` (inputs to train_test_split) are not overwritten and
# the earlier cells remain re-runnable.
model_names = ["RFC", "LR", "NB", "KNN", "SVM", "DT", "SGD"]
model_predictions = [y_pred2, y_pred3, y_pred4, y_pred5, y_pred6, y_pred7, y_pred8]
f1_pct = [f1_score(y_test, pred) * 100 for pred in model_predictions]

fig = plt.figure(figsize=(10, 5))
plt.barh(model_names, f1_pct, color="purple", alpha=0.7)
plt.grid(color='#95a5a6', linestyle='--', linewidth=2, alpha=0.7)
plt.ylabel("Algorithms")
plt.xlabel("f1_score (%)")  # values are scaled to percentages above
plt.title("All algorithms f1-score analysis")
plt.show()

# Model Dump and Download

In [None]:
# import pickle
# pickle.dump(rfc,open("heartPKL.pkl",'wb'))
# model=pickle.load(open("heartPKL.pkl",'rb'))

In [None]:
# print(model.predict([[3,145,233,1,0,150,2.3,0,1]]))

In [None]:
# from sklearn.model_selection import GridSearchCV
# parameters={
#     "n_estimators" : [10, 50, 100, 200],
#     "max_depth" : [3, 10, 20, 40]
# }
# cv = GridSearchCV(rfc,parameters,cv=5)
# cv.fit(x_train,y_train)


In [None]:
# def display(results):
#     print(f'Best parameters are: {results.best_params_}')
#     print("\n")
#     mean_score = results.cv_results_['mean_test_score']
#     std_score = results.cv_results_['std_test_score']
#     params = results.cv_results_['params']
#     for mean,std,params in zip(mean_score,std_score,params):
#         print(f'{round(mean,3)} + or -{round(std,3)} for the {params}')

In [None]:
# display(cv)

In [None]:
#https://towardsdatascience.com/heart-disease-uci-diagnosis-prediction-b1943ee835a7#:~:text=trestbps%3A%20The%20person's%20resting%20blood,%3D%20true%3B%200%20%3D%20false)