In [None]:
# Numerical and tabular data libraries.
import numpy as np
import pandas as pd
# Static plotting; the magic below renders matplotlib figures inline in the notebook.
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Apply seaborn's default theme and remap matplotlib's shorthand colour codes to it.
sns.set(color_codes=True)
# Interactive plotting; plotly figures are rendered with the 'notebook' renderer.
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation and convergence warnings from the modelling cells.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the heart-disease table from the Kaggle input directory and preview it.
heart_csv = '../input/heart-disease-uci/heart.csv'
df = pd.read_csv(heart_csv)
df.head(5)

## Attribute Information

age<br>
sex(1 = male; 0 = female)<br>
cp:chest pain type (4 values)<br>
trestbps:resting blood pressure (in mm Hg on admission to the hospital)<br>
chol:serum cholesterol in mg/dl<br>
fbs:fasting blood sugar > 120 mg/dl(1 = true; 0 = false)<br>
restecg:resting electrocardiographic results (values 0,1,2)<br>
thalach:maximum heart rate achieved<br>
exang:exercise induced angina(1 = yes; 0 = no)<br>
oldpeak = ST depression induced by exercise relative to rest<br>
slope:the slope of the peak exercise ST segment<br>
ca:number of major vessels (0-3) colored by fluoroscopy<br>
thal: 3 = normal; 6 = fixed defect; 7 = reversible defect<br>
target - have disease or not (1=yes, 0=no)



### Types of chest pain(cp)
Value 0: typical angina<br>
Value 1: atypical angina<br>
Value 2: non-anginal pain<br>
Value 3: asymptomatic<br>

### Types of restecg
restecg: resting electrocardiographic results<br>
Value 0: normal<br>
Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)<br>
Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria<br>

### Types of slope
0: upsloping<br>
1: flat<br>
2: downsloping<br>


**Name of columns**

In [None]:
# Column labels of the dataset as a NumPy array.
df.columns.to_numpy()

In [None]:
# Number of rows per class of the `target` column.
df['target'].value_counts()

In [None]:
# Dimensions of the dataset as (rows, columns).
df.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Summary statistics, transposed so that each feature is a row.
df.describe().transpose()

### Looking for correlation

In [None]:
# Pairwise correlation of all columns, then each feature's correlation with
# `target`, ordered from most positive to most negative.
corr_matrix = df.corr()
corr_matrix.loc[:, 'target'].sort_values(ascending=False)

In [None]:
# Bar chart of how many rows fall in each `target` class.
sns.countplot(x='target', data=df);

In [None]:
# Share of each `target` class. The slice labels are derived from the
# value_counts index instead of being hard-coded, so they stay attached to
# the right slice regardless of which class is more frequent.
target_counts = df['target'].value_counts()
target_labels = list(target_counts.index.map({1: 'Disease', 0: 'No Disease'}))
plt.pie(x=target_counts, labels=target_labels, autopct='%1.1f%%', shadow=True);
plt.legend(loc='best');

From the plot above, 54.5% of the patients in the dataset have heart disease.

**Now let's see, within the target column, how many males and females are suffering from heart disease**

In [None]:
# Count of patients per `target` class, split by `sex`.
# The x-axis is `target` and the hue is `sex`, so the axis label describes the
# target classes and the legend names the sexes (hue levels 0 and 1, in order).
sns.countplot(data=df,x='target',hue='sex');
plt.title('Heart Disease Frequency for Sex');
plt.xlabel('Target (0 = No Disease, 1 = Disease)');
plt.xticks(rotation=0);
plt.legend(['Female', 'Male']);
plt.ylabel('Frequency');

Clearly, in both cases (Disease and No Disease), males outnumber females.

**Handling Missing Values**

In [None]:
# Number of missing values in each column.
df.isna().sum()

Therefore, there are no missing values in the dataset.

In [None]:
# Histogram of patient ages (no density curve).
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot` once the environment's seaborn version is confirmed.
ages = df['age']
sns.distplot(ages, kde=False);

In [None]:
# Number of patients at each distinct age, most frequent first.
df.age.value_counts()

Therefore, the 55-60 age group contains the most patients.

In [None]:
# Cross-tabulate age against target and draw one pair of bars per age.
age_by_target = pd.crosstab(df['age'], df['target'])
age_by_target.plot(kind='bar', figsize=(20,6));
plt.title('Heart Disease Frequency for Ages');
plt.xlabel('Age');
plt.ylabel('Frequency');
# Crosstab columns are ordered 0, 1, hence this label order.
plt.legend(["Haven't Disease", "Have Disease"]);

In [None]:
# Mean age per chest-pain type, split by target class.
plt.figure(figsize=(20,6))
sns.barplot(data=df, x='cp', y='age', hue='target');
plt.legend(loc='upper right');

In [None]:
# Interactive scatter of maximum heart rate against age, coloured by target.
px.scatter(
    data_frame=df,
    x='age',
    y='thalach',
    color='target',
    title='Distribution of Max Heart Rate over Age',
)

In [None]:
# Disease / no-disease counts for each chest-pain type, as grouped bars.
cp_by_target = pd.crosstab(df['cp'], df['target'])
bar_colors = ['#11A5AA','#AA1190' ]
cp_by_target.plot(kind="bar", figsize=(15,6), color=bar_colors)
plt.title('Heart Disease Frequency According To Chest Pain Type')
plt.xlabel('Chest Pain Type')
plt.ylabel('Frequency of Disease or Not')
plt.xticks(rotation = 0)
plt.show()

Patients with "typical angina" chest pain are mostly free of heart disease.
Patients with "non-anginal pain" are more prone to heart disease.

In [None]:
# Box plots of resting blood pressure, cholesterol and maximum heart rate.
box_columns = ['trestbps', 'chol', 'thalach']
data = df[box_columns]
px.box(data_frame=data)

In [None]:
# Interactive scatter of cholesterol against age.
px.scatter(df, x='age', y='chol')

In [None]:
# Age distribution per target class: a box plot with the individual
# observations overlaid as a swarm.
plt.figure(figsize=(10,7))
sns.boxplot(data=df, x='target', y='age')
sns.swarmplot(data=df, x='target', y='age', palette="Pastel1")

In [None]:
# Scatter-matrix of every pair of columns in the dataset.
sns.pairplot(df)

### Feature Importance

In [None]:
# Estimate feature importances with an extremely-randomised-trees ensemble.
# `target` is a binary class label, so the classifier variant is used rather
# than the regressor, and the seed is fixed so the importances are reproducible.
from sklearn.ensemble import ExtraTreesClassifier
X = df.drop('target',axis=1)
Y = df['target']
model = ExtraTreesClassifier(random_state=42)
model.fit(X,Y)
print(model.feature_importances_)

In [None]:
# Horizontal bar chart of the eight features with the highest importance.
feat_import = pd.Series(data=model.feature_importances_, index=X.columns)
top_features = feat_import.nlargest(8)
top_features.plot(kind='barh')
plt.show()

Therefore, chest pain (cp) and number of major vessels (ca) are the most important features.

## Building a Model

In [None]:
# Feature matrix (every column except `target`) and label vector as NumPy arrays.
X = df.drop(columns='target').to_numpy()
Y = df['target'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
# Cross-validated accuracies (in percent) are appended here by the model cells below.
score = []

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Logistic regression baseline, scored by mean 10-fold CV accuracy on the training set.
# The features are not scaled, so the solver's default iteration cap of 100 is
# typically too low to converge (and the warning is hidden by the global
# warnings filter); a higher cap lets the optimiser finish.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train,y_train)
s1 = np.mean(cross_val_score(lr,X_train,y_train,scoring='accuracy',cv=10))
score.append(s1*100)

In [None]:
# Decision tree, scored by mean 10-fold CV accuracy on the training set.
# A fixed random_state makes the tree (and therefore the score) reproducible
# across kernel restarts.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)
s2 = np.mean(cross_val_score(dt,X_train,y_train,scoring='accuracy',cv=10))
score.append(s2*100)

In [None]:
# Random forest with 300 trees, scored by mean 10-fold CV accuracy on the training set.
# A fixed random_state makes the forest (and therefore the score) reproducible
# across kernel restarts.
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train,y_train)
s3 = np.mean(cross_val_score(rf,X_train,y_train,scoring='accuracy',cv=10))
score.append(s3*100)

In [None]:
# Support-vector classifier with default settings, scored by mean 10-fold CV
# accuracy on the training set.
svc = SVC()
svc.fit(X_train, y_train)
svc_fold_scores = cross_val_score(svc, X_train, y_train, scoring='accuracy', cv=10)
s4 = np.mean(svc_fold_scores)
score.append(s4*100)

In [None]:
# k-nearest-neighbours classifier with default settings, scored by mean
# 10-fold CV accuracy on the training set.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_fold_scores = cross_val_score(knn, X_train, y_train, scoring='accuracy', cv=10)
s5 = np.mean(knn_fold_scores)
score.append(s5*100)

In [None]:
# Gaussian naive Bayes, scored by mean 10-fold CV accuracy on the training set.
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_fold_scores = cross_val_score(nb, X_train, y_train, scoring='accuracy', cv=10)
s6 = np.mean(nb_fold_scores)
score.append(s6*100)

In [None]:
# Report each model's cross-validated accuracy; names are listed in the same
# order in which the scores were appended to `score`.
models = [
    'LogisticRegression',
    'DecisionTreeClassifier',
    'RandomForestClassifier',
    'SVC',
    'KNeighborsClassifier',
    'GaussianNB',
]
for i, model_name in enumerate(models):
    print('The Accuracy Score for', model_name, 'is', score[i])

**According to the above results, we can choose either Logistic Regression or the Random Forest Classifier**

**Let's perform hyperparameter tuning to increase accuracy**

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for tuning the random forest.
parameters = {
    'n_estimators':range(10,500,10),
    'criterion': ('gini','entropy'),
    # 'auto' is omitted: for forest classifiers it was an alias of 'sqrt' and
    # has been removed from recent scikit-learn releases, where it makes the
    # search fail with an invalid-parameter error.
    'max_features':('sqrt','log2'),
    'min_samples_split':[2,5,10],
    'min_samples_leaf':[1,2,4],
    'bootstrap': [True,False]
}
# Sample 50 parameter combinations and score each by 5-fold CV accuracy.
grid = RandomizedSearchCV(rf,param_distributions=parameters,scoring='accuracy',cv=5,verbose=0,n_iter=50,random_state=42,n_jobs=1)
grid.fit(X_train,y_train)

In [None]:
# Parameter combination with the best mean cross-validated accuracy in the search.
grid.best_params_

In [None]:
# Mean cross-validated accuracy achieved by the best parameter combination.
grid.best_score_

## Building an Artificial Neural Network (ANN)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout

In [None]:
# Feed-forward binary classifier: four ReLU hidden layers of increasing width
# followed by a single sigmoid output unit.
# The input width is taken from the training matrix instead of being
# hard-coded, so the network keeps working if the feature set changes.
# Note: this rebinds `model`, which earlier held the feature-importance estimator.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(units=32,kernel_initializer='uniform',activation='relu',input_dim=n_features))
model.add(Dense(64,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

In [None]:
# Binary cross-entropy loss with the Adam optimiser; accuracy is tracked per epoch.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 300 epochs in mini-batches of 10, evaluating on the held-out
# split after every epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=300,
    batch_size=10,
)

In [None]:
# Per-epoch training history (one column per tracked quantity) as a DataFrame.
training_history = model.history.history
metrics = pd.DataFrame(data=training_history)

In [None]:
# Loss and accuracy of the trained network on the held-out split.
model.evaluate(x=X_test, y=y_test)

In [None]:
# Training versus validation loss across epochs.
loss_columns = ['loss', 'val_loss']
metrics[loss_columns].plot()

In [None]:
# Training versus validation accuracy across epochs.
accuracy_columns = ['accuracy', 'val_accuracy']
metrics[accuracy_columns].plot()

### Predicting the Test set results

In [None]:
# Predicted class labels for the held-out split.
# `Sequential.predict_classes` no longer exists in current TensorFlow releases;
# for a single sigmoid output the equivalent is to threshold the predicted
# probability at 0.5.
pred = (model.predict(X_test) > 0.5).astype('int32')
pred

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

# Confusion matrix of the network's predictions against the true test labels.
cm = confusion_matrix(y_true=y_test, y_pred=pred)

In [None]:
# Display the confusion matrix (rows: true labels, columns: predicted labels).
cm

In [None]:
# Fraction of test samples the network classified correctly.
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Per-class precision, recall and F1 on the held-out split.
report_text = classification_report(y_test, pred)
print(report_text)