# **Understand the Data**
* **age:age in year**
* **sex: Male=1 ,Female=0**
* **cp: Chest pain experienced (1: typical angina, 2: atypical angina, 3: non-anginal pain, 4: asymptomatic)**
* **trestbps: resting blood pressure in (mmHg), Ideal blood pressure is considered to be between 90/60 mmHg and 120/80 mmHg, High 140/90 mmHg or higher, Low 90/60 mmHg or Lower**
* **chol: Serum cholesterol in mg/dL; the normal range is at most 170 mg/dL for ages 19 and younger, and 125-200 mg/dL for ages 20 and older**
* **fbs: whether the person's fasting blood sugar is > 120 mg/dL (1=True, 0=False)**
* **restecg: Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy)**
* **thalach: The person's maximum heart rate achieved**
* **exang: Exercise induced angina (1 = yes; 0 = no)**
* **oldpeak: ST depression induced by exercise relative to rest**
* **slope: the slope of the peak exercise ST segment ( 1: upsloping, 2: flat, 3: downsloping)**
* **ca: number of major vessels (0-3) colored by fluoroscopy**
* **thal: Thalassemia is a blood disorder passed down through families (inherited) in which the body makes an abnormal form or inadequate amount of hemoglobin. (3 = normal; 6 = fixed defect; 7 = reversable defect)**
* **target: Heart disease (1= True, 0= False)**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, confusion_matrix
%matplotlib inline


# **Read Data**

In [None]:
# Load the heart-disease table from the Kaggle input directory into a DataFrame.
data=pd.read_csv("../input/heart-disease-uci/heart.csv")
# Preview the first rows of the loaded table.
data.head()

# **Analyzing Data**

In [None]:
# Column names, dtypes and non-null counts.
data.info()

**Dataset dimensionality**

In [None]:
data.shape
# (rows, columns); the author's run reported 303 rows and 14 columns

**Check the null values**

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
data.describe()

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap
# (one decimal place per cell).
corr = data.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr, fmt='.1f', annot=True)
plt.show()

### (thalach) and (cp) have the strongest positive correlation with the target, while (exang), (oldpeak) and (ca) have the strongest negative correlation with it

# **Visualization**

In [None]:
# Youngest and oldest patient in the dataset.
ages = data['age']
min_age, max_age = min(ages), max(ages)
print('Minmum age is', min_age)
print('Maximum age is', max_age)

In [None]:
# Partition the patients into three non-overlapping age bands that together
# cover every row:
#   young   : age < 40
#   middle  : 40 <= age < 55
#   elderly : age >= 55   (inclusive, so patients aged exactly 55 are not dropped)
# The young band has no hard-coded lower bound, so it does not depend on the
# minimum age present in this particular dataset.
young_ages=data[data['age']<40]
middle_ages=data[(data['age']>=40)&(data['age']<55)]
elderly_ages=data[data['age']>=55]
print('Young Ages is',len(young_ages))
print('Middle Ages is',len(middle_ages))
print('Elderly Ages is',len(elderly_ages))

### Split the ages into 3 categories (Young, Middle, Elderly) to make the visualization easier and clearer

In [None]:
# Bar chart of how many patients fall into each age band.
age_group_labels = ['young ages', 'middle ages', 'elderly ages']
age_group_counts = [len(group) for group in (young_ages, middle_ages, elderly_ages)]
sns.barplot(x=age_group_labels, y=age_group_counts)
plt.title('Ages in dataset')
plt.xlabel('ages range')
plt.ylabel('ages count');

### The elderly group is clearly the largest

In [None]:
# Number of patients per sex (1 = male, 0 = female).
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as `x`.
sns.countplot(x='sex', data=data);

male more than female

## **Now, Sex VS Target**

In [None]:
# Male patients (sex == 1) split by heart-disease outcome.
male_rows = data[data['sex'] == 1]
male_target1 = male_rows[male_rows['target'] == 1].shape[0]
male_target0 = male_rows[male_rows['target'] == 0].shape[0]
sns.barplot(
    x=['male_target1', 'male_target0'],
    y=[male_target1, male_target0],
)
plt.title('Male In Target')
plt.xlabel('Male and Target')
plt.show()

In [None]:
# Female patients (sex == 0) split by heart-disease outcome.
female_rows = data[data['sex'] == 0]
female_target1 = female_rows[female_rows['target'] == 1].shape[0]
female_target0 = female_rows[female_rows['target'] == 0].shape[0]
sns.barplot(
    x=['female_target1', 'female_target0'],
    y=[female_target1, female_target0],
)
plt.title('Female In Target')
plt.xlabel('Female and Target')
plt.show()

### **SO, Female have more chance of having Heart Disease than Male**

## Chest Pain VS Target

In [None]:
# Number of patients for each chest-pain code in the `cp` column.
data['cp'].value_counts()

- ***0: typical angina= 143***
- ***1: atypical angina= 50***
- ***2: non-anginal pain= 87***
- ***3: asymptomatic= 23***

In [None]:
# Patients with cp == 0 (treated as "typical angina" here), split by outcome.
typical_angina_rows = data[data['cp'] == 0]
typical_angina_target1 = typical_angina_rows[typical_angina_rows['target'] == 1].shape[0]
typical_angina_target0 = typical_angina_rows[typical_angina_rows['target'] == 0].shape[0]
sns.barplot(
    x=['typical_angina_target1', 'typical_angina_target0'],
    y=[typical_angina_target1, typical_angina_target0],
)
plt.title('Typical Angina In Target')
plt.xlabel('Typical Angina and Target')
plt.show()

In [None]:
# Patients with cp == 1 (treated as "atypical angina" here), split by outcome.
atypical_angina_rows = data[data['cp'] == 1]
atypical_angina_target1 = atypical_angina_rows[atypical_angina_rows['target'] == 1].shape[0]
atypical_angina_target0 = atypical_angina_rows[atypical_angina_rows['target'] == 0].shape[0]
sns.barplot(
    x=['atypical_angina_target1', 'atypical_angina_target0'],
    y=[atypical_angina_target1, atypical_angina_target0],
)
plt.title('Atypical Angina In Target')
plt.xlabel('Atypical Angina and Target')
plt.show()

In [None]:
# Patients with cp == 2 (treated as "non-anginal pain" here), split by outcome.
nonanginal_pain_rows = data[data['cp'] == 2]
nonanginal_pain_target1 = nonanginal_pain_rows[nonanginal_pain_rows['target'] == 1].shape[0]
nonanginal_pain_target0 = nonanginal_pain_rows[nonanginal_pain_rows['target'] == 0].shape[0]
sns.barplot(
    x=['nonanginal_pain_target1', 'nonanginal_pain_target0'],
    y=[nonanginal_pain_target1, nonanginal_pain_target0],
)
plt.title('Non-anginal Pain In Target')
plt.xlabel('Non-anginal Pain and Target')
plt.show()

In [None]:
# Patients with cp == 3 (treated as "asymptomatic" here), split by outcome.
asymptomatic_rows = data[data['cp'] == 3]
asymptomatic_target1 = asymptomatic_rows[asymptomatic_rows['target'] == 1].shape[0]
asymptomatic_target0 = asymptomatic_rows[asymptomatic_rows['target'] == 0].shape[0]
sns.barplot(
    x=['asymptomatic_target1', 'asymptomatic_target0'],
    y=[asymptomatic_target1, asymptomatic_target0],
)
plt.title('Asymptomatic In Target')
plt.xlabel('Asymptomatic and Target')
plt.show()

## Heart disease is less common among patients with typical angina, whereas it is more common for all the other chest-pain types

In [None]:
# Cholesterol per sex, with separate bars for each heart-disease outcome
# (the bar height is seaborn's default aggregate of `chol` within each group).
sns.barplot(
    data=data,
    x='sex',
    y='chol',
    hue='target',
    palette='coolwarm',
)
plt.title('Cholestrol and sex')
plt.xlabel('sex')
plt.ylabel('Cholestrol');

### Female have higher cholestrol than men


In [None]:
# One point per patient: cholesterol against the `thal` code, coloured by outcome.
plt.figure(figsize=(10, 10))
sns.swarmplot(
    data=data,
    x='thal',
    y='chol',
    hue='target',
)
plt.show();

### Chance of heart diseases increases with increase in Cholestrol level.

# Fasting blood Suger VS Target

In [None]:
# Patients per fasting-blood-sugar flag (fbs), split by heart-disease outcome.
# Columns are passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as `x`.
sns.countplot(x='fbs', hue='target', data=data)
plt.title('Fasting blood Suger')
plt.xlabel('Fasting blood Suger')
plt.show()

### People with fasting blood sugar < 120 mg/dL have a higher chance of having heart disease than people with fasting blood sugar > 120 mg/dL

## Resting electrocardiographic measurment VS Target

In [None]:
# Number of patients for each resting-ECG code in the `restecg` column.
data['restecg'].value_counts()

- ***0: Normal= 147***
- ***1: ST-T wave abnormal= 152***
- ***2: left ventricular hypertrophy= 4***

In [None]:
# Patients with a normal resting ECG (restecg == 0), split by outcome.
normal_rows = data[data['restecg'] == 0]
normal_target1 = normal_rows[normal_rows['target'] == 1].shape[0]
normal_target0 = normal_rows[normal_rows['target'] == 0].shape[0]
sns.barplot(
    x=['normal_target1', 'normal_target0'],
    y=[normal_target1, normal_target0],
)
plt.title('Electrocardiographic(Normal) In Target')
plt.xlabel('Electrocardiographic')
plt.show()

In [None]:
# Patients with an ST-T wave abnormality (restecg == 1), split by outcome.
abnormal_rows = data[data['restecg'] == 1]
abnormal_target1 = abnormal_rows[abnormal_rows['target'] == 1].shape[0]
abnormal_target0 = abnormal_rows[abnormal_rows['target'] == 0].shape[0]
sns.barplot(
    x=['abnormal_target1', 'abnormal_target0'],
    y=[abnormal_target1, abnormal_target0],
)
plt.title('Electrocardiographic(ST-T wave abnormal) In Target')
plt.xlabel('Electrocardiographic')
plt.show()

In [None]:
# Patients with left ventricular hypertrophy (restecg == 2), split by outcome.
hypertrophy_rows = data[data['restecg'] == 2]
hypertrophy_target1 = hypertrophy_rows[hypertrophy_rows['target'] == 1].shape[0]
hypertrophy_target0 = hypertrophy_rows[hypertrophy_rows['target'] == 0].shape[0]
sns.barplot(
    x=['hypertrophy_target1', 'hypertrophy_target0'],
    y=[hypertrophy_target1, hypertrophy_target0],
)
plt.title('Electrocardiographic(hypertrophy) In Target')
plt.xlabel('Electrocardiographic')
plt.show()

### Heart disease is most common among patients with an ST-T wave abnormality on the resting ECG, although it also occurs in the other resting-ECG categories


In [None]:
# Mean maximum heart rate (thalach) for each chest-pain code (cp).
cp_thalach=data.groupby('cp')['thalach'].mean()

In [None]:
# Bar chart of the per-chest-pain-code mean of `thalach` computed above.
cp_codes = cp_thalach.index
mean_thalach = cp_thalach.values
sns.barplot(x=cp_codes, y=mean_thalach)
plt.title('thalach to degree of chest pain')
plt.xlabel('Degree of Chest Pain (Cp)')
plt.ylabel('Maximum Thalach By Cp Values')
plt.show()

### When the heart rate is less when the chest pain is low. But in cases where chest pain is typical Angina, it is observed that the area is more. atypical Angina and Non-anginal Pain were found to be of the same degree.

In [None]:
# Patients per exercise-induced-angina flag (exang), split by heart-disease outcome.
# Columns are passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as `x`.
sns.countplot(x='exang', hue='target', data=data);

### People without exercise-induced angina (0) have a higher chance of having heart disease than people with exercise-induced angina (1)

# Slope VS Target 

In [None]:
# Number of patients for each ST-segment slope code in the `slope` column.
data['slope'].value_counts()

In [None]:
# Patients with slope == 0 (treated as "upsloping" here), split by outcome.
upslopeing_rows = data[data['slope'] == 0]
upslopeing_target1 = upslopeing_rows[upslopeing_rows['target'] == 1].shape[0]
upslopeing_target0 = upslopeing_rows[upslopeing_rows['target'] == 0].shape[0]
sns.barplot(
    x=['upslopeing_target1', 'upslopeing_target0'],
    y=[upslopeing_target1, upslopeing_target0],
)
plt.title('Slope of the peak(upslopeing) In Target')
plt.xlabel('Slope of the peak exercise ST segment')
plt.show()

In [None]:
# Patients with slope == 1 (treated as "flat" here), split by outcome.
flat_rows = data[data['slope'] == 1]
flat_target1 = flat_rows[flat_rows['target'] == 1].shape[0]
flat_target0 = flat_rows[flat_rows['target'] == 0].shape[0]
sns.barplot(
    x=['flat_target1', 'flat_target0'],
    y=[flat_target1, flat_target0],
)
plt.title('Slope of the peak(flat) In Target')
plt.xlabel('Slope of the peak exercise ST segment')
plt.show()

In [None]:
# Patients with slope == 2 (treated as "downsloping" here), split by outcome.
downslopeing_target1 =len(data[(data['slope']==2) & (data['target']== 1)])
downslopeing_target0 =len(data[(data['slope']==2) & (data['target']== 0)])
sns.barplot(x=['downslopeing_target1','downslopeing_target0'], y=[downslopeing_target1,downslopeing_target0])
# The title names the category actually plotted; it previously repeated the
# "upslopeing" title copied from the slope == 0 cell.
plt.title('Slope of the peak(downslopeing) In Target')
plt.xlabel('Slope of the peak exercise ST segment')
plt.show()

### Heart disease is more common in the upsloping cases, although it also occurs in the other slope categories


# Thalassemia VS Target

In [None]:
# Number of patients for each code in the `thal` column.
data['thal'].value_counts()

- ***0 mean no Thalassemia in blood= 2***
- ***1 mean Thalassemia is normal in blood= 18***
- ***2 mean Thalassemia is fixed defect= 166***
- ***3 mean Thalassemia is reversable defect= 117***

In [None]:
# Number of patients per `thal` code.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as `x`.
sns.countplot(x='thal', data=data);

In [None]:
# Count patients WITH heart disease (target == 1) for each `thal` code 0-3.
thal_with_disease = data[data['target'] == 1]['thal']
m = len(thal_with_disease[thal_with_disease == 0])
A = len(thal_with_disease[thal_with_disease == 1])
r = len(thal_with_disease[thal_with_disease == 2])
w = len(thal_with_disease[thal_with_disease == 3])
print('No Thalassemia and have heart disease:', m)
print('Normal Thalassemia and have heart disease:', A)
print('Fixed defect in Thalassemia and have heart disease:', r)
print('Reversable defect Thalassemia and have heart disease:', w)

In [None]:
# Count patients WITHOUT heart disease (target == 0) for each `thal` code 0-3.
a= len(data[(data['thal']==0) & (data['target']==0)])
n= len(data[(data['thal']==1) & (data['target']==0)])
g= len(data[(data['thal']==2) & (data['target']==0)])
l= len(data[(data['thal']==3) & (data['target']==0)])
print("No Thalassemia and doesn't have heart disease:",a)
print("Normal Thalassemia and doesn't have heart disease:",n)
print("Fixed defect in Thalassemia and doesn't have heart disease:",g)
# Message typo fixed ("dpesn't" -> "doesn't") so all four lines read alike.
print("Reversable defect Thalassemia and doesn't have heart disease:",l)

In [None]:
# Overlaid horizontal bars: per-`thal`-code patient counts for target == 1 (green)
# and target == 0 (red).
# The counts are computed from `data` here rather than typed in by hand, so the
# figure stays correct if the dataset changes, and both bar sets share one label
# list so their categories line up (one label was previously misspelled).
thal_codes = [0, 1, 2, 3]
thal_labels = ['Target 1&0 Thal %d' % code for code in thal_codes]
target1_counts = [len(data[(data['thal'] == code) & (data['target'] == 1)]) for code in thal_codes]
target0_counts = [len(data[(data['thal'] == code) & (data['target'] == 0)]) for code in thal_codes]

f,ax=plt.subplots(figsize=(7,7))
sns.barplot(y=thal_labels,x=target1_counts,color='green',alpha=0.5,label='Target 1 Thal State',ax=ax)
sns.barplot(y=thal_labels,x=target0_counts,color='red',alpha=0.7,label='Target 0 Thal State',ax=ax)
ax.legend(loc='lower right',frameon=True)
ax.set(xlabel='Target State and Thal Counter',ylabel='Target State and Thal State',title='Target VS Thal')
plt.xticks(rotation=90)
plt.show()

### Patients with a fixed defect (Target 1&0 Thal 2) are more likely to have heart disease, while patients with a reversible defect (Target 1&0 Thal 3) are more likely not to have it

# Spliting Data

In [None]:
# Feature matrix = every column except the last; label vector = the last column.
feature_frame, label_series = data.iloc[:, :-1], data.iloc[:, -1]
x = feature_frame.values
y = label_series.values

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## We find the outliers and MinMaxScaler() it is sensitive to outliers, so if there are outliers in the data, you might want to consider the Robust Scaler.

# what is the Robust Scaler?
### The Robust Scaler works like the Min-Max scaler, but it centers each feature on its median and scales by the interquartile range rather than the min and max, so that it is robust to outliers.
# Formula: $x_{scaled} = \dfrac{x - \mathrm{median}(x)}{Q_3(x) - Q_1(x)}$
### For each feature.Of course this means it is using the less of the data for scaling so it’s more suitable for when there are outliers in the data.
### Notice that after Robust scaling, the distributions are brought into the same scale and overlap, but the outliers remain outside of bulk of the new distributions.
### However, in Min-Max scaling, the two normal distributions are kept seperate by the outliers that are inside the 0-1 range.

In [None]:
# Learn the scaling statistics from the training split only, then apply the same
# transform to both splits so no test-set information leaks into the scaler.
# NOTE: x_train / x_test are overwritten, so re-running this cell rescales
# already-scaled data.
scaler = preprocessing.RobustScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
Accuracies= {} # dictionary collecting each model's test accuracy (%) for the final comparison

# define model
lg = LogisticRegression()

# parameters
# The default LogisticRegression solver does not support the L1 penalty, so the
# L1 candidates are paired with the 'liblinear' solver (which does). The L2
# candidates keep the default solver, matching the final model fitted later.
parameters=[
    {'penalty':['l2'],'C':[0.1,0.4,0.5],'random_state':[0]},
    {'penalty':['l1'],'solver':['liblinear'],'C':[0.1,0.4,0.5],'random_state':[0]},
]

#define search
search = GridSearchCV(lg, parameters, scoring='accuracy', n_jobs=-1, )

# execute search
result = search.fit(x_train, y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# helper that draws a ROC curve so every model cell can reuse it
def plot_roc_(fpr,tpr,roc_auc):
    """Plot a ROC curve with its AUC in the legend and a dashed diagonal.

    Parameters
    ----------
    fpr : false-positive rates (x values of the curve).
    tpr : true-positive rates (y values of the curve).
    roc_auc : area under the curve, shown in the legend with two decimals.

    Returns nothing; the figure is displayed with ``plt.show()``.
    """
    auc_label = 'AUC = %0.2f' % roc_auc

    plt.figure(figsize=(5, 5))
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, label=auc_label, color='red')
    plt.legend(loc='lower right')
    # Diagonal from (0, 0) to (1, 1) for visual reference.
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.axis('tight')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

# What is AUC - ROC Curve?
### AUC - ROC curve is a performance measurement for classification problem at various thresholds settings. ROC is a probability curve and AUC represents degree or measure of separability. It tells how much model is capable of distinguishing between classes. Higher the AUC, better the model is at predicting 0s as 0s and 1s as 1s. By analogy, Higher the AUC, better the model is at distinguishing between patients with disease and no disease.

## The ROC curve is plotted with TPR against the FPR where TPR is on y-axis and FPR is on the x-axis.

### Although it's not without certain issues. As a rule of thumb, an AUC can be classed as follows,

* 0.90 - 1.00 = excellent
* 0.80 - 0.90 = good
* 0.70 - 0.80 = fair
* 0.60 - 0.70 = poor
* 0.50 - 0.60 = fail

# Logistic Regression Model

In [None]:
# Fit the final logistic-regression model (L2 penalty, C=0.1) on the training split.
lr = LogisticRegression(penalty='l2', C=0.1, random_state=0)
lr.fit(x_train, y_train)

# Class probabilities and hard predictions for the held-out split.
y_proba = lr.predict_proba(x_test)
y_pred = lr.predict(x_test)

# ROC from the second probability column.
# Sensitivity (TPR) = TP / (TP + FN)
# Specificity       = TN / (TN + FP)
fpr, tpr, thresholds = roc_curve(y_test, y_proba[:, 1])
roc_auc = auc(fpr, tpr)
plot_roc_(fpr, tpr, roc_auc)

In [None]:
# Test accuracy in percent (stored for the final comparison) and confusion matrix.
acc = 100 * accuracy_score(y_test, y_pred)
co = confusion_matrix(y_test, y_pred)
Accuracies['Logistic Regression'] = acc

print("Accuracy: {:.2f}%".format(acc))
print("Logistic TRAIN score with ", format(lr.score(x_train, y_train)))
print("Logistic TEST score with ", format(lr.score(x_test, y_test)))
print()
print(co)

sns.heatmap(co, annot=True)
plt.show()

# KNN Model

In [None]:
# define model
knn = KNeighborsClassifier()

# parameters
# Only n_neighbors is searched. n_jobs controls parallelism, not the fitted
# model, so including it in the grid doubled the number of cross-validation
# fits without changing any score.
parameters=[{'n_neighbors':np.arange(2,33)}]

# define search
search = GridSearchCV(knn, parameters, scoring='accuracy')

# execute search
result = search.fit(x_train, y_train)

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# Fit the final k-nearest-neighbours model (k=7) on the training split.
knn = KNeighborsClassifier(n_neighbors=7, n_jobs=2)
knn.fit(x_train, y_train)

# Class probabilities and hard predictions for the held-out split.
y_proba = knn.predict_proba(x_test)
y_pred = knn.predict(x_test)

# ROC from the second probability column.
# Sensitivity (TPR) = TP / (TP + FN)
# Specificity       = TN / (TN + FP)
fpr, tpr, thresholds = roc_curve(y_test, y_proba[:, 1])
roc_auc = auc(fpr, tpr)
plot_roc_(fpr, tpr, roc_auc)

In [None]:
# Test accuracy in percent (stored for the final comparison) and confusion matrix.
acc = 100 * accuracy_score(y_test, y_pred)
co = confusion_matrix(y_test, y_pred)
Accuracies['KNN'] = acc

print("Accuracy: {:.2f}%".format(acc))
print("KNN TRAIN score with ", format(knn.score(x_train, y_train)))
print("KNN TEST score with ", format(knn.score(x_test, y_test)))
print()
print(co)

sns.heatmap(co, annot=True)
plt.show()

# SVM Model

In [None]:
# Estimator to tune.
svc = SVC()

# Two sub-grids: a linear kernel with default settings, and an RBF kernel
# searched over gamma and C (with probability estimates enabled).
linear_grid = {'kernel': ['linear'], 'random_state': [2]}
rbf_grid = {
    'kernel': ['rbf'],
    'gamma': [0.9, 0.06, 0.3],
    'random_state': [0],
    'C': [1, 2, 3, 4, 5, 6],
    'degree': [2],
    'probability': [True],
}
parameters = [linear_grid, rbf_grid]

# Cross-validated search scored by accuracy.
search = GridSearchCV(estimator=svc, param_grid=parameters, scoring='accuracy')
result = search.fit(x_train, y_train)

# Report the winning configuration.
print('Best Score: %s' % (result.best_score_,))
print('Best Hyperparameters: %s' % (result.best_params_,))

In [None]:
# Fit the final RBF support-vector classifier; probability=True enables predict_proba.
svc = SVC(kernel='rbf', C=6, gamma=0.06, degree=2, probability=True, random_state=0)
svc.fit(x_train, y_train)

# Class probabilities and hard predictions for the held-out split.
y_proba = svc.predict_proba(x_test)
y_pred = svc.predict(x_test)

# ROC from the second probability column.
# Sensitivity (TPR) = TP / (TP + FN)
# Specificity       = TN / (TN + FP)
fpr, tpr, thresholds = roc_curve(y_test, y_proba[:, 1])
roc_auc = auc(fpr, tpr)
plot_roc_(fpr, tpr, roc_auc)

In [None]:
# Test accuracy in percent (stored for the final comparison) and confusion matrix.
acc = 100 * accuracy_score(y_test, y_pred)
co = confusion_matrix(y_test, y_pred)
Accuracies['SVM'] = acc

print("Accuracy: {:.2f}%".format(acc))
print("SVC TRAIN score with ", format(svc.score(x_train, y_train)))
print("SVC TEST score with ", format(svc.score(x_test, y_test)))
print()
print(co)

sns.heatmap(co, annot=True)
plt.show()

# Random Forest

In [None]:
# Estimator to tune.
RF = RandomForestClassifier()

# Search tree depth, split threshold and forest size with a fixed seed.
parameters = [{
    'max_depth': np.arange(1, 10),
    'min_samples_split': np.arange(2, 5),
    'random_state': [3],
    'n_estimators': np.arange(10, 20),
}]

# Cross-validated search scored by accuracy.
search = GridSearchCV(estimator=RF, param_grid=parameters, scoring='accuracy')
result = search.fit(x_train, y_train)

# Report the winning configuration.
print('Best Score: %s' % (result.best_score_,))
print('Best Hyperparameters: %s' % (result.best_params_,))

In [None]:
# Fit the final random forest (11 trees, depth <= 5) on the training split.
RF = RandomForestClassifier(n_estimators=11, max_depth=5, min_samples_split=2, random_state=3)
RF.fit(x_train, y_train)

# Class probabilities and hard predictions for the held-out split.
y_proba = RF.predict_proba(x_test)
y_pred = RF.predict(x_test)

# ROC from the second probability column.
# Sensitivity (TPR) = TP / (TP + FN)
# Specificity       = TN / (TN + FP)
fpr, tpr, thresholds = roc_curve(y_test, y_proba[:, 1])
roc_auc = auc(fpr, tpr)
plot_roc_(fpr, tpr, roc_auc)

In [None]:
# Test accuracy in percent, stored for the final model comparison.
acc = accuracy_score(y_test, y_pred)*100
Accuracies['RF'] = acc
print("Accuracy: {:.2f}%".format(acc))
# Score the random forest itself; these lines previously called svc.score and
# therefore reported the SVM's train/test accuracy under the Random Forest label.
print("Random Forest TRAIN score with ",format(RF.score(x_train, y_train)))
print("Random Forest TEST score with ",format(RF.score(x_test, y_test)))
print()

# Confusion matrix of the random forest's test predictions, printed and drawn.
co=confusion_matrix(y_test,y_pred)
print(co)
sns.heatmap(co,annot=True)
plt.show()

#  Decision Tree Model

In [None]:
# Estimator to evaluate.
DT = DecisionTreeClassifier()

# The grid holds a single candidate (fixed seed), so this search amounts to a
# cross-validated accuracy estimate of the default tree.
parameters = [{'random_state': [42]}]

# Cross-validated search scored by accuracy.
search = GridSearchCV(estimator=DT, param_grid=parameters, scoring='accuracy')
result = search.fit(x_train, y_train)

# Report the result.
print('Best Score: %s' % (result.best_score_,))
print('Best Hyperparameters: %s' % (result.best_params_,))

In [None]:
# Fit the final decision tree (default settings, fixed seed) on the training split.
DT = DecisionTreeClassifier(random_state=42)
DT.fit(x_train, y_train)

# Class probabilities and hard predictions for the held-out split.
y_proba = DT.predict_proba(x_test)
y_pred = DT.predict(x_test)

# ROC from the second probability column.
# Sensitivity (TPR) = TP / (TP + FN)
# Specificity       = TN / (TN + FP)
fpr, tpr, thresholds = roc_curve(y_test, y_proba[:, 1])
roc_auc = auc(fpr, tpr)
plot_roc_(fpr, tpr, roc_auc)

In [None]:
# Test accuracy in percent, stored for the final model comparison.
acc = accuracy_score(y_test, y_pred)*100
Accuracies['DT'] = acc
print("Accuracy: {:.2f}%".format(acc))
# Score the decision tree itself; these lines previously called svc.score and
# therefore reported the SVM's train/test accuracy under the Decision Tree label.
print("Decision Tree TRAIN score with ",format(DT.score(x_train, y_train)))
print("Decision Tree TEST score with ",format(DT.score(x_test, y_test)))
print()

# Confusion matrix of the decision tree's test predictions, printed and drawn.
co=confusion_matrix(y_test,y_pred)
print(co)
sns.heatmap(co,annot=True)
plt.show()

# Gradient Boosting

In [None]:
# Estimator to evaluate.
GB = GradientBoostingClassifier()

# The grid holds a single candidate (fixed seed), so this search amounts to a
# cross-validated accuracy estimate of the default booster.
parameters = [{'random_state': [42]}]

# Cross-validated search scored by accuracy.
search = GridSearchCV(estimator=GB, param_grid=parameters, scoring='accuracy')
result = search.fit(x_train, y_train)

# Report the result.
print('Best Score: %s' % (result.best_score_,))
print('Best Hyperparameters: %s' % (result.best_params_,))

In [None]:
# Fit the final gradient-boosting model (default settings, fixed seed) on the training split.
GB = GradientBoostingClassifier(random_state=42)
GB.fit(x_train, y_train)

# Class probabilities and hard predictions for the held-out split.
y_proba = GB.predict_proba(x_test)
y_pred = GB.predict(x_test)

# ROC from the second probability column.
# Sensitivity (TPR) = TP / (TP + FN)
# Specificity       = TN / (TN + FP)
fpr, tpr, thresholds = roc_curve(y_test, y_proba[:, 1])
roc_auc = auc(fpr, tpr)
plot_roc_(fpr, tpr, roc_auc)

In [None]:
# Test accuracy in percent, stored for the final model comparison.
acc = accuracy_score(y_test, y_pred)*100
Accuracies['GB'] = acc
print("Accuracy: {:.2f}%".format(acc))
# Score the gradient-boosting model itself and label it as such; these lines
# previously called svc.score and printed "Decision Tree", so they reported the
# SVM's accuracy under the wrong model name.
print("Gradient Boosting TRAIN score with ",format(GB.score(x_train, y_train)))
print("Gradient Boosting TEST score with ",format(GB.score(x_test, y_test)))
print()

# Confusion matrix of the gradient-boosting test predictions, printed and drawn.
co=confusion_matrix(y_test,y_pred)
print(co)
sns.heatmap(co,annot=True)
plt.show()

In [None]:
# Side-by-side comparison of every model's stored test accuracy (%).
model_names = list(Accuracies)
model_scores = [Accuracies[name] for name in model_names]

sns.set_style("whitegrid")
plt.figure(figsize=(16, 5))
plt.yticks(np.arange(0, 100, 10))
plt.xlabel("Algorithms")
plt.ylabel("Accuracy %")
sns.barplot(x=model_names, y=model_scores, palette='coolwarm')
plt.show()