# Initialization

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Read the heart-disease CSV from the relative input directory and show it.
df = pd.read_csv(filepath_or_buffer='../input/heart-disease-uci/heart.csv')
df

## Column Definitions

1. Age is age of patient
2. Sex is gender of patient
    - 1 = male
    - 0 = female
3. cp = Chest Pain type. Values between 0-3
    - 0 = Asymptomatic
    - 1 = Atypical Angina
    - 2 = Pain without Relation to Angina
    - 3 = Typical Angina
4. trestbps = Resting Blood Pressure: the patient's blood pressure at rest (no exercise)
5. chol = Cholesterol content in the blood
6. fbs = Fasting Blood Sugar/Glucose, measured a long time after a meal
    - 0 = Blood glucose normal (under 120 mg/dl)
    - 1 = Blood glucose high (above 120 mg/dl)
7. restecg = Result of the Electrocardiogram (ECG) test
    - 0 = probable left ventricular hypertrophy
    - 1 = normal
    - 2 = T-wave abnormalities
8. thalach = Maximum Heart Rate during stress test
9. exang = whether the patient experiences angina during exercise
    - 0 = does not have angina
    - 1 = has angina
10. oldpeak = ST-segment depression during exercise relative to rest
11. slope = ST slope segment during exercise 
    - 0 = descending
    - 1 = flat
    - 2 = ascending
12. ca = number of major blood vessels shown by the radioactive dye. Values between 0-4
13. thal = Thalassemia type.
    - 0 = NULL
    - 1 = fixed defect
    - 2 = normal
    - 3 = reversible defect
14. target = heart attack condition of the patient
    - 1 = does not have a heart attack
    - 0 = has a heart attack


In [None]:
# Map the terse source column codes to the descriptive names used by every later cell.
name = dict(
    age='Age',
    sex='Sex',
    cp='Chest_Pain',
    trestbps='Resting_BP',
    chol='Cholesterol',
    fbs='Fasting_Blood_Glucose',
    restecg='ECG',
    thalach='Max_HR',
    exang='Exercise_Angina',
    oldpeak='ST_Depresion',
    ca='Major_Vessel',
    slope='ST_Slope',
    thal='Thalesmia_Types',
    target='Condition',
)
# Rebinding instead of inplace=True gives the same renamed frame and stays safe to re-run.
df = df.rename(columns=name)

In [None]:
# Preview the first five rows to confirm the new column names.
df.head(n=5)

In [None]:
# Print the column dtypes and non-null counts of the renamed frame.
df.info()

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# find the unique values of the given columns
def know_unique(col, frame=None):
    """Print the distinct values found in each listed column.

    Parameters
    ----------
    col : iterable of str
        Column names to inspect.
    frame : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the notebook-level
        ``df`` is used, which is what the function always did before.
    """
    if frame is None:
        # Resolved at call time so the function keeps following the notebook's df.
        frame = df
    for i in col:
        print('Columnn {} = {}'.format(i, frame[i].unique()))

# Integer-coded categorical columns whose distinct values are worth inspecting.
column = [
    'Sex',
    'Chest_Pain',
    'Fasting_Blood_Glucose',
    'ECG',
    'Exercise_Angina',
    'ST_Slope',
    'Major_Vessel',
    'Thalesmia_Types',
]
know_unique(column)

# Data Analysis

In [None]:
# Work on an independent copy so the numeric frame stays available for modelling.
df_analytic = df.copy(deep=True)
df_analytic.head(n=3)

## Replace categorical data

In [None]:
# Human-readable labels for the integer-coded categorical columns.
# Thalesmia_Types code 0 (NULL) has no label and is intentionally left as-is.
category_labels = {
    'Sex': {1: 'Male', 0: 'Female'},
    'Condition': {0: 'Heart_Attack', 1: 'Normal'},
    'Chest_Pain': {0: 'Asymptomatic',
                   1: 'Atypical Angina',
                   2: 'Pain without Relation to Angina',
                   3: 'Typical Angina'},
    'Fasting_Blood_Glucose': {0: 'No', 1: 'Yes'},
    'ECG': {0: 'Left Ventricula Hypertrophy',
            1: 'Normal',
            2: 'Abnormalities T Wave'},
    'Exercise_Angina': {0: 'No', 1: 'Yes'},
    'ST_Slope': {0: 'Descending', 1: 'Flat', 2: 'Ascending'},
    'Thalesmia_Types': {1: 'Fixed Defect',
                        2: 'Normal Flow',
                        3: 'Reversible Defect'},
}

# `df[col].replace(..., inplace=True)` mutates a temporary column selection
# (chained assignment): pandas 2.x warns about it and under copy-on-write the
# frame is left unchanged. A single frame-level replace with a
# {column: {old: new}} mapping assigns the result explicitly and is idempotent
# when the cell is re-run.
df_analytic = df_analytic.replace(category_labels)
df_analytic

In [None]:
# Patient counts per sex, split by heart condition.
sns.set(style='darkgrid')
plt.figure(figsize=(6, 6))
sns.countplot(
    data=df_analytic,
    x='Sex',
    hue='Condition',
)
plt.legend()
plt.show()


In [None]:
# Count of patients per age, coloured by condition, one panel per sex.
sns.set(style='darkgrid')
g = sns.catplot(
    data=df_analytic,
    kind='count',
    x='Age',
    hue='Condition',
    col='Sex',
    height=10,
    aspect=1,
)
g.set_titles(size=20)
g.set_xticklabels(rotation=30)

From the graph above, we can observe that the chance of suffering a heart attack rises with age, and that men have a higher probability than women. For both sexes, heart attacks are most frequent between the ages of 50 and 60.

In [None]:
# Count of patients per chest-pain type, coloured by condition, one panel per sex.
g = sns.catplot(
    data=df_analytic,
    kind='count',
    x='Chest_Pain',
    hue='Condition',
    col='Sex',
    height=10,
    aspect=1,
)
g.set_titles(size=20)


From this graph, it can be seen that heart attacks are more likely to be asymptomatic or not detected beforehand, both in men and women

In [None]:
# Count of patients per resting blood pressure value, coloured by condition, one panel per sex.
g = sns.catplot(
    data=df_analytic,
    kind='count',
    x='Resting_BP',
    hue='Condition',
    col='Sex',
    height=10,
    aspect=1,
)
g.set_titles(size=20)
g.set_xticklabels(rotation=45)

From the graph above, it can be seen that the higher the patient's blood pressure, the higher the likelihood of developing heart disease, especially in men.

In [None]:
# Number of records per (Sex, Cholesterol) pair, most frequent first.
s = (
    df_analytic
    .groupby(['Sex', 'Cholesterol'])['Condition']
    .count()
    .reset_index()
    .sort_values(by='Condition', ascending=False)
)
s.head(10).style.background_gradient(cmap='Reds')

From the data above, it can be observed that men have a higher probability of having a heart attack due to cholesterol than women

In [None]:
# Count of patients per fasting blood glucose flag, coloured by condition, one panel per sex.
g = sns.catplot(
    data=df_analytic,
    kind='count',
    x='Fasting_Blood_Glucose',
    hue='Condition',
    col='Sex',
    height=10,
    aspect=1,
)
g.set_titles(size=20)

From the graph above, it can be observed that the condition of fasting blood sugar does not directly affect the patient's condition of heart attack, both male and female patients.

In [None]:
# Count of patients per ECG result, coloured by condition, one panel per sex.
g = sns.catplot(
    data=df_analytic,
    kind='count',
    x='ECG',
    hue='Condition',
    col='Sex',
    height=10,
    aspect=1,
)
g.set_titles(size=20)

From the graph above, it can be observed that patients who had a heart attack show no distinctive T-wave abnormalities in their ECG results, so on average heart attacks are not detected beforehand.

In [None]:
# Number of records per (Sex, Max_HR) pair, most frequent first.
s = (
    df_analytic
    .groupby(['Sex', 'Max_HR'])['Condition']
    .count()
    .reset_index()
    .sort_values(by='Condition', ascending=False)
)
s.head(10).style.background_gradient(cmap='Reds')

From the table above, it is difficult to observe that there is a relationship between Maximum heart rate during exercise and conditions of heart attack, even though male patients have a higher tendency than female patients in this data.

In [None]:
# Count of patients per exercise-angina flag, coloured by condition, one panel per sex.
g = sns.catplot(
    data=df_analytic,
    kind='count',
    x='Exercise_Angina',
    hue='Condition',
    col='Sex',
    height=10,
    aspect=1,
)
g.set_titles(size=20)

Exercise-induced angina affects whether a person has a heart attack, especially in men.

In [None]:
# Count of patients per ST depression value, coloured by condition, one panel per sex.
g = sns.catplot(
    data=df_analytic,
    kind='count',
    x='ST_Depresion',
    hue='Condition',
    col='Sex',
    height=10,
    aspect=1,
)
g.set_titles(size=20)
g.set_xticklabels(rotation=45)

From the graph above, it can be seen that patients without ST depression have a higher tendency to develop a heart attack, so it can be concluded that heart attack disease is difficult to observe directly.

In [None]:
# Count of patients per ST slope category, coloured by condition, one panel per sex.
g = sns.catplot(
    data=df_analytic,
    kind='count',
    x='ST_Slope',
    hue='Condition',
    col='Sex',
    height=10,
    aspect=1,
)
g.set_titles(size=20)

From the graph above, it can be observed that patients with a flat ST segment have a higher probability of developing heart disease than patients with a descending ST segment. This is more likely to be observed in male patients.

In [None]:
# Count of patients per number of major vessels, coloured by condition, one panel per sex.
g = sns.catplot(
    data=df_analytic,
    kind='count',
    x='Major_Vessel',
    hue='Condition',
    col='Sex',
    height=10,
    aspect=1,
)
g.set_titles(size=20)

The more narrowed blood vessels visible on the radiograph, the higher the patient's chances of having a heart attack.

In [None]:
# Count of patients per thalassemia type, coloured by condition, one panel per sex.
g = sns.catplot(
    data=df_analytic,
    kind='count',
    x='Thalesmia_Types',
    hue='Condition',
    col='Sex',
    height=10,
    aspect=1,
)
g.set_titles(size=20)

The condition of thalassemia in these patients can be used as a good indicator to detect the presence of heart attack disease

In [None]:
# Pairwise correlation between four of the numeric measurements, drawn as an annotated heatmap.
numeric_columns = ['Age', 'Resting_BP', 'Cholesterol', 'Max_HR']
corelation = df_analytic[numeric_columns].corr()
plt.figure(figsize=(14, 8))
sns.heatmap(
    corelation,
    cmap='coolwarm',
    annot=True,
    linewidths=0.5,
)

From the correlation table above, it can be observed that the relationship between age and resting blood pressure has the highest correlation value compared to the others. This indicates that the higher the age, the more likely resting blood pressure is to be high.

# Build Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from pandas import DataFrame
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn import metrics

In [None]:
# One-hot encode the multi-level categorical columns of the numeric frame.
chategorical_column = [
    'Chest_Pain',
    'ECG',
    'Major_Vessel',
    'Thalesmia_Types',
    'ST_Slope',
]
final = pd.get_dummies(df, columns=chategorical_column)
final

In [None]:
# Target is the Condition column; every remaining column is a feature.
y = final['Condition']
X = final.drop(columns='Condition')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Learn the min/max of each feature from the training rows only.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
# Re-use the training-set scaling for the test rows. Calling fit_transform here
# would re-fit the scaler on the test set, so train and test would be scaled
# with different ranges and test statistics would leak into preprocessing.
X_test = scaler.transform(X_test)

## Random Forest

In [None]:
# Seed the forest so the grid-search results can be reproduced on a re-run.
RF = RandomForestClassifier(random_state=5)

# 'auto' is omitted from max_features: for classifiers it was only an alias of
# 'sqrt' (so it duplicated those fits), and scikit-learn 1.3+ rejects it.
param = {'n_estimators': [100, 300, 500, 700, 900],
         'criterion': ['gini', 'entropy'],
         'max_features': ['sqrt', 'log2']}

# This is a classification task, so candidates are ranked by accuracy under
# 5-fold cross-validation rather than by a regression error metric.
RFClassifier = GridSearchCV(RF, param, scoring='accuracy', cv=5)

In [None]:
# Run the grid search on the training data, then display the winning parameter combination.
RFClassifier.fit(X_train, y_train)
RFClassifier.best_params_

In [None]:
# Train a seeded random forest with fixed hyper-parameters and report test accuracy in percent.
RF_model = RandomForestClassifier(
    n_estimators=300,
    criterion='gini',
    max_features='log2',
    random_state=5,
)
RF_model.fit(X_train, y_train)
RF_accuracy = RF_model.score(X_test, y_test)
print('Random Forest Classifier Accuracy = ', RF_accuracy * 100)

In [None]:
# Confusion matrix of the random forest on the test set, drawn as a heatmap
# whose cells show the outcome name, the raw count and the share of all samples.
RF_pred = RF_model.predict(X_test)
cf_matrix = confusion_matrix(y_test, RF_pred)

group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
cell_counts = cf_matrix.flatten()
group_counts = [format(value, '0.0f') for value in cell_counts]
group_percentages = [format(value, '.2%') for value in cell_counts / np.sum(cf_matrix)]

labels = []
for v1, v2, v3 in zip(group_names, group_counts, group_percentages):
    labels.append(f'{v1}\n{v2}\n{v3} ')
labels = np.asarray(labels).reshape(2, 2)

sns.heatmap(cf_matrix, annot=labels, fmt='')

## XGBoost

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
# The target is a binary class label, so the classifier is trained with the
# logistic objective. 'reg:squarederror' is a regression loss and also differs
# from the objective the final XGBClassifier is fitted with.
XGB = XGBClassifier(objective='binary:logistic')
param = {'n_estimators': [50, 60, 70, 80, 90],
         'learning_rate': [0.09, 0.1, 0.15, 0.2],
         'max_depth': [3, 4, 5]}
# Rank candidates by classification accuracy under 5-fold cross-validation.
xgb_reg = GridSearchCV(XGB, param, scoring='accuracy', cv=5)

In [None]:
# Run the XGBoost grid search on the training data, then display the winning parameter combination.
xgb_reg.fit(X_train, y_train)
xgb_reg.best_params_

In [None]:
# Train XGBoost with fixed hyper-parameters and report test accuracy in percent.
xgb_model = XGBClassifier(n_estimators=50,
                          learning_rate=0.09,
                          max_depth=3)
xgb_model.fit(X_train, y_train)
# Score the XGBoost model itself. The previous code called RF_model.score here,
# so the printed "XGBoost" accuracy was actually the random forest's.
xgb_accuracy = xgb_model.score(X_test, y_test)
print('XGBoost Classifier Accuracy = ', (xgb_accuracy * 100))

In [None]:
# Confusion matrix of the XGBoost model on the test set, drawn as a heatmap
# whose cells show the outcome name, the raw count and the share of all samples.
xgb_pred = xgb_model.predict(X_test)
cf_matrix = confusion_matrix(y_test, xgb_pred)

group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
cell_counts = cf_matrix.flatten()
group_counts = [format(value, '0.0f') for value in cell_counts]
group_percentages = [format(value, '.2%') for value in cell_counts / np.sum(cf_matrix)]

labels = []
for v1, v2, v3 in zip(group_names, group_counts, group_percentages):
    labels.append(f'{v1}\n{v2}\n{v3} ')
labels = np.asarray(labels).reshape(2, 2)

sns.heatmap(cf_matrix, annot=labels, fmt='')

## ELM

Extreme Learning Machine

In [None]:
!pip install scikit-elm==0.21a0

In [None]:
!pip install 'fsspec>=0.3.3'

In [None]:
from skelm import ELMClassifier

In [None]:
# Hyper-parameter grid for the Extreme Learning Machine classifier.
ELM = ELMClassifier()
param = {'n_neurons': [50, 60, 70, 80, 90, 100],
         'alpha': [100, 10, 1, 1e-1, 1e-2, 1e-3],
         'ufunc': ['sigm', 'tanh', 'relu']}
# The estimator is a classifier, so candidates are ranked by accuracy under
# 5-fold cross-validation rather than by a regression error metric.
ELM_reg = GridSearchCV(ELM, param, scoring='accuracy', cv=5)

In [None]:
# Run the ELM grid search on the training data, then display the winning parameter combination.
ELM_reg.fit(X_train, y_train)
ELM_reg.best_params_

In [None]:
# Train the ELM with fixed hyper-parameters and report test accuracy in percent.
ELM_model = ELMClassifier(
    n_neurons=60,
    alpha=10,
    ufunc='relu',
)
ELM_model.fit(X_train, y_train)
ELM_accuracy = ELM_model.score(X_test, y_test)
print('ELM Classifier Accuracy = ', ELM_accuracy * 100)

In [None]:
# Confusion matrix of the ELM model on the test set, drawn as a heatmap
# whose cells show the outcome name, the raw count and the share of all samples.
ELM_pred = ELM_model.predict(X_test)
cf_matrix = confusion_matrix(y_test, ELM_pred)

group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
cell_counts = cf_matrix.flatten()
group_counts = [format(value, '0.0f') for value in cell_counts]
group_percentages = [format(value, '.2%') for value in cell_counts / np.sum(cf_matrix)]

labels = []
for v1, v2, v3 in zip(group_names, group_counts, group_percentages):
    labels.append(f'{v1}\n{v2}\n{v3} ')
labels = np.asarray(labels).reshape(2, 2)

sns.heatmap(cf_matrix, annot=labels, fmt='')