In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the heart-attack dataset from the Kaggle input directory
DATA_PATH = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
data = pd.read_csv(DATA_PATH)

# Data

This dataset is intended for predicting a person's chance of having a heart attack.

Age : Age of the patient

Sex : Sex of the patient (male=1, female=0)

exng : exercise-induced angina (1 = yes; 0 = no)

caa : number of major vessels (0-3)

cp : chest pain type
  Value 0: typical angina
  Value 1: atypical angina
  Value 2: non-anginal pain
  Value 3: asymptomatic

trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

restecg : resting electrocardiographic results
  Value 0: normal
  Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
  Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

thalachh : maximum heart rate achieved

output : 0= less chance of heart attack 1= more chance of heart attack


In [None]:
# Preview the first 5 rows to sanity-check that the CSV loaded as expected
data.head()

In [None]:
# Inspect the dtype of every column
data.dtypes

In [None]:
# Summary statistics of the columns
data.describe()

In [None]:
#Correlation matrix
sns.set(style="white")

# Pairwise correlations between all columns, including the target `output`
corr = data.corr()

# Mask the upper triangle (diagonal included): the matrix is symmetric, so that half is redundant.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(20, 15))

cmap = sns.diverging_palette(220, 10, as_cmap=True)

ax.set_title('Correlation Matrix', fontsize=18)

# Fix the colour scale to [-1, 1] and draw on the axes created above
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1,
            square=True, annot=True, ax=ax)

plt.show()

According to the correlation matrix, every feature except fbs and chol has a correlation with the target greater than 0.1 in absolute value.

In [None]:
# Histogram of the target column to check how balanced the two classes are
data.hist(column='output')

# Chest Pain

We look at chest pain first because it has a high correlation with our target and its values are integer encoded.

In [None]:
# Number of rows for each chest pain (cp) value
sns.countplot(data=data, x="cp")

In [None]:
# Violin plot: distribution of the target (output) within each chest pain value
sns.violinplot(x = 'cp', y = 'output', data = data)

In [None]:
# Percentage of rows with output == 1 within each chest pain value (0..3)
cppercentages = []
for cp_value in (0, 1, 2, 3):
    outcomes = data[data['cp'] == cp_value]['output']
    cppercentages.append((100) * outcomes.sum() / len(outcomes))

# Keep the individual per-value names available as well
cppercent0, cppercent1, cppercent2, cppercent3 = cppercentages

In [None]:
# Bar chart of the share of output == 1 inside each cp value
# (for example, the percentage of output=1 among rows with cp=2)
cp_levels = [0, 1, 2, 3]
fig = plt.figure()
plt.bar(cp_levels, cppercentages)

According to the graphs, the chest pain values do not affect the output in an ordered (monotonic) way, so we decided to replace the integer encoding with one-hot encoding (dummies).

In [None]:
# Preview of the one-hot columns for cp; the result is only displayed, `data` is not modified here
pd.get_dummies(data['cp'], prefix='cp')

In [None]:
# Swap the integer codes of cp for readable category names in one pass
cp_names = {
    0: 'typical',
    1: 'atypical',
    2: 'non-anginal',
    3: 'asymptomatic',
}
data['cp'] = data['cp'].replace(cp_names)

In [None]:
# Check that cp now holds category names instead of integer codes
data.head()

In [None]:
#getting dummies and adding them to the dataset
# dtype='uint8' keeps the indicator columns numeric. pandas >= 2.0 defaults to bool,
# and a frame mixing bool with int/float columns becomes an object array under `.values`,
# which the modelling cells further down cannot use as-is.
cp_dummies = pd.get_dummies(data['cp'], prefix='cp', dtype='uint8')
data = pd.concat([data, cp_dummies], axis=1)
#dropping old cp (no inplace mutation, the frame is simply rebound)
data = data.drop(columns=['cp'])
#reordering features so the dummies sit where cp used to be and output stays last
data=data[['age','sex','cp_typical','cp_atypical','cp_non-anginal','cp_asymptomatic','trtbps','chol','fbs','restecg','thalachh','exng','oldpeak','slp','caa','thall','output']]

In [None]:
# Check the new cp_* indicator columns and the column order
data.head()

# Sex

In [None]:
# Number of rows per sex value
sns.countplot(data=data, x="sex")

In [None]:
# Percentage of output == 1 among rows with sex == 0 and sex == 1 respectively
female_outcomes = data[data['sex'] == 0]['output']
male_outcomes = data[data['sex'] == 1]['output']

sexpercentf = (100) * female_outcomes.sum() / len(female_outcomes)
sexpercentm = (100) * male_outcomes.sum() / len(male_outcomes)
sexpercentages = [sexpercentf, sexpercentm]

# Bar chart of the two percentages
fig = plt.figure()
plt.bar(['Female', 'Male'], sexpercentages)

# Thalachh

In [None]:
# thalachh per output class, coloured by sex
sns.catplot(x="output", y="thalachh", hue="sex", data=data)

In [None]:
# Box plot of thalachh per output class, split by sex
sns.boxplot(x="output", y="thalachh",hue='sex', data=data)

# Age

In [None]:
# Distribution of the age column
data.hist(column='age')

In [None]:
# age per output class, coloured by sex
sns.catplot(x="output", y="age", hue="sex", data=data)

# restecg

In [None]:
# Number of rows for each restecg value
sns.countplot(data=data, x="restecg")

In [None]:
# Violin plot: distribution of the target (output) within each restecg value
sns.violinplot(x = 'restecg', y = 'output', data = data)

In [None]:
# Percentage of rows with output == 1 within each restecg value (0..2)
restecgpercentages = []
for ecg_value in (0, 1, 2):
    outcomes = data[data['restecg'] == ecg_value]['output']
    restecgpercentages.append((100) * outcomes.sum() / len(outcomes))

# Keep the individual per-value names available as well
restecgpercent0, restecgpercent1, restecgpercent2 = restecgpercentages

In [None]:
# Bar chart of the share of output == 1 inside each restecg value
# (for example, the percentage of output=1 among rows with restecg=2)
restecg_levels = [0, 1, 2]
fig = plt.figure()
plt.bar(restecg_levels, restecgpercentages)

According to the visualizations, the restecg values do not show an ordered effect on the output either (just like the chest pain feature), so we decided to use dummies for this feature too.

In [None]:
# Swap the integer codes of restecg for readable category names in one pass
restecg_names = {
    0: 'normal',
    1: 'abnormaly',
    2: 'probable',
}
data['restecg'] = data['restecg'].replace(restecg_names)

In [None]:
# Check that restecg now holds category names instead of integer codes
data.head()

In [None]:
#getting dummies and adding them to the dataset
# dtype='uint8' keeps the indicator columns numeric. pandas >= 2.0 defaults to bool,
# and a frame mixing bool with int/float columns becomes an object array under `.values`,
# which the modelling cells further down cannot use as-is.
restecg_dummies = pd.get_dummies(data['restecg'], prefix='restecg', dtype='uint8')
data = pd.concat([data, restecg_dummies], axis=1)
#dropping old restecg (no inplace mutation, the frame is simply rebound)
data = data.drop(columns=['restecg'])
#reordering features so the dummies sit where restecg used to be and output stays last
data=data[['age','sex','cp_typical','cp_atypical','cp_non-anginal','cp_asymptomatic','trtbps','chol','fbs','restecg_normal','restecg_abnormaly','restecg_probable','thalachh','exng','oldpeak','slp','caa','thall','output']]

In [None]:
# Check the new restecg_* indicator columns and the column order
data.head()

In [None]:
# (rows, columns) after encoding; the column list above has 18 features plus output
data.shape

# Models

We are going to predict output, so this is a binary classification problem. The methods we will use are Logistic Regression, KNN and Decision Tree.

In [None]:
#Taking values from data
# Force a float matrix: if any indicator column is bool, plain `.values` yields an
# object array, and scikit-learn classifiers reject object-typed targets.
M = data.to_numpy(dtype=float)
# `output` is the last column after the reordering above, so slice relative to the end
# instead of hard-coding the feature count.
X = M[:, :-1]
y = M[:, -1]

In [None]:
# Shape of the feature matrix
X.shape

In [None]:
# Shape of the target vector; its length should match the number of rows in X
y.shape

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition reproducible
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Shared 10-fold splitter (shuffled, fixed seed) reused by every cross-validation cell below
from sklearn.model_selection import KFold, cross_val_score

cv = KFold(n_splits=10, shuffle=True, random_state=42)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbour counts to compare
x = list(range(5, 14))
for k in x:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # Label every score with its k so the best setting can be read off the output
    print(f"k={k}: {knn.score(X_test, y_test)}")
# NOTE(review): k is compared on the held-out test set, so the best score printed here
# is an optimistic estimate of generalisation performance.

In [None]:
# 10-fold cross-validated accuracy of KNN (k=11) on the full data, one score per fold
knncross = KNeighborsClassifier(n_neighbors=11)
knnscores = cross_val_score(
    knncross,
    X,
    y,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
print(knnscores)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Default logistic regression trained on the training split (fit returns the estimator itself)
lr = LogisticRegression().fit(X_train, y_train)
# Accuracy on the held-out test split
lr.score(X_test, y_test)

In [None]:
# 10-fold cross-validated accuracy of logistic regression on the full data, one score per fold
lrcross = LogisticRegression()
lrscores = cross_val_score(
    lrcross,
    X,
    y,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
print(lrscores)

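Some cross-validation folds score higher than the single hold-out split. Note that cross-validation is an evaluation procedure: it gives a more reliable estimate but does not by itself improve the model.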

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyperparameters and a fixed seed (fit returns the estimator itself)
dtree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
# Accuracy on the held-out test split
dtree.score(X_test, y_test)

In the first experiment Logistic Regression gave the best result. Logistic Regression usually has an advantage on highly correlated data; its main limitation is that, in its basic form, it handles only two-class outputs, which is not a problem for our dataset. The decision tree gave the second-best result. We believe a decision tree is mostly a superior model compared to KNN, although it is a more complicated model than KNN. A decision tree's result changes with its random state, and it mostly gives better results with better-tuned hyperparameters.

Now we try different hyperparameters: we use grid search for the decision tree.

In [None]:
# Hyperparameter grid for the decision-tree search
params={
    "criterion":['gini','entropy'],
    "max_depth":range(1,10),
    # An integer min_samples_split must be at least 2; starting at 1 only produces
    # failed fits that waste search time.
    "min_samples_split":range(2,10),
    "min_samples_leaf":range(1,10)

}

In [None]:
# Exhaustive search over `params` with 10-fold cross-validation on the training split
grid = GridSearchCV(estimator=dtree, param_grid=params, cv=10)
grid.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the grid search
grid.best_params_

In [None]:
# Build the tuned tree straight from the search result instead of a hand-copied set of
# values, so this cell cannot drift out of sync when the grid or the data changes.
dtree2 = DecisionTreeClassifier(random_state=42, **grid.best_params_)
dtree2.fit(X_train,y_train)
# Accuracy of the tuned tree on the held-out test split
dtree2.score(X_test, y_test)

With grid search, the performance of the decision tree classifier improved.

# Different Features

We choose to use only the features whose correlation with the target is higher than 0.30 in absolute value.

In [None]:
# Work on a copy so the full-feature frame `data` stays untouched
data2=data.copy()

In [None]:
# Keep only the selected feature columns, with the target `output` last
selected_columns = [
    'cp_typical', 'cp_atypical', 'cp_non-anginal', 'cp_asymptomatic',
    'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall',
    'output',
]
data2 = data2[selected_columns]

In [None]:
# Preview the reduced feature set
data2.head()

In [None]:
#Taking values from data
# Force a float matrix: if any indicator column is bool, plain `.values` yields an
# object array, and scikit-learn classifiers reject object-typed targets.
M2 = data2.to_numpy(dtype=float)
# `output` is the last selected column, so slice relative to the end instead of
# hard-coding the feature count.
X2 = M2[:, :-1]
y2 = M2[:, -1]

In [None]:
# Same 80/20 hold-out split and seed as for the full feature set
split_parts2 = train_test_split(X2, y2, test_size=0.20, random_state=42)
X2_train, X2_test, y2_train, y2_test = split_parts2

# KNN

In [None]:
# Candidate neighbour counts to compare on the reduced feature set
x = list(range(5, 14))
for k in x:
    knn2 = KNeighborsClassifier(n_neighbors=k)
    knn2.fit(X2_train, y2_train)
    # Label every score with its k so the best setting can be read off the output
    print(f"k={k}: {knn2.score(X2_test, y2_test)}")
# NOTE(review): k is compared on the held-out test set, so the best score printed here
# is an optimistic estimate of generalisation performance.

In [None]:
# 10-fold cross-validated accuracy of KNN (k=10) on the reduced feature set, one score per fold
knn2cross = KNeighborsClassifier(n_neighbors=10)
knn2scores = cross_val_score(
    knn2cross,
    X2,
    y2,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
print(knn2scores)

# Logistic Regression

In [None]:
# Default logistic regression on the reduced feature set (fit returns the estimator itself)
lr2 = LogisticRegression().fit(X2_train, y2_train)
# Accuracy on the held-out test split
lr2.score(X2_test, y2_test)

In [None]:
# 10-fold cross-validated accuracy of logistic regression on the reduced feature set
lr2cross = LogisticRegression()
lr2scores = cross_val_score(
    lr2cross,
    X2,
    y2,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
print(lr2scores)

# Decision Tree

In [None]:
# Baseline decision tree on the reduced feature set (fit returns the estimator itself)
dtree3 = DecisionTreeClassifier(random_state=42).fit(X2_train, y2_train)
# Accuracy on the held-out test split
dtree3.score(X2_test, y2_test)

In [None]:
# Hyperparameter grid for the decision-tree search on the reduced feature set
params={
    "criterion":['gini','entropy'],
    "max_depth":range(1,10),
    # An integer min_samples_split must be at least 2; starting at 1 only produces
    # failed fits that waste search time.
    "min_samples_split":range(2,10),
    "min_samples_leaf":range(1,10)

}

In [None]:
# Search the grid on the reduced feature set. The base estimator is `dtree3`, the tree
# built for these features, rather than `dtree`, which belongs to the full-feature experiment.
grid2 = GridSearchCV(dtree3,
                     param_grid=params,
                     cv=10)
grid2.fit(X2_train,y2_train)

In [None]:
# Best hyperparameter combination found by the second grid search
grid2.best_params_

In [None]:
# Build the tuned tree straight from the search result instead of a hand-copied set of
# values, so this cell cannot drift out of sync when the grid or the data changes.
dtree4 = DecisionTreeClassifier(random_state=42, **grid2.best_params_)
dtree4.fit(X2_train,y2_train)
# Accuracy of the tuned tree on the held-out test split
dtree4.score(X2_test, y2_test)

# Conclusion

Results of first features:

KNN(best):
0.7540983606557377

KNN_Cross(best):
0.73333333

Logistic Regression:
0.8688524590163934

Logistic Regression_Cross(best):
0.93548387

Decision Tree:
0.8360655737704918

Decision Tree(gridcv):
0.8524590163934426

Results of second features:

KNN(best):
0.7868852459016393

KNN_Cross(best):
0.80645161

Logistic Regression:
0.8688524590163934

Logistic Regression_Cross(best):
0.90322581

Decision Tree:
0.819672131147541

Decision Tree(gridcv):
0.819672131147541

According to our tests, logistic regression with cross validation gives the best result on the first feature set and the second-best result on the second feature set. Logistic regression without cross validation gives the third- and fourth-best results. This is expected, because logistic regression is very effective when the output is a two-class problem. The tests show that the decision tree comes second overall, after logistic regression. On the first feature set the decision tree gets better results with grid search, but the results do not change on the second feature set. KNN shows the worst results among these three models. According to the tests, changing the features has the largest effect on KNN, which suggests that KNN can perform better with better feature selection.