In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Age : Age of the patient

Sex : Sex of the patient

exang (column `exng` in the data): exercise induced angina (1 = yes; 0 = no)

ca: number of major vessels (0-3)

cp : Chest pain type

Value 1: typical angina
Value 2: atypical angina
Value 3: non-anginal pain
Value 4: asymptomatic
trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl, fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

rest_ecg : resting electrocardiographic results

Value 0: normal
Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
thalach (column `thalachh` in the data) : maximum heart rate achieved

target (column `output` in the data) : 0 = less chance of heart attack; 1 = more chance of heart attack

In [None]:
# Load the heart-attack dataset from the Kaggle input directory.
DATA_PATH = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
heart_data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to sanity-check the load.
heart_data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
heart_data.describe()

In [None]:
# Column dtypes and non-null counts; used to check for missing values.
heart_data.info()

### As we can see there are no null values

# Univariate Analysis

In [None]:
# Number of patients at each age in the sample.
plt.rcParams['figure.figsize'] = (20, 10)
# Pass the column by keyword: recent seaborn releases no longer accept the
# plotted variable as the first positional argument.
ax = sns.countplot(x='age', data=heart_data)
ax.set_xlabel('Age', size=18)
ax.set_ylabel('Count', size=18)
ax.set_title('Age Distribution', size=25)
plt.show()

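#### From the figure above, most patients in this sample are between roughly 55 and 60 years old, and younger ages are less represented. Note that this plot only shows how many patients of each age are in the data; on its own it does not show which ages are more prone to heart attack.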

In [None]:
# Share of each sex code in the sample (1 = men, 0 = women).
heart_data['sex'].value_counts(normalize=True)

In [None]:
# Number of patients per sex code.
plt.rcParams['figure.figsize'] = (8, 8)
# Keyword arguments are required by recent seaborn releases; a positional
# Series is no longer interpreted as the x variable.
ax = sns.countplot(x='sex', data=heart_data)
ax.set_xlabel('Sex', size=18)
ax.set_ylabel('Count', size=18)
ax.set_title('Sex Distribution', size=25)
plt.show()

In [None]:
# Number of patients per chest-pain type code.
plt.rcParams['figure.figsize'] = (8, 8)
# Keyword arguments are required by recent seaborn releases; a positional
# Series is no longer interpreted as the x variable.
ax = sns.countplot(x='cp', data=heart_data)
ax.set_xlabel('Chest Pain Type', size=18)
ax.set_ylabel('Count', size=18)
ax.set_title('Type of chest pain Distribution', size=25)
plt.show()

In [None]:
# trtbps: resting blood pressure (in mm Hg)
plt.rcParams['figure.figsize'] = (20, 10)
resting_bp = heart_data['trtbps']
# Equal-width 10 mm Hg bins derived from the data. The previous hand-written
# edges had unequal widths (so bar heights were not comparable) and stopped
# at 190, which would silently drop any higher reading.
bin_edges = np.arange(resting_bp.min() // 10 * 10, resting_bp.max() + 10, 10)
plt.hist(resting_bp, color=['pink'], bins=bin_edges)
plt.xlabel('trtbps', size=18)
plt.ylabel('Count', size=18)
plt.title('trtbps Distribution', size=25)
plt.show()

In [None]:
# chol: serum cholesterol in mg/dl -- histogram with matplotlib's default binning.
plt.rcParams['figure.figsize'] = (20, 10)
cholesterol = heart_data['chol']
plt.hist(cholesterol, color=['pink'])
plt.xlabel('Cholestoral level', fontsize=18)
plt.ylabel('Count', fontsize=18)
plt.title('cholestoral Distribution', fontsize=25)

In [None]:
# fbs: fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
plt.rcParams['figure.figsize'] = (8, 8)
# Keyword arguments are required by recent seaborn releases; a positional
# Series is no longer interpreted as the x variable.
ax = sns.countplot(x='fbs', data=heart_data)
ax.set_xlabel('fbs', size=18)
ax.set_ylabel('Count', size=18)
ax.set_title('fasting blood sugar Distribution', size=25)
plt.show()

In [None]:
# restecg: resting electrocardiographic results
#   0: normal
#   1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
#   2: showing probable or definite left ventricular hypertrophy by Estes' criteria
plt.rcParams['figure.figsize'] = (8, 8)
# Keyword arguments are required by recent seaborn releases; a positional
# Series is no longer interpreted as the x variable.
ax = sns.countplot(x='restecg', data=heart_data)
ax.set_xlabel('rest_ecg', size=18)
ax.set_ylabel('Count', size=18)
ax.set_title('rest ecg Distribution', size=25)
plt.show()

In [None]:
# thalachh: maximum heart rate achieved -- histogram with default binning.
plt.rcParams['figure.figsize'] = (20, 10)
max_heart_rate = heart_data['thalachh']
plt.hist(max_heart_rate, color=['pink'])
plt.xlabel('Thalachh', fontsize=18)
plt.ylabel('Count', fontsize=18)
plt.title('Thalachh Distribution', fontsize=25)

In [None]:
# exng: exercise induced angina (1 = yes; 0 = no)
plt.rcParams['figure.figsize'] = (8, 8)
# Keyword arguments are required by recent seaborn releases; a positional
# Series is no longer interpreted as the x variable.
ax = sns.countplot(x='exng', data=heart_data)
ax.set_xlabel('exng', size=18)
ax.set_ylabel('Count', size=18)
ax.set_title('exng Distribution', size=25)
plt.show()

In [None]:
# Distribution of the `oldpeak` column, using matplotlib's default binning.
plt.rcParams['figure.figsize'] = (20, 10)
oldpeak_values = heart_data['oldpeak']
plt.hist(oldpeak_values, color=['pink'])
plt.xlabel('Oldpeak', fontsize=18)
plt.ylabel('Count', fontsize=18)
plt.title('oldpeak Distribution', fontsize=25)

In [None]:
# slp: slope of the peak exercise ST segment (2 = upsloping; 1 = flat; 0 = downsloping)
plt.rcParams['figure.figsize'] = (8, 8)
# Keyword arguments are required by recent seaborn releases; a positional
# Series is no longer interpreted as the x variable.
ax = sns.countplot(x='slp', data=heart_data)
ax.set_xlabel('slope', size=18)
ax.set_ylabel('Count', size=18)
ax.set_title('Slope', size=25)
plt.show()

In [None]:
# caa: number of major vessels (0-4) colored by fluoroscopy
plt.rcParams['figure.figsize'] = (8, 8)
# Keyword arguments are required by recent seaborn releases; a positional
# Series is no longer interpreted as the x variable.
ax = sns.countplot(x='caa', data=heart_data)
ax.set_xlabel('no of vessels', size=18)
ax.set_ylabel('Count', size=18)
ax.set_title('No of vessels', size=25)
plt.show()

In [None]:
# thall: 2 = normal; 1 = fixed defect; 3 = reversible defect
plt.rcParams['figure.figsize'] = (8, 8)
# Keyword arguments are required by recent seaborn releases; a positional
# Series is no longer interpreted as the x variable.
ax = sns.countplot(x='thall', data=heart_data)
ax.set_xlabel('Thall', size=18)
ax.set_ylabel('Count', size=18)
ax.set_title('Thall distribution', size=25)
plt.show()

In [None]:
# Bivariate view: count of each `output` class, split by every categorical
# feature. One panel per (feature, title) pair.
panels = [
    ('sex', 'Prevalence of Heart attack by Sex'),
    ('cp', 'Prevalence of Heart attack by Chest Pain'),
    ('fbs', 'Prevalence of Heart attack by fasting blood sugar > 120 mg/dl'),
    ('restecg', 'Prevalence of Heart attack by restecg'),
    ('exng', 'Prevalence of Heart attack by Exercise induced angina'),
    ('slp', 'Prevalence of Heart attack by slp'),
    ('caa', 'Prevalence of Heart attack by number of major vessels'),
    ('thall', 'Prevalence of Heart attack by thall'),
]

# 4 x 2 grid: exactly one axes per panel (the previous 5 x 2 layout left an
# empty bottom row). Height is scaled to keep the same per-row size.
fig, axes = plt.subplots(4, 2, figsize=(20, 24))
for ax, (feature, title) in zip(axes.flat, panels):
    # Keyword arguments are required by recent seaborn releases; positional
    # Series are no longer interpreted as the x variable.
    sns.countplot(x='output', hue=feature, data=heart_data, ax=ax)
    ax.set_title(title, fontsize=15)
plt.show()



In [None]:
# `output` encodes whether the person had a heart attack (1) or not (0).
# Build an age x output count table, convert each row to shares, and draw
# one stacked bar per age.
data = pd.crosstab(index=heart_data['age'], columns=heart_data['output'])
row_totals = data.sum(axis=1).astype(float)
share_by_age = data.div(row_totals, axis=0)
share_by_age.plot(kind='bar', stacked=True, figsize=(20, 10), color=['blue', 'pink'])

plt.title('Prevalence of heart attack by age', fontsize=30)
plt.xlabel('Age', fontsize=15)
plt.legend()
plt.show()

In [None]:
# `output` encodes whether the person had a heart attack (1) or not (0).
# trtbps: resting blood pressure (in mm Hg).
# Count table of trtbps x output, normalised per row, as stacked bars.
data = pd.crosstab(index=heart_data['trtbps'], columns=heart_data['output'])
row_totals = data.sum(axis=1).astype(float)
share_by_bp = data.div(row_totals, axis=0)
share_by_bp.plot(kind='bar', stacked=True, figsize=(20, 10), color=['blue', 'pink'])

plt.title('Prevalence of heart attack by resting blood pressure', fontsize=30)
plt.xlabel('trtbps', fontsize=15)
plt.legend()
plt.show()

In [None]:
# Share of each `output` class across cholesterol ranges.
# The previous kind='hist' plot histogrammed the per-value proportions
# themselves, so its x-axis was a proportion rather than cholesterol. Here
# cholesterol is cut into 10 equal-width intervals and each interval is drawn
# as one stacked bar, matching the age and trtbps charts.
chol_bins = pd.cut(heart_data['chol'], bins=10)
data = pd.crosstab(chol_bins, heart_data['output'])
data.div(data.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True, figsize=(20,10), color=['blue','pink'])

plt.title('Prevalence of heart attack by cholestrol', fontsize = 30)
plt.xlabel('cholestrol', fontsize = 15)
plt.legend()
plt.show()

In [None]:
# thalachh: maximum heart rate achieved.
# Share of each `output` class across heart-rate ranges. The previous
# kind='hist' plot histogrammed the per-value proportions themselves, so its
# x-axis was a proportion rather than heart rate. Here the feature is cut into
# 10 equal-width intervals and each interval is drawn as one stacked bar.
thalachh_bins = pd.cut(heart_data['thalachh'], bins=10)
data = pd.crosstab(thalachh_bins, heart_data['output'])
data.div(data.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True, figsize=(20,10), color=['blue','pink'])

plt.title('Prevalence of heart attack by maximum heart rate achieved', fontsize = 30)
plt.xlabel('thalachh', fontsize = 15)
plt.legend()
plt.show()

In [None]:
# Share of each `output` class across oldpeak ranges.
# The previous kind='hist' plot histogrammed the per-value proportions
# themselves rather than oldpeak, and the x-axis was mislabelled 'thalachh'
# (copy-paste from the cell above). Here oldpeak is cut into 10 equal-width
# intervals and each interval is drawn as one stacked bar.
oldpeak_bins = pd.cut(heart_data['oldpeak'], bins=10)
data = pd.crosstab(oldpeak_bins, heart_data['output'])
data.div(data.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True, figsize=(20,10), color=['blue','pink'])

plt.title('Prevalence of heart attack by oldpeak', fontsize = 30)
plt.xlabel('oldpeak', fontsize = 15)
plt.legend()
plt.show()

In [None]:
# Pairwise Pearson correlation between all columns (pandas' default method).
corr_matrix = heart_data.corr(method='pearson')
corr_matrix

In [None]:
# Annotated heatmap of the correlation matrix computed above.
heatmap_size = (20, 10)
plt.figure(figsize=heatmap_size)
sns.heatmap(data=corr_matrix, annot=True)

## So far we have established a few important points.

### 1 - As resting blood pressure increases, so too does the risk of heart disease

### 2 - Rising cholesterol does not appear to be a major indicator

### 3 - A low max heart rate achieved is a big warning sign.

### 4 - Risk of heart disease increases with age

### 5 - In general, 40 to 58 aged people have more chance of heart attack.

### 6- Risk increases for Chest Pain - Non-anginal type

### 7- Highest risk is when the number of major vessels is 0

### 9- Thallium Stress Test Result - 2

### 10- Old peak - between 0-0.5

In [None]:
# Rescale selected features by dividing each by its own maximum, so every
# listed column ends up with a maximum of 1 (assuming its maximum is positive).
# Series.max() is used instead of Python's builtin max(), which iterates
# element by element and is unreliable when NaNs are present.
# NOTE(review): the maxima are taken over the full dataset before the
# train/test split below, so test rows influence the scaling; the remaining
# feature columns are left on their original scale.
columns_to_scale = ['age', 'trtbps', 'cp', 'chol', 'thalachh']
for column in columns_to_scale:
    heart_data[column] = heart_data[column] / heart_data[column].max()

In [None]:
# Re-check the summary statistics after the rescaling step in the previous cell.
heart_data.describe()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class balance, and fix the seed for reproducibility.
features = heart_data.drop(columns=['output'])
labels = heart_data['output']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
    stratify=labels,
)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# The grid searches both L1 (lasso) and L2 (ridge) penalties. The default
# 'lbfgs' solver does not support 'l1', so every l1 candidate would fail to
# fit; 'liblinear' supports both. random_state pins liblinear's internal
# shuffling so the search is reproducible.
clf = LogisticRegression(solver='liblinear', random_state=0)
params = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}

# 5-fold cross-validated search over regularisation strength and penalty type.
clf_grid = GridSearchCV(estimator=clf, param_grid=params, cv=5)
clf_grid.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score

# Score the tuned logistic-regression search on the held-out test split.
y_pred = clf_grid.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy is: ', acc)

In [None]:
# RandomForest Classifier: 10 entropy-criterion trees with a fixed seed.
from sklearn.ensemble import RandomForestClassifier

forest_settings = dict(n_estimators=10, criterion='entropy', random_state=0)
classifier = RandomForestClassifier(**forest_settings)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Score the fitted `classifier` on the held-out test split.
y_pred2 = classifier.predict(X_test)
acc2 = accuracy_score(y_true=y_test, y_pred=y_pred2)
print('Accuracy is: ', acc2)

In [None]:
# K-Nearest Neighbours with a cross-validated hyper-parameter search.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

classifier = KNeighborsClassifier()

# GridSearchCV treats each entry as an explicit list of candidate values.
# The previous tuples (1, 10, 1) and (20, 40, 1) looked like
# (start, stop, step) but were searched literally as the values 1, 10, 1 and
# 20, 40, 1. Real ranges are spelled out here; both still include the
# endpoints that were tried before.
# NOTE(review): leaf_size uses a step of 5 to keep the grid small; widen it
# if a finer sweep is wanted.
params1 = {
    'n_neighbors': list(range(1, 11)),
    'leaf_size': list(range(20, 41, 5)),
    'p': [1, 2],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'chebyshev'],
}
clf2_grid = GridSearchCV(estimator=classifier, param_grid=params1, cv=5)
clf2_grid.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Score the tuned KNN search on the held-out test split.
y_pred3 = clf2_grid.predict(X_test)
acc3 = accuracy_score(y_true=y_test, y_pred=y_pred3)
print('Accuracy is: ', acc3)