In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import mpld3

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix , roc_auc_score , recall_score, precision_score , f1_score , accuracy_score, auc

In [None]:
# Read the heart-attack dataset from the Kaggle input directory and preview
# the first rows.
heartdata = pd.read_csv(
    "../input/heart-attack-analysis-prediction-dataset/heart.csv"
)
heartdata.head()

In [None]:
# Number of rows per target class in the `output` column.
heartdata['output'].value_counts()

Let's group the ages into 10-year buckets (0-10, 10-20, and so on up to 90-100), and then check the distribution of samples across the age buckets.

In [None]:
# Labels for the ten 10-year buckets: '0-10', '10-20', ..., '90-100'.
age_bucket = [f"{start}-{start + 10}" for start in range(0, 100, 10)]
# Integer division of the age by 10 gives the position of its bucket label.
heartdata['age_bucket'] = heartdata['age'].map(lambda age: age_bucket[age // 10])
heartdata.head()

In [None]:
# Order the bars by bucket frequency (most populated bucket first) and split
# each bar by the target class.
bucket_order = heartdata['age_bucket'].value_counts().index
sns.countplot(data=heartdata, x='age_bucket', hue='output', order=bucket_order)

The dataset is imbalanced with respect to age, i.e. we don't have an equal number of samples from each age group. The following are the observations:
1. Most of the samples fall in the age range 40-70.
2. In most age ranges the number of positive cases (heart attacks) is greater than the number of negative cases.
3. In the age range 60-70, the number of positive cases is less than the number of negative cases.

Let's see the distribution between males and females.

In [None]:
# Count plot of the target class per age bucket, drawn as one panel per value
# of the `sex` column.
sns.catplot(data=heartdata, kind='count', x='age_bucket', hue='output', col='sex')

Assuming sex=0 to be female and sex=1 to be male
1. Most of the data points are for males
2. Females have a significant number of positive cases for all age groups
3. In males, most positive cases are in the age group 40-60

Dropping the extra column 'age_bucket'

In [None]:
# The bucket column was only needed for the plots above; remove it again.
heartdata = heartdata.drop(columns=['age_bucket'])

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap on a
# 10x10 inch figure.
corr_matrix = heartdata.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

Let's look at the correlation between the features. The heatmap tells us the following:
1. The output (prediction) is positively correlated with cp, thalachh, and slp, in decreasing order of strength.
2. The output (prediction) is negatively correlated with caa, thall, sex, and age, in decreasing order of strength.
3. There is some positive correlation between features such as slp & thalachh, and between age and caa, trtbps, and chol.
4. There is a negative correlation between age and thalachh.

Data processing before training a model.
1. one hot encode the categorical features
2. Standardise the numerical features

In [None]:
# One-hot encode the categorical feature columns.
cat_features=['caa','cp','restecg','slp','thall']
df=heartdata[cat_features]
# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros at transform time instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore')
encod=enc.fit(df)
# transform() returns a sparse matrix; densify it so it can be wrapped in a
# DataFrame later.
a=encod.transform(df).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(encod, 'get_feature_names_out'):
    cols=encod.get_feature_names_out(cat_features)
else:
    cols=encod.get_feature_names(cat_features)
cols

In [None]:
# Wrap the encoded array in a DataFrame labelled with the generated one-hot
# column names, then preview it.
enc_df = pd.DataFrame(data=a, columns=cols)
enc_df.head()

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
# join() aligns the two frames on their row index.
new_df = heartdata.drop(columns=cat_features).join(enc_df)
new_df.shape

In [None]:
# Standardise the numerical columns with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in the notebook, so test-set statistics
# leak into the scaled features. Consider fitting on the training rows only.
std=['age','trtbps','chol','thalachh', 'oldpeak']
# copy=False was removed: selecting columns with a list already yields a new
# DataFrame, so in-place scaling saved nothing and only obscured whether
# new_df itself was being modified.
scaler = StandardScaler()
stdpd = scaler.fit_transform(new_df[std])

In [None]:
# Put the scaled values back into a labelled frame and use it to replace the
# unscaled numerical columns of new_df.
new1_df = pd.DataFrame(data=stdpd, columns=std)
new_df = new_df.drop(columns=std).join(new1_df)
new_df.shape

Separating the labels from the data, i.e. the output column of the dataset.

In [None]:
# Keep the target column as the label vector and leave only the features in
# new_df.
labels = new_df['output']
new_df = new_df.drop(columns=['output'])

In [None]:
# Hold out 20% of the rows for testing; the dataset is small, so most samples
# are kept for training. The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    new_df,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# (rows, columns) of the training and test feature matrices.
(X_train.shape, X_test.shape)

In [None]:
# Fit a support-vector classifier. probability=True enables predict_proba,
# which the ROC analysis further down relies on.
clf_svc = SVC(random_state=42, probability=True)
clf_svc.fit(X_train, y_train)
y_pred = clf_svc.predict(X_test)
# Only the last expression of a cell is displayed, so print the test accuracy
# explicitly instead of computing it and discarding the result.
print("Test accuracy: {:.4f}".format(clf_svc.score(X_test, y_test)))
y_pred

An important step in analysing how well the model has been trained is to check for the following
1. Accuracy
2. Precision
3. Recall
4. F1score
5. AuC

We use the probabilities output by the model to draw the ROC curve and compute the area under it (AUC). The curve shows the TPR vs. FPR at various classification thresholds.

In [None]:
# Flatten the 2x2 confusion matrix into its four cells; for binary labels
# scikit-learn orders them as tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# predict_proba returns one column per class in clf_svc.classes_ order. Take
# the second column directly (the positive class, assuming 0/1 labels).
# Slicing always yields a 1-D array, whereas the previous hsplit + squeeze
# collapsed a single-row test set to a 0-d scalar.
prob = clf_svc.predict_proba(X_test)[:, 1]

In [None]:
# False-positive rate, true-positive rate and the matching decision
# thresholds, computed from the positive-class probabilities.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_test, y_score=prob)

In [None]:
# Plot the ROC curve: true-positive rate against false-positive rate.
plt.plot(fpr,tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
# The AUC must come from the predicted probabilities that generated the curve.
# Scoring the hard 0/1 predictions (y_pred) instead evaluates a single
# threshold and reports a value that does not match the plotted curve.
plt.title("ROC: AUC = {:.4f}".format(roc_auc_score(y_test,prob)))


In [None]:
# Threshold-based metrics computed from the hard predictions on the test set.
recall = recall_score(y_true=y_test, y_pred=y_pred)
prec = precision_score(y_true=y_test, y_pred=y_pred)
f1sc = f1_score(y_true=y_test, y_pred=y_pred)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

# One metric per line, in the order recall, precision, F1, accuracy.
report_template = " Recall : {}\n Precision : {}\n F1 score : {}\n Accuracy : {}"
print(report_template.format(recall, prec, f1sc, acc))

The above model performs quite well, with a precision of 93.3% and a recall of 87.5%, giving an F1 score of 90.3%.
A high F1 score in turn tells us that the model does well in terms of both recall and precision.
In a medical setting, recall is of utmost importance, since it is important not to miss an actual positive case; i.e., in this case it is more important to catch the cases with a higher probability of heart attack.