In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pylab import rcParams
# Suppress every warning for the rest of the session. No category is given,
# so this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
sns.set_style("darkgrid")

In [None]:
# Load the heart-attack dataset into `data`. The relative path points at a
# Kaggle-style `../input/` directory, so the notebook must be run from a
# working directory where that path exists.
data = pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')

# Data Exploration

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data.columns

In [None]:
data.isna().sum()

In [None]:
data['age'].nunique()

**Number of Unique Values for Each Column**

In [None]:
# Print "<column>: <number of distinct values>" for every column, one per line.
for col, n_unique in data.nunique().items():
    print(col + ': ' + str(n_unique))

**Create Copied Data for Visualization**

In [None]:
# Build a labelled copy of the data for plotting; the numeric `data` frame is
# left untouched so it can still be fed to the models later on.
label_maps = {
    # The original UCI heart-disease coding is 1 = male, 0 = female.
    'sex': {0: 'Female', 1: 'Male'},
    'exng': {0: 'False', 1: 'True'},
    # NOTE(review): the label order for `cp` is taken as-is from the previous
    # revision of this cell -- confirm it against the dataset documentation.
    'cp': {0: 'typical angina', 1: 'atypical angina', 2: 'non-anginal pain', 3: 'asymptomatic'},
    'fbs': {0: 'False', 1: 'True'},
    'output': {0: 'Less Chance', 1: 'More Chance'},
}

c_data = data.copy()
for column, mapping in label_maps.items():
    c_data[column] = c_data[column].map(mapping)

c_data.head()

# **Visualization**

**Visualize Categorical Columns (Counts Split by Output)**

In [None]:
def graph(name, u, title):
    """Draw a count plot of one categorical column of ``c_data``, split by ``output``.

    Parameters
    ----------
    name : str
        Column of the module-level ``c_data`` frame to count.
    u : matplotlib.axes.Axes
        Axes to draw on.
    title : str
        Title shown above the panel.

    Every bar is annotated with its integer count, placed slightly above the bar.
    """
    sns.countplot(x=c_data[name], hue=c_data['output'], ax=u)

    plt.setp(u.get_xticklabels(), rotation=0)
    u.set_title(title, fontsize=11, fontdict={"fontweight": "bold"})

    for p in u.patches:
        height = p.get_height()
        # int(nan) raises ValueError, so patches without a finite height are
        # skipped. Zero-width patches are skipped as well because they are not
        # visible bars. NOTE(review): some seaborn versions appear to add such
        # placeholder rectangles for the legend -- confirm.
        if not np.isfinite(height) or p.get_width() == 0:
            continue
        u.annotate(str(int(height)), (p.get_x() + p.get_width() / 2, height + 3),
                   ha="center", va='center', fontsize=10, fontweight="bold")

# 4x2 grid: one count plot per categorical feature, filled row by row.
fig2, ax2 = plt.subplots(4, 2, figsize=(15, 15),
                         gridspec_kw={"wspace": 0.4, "hspace": 0.3, "top": 0.95})

colors = ["#ff0000", "#ff8000", "#ffff00", "#80ff00", "#00ff00", "#00ff80",
          "#00ffff", "#0080ff", "#0000ff", "#8000ff", "#ff00ff", "#ff0080"]

# (column, panel title) pairs in the order they fill the grid.
categorical_panels = [
    ("sex", 'sex'),
    ("exng", 'Exercise induced angina'),
    ("cp", 'Chest Pain Type'),
    ("fbs", 'Fasting Blood Sugar > 120 mg/dl'),
    ('restecg', 'Resting Electrocardiographic Results'),
    ('caa', 'Number of Major Vessels'),
    ('slp', 'Slope'),
    ('thall', 'Thal Rate'),
]
# `ax2.flat` walks the axes grid in row-major order.
for panel_ax, (column, panel_title) in zip(ax2.flat, categorical_panels):
    graph(column, panel_ax, panel_title)

plt.rcParams['axes.axisbelow'] = True

**Age Visualization**

In [None]:
# Two stacked age histograms (10-year bins from 10 to 80), one per outcome label.
fig, ax = plt.subplots(2, 1, figsize=(12, 10))

# (value of c_data['output'], bar colour, panel title)
age_panels = [
    ('More Chance', 'red', 'More Chance of Heart Attack'),
    ('Less Chance', 'blue', 'Less Chance of Heart Attack'),
]

for panel_ax, (label, color, panel_title) in zip(ax, age_panels):
    ages = c_data.loc[c_data['output'] == label, 'age']
    # `binwidth` takes precedence over `bins`, so only binwidth/binrange are given.
    sns.histplot(ages, binwidth=10, binrange=(10, 80), color=color, ax=panel_ax)
    for p in panel_ax.patches:
        # Bar heights are counts, so label them as whole numbers (as `graph` does).
        panel_ax.annotate(format(p.get_height(), '.0f'),
                          (p.get_x() + p.get_width() / 2., p.get_height() - 1),
                          ha='center', va='center',
                          xytext=(0, 10), textcoords='offset points')
    panel_ax.set_title(panel_title, fontsize=11, fontdict={"fontweight": "bold"})

plt.rcParams['axes.axisbelow'] = True

The graphs above show that, in absolute numbers, patients in their 50s have the most 'output = 1' records, which indicate a higher chance of heart attack. In percentage terms, however, patients in their 40s have a higher share of 'output = 1' records than any other age group: around 70%, whereas in the other age groups fewer than 50% of the records are 'output = 1'.

**Visualization of the Continuous Features**

In [None]:
data.head()

In [None]:
# Subset of `data` holding the five columns treated as continuous features
# in the correlation heatmap and density plots below.
con_data = data[['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']]

In [None]:
# Correlation heatmap of the continuous features. The upper triangle
# (including the diagonal) is masked because the matrix is symmetric.
plt.figure(figsize=(15,8))
con_corr = con_data.corr()
# Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20
# and removed in 1.24.
mask = np.triu(np.ones_like(con_corr, dtype=bool))
sns.heatmap(data=con_corr, annot=True, cmap='BrBG', mask=mask)

In [None]:
def graph1(name, u, title):
    """Draw filled density curves of one continuous feature, one curve per outcome.

    Parameters
    ----------
    name : str
        Column of the module-level ``con_data`` frame to plot.
    u : matplotlib.axes.Axes
        Axes to draw on.
    title : str
        Title shown above the panel.

    The curves are coloured by the numeric ``data['output']`` column.
    """
    # `fill=` replaces the deprecated `shade=` keyword of kdeplot.
    sns.kdeplot(x=con_data[name], hue=data['output'], ax=u, fill=True, palette=['#2271b1','#68de7c'])
    u.set_title(title, fontsize=11, fontdict={"fontweight": "bold"})
    

# 3x2 grid: one density plot per continuous feature; the last cell stays empty.
fig2, ax2 = plt.subplots(3, 2, figsize=(15, 15),
                         gridspec_kw={"wspace": 0.4, "hspace": 0.3, "top": 0.95})

colors = ["#ff0000", "#ff8000", "#ffff00", "#80ff00", "#00ff00", "#00ff80",
          "#00ffff", "#0080ff", "#0000ff", "#8000ff", "#ff00ff", "#ff0080"]

# (column, panel title) pairs in the order they fill the grid.
continuous_panels = [
    ("age", 'Age'),
    ("trtbps", 'Resting Blood Pressure'),
    ("chol", 'Cholestoral in mg/dl fetched via BMI sensor'),
    ("thalachh", 'Thal Rate'),
    ('oldpeak', 'Previous Peak'),
]
# zip stops after the five listed features, so ax2[2, 1] is never drawn on.
for panel_ax, (column, panel_title) in zip(ax2.flat, continuous_panels):
    graph1(column, panel_ax, panel_title)


plt.rcParams['axes.axisbelow'] = True

**Interpretation: When the green and blue curves are almost the same, the feature does not separate the outcomes. The larger the difference between the two curves, the more important the feature.**

# **Data Preprocessing**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Feature matrix: every column except the target.
X = data.drop('output', axis=1)
# Target vector.
y = data['output']

# Standardize the features.
# Note: `sc` is fitted on every row of X here; no train/test separation
# has happened at this point.
sc = StandardScaler()
scaled_X = sc.fit_transform(X)

In [None]:
y.head()

In [None]:
# Split first, then fit the scaler on the training rows only, so the test
# rows' mean/variance never leak into the preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train_raw)  # learn mean/std from the training rows
X_test = sc.transform(X_test_raw)        # reuse the training statistics

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# **Machine Learning**

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score
from sklearn import metrics

In [None]:
# Baseline 1: logistic regression.
model1 = LogisticRegression(max_iter=1000)
model1.fit(X_train, y_train)
pred1 = model1.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); swapping them exchanges
# precision and recall in the classification report.
acc1 = accuracy_score(y_test, pred1)
print(classification_report(y_test, pred1))
print(acc1)

In [None]:
# Baseline 2: Gaussian naive Bayes.
model2 = GaussianNB()
model2.fit(X_train, y_train)
pred2 = model2.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); swapping them exchanges
# precision and recall in the classification report.
acc2 = accuracy_score(y_test, pred2)
print(classification_report(y_test, pred2))
print(acc2)

In [None]:
# Baseline 3: k-nearest neighbours with default settings.
model3 = KNeighborsClassifier()
model3.fit(X_train, y_train)
pred3 = model3.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); swapping them exchanges
# precision and recall in the classification report.
acc3 = accuracy_score(y_test, pred3)
print(classification_report(y_test, pred3))
print(acc3)

In [None]:
# Baseline 4: decision tree with limited depth and a minimum leaf size.
# A fixed random_state keeps the fitted tree reproducible across runs.
model4 = DecisionTreeClassifier(max_depth=10, min_samples_leaf=15, random_state=42)
model4.fit(X_train, y_train)
pred4 = model4.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); swapping them exchanges
# precision and recall in the classification report.
acc4 = accuracy_score(y_test, pred4)
print(classification_report(y_test, pred4))
print(acc4)

In [None]:
# Baseline 5: random forest. Without a fixed random_state the bootstrap
# samples differ on every run, and so does the reported accuracy.
model5 = RandomForestClassifier(random_state=42)
model5.fit(X_train, y_train)
pred5 = model5.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); swapping them exchanges
# precision and recall in the classification report.
acc5 = accuracy_score(y_test, pred5)
print(classification_report(y_test, pred5))
print(acc5)

In [None]:
# Baseline 6: support vector classifier with default settings.
model6 = SVC()
model6.fit(X_train, y_train)
pred6 = model6.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); swapping them exchanges
# precision and recall in the classification report.
acc6 = accuracy_score(y_test, pred6)
print(classification_report(y_test, pred6))
print(acc6)

In [None]:
# Baseline 7: gradient-boosted trees (XGBoost) with default settings.
model7 = XGBClassifier()
model7.fit(X_train, y_train)
pred7 = model7.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); swapping them exchanges
# precision and recall in the classification report.
acc7 = accuracy_score(y_test, pred7)
print(classification_report(y_test, pred7))
print(acc7)

# **Model Comparison Table**

In [None]:
# One (model name, test accuracy) row per classifier fitted above.
model_scores = [
    ('Logistic Regression', acc1),
    ('Naive Bayes', acc2),
    ('KNN', acc3),
    ('Decision Tree', acc4),
    ('Random Forest Tree', acc5),
    ('SVC', acc6),
    ('XGB', acc7),
]
acc_table = pd.DataFrame(model_scores, columns=['Model', 'Accuracy Score'])
# Best model first; the gradient shades higher accuracies darker.
acc_table = acc_table.sort_values(by='Accuracy Score', ascending=False)
acc_table.style.background_gradient(cmap='Blues')

# **Best Model Parameter Tuning (KNN)**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV


# 10-fold stratified cross-validation, repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

knn = KNeighborsClassifier()

# Hyper-parameter grid explored exhaustively by GridSearchCV.
space = {
    'n_neighbors': [4, 5, 6, 7, 8, 10],
    'weights': ['uniform', 'distance'],
    'leaf_size': [10, 20, 30, 40, 50],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

search = GridSearchCV(estimator=knn, param_grid=space, scoring='accuracy', n_jobs=-1, cv=cv)
# `fit` returns the fitted search object, so `result` and `search` are the same object.
result = search.fit(X_train, y_train)
print('Best Score: %s' % result.best_score_)
print('Best HyperParameters: %s' % result.best_params_)

In [None]:
# Refit KNN with hand-picked hyper-parameters.
# NOTE(review): the values are hard-coded rather than read from
# `result.best_params_` -- presumably copied from a previous grid-search run; confirm.
model3 = KNeighborsClassifier(n_neighbors=7, leaf_size=10)
model3.fit(X_train, y_train)
pred3 = model3.predict(X_test)
# scikit-learn metrics take (y_true, y_pred); swapping them exchanges
# precision and recall in the classification report.
acc3 = accuracy_score(y_test, pred3)
print(classification_report(y_test, pred3))
print(acc3)

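The accuracy on the test set did not change at all. The best score reported by GridSearchCV is lower than the test accuracy of the default KNN classifier, but the two numbers are not directly comparable: the GridSearchCV score is the mean accuracy over the cross-validation folds of the training data, whereas the accuracy above is measured once on the held-out test split.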

# **ROC Curve of KNN Model**

In [None]:
# ROC analysis needs continuous scores, so use the predicted probability of
# the positive class instead of the hard 0/1 predictions. The printed AUC
# then matches the plotted curve.
proba3 = model3.predict_proba(X_test)[:, 1]
fpr3, tpr3, _ = roc_curve(y_test, proba3)
auc3 = roc_auc_score(y_test, proba3)

# Plot manually with roc_curve: `metrics.plot_roc_curve` was removed in
# scikit-learn 1.2.
fig_roc, ax_roc = plt.subplots()
ax_roc.plot(fpr3, tpr3, label='KNN (AUC = %.2f)' % auc3)
ax_roc.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve (KNN)')
ax_roc.legend(loc='lower right')
print('roc_auc_score is: ', auc3)

# **KNN Visualization**

Need to find out which features are correlated the most

In [None]:
# Correlation heatmap of all columns, including the target. The upper
# triangle (including the diagonal) is masked because the matrix is symmetric.
plt.figure(figsize=(15,8))
full_corr = data.corr()
# Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20
# and removed in 1.24.
mask = np.triu(np.ones_like(full_corr, dtype=bool))
sns.heatmap(data=full_corr, annot=True, cmap='BrBG', mask=mask)

* Feature-to-feature correlations - a high absolute value indicates that the two features carry similar information, so the lower the value, the better.
* Feature-to-outcome correlations - a high absolute value indicates that the feature is important for predicting the outcome.

Chose feature cp and oldpeak because feature to feature correlation is -0.15 which is low and both of their feature to outcome correlations are high (0.43, -0.43)

In [None]:
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score, classification_report
# filter warnings
warnings.filterwarnings("ignore")

def accuracy(k, X_train, y_train, X_test, y_test):
    """Fit a k-nearest-neighbours classifier and return its accuracy on the test data.

    ``k`` is the number of neighbours; the model is fitted on
    ``(X_train, y_train)`` and scored against ``(X_test, y_test)``.
    """
    # `fit` returns the estimator itself, so construction and fitting can be chained.
    fitted_knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    test_predictions = fitted_knn.predict(X_test)
    return accuracy_score(y_test, test_predictions)

def classify_and_plot(X, y, n_neighbors=7, feature_names=None):
    '''
    Split the data, fit a KNN classifier, plot its decision regions and
    print a classification report plus the test accuracy.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; columns 0 and 1 are used as the plot's x and y axes,
        so it is expected to contain exactly two features.
    y : array-like
        Class labels (0/1), one per row of X.
    n_neighbors : int, optional
        Number of neighbours for the classifier (default 7).
    feature_names : sequence of two str, optional
        Axis labels for the plot; no labels are set when omitted.
    '''
    # split data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    h = .02  # step size in the mesh

    # Light colours for the decision regions, bold colours for the samples
    cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#0000FF'])

    clf = KNeighborsClassifier(n_neighbors)
    clf.fit(X_train, y_train)

    # Plot the decision boundary: classify every point of a regular mesh
    # covering [x_min, x_max] x [y_min, y_max] and colour it by predicted class.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Size this figure locally instead of changing the global rcParams, so
    # figures created elsewhere keep their own default size.
    plt.figure(figsize=(5, 5))
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Overlay every sample (training and test rows alike), coloured by true label
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    if feature_names is not None:
        plt.xlabel(feature_names[0])
        plt.ylabel(feature_names[1])
    plt.title("0/1 outcome classification (k = %i)" % (n_neighbors))
    plt.show()

    # evaluate on the held-out split
    y_expected = y_test
    y_predicted = clf.predict(X_test)

    # print results
    print('----------------------------------------------------------------------')
    print('Classification report')
    print('----------------------------------------------------------------------')
    print('\n', classification_report(y_expected, y_predicted))
    print('----------------------------------------------------------------------')
    # Score the predictions already computed instead of fitting a second,
    # identical classifier.
    print('Accuracy = %5s' % round(accuracy_score(y_expected, y_predicted), 3))
    print('----------------------------------------------------------------------')

# we only take the best two features and prepare them for the KNN classifier
rows_nbr = 303 # data.shape[0]
X_prime  = np.array(data.iloc[:rows_nbr, [2,9]])
X        = X_prime # preprocessing.scale(X_prime)
y        = np.array(data.iloc[:rows_nbr, 13])

# classify, evaluate and plot results
classify_and_plot(X, y)

**KNN classification with only two features gives an accuracy of about 80%, which is pretty good.**