In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
heart = pd.read_csv('../input/heart-disease-uci/heart.csv')

Attribute Information:
> 1. age
> 2. sex (1 = male; 0 = female)
> 3. chest pain type (4 values)
> 4. resting blood pressure
> 5. serum cholesterol in mg/dl
> 6. fasting blood sugar > 120 mg/dl
> 7. resting electrocardiographic results (values 0,1,2)
> 8. maximum heart rate achieved
> 9. exercise induced angina
> 10. oldpeak = ST depression induced by exercise relative to rest
> 11. the slope of the peak exercise ST segment
> 12. number of major vessels (0-3) colored by fluoroscopy
> 13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
> 14. target (1=yes, 0=no)

# Data exploration

In [None]:
heart.info()

In [None]:
heart.describe()

In [None]:
heart.sample(5)

Check for any null values

In [None]:
heart.isnull().sum()

In [None]:
heart.isnull().any(axis=1).sum()

In [None]:
heart.groupby('target').mean()

In [None]:
plt.figure(figsize=(15,10))
sns.heatmap(heart.corr(), cmap='Spectral', annot=True)
plt.show()

Difference of correlation coefficients in Male and Female

In [None]:
# |correlation with target| for females (sex == 0) minus the same for males (sex == 1);
# a positive entry means the feature is more strongly correlated with target among females.
abs(heart[heart['sex']==0].corr()['target'].drop(labels=['sex'])) - abs(heart[heart['sex']==1].corr()['target'].drop(labels=['sex']))

In [None]:
fig, ax = plt.subplots(figsize=(15,7))
# One line per subgroup, drawn in a fixed order (female, male, everyone).
subsets = [
    ('Female', heart[heart['sex']==0]),
    ('Male', heart[heart['sex']==1]),
    ('Both', heart),
]
for subset_label, subset in subsets:
    # Correlation of every remaining feature with the target within this subgroup.
    target_corr = subset.corr()['target'].drop(labels=['sex', 'target'])
    ax.plot(target_corr, label=subset_label)
ax.set_title('Correlation of heart disease with various parameters')
ax.legend()

## Observations:
1. Age is negatively correlated with heart disease. As older people are more likely to get heart disease, they are likely to go for health check-up even when they have mild or no symptoms. Young people only go for health check-up when they have clear symptoms, so they are more likely to be diagnosed with having heart disease.
2. cholesterol level, fasting blood glucose have negligible correlation with heart disease.
3. Chest pain(cp), maximum heart rate(thalach), slope of ST segment in ECG are positively correlated with heart disease.
4. exercise induced angina(exang), oldpeak(ST depression induced by exercise), number of major vessels (0-3) colored by fluoroscopy(ca), and thal are negatively correlated with heart disease. For all of these, the correlation is weaker for males than for females.
5. trestbps and fbs are negatively correlated for females compared to Males.

In [None]:
heart.groupby(['sex','target']).count()

In [None]:
heart['age range'] = pd.cut(heart['age'], bins=[0, 40, 50, 60, 70, 100])

In [None]:
# Name the column via x= instead of passing the Series positionally: recent
# seaborn treats the first positional argument as `data`, which clashes with data=heart.
sns.countplot(x='age range', hue='target', data=heart)

In [None]:
heart.groupby(['age range', 'sex', 'target'])['age'].count()

In [None]:
heart.groupby(['age range', 'target', 'sex'])['age'].count()

Females are more likely to be diagnosed with heart disease in all age groups.

In [None]:
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

In [None]:
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

In [None]:
# For each numerical feature: one panel per sex, with the two target classes overlaid.
for feature in numerical_features:
    grid = sns.FacetGrid(heart, col='sex', hue='target', height=5)
    grid.map(sns.distplot, feature)

In [None]:
fig, axes = plt.subplots(2, 4, figsize=(16,8))
# One panel per categorical feature, split by target. Columns are referenced by
# keyword (x=/hue=/data=) because recent seaborn rejects a positional data vector.
for feature, ax in zip(categorical_features, axes.ravel()):
    sns.countplot(x=feature, hue='target', data=heart, ax=ax)
    ax.set_xlabel(feature)
plt.tight_layout()

In [None]:
# Build a new list rather than aliasing numerical_features: appending to the alias
# would also add 'target' to numerical_features (and again on every re-run of this cell).
pp = numerical_features + ['target']
pp

In [None]:
sns.pairplot(heart.loc[:, pp], hue='target')

# Logistic Regression

In [None]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

In [None]:
heart.head(3)

In [None]:
score_mean = {}
score_max = {}

In [None]:
def process(dataframe, rand):
    """Split ``dataframe`` into train/test sets and encode/scale the features.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'target' column and the 'cp', 'slope' and 'thal' columns.
        The 'age range' helper column is dropped when present.
    rand : int
        Seed passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        ``X_*`` are the transformed feature arrays: 'cp', 'slope' and 'thal' are
        one-hot encoded and every other column is min-max scaled. The transformer
        is fitted on the training split only and then applied to the test split.
    """
    y = dataframe['target']
    # errors='ignore' lets the function run on frames without the 'age range' column.
    X = dataframe.drop(columns=['target', 'age range'], errors='ignore')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = rand)
    # The dense-output flag was renamed from `sparse` to `sparse_output` in newer
    # scikit-learn releases; try the new name first and fall back to the old one.
    encoder_options = dict(categories='auto', handle_unknown='ignore')
    try:
        encoder = OneHotEncoder(sparse_output=False, **encoder_options)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **encoder_options)
    mct = make_column_transformer(
            (encoder, ['cp', 'slope', 'thal']),
            remainder=MinMaxScaler())
    X_train = mct.fit_transform(X_train)
    X_test = mct.transform(X_test)
    return X_train, X_test, y_train, y_test

In [None]:
def regression(dataframe, rand):
    """Fit a logistic regression on a seeded train/test split of ``dataframe``.

    The split, one-hot encoding and scaling are delegated to ``process`` so the
    preprocessing is defined in a single place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data accepted by ``process`` (needs a 'target' column).
    rand : int
        Seed for the train/test split.

    Returns
    -------
    logreg, X_test, y_test
        The fitted ``LogisticRegression`` plus the transformed test features and
        test labels, ready for ``logreg.score`` / ``logreg.predict``.
    """
    X_train, X_test, y_train, y_test = process(dataframe, rand)
    logreg = LogisticRegression(solver='liblinear')
    logreg.fit(X_train, y_train)
    return logreg, X_test, y_test

The regression score varies with the random state chosen for splitting the data.

In [None]:
# Test accuracy of the logistic regression for 200 different train/test split seeds.
scores = []
for seed in range(200):
    logreg, X_test, y_test = regression(heart, seed)
    accuracy = logreg.score(X_test, y_test)
    scores.append(accuracy)

In [None]:
plt.figure(figsize=(15,5))
plt.plot(scores)
plt.xlabel('random state')
plt.ylabel('regression score')

In [None]:
np.array(scores).mean()

In [None]:
score_mean['Logistic Regression'] = np.round(np.array(scores).mean(), 2)

Average score for random states between 0 and 200 is 0.84

In [None]:
logreg, X_test, y_test = regression(heart, 153)

In [None]:
logreg.score(X_test, y_test)

In [None]:
score_max['Logistic Regression'] = np.round(logreg.score(X_test, y_test), 2)

In [None]:
predictions = logreg.predict(X_test)

In [None]:
confusion_matrix(y_test, predictions)

In [None]:
print(classification_report(y_test, predictions))

In [None]:
prob = logreg.predict_proba(X_test)

In [None]:
# AUC must be computed from the positive-class probabilities (the same scores
# given to roc_curve), not from the thresholded 0/1 predictions; otherwise the
# area shown in the legend does not describe the plotted curve.
roc_score = roc_auc_score(y_test, prob[:, 1])

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1])

In [None]:
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % roc_score)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

Heat map of probabilities for each case.

In [None]:
sns.heatmap(prob[np.argsort(prob[:, 0])])

In [None]:
plt.figure(figsize=(15,5))
plt.plot(prob[np.argsort(prob[:, 0])])
plt.xlabel('test case number')
plt.ylabel('probability')
# predict_proba columns follow the sorted class labels (0, then 1), so the first
# line is P(target=0) = no disease and the second is P(target=1) = disease.
plt.legend(['no disease', 'disease'])

## Running the model separately for females and males

In [None]:
heart_f = heart[heart['sex']==0].drop(['sex'], axis=1)

In [None]:
logreg_f, X_test, y_test = regression(heart_f, 0)

In [None]:
logreg_f.score(X_test, y_test)

In [None]:
heart_m = heart[heart['sex']==1].drop(['sex'], axis=1)

In [None]:
logreg_m, X_test, y_test = regression(heart_m, 33)

In [None]:
logreg_m.score(X_test, y_test)

Regression model score is higher for females than males probably because correlation is stronger.

In [None]:
# Test accuracy of the female-only model for 200 different train/test split seeds.
scores = []
for seed in range(200):
    logreg, X_test, y_test = regression(heart_f, seed)
    accuracy = logreg.score(X_test, y_test)
    scores.append(accuracy)

In [None]:
np.array(scores).mean()

Average score for random states between 0 and 200 is 0.89 for females.

In [None]:
plt.figure(figsize=(15,5))
plt.plot(scores)
plt.xlabel('random state')
plt.ylabel('regression score')

In [None]:
# Test accuracy of the male-only model for 200 different train/test split seeds.
scores = []
for seed in range(200):
    logreg, X_test, y_test = regression(heart_m, seed)
    accuracy = logreg.score(X_test, y_test)
    scores.append(accuracy)

In [None]:
np.array(scores).mean()

Average score for random states between 0 and 200 is 0.79 for males.

In [None]:
plt.figure(figsize=(15,5))
plt.plot(scores)
plt.xlabel('random state')
plt.ylabel('regression score')

# K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# For each split seed, record the best test accuracy over k = 1..19 neighbours.
max_score = []
for seed in range(200):
    X_train, X_test, y_train, y_test = process(heart, seed)
    score_list = []
    # Distinct name for the neighbour count so it no longer shadows the split seed.
    for k in range(1, 20):
        knn = KNeighborsClassifier(n_neighbors = k)
        knn.fit(X_train, y_train)
        score_list.append(knn.score(X_test, y_test))
    # NOTE(review): k is chosen on the test split itself, so this maximum is an
    # optimistic estimate compared with the other classifiers' plain test scores.
    max_score.append(max(score_list))

In [None]:
plt.figure(figsize=(15,5))
plt.plot(max_score)

In [None]:
np.array(max_score).mean()

In [None]:
score_mean['K Nearest Neighbors'] = np.round(np.array(max_score).mean(), 2)

Average score for random states between 0 and 200 is 0.85

In [None]:
X_train, X_test, y_train, y_test = process(heart, 153)

In [None]:
# Test accuracy on the current split for every neighbour count from 1 to 19.
score_list = [
    KNeighborsClassifier(n_neighbors = k).fit(X_train, y_train).score(X_test, y_test)
    for k in range(1, 20)
]

In [None]:
max(score_list)

In [None]:
plt.plot(range(1,20), score_list)
plt.xticks(range(1,20))
plt.xlabel("K value")
plt.ylabel("Score")
plt.show()

Highest score is achieved with neighbors = 12

In [None]:
knn = KNeighborsClassifier(n_neighbors = 12)

In [None]:
knn.fit(X_train, y_train)

In [None]:
predictions = knn.predict(X_test)

In [None]:
knn.score(X_test, y_test)

In [None]:
score_max['K Nearest Neighbors'] = np.round(knn.score(X_test, y_test), 2)

# Support Vector Machines

In [None]:
score_mean

In [None]:
from sklearn.svm import SVC

In [None]:
# Test accuracy of an SVM for 200 different train/test split seeds.
score_list = []
for i in range(200):
    X_train, X_test, y_train, y_test = process(heart, i)
    # C is passed by keyword: SVC's parameters are keyword-only in current
    # scikit-learn, so SVC(1, ...) raises a TypeError there.
    svm = SVC(C=1, gamma='scale')
    svm.fit(X_train, y_train)
    score_list.append(svm.score(X_test, y_test))

In [None]:
plt.figure(figsize=(15,5))
plt.plot(score_list)

In [None]:
np.array(score_list).mean()

In [None]:
score_mean['Support Vector Machines'] = np.round(np.array(score_list).mean(), 2)

Average score for random states between 0 and 200 is 0.83

In [None]:
X_train, X_test, y_train, y_test = process(heart, 153)

In [None]:
# C is passed by keyword: SVC's parameters are keyword-only in current scikit-learn.
svm = SVC(C=1, gamma='scale')

In [None]:
svm.fit(X_train, y_train)

In [None]:
svm.score(X_test, y_test)

In [None]:
score_max['Support Vector Machines'] = np.round(svm.score(X_test, y_test), 2)

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Test accuracy of a decision tree for 200 different train/test split seeds.
score_list = []
for i in range(200):
    X_train, X_test, y_train, y_test = process(heart, i)
    # Seed the tree as well: tie-breaking between equally good splits is random,
    # so an unseeded tree gives different scores on every run of the notebook.
    dtc = DecisionTreeClassifier(random_state=i)
    dtc.fit(X_train, y_train)
    score_list.append(dtc.score(X_test, y_test))

In [None]:
plt.figure(figsize=(15,5))
plt.plot(score_list)

In [None]:
np.array(score_list).mean()

In [None]:
score_mean['Decision Tree Classifier'] = np.round(np.array(score_list).mean(), 2)

Average score for random states between 0 and 200 is 0.74

In [None]:
X_train, X_test, y_train, y_test = process(heart, 5)

In [None]:
# Fixed seed (same value as the split above) so the reported score is reproducible.
dtc = DecisionTreeClassifier(random_state=5)

In [None]:
dtc.fit(X_train, y_train)

In [None]:
dtc.score(X_test, y_test)

In [None]:
score_max['Decision Tree Classifier'] = np.round(dtc.score(X_test, y_test), 2)

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Test accuracy of a random forest for 200 different train/test split seeds.
score_list = []
for i in range(200):
    X_train, X_test, y_train, y_test = process(heart, i)
    # Seed the forest too; bootstrapping and feature sampling are random, so an
    # unseeded forest makes the averaged score change between notebook runs.
    rfc = RandomForestClassifier(n_estimators=100, random_state=i)
    rfc.fit(X_train, y_train)
    score_list.append(rfc.score(X_test, y_test))

In [None]:
plt.figure(figsize=(15,5))
plt.plot(score_list)

In [None]:
np.array(score_list).mean()

In [None]:
score_mean['Random Forest Classifier'] = np.round(np.array(score_list).mean(), 2)

Average score for random states between 0 and 200 is 0.82

In [None]:
X_train, X_test, y_train, y_test = process(heart, 153)

In [None]:
# Fixed seed (same value as the split above) so the reported score is reproducible.
rfc = RandomForestClassifier(n_estimators=100, random_state=153)

In [None]:
rfc.fit(X_train, y_train)

In [None]:
rfc.score(X_test, y_test)

In [None]:
score_max['Random Forest Classifier'] = np.round(rfc.score(X_test, y_test), 2)

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Test accuracy of Gaussian Naive Bayes for 200 different train/test split seeds.
score_list = []
for seed in range(200):
    X_train, X_test, y_train, y_test = process(heart, seed)
    gnb = GaussianNB().fit(X_train, y_train)
    accuracy = gnb.score(X_test, y_test)
    score_list.append(accuracy)

In [None]:
plt.figure(figsize=(15,5))
plt.plot(score_list)

In [None]:
np.array(score_list).mean()

In [None]:
score_mean['Naive Bayes'] = np.round(np.array(score_list).mean(), 2)

Average score for random states between 0 and 200 is 0.79

In [None]:
X_train, X_test, y_train, y_test = process(heart, 153)

In [None]:
gnb = GaussianNB()

In [None]:
gnb.fit(X_train, y_train)

In [None]:
gnb.score(X_test, y_test)

In [None]:
score_max['Naive Bayes'] = np.round(gnb.score(X_test, y_test), 2)

# Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Silence only the ConvergenceWarning that MLPClassifier emits when it reaches
# max_iter (200) before converging; every other warning stays visible.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [None]:
# Test accuracy of a one-hidden-layer MLP for 200 different train/test split seeds.
score_list = []
for i in range(200):
    X_train, X_test, y_train, y_test = process(heart, i)
    # Seed the network too: weight initialisation and batch shuffling are random,
    # so an unseeded MLP makes the averaged score change between notebook runs.
    mlp = MLPClassifier(10, max_iter=200, random_state=i)
    mlp.fit(X_train, y_train)
    score_list.append(mlp.score(X_test, y_test))

In [None]:
plt.figure(figsize=(15,5))
plt.plot(score_list)

In [None]:
np.array(score_list).mean()

In [None]:
score_mean['Neural Network'] = np.round(np.array(score_list).mean(), 2)

Average score for random states between 0 and 200 is 0.83

In [None]:
X_train, X_test, y_train, y_test = process(heart, 153)

In [None]:
# Fixed seed (same value as the split above) so the reported score is reproducible.
mlp = MLPClassifier(10, max_iter=200, random_state=153)

In [None]:
mlp.fit(X_train, y_train)

In [None]:
mlp.score(X_test, y_test)

In [None]:
score_max['Neural Network'] = np.round(mlp.score(X_test, y_test), 2)

# Comparison of Classifiers

In [None]:
score_max

In [None]:
score_mean

In [None]:
plt.figure(figsize=(10,5))
# (scores, line format, legend label) for each summary dictionary.
summaries = [
    (score_mean, 'b-o', 'mean score'),
    (score_max, 'r-*', 'max score'),
]
for results, fmt, legend_label in summaries:
    plt.plot(list(results.keys()), list(results.values()), fmt, label = legend_label)
# Write each value just above its marker.
for results, fmt, legend_label in summaries:
    for pos, val in enumerate(results.values()):
        plt.text(pos, val+.01, val)
plt.xticks(rotation=45)
plt.ylim(0.7, 1)
plt.legend()