In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# Any results you write to the current directory are saved as output.

age: age in years 

sex: (1 = male; 0 = female) 

cp: chest pain type 

trestbps: resting blood pressure (in mm Hg on admission to the hospital) 

chol: serum cholesterol in mg/dl 

fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 

restecg: resting electrocardiographic results 

thalach: maximum heart rate achieved 

exang: exercise induced angina (1 = yes; 0 = no) 

oldpeak: ST depression induced by exercise relative to rest 

slope: the slope of the peak exercise ST segment 

ca: number of major vessels (0-3) colored by fluoroscopy 

thal: 3 = normal; 6 = fixed defect; 7 = reversible defect 

target: heart disease diagnosis, as treated in this notebook (1 = heart disease; 0 = no heart disease)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib as mpl
# Raise the default figure resolution; this applies to every figure created
# after this point in the notebook.
mpl.rcParams['figure.dpi'] = 400
import seaborn as sns #plotting package
import graphviz #to visualize decision trees
from sklearn import datasets
# Load the heart-disease CSV into the DataFrame used by all later cells.
df = pd.read_csv("../input/heart-disease-uci/heart.csv")

In [None]:
# Dimensions of the loaded data as (rows, columns).
df.shape

In [None]:
# Replace the terse CSV headers with descriptive names. The assignment is
# positional, so the order must match the column order of the file; the
# trailing comments give the short names listed in the data dictionary above.
df.columns = [
    'age',                            # age
    'sex',                            # sex
    'chest_pain',                     # cp
    'resting_blood_pressure',         # trestbps
    'cholestoral',                    # chol
    'fasting_blood_sugar',            # fbs
    'resting_ECG',                    # restecg
    'max_heart_rate',                 # thalach
    'angina_from_exercise',           # exang
    'st_depression',                  # oldpeak
    'st_slope',                       # slope
    'major_vessels_with_flourosopy',  # ca
    'thalassemia',                    # thal
    'target',                         # target
]


In [None]:
# Preview the first five rows of the DataFrame.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Draw one figure per column, each showing that column's distribution.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm
# the installed version before relying on it.
for figure_number, column_name in enumerate(df.columns):
    plt.figure(figure_number)
    sns.distplot(df[column_name])

In [None]:
# All column names in file order; the last entry is the response ('target').
features_response = list(df.columns)
features_response

In [None]:
# Pairwise correlation matrix over every column named in features_response.
corr = df.loc[:, features_response].corr()

In [None]:
# Heatmap of the correlation matrix; center=0 puts the colormap midpoint at zero
# correlation so positive and negative values get opposite hues.
sns.heatmap(
    corr,
    center=0,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
)

In [None]:
# Every column except the last is a feature; the last column is the response.
X = df[features_response[:-1]].values
y = df[features_response[-1]].values
print(X.shape, y.shape)

In [None]:
from sklearn.feature_selection import f_classif

# ANOVA F-test of each feature column in X against the response y.
f_stat, f_p_value = f_classif(X, y)
f_test_df = pd.DataFrame(
    {
        'Feature': features_response[:-1],
        'F statistic': f_stat,
        'p value': f_p_value,
    }
)
# Display with the smallest p value first.
f_test_df.sort_values('p value')

In [None]:
#Determining the best features from statistics using 80th percentile and above
from sklearn.feature_selection import SelectPercentile

# Keep the top 20% of features ranked by their F-test score.
selector = SelectPercentile(f_classif, percentile=20)
selector.fit(X, y)

# Boolean mask with one entry per feature column: True where the feature was kept.
best_feature_ix = selector.get_support()
features = features_response[:-1]
best_features = [
    feature_name
    for feature_name, is_selected in zip(features, best_feature_ix)
    if is_selected
]
best_features

In [None]:
# Split the frame into a label array and a feature array, remembering the
# feature column names (in column order) for the importance plots.
labels = np.array(df['target'])
features = df.drop(columns='target')
feature_list = features.columns.tolist()
features = np.array(features)

In [None]:
# Use logistic regression on the single 'angina_from_exercise' feature to
# predict whether a patient has heart disease.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Pass the hyperparameters to the constructor rather than mutating attributes
# afterwards, so the estimator's configuration is declared in one place.
my_lr = LogisticRegression(C=0.1, solver='liblinear')  # Binary classifier

# Hold out 20% of the rows for testing. reshape(-1, 1) turns the single
# feature into the 2-D (n_samples, 1) array that scikit-learn expects.
X_train, X_test, y_train, y_test = train_test_split(
    df['angina_from_exercise'].values.reshape(-1, 1),
    df['target'].values,
    test_size=0.2,
    random_state=24,
)

# These are the means of the true labels (the positive-class fraction) in each
# split, not of model predictions; no model has been fitted yet.
print('The mean of the training labels is {}, \nThe mean of the testing labels is {}.'.format(np.mean(y_train), np.mean(y_test)))

In [None]:
from sklearn import metrics

# Fit on the training split, predict the held-out split, and show the
# fraction of test rows classified correctly.
my_lr.fit(X_train, y_train)
y_pred = my_lr.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of the logistic regression test predictions
# (rows: true class, columns: predicted class).
metrics.confusion_matrix(y_test, y_pred)

In [None]:
# Predicted class probabilities; column 1 is the positive class.
y_pred_proba = my_lr.predict_proba(X_test)
pos_proba = y_pred_proba[:, 1]

# ROC curve of the model, with the diagonal of a random classifier for reference.
fpr, tpr, thresholds = metrics.roc_curve(y_test, pos_proba)
plt.plot(fpr, tpr, '*-', label='Logistic regression')
plt.plot([0, 1], [0, 1], 'r--', label='Random chance')
plt.legend()
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')

In [None]:
# Area under the ROC curve, computed from the positive-class probabilities.
metrics.roc_auc_score(y_test, pos_proba)

**Creating the decision tree**

In [None]:
#Using a decision tree classifier to decide variable importance
from sklearn.model_selection import train_test_split
from sklearn import tree

# All feature columns (everything but the trailing response) against 'target',
# with 20% of the rows held out and a fixed seed for a repeatable split.
XA_train, XA_test, ya_train, ya_test = train_test_split(
    df[features_response[:-1]].values,
    df['target'].values,
    test_size=0.2,
    random_state=24,
)

In [None]:
# Fit a shallow decision tree on the training split and render it with graphviz.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split even with the default "best" splitter, so ties between equally
# good splits would otherwise be broken differently on each run and the tree
# (and every metric derived from it) would not be reproducible.
dt = tree.DecisionTreeClassifier(max_depth=4, random_state=24)
dt.fit(XA_train, ya_train)

# proportion=True shows class proportions/percentages in each node instead of
# raw sample counts; class_names are given in ascending class order (0, then 1).
dot_data = tree.export_graphviz(
    dt,
    out_file=None,
    filled=True,
    rounded=True,
    feature_names=features_response[:-1],
    proportion=True,
    class_names=['Not Heart Disease', 'Heart Disease'],
)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Hard class predictions of the decision tree on the held-out rows, and their
# confusion matrix (rows: true class, columns: predicted class).
ya_pred = dt.predict(XA_test)
confusionmatrix = metrics.confusion_matrix(y_true=ya_test, y_pred=ya_pred)
confusionmatrix

In [None]:
# Plot the ROC curve of the decision tree.
# The curve is built from predicted positive-class probabilities rather than
# the hard 0/1 predictions: with hard labels roc_curve only has one usable
# threshold, so the "curve" collapses to a single interior point.
ya_pos_proba = dt.predict_proba(XA_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(ya_test, ya_pos_proba)
plt.plot(fpr, tpr, '*-')
plt.plot([0, 1], [0, 1], 'r--')
# Label the model that is actually plotted here (the decision tree).
plt.legend(['Decision tree', 'Random chance'])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')

# Accuracy is still computed from the hard class predictions.
acc = metrics.accuracy_score(ya_test, ya_pred)
print('The Decision Trees accuracy is {}'.format(acc))

In [None]:
# Sensitivity and specificity of the decision tree on the test split.
# Every comparison is parenthesised: `&` binds tighter than `==`, so an
# unparenthesised `(a == 1) & b == 1` is evaluated as `((a == 1) & b) == 1`,
# which only gives the intended count by accident when b holds 0/1 values.
P = sum(ya_test == 1)                          # actual positives
TP = sum((ya_test == 1) & (ya_pred == 1))      # true positives
FN = sum((ya_test == 1) & (ya_pred == 0))      # false negatives
N = sum(ya_test == 0)                          # actual negatives
TN = sum((ya_test == 0) & (ya_pred == 0))      # true negatives
FP = sum((ya_test == 0) & (ya_pred == 1))      # false positives
SE = TP / (TP + FN)  # sensitivity: share of actual positives that were detected
SP = TN / (TN + FP)  # specificity: share of actual negatives that were rejected
print('The sensitivity is {} and the specificity is {}'.format(SE, SP))

In [None]:
from sklearn.tree import export_graphviz

# Pair each feature name with its importance from the fitted decision tree
# (rounded to two decimals) and order the pairs from most to least important.
var_importance = list(dt.feature_importances_)
feature_importances1 = sorted(
    [(name, round(score, 2)) for name, score in zip(feature_list, var_importance)],
    key=lambda name_score: name_score[1],
    reverse=True,
)

# Print out the feature and importances
for pair in feature_importances1:
    print('Variable: {:20} Importance: {}'.format(*pair))

In [None]:
#Plotting variable importance
# Note: the style change is global and affects every figure drawn afterwards.
plt.style.use('fivethirtyeight')

# One bar per feature, positioned at consecutive integer x locations.
x_values = list(range(len(var_importance)))
plt.bar(x_values, var_importance, orientation='vertical')

# Label each bar with its feature name, rotated so long names do not overlap.
plt.xticks(x_values, feature_list, rotation='vertical')

# Axis labels and title
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances');


Using a random forest classifier (an ensemble of decision trees) in place of the single decision tree

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [None]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
# n_jobs=2 requests two parallel jobs; random_state=0 fixes the seed so the
# forest is reproducible.
clf = RandomForestClassifier(n_jobs=2, random_state=0)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the heart-disease target)
clf.fit(XA_train, ya_train)

In [None]:
# Predict the held-out rows with the random forest and report its accuracy.
rf_pred = clf.predict(XA_test)

acc2 = metrics.accuracy_score(ya_test, rf_pred)
# The message names the model that was actually evaluated (the random forest,
# not the single decision tree fitted earlier).
print('The Random Forest accuracy is {}'.format(acc2))

In [None]:
# Stats about the trees in random forest: node count and depth of every
# individual estimator, averaged (and truncated to int) for display.
n_nodes = [estimator.tree_.node_count for estimator in clf.estimators_]
max_depths = [estimator.tree_.max_depth for estimator in clf.estimators_]

print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

In [None]:
from sklearn.tree import export_graphviz

# Pair each feature name with its importance from the fitted random forest
# (rounded to two decimals) and order the pairs from most to least important.
var_importance1 = list(clf.feature_importances_)
feature_importances2 = sorted(
    [(name, round(score, 2)) for name, score in zip(feature_list, var_importance1)],
    key=lambda name_score: name_score[1],
    reverse=True,
)

# Print out the feature and importances
for pair in feature_importances2:
    print('Variable: {:20} Importance: {}'.format(*pair))

In [None]:
#Plotting variable importance
# Note: the style change is global and affects every figure drawn afterwards.
plt.style.use('fivethirtyeight')

# One bar per feature, positioned at consecutive integer x locations.
x_values = list(range(len(var_importance1)))
plt.bar(x_values, var_importance1, orientation='vertical')

# Label each bar with its feature name, rotated so long names do not overlap.
plt.xticks(x_values, feature_list, rotation='vertical')

# Axis labels and title
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances');


In [None]:
# Columns to leave out of the reduced model. 'target' is included here, so the
# resulting list contains feature names only (no response column).
items_to_remove = ['age', 'cholestoral', 'st_slope','max_heart_rate','sex', 'resting_blood_pressure', 'fasting_blood_sugar', 'resting_ECG','angina_from_exercise','target']

# Rebuild features_response from the frame's columns, keeping file order.
# NOTE(review): this rebinds the name used by earlier cells; re-running those
# cells after this one will see the reduced list.
features_response = [
    column_name
    for column_name in df.columns.tolist()
    if column_name not in items_to_remove
]
features_response


In [None]:
# Train/test split restricted to the reduced feature set.
# The cell above already removes 'target' from features_response, so slicing
# with [:-1] here would silently drop the last real feature ('thalassemia').
# Filtering by name instead keeps every selected feature, and stays correct
# even if features_response still contains 'target'.
selected_features = [name for name in features_response if name != 'target']

newXA_train, newXA_test, newya_train, newya_test = train_test_split(
    df[selected_features].values,
    df['target'].values,
    test_size=0.2,
    random_state=24,
)