In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns #for plotting
import warnings
from sklearn.cluster import KMeans
import warnings
from sklearn.ensemble import RandomForestClassifier #for the model
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz #plot tree
from sklearn.metrics import roc_curve, auc #for model evaluation
from sklearn.metrics import classification_report #for model evaluation
from sklearn.metrics import confusion_matrix #for model evaluation
from sklearn.model_selection import train_test_split #for data splitting
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import svm
import statsmodels.api as sm
import eli5 #for purmutation importance
from eli5.sklearn import PermutationImportance
import shap #for SHAP values
from pdpbox import pdp, info_plots #for partial plots
np.random.seed(123) #ensure reproducibility
pd.options.mode.chained_assignment = None  #hide any pandas warnings
import scipy.stats as stats
import pylab
%matplotlib inline
pd.pandas.set_option('display.max_columns',None)
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)


  import pandas.util.testing as tm


ModuleNotFoundError: ignored

In [None]:
dt = pd.read_csv("heart.csv")

In [None]:
dt.head(10)

In [None]:
dt.tail()

In [None]:
dt.columns

In [None]:
dt.columns = ['age', 'gender', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
       'exercise_induced_angina', 'st_depression', 'st_slope', 'num_major_vessels', 'thalassemia', 'target']

In [None]:
# Recode the integer-coded categorical columns into readable labels.
#
# Each column is rebuilt in one step and assigned back to `dt`. The previous
# form, dt[col][mask] = value, is chained indexing: it is not guaranteed to
# write through to `dt` and does nothing under pandas copy-on-write.
# Codes that are missing from a mapping are left unchanged.
category_labels = {
    'gender': {0: 'female', 1: 'male'},
    'chest_pain_type': {
        1: 'typical angina',
        2: 'atypical angina',
        3: 'non-anginal pain',
        4: 'asymptomatic',
    },
    'fasting_blood_sugar': {
        0: 'lower than 120mg/ml',
        1: 'greater than 120mg/ml',
    },
    'rest_ecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',
    },
    'exercise_induced_angina': {0: 'no', 1: 'yes'},
    'st_slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    'thalassemia': {1: 'normal', 2: 'fixed defect', 3: 'reversable defect'},
}

for column, labels in category_labels.items():
    # labels.get(code, code) keeps any value that has no label as it is.
    dt[column] = dt[column].map(lambda code: labels.get(code, code))

In [None]:
dt.dtypes

In [None]:
dt.info()

# DATA CLEANSING

In [None]:
dt.isnull().sum()

### Dropping null values because there are very few of them compared to the size of the dataset

In [None]:
dt=dt.dropna()

# DATA INSIGHTS

In [None]:
dt.age.unique()

In [None]:
dt.age.value_counts()

In [None]:
dt.gender.unique()

In [None]:
dt.gender.value_counts()

In [None]:
dt.chest_pain_type.unique()

In [None]:
dt.chest_pain_type.value_counts()

In [None]:
dt.resting_blood_pressure.unique()

In [None]:
dt.resting_blood_pressure.value_counts()

In [None]:
dt.cholesterol.unique()

In [None]:
dt.cholesterol.value_counts()

In [None]:
dt.fasting_blood_sugar.unique()

In [None]:
dt.fasting_blood_sugar.value_counts()

In [None]:
dt.rest_ecg.unique()

In [None]:
dt.rest_ecg.value_counts()

In [None]:
dt.max_heart_rate_achieved.value_counts()

In [None]:
dt.max_heart_rate_achieved.unique()

In [None]:
dt.exercise_induced_angina.unique()

In [None]:
dt.exercise_induced_angina.value_counts()

In [None]:
dt.st_depression.value_counts()

In [None]:
dt.st_depression.value_counts()

In [None]:
dt.st_slope.unique()

In [None]:
dt.st_slope.value_counts()

In [None]:
dt.num_major_vessels.value_counts()

In [None]:
dt.num_major_vessels.unique()

In [None]:
dt.thalassemia.unique()

In [None]:
dt.thalassemia.value_counts()

In [None]:
from dataprep.datasets import load_dataset
from dataprep.eda import create_report
#df = load_dataset("titanic")
create_report(dt)

# CHECK FOR OUTLIERS

In [None]:
plt.boxplot(dt.age)

In [None]:
plt.boxplot(dt.resting_blood_pressure)

In [None]:
plt.boxplot(dt.cholesterol)

In [None]:
plt.boxplot(dt.max_heart_rate_achieved)

In [None]:
plt.boxplot(dt.num_major_vessels)

# STATISTICAL EDA

In [None]:
# The recoded categorical columns hold strings, so average numeric columns only.
dt.mean(numeric_only=True)

In [None]:
# The recoded categorical columns hold strings, so take the median of numeric columns only.
dt.median(numeric_only=True)

In [None]:
dt.mode()

In [None]:
dt.describe()

In [None]:
# The recoded categorical columns hold strings, so compute skewness for numeric columns only.
dt.skew(numeric_only=True)

In [None]:
# Correlation is only defined for numeric columns; the recoded categorical
# columns hold strings, so select the numeric ones explicitly.
dt.select_dtypes(include='number').corr()

In [None]:
# Set the default figure size BEFORE drawing: rcParams only affect figures
# created after the change, so setting it afterwards left this heatmap at the
# previous size.
plt.rcParams['figure.figsize'] = [10,10]
# Correlate numeric columns only; the recoded categorical columns hold strings.
dataplot = sns.heatmap(dt.select_dtypes(include='number').corr(),cmap = "PiYG",annot = True)

## Heart Diseases Ratio in Dataset
The blue bar indicates no heart disease and the orange bar indicates heart disease.

In [None]:
def plotTarget():
    """Draw a count plot of `target` and label each bar with its share of rows.

    Reads the global DataFrame `dt` and draws on the global Axes `ax`.
    Each label is the bar's count divided by the number of rows in `dt`,
    formatted with two decimals and placed just above the bar.
    """
    sns.countplot(x='target', data=dt, ax=ax)
    total_rows = float(dt.shape[0])
    for p in ax.patches:
        # Use the bar's own height as its count. value_counts() is ordered by
        # frequency, not by bar position, so indexing it with the patch number
        # can attach the other class's proportion to a bar.
        count = p.get_height()
        x = p.get_x() + p.get_width() / 2.
        y = count + 3
        label = '{:1.2f}'.format(count / total_rows)
        ax.text(x, y, label, ha='center')
        
fig_target,ax=plt.subplots(nrows=1, ncols=1, figsize=(5, 2))
plotTarget()

## Age selected as the feature most dependent on the label
Disease probability bar plot

In [None]:
def plotAge():
    """Plot the age distribution per class and the disease rate per age.

    Reads the global DataFrame `dt` and draws on the global `axes` pair:
    axes[0] gets one shaded KDE of `age` per `target` value, and axes[1] gets
    a bar plot of the mean of `target` for each age.
    """
    density_ax, probability_ax = axes[0], axes[1]
    legend_labels = {0: 'disease false', 1: 'disease true'}

    # Draw one KDE per class straight onto the target axes. A FacetGrid is not
    # needed for this and would open an extra empty figure that then has to be
    # cleared with plt.clf().
    for target_value, group in dt.groupby('target'):
        sns.kdeplot(group['age'], shade=True, ax=density_ax,
                    label=legend_labels.get(target_value, str(target_value)))
    density_ax.set(xlabel='age', ylabel='density')
    density_ax.legend()

    avg = dt[["age", "target"]].groupby(['age'], as_index=False).mean()
    sns.barplot(x='age', y='target', data=avg, ax=probability_ax)
    probability_ax.set(xlabel='age', ylabel='disease probability')

In [None]:
fig_age, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 8))

plotAge()

## Plotting Function for Categorical Data ("Chest Pain" and "Thalassemia")
Plotting function for continuous data

In [None]:
# (column name, legend labels) for every categorical feature.
category = [
    ('gender', ['female', 'male']),
    ('chest_pain_type', ['typical angina', 'atypical angina', 'non-anginal pain', 'asymptomatic']),
    ('fasting_blood_sugar', ['lower than 120mg/ml', 'greater than 120mg/ml']),
    ('rest_ecg', ['normal', 'ST-T wave abnormality', 'left ventricular hypertrophy']),
    ('exercise_induced_angina', ['no', 'yes']),
    ('st_slope', ['upsloping', 'flat', 'downsloping']),
    ('thalassemia', ['fixed', 'normal', 'reversable']),
]
# (column name, x-axis label) for every continuous feature. These must be real
# 2-tuples: ('name') is just a string, so x[0] and x[1] in plotGrid would be
# its first two characters instead of a column name and a label.
continuous = [
    ('resting_blood_pressure', 'resting blood pressure'),
    ('cholesterol', 'cholesterol'),
    ('max_heart_rate_achieved', 'max heart rate achieved'),
    ('st_depression', 'st depression'),
    ('num_major_vessels', 'number of major vessels'),
]

        
def plotCategorial(attribute, labels, ax_index):
    """Plot one categorical column on row `ax_index` of the global `axes` grid.

    Panels, left to right:
      0. count of rows per category of `attribute`;
      1. count of rows per `target`, split by `attribute`;
      2. mean of `target` per category of `attribute`.

    Parameters
    ----------
    attribute : column name in the global DataFrame `dt`.
    labels : legend texts, applied in order to the legends of panels 1 and 2.
    ax_index : row of the global `axes` grid to draw on.
    """
    row = axes[ax_index]
    sns.countplot(x=attribute, data=dt, ax=row[0])
    sns.countplot(x='target', hue=attribute, data=dt, ax=row[1])
    avg = dt[[attribute, 'target']].groupby([attribute], as_index=False).mean()
    sns.barplot(x=attribute, y='target', hue=attribute, data=avg, ax=row[2])

    for panel in (row[1], row[2]):
        legend = panel.get_legend()
        # get_legend() returns None when the plot drew no legend; skip the
        # relabelling then instead of failing with AttributeError.
        if legend is None:
            continue
        # NOTE(review): texts are replaced by position, so `labels` must be in
        # the same order as the hue levels seaborn drew - confirm per column.
        for t, l in zip(legend.texts, labels):
            t.set_text(l)


def plotContinuous(attribute, xlabel, ax_index):
    """Plot one continuous column on row `ax_index` of the global `axes` grid.

    The first panel shows the distribution of dt[attribute] with its x-axis
    labelled `xlabel`; the second panel shows a violin plot of the column
    split by `target`.
    """
    distribution_ax = axes[ax_index][0]
    sns.distplot(dt[[attribute]], ax=distribution_ax)
    distribution_ax.set(xlabel=xlabel, ylabel='density')

    violin_ax = axes[ax_index][1]
    sns.violinplot(x='target', y=attribute, data=dt, ax=violin_ax)
    
    
def plotGrid(isCategorial):
    """Fill the global `axes` grid, one row per feature.

    When `isCategorial` is truthy, every entry of `category` is drawn with
    plotCategorial; otherwise every entry of `continuous` is drawn with
    plotContinuous. Each entry supplies the first two arguments, and its
    position in the list is the row index.
    """
    if isCategorial:
        plotter, specs = plotCategorial, category
    else:
        plotter, specs = plotContinuous, continuous

    for row_index, spec in enumerate(specs):
        plotter(spec[0], spec[1], row_index)

In [None]:
fig_categorial,axes=plt.subplots(nrows=len(category), ncols=3, figsize=(14, 18))
plotGrid(isCategorial=True)

In [None]:
sns.kdeplot(dt.age)

In [None]:
stats.probplot(dt.age,plot=pylab)

In [None]:
sns.kdeplot(dt.max_heart_rate_achieved)

In [None]:
stats.probplot(dt.max_heart_rate_achieved,plot=pylab)

In [None]:
def normality(data,feature):
    """Show a KDE plot and a probability (Q-Q) plot of data[feature] side by side.

    Parameters
    ----------
    data : DataFrame holding the column.
    feature : name of the column to inspect.
    """
    # pyplot is imported as `plt`; the previous `mlp` alias was never defined,
    # so every call raised NameError.
    plt.figure(figsize=(10,5))
    plt.subplot(1,2,1)
    sns.kdeplot(data[feature])
    plt.subplot(1,2,2)
    stats.probplot(data[feature],plot=pylab)
    plt.show()

In [None]:
normality(dt,'age')

In [None]:
normality(dt,'resting_blood_pressure')

In [None]:
normality(dt,'cholesterol')

In [None]:
normality(dt,'resting_blood_pressure')

In [None]:
normality(dt,'max_heart_rate_achieved')

In [None]:
normality(dt,'st_depression')

In [None]:
normality(dt,'num_major_vessels')

In [None]:
# `size` was renamed to `height` in seaborn; the old keyword is no longer accepted.
sns.pairplot(dt,hue="target",height = 3)

# TRANSFORMATIONS(log, sqrt, reciprocal, boxcox)

In [None]:
dt1 = dt.copy(deep = True)

In [None]:
dt1['age_log'] = np.log(dt1['age'])
normality(dt1,'age_log')

In [None]:
dt1['age_rcp'] = 1/dt1.age
normality(dt1,'age_rcp')

In [None]:
dt1['age_sqrt'] = np.sqrt(dt1.age)
normality(dt1,'age_sqrt')

In [None]:
dt1['age_bcx'],parameters = stats.boxcox(dt1['age'])
normality(dt1,'age_bcx')

In [None]:
dt1['resting_blood_pressure_log'] = np.log(dt1['resting_blood_pressure'])
normality(dt1,'resting_blood_pressure_log')

In [None]:
dt1['resting_blood_pressure_rcp'] = 1/dt1.resting_blood_pressure
normality(dt1,'resting_blood_pressure_rcp')

In [None]:
dt1['resting_blood_pressure_sqrt'] = np.sqrt(dt1.resting_blood_pressure)
normality(dt1,'resting_blood_pressure_sqrt')

In [None]:
dt1['resting_blood_pressure_exp'] = dt1.resting_blood_pressure**(1/1.2)
normality(dt1,'resting_blood_pressure_exp')

In [None]:
dt1['resting_blood_pressure_box'],parameters = stats.boxcox(dt1['resting_blood_pressure'])
normality(dt1,'resting_blood_pressure_box')

# FEATURE ENGINEERING

In [None]:
X = dt.iloc[:,:-1]
Y = dt['target']

In [None]:
X.head()

In [None]:
Y.head()

# ONE HOT ENCODER

In [None]:
x = pd.get_dummies(X, drop_first=True)

In [None]:
x.head()

In [None]:
x.shape

# FEATURE SELECTION

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
### Apply SelectKBest Algorithm
# Only the per-feature chi2 scores are read below, so keep every feature.
# A hard-coded k=19 depends on the exact number of one-hot columns and is
# rejected (or warned about) when the encoded frame has fewer columns.
ordered_rank_features=SelectKBest(score_func=chi2,k='all')
ordered_feature=ordered_rank_features.fit(x,Y)

In [None]:
dtscores=pd.DataFrame(ordered_feature.scores_,columns=["Score"])
dtcolumns=pd.DataFrame(x.columns)

In [None]:
features_rank=pd.concat([dtcolumns,dtscores],axis=1)

In [None]:
features_rank.columns=['Features','Score']
features_rank

In [None]:
features_rank.nlargest(10,'Score')

# Feature Importance




This technique gives you a score for each feature of your data; the higher the score, the more relevant the feature is.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
#import matplotlib.pyplot as plt
model=ExtraTreesClassifier()
model.fit(x,Y)

In [None]:
print(model.feature_importances_)

In [None]:
ranked_features=pd.Series(model.feature_importances_,index=x.columns)
ranked_features.nlargest(10).plot(kind='barh')
plt.show()

# Identify the highly correlated features

In [None]:
threshold=0.2

In [None]:
# find correlated features (columns whose absolute correlation with an earlier column exceeds the threshold)
def correlation(dataset, threshold):
    """Return the names of columns that correlate strongly with an earlier column.

    A column is included when the absolute value of its correlation with any
    column that comes before it in `dataset.corr()` is greater than
    `threshold`.

    Parameters
    ----------
    dataset : DataFrame whose pairwise column correlations are examined.
    threshold : cut-off compared against the absolute correlation coefficient.

    Returns
    -------
    set of column names.
    """
    corr_matrix = dataset.corr()
    # Only the strict lower triangle is kept, so row i is compared with the
    # columns j < i and never with itself.
    exceeds = np.abs(corr_matrix.values) > threshold
    flagged_rows = np.tril(exceeds, k=-1).any(axis=1)
    return set(corr_matrix.columns[flagged_rows])

In [None]:
correlation(x,threshold)

# INFORMATION GAIN

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
mutual_info=mutual_info_classif(x,Y)

In [None]:
mutual_data=pd.Series(mutual_info,index=x.columns)
mutual_data.sort_values(ascending=False)

In [None]:
# Recursive feature elimination with a logistic-regression estimator.
rfe = RFE(LogisticRegression())
rfe.fit(x.values, Y.values)

# Keep the columns that RFE marked as supported, in their original order.
selected_features = [
    feature
    for feature, keep in zip(x.columns.values, rfe.support_)
    if keep
]

selected_X = x[selected_features]
selected_y = Y

# Fit a statsmodels logit on the selected columns and print its summary.
lm = sm.Logit(selected_y, selected_X)
result = lm.fit()

print(result.summary2())

# DATA SPLITTING

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(x,Y, test_size = .2, random_state=10) #split the data

In [None]:
X_train

In [None]:
X_test

# LAZY PREDICT TO DECIDE BEST ALGORITHMS

In [None]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

In [None]:
# Defines and builds the lazyclassifier
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
# The split cell names the targets Y_train / Y_test (capital Y); the lower-case
# y_train / y_test used here before were never defined.
models_train,predictions_train = clf.fit(X_train, X_train, Y_train, Y_train)
models_test,predictions_test = clf.fit(X_train, X_test, Y_train, Y_test)

# Prints the model performance
models_train

# DECISION TREE CLASSIFIER

In [None]:
# Decision tree with the Gini criterion, limited in depth and leaf size.
dt_clf_gini = DecisionTreeClassifier(criterion = "gini",
                                     random_state = 100,
                                     max_depth = 5,
                                     min_samples_leaf = 5)

dt_clf_gini.fit(X_train, Y_train)
y_pred_gini = dt_clf_gini.predict(X_test)

# Score against Y_test, the name produced by the split cell (y_test was undefined).
print ("Decision Tree using Gini Index\nAccuracy is ",accuracy_score(Y_test, y_pred_gini)*100 )

In [None]:
# Random forest used by the evaluation and explanation cells below.
model = RandomForestClassifier(max_depth=5)
# Fit on Y_train, the name produced by the split cell (y_train was undefined).
model.fit(X_train, Y_train)

In [None]:
# Pick one tree of the forest and collect the feature names for plotting.
estimator = model.estimators_[1]
feature_names = [i for i in X_train.columns]

# Readable per-sample class labels, built from Y_train (the name produced by
# the split cell; y_train was undefined).
y_train_str = Y_train.astype('str')
y_train_str[y_train_str == '0'] = 'no disease'
y_train_str[y_train_str == '1'] = 'disease'
y_train_str = y_train_str.values

In [None]:
#code from https://towardsdatascience.com/how-to-visualize-a-decision-tree-from-a-random-forest-in-python-using-scikit-learn-38ad2d75f21c

# class_names must hold one name per class in ascending class order (0, 1).
# Passing the per-sample label array made the tree use the labels of the
# first two training rows as the class names.
export_graphviz(estimator, out_file='tree.dot', feature_names = feature_names,
                class_names = ['no disease', 'disease'],
                rounded = True, proportion = True,
                label='root',
                precision = 2, filled = True)

# check_call raises if Graphviz `dot` exits with an error, instead of going on
# to display a missing or stale tree.png.
from subprocess import check_call
check_call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=100'])

from IPython.display import Image
Image(filename = 'tree.png')

In [None]:
y_predict = model.predict(X_test)
y_pred_quant = model.predict_proba(X_test)[:, 1]
y_pred_bin = model.predict(X_test)

# LOGISTIC REGRESSION 

In [None]:
lr = LogisticRegression()
lr.fit(X_train,Y_train)

print(f"Accuracy: {lr.score(X_test,Y_test):0.3f}")

# NAIVE BAYES

# K-NN

# STOCHASTIC GRADIENT DESCENT

# SUPPORT VECTOR MACHINE

# PERFORMANCE ANALYSIS

In [None]:
# The result is stored under the name `confusion_matrix`, which replaces the
# imported function of the same name. Calling the function through its module
# keeps this cell re-runnable. Y_test is the name produced by the split cell
# (y_test was undefined).
from sklearn import metrics as sk_metrics
confusion_matrix = sk_metrics.confusion_matrix(Y_test, y_pred_bin)
confusion_matrix

In [None]:
total=sum(sum(confusion_matrix))

# scikit-learn layout: rows are true classes, columns are predicted classes.
# With labels (0, 1) and 1 as the positive class:
#   [0,0] = TN, [0,1] = FP, [1,0] = FN, [1,1] = TP
# Sensitivity = TP / (TP + FN). The previous formula was TN / (TN + FN).
sensitivity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[1,0])
print('Sensitivity : ', sensitivity )

# Specificity = TN / (TN + FP). The previous formula was TP / (TP + FP).
specificity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[0,1])
print('Specificity : ', specificity)

In [None]:
# ROC curve from the positive-class probabilities, scored against Y_test
# (the name produced by the split cell; y_test was undefined).
fpr, tpr, thresholds = roc_curve(Y_test, y_pred_quant)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# Dashed diagonal for reference.
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
plt.title('ROC curve for heart attack classifier')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)

In [None]:
auc(fpr, tpr)

In [None]:
perm = PermutationImportance(model, random_state=1).fit(X_test, Y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

In [None]:
base_features = x.columns.values.tolist()
#base_features.remove('target')

feat_name = 'num_major_vessels'
pdp_dist = pdp.pdp_isolate(model=model, dataset=X_test, model_features=base_features, feature=feat_name)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()

In [None]:
feat_name = 'age'
pdp_dist = pdp.pdp_isolate(model=model, dataset=X_test, model_features=base_features, feature=feat_name)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()

In [None]:
feat_name = 'st_depression'
pdp_dist = pdp.pdp_isolate(model=model, dataset=X_test, model_features=base_features, feature=feat_name)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()

In [None]:
inter1  =  pdp.pdp_interact(model=model, dataset=X_test, model_features=base_features, features=['st_slope_upsloping', 'st_depression'])

pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=['st_slope_upsloping', 'st_depression'], plot_type='contour')
plt.show()

inter1  =  pdp.pdp_interact(model=model, dataset=X_test, model_features=base_features, features=['st_slope_flat', 'st_depression'])

pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=['st_slope_flat', 'st_depression'], plot_type='contour')
plt.show()

In [None]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

shap.summary_plot(shap_values[1], X_test, plot_type="bar")

In [None]:
shap.summary_plot(shap_values[1], X_test)

In [None]:
def heart_disease_risk_factors(model, patient):
    """Return a SHAP force plot for one patient.

    Builds a TreeExplainer for `model`, computes SHAP values for `patient`,
    loads the SHAP JavaScript, and returns the force plot built from the
    index-1 entries of the expected value and of the SHAP values.

    Parameters
    ----------
    model : fitted tree-based model passed to shap.TreeExplainer.
    patient : feature values of the row to explain.
    """
    tree_explainer = shap.TreeExplainer(model)
    patient_shap = tree_explainer.shap_values(patient)
    shap.initjs()

    class_index = 1
    return shap.force_plot(
        tree_explainer.expected_value[class_index],
        patient_shap[class_index],
        patient,
    )

In [None]:
data_for_prediction = X_test.iloc[1,:].astype(float)
heart_disease_risk_factors(model, data_for_prediction)

In [None]:
data_for_prediction = X_test.iloc[3,:].astype(float)
heart_disease_risk_factors(model, data_for_prediction)

In [None]:
# dependence_plot is called without an axes argument here, so the leftover
# `fig.add_subplot(224)` only added an unused subplot to the earlier ROC
# figure; it has been removed.
shap.dependence_plot('num_major_vessels', shap_values[1], X_test, interaction_index="st_depression")

In [None]:
# Explain and display the SAME rows. The SHAP values were computed on
# X_train.iloc[:50] but plotted against the feature values of X_test.iloc[:50],
# so each contribution was shown next to another row's features.
# NOTE(review): X_test is used for both to match the SHAP cells above - confirm.
force_plot_rows = X_test.iloc[:50]
shap_values = explainer.shap_values(force_plot_rows)
shap.force_plot(explainer.expected_value[1], shap_values[1], force_plot_rows)