In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns #importing seaborn module 
import warnings
from collections import Counter
# Silence every warning so the notebook output stays free of warning text.
warnings.simplefilter('ignore')

# Plot appearance: ggplot theme, small default canvas, 80 dpi.
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': [6, 3],
    'figure.dpi': 80,
})

In [None]:
# Relative location of the heart-disease CSV this notebook analyses.
DATA_PATH = "../input/heart-disease-uci/heart.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# Missing values: count the null entries in every column.
# Author's observation on the output: no missing values.
data.isna().sum()

In [None]:
# Print the frame summary produced by DataFrame.info().
data.info()

303 rows & 14 columns

In [None]:
# Show the descriptive statistics table produced by DataFrame.describe().
data.describe()

Generate categorical and numerical columns

In [None]:
# Column groups reused by the plotting cells below.
# cat_col: columns treated as categorical (includes the 'target' label itself).
cat_col = [
    'sex', 'cp', 'restecg', 'exang', 'slope',
    'ca', 'thal', 'fbs', 'target',
]
# num_col: columns treated as numerical.
num_col = [
    'age', 'trestbps', 'chol', 'thalach', 'oldpeak',
]


First Look at categorical data 

In [None]:
# One grid row per categorical column: the share of each level as a pie chart
# on the left and the raw count of each level as a bar chart on the right.
plt.figure(figsize=(12, 28))
n_rows = len(cat_col)  # grid height follows the column list instead of a hard-coded 9
for row, cols in enumerate(cat_col):
    # Left panel (odd subplot index): percentage of each level.
    plt.subplot(n_rows, 2, 2 * row + 1)
    data[cols].value_counts().plot.pie(shadow=True, autopct='%1.1f%%')

    # Right panel (even subplot index): count of each level.
    plt.subplot(n_rows, 2, 2 * row + 2)
    # The column must be passed as x=...: handing it over positionally together
    # with data=... breaks on seaborn releases where `data` is the first
    # positional parameter, while the keyword form works on old and new ones.
    sns.countplot(x=cols, data=data)

* sex - male (68.3%), female (31.7%)
* cp - the most frequent value is 0 (47.2%); frequency order is 0 -> 2 -> 1 -> 3
* restecg - the most frequent values are 0 (50.2%) and 1 (48.5%); 2 (1.3%) is rare
* exang - 67.3% are "no" and 32.7% are "yes"
* slope - values 1 and 2 occur most often (about 46% each); 0 is rare (6.9%)
* ca - values range from 0 to 4; frequency order is 0 -> 1 -> 2 -> 3 -> 4
* thal - values are 0, 1, 2, 3; 2 and 3 are the most frequent, 0 is the least frequent
* fbs - 85.1% are 0 and 14.9% are 1
* target - 54.5% have heart disease, 45.5% do not

Numerical Data - Outlier detection and removal

In [None]:
# The outlier check below runs on the numerical columns only.
features = num_col

def outlier_hunt(df, n=2, iqr_factor=1.5, verbose=True):
    """
    Return the index labels of observations that are outliers in more than
    ``n`` features (more than 2 by default).

    A value of a column is an outlier when it lies below
    ``Q1 - iqr_factor * IQR`` or above ``Q3 + iqr_factor * IQR`` of that
    column.  The quartiles are computed with ``np.nanpercentile`` so that a
    missing value in a column does not turn both fences into NaN, which would
    silently hide every outlier of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding only the feature columns to inspect.
    n : int, default 2
        A row is reported when it is an outlier in strictly more than ``n``
        columns.
    iqr_factor : float, default 1.5
        Multiple of the inter-quartile range used as the outlier step.
    verbose : bool, default True
        When true, print the per-column quartiles, IQR and step as well as the
        final outlier counter.

    Returns
    -------
    list
        Index labels of the rows flagged in more than ``n`` columns.
    """
    # index label -> number of columns in which that row is an outlier
    outlier_indices = Counter()

    # iterate over features (columns)
    for col in df.columns.tolist():
        # 1st quartile (25%) and 3rd quartile (75%), ignoring missing values
        Q1 = np.nanpercentile(df[col], 25)
        Q3 = np.nanpercentile(df[col], 75)

        # Interquartile range (IQR) and the resulting outlier step
        IQR = Q3 - Q1
        outlier_step = iqr_factor * IQR

        if verbose:
            print("col", col)
            print("Q1", Q1)
            print("Q3", Q3)
            print("IQR", IQR)
            print("outlier_step", outlier_step)

        # Index labels of the rows outside the fences for this column;
        # comparisons against NaN are False, so missing cells are never flagged.
        below = df[col] < Q1 - outlier_step
        above = df[col] > Q3 + outlier_step
        outlier_indices.update(df[below | above].index)

    # keep the observations that are outliers in more than n columns
    multiple_outliers = [k for k, v in outlier_indices.items() if v > n]

    if verbose:
        print("outlier_indices", outlier_indices)
    return multiple_outliers

# Run the outlier check on the numerical columns and report how many rows it flags.
multi_outlier_rows = outlier_hunt(data[features])
print('The dataset contains %d observations with more than 2 outliers' % len(multi_outlier_rows))


No observation has more than 2 outlying features, hence there is no need to drop any rows

Numerical Data Analysis

In [None]:
# One grid row per numerical column: a box plot per target class on the left,
# and the two per-class density curves overlaid on the right.
plt.figure(figsize=(12, 24))
for position, cols in enumerate(num_col):
    # Left panel: distribution of the column for each target value.
    plt.subplot(6, 2, 2 * position + 1)
    sns.boxplot(x='target', y=cols, data=data)

    # Right panel: KDE of the column for target == 0 (red) and target == 1 (blue).
    plt.subplot(6, 2, 2 * position + 2)
    present = data[cols].notnull()
    without_target = data[cols][(data["target"] == 0) & present]
    with_target = data[cols][(data["target"] == 1) & present]

    g = sns.kdeplot(without_target, color="Red", shade=True)
    g = sns.kdeplot(with_target, ax=g, color="Blue", shade=True)
    g.set_xlabel(cols)
    g.set_ylabel("Frequency")
    # Legend entries follow the drawing order: target == 0 first, then target == 1.
    g = g.legend(["No Diesese", "Diesese"])

* age - young people (25-30) have a higher chance of heart disease than older people
* For people having heart disease, most of the distribution lies between 45 and 59,
  versus 51-62 for people without heart disease
* trestbps - people with values of 190-220 and 80-85 have no heart disease
* chol - people with very high chol (530-600) have heart disease
* fbs - does not have any variance
* thalach - people with values of 52-85 do not have the disease
* the highest frequency of having the disease is between 150 and 172
* oldpeak - people with values of 5-7 and of -1.5 to -1.2 have no disease


In [None]:
# Categorical variables: one bar catplot of target against every categorical
# column, skipping 'target' itself.
for cols in cat_col:
    if cols == 'target':
        continue
    sns.catplot(data=data, kind='bar', x=cols, y='target')


* sex - females have a higher chance of heart disease than males
* cp - values 1, 2 and 3 indicate a higher chance of heart disease than 0
* restecg - values 0 and 1 indicate a higher chance of disease than 2
* restecg value 2 seems to be an outlier
* exang - value 0 indicates a higher chance of disease than 1
* slope - value 2 indicates a higher chance of disease than 0 and 1
* ca - values 4 and 0 have a higher chance of disease than 1, 2 and 3
* thal - value 2 has the highest chance and value 3 the lowest chance of heart disease
* fbs - fbs=0 has a slightly higher chance of heart disease than fbs=1


Explore Categorical Variables

In [None]:
# Point catplot: target vs. cp, coloured by sex, one panel per restecg value.
sns.catplot(
    data=data,
    kind='point',
    x='cp',
    y='target',
    hue='sex',
    col='restecg',
)


Females have a higher chance of heart disease than males for cp = 0 to 3 and restecg = 0 to 2

In [None]:
# Point catplot: target vs. cp, coloured by sex, one panel per exang value.
sns.catplot(
    data=data,
    kind='point',
    x='cp',
    y='target',
    hue='sex',
    col='exang',
)


Females have a higher chance of heart disease than males for cp = 0 to 3 and exang = 0 or 1,
except for cp=3 & exang=1, for which females do not have heart disease


In [None]:
# Point catplot: target vs. cp, coloured by sex, one panel per slope value.
sns.catplot(
    data=data,
    kind='point',
    x='cp',
    y='target',
    hue='sex',
    col='slope',
)


* females have no heart disease for slope = 0 and cp = 0 to 3
* for slope = 1 and 2, females have a higher chance of heart disease than males
* this holds for cp = 0 to 3, except slope=1 & cp=3, for which females have no heart disease


In [None]:
# Point catplot: target vs. cp, coloured by sex, one panel per ca value.
sns.catplot(
    data=data,
    kind='point',
    x='cp',
    y='target',
    hue='sex',
    col='ca',
)



* Female -
* For ca=0, females have a higher chance of disease than males
* For ca=1 & cp = 0 to 2, females have a higher chance of disease than males
* For ca=2 & cp = 0 or 1, only females have the disease
* For ca = 3 and 4, females do not have the disease

* Male -
* For (ca=1 & cp=3), (ca=2 & cp = 2 to 3), and ca = 3 and 4, only males have the disease


In [None]:
# Point catplot: target vs. cp, coloured by sex, one panel per thal value.
sns.catplot(
    data=data,
    kind='point',
    x='cp',
    y='target',
    hue='sex',
    col='thal',
)

* male - for thal=0 & cp=2, only males have the disease
* for thal=1 & cp = 0 to 3, only males have the disease
* for thal=2 & cp = 0, 2, 3, females have a higher chance of heart disease than males
* except for cp=1, where both have an equal chance of heart disease
* for thal=3, females have a higher chance for cp=2; for other cp values only males have heart disease


In [None]:
# Point catplot: target vs. exang, coloured by sex, one panel per restecg value.
sns.catplot(
    data=data,
    kind='point',
    x='exang',
    y='target',
    hue='sex',
    col='restecg',
)


* for restecg=2, only females have the disease for exang=0
* for restecg=0, females have a higher chance of disease than males


In [None]:
# Explore numerical variables.
# Box catplot: age per target value, coloured by sex, one panel per cp value.
sns.catplot(
    data=data,
    kind='box',
    x='target',
    y='age',
    hue='sex',
    col='cp',
)


For cp=0&3 age of female is slightly higher than male 

In [None]:
# Box catplot: chol per target value, coloured by sex, one panel per exang value.
sns.catplot(
    data=data,
    kind='box',
    x='target',
    y='chol',
    hue='sex',
    col='exang',
)

Freq distribution of chol is higher for female than male  

In [None]:
# Box catplot: chol per target value, coloured by sex, one panel per slope value.
sns.catplot(
    data=data,
    kind='box',
    x='target',
    y='chol',
    hue='sex',
    col='slope',
)

For slope=0 & disease=1, chol values for males are higher than for females

In [None]:
# Box catplot: age per target value, coloured by sex, one panel per ca value.
sns.catplot(
    data=data,
    kind='box',
    x='target',
    y='age',
    hue='sex',
    col='ca',
)

For ca=2 and disease=1, males are much younger than females

In [None]:
# Box catplot: oldpeak per target value, coloured by sex, one panel per thal value.
sns.catplot(
    data=data,
    kind='box',
    x='target',
    y='oldpeak',
    hue='sex',
    col='thal',
)

For thal=3 & disease=1, oldpeak of males and females is lower than for disease=0

In [None]:
# Point catplot: target vs. cp, coloured by sex, one panel per fbs value.
sns.catplot(
    data=data,
    kind='point',
    x='cp',
    y='target',
    hue='sex',
    col='fbs',
)


Females have a higher chance of heart disease than males for fbs = 0

In [None]:
# Variable = age vs. restecg: one age distribution panel per restecg value.
f, ax = plt.subplots(1, 3, figsize=(12, 8))
panel_titles = ('age in restecg 0', 'age in restecg 1', 'age in restecg 2')
for axis, level, title in zip(ax, (0, 1, 2), panel_titles):
    ages = data[data['restecg'] == level].age
    sns.distplot(ages, ax=axis)
    axis.set_title(title)
plt.show()

older people (age= 60 & 80) have restecg=2

In [None]:
# Variable = age vs. fbs: one age distribution panel per fbs value.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
panel_titles = ('age in fbs 0', 'age in fbs 1')
for axis, level, title in zip(ax, (0, 1), panel_titles):
    ages = data[data['fbs'] == level].age
    sns.distplot(ages, ax=axis)
    axis.set_title(title)
# Finish with plt.show(), like the restecg cell, so the cell renders only the
# figure instead of also echoing the repr of the last set_title() call.
plt.show()

people (age< 42) have no fbs

In [None]:
# Annotated heatmap of the pairwise correlations returned by data.corr()
# (a correlation heatmap, not a pair plot).
corr_matrix = data.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    annot_kws={'size': 12},
    cmap='RdYlGn',
    linewidths=0.2,
)

# Enlarge the current figure and its tick labels so the annotations stay readable.
fig = plt.gcf()
fig.set_size_inches(18, 10)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=14)
plt.show()

Data Processing 

In [None]:
# Separate the predictors from the label column.
train = data.drop(columns=["target"])
train_ = data["target"]

# Plain NumPy arrays for scikit-learn.
X_train, y_train = train.to_numpy(), train_.to_numpy()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so the held-out rows contribute to the scaling statistics.
sc = StandardScaler()
X_train = sc.fit(X_train).transform(X_train)

Train Test Split

In [None]:
from sklearn.model_selection import train_test_split, cross_val_predict,cross_validate
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
train_x, test_x, train_y, test_y = split_parts
print("Train dataset shape: {0}, \nTest dataset shape: {1}".format(train_x.shape, test_x.shape))


Machine Learning Algorithm begins

Simple KNN (K Nearest Neighbours)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# For K = 1..29, fit a KNN classifier and record how many test samples it misclassifies.
# NOTE(review): the sweep is scored on test_x/test_y, so a K picked from this
# list is tuned on the test set.
Misclassified_sample = []
for n_neighbors in range(1, 30):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(train_x, train_y)
    wrong = knn.predict(test_x) != test_y
    Misclassified_sample.append(wrong.sum())
print("Misclassified_sample = ", Misclassified_sample)

In [None]:
# Final KNN model with K=8, the value the author picked from the sweep above
# as giving the lowest number of misclassified samples.
from sklearn.metrics import accuracy_score, confusion_matrix

KNN_classifier = KNeighborsClassifier(n_neighbors=8)

# Fit on the training split
KNN_classifier.fit(train_x, train_y)

# Predict the held-out test rows
prediction = KNN_classifier.predict(test_x)

###### confusion matrix starts ######
# Tick labels are taken from truth AND predictions, and handed to
# confusion_matrix explicitly, so the ticks always match the matrix even when
# the model never predicts one of the classes.
names = np.unique(np.concatenate([test_y, prediction]))
cm_knn = confusion_matrix(test_y, prediction, labels=names)
sns.heatmap(cm_knn, square=True, annot=True, cbar=False, xticklabels=names, yticklabels=names, cmap="YlGnBu", fmt='g')
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns; heatmap draws rows along the y axis.
plt.xlabel('Predicted')
plt.ylabel('Truth')
###### Confusion matrix ends ########

# Accuracy on the test split. Stored under its own name so the imported
# accuracy_score function is not overwritten by a float.
knn_accuracy = accuracy_score(test_y, prediction)
print("accuracy_score KNN=8 :", knn_accuracy)

KNN accuracy : 88.52% with only 7 misclassified samples

Logistic Regression

In [None]:
# Generic function for model building
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
def fit_and_test(classifier, X_train, y_train, X_test, y_test, only_return_accuracy=False):
    """Fit ``classifier`` on the training data, score it on the test data.

    The metrics are printed exactly as before and are now also returned, so a
    caller can compare runs (for example to pick the best hyperparameter)
    instead of reading them off the printed output.

    Parameters
    ----------
    classifier : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train :
        Training features and labels handed to ``classifier.fit``.
    X_test, y_test :
        Test features handed to ``classifier.predict`` and the labels the
        predictions are compared against.
    only_return_accuracy : bool, default False
        When true, only the accuracy is computed, printed and returned;
        otherwise the F1 score is added.

    Returns
    -------
    dict
        ``{'accuracy': ...}`` plus ``'f1_score'`` unless
        ``only_return_accuracy`` is true.
    """
    classifier.fit(X_train, y_train)
    y_hat = classifier.predict(X_test)

    metrics = {'accuracy': accuracy_score(y_test, y_hat)}
    print('accuracy:', metrics['accuracy'])

    if not only_return_accuracy:
        metrics['f1_score'] = f1_score(y_test, y_hat)
        print('f1_score:', metrics['f1_score'])

    return metrics


In [None]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
# Sweep the regularisation hyperparameter 'C' and print the test accuracy for each value.
c_values = [0.001, .01, 0.02, 0.05, 0.25, 0.5, 0.75, 1, 1.05, 1.1, 1.5, 1.6, 2, 7]
for c in c_values:
    print (f'At C = {c}:-', end=' ')
    lr = LogisticRegression(C=c, max_iter=1000)
    fit_and_test(lr, train_x, train_y, test_x, test_y, only_return_accuracy=True)

Logistic Regression Accuracy : 85.42% 

# Conclusion : We recommend KNN (88.52% accuracy with only 7 misclassified samples) to build the model

# **Please UpVote, if you have liked my Kernel :)**