In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively visit /kaggle/input and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Let's import the libraries for EDA and data visualization.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling as pp
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation and convergence warnings raised by later cells.
warnings.filterwarnings('ignore')

In [None]:
# Read heart.csv from the Kaggle input directory into the working frame `df`.
df = pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

The dataset is very small, so we should try simple models here.
Let's see what the data looks like.

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Summary statistics for the columns of df.
df.describe()

Let's look at the pandas-profiling report for this data.

In [None]:
# Build the pandas-profiling report for df; as the last expression of the
# cell, the report object is what the notebook displays.
pp.ProfileReport(df)

The data is a mixture of categorical and continuous values. That's cool.
Let's look at the distribution of the target.

In [None]:
# Number of rows per value of the target column.
df['target'].value_counts()

**DATA DESCRIPTION**

(This data description is given on UCI website for this dataset[ Link](https://archive.ics.uci.edu/ml/datasets/heart+disease))

*  age: age in years
*  sex: sex (1 = male; 0 = female)
*  cp: chest pain type
    -- Value 1: typical angina
    -- Value 2: atypical angina
    -- Value 3: non-anginal pain
    -- Value 4: asymptomatic
*  trestbps: resting blood pressure (in mm Hg on admission to the hospital)
*  chol: serum cholesterol in mg/dl
*  fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
*  restecg: resting electrocardiographic results
    -- Value 0: normal
    -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
*  thalach: maximum heart rate achieved
*  exang: exercise induced angina (1 = yes; 0 = no)
*  oldpeak = ST depression induced by exercise relative to rest
*  slope: the slope of the peak exercise ST segment
    -- Value 1: upsloping
    -- Value 2: flat
    -- Value 3: downsloping
*  ca: number of major vessels (0-3) colored by fluoroscopy
*  thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
*  target: Heart disease (0 = no, 1 = yes)

Let's see if there are any null values in the data. If there are, we have to handle them here.

In [None]:
# Count the null entries in each column.
df.isnull().sum()

So there are no null values in the data. It seems like a small and clean dataset. Let's proceed...

Now let's see how many unique values each feature has, to get an idea of which features are categorical and which are numerical.

In [None]:
# Print each column name next to its number of distinct values.
for name, n_distinct in df.nunique().items():
    print(name, n_distinct)

Cool...
Now let's look at the correlation matrix of the data and see if there are any highly correlated features that we should pay attention to.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
plt.rcParams['figure.figsize'] = (16, 14)
corr_matrix = df.corr()
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='PiYG')
heatmap_ax.set_title('Heatmap of Data', fontsize=20)
plt.show()

Nope. Seems like no such highly correlated feature.

**PLOTS**

Okay. Time to make some plots to get a feel for the data. This part is very important before modelling. Maybe for small datasets you can get away without any plots and visualizations, but when the data is huge and there are lots of features, you will have to make some plots to get a better idea of the dataset.

Let's first plot the numeric features and see their distributions.

In [None]:
# Histogram + KDE of every numeric feature on a 3x2 grid.
f, ax = plt.subplots(3, 2, figsize=(12, 12))
# Only five features are plotted, so drop the unused sixth panel.
f.delaxes(ax[2, 1])

for i, feature in enumerate(['age', 'thalach', 'chol', 'trestbps', 'oldpeak']):
    # histplot replaces the deprecated distplot; stat='density' keeps the
    # histogram on the same scale as the overlaid KDE curve.
    sns.histplot(df[feature], ax=ax[i // 2, i % 2], kde=True, stat='density', color='y')

Seems fine.

Now let's plot the categorical features and see their distributions. We are using countplot here.

In [None]:
# Count plot of every categorical feature on a 4x2 grid.
f, ax = plt.subplots(4, 2, figsize=(10, 8))

for i, feature in enumerate(['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']):
    sns.countplot(x=feature, data=df, ax=ax[i // 2, i % 2], alpha=0.8, edgecolor=('white'), linewidth=2)

# Lay out the grid once, after all panels are drawn, instead of on every
# loop iteration.
f.tight_layout()

Okay, so some features are not evenly distributed here. With small datasets, this problem can occur.

We have visualized each feature individually. Now we can plot their relation with the target variable to see their impact on the target. This can show how important a feature may be in predicting the target.

Lets first see numeric features interaction with target

In [None]:
# Distribution of age within each target class.
plt.rcParams['figure.figsize'] = (8, 6)
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.violinplot(data=df, x='target', y='age', palette='colorblind')
plt.title('Age vs Target', fontsize = 20, fontweight = 30)
plt.show()

In this distribution we can see Age is not a good feature in deciding target. 

In [None]:
# Distribution of thalach within each target class.
plt.rcParams['figure.figsize'] = (8, 6)
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.violinplot(data=df, x='target', y='thalach', palette='colorblind')
plt.title('thalach vs Target', fontsize = 20, fontweight = 30)
plt.show()

Here target 1 seems to have a higher mean, while low "maximum heart rate achieved" values are associated with no heart disease. That makes sense to us. Great.

In [None]:
# Distribution of chol within each target class.
plt.rcParams['figure.figsize'] = (8, 6)
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.violinplot(data=df, x='target', y='chol', palette='colorblind')
plt.title('chol vs Target', fontsize = 20, fontweight = 30)
plt.show()

In this plot, we can see people with heart diseases might have some high cholesterol values. Good feature!

In [None]:
# Distribution of trestbps within each target class.
plt.rcParams['figure.figsize'] = (8, 6)
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.violinplot(data=df, x='target', y='trestbps', palette='colorblind')
plt.title('trestbps vs Target', fontsize = 20, fontweight = 30)
plt.show()

This plot shows that people not suffering from heart disease might have slightly higher blood pressure.

In [None]:
# Distribution of oldpeak within each target class.
plt.rcParams['figure.figsize'] = (8, 6)
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.violinplot(data=df, x='target', y='oldpeak', palette='colorblind')
plt.title('oldpeak vs Target', fontsize = 20, fontweight = 30)
plt.show()

This plot shows that people not suffering from heart disease might have higher ST depression.

Now lets see relation of categorical features with target.

In [None]:
# Share of each restecg value within every target class, as grouped bars.
plt.rcParams['figure.figsize'] = (8, 6)
dat = pd.crosstab(df['target'], df['restecg'])
row_totals = dat.sum(axis=1).astype(float)
bar_ax = dat.div(row_totals, axis=0).plot.bar()
bar_ax.set_title('Relation of ECG measurement with Target', fontsize=20, fontweight=30)
plt.show()

restecg: Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)

Here we can see 0 and 2 values of ECG are more common in People with no heart disease. And value of 1 is more common in People with disease.

In [None]:
# Share of each fbs value within every target class, as grouped bars.
plt.rcParams['figure.figsize'] = (8, 6)
dat = pd.crosstab(df['target'], df['fbs'])
row_totals = dat.sum(axis=1).astype(float)
bar_ax = dat.div(row_totals, axis=0).plot.bar()
bar_ax.set_title('Relation of blood sugar with Target', fontsize=20, fontweight=30)
plt.show()

Not any such difference.

In [None]:
# Share of each sex value within every target class, as grouped bars.
plt.rcParams['figure.figsize'] = (8, 6)
dat = pd.crosstab(df['target'], df['sex'])
row_totals = dat.sum(axis=1).astype(float)
bar_ax = dat.div(row_totals, axis=0).plot.bar()
bar_ax.set_title('Relation of Gender with Target', fontsize=20, fontweight=30)
plt.show()

So, in this dataset, you can say women appear more prone to heart disease.

In [None]:
# Share of each cp value within every target class, as grouped bars.
plt.rcParams['figure.figsize'] = (8, 6)
dat = pd.crosstab(df['target'], df['cp'])
row_totals = dat.sum(axis=1).astype(float)
bar_ax = dat.div(row_totals, axis=0).plot.bar()
bar_ax.set_title('Relation of  chest pain with Target', fontsize=20, fontweight=30)
plt.show()

cp: The chest pain experienced (Value 1: typical angina, Value 2: atypical angina, Value 3: non-anginal pain, Value 4: asymptomatic)

Here we can see count of 0 is high in people with no heart disease whereas count of other values are high in people with heart disease.

In [None]:
# Share of each exang value within every target class, as grouped bars.
plt.rcParams['figure.figsize'] = (8, 6)
dat = pd.crosstab(df['target'], df['exang'])
row_totals = dat.sum(axis=1).astype(float)
bar_ax = dat.div(row_totals, axis=0).plot.bar()
bar_ax.set_title('Relation of Exercise induced angina with Target', fontsize=20, fontweight=30)
plt.show()

Here also we can see relation with target. count of 0 is high in people with disease and count of 1 is high in people without disease.

In [None]:
# Share of each slope value within every target class, as grouped bars.
plt.rcParams['figure.figsize'] = (8, 6)
dat = pd.crosstab(df['target'], df['slope'])
row_totals = dat.sum(axis=1).astype(float)
bar_ax = dat.div(row_totals, axis=0).plot.bar()
bar_ax.set_title('Relation of slope with Target', fontsize=20, fontweight=30)
plt.show()

Also related. Count of 1 is higher in people without disease and count of 2 is higher in people with disease.

In [None]:
# Share of each ca value within every target class, as grouped bars.
plt.rcParams['figure.figsize'] = (8, 6)
dat = pd.crosstab(df['target'], df['ca'])
row_totals = dat.sum(axis=1).astype(float)
bar_ax = dat.div(row_totals, axis=0).plot.bar()
bar_ax.set_title('Relation of major vessels with Target', fontsize=20, fontweight=30)
plt.show()

This plot is also very clear. The count of 0 is higher in people with disease, and the other counts are higher in people without disease.

In [None]:
# Share of each thal value within every target class, as grouped bars.
plt.rcParams['figure.figsize'] = (8, 6)
dat = pd.crosstab(df['target'], df['thal'])
row_totals = dat.sum(axis=1).astype(float)
bar_ax = dat.div(row_totals, axis=0).plot.bar()
bar_ax.set_title('Relation thalassemia with Target', fontsize=20, fontweight=30)
plt.show()

This plot is also very clear. Count of 2 is higher in people with disease and count of 3 is higher in people without disease

So these were some important plots. You can also make bivariate plots for multiple-feature interactions, but I wanted to keep it simple.

In [None]:
# Columns treated as categorical in the rest of the notebook.
categorical_cols = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'ca',
    'thal',
]
# Columns treated as continuous in the rest of the notebook.
numeric_cols = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

In [None]:
# Print the value frequencies of every categorical column.
for col in categorical_cols:
    frequencies = df[col].value_counts()
    print(col, '\n', frequencies)

**PREPROCESSING**

In the preprocessing step, we will standard-scale all numeric features and one-hot encode all categorical features that have more than two values. This is a pretty simple and straightforward preprocessing. You can also do some outlier removal.

In [None]:
# Categorical columns with more than two distinct values (these get one-hot
# encoded later); order follows categorical_cols.
level_counts = df[categorical_cols].nunique()
multi_label_cols = level_counts[level_counts > 2].index.tolist()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the numeric features using statistics learned from the training
# rows only, so no information from the held-out rows leaks into the scaler.
from sklearn.model_selection import train_test_split

# Reproduce the row partition of the notebook's train/test split: for the same
# number of rows, test_size and random_state, train_test_split selects the
# same rows. Keep these two arguments in sync with that split.
train_pos, held_out_pos = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=0
)

std = StandardScaler()
std.fit(df.iloc[train_pos][numeric_cols])
# Apply the training-set mean/std to every row (train and held-out alike).
df[numeric_cols] = std.transform(df[numeric_cols])

In [None]:
# Dimensions of df after scaling and before the one-hot encoding step.
df.shape

In [None]:
# One-hot encode the columns listed in multi_label_cols. df is rebound to the
# encoded frame, so re-running this cell on the already-encoded frame fails
# because the original columns are gone.
df = pd.get_dummies(data = df,columns = multi_label_cols)

In [None]:
# Separate the label column from the feature matrix.
y = df['target']
x = df.drop(columns=['target'])

Now let's do a train/test split for model training and evaluation. You can also try k-fold cross-validation here.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Report the shape of each piece of the train/test split.
split_parts = (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for part_name, part in split_parts:
    print(f"Shape of {part_name} :", part.shape)

**TRAINING AND EVALUATING MODELS**

Since the dataset is small, we will try simple models to reduce overfitting here. But we can try XGBoost just for fun.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [None]:
# Candidate classifiers. The estimators that use randomness are seeded (with
# the same seed as the train/test split) so that Restart & Run All reproduces
# the reported scores.
lr = LogisticRegression()
# probability=True enables predict_proba, which relies on an internal,
# randomised cross-validation - hence the seed.
svm = SVC(probability=True, random_state=0)
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
xg = xgb.XGBClassifier(random_state=0)

In [None]:
# Fit every candidate on the training split and report accuracy, AUC, a
# confusion matrix, a classification report and a ROC curve on the test split.
models = ['lr', 'svm', 'rf', 'xg']
# Explicit name -> estimator lookup; avoids calling eval() on strings.
estimators = {'lr': lr, 'svm': svm, 'rf': rf, 'xg': xg}
for model in models:
    clf = estimators[model]
    clf.fit(x_train, y_train)
    # Probability of the positive class (column 1) drives the ROC/AUC metrics.
    y_pred_prob = clf.predict_proba(x_test)[:, 1]
    y_pred = clf.predict(x_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    auc = roc_auc_score(y_test, y_pred_prob)
    # evaluating the model
    print(f"Training Accuracy for model {model} is: ", clf.score(x_train, y_train))
    print(f"Testing Accuracy for model {model} is:", clf.score(x_test, y_test))
    print(f"AUC Score for model {model} is: {auc}")
    cm = confusion_matrix(y_test, y_pred)
    plt.figure()
    # fmt='d' prints the cell counts as plain integers.
    sns.heatmap(cm, annot=True, fmt='d')
    plt.title(f'Confusion matrix for model {model}')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    print(classification_report(y_test, y_pred))
    plt.figure()
    plt.plot(fpr, tpr)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(f'ROC for model {model}')

Here we can see the Logistic regression is doing a good job and not overfitting the data.


**Hyperparameter tuning**

Here we are trying grid search for logistic regression, which did a good job. But you can also try it on different models.

In [None]:
from sklearn.model_selection import GridSearchCV

# Search over the inverse regularisation strength C and both penalty types,
# scored with 10-fold cross-validation on the full x, y.
grid = {"C": np.logspace(-3, 3, 10), "penalty": ["l1", "l2"]}
# liblinear supports both the l1 and the l2 penalty. The default lbfgs solver
# rejects l1, which would make every l1 candidate in the grid fail to fit.
logreg = LogisticRegression(solver="liblinear")
logreg_cv = GridSearchCV(logreg, grid, cv=10)
logreg_cv.fit(x, y)
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

That's it. Seems like a decent accuracy with minimal work. 



Please give it an upvote if you like this notebook. Also, if you have any questions or comments, please post them. I will surely answer.

Thank you.