In [1]:
import numpy as np 
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

In [39]:
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

In [4]:
from sklearn.decomposition import PCA

In [5]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [6]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


### Outlier removal

In [8]:
# outliers present in the following (based on df.describe()): resting bp,cholestoral, fasting bs, oldpeak



In [9]:
def get_z_score(val, mean, std):
    return (val-mean)/std

_, RestingBP_mean, RestingBP_std, *_ = df.RestingBP.describe()
_, Cholesterol_mean, Cholesterol_std, *_ = df.Cholesterol.describe()
_, FastingBS_mean, FastingBS_std, *_ = df.FastingBS.describe()
_, Oldpeak_mean, Oldpeak_std, *_ = df.Oldpeak.describe()

df['RestingBP_z'] = df.RestingBP.apply(lambda x:get_z_score(x, RestingBP_mean, RestingBP_std))
df['Cholesterol_z'] = df.Cholesterol.apply(lambda x: get_z_score(x, Cholesterol_mean, Cholesterol_std))
df['FastingBS_z'] = df.FastingBS.apply(lambda x: get_z_score(x, FastingBS_mean, FastingBS_std))
df['Oldpeak_z'] = df.Oldpeak.apply(lambda x: get_z_score(x, Oldpeak_mean, Oldpeak_std))

df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,RestingBP_z,Cholesterol_z,FastingBS_z,Oldpeak_z
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,0.410685,0.824621,-0.551041,-0.831979
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,1.49094,-0.171867,-0.551041,0.105606
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-0.129442,0.769768,-0.551041,-0.831979
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,0.30266,0.138964,-0.551041,0.574398
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,0.950812,-0.034736,-0.551041,-0.831979


In [10]:
df1 = df[(df['RestingBP_z']<3)]# | (df['RestingBP_z']<-3)]# or (df['Cholesterol_z']>3) or (df['FastingBS_z']>3) or (df['Oldpeak_z']>3)]

In [11]:
df1.shape

(911, 16)

In [12]:
df2 = df1[(df1['Cholesterol_z']<=3)]# | (df1['Cholesterol_z']>-3)]

In [13]:
df2.shape

(908, 16)

In [14]:
df2[(df2['FastingBS_z']>3) | (df2['FastingBS_z']<-3)]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,RestingBP_z,Cholesterol_z,FastingBS_z,Oldpeak_z


In [15]:
df2[(df2['Oldpeak_z']>3)]# | (df2['Oldpeak_z']<-3)]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,RestingBP_z,Cholesterol_z,FastingBS_z,Oldpeak_z
166,50,M,ASY,140,231,0,ST,140,Y,5.0,Flat,1,0.410685,0.294379,-0.551041,3.855945
702,59,M,TA,178,270,0,LVH,145,N,4.2,Down,0,2.463169,0.650921,-0.551041,3.105877
771,55,M,ASY,140,217,0,Normal,111,Y,5.6,Down,1,0.410685,0.16639,-0.551041,4.418496
791,51,M,ASY,140,298,0,Normal,122,Y,4.2,Flat,1,0.410685,0.9069,-0.551041,3.105877
850,62,F,ASY,160,164,0,LVH,145,N,6.2,Down,1,1.49094,-0.318141,-0.551041,4.981047
900,58,M,ASY,114,318,0,ST,140,N,4.4,Down,1,-0.993646,1.089741,-0.551041,3.293394


In [16]:
df2[(df2['Oldpeak_z']>=3)]# | (df2['Oldpeak_z']>-3)]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,RestingBP_z,Cholesterol_z,FastingBS_z,Oldpeak_z
166,50,M,ASY,140,231,0,ST,140,Y,5.0,Flat,1,0.410685,0.294379,-0.551041,3.855945
702,59,M,TA,178,270,0,LVH,145,N,4.2,Down,0,2.463169,0.650921,-0.551041,3.105877
771,55,M,ASY,140,217,0,Normal,111,Y,5.6,Down,1,0.410685,0.16639,-0.551041,4.418496
791,51,M,ASY,140,298,0,Normal,122,Y,4.2,Flat,1,0.410685,0.9069,-0.551041,3.105877
850,62,F,ASY,160,164,0,LVH,145,N,6.2,Down,1,1.49094,-0.318141,-0.551041,4.981047
900,58,M,ASY,114,318,0,ST,140,N,4.4,Down,1,-0.993646,1.089741,-0.551041,3.293394


In [17]:
df3 = df2[(df2['Oldpeak_z']<=3)]

In [18]:
df3.shape

(902, 16)

In [19]:
df3.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,RestingBP_z,Cholesterol_z,FastingBS_z,Oldpeak_z
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0,0.410685,0.824621,-0.551041,-0.831979
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1,1.49094,-0.171867,-0.551041,0.105606
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0,-0.129442,0.769768,-0.551041,-0.831979
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1,0.30266,0.138964,-0.551041,0.574398
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0,0.950812,-0.034736,-0.551041,-0.831979


In [20]:
df3 = df3.drop(['RestingBP_z','Cholesterol_z','FastingBS_z','Oldpeak_z'], axis='columns')

In [21]:
df3.shape

(902, 12)

### Label encoding

In [22]:
df4 = df3.copy()

In [23]:
df4.Sex = df4.Sex.replace(
{'M':0,
    'F':1}
)

In [24]:
df4.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,1,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,0,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,1,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,0,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [25]:
df4.ChestPainType.unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [26]:
df4.ChestPainType.replace(
{
    'ATA':0, 'NAP':1, 'ASY':2, 'TA':3
}, inplace=True
)

In [27]:
df4.RestingECG.unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [28]:
df4.RestingECG.replace(
{
    'Normal':0, 'ST':1, 'LVH':2
    
}, inplace=True)

In [29]:
df4.ExerciseAngina.unique()

array(['N', 'Y'], dtype=object)

In [30]:
df4.ExerciseAngina.replace(
{
    'N':0,
    'Y':1
}, inplace=True)

In [31]:
df4.ST_Slope.unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [32]:
df4.ST_Slope.replace({
    'Up':0, 'Flat':1, 'Down':2
}, inplace=True)

In [33]:
df4.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,0,140,289,0,0,172,0,0.0,0,0
1,49,1,1,160,180,0,0,156,0,1.0,1,1
2,37,0,0,130,283,0,1,98,0,0.0,0,0
3,48,1,2,138,214,0,0,108,1,1.5,1,1
4,54,0,1,150,195,0,0,122,0,0.0,0,0


In [34]:
df5 = pd.get_dummies(df4, columns=['Sex','ChestPainType','FastingBS','RestingECG','ExerciseAngina','ST_Slope'], \
                     drop_first=True)

In [35]:
df5.head()

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak,HeartDisease,Sex_1,ChestPainType_1,ChestPainType_2,ChestPainType_3,FastingBS_1,RestingECG_1,RestingECG_2,ExerciseAngina_1,ST_Slope_1,ST_Slope_2
0,40,140,289,172,0.0,0,0,0,0,0,0,0,0,0,0,0
1,49,160,180,156,1.0,1,1,1,0,0,0,0,0,0,1,0
2,37,130,283,98,0.0,0,0,0,0,0,0,1,0,0,0,0
3,48,138,214,108,1.5,1,1,0,1,0,0,0,0,1,1,0
4,54,150,195,122,0.0,0,0,1,0,0,0,0,0,0,0,0


In [36]:
X = df5.drop(['HeartDisease'], axis='columns')
y = df5.HeartDisease

### Scale the data

In [37]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Standalone model - SVC

In [38]:
model = SVC()

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y)

In [42]:
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8893805309734514

## Bagging - SVC

In [44]:
bag_model = BaggingClassifier(
            base_estimator = SVC(),
            n_estimators = 100,
            max_samples = 0.8,
            oob_score = True,
            random_state = 2)

In [45]:
bag_model.fit(X_train, y_train)
bag_model.oob_score_

0.8609467455621301

In [46]:
bag_model.score(X_test, y_test)

0.8893805309734514

Bagging is effective when we have high variance and instable model such as decision tree. Let's explore how bagging changes the performance for a decision tree classifier.

## Standalone model - Decision tree

In [47]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit( X_train, y_train)
model.score(X_test, y_test)

0.7920353982300885

## Bagging - Decision tree

In [48]:
from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
bag_model.fit(X_train, y_train)
bag_model.oob_score_

0.8461538461538461

In [49]:
bag_model.score(X_test, y_test)

0.8451327433628318

You can see that with bagging the score improved 

* Bagging tries to solve over-fitting problem while Boosting tries to reduce bias. If the classifier is unstable (high variance), then we should apply Bagging. If the classifier is stable and simple (high bias) then we should apply Boosting.