In [1]:
import pandas as pd
import numpy as np

import seaborn as sns

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Dataset from kaggle
# https://www.kaggle.com/dileep070/heart-disease-prediction-using-logistic-regression
dataset = pd.read_csv('../data/framingham.csv')
dataset.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
# Drop all the NAs
dataset.dropna(inplace = True)

In [4]:
# Identify the dependent and Independent Variables
X = dataset.iloc[:,:-1].values
Y = dataset.iloc[:,-1].values

In [5]:
# Split into training and test dataset
X_train , X_test , Y_train , Y_test= train_test_split(X, Y, test_size = 0.2)

**BaggingClassifier**:

A Bagging classifier is an ensemble meta-estimator that fits base classifiers each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it.

This algorithm encompasses several works from the literature. When random subsets of the dataset are drawn as random subsets of the samples, then this algorithm is known as Pasting. If samples are drawn with replacement, then the method is known as Bagging. When random subsets of the dataset are drawn as random subsets of the features, then the method is known as Random Subspaces. Finally, when base estimators are built on subsets of both samples and features, then the method is known as Random Patches.

**BaggingClassifier Parameters:**
- `base_estimator` : The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree.
***
- `n_estimators` : The number of base estimators in the ensemble.
***
- `max_samples` : The number of samples to draw from X to train each base estimator.
***
- `max_features` : The number of features to draw from X to train each base estimator.
***
- `bootstrap` : Whether samples are drawn with replacement. If False, sampling without replacement is performed.
***
- `bootstrap_features` : Whether features are drawn with replacement.
***
- `oob_score` : Whether to use out-of-bag samples to estimate the generalization error.
***
- `warm_start` : When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble.

In [6]:
from sklearn.ensemble import BaggingClassifier

clf = SVC(gamma="auto")
bagging_clf = BaggingClassifier(base_estimator=clf, n_estimators=150, random_state=42)
bagging_clf.fit(X_train, Y_train)

BaggingClassifier(base_estimator=SVC(gamma='auto'), n_estimators=150,
                  random_state=42)

In [7]:
Y_pred1 = bagging_clf.predict(X_test)

In [8]:
# Confusion Matrix
cm_score = confusion_matrix(Y_pred1, Y_test)
print("Confusion Matrix")
print(cm_score)

# Accuracy Score
acc_score = accuracy_score(Y_pred1, Y_test)
print("Accuracy Score")
print(acc_score)

Confusion Matrix
[[624 108]
 [  0   0]]
Accuracy Score
0.8524590163934426
