In [44]:
url = "https://raw.githubusercontent.com/codebasics/py/refs/heads/master/ML/19_Bagging/diabetes.csv"

import pandas as pd

df = pd.read_csv(url)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [45]:
ip = df.drop("Outcome",axis="columns")
op = df.Outcome

In [46]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
ip_scaled = scaler.fit_transform(ip)

In [47]:
from sklearn.model_selection import train_test_split

ip_train, ip_test, op_train, op_test = train_test_split(ip_scaled, op, stratify=op, random_state=10)
# stratify will make sure to keep the ratio of True vs False outcomes consistent through the datasets

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

scores = cross_val_score(DecisionTreeClassifier(), ip, op, cv=5)
scores

array([0.68831169, 0.63636364, 0.68831169, 0.77777778, 0.7254902 ])

In [49]:
scores.mean()

0.7032509973686445

In [50]:
from sklearn.ensemble import BaggingClassifier

bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),  # which model to use
    n_estimators=100, # how many sub estimators
    max_samples=0.8, # sample size
    oob_score=True, # test with out of bag data
    random_state=0 # for reproducibility
)

bag.fit(ip_train, op_train)

In [51]:
bag.oob_score_

0.7534722222222222

In [52]:
bag.score(ip_test, op_test)

0.7760416666666666

In [53]:
score = cross_val_score(bag, ip, op, cv=5)
score.mean()

0.7578728461081402

In [54]:
from sklearn.ensemble import RandomForestClassifier

scores = cross_val_score(RandomForestClassifier(n_estimators=50), ip, op, cv=5)
scores.mean()

0.7565911212970037

# Heart Failure

In [55]:
url = "https://raw.githubusercontent.com/codebasics/py/refs/heads/master/ML/18_PCA/Exercise/heart.csv"

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

In [57]:
heart_df = pd.read_csv(url)
heart_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [58]:
heart_df_1 = heart_df[heart_df.Cholesterol <= (heart_df.Cholesterol.mean()+3*heart_df.Cholesterol.std())]

In [59]:
heart_df_2 = heart_df_1[heart_df_1.Oldpeak <= (heart_df_1.Oldpeak.mean()+3*heart_df_1.Oldpeak.std())]

In [60]:
heart_df_3 = heart_df_2[heart_df_2.RestingBP <= (heart_df_2.RestingBP.mean()+3*heart_df_2.RestingBP.std())]

In [61]:
# map sex, exercise to integers # can use encoding here as well

heart_df_3.loc[:, 'Sex'] = heart_df_3['Sex'].map({"M": 1, "F": 0})
heart_df_3.loc[:, 'ExerciseAngina'] = heart_df_3['ExerciseAngina'].map({"Y": 1, "N": 0})

heart_df_3.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [62]:
heart_df_3 = pd.get_dummies(heart_df_3, drop_first=True, dtype=int)
heart_df_3.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_1,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_1,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,0,1,0,1,0,1,0,0,0,1


In [63]:
ip = heart_df_3.drop("HeartDisease", axis=1)
op = heart_df_3.HeartDisease

In [64]:
# scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
ip_scaled = scaler.fit_transform(ip)

In [65]:
scores = cross_val_score(SVC(), ip_scaled, op, cv=5)
scores.mean()

0.8336525475751996

In [66]:
ip_train, ip_test, op_train, op_test = train_test_split(ip_scaled, op, stratify=op, random_state=10)

In [67]:
bag_svm = BaggingClassifier(
    estimator=SVC(),  # which model to use
    n_estimators=100, # how many sub estimators
    max_samples=0.8, # sample size
    oob_score=True, # test with out of bag data
    random_state=0 # for reproducibility
)

bag_svm.fit(ip_train, op_train)

In [68]:
bag_svm.oob_score_

0.8698224852071006

In [69]:
bag_svm.score(ip_test, op_test)

0.8805309734513275

In [70]:
# For SVM Model, bagging has improved the score by 5%. Good.

In [71]:
bag_tree = BaggingClassifier(
    estimator=DecisionTreeClassifier(),  # which model to use
    n_estimators=100, # how many sub estimators
    max_samples=0.8, # sample size
    oob_score=True, # test with out of bag data
    random_state=0 # for reproducibility
)

bag_tree.fit(ip_train, op_train)

In [72]:
bag_tree.oob_score_

0.8550295857988166

In [73]:
bag_tree.score(ip_test, op_test)

0.8451327433628318

In [74]:
scores = cross_val_score(DecisionTreeClassifier(), ip, op, cv=5)
scores.mean()

0.7204849600982197

In [75]:
# for Desison Tree, the score increased by 13%

In [76]:
from sklearn.ensemble import RandomForestClassifier

scores = cross_val_score(RandomForestClassifier(), ip, op, cv=5)
scores.mean()

0.8136832412523021

In [77]:
# Random forest comes very close to BaggedTrees because that's what it is using underneath