# Lab 4 - Ensemble learning 

## Bagging algorithm

In [3]:
import pandas as pd 
import numpy as np 

In [5]:
# Read the diabetes dataset from the CSV file in the working directory.
df = pd.read_csv("diabetes.csv")
# Preview the first five rows to see how the dataset is structured.
df.head(n=5)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Missing values make a dataset incomplete, which leads to inconsistent
# results and poor model performance, so count them per column first.
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
# Separate the model inputs from the target.
# X keeps every input column; y is the "Outcome" column the model has to predict.
X = df.drop(columns="Outcome")
y = df["Outcome"]

In [9]:
# Dataset scaling - standardize each feature (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the transform — consider fitting it
# on the training split only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [10]:
# Preview the first three rows of the scaled feature matrix
X_scaled[:3]

array([[ 0.63994726,  0.84832379,  0.14964075,  0.90726993, -0.69289057,
         0.20401277,  0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575,  0.53090156, -0.69289057,
        -0.68442195, -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, -1.28821221, -0.69289057,
        -1.10325546,  0.60439732, -0.10558415]])

In [11]:
# Splitting the dataset - divide the scaled features and labels into training and testing sets.
from sklearn.model_selection import train_test_split
# stratify=y keeps the class proportions equal in both splits; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    stratify=y,
    random_state=10,
)

In [12]:
# Number of samples (rows) and features (columns) in the training set
X_train.shape

(576, 8)

In [13]:
# Number of samples (rows) and features (columns) in the testing set
X_test.shape

(192, 8)

In [14]:
# Model building using Decision Tree Classifier
# The decision tree classifier is the Scikit-learn algorithm used for classification.
from sklearn.tree import DecisionTreeClassifier

In [15]:
# We will use k-fold cross-validation to build our decision tree classifier. 
# The model is then trained using each subset and gets the accuracy scores after each iteration. 
# Finally, the mean accuracy score is calculated. K refers to the number of subsets/portions we split the dataset.
from sklearn.model_selection import cross_val_score

In [16]:
# Baseline: score a single decision tree with 5-fold cross-validation,
# producing one accuracy value per fold.
# random_state is fixed because tree building shuffles candidate features at
# each split; without a seed the fold scores change from run to run.
scores = cross_val_score(DecisionTreeClassifier(random_state=0), X, y, cv=5)
scores

array([0.72077922, 0.67532468, 0.7012987 , 0.77777778, 0.7254902 ])

In [17]:
# Mean accuracy across the cross-validation folds
scores.mean()

0.7201341142517613

# Implementing the Bagging algorithms

In [18]:
# The BaggingClassifier classifier will follow all the bagging steps and build an optimized model. 
# The BaggingClassifier will fit the weak/base learners on the randomly sampled subsets.
# Next, it will use the voting techniques to produce an aggregated final model. 
# Finally, we will use the DecisionTreeClassifier algorithm as our weak/base learners.
from sklearn.ensemble import BaggingClassifier

In [19]:
# Bagging ensemble of 100 decision trees; each tree is fitted on a bootstrap
# sample sized at 80% of the rows passed to fit().
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4);
# the positional form works on both old and new releases.
# oob_score=True scores each training row using only the trees that did not see it.
bag_model = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,
    bootstrap=True,
    oob_score=True,
    random_state=0,
)

In [20]:
# Fitting the model - train the bagging ensemble on the training split
bag_model.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.8,
                  n_estimators=100, oob_score=True, random_state=0)

In [21]:
# Out-of-bag accuracy estimate (available because the model was built with oob_score=True)
bag_model.oob_score_

0.7534722222222222

Bagging improves the accuracy score: the mean cross-validation accuracy of a single decision tree was 0.7201341142517613, while the bagged model's out-of-bag accuracy is 0.7534722222222222.

In [22]:
# Accuracy on the held-out testing split; comparing it with the training-side
# scores helps determine whether the model is overfitting.
bag_model.score(X_test, y_test)

0.7760416666666666

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


def _print_split_report(title, y_true, y_pred):
    """Print the confusion matrix, accuracy and classification report for one split."""
    print(f"{title}: \n===============================")
    clf_report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True))
    print(f"CONFUSION MATRIX:\n{confusion_matrix(y_true, y_pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(y_true, y_pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")


def evaluate(model, X_train, X_test, y_train, y_test):
    """Print classification metrics for a fitted model on both data splits.

    The training-split report is printed first, then the testing-split report,
    so the two accuracy scores can be compared directly.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, X_test : feature matrices passed to ``model.predict``.
    y_train, y_test : true labels matching ``X_train`` / ``X_test``.

    Returns
    -------
    None. All results are written to stdout.
    """
    _print_split_report("TRAINING RESULTS", y_train, model.predict(X_train))
    _print_split_report("TESTING RESULTS", y_test, model.predict(X_test))

# Report training and testing metrics for the fitted bagging ensemble.
evaluate(model=bag_model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

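The test accuracy (0.7760) is close to the out-of-bag estimate (0.7535), which suggests that our model is not overfitting. Overfitting shows up as a testing accuracy that is noticeably lower than the training accuracy, so compare the two ACCURACY SCORE values printed by `evaluate` above.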