# Implementation of Boosting Algorithm

In [212]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Load Data

In [213]:
# Load the dataset
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Preprocessing

In [214]:
#check for null values
print(data.isnull().sum())

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


## Encode Data

In [215]:

# Extract the categorical columns
cat_cols = ['gender', 'ever_married','work_type','residence_type','smoking_status']
X_cat = data[cat_cols]

# Initialize the OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse_output=False) 

# Fit and transform the categorical columns
X_encoded = encoder.fit_transform(X_cat)

# Create a DataFrame with the encoded data
encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(cat_cols))

# Drop the original categorical columns and concatenate the encoded columns
data = pd.concat([data.drop(cat_cols, axis=1), encoded_df], axis=1)


In [216]:
data.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,61.0,0,0,202.21,,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,80.0,0,1,105.92,32.5,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,49.0,0,0,171.23,34.4,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,79.0,1,0,174.12,24.0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## Handling Null Values

In [217]:
# Fill NA values
interpolated_data=data.interpolate(method='linear')
print(interpolated_data.isnull().sum())

age                               0
hypertension                      0
heart_disease                     0
avg_glucose_level                 0
bmi                               0
stroke                            0
gender_Male                       0
gender_Other                      0
ever_married_Yes                  0
work_type_Never_worked            0
work_type_Private                 0
work_type_Self-employed           0
work_type_children                0
residence_type_Urban              0
smoking_status_formerly smoked    0
smoking_status_never smoked       0
smoking_status_smokes             0
dtype: int64


## Feature Extraction and Splitting

In [218]:
X = interpolated_data.drop('stroke', axis=1)  # Features
y = interpolated_data['stroke']  # Target variable

# Splitting the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


## Initialize Model

In [219]:
# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier()

# Fit the classifier to the training data
clf.fit(X_train, y_train)

## Predict on Test Set 

In [220]:
# Make predictions on the test set
y_pred = clf.predict(X_test)

## Evaluate Results

In [221]:
# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy*100:.2f}%')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')


Accuracy: 89.92%
Confusion Matrix:
[[913  55]
 [ 48   6]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       968
           1       0.10      0.11      0.10        54

    accuracy                           0.90      1022
   macro avg       0.52      0.53      0.53      1022
weighted avg       0.91      0.90      0.90      1022



## Apply Boosting Algorithm

In [222]:
class Boosting:

    def __init__(self,dataset,T,test_dataset):
        self.dataset = dataset
        self.T = T
        self.test_dataset = test_dataset
        self.alphas = None
        self.models = None
        self.accuracy = []
        self.predictions = None
    
    def fit(self):
        # Set the descriptive features and the target feature
        X = self.dataset.drop(['stroke'],axis=1)
        Y = self.dataset['stroke'].where(self.dataset['stroke']==1,-1)

        # Initialize the weights of each sample with wi = 1/N and create a dataframe in which the evaluation is computed
        Evaluation = pd.DataFrame(Y.copy())
        Evaluation['weights'] = 1/len(self.dataset) # Set the initial weights w = 1/N
        
        # Run the boosting algorithm by creating T "weighted models"
        
        alphas = [] 
        models = []
        
        for t in range(self.T):
            Tree_model = DecisionTreeClassifier(criterion="entropy",max_depth=1)
            model = Tree_model.fit(X,Y,sample_weight=np.array(Evaluation['weights'])) 

            # Append the single weak classifiers to a list which is later on used to make the weighted decision
            models.append(model)
            predictions = model.predict(X)
            score = model.score(X,Y)

            # Add values to the Evaluation DataFrame
            Evaluation['predictions'] = predictions
            Evaluation['evaluation'] = np.where(Evaluation['predictions'] == Evaluation['stroke'],1,0)
            Evaluation['misclassified'] = np.where(Evaluation['predictions'] != Evaluation['stroke'],1,0)

            # Calculate the misclassification rate and accuracy
            accuracy = sum(Evaluation['evaluation'])/len(Evaluation['evaluation'])
            misclassification = sum(Evaluation['misclassified'])/len(Evaluation['misclassified'])


            # Caclulate the error
            err = np.sum(Evaluation['weights']*Evaluation['misclassified'])/np.sum(Evaluation['weights'])
 
   
            # Calculate the alpha values
            alpha = np.log((1-err)/err)
            alphas.append(alpha)


            # Update the weights wi --> These updated weights are used in the sample_weight parameter
            # for the training of the next decision stump. 
            Evaluation['weights'] *= np.exp(alpha*Evaluation['misclassified'])

            #print('The Accuracy of the {0}. model is : '.format(t+1),accuracy*100,'%')
            #print('The missclassification rate is: ',misclassification*100,'%')
        
        self.alphas = alphas
        self.models = models
            
    def predict(self):
        X_test = self.test_dataset.drop(['stroke'],axis=1).reindex(range(len(self.test_dataset)))
        Y_test = self.test_dataset['stroke'].reindex(range(len(self.test_dataset))).where(self.dataset['stroke']==1,-1)
    
        # With each model in the self.model list, make a prediction 
        
        accuracy = []
        predictions = []
        
        for alpha,model in zip(self.alphas,self.models):
            prediction = alpha*model.predict(X_test) # We use the predict method for the single decisiontreeclassifier models in the list
            predictions.append(prediction)
            self.accuracy.append(np.sum(np.sign(np.sum(np.array(predictions),axis=0))==Y_test.values)/len(predictions[0]))
        self.predictions = np.sign(np.sum(np.array(predictions),axis=0))


## Boost

In [223]:

number_of_base_learners = 50

for i in range(number_of_base_learners):
    model = Boosting(interpolated_data,i,interpolated_data)
    model.fit()
    model.predict()
acc=model.accuracy[-1]*100
print(f'With a number of {number_of_base_learners} base models we receive an accuracy of {acc:.2f}%')    


With a number of 50 base models we receive an accuracy of 95.13%


In [224]:
print(f'Accuracy improved from {accuracy*100:.2f}% to {acc:.2f}%')

Accuracy improved from 89.92% to 95.13%
