Importing Libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

Getting Mesothelioma Dataset

In [2]:
dataset = pd.read_csv('meso.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values

In [3]:
for i in range(len(Y)): 
    if Y[i] == 2:
        Y[i] = 0

In [4]:
print(X)

[[47.  1.  0. ...  0.  0. 34.]
 [55.  1.  0. ...  1.  1. 42.]
 [29.  1.  1. ...  0.  0. 43.]
 ...
 [58.  1.  6. ...  0.  1. 68.]
 [42.  1.  6. ...  1.  0. 78.]
 [54.  1.  0. ...  1.  0. 45.]]


Splitting data into training and test set

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 20)

Normalizing features

In [6]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

print(X_train)

[[ 0.25423309 -1.20686442 -1.03063205 ...  0.81348922 -1.04527154
   0.61040794]
 [-0.75217371 -1.20686442 -0.20758313 ...  0.81348922  0.95668921
   0.52421475]
 [-2.94797037 -1.20686442 -1.03063205 ... -1.22927259 -1.04527154
  -0.42391029]
 ...
 [ 0.71169073 -1.20686442  1.43851469 ...  0.81348922  0.95668921
   0.61040794]
 [-0.02024149 -1.20686442 -0.61910759 ...  0.81348922  0.95668921
   0.61040794]
 [ 0.80318226  0.82859349 -1.03063205 ...  0.81348922 -1.04527154
   0.99827727]]


Performing PCA on dataset for Dimentionality Reduction

In [7]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 20)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(X_train_pca)

[[-0.25216517 -1.10819812  0.94878094 ...  2.32792395  0.74974737
  -1.34555446]
 [-1.27378499 -0.39779663 -0.66878045 ...  0.82798641  2.13157134
  -0.04178804]
 [ 1.09788884  0.36881018  0.34507897 ... -0.94692524  1.13885766
  -0.21527782]
 ...
 [ 0.12239457  2.01161015  1.23288378 ... -0.26121869  0.26319944
   1.25907715]
 [-1.31322545 -0.40886626  0.56887131 ... -0.42303245  0.03668646
   0.78021005]
 [ 0.12470052 -0.30633533 -1.66863637 ... -1.33504749  0.10142783
   0.71513577]]


Gradient Boosting Algorithm

In [20]:
# +-------------------------------------------------------------------+
# |  Author: Siddhant Sudesh Chalke, Roll No. 21BCS118, IIIT Dharwad  |
# +-------------------------------------------------------------------+

# Using Gradient Boosting for predicting Mesothelioma

from sklearn.metrics import accuracy_score

class DecisionTree:
    def __init__(self, max_depth = None):
        self.max_depth = max_depth
    
    def get_entropy(self, y):
        classes, counts = np.unique(y, return_counts = True)
        probablities = counts / len(y)
        entropy = -np.sum(probablities * np.log2(probablities))

        return entropy
    
    # Splitting data based on feature and threshold
    def split_data(self, x, y, feature_index, threshold):
        left = x[:, feature_index] <= threshold
        right = ~left

        return x[left], x[right], y[left], y[right]
    
    # Finding the best feature and threshold to split the data
    def get_best_split(self, x, y):
        best_gain = 0
        best_feature_index = None
        best_threshold = None
        n_samples, n_features = x.shape
        base_entropy = self.get_entropy(y)

        for feature_index in range(n_features):
            thresholds = np.unique(x[:, feature_index])
            for threshold in thresholds:
                x_left, x_right, y_left, y_right = self.split_data(x, y, feature_index, threshold)

                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                left_entropy = self.get_entropy(y_left)
                right_entropy = self.get_entropy(y_right)

                information_gain = base_entropy - ((len(y_left) / n_samples) * left_entropy + (len(y_right) / n_samples) * right_entropy)

                if information_gain > best_gain:
                    best_gain = information_gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold
    
    # Recursively building the decision tree
    def build_tree(self, x, y, depth):
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return np.mean(y)
        
        best_feature_index, best_threshold = self.get_best_split(x, y)
        if best_feature_index is None:
            return np.mean(y)
        
        x_left, x_right, y_left, y_right = self.split_data(x, y, best_feature_index, best_threshold)

        left_subtree = self.build_tree(x_left, y_left, depth + 1)
        right_subtree = self. build_tree(x_right, y_right, depth + 1)

        return {
            'feature_index' : best_feature_index,
            'threshold' : best_threshold,
            'left' : left_subtree,
            'right' : right_subtree
        }
    
    # Fitting the decision tree to the data
    def fit(self, x, y):
        self.tree = self.build_tree(x, y, depth = 0)

    # Predicting for a single datapoint
    def single_prediction(self, sample, tree):
        if isinstance(tree, dict):
            if sample[tree['feature_index']] <= tree['threshold']:
                return self.single_prediction(sample, tree['left'])
            else:
                return self.single_prediction(sample, tree['right'])
        else:
            return tree
        
    # Predicting class of multiple samples
    def predict(self, x):
        return np.array([self.single_prediction(sample, self.tree) for sample in x])


# Gradient Boosting
class GradientBoostingClassifer:
    def __init__(self, n_estimators = 50, learning_rate = 0.01, max_depth = 3, threshold_probability = 0.75):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.threshold_probability = threshold_probability
        self.estimators = []
    
    # Function to return all the parameters of the Gradient Boosting Classifier
    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'learning_rate': self.learning_rate,
            'max_depth': self.max_depth,
            'threshold_probability': self.threshold_probability
        }

    # Function to set parameters of the Gradient Boosting Classifier
    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self

    # Sigmoid function to get probablity from numerical predictions
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def residuals(self, y, y_pred):
        return y - self.sigmoid(y_pred)
    
    # Fitting Gradient boosting classifier to data
    def fit(self, x, y):
        n_samples = len(x)
        y_pred = np.zeros(n_samples)

        for i in range(self.n_estimators):
            residual = self.residuals(y, y_pred)
            tree = DecisionTree(max_depth = self.max_depth)
            tree.fit(x, residual)
            self.estimators.append(tree)

            y_pred = y_pred + self.learning_rate * tree.predict(x)
    
    # Predict function to classify based on given features
    def predict(self, x):
        y_pred = np.zeros(len(x))
        for tree in self.estimators:
            y_pred = y_pred + self.learning_rate * tree.predict(x)
        
        y_res = []
        for y in self.sigmoid(y_pred):
            if y >= self.threshold_probability:
                y_res.append(1)
            else:
                y_res.append(0)
        
        return y_res
    
    # Function to return accuracy score of the the Gradient Boosting Classifier
    def score(self, X, y, sample_weight=None):
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred, sample_weight=sample_weight)

Finding the best hyperparameters

In [21]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 350, 400, 450, 500],
    'max_depth': [50, 50, 40, 40, 40, 30, 30]
}

# Initialize the XGBoost classifier
xgb_clf = GradientBoostingClassifer()

# Perform grid search
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, Y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_


In [22]:
print(best_params)

{'max_depth': 50, 'n_estimators': 350}


Training the model

In [24]:
tree_classifier = GradientBoostingClassifer(n_estimators = 350, learning_rate = 0.01, max_depth = 50, threshold_probability = 0.7)
tree_classifier.fit(X_train_pca, Y_train)

y_pred = tree_classifier.predict(X_test_pca)
print(y_pred)

[1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]


Evaluating the model

In [30]:
from sklearn.metrics import hamming_loss, accuracy_score

print("Accuracy Score: {:.2f}%".format(accuracy_score(y_true = Y_test, y_pred = y_pred) * 100))
print("Hamming Loss: {:.4f}".format(hamming_loss(y_true = Y_test, y_pred = y_pred)))

Accuracy Score: 74.49%
Hamming Loss: 0.2551
