# Improving Diabetes Dataset Classification Model

## 1. Import and load Diabetes dataset

In [None]:
# import necessary packages
import pandas as pd
import plotly.express as px
import numpy as np

In [None]:
# read diabetes.csv from the working directory into a dataframe
diabetes = pd.read_csv(filepath_or_buffer="diabetes.csv")

# display the first five rows as a quick check of the load
diabetes.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Load the CSV file and print the dataframe head to confirm that the data was loaded correctly.

## 2. Cleaning and Exploratory Data Analysis

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) for the feature
# columns, used to look for outliers
# NOTE(review): BloodPressure is a column of the CSV but is not listed here -
# confirm the omission is intentional
diabetes.describe().loc[:, [
    "Pregnancies",
    "Glucose",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]]

Unnamed: 0,Pregnancies,Glucose,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,99.0,846.0,67.1,2.42,81.0



Perform exploratory data analysis to identify patterns in the data. Considering the quartile values, minima, and maxima, many of the variables in this dataset seem to have outliers.

In [None]:
# remove repeated rows first, then any row that still contains a missing value
diabetes = diabetes.drop_duplicates().dropna()

In [None]:
# feature dataframe: the seven predictor columns used by every model below
diabetes_features = diabetes.loc[:, [
    "Pregnancies",
    "Glucose",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]]

# label dataframe: kept two-dimensional (a single "Outcome" column)
diabetes_label = diabetes.loc[:, ["Outcome"]]

In [None]:
# use Robust Scaler to scale features
# NOTE(review): the scaler is fitted on every row here, before the train/test
# splits made further down - confirm that is acceptable for this study
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
diabetes_features_scaled = scaler.fit_transform(diabetes_features.to_numpy())

# Reuse the feature frame's own column labels. Passing the names wrapped in a
# nested list makes pandas build a one-level MultiIndex instead of flat labels.
diabetes_features_scaled = pd.DataFrame(diabetes_features_scaled, columns=diabetes_features.columns)
diabetes_features_scaled.head()

Unnamed: 0,Pregnancies,Glucose,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.6,0.751515,0.375,-0.239686,0.172043,0.665359,1.235294
1,-0.4,-0.775758,0.1875,-0.239686,-0.580645,-0.056209,0.117647
2,1.0,1.6,-0.71875,-0.239686,-0.935484,0.783007,0.176471
3,-0.4,-0.678788,0.0,0.499018,-0.419355,-0.537255,-0.470588
4,-0.6,0.484848,0.375,1.08055,1.193548,5.007843,0.235294


Clean the data by dropping duplicate rows and rows with missing values, separate the feature columns from the `Outcome` label, and scale the features with a Robust Scaler so that the outliers noted above have less influence on the models.

## 3. Finding Model Mean Accuracy Scores

To compute each model's mean accuracy score, we ran 20 trials. Every trial performs a fresh train-validation-test split, and the resulting test accuracy scores are averaged per model across the trials.

### 3.1 Traditional Machine Learning Models (XGBoost, Random Forest, Gradient Boosting, SVM, AdaBoost, CatBoost, ExtraTrees, LightGBM)

We began by installing the necessary libraries and packages.

In [None]:
!pip install catboost
!pip install lightgbm



In [None]:
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import ExtraTreesClassifier
import lightgbm as lgbm

from sklearn.metrics import accuracy_score

We designed a function that splits the data, trains each traditional model, and tests it, so that it can be run over numerous trials to generate reliable average accuracy scores.

In [None]:
# function for splitting, training, and testing traditional ML models

def split_train_test_trad_model(test_size=0.30, n_splits=7, random_state=None):
  """Run one train/validation/test trial for the traditional ML models.

  The scaled features and labels are split once into a held-out test set and a
  remainder. The remainder is cut into ``n_splits`` folds; every model is fitted
  on each fold's training part and scored on that training part and on the
  fold's validation part. Finally every model is scored on the test set.

  Args:
    test_size: fraction of the rows held out as the test set.
    n_splits: number of K-fold splits made over the non-test rows.
    random_state: seed for the train/test split. When None, a random integer
      between 1 and 100 is drawn so that each call uses a different split.

  Returns:
    A list of test accuracies, one per model, in this order: xgboost,
    random_forest, grad_boost, svm_linear, ada_boost, cat_boost, extra_trees,
    light_gbm.
  """
  state = random.randint(1, 100) if random_state is None else random_state

  # data splitting: hold out the test data first and store it in x_test, y_test
  features = diabetes_features_scaled.to_numpy()
  labels = diabetes_label.to_numpy()
  _x, x_test, _y, y_test = train_test_split(features, labels, test_size=test_size, random_state=state)

  # with the defaults (70% non-test rows, 7 folds) each fold trains on roughly
  # 60% of the data and validates on roughly 10%
  kfold_spliter = KFold(n_splits=n_splits)

  # materialise the folds once so that every model sees exactly the same splits
  folds_data = []
  for fold, (train_index, validation_index) in enumerate(kfold_spliter.split(_x), start=1):
    x_train, x_valid = _x[train_index, :], _x[validation_index, :]
    y_train, y_valid = _y[train_index, :], _y[validation_index, :]
    print(f"Fold {fold} training data shape = {(x_train.shape,y_train.shape)}")
    print(f"Fold {fold} validation data shape = {(x_valid.shape,y_valid.shape)}")
    folds_data.append((x_train, y_train, x_valid, y_valid))

  xgboost = xgb.XGBClassifier()
  rf = RandomForestClassifier(random_state=23) # some random seed for reproducibility
  grad_boost = GradientBoostingClassifier()
  svm_linear = SVC(kernel="linear")
  ada_boost = AdaBoostClassifier(n_estimators=100, algorithm="SAMME", random_state=0)
  cat_boost = CatBoostClassifier(iterations=2, depth=2, learning_rate=1, loss_function='Logloss', verbose=True)
  et = ExtraTreesClassifier(n_estimators=100, random_state=0)
  lgb = lgbm.LGBMClassifier(learning_rate=0.09,max_depth=-5,random_state=42)

  # insertion order here fixes the order of the returned accuracies
  all_models = {"xgboost": xgboost,
                "random_forest":rf,
                "grad_boost":grad_boost,
                "svm_linear":svm_linear,
                "ada_boost": ada_boost,
                "cat_boost":cat_boost,
                "extra_trees": et,
                "light_gbm": lgb}

  print(f"We are working with classifiers {all_models.keys()}")

  # Iterate over all models
  for model_name, model in all_models.items():

    print(f"Evaluating {model_name} ...")

    # training and validation accuracies, one entry per fold
    train_acc_for_all_folds = []
    valid_acc_for_all_folds = []

    for x_train, y_train, x_valid, y_valid in folds_data:
      # labels are stored as column vectors; fit and scoring use the 1-D form
      y_train_flat = y_train.flatten()
      y_valid_flat = y_valid.flatten()

      model.fit(x_train, y_train_flat)

      train_acc_for_all_folds.append(accuracy_score(y_train_flat, model.predict(x_train)))
      valid_acc_for_all_folds.append(accuracy_score(y_valid_flat, model.predict(x_valid)))

    # average training accuracy across the folds
    avg_training_acc = sum(train_acc_for_all_folds)/len(train_acc_for_all_folds)
    print(f"Average training accuracy for model {model_name} = {avg_training_acc}")

    # average validation accuracy across the folds
    avg_validation_acc = sum(valid_acc_for_all_folds)/len(valid_acc_for_all_folds)
    print(f"Average validation accuracy for model {model_name} = {avg_validation_acc}")

  # evaluating each model according to its accuracy score on the test data;
  # the estimator objects were reused across folds, so what is scored here is
  # the state left behind by the final fold's fit
  y_test_flat = y_test.flatten()
  model_accuracies = []

  for model_name, model in all_models.items():
    print(f"Evaluating {model_name} ...")
    accuracy = accuracy_score(y_test_flat, model.predict(x_test))
    print(f"Accuracy on test data {accuracy}")
    model_accuracies.append(accuracy)

  return model_accuracies

In [None]:
# run 20 independent trials; each call contributes one list of per-model
# test accuracies, which are averaged in the next cell

trad_model_accuracies = [split_train_test_trad_model() for _ in range(20)]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Average training accuracy for model light_gbm = 0.9987584376389432
Average validation accuracy for model light_gbm = 0.7690166975881262
Average validation accuracy for model light_gbm = 0.7690166975881262
Evaluating xgboost ...
Accuracy on test data 0.6926406926406926
Evaluating random_forest ...
Accuracy on test data 0.7142857142857143
Evaluating grad_boost ...
Accuracy on test data 0.7359307359307359
Evaluating svm_linear ...
Accuracy on test data 0.7359307359307359
Evaluating ada_boost ...
Accuracy on test data 0.7359307359307359
Evaluating cat_boost ...
Accuracy on test data 0.7142857142857143
Evaluating extra_trees ...
Accuracy on test data 0.7186147186147186
Evaluating light_gbm ...
Accuracy on test data 0.70995670995671
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1)

In [None]:
# average the per-trial accuracy lists column-wise: one mean per model

average_accuracies = np.mean(trad_model_accuracies, axis=0)

# labels are listed in the same order in which the trial function returns
# its accuracies
model_labels = [
    "XGBoost",
    "Random Forest",
    "Gradient Boosting",
    "SVM",
    "AdaBoost",
    "CatBoost",
    "ExtraTrees",
    "LightGBM",
]

for position, model_label in enumerate(model_labels):
  print(f'{model_label} mean accuracy: {average_accuracies[position]}')

XGBoost mean accuracy: 0.7218614718614719
Random Forest mean accuracy: 0.7465367965367966
Gradient Boosting mean accuracy: 0.7482683982683983
SVM mean accuracy: 0.7627705627705628
AdaBoost mean accuracy: 0.7506493506493507
CatBoost mean accuracy: 0.7383116883116883
ExtraTrees mean accuracy: 0.7439393939393939
LightGBM mean accuracy: 0.7331168831168832


Our average accuracy scores indicated that the AdaBoost, LightGBM, and SVM models in our replicated study were more accurate predictors than in Fong and Motani's study. However, most of our accuracies were within 0.05 of the scores listed in the original study, except for SVM. In the original study, the SVM model's accuracy (0.5554) was significantly lower than that of the other traditional machine learning models.

### 3.2 Decision Trees

#### 3.2.1 Symbolic Regression-Enhanced Decision Tree

The Symbolic Regression-Enhanced Decision Tree, or SREDT, combines gplearn's Symbolic Regressor with a typical Decision Tree: the regressor evolves a symbolic expression over the features that gives a more accurate, cohesive representation of the data, and that expression is then used to help the tree split the data.

In [None]:
!pip install -U scikit-learn
!pip install gplearn



In [None]:
from sklearn.tree import DecisionTreeClassifier
from gplearn.genetic import SymbolicRegressor
import sympy

Similar to the traditional machine learning models, we defined a function for splitting, training, and testing the model. Afterwards, we called the function to generate an average accuracy score over 20 trials.

In [None]:
# function for splitting, training, and testing SREDT model

def split_train_test_sredt_model(test_size=0.30, n_splits=7, random_state=None):
  """Run one train/validation/test trial for the SREDT model.

  A gplearn SymbolicRegressor is fitted on the non-test rows. The value of its
  evolved expression is appended to the features as one extra column, and a
  DecisionTreeClassifier is trained on the augmented features with K-fold
  validation before being scored on the held-out test set.

  The regressor is fitted once on all non-test rows, including rows that later
  serve as validation folds, so the validation accuracy is mildly optimistic.
  The test rows are never seen by the regressor or the tree during fitting.

  Args:
    test_size: fraction of the rows held out as the test set.
    n_splits: number of K-fold splits made over the non-test rows.
    random_state: seed for the train/test split. When None, a random integer
      between 1 and 100 is drawn so that each call uses a different split.

  Returns:
    The accuracy of the decision tree on the test set, as a float.
  """
  state = random.randint(1, 100) if random_state is None else random_state

  # data splitting: hold out the test data first and store it in x_test, y_test
  features = diabetes_features_scaled.to_numpy()
  labels = diabetes_label.to_numpy()
  _x, x_test, _y, y_test = train_test_split(features, labels, test_size=test_size, random_state=state)

  # call and define symbolic regressor
  rows = len(diabetes_features_scaled)
  symreg = SymbolicRegressor(population_size = rows, metric = 'mse')

  # fit training data to the regressor; labels are passed 1-D to avoid the
  # column-vector conversion warning
  sr = symreg.fit(_x, _y.ravel())

  print(sr) # get a readable representation of the evolved program

  # evaluate the evolved expression on every row and append it as a new feature
  # column, applying the same transformation to the training and test data
  _x = np.hstack((_x, sr.predict(_x).reshape(-1, 1)))
  x_test = np.hstack((x_test, sr.predict(x_test).reshape(-1, 1)))

  # with the defaults (70% non-test rows, 7 folds) each fold trains on roughly
  # 60% of the data and validates on roughly 10%
  kfold_spliter = KFold(n_splits=n_splits)

  folds_data = []
  for fold, (train_index, validation_index) in enumerate(kfold_spliter.split(_x), start=1):
    x_train, x_valid = _x[train_index, :], _x[validation_index, :]
    y_train, y_valid = _y[train_index, :], _y[validation_index, :]
    print(f"Fold {fold} training data shape = {(x_train.shape,y_train.shape)}")
    print(f"Fold {fold} validation data shape = {(x_valid.shape,y_valid.shape)}")
    folds_data.append((x_train, y_train, x_valid, y_valid))

  # initialize Decision Tree
  sredt = DecisionTreeClassifier()
  print(f"We are working with classifier SREDT")

  print(f"Evaluating SREDT ...")

  # training and validation accuracies, one entry per fold
  train_acc_for_all_folds = []
  valid_acc_for_all_folds = []

  for x_train, y_train, x_valid, y_valid in folds_data:
    # labels are stored as column vectors; fit and scoring use the 1-D form
    y_train_flat = y_train.flatten()
    y_valid_flat = y_valid.flatten()

    sredt.fit(x_train, y_train_flat)

    train_acc_for_all_folds.append(accuracy_score(y_train_flat, sredt.predict(x_train)))
    valid_acc_for_all_folds.append(accuracy_score(y_valid_flat, sredt.predict(x_valid)))

  # average training accuracy across the folds
  avg_training_acc = sum(train_acc_for_all_folds)/len(train_acc_for_all_folds)
  print(f"Average training accuracy for model SREDT = {avg_training_acc}")

  # average validation accuracy across the folds
  avg_validation_acc = sum(valid_acc_for_all_folds)/len(valid_acc_for_all_folds)
  print(f"Average validation accuracy for model SREDT = {avg_validation_acc}")

  # evaluate on the test data; the tree object was reused across folds, so
  # what is scored here is the state left behind by the final fold's fit
  print(f"Evaluating SREDT ...")
  accuracy = accuracy_score(y_test.flatten(), sredt.predict(x_test))
  print(f"Accuracy on test data {accuracy}")

  return accuracy

In [None]:
# run 20 independent trials; each call contributes one SREDT test accuracy

sredt_model_accuracies = [split_train_test_sredt_model() for _ in range(20)]

Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape = ((460, 7), (460, 1))
Fold 3 validation data shape = ((77, 7), (77, 1))
Fold 4 training data shape = ((460, 7), (460, 1))
Fold 4 validation data shape = ((77, 7), (77, 1))
Fold 5 training data shape = ((460, 7), (460, 1))
Fold 5 validation data shape = ((77, 7), (77, 1))
Fold 6 training data shape = ((461, 7), (461, 1))
Fold 6 validation data shape = ((76, 7), (76, 1))
Fold 7 training data shape = ((461, 7), (461, 1))
Fold 7 validation data shape = ((76, 7), (76, 1))


  y = column_or_1d(y, warn=True)


add(0.347, mul(0.229, X1))
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.6816228883898058
Average test accuracy for model SREDT = 0.6816228883898058
Evaluating sredt ...
Accuracy on test data 0.7142857142857143
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training dat

  y = column_or_1d(y, warn=True)


0.376
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.7205351039937506
Average test accuracy for model SREDT = 0.7205351039937506
Evaluating sredt ...
Accuracy on test data 0.6406926406926406
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape = ((460, 7), 

  y = column_or_1d(y, warn=True)


sub(mul(add(add(X4, X1), X1), 0.130), -0.340)
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.6928278488428864
Average test accuracy for model SREDT = 0.6928278488428864
Evaluating sredt ...
Accuracy on test data 0.6363636363636364
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))


  y = column_or_1d(y, warn=True)


sub(0.248, mul(X1, -0.299))
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.7559320378869251
Average test accuracy for model SREDT = 0.7559320378869251
Evaluating sredt ...
Accuracy on test data 0.6666666666666666
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training da

  y = column_or_1d(y, warn=True)


add(0.275, 0.064)
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.6836246460306611
Average test accuracy for model SREDT = 0.6836246460306611
Evaluating sredt ...
Accuracy on test data 0.6190476190476191
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape =

  y = column_or_1d(y, warn=True)


0.349
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.7318377111610195
Average test accuracy for model SREDT = 0.7318377111610195
Evaluating sredt ...
Accuracy on test data 0.6536796536796536
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape = ((460, 7), 

  y = column_or_1d(y, warn=True)


0.411
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.6983448881945123
Average test accuracy for model SREDT = 0.6983448881945123
Evaluating sredt ...
Accuracy on test data 0.6666666666666666
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape = ((460, 7), 

  y = column_or_1d(y, warn=True)


mul(add(add(0.570, mul(add(add(0.570, mul(add(0.570, add(0.570, X0)), 0.323)), X0), 0.323)), X1), 0.323)
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.7038619275461381
Average test accuracy for model SREDT = 0.7038619275461381
Evaluating sredt ...
Accuracy on test data 0.7142857142857143
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (

  y = column_or_1d(y, warn=True)


mul(0.570, 0.570)
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.6946587247339127
Average test accuracy for model SREDT = 0.6946587247339127
Evaluating sredt ...
Accuracy on test data 0.6493506493506493
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape =

  y = column_or_1d(y, warn=True)


0.354
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.67774143150083
Average test accuracy for model SREDT = 0.67774143150083
Evaluating sredt ...
Accuracy on test data 0.7445887445887446
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape = ((460, 7), (460

  y = column_or_1d(y, warn=True)


add(0.345, mul(X1, 0.345))
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.6890440386680989
Average test accuracy for model SREDT = 0.6890440386680989
Evaluating sredt ...
Accuracy on test data 0.6883116883116883
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training dat

  y = column_or_1d(y, warn=True)


0.348
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.7207059857435797
Average test accuracy for model SREDT = 0.7207059857435797
Evaluating sredt ...
Accuracy on test data 0.7012987012987013
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape = ((460, 7), 

  y = column_or_1d(y, warn=True)


add(0.333, mul(X1, 0.267))
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.664827653549458
Average test accuracy for model SREDT = 0.664827653549458
Evaluating sredt ...
Accuracy on test data 0.7489177489177489
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data 

  y = column_or_1d(y, warn=True)


0.358
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.7113807245386192
Average test accuracy for model SREDT = 0.7113807245386192
Evaluating sredt ...
Accuracy on test data 0.7186147186147186
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape = ((460, 7), 

  y = column_or_1d(y, warn=True)


mul(add(0.802, add(add(0.674, X1), mul(add(X4, add(X4, X1)), 0.218))), 0.218)
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.683575822673567
Average test accuracy for model SREDT = 0.683575822673567
Evaluating sredt ...
Accuracy on test data 0.645021645021645
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation dat

  y = column_or_1d(y, warn=True)


0.307
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.657650620056635
Average test accuracy for model SREDT = 0.657650620056635
Evaluating sredt ...
Accuracy on test data 0.7359307359307359
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape = ((460, 7), (4

  y = column_or_1d(y, warn=True)


add(0.328, mul(0.123, sub(add(add(add(X4, X1), X1), X0), 0.345)))
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.6705155746509129
Average test accuracy for model SREDT = 0.6705155746509129
Evaluating sredt ...
Accuracy on test data 0.7359307359307359
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape =

  y = column_or_1d(y, warn=True)


0.349
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.7245142075969143
Average test accuracy for model SREDT = 0.7245142075969143
Evaluating sredt ...
Accuracy on test data 0.6320346320346321
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape = ((460, 7), 

  y = column_or_1d(y, warn=True)


add(mul(0.319, X1), 0.319)
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.7264183185235817
Average test accuracy for model SREDT = 0.7264183185235817
Evaluating sredt ...
Accuracy on test data 0.7056277056277056
Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training dat

  y = column_or_1d(y, warn=True)


0.353
We are working with classifier SREDT
Evaluating sredt ...
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(460, 7)
(460, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(461, 7)
(461, 1)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Average training accuracy for model SREDT = 1.0
Average validation accuracy for model SREDT = 0.7226100966702471
Average test accuracy for model SREDT = 0.7226100966702471
Evaluating sredt ...
Accuracy on test data 0.6320346320346321


In [None]:
# mean test accuracy across the 20 SREDT experiments

average_accuracy = np.asarray(sredt_model_accuracies).mean()

print(f'SREDT mean accuracy: {average_accuracy}')

SREDT mean accuracy: 0.6824675324675324


Our average SREDT accuracy of 0.6825 approaches, but falls short of, the 0.7403 reported by Fong and Motani.

#### 3.2.2 Decision Tree

For the traditional Decision Tree, we closely followed the methodology of our ensemble methods after importing the DecisionTreeClassifier from scikit-learn.

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# function for splitting, training, and testing DT model

def split_train_test_dt_model():
  """Run one Decision Tree experiment and return its held-out test accuracy.

  A random seed in [1, 100] drives a 70-30 train-test split of the scaled
  diabetes features. The 70% training portion is divided into 7 folds by
  KFold (roughly 60% training / 10% validation of the full data per fold).
  A DecisionTreeClassifier is refit on every fold, and the per-fold training
  and validation accuracies are averaged and printed.

  The test accuracy is measured with the classifier as fitted on the last
  fold; it is not refit on the whole training portion.

  Returns:
    Accuracy of the last-fold classifier on the 30% test split.
  """
  state = random.randint(1, 100)

  # data splitting: first extract our test data and store it in x_test, y_test
  features = diabetes_features_scaled.to_numpy()
  labels = diabetes_label.to_numpy()
  # set test size to 30 for 70-30 train-test split
  _x, x_test, _y, y_test = train_test_split(features, labels, test_size=0.30, random_state=state)

  k = 7 # set k = 7, for 60% training and 10% validation

  kfold_spliter = KFold(n_splits=k)

  # build every fold up front so all fold shapes are printed before training starts
  folds_data = []

  for fold, (train_index, validation_index) in enumerate(kfold_spliter.split(_x), start=1):
    x_train, x_valid = _x[train_index, :], _x[validation_index, :]
    y_train, y_valid = _y[train_index, :], _y[validation_index, :]
    print (f"Fold {fold} training data shape = {(x_train.shape,y_train.shape)}")
    print (f"Fold {fold} validation data shape = {(x_valid.shape,y_valid.shape)}")
    folds_data.append((x_train, y_train, x_valid, y_valid))

  # Decision Tree
  dt = DecisionTreeClassifier()

  print ("We are working with the classifier DT")

  print ("Evaluating DT ...")

  # Training and validation accuracies, one entry per fold
  train_acc_for_all_folds = []
  valid_acc_for_all_folds = []

  for x_train, y_train, x_valid, y_valid in folds_data:
    # labels are stored as a column vector; scikit-learn expects 1-D targets
    y_train_flat = y_train.flatten()
    y_valid_flat = y_valid.flatten()

    # Refit the same estimator on this fold's training part
    dt.fit(x_train, y_train_flat)

    # accuracy_score takes (y_true, y_pred)
    train_acc_for_all_folds.append(accuracy_score(y_train_flat, dt.predict(x_train)))
    valid_acc_for_all_folds.append(accuracy_score(y_valid_flat, dt.predict(x_valid)))

  # average training accuracy across k folds
  avg_training_acc = sum(train_acc_for_all_folds)/len(train_acc_for_all_folds)

  print (f"Average training accuracy for model DT = {avg_training_acc}")

  # average validation accuracy across k folds
  avg_validation_acc = sum(valid_acc_for_all_folds)/len(valid_acc_for_all_folds)

  print (f"Average validation accuracy for model DT = {avg_validation_acc}")

  # The fold averages above are validation metrics only; the single test-set
  # figure is the held-out accuracy computed below.
  print ("Evaluating DT ...")
  # dt still holds the fit from the last fold at this point
  y_pred_test = dt.predict(x_test)
  # Compute accuracy on test data
  accuracy = accuracy_score(y_test.flatten(), y_pred_test)
  # Print accuracy on the test data
  print (f"Accuracy on test data {accuracy}")

  return accuracy

In [None]:
# repeat the DT experiment 20 times, keeping the test accuracy of every run

dt_model_accuracies = []

for _trial in range(20):
  trial_accuracy = split_train_test_dt_model()
  dt_model_accuracies.append(trial_accuracy)

Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape = ((460, 7), (460, 1))
Fold 3 validation data shape = ((77, 7), (77, 1))
Fold 4 training data shape = ((460, 7), (460, 1))
Fold 4 validation data shape = ((77, 7), (77, 1))
Fold 5 training data shape = ((460, 7), (460, 1))
Fold 5 validation data shape = ((77, 7), (77, 1))
Fold 6 training data shape = ((461, 7), (461, 1))
Fold 6 validation data shape = ((76, 7), (76, 1))
Fold 7 training data shape = ((461, 7), (461, 1))
Fold 7 validation data shape = ((76, 7), (76, 1))
We are working with the classifier DT
Evaluating DT ...
Average training accuracy for model DT = 1.0
Average validation accuracy for model DT = 0.6982960648374181
Average test accuracy for model DT = 0.6982960648374181
Evaluating DT ...
Accuracy on test data 0.7186147186147186
Fold 1 training data s

In [None]:
# mean test accuracy across the 20 DT experiments

average_accuracy = np.asarray(dt_model_accuracies).mean()

print(f'DT mean accuracy: {average_accuracy}')

DT mean accuracy: 0.6989177489177489


Our Decision Tree's average accuracy also very closely approaches the article's: ours was 0.6989, while theirs was 0.699.

#### 3.2.3 Oblique Decision Tree

The Oblique Decision Tree Classifier differs from the traditional Decision Tree in its splitting method. It incorporates multivariate splitting to divide the data over a hyperplane, allowing for more accurate results. However, it is known to be susceptible to noise in the data.

In [None]:
!pip install scikit-obliquetree
!pip install scikit-learn scikit-obliquetree
!pip install scikit-obliquetree



In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import cross_val_score
from scikit_obliquetree.HHCART import HouseHolderCART
from scikit_obliquetree.segmentor import MSE, MeanSegmentor

Similar to our previous models, we implemented a function to call over 20 trials for splitting, training, and testing.

In [None]:
# function for splitting, training, and testing ODT model

def split_train_test_odt_model():
  """Run one Oblique Decision Tree experiment and return its test accuracy.

  A random seed in [1, 100] drives a 70-30 train-test split of the scaled
  diabetes features. The 70% training portion is divided into 7 folds by
  KFold (roughly 60% training / 10% validation of the full data per fold).
  A BaggingRegressor of 100 HouseHolderCART trees (max_depth=3) is refit on
  every fold, and the per-fold training and validation accuracies are
  averaged and printed.

  The model is a regressor, so its predictions are continuous. They are
  thresholded at 0.5 to obtain 0/1 class labels, which are compared with the
  true labels using accuracy_score. BaggingRegressor.score is deliberately
  not used: it returns the R^2 coefficient of determination, which is not a
  classification accuracy and is not comparable with the other models.

  The test accuracy is measured with the model as fitted on the last fold;
  it is not refit on the whole training portion.

  Returns:
    Classification accuracy of the last-fold model on the 30% test split.
  """
  state = random.randint(1, 100)

  # data splitting: first extract our test data and store it in x_test, y_test
  features = diabetes_features_scaled.to_numpy()
  labels = diabetes_label.to_numpy()
  # set test size to 30 for 70-30 train-test split
  _x, x_test, _y, y_test = train_test_split(features, labels, test_size=0.30, random_state=state)

  k = 7 # set k = 7, for 60% training and 10% validation

  kfold_spliter = KFold(n_splits=k)

  # build every fold up front so all fold shapes are printed before training starts
  folds_data = []

  for fold, (train_index, validation_index) in enumerate(kfold_spliter.split(_x), start=1):
    x_train, x_valid = _x[train_index, :], _x[validation_index, :]
    y_train, y_valid = _y[train_index, :], _y[validation_index, :]
    print (f"Fold {fold} training data shape = {(x_train.shape,y_train.shape)}")
    print (f"Fold {fold} validation data shape = {(x_valid.shape,y_valid.shape)}")
    folds_data.append((x_train, y_train, x_valid, y_valid))

  # initialize regressor
  reg = BaggingRegressor(
      HouseHolderCART(MSE(), MeanSegmentor(), max_depth=3),
      n_estimators=100,
      n_jobs=-1,
  )

  def _classification_accuracy(x, y):
    # turn the continuous regression output into 0/1 labels before scoring
    predicted_labels = (reg.predict(x) >= 0.5).astype(int)
    # accuracy_score takes (y_true, y_pred); labels are stored as a column vector
    return accuracy_score(y.flatten(), predicted_labels)

  print ("We are working with the classifier ODT")

  print ("Evaluating ODT ...")

  # Training and validation accuracies, one entry per fold
  train_acc_for_all_folds = []
  valid_acc_for_all_folds = []

  for x_train, y_train, x_valid, y_valid in folds_data:
    # Refit the same ensemble on this fold's training part
    reg.fit(x_train, y_train.flatten())

    train_acc_for_all_folds.append(_classification_accuracy(x_train, y_train))
    valid_acc_for_all_folds.append(_classification_accuracy(x_valid, y_valid))

  # average training accuracy across k folds
  avg_training_acc = sum(train_acc_for_all_folds)/len(train_acc_for_all_folds)

  print (f"Average training accuracy for model ODT = {avg_training_acc}")

  # average validation accuracy across k folds
  avg_validation_acc = sum(valid_acc_for_all_folds)/len(valid_acc_for_all_folds)

  print (f"Average validation accuracy for model ODT = {avg_validation_acc}")

  # The fold averages above are validation metrics only; the single test-set
  # figure is the held-out accuracy computed below.
  print ("Evaluating ODT ...")
  # reg still holds the fit from the last fold at this point
  accuracy = _classification_accuracy(x_test, y_test)
  # Print accuracy on the test data
  print (f"Accuracy on test data {accuracy}")

  return accuracy

In [None]:
# repeat the ODT experiment 20 times, keeping the returned score of every run

odt_model_accuracies = []

for _trial in range(20):
  trial_accuracy = split_train_test_odt_model()
  odt_model_accuracies.append(trial_accuracy)

Fold 1 training data shape = ((460, 7), (460, 1))
Fold 1 validation data shape = ((77, 7), (77, 1))
Fold 2 training data shape = ((460, 7), (460, 1))
Fold 2 validation data shape = ((77, 7), (77, 1))
Fold 3 training data shape = ((460, 7), (460, 1))
Fold 3 validation data shape = ((77, 7), (77, 1))
Fold 4 training data shape = ((460, 7), (460, 1))
Fold 4 validation data shape = ((77, 7), (77, 1))
Fold 5 training data shape = ((460, 7), (460, 1))
Fold 5 validation data shape = ((77, 7), (77, 1))
Fold 6 training data shape = ((461, 7), (461, 1))
Fold 6 validation data shape = ((76, 7), (76, 1))
Fold 7 training data shape = ((461, 7), (461, 1))
Fold 7 validation data shape = ((76, 7), (76, 1))
We are working with the classifier ODT
Evaluating ODT ...
Average training accuracy for model DT = 0.8948302709312986
Average validation accuracy for model DT = 0.2514001604020456
Average test accuracy for model ODT = 0.2514001604020456
Evaluating ODT ...
Accuracy on test data 0.28875353930712855
Fo

In [None]:
# mean of the scores returned by the 20 ODT experiments

average_accuracy = np.asarray(odt_model_accuracies).mean()

print(f'ODT mean accuracy: {average_accuracy}')

ODT mean accuracy: 0.2649072023872139


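Our average score for ODT was relatively low, likely because of adversarial noise in the data and differences in cleaning methods. Note, however, that the 0.2649 figure reported above was produced by `BaggingRegressor.score`, which returns the coefficient of determination (R²) rather than a classification accuracy, so it is not directly comparable with the accuracy scores of the other models.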

# Conclusions

We had a decent amount of success replicating the ensemble methods, the Decision Tree Classifier, and the Symbolic Regression-Enhanced Decision Tree: our average accuracy scores over 20 trials fell in a similar range to Fong and Motani's. However, our Oblique Decision Tree results did not replicate as well, likely due to differences in cleaning the dataset and reducing noise.

