In [1]:
from brif import brif
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import time
from sklearn.metrics import accuracy_score
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

Compare RandomForestClassifier and brif in terms of training time and test set AUC and Accuracy.
Note: RandomForestClassifier does not take non-numeric input variables, so datasets used here all have numeric X. 
Version in comparison:
scikit-learn 1.2.0
brif 1.2.5

In [2]:
def make_X_y(df, target_var):
    X_vars = list(df.columns.values)
    X_vars.remove(target_var)
    X = df[X_vars].to_numpy()
    y = df[target_var].to_numpy()
    return X, y

In [3]:
# Binary classification 
csv_names = ['auto','heloc','HTRU2','magic04','optdigits','spambase','thyroid']
resp_vars = ['origin','RiskPerformance','class','class','class','spam','Class']
n_datasets = len(csv_names)

In [4]:
d = 2
this_csv = csv_names[d]
target_var = resp_vars[d]

In [5]:
# Load the data
df = pd.read_csv("data/"+this_csv+".csv")
# Split into training and test
df_train = df.sample(frac = 0.7)
df_test = df.drop(df_train.index)
all_vars = list(df.columns.values)

In [6]:
X_train, y_train = make_X_y(df_train, target_var)
X_test, y_test = make_X_y(df_test, target_var)

In [7]:
# Run randomForest
start = time.time()
clf = RandomForestClassifier(n_jobs = 4, n_estimators=200)
clf.fit(X_train, y_train)
end = time.time()
cpu_rf = end - start
pred_rf_label = clf.predict(X_test)
pred_rf_score = clf.predict_proba(X_test)
pos_label = clf.classes_[0]
score_rf = pred_rf_score[:,0]
fpr, tpr, thresholds = metrics.roc_curve(y_test, score_rf, pos_label=pos_label)
auc_rf = metrics.auc(fpr, tpr)  # Get AUC
accu_rf = accuracy_score(pred_rf_label, y_test)  # Get Accuracy

In [8]:
# Run brif
start = time.time()
bf = brif.brif({"nthreads":2,'ntrees':200})
bf.fit(df_train, target_var)
end = time.time()
cpu_brif = end - start
cpu_brif

0.922598123550415

In [9]:
all_vars = list(df.columns.values)
all_vars.remove(target_var)
start = time.time()
pred_brif_label = bf.predict(df_test[all_vars], type = 'label')
end = time.time()
pred_brif_score = bf.predict(df_test[all_vars], type = 'score')
pos_label = list(pred_brif_score.columns.values)[0]
score_brif = np.array(pred_brif_score.iloc[:,0])
fpr, tpr, thresholds = metrics.roc_curve(df_test[target_var].astype(str), score_brif, pos_label=pos_label)
auc_brif = metrics.auc(fpr, tpr)  # Get AUC
accu_brif = accuracy_score(pred_brif_label, df_test[target_var].astype(str))  # Get Accuracy
print("Prediction time: {:.3f}".format(end - start))

Prediction time: 0.543


In [19]:
(auc_brif, accu_brif, cpu_brif)

(0.9697341663232395, 0.9759731793630099, 1.0138967037200928)

In [5]:
# Batch run
log_df = pd.DataFrame({"dataset":[], "n":[], "p":[], "run":[], "train_n":[], "AUC_rf":[], "AUC_brif":[], "ACCU_rf":[], "ACCU_brif":[], 
                       "CPU_rf":[], "CPU_brif":[]})
for d in range(n_datasets):
    this_csv = csv_names[d]
    target_var = resp_vars[d]
    # Load the data
    df = pd.read_csv("data/"+this_csv+".csv")
    
    for run in range(10):
        # Split into training and test
        df_train = df.sample(frac = 0.7, random_state = run*101393)
        df_test = df.drop(df_train.index)
        all_vars = list(df.columns.values)

        # Make numpy matrices for randomForest
        X_train, y_train = make_X_y(df_train, target_var)
        X_test, y_test = make_X_y(df_test, target_var)

        # Run randomForest
        start = time.time()
        clf = RandomForestClassifier(n_jobs = 4, n_estimators=200)
        clf.fit(X_train, y_train)
        end = time.time()
        cpu_rf = end - start
        pred_rf_label = clf.predict(X_test)
        pred_rf_score = clf.predict_proba(X_test)
        pos_label = clf.classes_[0]
        score_rf = pred_rf_score[:,0]
        fpr, tpr, thresholds = metrics.roc_curve(y_test, score_rf, pos_label=pos_label)
        auc_rf = metrics.auc(fpr, tpr)  # Get AUC
        accu_rf = accuracy_score(pred_rf_label, y_test)  # Get Accuracy

        # Run brif
        start = time.time()
        bf = brif.brif({"nthreads":4,'ntrees':200})
        bf.fit(df_train, target_var)
        end = time.time()
        cpu_brif = end - start

        all_vars = list(df.columns.values)
        all_vars.remove(target_var)
        #bf.nthreads = 1
        pred_brif_label = bf.predict(df_test[all_vars], type = 'label')
        pred_brif_score = bf.predict(df_test[all_vars], type = 'score')
        pos_label = list(pred_brif_score.columns.values)[0]
        score_brif = np.array(pred_brif_score.iloc[:,0])
        fpr, tpr, thresholds = metrics.roc_curve(df_test[target_var].astype(str), score_brif, pos_label=pos_label)
        auc_brif = metrics.auc(fpr, tpr)  # Get AUC
        accu_brif = accuracy_score(pred_brif_label, df_test[target_var].astype(str))  # Get Accuracy

        this_entry = pd.DataFrame({"dataset":[this_csv], "n":[df.shape[0]], "p":[df.shape[1]-1], "run":[run], "train_n":[df_train.shape[0]], 
                      "AUC_rf":[auc_rf], "AUC_brif":[auc_brif], 
                      "ACCU_rf":[accu_rf], "ACCU_brif":[accu_brif], 
                      "CPU_rf":[cpu_rf], "CPU_brif":[cpu_brif]})
        log_df = pd.concat([log_df, this_entry], ignore_index = True)
        print(this_entry)
        
display(log_df)
log_df.to_csv("results/runlog.csv")

  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    0      274  0.966737  0.943962  0.932203   0.898305   

     CPU_rf  CPU_brif  
0  0.175499  0.013004  
  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    1      274  0.956522  0.951494  0.898305   0.898305   

     CPU_rf  CPU_brif  
0  0.181248  0.009573  
  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    2      274  0.964559  0.959706  0.923729    0.90678   

     CPU_rf  CPU_brif  
0  0.188381  0.010006  
  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    3      274  0.942192  0.923978  0.847458   0.838983   

     CPU_rf  CPU_brif  
0  0.185259    0.0092  
  dataset    n  p  run  train_n   AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    4      274  0.95312   0.94825  0.872881   0.898305   

     CPU_rf  CPU_brif  
0  0.190457  0.009163

     dataset     n   p  run  train_n  AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    0     3934     1.0       1.0  0.984579   0.985172   

     CPU_rf  CPU_brif  
0  0.568133  0.652888  
     dataset     n   p  run  train_n  AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    1     3934     1.0       1.0  0.983393   0.985765   

     CPU_rf  CPU_brif  
0  0.529152  0.550105  
     dataset     n   p  run  train_n  AUC_rf  AUC_brif  ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    2     3934     1.0       1.0   0.9828   0.985172   

     CPU_rf  CPU_brif  
0  0.517597  0.574192  
     dataset     n   p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    3     3934  0.999978   0.99998  0.983986   0.986358   

    CPU_rf  CPU_brif  
0  0.49199  0.536562  
     dataset     n   p  run  train_n  AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    4     3934     1.0       1.0  0.987544   0.989917   

     CPU_rf  

Unnamed: 0,dataset,n,p,run,train_n,AUC_rf,AUC_brif,ACCU_rf,ACCU_brif,CPU_rf,CPU_brif
0,auto,392.0,7.0,0.0,274.0,0.966737,0.943962,0.932203,0.898305,0.175499,0.013004
1,auto,392.0,7.0,1.0,274.0,0.956522,0.951494,0.898305,0.898305,0.181248,0.009573
2,auto,392.0,7.0,2.0,274.0,0.964559,0.959706,0.923729,0.906780,0.188381,0.010006
3,auto,392.0,7.0,3.0,274.0,0.942192,0.923978,0.847458,0.838983,0.185259,0.009200
4,auto,392.0,7.0,4.0,274.0,0.953120,0.948250,0.872881,0.898305,0.190457,0.009163
...,...,...,...,...,...,...,...,...,...,...,...
65,thyroid,3772.0,21.0,5.0,2640.0,0.999970,0.999788,0.994700,0.979682,0.310594,0.058322
66,thyroid,3772.0,21.0,6.0,2640.0,0.999839,0.999679,0.996466,0.966431,0.320984,0.067774
67,thyroid,3772.0,21.0,7.0,2640.0,0.998463,0.998512,0.992933,0.968198,0.299121,0.059255
68,thyroid,3772.0,21.0,8.0,2640.0,0.999826,0.999478,0.992933,0.970848,0.305050,0.064799


In [6]:
log_df.groupby('dataset').mean()

Unnamed: 0_level_0,n,p,run,train_n,AUC_rf,AUC_brif,ACCU_rf,ACCU_brif,CPU_rf,CPU_brif
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
HTRU2,17898.0,8.0,4.5,12529.0,0.971197,0.973582,0.979531,0.97724,1.74655,0.606897
auto,392.0,7.0,4.5,274.0,0.963058,0.956112,0.901695,0.894915,0.189283,0.010309
heloc,10459.0,23.0,4.5,7321.0,0.787316,0.788493,0.720395,0.718356,0.808405,1.735315
magic04,19020.0,10.0,4.5,13314.0,0.934043,0.930387,0.877585,0.869523,3.807571,3.697979
optdigits,5620.0,64.0,4.5,3934.0,0.999988,0.999994,0.982859,0.984757,0.530694,0.558575
spambase,4601.0,57.0,4.5,3221.0,0.987974,0.988943,0.954855,0.957246,0.471675,0.235715
thyroid,3772.0,21.0,4.5,2640.0,0.99969,0.999291,0.995848,0.971555,0.299928,0.064963


Observation: brif and RandomForestClassifier performed neck and neck in all aspects. 