In [13]:
import brif
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import time
from sklearn.metrics import accuracy_score
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

Compare RandomForestClassifier and brif in terms of training time and test set AUC and Accuracy.
Note: RandomForestClassifier does not take non-numeric input variables, so datasets used here all have numeric X. 
Version in comparison:
scikit-learn 1.2.0
brif 1.2.5

In [14]:
def make_X_y(df, target_var):
    X_vars = list(df.columns.values)
    X_vars.remove(target_var)
    X = df[X_vars].to_numpy()
    y = df[target_var].to_numpy()
    return X, y

In [15]:
# Binary classification 
csv_names = ['auto','heloc','HTRU2','magic04','optdigits','spambase','thyroid']
resp_vars = ['origin','RiskPerformance','class','class','class','spam','Class']
n_datasets = len(csv_names)

In [4]:
d = 2
this_csv = csv_names[d]
target_var = resp_vars[d]

In [5]:
# Load the data
df = pd.read_csv("data/"+this_csv+".csv")
# Split into training and test
df_train = df.sample(frac = 0.7)
df_test = df.drop(df_train.index)
all_vars = list(df.columns.values)

In [6]:
X_train, y_train = make_X_y(df_train, target_var)
X_test, y_test = make_X_y(df_test, target_var)

In [10]:
# Run randomForest
start = time.time()
clf = RandomForestClassifier(n_jobs = 4, n_estimators=200)
clf.fit(X_train, y_train)
end = time.time()
cpu_rf = end - start
pred_rf_label = clf.predict(X_test)
pred_rf_score = clf.predict_proba(X_test)
pos_label = clf.classes_[0]
score_rf = pred_rf_score[:,0]
fpr, tpr, thresholds = metrics.roc_curve(y_test, score_rf, pos_label=pos_label)
auc_rf = metrics.auc(fpr, tpr)  # Get AUC
accu_rf = accuracy_score(pred_rf_label, y_test)  # Get Accuracy

In [9]:
# Run brif
start = time.time()
bf = brif.brif({"nthreads":8,'ntrees':200})
bf.fit(df_train, target_var)
end = time.time()
cpu_brif = end - start
cpu_brif

0.2950122356414795

In [10]:
all_vars = list(df.columns.values)
all_vars.remove(target_var)
start = time.time()
pred_brif_label = bf.predict(df_test[all_vars], type = 'label')
end = time.time()
pred_brif_score = bf.predict(df_test[all_vars], type = 'score')
pos_label = list(pred_brif_score.columns.values)[0]
score_brif = np.array(pred_brif_score.iloc[:,0])
fpr, tpr, thresholds = metrics.roc_curve(df_test[target_var].astype(str), score_brif, pos_label=pos_label)
auc_brif = metrics.auc(fpr, tpr)  # Get AUC
accu_brif = accuracy_score(pred_brif_label, df_test[target_var].astype(str))  # Get Accuracy
print("Prediction time: {:.3f}".format(end - start))

Prediction time: 0.188


In [11]:
(auc_brif, accu_brif, cpu_brif)

(0.9746562749603505, 0.9769044514807227, 0.2950122356414795)

In [16]:
# Batch run
log_df = pd.DataFrame({"dataset":[], "n":[], "p":[], "run":[], "train_n":[], "AUC_rf":[], "AUC_brif":[], "ACCU_rf":[], "ACCU_brif":[], 
                       "CPU_rf":[], "CPU_brif":[]})
for d in range(n_datasets):
    this_csv = csv_names[d]
    target_var = resp_vars[d]
    # Load the data
    df = pd.read_csv("data/"+this_csv+".csv")
    
    for run in range(10):
        # Split into training and test
        df_train = df.sample(frac = 0.7, random_state = run*101393)
        df_test = df.drop(df_train.index)
        all_vars = list(df.columns.values)

        # Make numpy matrices for randomForest
        X_train, y_train = make_X_y(df_train, target_var)
        X_test, y_test = make_X_y(df_test, target_var)

        # Run randomForest
        start = time.time()
        clf = RandomForestClassifier(n_jobs = 4, n_estimators=200)
        clf.fit(X_train, y_train)
        end = time.time()
        cpu_rf = end - start
        pred_rf_label = clf.predict(X_test)
        pred_rf_score = clf.predict_proba(X_test)
        pos_label = clf.classes_[0]
        score_rf = pred_rf_score[:,0]
        fpr, tpr, thresholds = metrics.roc_curve(y_test, score_rf, pos_label=pos_label)
        auc_rf = metrics.auc(fpr, tpr)  # Get AUC
        accu_rf = accuracy_score(pred_rf_label, y_test)  # Get Accuracy

        # Run brif
        start = time.time()
        bf = brif.brif({"nthreads":4,'ntrees':200})
        bf.fit(df_train, target_var)
        end = time.time()
        cpu_brif = end - start

        all_vars = list(df.columns.values)
        all_vars.remove(target_var)
        pred_brif_label = bf.predict(df_test[all_vars], type = 'label')
        pred_brif_score = bf.predict(df_test[all_vars], type = 'score')
        pos_label = list(pred_brif_score.columns.values)[0]
        score_brif = np.array(pred_brif_score.iloc[:,0])
        fpr, tpr, thresholds = metrics.roc_curve(df_test[target_var].astype(str), score_brif, pos_label=pos_label)
        auc_brif = metrics.auc(fpr, tpr)  # Get AUC
        accu_brif = accuracy_score(pred_brif_label, df_test[target_var].astype(str))  # Get Accuracy

        this_entry = pd.DataFrame({"dataset":[this_csv], "n":[df.shape[0]], "p":[df.shape[1]-1], "run":[run], "train_n":[df_train.shape[0]], 
                      "AUC_rf":[auc_rf], "AUC_brif":[auc_brif], 
                      "ACCU_rf":[accu_rf], "ACCU_brif":[accu_brif], 
                      "CPU_rf":[cpu_rf], "CPU_brif":[cpu_brif]})
        log_df = pd.concat([log_df, this_entry], ignore_index = True)
        print(this_entry)
        
display(log_df)
log_df.to_csv("results/runlog.csv")

  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    0      274  0.970333  0.943962  0.923729   0.864407   

     CPU_rf  CPU_brif  
0  0.169966     0.007  
  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    1      274  0.957557  0.918367  0.881356   0.855932   

     CPU_rf  CPU_brif  
0  0.176373     0.008  
  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    2      274  0.966471  0.947059  0.915254   0.889831   

   CPU_rf  CPU_brif  
0  0.1745  0.007996  
  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    3      274  0.932373  0.912258  0.855932   0.855932   

     CPU_rf  CPU_brif  
0  0.171213  0.007999  
  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    4      274  0.953425  0.925723  0.872881   0.830508   

     CPU_rf  CPU_brif  
0  0.166947  0.007999  

     dataset     n   p  run  train_n  AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    0     3934     1.0       1.0  0.985172   0.986358   

     CPU_rf  CPU_brif  
0  0.543031  0.844039  
     dataset     n   p  run  train_n  AUC_rf  AUC_brif  ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    1     3934     1.0       1.0  0.98102   0.985765   

     CPU_rf  CPU_brif  
0  0.566253  0.770547  
     dataset     n   p  run  train_n  AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    2     3934     1.0       1.0  0.982206   0.985765   

     CPU_rf  CPU_brif  
0  0.499374  0.795021  
     dataset     n   p  run  train_n   AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    3     3934  0.99997  0.999972  0.985172   0.986951   

     CPU_rf  CPU_brif  
0  0.479421  0.779966  
     dataset     n   p  run  train_n  AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    4     3934     1.0       1.0  0.988138   0.988138   

     CPU_rf  

Unnamed: 0,dataset,n,p,run,train_n,AUC_rf,AUC_brif,ACCU_rf,ACCU_brif,CPU_rf,CPU_brif
0,auto,392.0,7.0,0.0,274.0,0.970333,0.943962,0.923729,0.864407,0.169966,0.007000
1,auto,392.0,7.0,1.0,274.0,0.957557,0.918367,0.881356,0.855932,0.176373,0.008000
2,auto,392.0,7.0,2.0,274.0,0.966471,0.947059,0.915254,0.889831,0.174500,0.007996
3,auto,392.0,7.0,3.0,274.0,0.932373,0.912258,0.855932,0.855932,0.171213,0.007999
4,auto,392.0,7.0,4.0,274.0,0.953425,0.925723,0.872881,0.830508,0.166947,0.007999
...,...,...,...,...,...,...,...,...,...,...,...
65,thyroid,3772.0,21.0,5.0,2640.0,1.000000,0.999698,0.994700,0.973498,0.274781,0.073003
66,thyroid,3772.0,21.0,6.0,2640.0,0.999812,0.999705,0.997350,0.964664,0.273702,0.067122
67,thyroid,3772.0,21.0,7.0,2640.0,0.998399,0.998415,0.992933,0.969081,0.266004,0.051000
68,thyroid,3772.0,21.0,8.0,2640.0,0.999896,0.999548,0.994700,0.968198,0.267000,0.058473


In [17]:
log_df.groupby('dataset').mean()

Unnamed: 0_level_0,n,p,run,train_n,AUC_rf,AUC_brif,ACCU_rf,ACCU_brif,CPU_rf,CPU_brif
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
HTRU2,17898.0,8.0,4.5,12529.0,0.969996,0.973728,0.979456,0.976998,1.77327,0.529146
auto,392.0,7.0,4.5,274.0,0.962275,0.939606,0.9,0.868644,0.171762,0.008099
heloc,10459.0,23.0,4.5,7321.0,0.787306,0.788034,0.721765,0.718356,0.736542,1.542041
magic04,19020.0,10.0,4.5,13314.0,0.933943,0.930242,0.877655,0.869331,3.875166,3.362029
optdigits,5620.0,64.0,4.5,3934.0,0.999982,0.999993,0.982918,0.984579,0.527516,0.763653
spambase,4601.0,57.0,4.5,3221.0,0.988332,0.989074,0.956232,0.957464,0.482917,0.243382
thyroid,3772.0,21.0,4.5,2640.0,0.99971,0.999331,0.996201,0.969435,0.273066,0.05861


Observation: brif and RandomForestClassifier performed neck and neck in all aspects. 