In [1]:
import brif
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import time
from sklearn.metrics import accuracy_score
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

Compare RandomForestClassifier and brif in terms of training time and test set AUC and Accuracy.
Note: RandomForestClassifier does not take non-numeric input variables, so datasets used here all have numeric X. 

In [2]:
def make_X_y(df, target_var):
    X_vars = list(df.columns.values)
    X_vars.remove(target_var)
    X = df[X_vars].to_numpy()
    y = df[target_var].to_numpy()
    return X, y

In [3]:
# Binary classification 
csv_names = ['auto','heloc','HTRU2','magic04','optdigits','spambase','thyroid']
resp_vars = ['origin','RiskPerformance','class','class','class','spam','Class']
n_datasets = len(csv_names)

In [139]:
d = 2
this_csv = csv_names[d]
target_var = resp_vars[d]

In [140]:
# Load the data
df = pd.read_csv("data/"+this_csv+".csv")
# Split into training and test
df_train = df.sample(frac = 0.7)
df_test = df.drop(df_train.index)
all_vars = list(df.columns.values)

In [141]:
X_train, y_train = make_X_y(df_train, target_var)
X_test, y_test = make_X_y(df_test, target_var)

In [142]:
# Run randomForest
start = time.time()
clf = RandomForestClassifier(n_jobs = 4, n_estimators=200)
clf.fit(X_train, y_train)
end = time.time()
cpu_rf = end - start
pred_rf_label = clf.predict(X_test)
pred_rf_score = clf.predict_proba(X_test)
pos_label = clf.classes_[0]
score_rf = pred_rf_score[:,0]
fpr, tpr, thresholds = metrics.roc_curve(y_test, score_rf, pos_label=pos_label)
auc_rf = metrics.auc(fpr, tpr)  # Get AUC
accu_rf = accuracy_score(pred_rf_label, y_test)  # Get Accuracy

In [143]:
# Run brif
start = time.time()
bf = brif.brif({"nthreads":4,'ntrees':200})
bf.fit(df_train, target_var)
end = time.time()
cpu_brif = end - start


In [144]:
all_vars = list(df.columns.values)
all_vars.remove(target_var)
start = time.time()
pred_brif_label = bf.predict(df_test[all_vars], type = 'label')
end = time.time()
pred_brif_score = bf.predict(df_test[all_vars], type = 'score')
pos_label = list(pred_brif_score.columns.values)[0]
score_brif = np.array(pred_brif_score.iloc[:,0])
fpr, tpr, thresholds = metrics.roc_curve(df_test[target_var].astype(str), score_brif, pos_label=pos_label)
auc_brif = metrics.auc(fpr, tpr)  # Get AUC
accu_brif = accuracy_score(pred_brif_label, df_test[target_var].astype(str))  # Get Accuracy
print("Prediction time: {:.3f}".format(end - start))

Prediction time: 0.265


In [5]:
# Batch run
log_df = pd.DataFrame({"dataset":[], "n":[], "p":[], "run":[], "train_n":[], "AUC_rf":[], "AUC_brif":[], "ACCU_rf":[], "ACCU_brif":[], 
                       "CPU_rf":[], "CPU_brif":[]})
for d in range(n_datasets):
    this_csv = csv_names[d]
    target_var = resp_vars[d]
    # Load the data
    df = pd.read_csv("data/"+this_csv+".csv")
    
    for run in range(10):
        # Split into training and test
        df_train = df.sample(frac = 0.7, random_state = run*101393)
        df_test = df.drop(df_train.index)
        all_vars = list(df.columns.values)

        # Make numpy matrices for randomForest
        X_train, y_train = make_X_y(df_train, target_var)
        X_test, y_test = make_X_y(df_test, target_var)

        # Run randomForest
        start = time.time()
        clf = RandomForestClassifier(n_jobs = 4, n_estimators=200)
        clf.fit(X_train, y_train)
        end = time.time()
        cpu_rf = end - start
        pred_rf_label = clf.predict(X_test)
        pred_rf_score = clf.predict_proba(X_test)
        pos_label = clf.classes_[0]
        score_rf = pred_rf_score[:,0]
        fpr, tpr, thresholds = metrics.roc_curve(y_test, score_rf, pos_label=pos_label)
        auc_rf = metrics.auc(fpr, tpr)  # Get AUC
        accu_rf = accuracy_score(pred_rf_label, y_test)  # Get Accuracy

        # Run brif
        start = time.time()
        bf = brif.brif({"nthreads":4,'ntrees':200})
        bf.fit(df_train, target_var)
        end = time.time()
        cpu_brif = end - start

        all_vars = list(df.columns.values)
        all_vars.remove(target_var)
        pred_brif_label = bf.predict(df_test[all_vars], type = 'label')
        pred_brif_score = bf.predict(df_test[all_vars], type = 'score')
        pos_label = list(pred_brif_score.columns.values)[0]
        score_brif = np.array(pred_brif_score.iloc[:,0])
        fpr, tpr, thresholds = metrics.roc_curve(df_test[target_var].astype(str), score_brif, pos_label=pos_label)
        auc_brif = metrics.auc(fpr, tpr)  # Get AUC
        accu_brif = accuracy_score(pred_brif_label, df_test[target_var].astype(str))  # Get Accuracy

        this_entry = pd.DataFrame({"dataset":[this_csv], "n":[df.shape[0]], "p":[df.shape[1]-1], "run":[run], "train_n":[df_train.shape[0]], 
                      "AUC_rf":[auc_rf], "AUC_brif":[auc_brif], 
                      "ACCU_rf":[accu_rf], "ACCU_brif":[accu_brif], 
                      "CPU_rf":[cpu_rf], "CPU_brif":[cpu_brif]})
        log_df = pd.concat([log_df, this_entry], ignore_index = True)
        print(this_entry)
        
display(log_df)
log_df.to_csv("results/runlog.csv")

  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    0      274  0.967486  0.959245  0.923729    0.90678   

     CPU_rf  CPU_brif  
0  0.220073     0.015  
  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    1      274  0.954008  0.947944  0.889831    0.90678   

     CPU_rf  CPU_brif  
0  0.239875     0.011  
  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    2      274  0.963824  0.962647  0.923729   0.923729   

     CPU_rf  CPU_brif  
0  0.225131  0.013653  
  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    3      274  0.933481  0.933164  0.847458   0.830508   

     CPU_rf  CPU_brif  
0  0.219997     0.011  
  dataset    n  p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0    auto  392  7    4      274  0.954186  0.943379  0.881356   0.872881   

   CPU_rf  CPU_brif  
0  0.2182  0.011004  

     dataset     n   p  run  train_n  AUC_rf  AUC_brif  ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    1     3934     1.0       1.0   0.9828   0.985765   

     CPU_rf  CPU_brif  
0  0.599486  1.061721  
     dataset     n   p  run  train_n  AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    2     3934     1.0       1.0  0.982206   0.983393   

     CPU_rf  CPU_brif  
0  0.630608  1.059459  
     dataset     n   p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    3     3934  0.999968  0.999984  0.984579   0.986951   

     CPU_rf  CPU_brif  
0  0.583429  1.006139  
     dataset     n   p  run  train_n  AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    4     3934     1.0       1.0  0.985765   0.986951   

     CPU_rf  CPU_brif  
0  0.641191  1.031105  
     dataset     n   p  run  train_n    AUC_rf  AUC_brif   ACCU_rf  ACCU_brif  \
0  optdigits  5620  64    5     3934  0.999974       1.0  0.978055   0.980427   

     CP

Unnamed: 0,dataset,n,p,run,train_n,AUC_rf,AUC_brif,ACCU_rf,ACCU_brif,CPU_rf,CPU_brif
0,auto,392.0,7.0,0.0,274.0,0.967486,0.959245,0.923729,0.906780,0.220073,0.015000
1,auto,392.0,7.0,1.0,274.0,0.954008,0.947944,0.889831,0.906780,0.239875,0.011000
2,auto,392.0,7.0,2.0,274.0,0.963824,0.962647,0.923729,0.923729,0.225131,0.013653
3,auto,392.0,7.0,3.0,274.0,0.933481,0.933164,0.847458,0.830508,0.219997,0.011000
4,auto,392.0,7.0,4.0,274.0,0.954186,0.943379,0.881356,0.872881,0.218200,0.011004
...,...,...,...,...,...,...,...,...,...,...,...
65,thyroid,3772.0,21.0,5.0,2640.0,0.999955,0.999667,0.993816,0.972615,0.292132,0.076047
66,thyroid,3772.0,21.0,6.0,2640.0,0.999839,0.999679,0.996466,0.965548,0.269531,0.069050
67,thyroid,3772.0,21.0,7.0,2640.0,0.998334,0.998350,0.991166,0.967314,0.283914,0.072000
68,thyroid,3772.0,21.0,8.0,2640.0,0.999878,0.999687,0.995583,0.969965,0.275993,0.075999


In [6]:
log_df.groupby('dataset').mean()

Unnamed: 0_level_0,n,p,run,train_n,AUC_rf,AUC_brif,ACCU_rf,ACCU_brif,CPU_rf,CPU_brif
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
HTRU2,17898.0,8.0,4.5,12529.0,0.969807,0.973543,0.9794,0.976849,1.940101,0.773739
auto,392.0,7.0,4.5,274.0,0.962414,0.955291,0.900847,0.894915,0.221824,0.011873
heloc,10459.0,23.0,4.5,7321.0,0.787063,0.787712,0.719726,0.717814,0.882402,2.229878
magic04,19020.0,10.0,4.5,13314.0,0.933934,0.93035,0.877655,0.868524,4.099525,4.603109
optdigits,5620.0,64.0,4.5,3934.0,0.999983,0.999996,0.982503,0.984816,0.668762,1.119832
spambase,4601.0,57.0,4.5,3221.0,0.988577,0.989062,0.955942,0.957319,0.532901,0.326058
thyroid,3772.0,21.0,4.5,2640.0,0.999684,0.999275,0.995936,0.970318,0.302679,0.076854


Observation: brif and RandomForestClassifier performed neck and neck in all aspects. 