In [16]:
from sklearn.datasets.mldata import fetch_mldata
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn import metrics
from skater.core.global_interpretation.interpretable_models.rule_lists import BayesianRuleLists
%matplotlib inline

feature_labels = ["Pregnant","Glucose_concentration_test","Blood_pressure(mmHg)","Triceps_skin_fold_thickness(mm)","2-Hour_serum_insulin_(mu_U/ml)","Body_mass_index","Diabetes_pedigree_function","Age_(years)"]
    
data = fetch_mldata("diabetes") # get dataset
y = (data.target+1)/2 # to bound the target label from (-1, 1) -> (0, 1)
print(data.DESCR)

mldata.org dataset: diabetes


In [47]:
data_df = pd.DataFrame(data.data, columns=feature_labels)

## Observation:
If we take a quick look at the data, we will notice that this dataset has a bunch of continuous features. There might be a need for for discretization to get better accuracy

In [48]:
data_df.head()

Unnamed: 0,Pregnant,Glucose_concentration_test,Blood_pressure(mmHg),Triceps_skin_fold_thickness(mm),2-Hour_serum_insulin_(mu_U/ml),Body_mass_index,Diabetes_pedigree_function,Age_(years)
0,6.0,148.0,72.0,35.0,0.0,33.599998,0.627,50.0
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0
2,8.0,183.0,64.0,0.0,0.0,23.299999,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.099998,2.288,33.0


In [49]:
# Sample code for filtering pandas dataframe by column
# data_df = data_df[["Pregnant","Glucose_concentration_test"]]

In [51]:
# Default test split-size = 0.25
Xtrain, Xtest, ytrain, ytest = train_test_split(data_df, y) # split
# Target Labels: 1:positive 0:negative
np.unique(ytrain)

array([ 0.,  1.])

In [52]:
print(Xtrain.head())
print(Xtrain.shape)
print(ytrain.shape)

     Pregnant  Glucose_concentration_test  Blood_pressure(mmHg)  \
612       7.0                       168.0                  88.0   
232       1.0                        79.0                  80.0   
516       9.0                       145.0                  88.0   
402       5.0                       136.0                  84.0   
290       0.0                        78.0                  88.0   

     Triceps_skin_fold_thickness(mm)  2-Hour_serum_insulin_(mu_U/ml)  \
612                             42.0                           321.0   
232                             25.0                            37.0   
516                             34.0                           165.0   
402                             41.0                            88.0   
290                             29.0                            40.0   

     Body_mass_index  Diabetes_pedigree_function  Age_(years)  
612        38.200001                       0.787         40.0  
232        25.400000                

In [54]:
# Features to be used for training a Rule List model
print(Xtrain.columns)

Index(['Pregnant', 'Glucose_concentration_test', 'Blood_pressure(mmHg)',
       'Triceps_skin_fold_thickness(mm)', '2-Hour_serum_insulin_(mu_U/ml)',
       'Body_mass_index', 'Diabetes_pedigree_function', 'Age_(years)'],
      dtype='object')


In [55]:
# Create an instance of the estimator
sbrl_inst = BayesianRuleLists(min_rule_len=1, max_rule_len=12, iterations=50000, n_chains=60, lambda_=12)
# Train a model, by default discretizer is enabled. So, you wish to exclude features then exclude them using 
# the undiscretize_feature_list parameter
model = sbrl_inst.fit(Xtrain, ytrain, undiscretize_feature_list=["Pregnant"])

Eclat


parameter specification:


 tidLists
 support
 minlen
 maxlen
            target
   ext


    FALSE
     0.1
      1
     12
 frequent itemsets
 FALSE



algorithmic control:


 sparse
 sort
 verbose


      7
   -2
    TRUE



Absolute minimum support count:
 
38
 



create itemset ... 

set transactions ...
[886 item(s),
 386 transaction(s)] done 
[0.00s].


sorting and recoding items ... 
[35 item(s)] 
done [0.00s].


creating bit matrix ... 
[35 row(s), 386 column(s)] 
done [0.00s].

writing  ... 
[104 set(s)] done 
[0.00s].

Creating S4 object  ... 
done 
[0.00s].

Eclat


parameter specification:


 tidLists
 support
 minlen
 maxlen
            target
   ext


    FALSE
     0.1
      1
     12
 frequent itemsets
 FALSE



algorithmic control:


 sparse
 sort
 verbose


      7
   -2
    TRUE



Absolute minimum support count:
 
19
 



create itemset ... 

set transactions ...
[625 item(s),
 190 transaction(s)] done 
[0.00s].


sorting and recoding items ... 
[32 item(s

In [56]:
sbrl_inst.print_model()

The rules list is : 

If      {Glucose_concentration_test_q_label=4} (rule[79]) then positive probability = 0.33793103

else if {Body_mass_index_q_label=1} (rule[28]) then positive probability = 0.95081967

else if {Diabetes_pedigree_function_q_label=4,Age_.years._q_label=3} (rule[42]) then positive probability = 0.33333333

else if {Diabetes_pedigree_function_q_label=4,Age_.years._q_label=4} (rule[43]) then positive probability = 0.23529412

else if {Glucose_concentration_test_q_label=1} (rule[55]) then positive probability = 0.90000000

else  (default rule)  then positive probability = 0.69565217



In [57]:
features_to_descritize = sbrl_inst.filter_to_be_discretize(Xtest.columns, ["Pregnant"])
Xtest_filtered = sbrl_inst.discretizer(Xtest, features_to_descritize)
        

results_test_sbrl = sbrl_inst.predict_prob(Xtest_filtered)
print
fpr_sbrl, tpr_sbrl, thresholds_sbrl = metrics.roc_curve(ytest ,results_test_sbrl[1], pos_label=1)
roc_auc_sbrl = metrics.auc(fpr_sbrl, tpr_sbrl)
print("AUC-ROC using SBRL: {}".format(roc_auc_sbrl))
b = sbrl_inst.predict(Xtest_filtered)[1]
print("Accuracy: {}".format(metrics.accuracy_score(ytest, b)))

AUC-ROC using SBRL: 0.7955465587044535
Accuracy: 0.7395833333333334


In [58]:
data_df.head()

Unnamed: 0,Pregnant,Glucose_concentration_test,Blood_pressure(mmHg),Triceps_skin_fold_thickness(mm),2-Hour_serum_insulin_(mu_U/ml),Body_mass_index,Diabetes_pedigree_function,Age_(years)
0,6.0,148.0,72.0,35.0,0.0,33.599998,0.627,50.0
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0
2,8.0,183.0,64.0,0.0,0.0,23.299999,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.099998,2.288,33.0


In [59]:
from sklearn.ensemble import RandomForestClassifier
x_train, x_test, y_train, y_test = train_test_split(data_df, y)

rf_model = RandomForestClassifier(n_estimators=100, random_state=123).fit(x_train, y_train)
results_test_rf = pd.DataFrame(rf_model.predict_proba(x_test))

In [60]:
fpr_rf, tpr_rf, thresholds_rf = metrics.roc_curve(y_test, results_test_rf[1], pos_label=1)
roc_auc_rf = metrics.auc(fpr_rf, tpr_rf)
print("AUC-ROC using Random Forest: {}".format(roc_auc_rf))
print("Accuracy: {}".format(metrics.accuracy_score(y_test, rf_model.predict(x_test))))

AUC-ROC using Random Forest: 0.8139104477611939
Accuracy: 0.734375


In [61]:
# access all rules, or filter rules as needed for debugging and building intuition
sbrl_inst.access_learned_rules("all")['rulenames']

['{Age_.years._q_label=1}',
 '{Age_.years._q_label=2}',
 '{Age_.years._q_label=3}',
 '{Age_.years._q_label=4}',
 '{Age_.years.=21,Age_.years._q_label=1}',
 '{Age_.years.=21}',
 '{Age_.years.=22,Age_.years._q_label=1}',
 '{Age_.years.=22}',
 '{Blood_pressure.mmHg._q_label=1,Age_.years._q_label=1}',
 '{Blood_pressure.mmHg._q_label=1,Body_mass_index_q_label=1}',
 '{Blood_pressure.mmHg._q_label=1,Triceps_skin_fold_thickness.mm._q_label=2}',
 '{Blood_pressure.mmHg._q_label=1}',
 '{Blood_pressure.mmHg._q_label=2,X2.Hour_serum_insulin_.mu_U.ml._q_label=4}',
 '{Blood_pressure.mmHg._q_label=2}',
 '{Blood_pressure.mmHg._q_label=3,Diabetes_pedigree_function_q_label=4}',
 '{Blood_pressure.mmHg._q_label=3}',
 '{Blood_pressure.mmHg._q_label=4,Age_.years._q_label=4}',
 '{Blood_pressure.mmHg._q_label=4,Body_mass_index_q_label=4}',
 '{Blood_pressure.mmHg._q_label=4,Diabetes_pedigree_function_q_label=4}',
 '{Blood_pressure.mmHg._q_label=4,Triceps_skin_fold_thickness.mm._q_label=4}',
 '{Blood_pressure.mm