# Extracting rules from Random Forest

In [11]:
# Importing

import pandas as pd

from pkg import utils
from pkg import model_vis

In [12]:
# Importing model

parent = 'models'
filename = 'v01.pkl'

path = utils.get_path(parent, filename)

rf = utils.read_model(path)
features = rf.feature_names_in_

In [13]:
# Importing feature df

parent = 'data'
filename = 'featured.csv'

path = utils.get_path(parent, filename)

df = pd.read_csv(path, index_col = 0)

y = df.iloc[:,0]
X = df.iloc[:,1:]

In [14]:
# Configurations

bins = 10

In [15]:
feature = X.iloc[:,0]

feature.quantile(0.2)

0.2318344782390788

In [61]:
# Creating binning df

def binning_serie(feature, bins=10):
    key = feature.name
    values = []
    q = 1/bins

    for b in range(1,bins+1):
        value = feature.quantile(b*q)
        values.append(value)

    return key, values


def binning_df(X, bins=10):
    bin_dict = {}
    
    for column in X:
        key, values = binning_serie(X[column],bins=bins)
        bin_dict[key] = values
    
    bin_df = pd.DataFrame.from_dict(bin_dict)

    return bin_df


bin_df = binning_df(X, bins=bins)

bin_df


Unnamed: 0,dominance_simpson,12DICHLORETHDEG-PWY,AEROBACTINSYN-PWY,ALLANTOINDEG-PWY,CRNFORCAT-PWY,DENITRIFICATION-PWY,DHGLUCONATE-PYR-CAT-PWY,DTDPRHAMSYN-PWY,METH-ACETATE-PWY,P108-PWY,...,RH_temporal_theta_t1,Occipital_low_alpha_t1,RH_lateral_frontal_high_alpha_t1,LH_lateral_frontal_beta_t1,LH_parietal_beta_t1,RH_temporal_beta_t1,LH_temporal_beta_t1,b04_t1,renda_familiar_total_t0,a08_t1
0,0.187102,0.0,0.0,0.0,0.0,0.0,0.0,662.146895,0.0,0.0,...,6.057718,4.510006,1.111489,1.368319,0.73339,1.021357,1.118883,1.0,7.245227,1.0
1,0.231834,0.0,0.0,0.0,0.0,0.0,0.0,1130.342156,0.0,0.0,...,7.485468,5.296529,1.337243,1.649866,0.961341,1.28233,1.39075,1.0,7.824446,1.0
2,0.275396,0.0,0.0,0.0,0.0,0.0,0.0,1526.57223,0.0,16.173716,...,8.578899,5.965594,1.534865,1.978558,1.128128,1.526605,1.652472,1.0,8.160804,1.0
3,0.300424,0.0,0.0,0.0,0.0,0.0,0.0,1789.62681,0.0,43.216242,...,9.67612,7.054356,1.71901,2.413841,1.24507,1.738249,1.932215,1.0,8.517393,1.0
4,0.34253,0.0,0.0,0.0,0.0,0.0,0.0,2117.169278,0.0,73.497507,...,10.7946,7.848929,1.955834,2.841292,1.423092,1.961781,2.106749,1.0,8.779711,1.0
5,0.386348,0.0,0.0,0.0,0.0,0.0,0.0,2543.294511,0.0,127.552734,...,11.932213,8.674407,2.240022,3.221389,1.559242,2.211223,2.453371,1.0,9.029205,1.0
6,0.447285,0.0,0.0,3.100621,0.0,0.0,0.0,2945.321358,0.0,172.171421,...,13.298986,10.063115,2.436846,3.8454,1.776544,2.467078,2.849434,2.0,9.392779,2.0
7,0.515101,0.0,51.153225,34.653072,0.0,13.843803,0.0,3587.762606,0.0,277.357637,...,15.291361,11.473245,2.763049,4.648066,2.03015,2.841841,3.328359,2.0,9.741086,2.0
8,0.62725,0.0,190.679134,96.519175,37.558899,38.386303,0.0,4391.945951,87.439654,471.775848,...,18.804778,13.637315,3.371064,5.737608,2.64582,3.81484,4.051358,3.0,10.126711,2.0
9,1.0,226.40735,1235.695108,1175.410031,694.760671,423.689316,140.735788,7769.741507,1386.891505,1415.168736,...,34.623157,20.739614,6.006123,10.628805,5.584622,7.328217,7.521854,5.0,11.678457,5.0


In [66]:
rules = []

for tree in rf:
    new_rules = find_rules(tree, features)
    rules += new_rules


'''
unique_rules = []

for rule in rules:
    if rule not in unique_rules:
        unique_rules.append(rule)

'''

for rule in rules:
  print(rule)


[('12DICHLORETHDEG-PWY', 47.71), ('AEROBACTINSYN-PWY', 0.66)]
[('12DICHLORETHDEG-PWY', 47.71), ('not AEROBACTINSYN-PWY', 0.66), ('CRNFORCAT-PWY', 0.0), ('DENITRIFICATION-PWY', 968.4), ('DHGLUCONATE-PYR-CAT-PWY', 804.34)]
[('12DICHLORETHDEG-PWY', 47.71), ('not AEROBACTINSYN-PWY', 0.66), ('CRNFORCAT-PWY', 0.0), ('DENITRIFICATION-PWY', 968.4), ('not DHGLUCONATE-PYR-CAT-PWY', 804.34)]
[('12DICHLORETHDEG-PWY', 47.71), ('not AEROBACTINSYN-PWY', 0.66), ('CRNFORCAT-PWY', 0.0), ('not DENITRIFICATION-PWY', 968.4)]
[('12DICHLORETHDEG-PWY', 47.71), ('not AEROBACTINSYN-PWY', 0.66), ('not CRNFORCAT-PWY', 0.0), ('P125-PWY', 0.48)]
[('12DICHLORETHDEG-PWY', 47.71), ('not AEROBACTINSYN-PWY', 0.66), ('not CRNFORCAT-PWY', 0.0), ('not P125-PWY', 0.48)]
[('not 12DICHLORETHDEG-PWY', 47.71), ('P621-PWY', 11.48), ('POLYAMINSYN3-PWY', 5.82)]
[('not 12DICHLORETHDEG-PWY', 47.71), ('P621-PWY', 11.48), ('not POLYAMINSYN3-PWY', 5.82), ('PWY-1861', 32.6)]
[('not 12DICHLORETHDEG-PWY', 47.71), ('P621-PWY', 11.48), ('no

In [70]:
tree = rf[0]


from sklearn.tree import _tree

def find_rules(tree, features):
    dt = tree.tree_
    
    rules = []
    
    def visitor(node, depth, feat, thresholds):
        indent = ' ' * 4 * depth
        
        if dt.feature[node] != _tree.TREE_UNDEFINED:
            feat.append(dt.feature[node])
            thresholds.append(dt.threshold[node])


            visitor(dt.children_left[node], depth + 1, features.copy(), thresholds.copy())
            feat.pop()  # Remove the last added rule
            thresholds.pop()

            feat.append('not ' + dt.feature[node])
            thresholds.append(dt.threshold[node])

            visitor(dt.children_right[node], depth + 1, features.copy(), thresholds.copy())
            feat.pop()  # Remove the last added rule
            thresholds.pop()
        else:
            rule = {'features':feat, 'thresholds': thresholds}
            rules.append(rule.copy())
    
    visitor(0, 1, [], [])
    return rules

rules = find_rules(tree, features)

AttributeError: 'numpy.ndarray' object has no attribute 'append'