In [1]:
import irf
from sklearn.datasets import load_boston, load_breast_cancer
import irf.ensemble.wrf as wrf
from irf.ensemble.wrf import wrf_reg
import numpy as np

from sklearn.model_selection import train_test_split

from irf import irf_utils
from irf import irf_jupyter_utils

In [2]:
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [3]:
data = load_breast_cancer()
X = data['data']
y = data['target']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, train_size=0.9)

In [5]:
rf = wrf()
weight = np.ones((X.shape[1],)) / X.shape[1]
rf.fit(X, y, feature_weight=weight)
print(rf.feature_importances_)

[ 0.          0.          0.          0.          0.          0.
  0.00706097  0.11343517  0.          0.          0.          0.          0.
  0.00615402  0.          0.          0.          0.          0.          0.
  0.          0.04457264  0.31979639  0.21288434  0.          0.          0.
  0.29609648  0.          0.        ]


In [6]:
all_rf_tree_data = irf_utils.get_rf_tree_data(rf=rf, X_train=X_train, X_test=X_test, y_test=y_test)

In [8]:
def convert(leaf_node_paths, feature_paths):
    leafnode_to_features = {}
    for i in range(len(leaf_node_paths)):
        leaf_node = leaf_node_paths[i][-1]
        leafnode_to_features[leaf_node] = feature_paths[i]
    return leafnode_to_features

In [9]:
interactions = np.array([])
inp = X_train[y_train==1]

for i in range(len(rf.estimators_)):
    key = "dtree" + str(i)
    tree_dict = all_rf_tree_data[key]
    #assume that ith entry of feature_paths corresponds to ith entry of leaf_node_paths
    feature_paths = tree_dict["all_uniq_leaf_paths_features"]
    leaf_nodes = tree_dict["all_leaf_nodes"]
    leaf_node_paths = tree_dict["all_leaf_node_paths"]
    decision_tree = rf.estimators_[i]
    
    end = decision_tree.apply(inp)
    leafnode_to_features = convert(leaf_node_paths, feature_paths)
    temp_features = np.vectorize(leafnode_to_features.get, otypes=[np.ndarray])(end)
    interactions = np.concatenate((interactions, temp_features))
    


In [10]:
#schema = StructType([StructField("id", IntegerType(), True), 
#                     StructField("list", ArrayType(), True)])

spark = SparkSession \
    .builder \
    .appName("iterative Random Forests with FP-Growth") \
    .getOrCreate()
    
input_list = [(i, interactions[i].tolist()) for i in range(len(interactions))]

df = spark.createDataFrame(input_list, ["id", "items"])
df.show()

+---+---------------+
| id|          items|
+---+---------------+
|  0|[7, 21, 23, 27]|
|  1|[7, 21, 23, 27]|
|  2|    [7, 21, 27]|
|  3|[7, 21, 23, 27]|
|  4|[7, 21, 23, 27]|
|  5|[7, 21, 23, 27]|
|  6|[7, 21, 23, 27]|
|  7|[7, 21, 23, 27]|
|  8|   [21, 23, 27]|
|  9|[7, 21, 23, 27]|
| 10|[7, 21, 23, 27]|
| 11|[7, 21, 23, 27]|
| 12|[7, 21, 23, 27]|
| 13|[7, 21, 23, 27]|
| 14|[7, 21, 23, 27]|
| 15|[7, 21, 23, 27]|
| 16|[7, 21, 23, 27]|
| 17|[7, 21, 23, 27]|
| 18|   [21, 23, 27]|
| 19|[7, 21, 23, 27]|
+---+---------------+
only showing top 20 rows



In [11]:
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.1, minConfidence=0.6)
model = fpGrowth.fit(df)

model.freqItemsets.show(40, False)

+----------------+----+
|items           |freq|
+----------------+----+
|[7]             |1036|
|[7, 23]         |991 |
|[7, 23, 27]     |735 |
|[7, 21]         |458 |
|[7, 21, 23]     |429 |
|[7, 21, 23, 27] |399 |
|[7, 21, 27]     |424 |
|[7, 27]         |772 |
|[27]            |2933|
|[13]            |645 |
|[13, 22]        |355 |
|[13, 22, 27]    |355 |
|[13, 23]        |361 |
|[13, 23, 27]    |361 |
|[13, 27]        |645 |
|[23]            |2870|
|[23, 27]        |2612|
|[22]            |1451|
|[22, 23]        |1134|
|[22, 23, 27]    |1104|
|[22, 27]        |1412|
|[21]            |1444|
|[21, 22]        |997 |
|[21, 22, 23]    |699 |
|[21, 22, 23, 27]|671 |
|[21, 22, 27]    |965 |
|[21, 23]        |1133|
|[21, 23, 27]    |1103|
|[21, 27]        |1410|
+----------------+----+

