
- [x] transform the frequent patterns into JSON format
- [x] create output folders for patterns of size up to 6 (vertices)
- [x] create output folders for patterns of size exactly 6 (vertices)
- [x] for each frequency threshold and each random forest, find all occurrences of the frequent patterns in the random forest (see the notebook 'Frequent Pattern Weighting -- Which frequent subtrees should we replace by function calls?.ipynb')

In [1]:
%%bash
# Create the output directories, one per dataset / variant / pattern-size
# bucket (leq6 and eq6).
# `mkdir -p` also creates missing parent directories and does not fail when
# the directory already exists, so this cell can safely be re-run.
for dataset in spambase satlog; do
    for variant in NoLeafEdgesWithSplitValues; do
        for patternsize in leq6 eq6; do
            mkdir -p -- "forests/rootedFrequentTrees/${dataset}/${variant}/${patternsize}"
        done
    done
done

In [2]:
%%bash
# Transform the frequent patterns from cString to JSON format.
# Every *.patterns file is piped through cString2json.py twice: once with the
# arguments `leq 6` (result stored in leq6/) and once with `eq 6` (result
# stored in eq6/). The output keeps the input's base name with a .json suffix.
for dataset in spambase satlog; do
    for variant in NoLeafEdgesWithSplitValues; do
        variantdir="forests/rootedFrequentTrees/${dataset}/${variant}"
        for f in "${variantdir}"/*.patterns; do
            # An unmatched glob stays literal; skip it instead of redirecting
            # from a file that does not exist.
            [[ -e "${f}" ]] || continue
            name=$(basename -- "${f}" .patterns)

            for mode in leq eq; do
                ./SubtreeSelection/cString2json.py "${mode}" 6 \
                < "${f}" \
                > "${variantdir}/${mode}6/${name}.json"
            done
        done
    done
done

In [None]:
%%bash
# For every forest file forests/<dataset>/text/*.json whose path does not
# contain "DT", run findEmbeddings.py once per matching pattern file and store
# the result next to the pattern file as <pattern>_allEmbeddings.json.
# The jobs are started through GNU sem (semaphore id "embeddings", at most
# 3 jobs at a time); the final `sem --wait` blocks until all of them are done.
for dataset in adult wine-quality; do
    # Iterate over the glob directly instead of parsing `ls` output.
    for transactionfile in "forests/${dataset}/text/"*.json; do
        [[ -e "${transactionfile}" ]] || continue      # unmatched glob stays literal
        [[ "${transactionfile}" != *DT* ]] || continue # same filter as `grep -v DT`
        transactionBasename=$(basename -- "${transactionfile}" .json)
        for variant in NoLeafEdgesWithSplitValues; do
            for patternsize in leq6 eq6; do
                echo "processing ${transactionBasename} : ${variant} ${patternsize}"
                patterndir="forests/rootedFrequentTrees/${dataset}/${variant}/${patternsize}"
                # The "_" right after the base name keeps e.g. ET_1 from also
                # picking up the pattern files of ET_10.
                for patternfile in "${patterndir}/${transactionBasename}_"*.json; do
                    [[ -e "${patternfile}" ]] || continue
                    # Results of earlier runs live in the same folder; never
                    # use them as pattern input.
                    [[ "${patternfile}" != *allEmbeddings* ]] || continue
                    transactionsWithEmbeddings="$(basename -- "${patternfile}" .json)_allEmbeddings.json"
                    sem --id embeddings --jobs 3 \
                    ./SubtreeSelection/findEmbeddings.py "${patternfile}" "${transactionfile}" \
                    > "${patterndir}/${transactionsWithEmbeddings}"
                done
            done
        done
    done
done

sem --id embeddings --wait

In [13]:
# Check out the errors from the embedding computation: for the ET_1 forest of
# each dataset, print the first 80 characters of the forest file and, for every
# pattern file that belongs to it, the first 10 characters of that file.

for dataset in adult wine-quality; do
    for transactionfile in "forests/${dataset}/text/ET_1.json"; do
        transactionBasename=$(basename -- "${transactionfile}" .json)
        for variant in WithLeafEdges NoLeafEdges; do
            for patternsize in leq6 eq6; do
                echo "processing ${transactionBasename} : ${variant} ${patternsize}"
                head -n 1 -- "${transactionfile}" | cut -c-80
                echo patterns:
                patterndir="forests/rootedFrequentTrees/${dataset}/${variant}/${patternsize}"
                # Iterate over the glob directly instead of parsing `ls` output.
                for patternfile in "${patterndir}/${transactionBasename}_"*.json; do
                    [[ -e "${patternfile}" ]] || continue          # unmatched glob stays literal
                    [[ "${patternfile}" != *allEmbeddings* ]] || continue
                    # Quoted, so JSON characters such as "[" or "*" in the file
                    # head are printed verbatim and never glob-expanded.
                    printf '%s : %s\n' "${patternfile}" "$(head -n 1 -- "${patternfile}" | cut -c-10)"
                done
            done
        done
    done
done

processing ET_1 : WithLeafEdges leq6
[{"id":0,"numSamples":24420,"probLeft":0.5955364455364456,"probRight":0.40446355
patterns:
forests/rootedFrequentTrees/adult/WithLeafEdges/leq6/ET_1_t10.json : [{"pattern
forests/rootedFrequentTrees/adult/WithLeafEdges/leq6/ET_1_t11.json : [{"pattern
forests/rootedFrequentTrees/adult/WithLeafEdges/leq6/ET_1_t12.json : [{"pattern
forests/rootedFrequentTrees/adult/WithLeafEdges/leq6/ET_1_t13.json : [{"pattern
forests/rootedFrequentTrees/adult/WithLeafEdges/leq6/ET_1_t14.json : [{"pattern
forests/rootedFrequentTrees/adult/WithLeafEdges/leq6/ET_1_t15.json : [{"pattern
forests/rootedFrequentTrees/adult/WithLeafEdges/leq6/ET_1_t16.json : [{"pattern
forests/rootedFrequentTrees/adult/WithLeafEdges/leq6/ET_1_t17.json : [{"pattern
forests/rootedFrequentTrees/adult/WithLeafEdges/leq6/ET_1_t18.json : [{"pattern
forests/rootedFrequentTrees/adult/WithLeafEdges/leq6/ET_1_t19.json : [{"pattern
forests/rootedFrequentTrees/adult/WithLeafEdges/leq6/ET_1_t20.json : [{"

forests/rootedFrequentTrees/wine-quality/WithLeafEdges/leq6/ET_1_t16.json : [{"pattern
forests/rootedFrequentTrees/wine-quality/WithLeafEdges/leq6/ET_1_t17.json : [{"pattern
forests/rootedFrequentTrees/wine-quality/WithLeafEdges/leq6/ET_1_t18.json : [{"pattern
forests/rootedFrequentTrees/wine-quality/WithLeafEdges/leq6/ET_1_t19.json : [{"pattern
forests/rootedFrequentTrees/wine-quality/WithLeafEdges/leq6/ET_1_t20.json : [{"pattern
forests/rootedFrequentTrees/wine-quality/WithLeafEdges/leq6/ET_1_t21.json : [{"pattern
forests/rootedFrequentTrees/wine-quality/WithLeafEdges/leq6/ET_1_t22.json : [{"pattern
forests/rootedFrequentTrees/wine-quality/WithLeafEdges/leq6/ET_1_t23.json : [{"pattern
forests/rootedFrequentTrees/wine-quality/WithLeafEdges/leq6/ET_1_t24.json : [{"pattern
forests/rootedFrequentTrees/wine-quality/WithLeafEdges/leq6/ET_1_t25.json : [{"pattern
forests/rootedFrequentTrees/wine-quality/WithLeafEdges/leq6/ET_1_t2.json : [{"pattern
forests/rootedFrequentTrees/wine-quality/Wit

In [17]:
# There were problems with the application of patterns on the ET_1 and RF_1
# forests: list every forest file whose path contains "_1.".

for dataset in adult wine-quality; do
    for forestfile in "forests/${dataset}/text/"*.json; do
        [[ -e "${forestfile}" ]] || continue   # unmatched glob stays literal
        # A shell pattern match replaces `ls | grep '_1\.'`, so no colour
        # escape sequences from grep can end up in the captured output.
        if [[ "${forestfile}" == *_1.* ]]; then
            printf '%s\n' "${forestfile}"
        fi
    done
done

forests/adult/text/DT_1.json
forests/adult/text/ET_1.json
forests/adult/text/RF_1.json
forests/wine-quality/text/DT_1.json
forests/wine-quality/text/ET_1.json
forests/wine-quality/text/RF_1.json


In [18]:
# Recompute all embeddings for the RF_1 and ET_1 forests. Too many pattern
# files had been applied to them, because ET_1 is a prefix of ET_10 and ET_15
# (and likewise for RF).
# Only forest files whose path contains "_1." and does not contain "DT" are
# processed. Jobs run through GNU sem (semaphore id "embeddings", at most 3 at
# a time); the final `sem --wait` blocks until all of them have finished.

for dataset in adult wine-quality; do
    # Iterate over the glob directly instead of parsing `ls | grep` output.
    for transactionfile in "forests/${dataset}/text/"*.json; do
        [[ -e "${transactionfile}" ]] || continue        # unmatched glob stays literal
        [[ "${transactionfile}" != *DT* ]] || continue   # same filter as `grep -v DT`
        [[ "${transactionfile}" == *_1.* ]] || continue  # same filter as `grep '_1\.'`
        transactionBasename=$(basename -- "${transactionfile}" .json)
        for variant in WithLeafEdges NoLeafEdges; do
            for patternsize in leq6 eq6; do
                echo "processing ${transactionBasename} : ${variant} ${patternsize}"
                patterndir="forests/rootedFrequentTrees/${dataset}/${variant}/${patternsize}"
                # The "_" right after the base name restricts the match to this
                # forest's own pattern files (ET_1_*, not ET_10_*).
                for patternfile in "${patterndir}/${transactionBasename}_"*.json; do
                    [[ -e "${patternfile}" ]] || continue
                    # Results of earlier runs live in the same folder; never
                    # use them as pattern input.
                    [[ "${patternfile}" != *allEmbeddings* ]] || continue
                    transactionsWithEmbeddings="$(basename -- "${patternfile}" .json)_allEmbeddings.json"
                    sem --id embeddings --jobs 3 \
                    ./SubtreeSelection/findEmbeddings.py "${patternfile}" "${transactionfile}" \
                    > "${patterndir}/${transactionsWithEmbeddings}"
                done
            done
        done
    done
done

sem --id embeddings --wait

processing ET_1 : WithLeafEdges leq6
processing ET_1 : WithLeafEdges eq6
processing ET_1 : NoLeafEdges leq6
processing ET_1 : NoLeafEdges eq6
processing RF_1 : WithLeafEdges leq6
processing RF_1 : WithLeafEdges eq6
processing RF_1 : NoLeafEdges leq6
processing RF_1 : NoLeafEdges eq6
processing ET_1 : WithLeafEdges leq6
processing ET_1 : WithLeafEdges eq6
processing ET_1 : NoLeafEdges leq6
processing ET_1 : NoLeafEdges eq6
processing RF_1 : WithLeafEdges leq6
processing RF_1 : WithLeafEdges eq6
processing RF_1 : NoLeafEdges leq6
processing RF_1 : NoLeafEdges eq6
