In [1]:
import os
# When the kernel starts inside a directory named exactly "notebooks", step up
# one level so that the relative data path ("data/qqp") and the `src` package
# imports used further down resolve against the parent directory.
# The final path component is compared exactly: a plain suffix test would also
# fire for directories whose names merely end in "notebooks" (e.g.
# "old_notebooks") and move the process somewhere unintended.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")

import importlib
from IPython.display import display, Markdown, Latex
import numpy as np
import pandas as pd
# None removes the column-width limit, so long cell text (such as the
# comma-separated "Examples" lists shown later) is displayed without truncation.
pd.set_option("display.max_colwidth", None)

from src.utils import tree_utils
from src.features import feature_utils

### Load the maximum likelihood subtrees for QQP

In [2]:
# Location and dataset name of the QQP trees, gathered in one place so the
# load call below reads as a single step.
qqp_tree_source = dict(output_dir="data/qqp", dataset="qqp")
data = feature_utils.load_trees(**qqp_tree_source)

parsed 65528/65536 trees
parsed 16382/16384 trees


### Enumerate all of the subtrees and sort by mutual information

In [3]:
# Attach a feature set called "Subtrees" to `data`. feature_utils.subtrees is
# the callable handed to add_features (presumably the per-tree extractor —
# confirm in src/features/feature_utils.py).
extract_subtrees = feature_utils.subtrees
feature_utils.add_features(data, "Subtrees", extract_subtrees)

# Build the per-subtree table and preview its first 15 rows.
subtrees = feature_utils.get_subtree_feature_table(data)
subtrees.head(15)

683959 Subtrees features


Unnamed: 0,Root,Subtree,Yield,MI,Count,No paraphrase,Paraphrase,Majority label,% majority
0,70,(70 *),[UNK]/ϵ,0.012509,3216.0,2682.0,534.0,No paraphrase,0.833748
1,49,(49 *),ϵ/[UNK],0.006571,2452.0,1919.0,533.0,No paraphrase,0.782396
2,59,(59 *),how/how,0.00532,11742.0,4579.0,7163.0,Paraphrase,0.610014
3,75,(75 *),i/i,0.004217,7634.0,2856.0,4778.0,Paraphrase,0.625851
4,49,(49 *),ϵ/in,0.003368,3469.0,2331.0,1138.0,No paraphrase,0.671852
5,74,(74 *),best/best,0.002809,4469.0,1619.0,2850.0,Paraphrase,0.637665
6,49,(49 *),ϵ/a,0.002739,3364.0,2213.0,1151.0,No paraphrase,0.657754
7,32,(32 *),improve/improve,0.002725,523.0,59.0,464.0,Paraphrase,0.885714
8,71,(71 *),com/com,0.00246,277.0,271.0,6.0,No paraphrase,0.97491
9,87,(87 *),[UNK]/[UNK],0.002299,1918.0,1329.0,589.0,No paraphrase,0.692708


### Create aggregated features by merging subtrees by root non-terminal and majority class label

In [4]:
# Merge the "Subtrees" features (K=1000) into groups registered on `data` under
# the name "Subtree groups", then fetch the resulting table.
feature_utils.add_merges(data, "Subtrees", K=1000, merge_name="Subtree groups")
subtree_groups = feature_utils.get_merged_feature_table(data, merge_name="Subtree groups")

# Show the first 15 groups arranged by majority label. A stable sort is
# requested explicitly so that rows sharing a label keep the order in which the
# table delivered them (which appears to be descending MI); the default
# quicksort does not guarantee that.
subtree_groups.head(15).sort_values(by=["Majority label idx"], kind="mergesort")

Unnamed: 0,Root,Examples,MI,Count,No paraphrase,Paraphrase,Majority label idx,Majority label,% majority,Support count,Counter count
0,49,"ϵ/[UNK], ϵ/in, ϵ/a, ϵ/-, ϵ/for, ϵ/the, ϵ/use, ϵ/an",0.013399,23987.0,14570.0,9417.0,0,No paraphrase,0.607403,14570.0,9417.0
1,70,"[UNK]/ϵ, in/ϵ, a/ϵ, like/ϵ, of/ϵ, ""/ϵ, the/ϵ, for/ϵ",0.012958,21299.0,13112.0,8187.0,0,No paraphrase,0.615605,13112.0,8187.0
13,59,"why/how, why/what, how/why, why/can, what/is, how/should, how/are, what/why",0.004815,3348.0,2372.0,976.0,0,No paraphrase,0.708358,2372.0,976.0
2,14,"how can/how can, how do/how can, how can/how do, how do/how do, how can /what is the, how can/how should, why do/why do, how can/what should",0.009291,9684.0,3269.0,6415.0,1,Paraphrase,0.662399,6415.0,3269.0
3,25,"new year/new year, world war/world war, donald trump/donald trump, hillary clinton/hillary clinton, long distance/long distance, your life/your life, time travel/time travel, digital marketing/digital marketing",0.008065,2179.0,373.0,1806.0,1,Paraphrase,0.828519,1806.0,373.0
4,59,"how/how, why/why, when/when, how/what",0.007567,17264.0,6861.0,10403.0,1,Paraphrase,0.602572,10403.0,6861.0
5,32,"improve/improve, increase/increase, earn/earn, control/control, make/make, earn/make, become/become, make/earn",0.007101,3183.0,771.0,2412.0,1,Paraphrase,0.757614,2412.0,771.0
6,81,"can/can, can/do, do/can, do/do, can/should, did/did, do/should, can/are",0.006662,11692.0,4404.0,7288.0,1,Paraphrase,0.623311,7288.0,4404.0
7,75,"i/i, one/i, i/we, i/one, i/best, you/you, we/i, i/someone",0.00658,11155.0,4171.0,6984.0,1,Paraphrase,0.626064,6984.0,4171.0
8,71,"quora/quora, 2017/2017, india/india, 2016/2016, delhi/delhi, why/why, why/ϵ, clinton/clinton",0.00658,2995.0,730.0,2265.0,1,Paraphrase,0.756089,2265.0,730.0


To group the top 1000 subtrees, restricted to subtrees with at least two leaves:

In [5]:
def _has_multiple_leaves(w):
    """Return True when the first element of ``w`` contains at least two "*" characters.

    Assumes ``w[0]`` is the subtree string (e.g. "(70 *)") with one "*" per
    leaf — TODO confirm against feature_utils.add_merges.
    """
    star_count = w[0].count("*")
    return star_count >= 2


# The merge name is used both to register the groups and to read them back,
# so bind it once.
len2_merge_name = "Subtree groups (len >= 2)"

feature_utils.add_merges(
    data,
    "Subtrees",
    K=1000,
    merge_name=len2_merge_name,
    filter_=_has_multiple_leaves,
)
subtree_groups_len2 = feature_utils.get_merged_feature_table(data, merge_name=len2_merge_name)
subtree_groups_len2.head(30)

filtering


Unnamed: 0,Root,Examples,MI,Count,No paraphrase,Paraphrase,Majority label idx,Majority label,% majority,Support count,Counter count
0,25,"new year/new year, world war/world war, donald trump/donald trump, hillary clinton/hillary clinton, long distance/long distance, your life/your life, time travel/time travel, digital marketing/digital marketing",0.013539,3351.0,538.0,2813.0,1,Paraphrase,0.839248,2813.0,538.0
1,14,"how can/how can, how do/how can, how can/how do, how do/how do, how can /what is the, how can/how should, why do/why do, how can/what should",0.011871,12024.0,4072.0,7952.0,1,Paraphrase,0.661317,7952.0,4072.0
2,31,"improve my/improve my, earn money/earn money, make money/make money, earn money/make money, lose weight/lose weight, improve my/improve , make money/earn money, get pregnant/get pregnant",0.009879,3151.0,622.0,2529.0,1,Paraphrase,0.80241,2529.0,622.0
3,27,"candy imported/candy imported, lose weight/lose weight, writing skills/writing skills, commit suicide/commit suicide, scientifically tested/scientifically tested, presence of mind/presence of mind, belly fat/belly fat, porn addiction/porn addiction",0.009253,1350.0,93.0,1257.0,1,Paraphrase,0.930473,1257.0,93.0
4,14,"is /what is, is /how do, is /what are, do /what is, should /how do, why do/how do, can /what is, do /how do",0.007106,2169.0,1759.0,410.0,0,No paraphrase,0.810686,1759.0,410.0
5,3,"candy imported in/candy imported in, not be/not be, traffic to/traffic on, resolutions for/resolutions for, way to learn/way to learn, pronunciation in/english , traffic on/traffic to, marked as/marked as",0.005732,915.0,76.0,839.0,1,Paraphrase,0.916031,839.0,76.0
6,10,"i improve my/i improve my, you have/you have, i earn money/i earn money, people over/people over, time travel/time travel, donald trump/donald trump, i get rid of/i get rid of, i improve my/i do to improve my",0.005286,1652.0,316.0,1336.0,1,Paraphrase,0.808343,1336.0,316.0
7,7,"saltwater taffy/saltwater taffy, way to/way to, purpose of/purpose of, ways to/way to, presence of/presence of, rid of/rid of, allowed to/allowed to, day of/day of",0.004448,1348.0,251.0,1097.0,1,Paraphrase,0.813333,1097.0,251.0
8,24,"do to/ϵ, to learn/to learn, is //ϵ, m 18 ./ϵ, ' m 18 ./ϵ, i ' m 18 ./ϵ, 18 ./ϵ, and why/ϵ",0.00325,1128.0,231.0,897.0,1,Paraphrase,0.79469,897.0,231.0
9,4,"ϵ/do to, ϵ/way to, ϵ/and why, ϵ/ways to, ϵ/you think, ϵ/? why, ϵ/possible to, ϵ/? what",0.002913,2415.0,742.0,1673.0,1,Paraphrase,0.692594,1673.0,742.0
