In [1]:
import os
def enter_project_root(notebook_dirname="notebooks"):
    """Step up one directory when the kernel was started inside the notebooks folder.

    The rest of this notebook uses the relative path "data/subj" and imports
    from the `src` package, presumably both relative to the project root, so
    the working directory must be the parent of the notebooks folder.

    The final path component is compared exactly. A plain suffix test would
    also fire for directories whose name merely ends in "notebooks"
    (e.g. "old_notebooks", or a project root called "ml-notebooks") and move
    the kernel to the wrong directory.

    Args:
        notebook_dirname: Name of the directory to step out of.

    Returns:
        The current working directory after the call.
    """
    # normpath strips any trailing separator so basename is the last component.
    current_dir = os.path.normpath(os.getcwd())
    if os.path.basename(current_dir) == notebook_dirname:
        os.chdir("..")
    return os.getcwd()


enter_project_root()

import importlib
from IPython.display import display, Markdown, Latex
import numpy as np
import pandas as pd
# Lift the column-width limit so long string cells are rendered in full
# instead of being truncated when DataFrames are displayed.
pd.options.display.max_colwidth = None

from src.utils import tree_utils
from src.features import feature_utils

### Load the maximum likelihood subtrees for Subj

In [2]:
# Load the trees for the "subj" dataset from data/subj. The loader prints a
# "parsed X/Y trees" line for each set of trees it reads.
data = feature_utils.load_trees(
    dataset="subj",
    output_dir="data/subj",
)

parsed 5992/5996 trees
parsed 1999/2000 trees


### Enumerate all of the subtrees and sort by mutual information

In [7]:
# Development aid: re-import the project helper modules so that edits made to
# them on disk take effect without restarting the kernel. importlib.reload
# returns the reloaded module, so the names are rebound to its result.
tree_utils = importlib.reload(tree_utils)
feature_utils = importlib.reload(feature_utils)

# Show the reloaded feature_utils module as the cell's output.
feature_utils

<module 'src.features.feature_utils' from '/n/fs/nlp-df22/project/ShortcutGrammar/src/features/feature_utils.py'>

In [9]:
# Run add_features on `data` under the feature name "Subtrees", using
# feature_utils.pcfg_subtrees as the extractor, then build the subtree table.
# NOTE(review): the section header says the table is sorted by mutual
# information; that ordering presumably happens inside
# get_subtree_feature_table -- confirm there.
subtree_extractor = feature_utils.pcfg_subtrees
feature_utils.add_features(data, "Subtrees", subtree_extractor)
subtrees = feature_utils.get_subtree_feature_table(data)

# Preview the first 15 rows.
subtrees.head(15)

157459 Subtrees features


Unnamed: 0,Root,Subtree,Yield,MI,Count,Subjective,Objective,Majority label,% majority
0,46.0,(46 *),his,0.036769,730.0,110.0,620.0,Objective,0.848361
1,41.0,(41 *),he,0.026941,488.0,63.0,425.0,Objective,0.869388
2,41.0,(41 *),it,0.025076,732.0,580.0,152.0,Subjective,0.791553
3,,(34),,0.022432,398.0,49.0,349.0,Objective,0.875
4,,(),,0.022398,401.0,50.0,351.0,Objective,0.873449
5,39.0,(39 *),.,0.017274,182.0,177.0,5.0,Subjective,0.967391
6,41.0,(41 *),she,0.015951,241.0,22.0,219.0,Objective,0.90535
7,46.0,(46 *),her,0.015941,310.0,42.0,268.0,Objective,0.862179
8,38.0,(38 *),',0.014798,2069.0,1279.0,790.0,Subjective,0.618059
9,46.0,(46 *),their,0.013892,377.0,71.0,306.0,Objective,0.810026


### Create aggregated features by merging subtrees by root non-terminal and majority class label

In [10]:
# Merge the "Subtrees" features into groups stored under the name
# "Subtree groups", then fetch the table describing those groups.
feature_utils.add_merges(data, "Subtrees", merge_name="Subtree groups")
subtree_groups = feature_utils.get_merged_feature_table(data, merge_name="Subtree groups")

# Take the first 15 groups and arrange them by majority label for display.
# kind="stable" guarantees that rows sharing a label keep the relative order
# they had in the table; the default quicksort gives no such guarantee for
# ties. drop=True discards the old index instead of turning it into a column
# that the column selection below would throw away anyway.
display_columns = ["Majority label", "Root", "Examples", "Count", "% majority"]
(
    subtree_groups.head(15)
    .sort_values(by=["Majority label idx"], kind="stable")
    .reset_index(drop=True)[display_columns]
)

Unnamed: 0,Majority label,Root,Examples,Count,% majority
0,Subjective,84.0,"movie, film, cast, films, humor, comedy, performance, performances",876.0,0.84738
1,Subjective,41.0,"it, you, i, there, this, what, we",1250.0,0.778754
2,Subjective,13.0,"the movie, but it, the film, if you, if it, this film, [ [UNK] ], [ [UNK]",586.0,0.882653
3,Subjective,27.0,"a movie, the film, the movie, this movie, the screen, its [UNK], the characters, [ [UNK]",562.0,0.888298
4,Subjective,69.0,"movie, film, plot, actors, script, acting, picture, director",485.0,0.86653
5,Subjective,37.0,"so, as, too, more, pretty, less, something, nothing",776.0,0.784062
6,Subjective,64.0,"movie, film, screen, characters, bit, audience, story, plot",471.0,0.862579
7,Objective,46.0,"his, her, their, each, los, new",1449.0,0.827016
8,Objective,41.0,"he, she, they, who, things, in",1114.0,0.863799
9,Objective,27.0,"his life, his wife, his father, his mother, their lives, his family, the world, their own",1169.0,0.783091


In [11]:
def has_at_least_two_stars(feature):
    """Return True when the first element of `feature` contains two or more "*".

    NOTE(review): each "*" presumably marks one leaf slot of the subtree
    string, so this keeps subtrees that span at least two tokens -- confirm
    against feature_utils.
    """
    return feature[0].count("*") >= 2


# Build a second set of merged groups from the "Subtrees" features, restricted
# by the predicate above, with K=1000.
feature_utils.add_merges(
    data,
    "Subtrees",
    K=1000,
    merge_name="Subtree groups (len >= 2)",
    filter_=has_at_least_two_stars,
)
subtree_groups_len2 = feature_utils.get_merged_feature_table(
    data, merge_name="Subtree groups (len >= 2)"
)

# Preview the first 30 groups.
subtree_groups_len2.head(30)

filtering


Unnamed: 0,Root,Examples,MI,Count,Subjective,Objective,Majority label idx,Majority label,% majority,Support count,Counter count
0,27,"his life, his wife, his father, his mother, their lives, his family, the world, their own",0.071373,1628.0,323.0,1305.0,1,Objective,0.801227,1305.0,323.0
1,27,"a movie, the film, the movie, this movie, the screen, its [UNK], the characters, [ [UNK]",0.056368,980.0,846.0,134.0,0,Subjective,0.862525,846.0,134.0
2,13,"the movie, but it, the film, if you, if it, this film, [ [UNK] ], [ [UNK]",0.04483,702.0,624.0,78.0,0,Subjective,0.887784,624.0,78.0
3,28,"decides to, order to, "" "", has been, begins to, try to, returns to, decide to",0.031567,747.0,133.0,614.0,1,Objective,0.821095,614.0,133.0
4,2,""" "", best friend, young man, young [UNK], [UNK] girl, hit man, old man, drug dealer",0.030521,471.0,48.0,423.0,1,Objective,0.896406,423.0,48.0
5,3,"finds himself, finds out, falls in love, is [UNK], is sent, is killed, finds herself, [UNK] her",0.026083,460.0,57.0,403.0,1,Objective,0.874459,403.0,57.0
6,28,"' t, ' s, ' re, ' ll, ' s not, ' m, ' ve, ' d",0.025259,2149.0,1396.0,753.0,0,Subjective,0.649465,1396.0,753.0
7,2,"[UNK] movie, [UNK] film, running time, romantic comedy, [UNK] plot, best films, visual [UNK], good time",0.025223,293.0,279.0,14.0,0,Subjective,0.949153,279.0,14.0
8,13,"the two, when he, where he, the gang, the girls, so he, when [UNK], as they",0.019427,250.0,16.0,234.0,1,Objective,0.93254,234.0,16.0
9,1,". ., as [UNK], , too, in a way, in the right place, in its [UNK], so [UNK], than [UNK]",0.016206,444.0,359.0,85.0,0,Subjective,0.807175,359.0,85.0
