In [1]:
import os

# The imports below (`src.utils`, `src.features`) and the data path used later
# (`data/imdb`) are resolved relative to the working directory, so when the
# kernel starts inside a `notebooks/` directory we step up one level.
# The final path component is compared exactly: a plain suffix test on the
# whole path would also fire for directories such as `my_notebooks` and move
# the kernel one level too far up.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")

import importlib
from IPython.display import display, Markdown, Latex
import numpy as np
import pandas as pd
# Display configuration for the feature tables rendered below.
# max_colwidth=None: do not truncate long cell text (the "Examples" and
# template columns hold long strings).
pd.options.display.max_colwidth = None
# Allow up to 500 rows to be rendered before pandas truncates the output.
pd.options.display.max_rows = 500

from src.utils import tree_utils
from src.features import feature_utils

### Load the maximum-likelihood trees for IMDB

In [2]:
# Load the IMDB trees from data/imdb (a path relative to the current working
# directory). While loading, the helper prints how many trees were parsed and
# how many predictions were read. The result is indexed by split name below
# (e.g. data["dev"]).
data = feature_utils.load_trees(output_dir="data/imdb", dataset="imdb")

parsed 210556/213948 trees
21515 imdb predictions
parsed 105187/106816 trees
10837 imdb predictions


In [3]:
# Print one tree from the dev split (element [0][0] of data["dev"]["tree"]) to
# show the bracketed format: each node is written as
# `(<non-terminal id> <children or token>)`.
print(data["dev"]["tree"][0][0])

(29
  (31
    (9
      (19
        (32 the)
        (21 (36 start) (24 (64 of) (15 (32 the) (48 film)))))
      (39 was))
    (25 (95 rather) (92 slow)))
  (17
    (14
      (58 ,)
      (22
        (9
          (19
            (32 the)
            (13 (30 (90 fake) (34 -)) (13 (46 looking) (38 gore))))
          (39 not))
        (25 (67 much) (24 (64 of) (41 interest)))))
    (66 .)))


### Enumerate all of the subtrees and sort them by mutual information

In [4]:
# Build the subtree enumerator first so its argument is easy to spot.
# NOTE(review): the 7 is presumably the maximum subtree depth - confirm against
# feature_utils.pcfg_subtrees_by_depth.
subtree_enumerator = feature_utils.pcfg_subtrees_by_depth(7)

# Register the enumerated subtrees on `data` under the feature name "Subtrees";
# the helper prints the number of features it added.
feature_utils.add_features(data, "Subtrees", subtree_enumerator)

# Summary table with one row per subtree; show the first ten rows.
subtrees = feature_utils.get_subtree_feature_table(data)
subtrees.head(10)

2354296 Subtrees features


Unnamed: 0,Root,Subtree,Yield,MI,Count,Negative,Positive,Majority label,% majority
0,35,(35 *),worst,0.033091,1480.0,1390.0,90.0,Negative,0.938596
1,92,(92 *),bad,0.029023,2455.0,2034.0,421.0,Negative,0.828246
2,46,(46 *),great,0.017171,3731.0,1139.0,2592.0,Positive,0.694616
3,46,(46 *),bad,0.0147,2163.0,1636.0,527.0,Negative,0.75612
4,66,(66 *),?,0.014287,4334.0,2914.0,1420.0,Negative,0.672279
5,36,(36 *),waste,0.013073,536.0,515.0,21.0,Negative,0.959108
6,68,(68 *),t,0.011613,11962.0,6860.0,5102.0,Negative,0.57347
7,37,(37 *),no,0.011447,4197.0,2762.0,1435.0,Negative,0.658014
8,35,(35 *),best,0.011404,2290.0,663.0,1627.0,Positive,0.710297
9,93,(93 *),waste,0.010748,500.0,470.0,30.0,Negative,0.938247


### Create aggregated features by merging subtrees that share a root non-terminal and majority class label

In [5]:
# Merge the "Subtrees" features into groups stored under "Subtree groups".
# The filter keeps only entries whose first element contains at least two `*`
# placeholders, i.e. subtrees spanning two or more tokens, so single-word
# features are excluded from the merge.
# NOTE(review): `feature[0]` is presumably the subtree template string and K
# presumably limits how many subtrees are considered - confirm in add_merges.
feature_utils.add_merges(
    data,
    "Subtrees",
    K=2000,
    merge_name="Subtree groups",
    filter_=lambda feature: feature[0].count("*") >= 2,
)

filtering


In [6]:
# Show the first 30 rows of the summary table for the "Subtree groups" merge.
feature_utils.get_merged_feature_table(data, merge_name="Subtree groups").head(30)

Unnamed: 0,Root,Examples,MI,Count,Negative,Positive,Majority label idx,Majority label,% majority,Support count,Counter count
0,25,"so bad, waste your time, not funny, even worse, the worst movie, a waste of time, that bad, as bad",0.074647,7056.0,5477.0,1579.0,0,Negative,0.776141,5477.0,1579.0
1,25,"highly recommended, a must see, a great movie, a great job, a great film, very well, very good, pleasantly surprised",0.04328,5676.0,1495.0,4181.0,1,Positive,0.736527,4181.0,1495.0
2,10,"waste of time, bad movie, good thing, terrible movie, good idea, horror movie, complete waste of time, awful movie",0.032665,2210.0,1911.0,299.0,0,Negative,0.864376,1911.0,299.0
3,10,"must see, great job, great movie, great film, wonderful movie, excellent movie, must - see, excellent job",0.031599,3188.0,675.0,2513.0,1,Positive,0.788088,2513.0,675.0
4,31,"don ' t waste your time, i mean, don ' t bother, it fails, it was so bad, it seemed, this movie sucks, it tries",0.028956,2602.0,2130.0,472.0,0,Negative,0.818356,2130.0,472.0
5,18,"at all, at all costs, at least, at best, than this, to be funny, to work with, a 2",0.028706,7997.0,5280.0,2717.0,0,Negative,0.660208,5280.0,2717.0
6,21,"worst movie, worst film, worst movies, piece of crap, worst films, only good thing, only thing, whole movie",0.027346,3039.0,2389.0,650.0,0,Negative,0.785926,2389.0,650.0
7,31,"i loved it, i recommend it, i love this movie, i loved this movie, i highly recommend it, this is a great movie, i enjoyed it, i first saw this movie",0.025799,1569.0,191.0,1378.0,1,Positive,0.877785,1378.0,191.0
8,18,"on dvd, as well, in love, at the same time, for everyone, so well, very well, for the first time",0.023037,4711.0,1436.0,3275.0,1,Positive,0.695099,3275.0,1436.0
9,30,"well -, must -, heart -, fun ,, fun and, very entertaining, all time, funny ,",0.021321,2801.0,683.0,2118.0,1,Positive,0.755976,2118.0,683.0


Print out the top subtree groups with root node 29, merged by template:

In [7]:
# Merge only the subtrees rooted at non-terminal 29, grouping them by template
# (by_template=True), and store the result under "Root 29".
# The trailing space in the prefix "(29 " matters: it stops the filter from
# matching any longer root id that merely begins with the digits 29.
# NOTE(review): `feature[0]` is presumably the subtree template string - confirm.
feature_utils.add_merges(
    data,
    "Subtrees",
    merge_name="Root 29",
    K=1000,
    by_template=True,
    filter_=lambda feature: feature[0].startswith("(29 "),
)

filtering


In [8]:
# Show the first 20 rows of the summary table for the "Root 29" merge.
feature_utils.get_merged_feature_table(data, merge_name="Root 29").head(20)

Unnamed: 0,Root,Examples,MI,Count,Negative,Positive,Majority label idx,Majority label,% majority,Support count,Counter count
0,(29 (16 (69 *) (34 *)) (69 *)),"10 / 10, 8 / 10, 7 / 10, 9 / 10, : - d, 5 / 5",0.007876,266.0,3.0,263.0,1,Positive,0.985075,263.0,3.0
1,(29 (16 (69 *) (34 *)) (69 *)),"4 / 10, 3 / 10, 1 / 10, 2 / 10, 0 / 10, 2 / 5",0.005995,195.0,195.0,0.0,0,Negative,0.994924,195.0,0.0
2,(29 (31 (59 *) (7 (93 *) (15 (32 *) (48 *)))) (66 *)),"i loved this movie ., i love this movie ., i enjoyed this film ., i love this movie !, i liked this movie ., i love this film ., i loved this movie !, i enjoyed this movie .",0.002245,74.0,0.0,74.0,1,Positive,0.986842,74.0,0.0
3,(29 (25 (95 *) (92 *)) (66 *)),"highly recommended ., very funny ., beautifully done ., not many ., really funny ., really good ., totally empty ., very effective .",0.002148,71.0,0.0,71.0,1,Positive,0.986301,71.0,0.0
4,(29 (31 (9 (19 (32 *) (48 *)) (39 *)) (73 *)) (66 *)),"this movie was horrible ., this film is awful ., the acting is atrocious ., this movie was terrible ., the acting was terrible ., this movie is terrible ., this movie was awful ., this movie is bad .",0.002075,71.0,71.0,0.0,0,Negative,0.986301,71.0,0.0
5,(29 (31 (9 (19 (32 *) (48 *)) (39 *)) (73 *)) (66 *)),"the acting is superb ., this movie is wonderful ., this movie was excellent ., the acting is excellent ., this movie is great !, this movie is great ., the music is good ., the acting was great .",0.00162,60.0,1.0,59.0,1,Positive,0.967742,59.0,1.0
6,(29 (16 (69 *) (16 (72 *) (64 *))) (16 (69 *) (66 *))),"10 out of 10 ., 7 out of 10 ., 8 out of 10 ., 9 out of 10 ., 10 out of 10 !",0.001506,51.0,0.0,51.0,1,Positive,0.981132,51.0,0.0
7,(29 (31 (9 (19 (54 *) (56 *)) (39 *)) (73 *)) (66 *)),"my vote is eight ., my vote is seven ., my vote is nine ., my vote is ten .",0.001187,41.0,0.0,41.0,1,Positive,0.976744,41.0,0.0
8,(29 (16 (69 *) (34 *)) (16 (69 *) (66 *))),"2 / 10 ., 4 / 10 ., 3 / 10 ., 1 / 10 ., 0 / 10 ., ho - hum .",0.001176,42.0,42.0,0.0,0,Negative,0.977273,42.0,0.0
9,(29 (31 (59 *) (7 (77 *) (44 *))) (66 *)),"i loved it ., i loved it !, go see it !, i recommend it ., just watch it ., i love it ., go see it ., i enjoyed it .",0.001146,65.0,6.0,59.0,1,Positive,0.895522,59.0,6.0


Print out the subtree groups with root nodes 5 and 8:

In [9]:
# Merge only the subtrees rooted at non-terminal 5 or 8 and store the result
# under "Root 5/8". str.startswith accepts a tuple of prefixes and returns True
# if any of them matches. The trailing space in each prefix keeps roots such as
# 54 or 58 from being matched by accident.
# NOTE(review): `feature[0]` is presumably the subtree template string - confirm.
feature_utils.add_merges(
    data,
    "Subtrees",
    merge_name="Root 5/8",
    K=100,
    filter_=lambda feature: feature[0].startswith(("(5 ", "(8 ")),
)

filtering


In [10]:
# Show the full summary table for the "Root 5/8" merge (no .head() limit).
feature_utils.get_merged_feature_table(data, merge_name="Root 5/8")

Unnamed: 0,Root,Examples,MI,Count,Negative,Positive,Majority label idx,Majority label,% majority,Support count,Counter count
0,5,"walter matthau, james stewart, jon voight, william powell, philo vance, victor mclaglen, frank sinatra, day - lewis",0.01542,946.0,112.0,834.0,1,Positive,0.880802,834.0,112.0
1,8,"30 minutes, 10 minutes, 90 minutes, five minutes, 2 hours, 20 minutes, any sense, 15 minutes",0.011133,1480.0,1144.0,336.0,0,Negative,0.772605,1144.0,336.0
2,5,"ed wood, steven seagal, van damme, uwe boll, tom savini, ajay devgan, claude van damme, jessica simpson",0.004739,200.0,192.0,8.0,0,Negative,0.955446,192.0,8.0
3,8,"many years, few movies, most people, many ways",0.000986,312.0,101.0,211.0,1,Positive,0.675159,211.0,101.0
