In [1]:
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import norm
from collections import defaultdict
from scipy import spatial
from IPython.display import HTML, display
import tabulate
import json
import pickle
import re
import nltk

# Dataset Prep

### 1. Test set 

In [2]:
df = pd.read_excel("InputTestSet-Reviews48_Ann_NEW.xlsx")

In [3]:
df.head()

Unnamed: 0,UID,PID,Dec,Sent,MComp,Cat,SubCat
0,0,2019_SJf_XhCqKm,Reject,The authors propose to use k-DPP to select a s...,0,,
1,1,2019_SJf_XhCqKm,Reject,"This paper covers the related work nicely, wit...",0,,
2,2,2019_SJf_XhCqKm,Reject,The rest of the paper are also clearly written.,0,,
3,3,2019_SJf_XhCqKm,Reject,"However, I have some concerns about the propos...",0,,
4,4,2019_SJf_XhCqKm,Reject,"- It is not clear how to define the kernel, th...",0,,


In [4]:
df.shape

(1505, 7)

In [5]:
gt_dict = {}

for i in range(0, df.shape[0]):
    pid = df.loc[i]["PID"]
    if not pid in gt_dict:
        gt_dict[pid] = {"dec": df.loc[i]["Dec"], "mcomp": set(), "not_mcomp": set()}
    if df.loc[i]["MComp"] == 1:
        gt_dict[pid]["mcomp"].add(df.loc[i]["UID"])
    else:
        gt_dict[pid]["not_mcomp"].add(df.loc[i]["UID"])

In [6]:
stats_dict = {"Accept": [0, 0], "Reject": [0, 0]}

for k, v in gt_dict.items():
    #print(len(v["mcomp"]), len(v["not_mcomp"]), v["dec"])
    stats_dict[v["dec"]][0] += len(v["mcomp"])
    stats_dict[v["dec"]][1] += len(v["not_mcomp"])
    
print(stats_dict)

{'Accept': [49, 643], 'Reject': [68, 745]}


In [7]:
test_set = list(gt_dict.keys())
print("TestSet length: %d\n"%len(test_set), test_set)

TestSet length: 32
 ['2019_SJf_XhCqKm', '2017_Bk0MRI5lg', '2020_SyevYxHtDB', '2018_rJBiunlAW', '2020_rkltE0VKwH', '2018_Hki-ZlbA-', '2019_BJx0sjC5FX', '2020_r1e_FpNFDr', '2020_B1lsXREYvr', '2018_SkZxCk-0Z', '2019_rJzoujRct7', '2018_HkfXMz-Ab', '2017_BJ9fZNqle', '2019_SyxZJn05YX', '2017_B1ckMDqlg', '2017_HJ0NvFzxl', '2017_S1_pAu9xl', '2018_SyYYPdg0-', '2017_BJAA4wKxg', '2019_HyVxPsC9tm', '2019_HylTBhA5tQ', '2019_B1l08oAct7', '2018_H135uzZ0-', '2017_H1oyRlYgg', '2017_r1y1aawlg', '2020_r1eX1yrKwB', '2020_Byg79h4tvB', '2019_H1lFZnR5YX', '2020_BkeWw6VFwr', '2018_HyHmGyZCZ', '2018_HyUNwulC-', '2020_HkgsPhNYPS']


In [8]:
for k in test_set:
    print('{:20}{}'.format(k, gt_dict[k]["mcomp"]))

2019_SJf_XhCqKm     {39, 17, 27, 28, 30}
2017_Bk0MRI5lg      {57}
2020_SyevYxHtDB     {76, 87}
2018_rJBiunlAW      {108, 110, 112, 113, 124, 126}
2020_rkltE0VKwH     {160, 184, 155, 157, 159}
2018_Hki-ZlbA-      {267, 235, 236, 271}
2019_BJx0sjC5FX     {292, 287}
2020_r1e_FpNFDr     {312, 322, 315, 308}
2020_B1lsXREYvr     {376, 401}
2018_SkZxCk-0Z      {449, 443, 445, 486}
2019_rJzoujRct7     {518, 519}
2018_HkfXMz-Ab      {573, 566}
2017_BJ9fZNqle      {627, 623, 615}
2019_SyxZJn05YX     {672, 673, 657, 669, 671}
2017_B1ckMDqlg      {714, 707}
2017_HJ0NvFzxl      {739}
2017_S1_pAu9xl      {806, 808, 809, 810, 792}
2018_SyYYPdg0-      {834, 867, 868, 869, 870, 872, 873, 844, 830}
2017_BJAA4wKxg      {884}
2019_HyVxPsC9tm     {931, 933, 905, 909, 912, 913, 919, 926}
2019_HylTBhA5tQ     {972, 950}
2019_B1l08oAct7     {994, 996, 1064, 1004, 1007, 1044, 1047, 1048, 1055}
2018_H135uzZ0-      {1072, 1079}
2017_H1oyRlYgg      set()
2017_r1y1aawlg      {1125, 1162, 1100, 1102, 1168}
2020_r1eX

In [9]:
sents_for_test = defaultdict(list)

for i in range(0, df.shape[0]):
    pid = df.loc[i]["PID"]
    test_sent_raw = str(df.loc[i]["Sent"])
    
    # Replace URLs with [URL]
    test_sent_raw = re.sub(r'http[s]?://[a-zA-z\.\-/0-9~]*', "[URL]", test_sent_raw)
    test_sent_raw = re.sub(r'papers.nips.cc/paper/[a-zA-z\.\-/0-9~]*', "[URL]", test_sent_raw)
    test_sent_raw = re.sub(r'arxiv.org/[a-zA-z\.\-/0-9~]*', "[URL]", test_sent_raw)
    
    sents_for_test[pid].append((df.loc[i]["UID"], test_sent_raw))

### 2. Train Set

In [10]:
df_train = pd.read_excel("InputTrainSet-Reviews7_Ann.xlsx")
df_train.head()

Unnamed: 0,UID,PID,Dec,Sent,MComp,Cat,SubCat
0,243,2020_ryen_CEFwr,Reject,It extends this approach by introducing an add...,0,,
1,179,2018_H1LAqMbRW,Reject,"Experimentally, the results are rather weak co...",0,,
2,157,2017_HyTqHL5xg,Accept,The experiments are interesting but I'm still ...,0,,
3,146,2017_HyTqHL5xg,Accept,Section 2.2 says they do the latter in the int...,0,,
4,90,2017_ByToKu9ll,Reject,4)This paper proposed an improved version of t...,0,,


In [11]:
train_sets = {"mcomp": [], "non_mcomp": []}

In [12]:
for i in range(0, df_train.shape[0]):
    pid = df_train.loc[i]["PID"]
    train_sent_raw = str(df_train.loc[i]["Sent"])
    
    type_comp = df_train.loc[i]["MComp"]
    
    if type_comp == 1:
        train_sets["mcomp"].append(train_sent_raw)
    else:
        train_sets["non_mcomp"].append(train_sent_raw)

In [13]:
len(train_sets["mcomp"]), len(train_sets["non_mcomp"])

(26, 270)

## Load entities

In [14]:
with open("entities_dict_smaller", "r") as f:
    entity_dict = json.load(f)

In [15]:
set(entity_dict.values())

{'Material', 'Method', 'Metric', 'Task'}

In [16]:
list(entity_dict.items())[0:20]

[('convolutional neural networks', 'Method'),
 ('convnets', 'Method'),
 ('recognition', 'Task'),
 ('visual recognition tasks', 'Task'),
 ('age estimation', 'Task'),
 ('head pose estimation', 'Task'),
 ('multi - label classification', 'Task'),
 ('semantic segmentation', 'Task'),
 ('classification', 'Task'),
 ('deep convnets', 'Method'),
 ('dldl', 'Method'),
 ('feature learning', 'Task'),
 ('deep learning', 'Method'),
 ('image classification', 'Task'),
 ('deep learning methods', 'Method'),
 ('image classification tasks', 'Task'),
 ('human pose estimation', 'Task'),
 ('convnet', 'Method'),
 ('recognition tasks', 'Task'),
 ('ensemble', 'Method')]

In [17]:
entity_key_map = {}
for i in entity_dict:
    s = re.sub('[^0-9a-zA-Z,:;.?!\- ]+', '', i)
    while s.find("  ") > -1:
        s = s.replace("  ", " ")
    if len(s) > 2:
        cl = re.sub('[^0-9a-zA-Z ]+', '', i)
        while cl.find("  ") > -1:
            cl = cl.replace("  ", " ")
        entity_key_map[cl.strip()] = i
print(len(entity_key_map))

1784


In [18]:
coun = 0
for i in entity_dict:
    if len(i) < 5:
        coun +=1
#         print(i)
print(coun)

212


In [19]:
list(entity_key_map.items())[0:5]

[('convolutional neural networks', 'convolutional neural networks'),
 ('convnets', 'convnets'),
 ('recognition', 'recognition'),
 ('visual recognition tasks', 'visual recognition tasks'),
 ('age estimation', 'age estimation')]

In [20]:
from collections import Counter
c = Counter(entity_dict.values())
c

Counter({'Method': 1191, 'Task': 289, 'Metric': 158, 'Material': 165})

In [21]:
# dir(c)
reverse_map = defaultdict(list)

for k, v in entity_dict.items():
    reverse_map[v].append(k)

In [22]:
# for i in reverse_map["Task"]:
#     print(i)

In [23]:
"MNIST" in entity_key_map, "mnist" in entity_key_map

(False, True)

## A. SciBERT

In [24]:
import spacy
import torch

In [25]:
!pip3.7 list | grep -E 'transformers|tokenizers'

spacy-transformers            0.6.2
tokenizers                    0.7.0
transformers                  2.9.0
You should consider upgrading via the '/usr/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [26]:
from transformers import AutoTokenizer, AutoModel

In [27]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_cased")

In [28]:
import transformers
print(transformers.__version__)

2.9.0


In [29]:
def embed_text_using_roberta(text):
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    return last_hidden_states

In [30]:
def mask_entities(sentence, replace_with_dataset=True):
    cleaned_sent = re.sub('[^0-9a-zA-Z,:;.?!\- ]+', ' ', sentence)
    while cleaned_sent.find("  ") > -1:
        cleaned_sent = cleaned_sent.replace("  ", " ")
    
    entity_key_map_keys = list(entity_key_map.keys()) # As we will be dunamically adding entries to this dict an dthat will throw an error.
    entities_found = []
    for i in entity_key_map_keys:
        if cleaned_sent.find(" " + i + " ") > -1:
            entities_found.append(i)
        elif cleaned_sent.lower().find(" " + i + " ") > -1:
            found_idx = cleaned_sent.lower().find(" " + i + " ")
            entity_dict[cleaned_sent[found_idx:found_idx+len(" " + i + " ")]] = entity_dict[i]
            entity_key_map[cleaned_sent[found_idx:found_idx+len(" " + i + " ")]] = entity_key_map[i]
    
    entities_found.sort(key=lambda s: len(s))
    len_sorted_entities = entities_found.copy()
    
    subset_entities = []
    # Remove subset entities (eg: Among cnn and 3-layer-cnn, prefer the latter)
    for fe in len_sorted_entities:
        for other_ent in len_sorted_entities:
            if fe != other_ent and other_ent.find(fe) > -1:
                subset_entities.append(fe)
                break
    for se in subset_entities:
        len_sorted_entities.remove(se)
    for maxents in len_sorted_entities:
        mask_name = " " + entity_dict[entity_key_map[i]].lower() + " "
        if replace_with_dataset:
            if mask_name == " material ":
                mask_name = " dataset "
        cleaned_sent = cleaned_sent.replace(" " + maxents + " ", mask_name)
    words_cleaned = nltk.word_tokenize(cleaned_sent)
    dups_removed = [v for i, v in enumerate(words_cleaned) if i == 0 or v != words_cleaned[i-1]]
    new_dup_removed_sent = " ".join(dups_removed)
    return new_dup_removed_sent.strip()

#     #print(cleaned_sent)
#     for i in entity_key_map:
#         if cleaned_sent.find(" " + i + " ") > -1:
#             #print("Substituting ent: {} with mask: {}".format(i, entity_dict[entity_key_map[i]].lower()))
#             cleaned_sent = cleaned_sent.replace(i, entity_dict[entity_key_map[i]].lower())
#     return cleaned_sent

In [31]:
nlp = spacy.load('en_core_web_sm')
sp_toks = ["result", "method", "task", "dataset", "metric", "baseline", "fair", "unfair"]

In [32]:
def extract_chunks_using_spacy_dp(conssentence, replace_with_dataset=True):
    
    conssentence = mask_entities(conssentence, replace_with_dataset)
#     print(conssentence)
    doc = nlp(conssentence)
    verb_subtree = []

    for s in doc.sents:
#         find_special_tokens = {"compar": [], "result": [], "method": [], "technique": [], "task": [], "dataset": [], "material": [], "metric": []}
        find_special_tokens = {"compar": [], "result": [], "method": [], "baseline": [], "task": [], 
                               "dataset": [],  "metric": [], "unfair": [], "fair": []}

        for tok in s:

            if tok.text.lower().startswith("compar"):
                find_special_tokens["compar"].append(tok)
            else:
                for k in sp_toks:
                    if tok.text.lower().startswith(k):
                        find_special_tokens[k].append(tok)
                        break

        verb_tokens = []
        if find_special_tokens["compar"]:
            for t in find_special_tokens["compar"]:
#                     verb_subtree.append(t.subtree)
                if t == s.root:
                    simplified_sent = ""
                    for chh in t.lefts:
                        simplified_sent = simplified_sent + " " + chh.text
                    simplified_sent = simplified_sent + " " + t.text
                    for chh in t.rights:
                        simplified_sent = simplified_sent + " " + chh.text
#                         print("SIMP: ", simplified_sent)
                    verb_subtree.append(simplified_sent)
                else:
                    verb_subtree.append(t.subtree)
        else:
            for k in sp_toks:
                for i in find_special_tokens[k]:
                    local_vt = []
                    for j in i.ancestors:
                        if j.pos_ == "NOUN":
                            local_vt.append(j)
                    if not local_vt:
                        for j in i.ancestors:
                            if j.pos_ == "VERB":
                                local_vt.append(j)
                    verb_tokens = verb_tokens + local_vt


            for i in verb_tokens:
                verb_subtree.append(i.subtree)

    eecc = []
    for i in verb_subtree:
        if type(i) == str:
            eecc.append(i)
        else:
            local_chunk = ""
            for lcaltok in i:
                local_chunk = local_chunk + " " + lcaltok.text
            eecc.append(local_chunk)
#     if not eecc:
#         print(conssentence)
    return list(set(eecc))


### Computing vectors of the initial training pool of sentences

In [33]:
train_pool_roberta_vecs = {"mcomp": [], "non_mcomp": []}
single_train_pool_roberta_vecs = {"mcomp": [], "non_mcomp": []}

In [34]:
for i in train_sets["mcomp"]:
    mcomp_chunks_from_sent = extract_chunks_using_spacy_dp(i)
    if mcomp_chunks_from_sent:
        final_chunks = mcomp_chunks_from_sent
    else:
        final_chunks = [i]
    
    for single_chunk in final_chunks:
        vec = embed_text_using_roberta(single_chunk.strip()).mean(1).detach().numpy()
        train_pool_roberta_vecs["mcomp"].append(vec/norm(vec))
    
    collated_chunk = " ".join(final_chunks)
    vec = embed_text_using_roberta(collated_chunk.strip()).mean(1).detach().numpy()
    single_train_pool_roberta_vecs["mcomp"].append(vec/norm(vec))


for i in train_sets["non_mcomp"]:
    mcomp_chunks_from_sent = extract_chunks_using_spacy_dp(i)
    if mcomp_chunks_from_sent:
        final_chunks = mcomp_chunks_from_sent
    else:
        final_chunks = [i]
    
    for single_chunk in final_chunks:
        vec = embed_text_using_roberta(single_chunk.strip()).mean(1).detach().numpy()
        train_pool_roberta_vecs["non_mcomp"].append(vec/norm(vec))
    
    collated_chunk = " ".join(final_chunks)
    vec = embed_text_using_roberta(collated_chunk.strip()).mean(1).detach().numpy()
    single_train_pool_roberta_vecs["non_mcomp"].append(vec/norm(vec))


In [35]:
roberta_vectors = defaultdict(dict)
skip_uids = []

for pid in gt_dict:
    roberta_vectors[pid] = {}
    
    for mcs in gt_dict[pid]["mcomp"]:
        try:
            mcomp_chunks_from_sent = extract_chunks_using_spacy_dp(df.loc[mcs]["Sent"])
            if mcomp_chunks_from_sent:
                final_chunks = mcomp_chunks_from_sent
            else:
                final_chunks = [df.loc[mcs]["Sent"]]
            
            roberta_vectors[pid][mcs] = []
            for single_chunk in final_chunks:
                vec = embed_text_using_roberta(single_chunk.strip()).mean(1).detach().numpy()
                roberta_vectors[pid][mcs].append(vec / norm(vec))
        except Exception as ex:
            print(pid, mcs, df.loc[mcs]["Sent"])
            skip_uids.append(mcs)
    
    for mcs in gt_dict[pid]["not_mcomp"]:
        try:
            mcomp_chunks_from_sent = extract_chunks_using_spacy_dp(df.loc[mcs]["Sent"])
            if mcomp_chunks_from_sent:
                final_chunks = mcomp_chunks_from_sent
            else:
                final_chunks = [df.loc[mcs]["Sent"]]
            
            roberta_vectors[pid][mcs] = []
            for single_chunk in final_chunks:
                vec = embed_text_using_roberta(single_chunk.strip()).mean(1).detach().numpy()
                roberta_vectors[pid][mcs].append(vec / norm(vec))
        except Exception as ex:
            print(pid, mcs, df.loc[mcs]["Sent"])
            skip_uids.append(mcs)

2019_HyVxPsC9tm 938 2
2019_HyVxPsC9tm 940 3
2020_Byg79h4tvB 1272 [1] Conditional adversarial domain adaptation, Long et.al, in NeurIPS 2018
[2] Towards Accurate Model Selection in Deep Unsupervised Domain Adaptation, You et.al, in ICML 2019
2018_HyHmGyZCZ 1425 2


In [36]:
mcomp_sentences = {}
not_mcomp_sentences = {}

for pid in gt_dict:
    for mcs in gt_dict[pid]["mcomp"]:
        if not mcs in skip_uids:
            mcomp_sentences[mcs] = pid
    for mcs in gt_dict[pid]["not_mcomp"]:
        if not mcs in skip_uids:
            not_mcomp_sentences[mcs] = pid
print(len(mcomp_sentences), len(not_mcomp_sentences))

117 1384


In [37]:
mean_sim_with_mcomp = defaultdict(dict)
mean_sim_with_not_mcomp = defaultdict(dict)
max_sim_with_mcomp = defaultdict(dict)
max_sim_with_not_mcomp = defaultdict(dict)

mean_at_k = ["1", "3", "5", "7", "10", "26"]


for sid in mcomp_sentences:
    
    # 1. With other training set mcomp sentences
    temp_list = []    
    for init_train_vec in train_pool_roberta_vecs["mcomp"]:
        for cvec2 in roberta_vectors[mcomp_sentences[sid]][sid]:
            temp_list.append(np.inner(init_train_vec, cvec2)[0][0])
        
    sorted_temp_list = sorted(temp_list, reverse=True)
    mean_sim_with_mcomp[sid]["mean"] = np.mean(sorted_temp_list)
    max_sim_with_mcomp[sid]["max"] = max(sorted_temp_list)
    for vv in mean_at_k:
        mean_sim_with_mcomp[sid]["mean_{}".format(vv)] = np.mean(sorted_temp_list[0:int(vv)])
#         max_sim_with_mcomp[sid]["max_{}".format(vv)] = max(sorted_temp_list[0:int(vv)])
    
    
    # 2. With other training set non_mcomp sentences
    temp_list = []    
    for init_train_vec in train_pool_roberta_vecs["non_mcomp"]:
        for cvec2 in roberta_vectors[mcomp_sentences[sid]][sid]:
            temp_list.append(np.inner(init_train_vec, cvec2)[0][0])
        
    sorted_temp_list = sorted(temp_list, reverse=True)
    mean_sim_with_not_mcomp[sid]["mean"] = np.mean(sorted_temp_list)
    max_sim_with_not_mcomp[sid]["max"] = max(sorted_temp_list)
    for vv in mean_at_k:
        mean_sim_with_not_mcomp[sid]["mean_{}".format(vv)] = np.mean(sorted_temp_list[0:int(vv)])
#         max_sim_with_not_mcomp[sid]["max_{}".format(vv)] = max(sorted_temp_list[0:int(vv)])




for sid in not_mcomp_sentences:
    
    # 1. With other training set mcomp sentences
    temp_list = []    
    for init_train_vec in train_pool_roberta_vecs["mcomp"]:
        for cvec2 in roberta_vectors[not_mcomp_sentences[sid]][sid]:
            temp_list.append(np.inner(init_train_vec, cvec2)[0][0])
        
    sorted_temp_list = sorted(temp_list, reverse=True)
    mean_sim_with_mcomp[sid]["mean"] = np.mean(sorted_temp_list)
    max_sim_with_mcomp[sid]["max"] = max(sorted_temp_list)
    for vv in mean_at_k:
        mean_sim_with_mcomp[sid]["mean_{}".format(vv)] = np.mean(sorted_temp_list[0:int(vv)])
#         max_sim_with_mcomp[sid]["max_{}".format(vv)] = max(sorted_temp_list[0:int(vv)])
    
    
    # 2. With other training set non_mcomp sentences
    temp_list = []    
    for init_train_vec in train_pool_roberta_vecs["non_mcomp"]:
        for cvec2 in roberta_vectors[not_mcomp_sentences[sid]][sid]:
            temp_list.append(np.inner(init_train_vec, cvec2)[0][0])
        
    sorted_temp_list = sorted(temp_list, reverse=True)
    mean_sim_with_not_mcomp[sid]["mean"] = np.mean(sorted_temp_list)
    max_sim_with_not_mcomp[sid]["max"] = max(sorted_temp_list)
    for vv in mean_at_k:
        mean_sim_with_not_mcomp[sid]["mean_{}".format(vv)] = np.mean(sorted_temp_list[0:int(vv)])
#         max_sim_with_not_mcomp[sid]["max_{}".format(vv)] = max(sorted_temp_list[0:int(vv)])


In [38]:
def precision_at_k(sent_sim_dict, k=10, sim_type="max", mean_at=1):
    """mean_at can take values: 1, 3, 5, 7, 10, 26"""
    
    if sim_type == "max":
        sorted_sims = sorted(sent_sim_dict.items(), key=lambda x: x[1]["max"], reverse=True)
        top_k_sorted_sims = sorted_sims[0:k]
        tp = 0
        fp = 0
        for i in top_k_sorted_sims:
            if df.loc[i[0]]["MComp"] == 1:
                tp += 1
            else:
                fp += 1
        return tp/(tp+fp)
            
    elif sim_type == "mean":
        sorted_sims = sorted(sent_sim_dict.items(), key=lambda x: x[1]["mean_"+str(mean_at)], reverse=True)
        top_k_sorted_sims = sorted_sims[0:k]
        tp = 0
        fp = 0
        for i in top_k_sorted_sims:
            if df.loc[i[0]]["MComp"] == 1:
                tp += 1
            else:
                fp += 1
        return tp/(tp+fp)

In [39]:
def recall_at_k(sent_sim_dict, k=10, sim_type="max", mean_at=1):
    if sim_type == "max":
        sorted_sims = sorted(sent_sim_dict.items(), key=lambda x: x[1]["max"], reverse=True)
        top_k_sorted_sims = sorted_sims[0:k]
        tp = 0
        for i in top_k_sorted_sims:
            if df.loc[i[0]]["MComp"] == 1:
                tp += 1
        return tp/min(k, 117)
            
    elif sim_type == "mean":
        sorted_sims = sorted(sent_sim_dict.items(), key=lambda x: x[1]["mean_"+str(mean_at)], reverse=True)
        top_k_sorted_sims = sorted_sims[0:k]
        tp = 0
        for i in top_k_sorted_sims:
            if df.loc[i[0]]["MComp"] == 1:
                tp += 1
        return tp/min(k, 117)

### Based on max similarity with meaningful group and removing entries more similar to non-meaningful group

In [40]:
# DIff of max similarity of meaningful and non-meaningful sentences

diff_max = {}
preserve_non_negative_max = {}

for x in max_sim_with_mcomp:
    diff_max_sim_for_x = max_sim_with_mcomp[x]["max"] - max_sim_with_not_mcomp[x]["max"]
    diff_max[x] = {"max": diff_max_sim_for_x}
    
    if diff_max_sim_for_x > 0:
        preserve_non_negative_max[x] = {"max": max_sim_with_mcomp[x]["max"]}
    else:
        print(x)
        preserve_non_negative_max[x] = {"max": 0}

39
28
57
76
108
112
113
124
126
155
157
159
235
236
271
287
312
322
308
449
445
486
518
519
573
566
627
623
672
673
657
669
671
714
707
739
806
808
809
810
792
867
868
870
873
844
830
884
905
909
913
926
972
950
994
1004
1007
1044
1047
1048
1055
1072
1079
1125
1162
1100
1102
1177
1202
1212
1243
1268
1281
1284
1318
1333
1347
1406
1390
1426
1451
1452
1504
1464
1497
1502
0
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
18
19
21
22
23
24
25
26
29
31
32
33
34
35
36
37
38
40
41
42
43
44
45
46
48
49
50
51
52
53
54
56
59
60
61
62
64
65
66
67
68
69
70
71
72
73
74
75
77
78
79
80
81
82
83
84
85
86
88
89
90
91
92
93
94
95
96
97
98
99
100
128
101
102
103
104
105
106
107
109
111
114
115
116
117
118
119
120
121
123
125
127
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
156
158
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
185
186
187
188
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
206
207
208
209
210


In [41]:
print("P and R based on preserving non-neg max sim with initial pool of \n meaningful & non meaningful sents:")

patk = [1, 5, 10, 20, 50, 100, 117, 150, 200, 300, 400, 500, 600]
res_table = [["Precision at:"] + patk, ["Val"]]

for k in patk:
    v1 = precision_at_k(preserve_non_negative_max, k) #round(sum(i > 0 for i in diff12[val])/len(diff12[val]), 2)
    res_table[1].append(round(v1, 2))

display(HTML(tabulate.tabulate(res_table, tablefmt='html')))



ratk = [1, 5, 10, 20, 50, 100, 117, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200, 1388]
res_table = [["Recall at:"] + ratk, ["Val"]]

for k in ratk:
    v1 = recall_at_k(preserve_non_negative_max, k) #round(sum(i > 0 for i in diff12[val])/len(diff12[val]), 2)
    res_table[1].append(round(v1,2))

display(HTML(tabulate.tabulate(res_table, tablefmt='html')))

P and R based on preserving non-neg max sim with initial pool of 
 meaningful & non meaningful sents:


0,1,2,3,4,5,6,7,8,9,10,11,12,13
Precision at:,1,5,10.0,20.0,50.0,100.0,117.0,150.0,200.0,300.0,400.0,500.0,600.0
Val,0,0,0.3,0.35,0.36,0.47,0.55,0.65,0.58,0.39,0.29,0.23,0.2


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Recall at:,1,5,10.0,20.0,50.0,100.0,117.0,150.0,200,300,400,500,600,700,800,900,1000,1200,1388
Val,0,0,0.3,0.35,0.36,0.47,0.55,0.83,1,1,1,1,1,1,1,1,1,1,1


### Based on diff of max similarity with initial pool

In [42]:
print("P and R based on diff of max sim with initial pool of \n meaningful & non meaningful sents:")

patk = [1, 10, 20, 50, 100, 117, 150, 200, 300, 400, 500, 600]
res_table = [["Precision at:"] + patk, ["Val"]]

for k in patk:
    v1 = precision_at_k(diff_max, k) #round(sum(i > 0 for i in diff12[val])/len(diff12[val]), 2)
    res_table[1].append(round(v1, 2))

display(HTML(tabulate.tabulate(res_table, tablefmt='html')))



ratk = [117, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200, 1388]
res_table = [["Recall at:"] + ratk, ["Val"]]

for k in ratk:
    v1 = recall_at_k(diff_max, k) #round(sum(i > 0 for i in diff12[val])/len(diff12[val]), 2)
    res_table[1].append(round(v1,2))

display(HTML(tabulate.tabulate(res_table, tablefmt='html')))

P and R based on diff of max sim with initial pool of 
 meaningful & non meaningful sents:


0,1,2,3,4,5,6,7,8,9,10,11,12
Precision at:,1,10.0,20.0,50.0,100.0,117.0,150.0,200.0,300.0,400.0,500.0,600.0
Val,0,0.5,0.55,0.46,0.33,0.31,0.29,0.24,0.18,0.17,0.15,0.14


0,1,2,3,4,5,6,7,8,9,10,11,12,13
Recall at:,117.0,150.0,200.0,300.0,400.0,500.0,600.0,700.0,800.0,900.0,1000.0,1200.0,1388.0
Val,0.31,0.37,0.41,0.46,0.57,0.64,0.71,0.77,0.81,0.84,0.86,0.92,0.97


## SVO triple similarity

In [43]:
# DIff of max similarity of meaningful and non-meaningful sentences

diff_max = {}
preserve_non_negative_max = {}

for x in max_sim_with_mcomp:
    diff_max_sim_for_x = max_sim_with_mcomp[x]["max"] - max_sim_with_not_mcomp[x]["max"]
    diff_max[x] = {"max": diff_max_sim_for_x}
    
    if diff_max_sim_for_x > 0:
        preserve_non_negative_max[x] = {"max": max_sim_with_mcomp[x]["max"]}
    else:
        print(x)
        preserve_non_negative_max[x] = {"max": 0}

39
28
57
76
108
112
113
124
126
155
157
159
235
236
271
287
312
322
308
449
445
486
518
519
573
566
627
623
672
673
657
669
671
714
707
739
806
808
809
810
792
867
868
870
873
844
830
884
905
909
913
926
972
950
994
1004
1007
1044
1047
1048
1055
1072
1079
1125
1162
1100
1102
1177
1202
1212
1243
1268
1281
1284
1318
1333
1347
1406
1390
1426
1451
1452
1504
1464
1497
1502
0
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
18
19
21
22
23
24
25
26
29
31
32
33
34
35
36
37
38
40
41
42
43
44
45
46
48
49
50
51
52
53
54
56
59
60
61
62
64
65
66
67
68
69
70
71
72
73
74
75
77
78
79
80
81
82
83
84
85
86
88
89
90
91
92
93
94
95
96
97
98
99
100
128
101
102
103
104
105
106
107
109
111
114
115
116
117
118
119
120
121
123
125
127
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
156
158
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
185
186
187
188
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
206
207
208
209
210


In [44]:
from openie import StanfordOpenIE

with StanfordOpenIE() as client:
    text = 'Barack Obama was born in Hawaii. Richard Manning wrote this sentence.'
    print('Text: %s.' % text)
    for triple in client.annotate(text):
        print('|-', triple)

Downloading to /home/shruti/stanfordnlp_resources.


KeyboardInterrupt: 

In [48]:
grammar = r"""
  NP:
    {<.*>+}          # Chunk everything
    }<VBD|IN>+{      # Chink sequences of VBD and IN
  """
cp = nltk.RegexpParser(grammar)

In [49]:
toks = nltk.word_tokenize("This is a test.")
word_pos = nltk.pos_tag(toks)

In [50]:
print(cp.parse(word_pos))

(S (NP This/DT is/VBZ a/DT test/NN ./.))


In [51]:
nltk.sem??