In [1]:
# https://github.com/allenai/allennlp/blob/master/allennlp/pretrained.py
# https://github.com/allenai/allennlp/issues/1278
# https://github.com/huggingface/neuralcoref
from allennlp import pretrained
import os
import h5py
import json
import sys
import glob
import typing
import copy
import pandas as pd
from typing import List, Dict, Union
sys.path.append("../") # Append visdialch path
import spacy
from collections import defaultdict, Counter
from tqdm import tqdm

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# Pandas helper functions
def count_unique(df, col_name):
    """ Count unique values in a df column """
    count = df[col_name].nunique()
    return count


def get_unique_column_values(df,col_name):
    """ Returns unique values """
    return df[col_name].unique()


def get_column_stats(df,column_name,to_dict = False):
    if to_dict:
        return df[column_name].value_counts().to_dict()
    else:
        # return df[column_name].value_counts()
        c = df[column_name].value_counts(dropna=False)
        p = df[column_name].value_counts(dropna=False, normalize=True)*100
        m = pd.concat([c,p], axis=1, keys=['counts', '%'])
        return m


In [3]:
nlp = spacy.load('en_core_web_sm')
coref_model = pretrained.neural_coreference_resolution_lee_2017()
const_parser = pretrained.span_based_constituency_parsing_with_elmo_joshi_2018()

  "num_layers={}".format(dropout, num_layers))


In [4]:
path_val_data = "../../data/visdial_1.0_val.json"
path_images_root = "../../data/images/"
dense_annotations_jsonpath = "../../data/visdial_1.0_val_dense_annotations.json"
path_visdial_val = os.path.join(path_images_root, "VisualDialog_val2018")

In [5]:
json_data = json.load(open(path_val_data))

questions = json_data['data']['questions']
len(questions)

answers = json_data['data']['answers']
len(answers)

dialogs = json_data['data']['dialogs']

In [6]:
stats_dic = {
    "image_id": [],
    "pronouns": [],
    "question_list": [],
    "ellipsis": []
}
heuristic_root_cp = ["S", "SQ", "SBARQ", "SINV"]
for dialog_id in tqdm(range(len(dialogs))):
    image_id = dialogs[dialog_id]['image_id']
    dialog_for_image = dialogs[dialog_id]['dialog']
    pronouns = 0
    ellipsis = 0
    question_list = []
    for round_id in range(len(dialog_for_image)):
        question = questions[dialog_for_image[round_id]['question']]
        question_list.append(question)
        answer = answers[dialog_for_image[round_id]['answer']]
        const_results = const_parser.predict(question)
        root = const_results['trees'].replace('(','').split(" ")[0]
        if root not in heuristic_root_cp:
            ellipsis += 1
        doc = nlp(question)
        for token in doc:
            if token.pos_ == "PRON":
                pronouns += 1
                
    stats_dic["image_id"].append(image_id)        
    stats_dic["pronouns"].append(pronouns)
    stats_dic["ellipsis"].append(ellipsis)
    stats_dic["question_list"].append(question_list)
    

100%|██████████| 2064/2064 [1:04:03<00:00,  1.85s/it]


In [6]:
stats_df = pd.DataFrame.from_dict(stats_dic)
stats_df.describe()

Unnamed: 0,image_id,pronouns,ellipsis
count,2064.0,2064.0,2064.0
mean,288579.25436,3.895349,0.761628
std,167147.419799,2.218264,1.177162
min,243.0,0.0,0.0
25%,144005.75,2.0,0.0
50%,288702.5,4.0,0.0
75%,432053.25,5.0,1.0
max,580477.0,11.0,9.0


In [8]:
get_column_stats(stats_df, 'pronouns')

Unnamed: 0,counts,%
3,340,16.472868
4,336,16.27907
2,312,15.116279
5,288,13.953488
6,213,10.319767
1,208,10.077519
7,154,7.46124
0,92,4.457364
8,67,3.246124
9,32,1.550388


In [9]:
get_column_stats(stats_df, 'ellipsis')

Unnamed: 0,counts,%
0,1163,56.346899
1,539,26.114341
2,198,9.593023
3,80,3.875969
4,57,2.761628
7,10,0.484496
5,10,0.484496
6,4,0.193798
8,2,0.096899
9,1,0.04845


In [89]:
print(stats_dic)
print(stats_dic["question_list"][4])

sample_question_list = stats_dic["question_list"][4]

{'image_id': [185565, 284024, 574189, 148816, 88394, 255061, 36690, 76113, 112857, 296319, 67272, 347725, 345606, 240212, 128578, 239030, 555125, 64527, 29869, 29737, 33878, 267272, 221035, 200319, 307868, 214573, 137772, 117081, 99643, 240413, 59107, 249833, 258193, 23817, 384075, 416756, 275254, 244832, 454063, 52542, 239836, 162655, 209271, 186922], 'pronouns': [6, 4, 8, 4, 2, 6, 3, 9, 1, 6, 5, 4, 6, 2, 8, 2, 6, 10, 5, 6, 5, 2, 8, 3, 3, 5, 0, 7, 2, 5, 1, 4, 3, 8, 10, 7, 4, 5, 3, 6, 2, 8, 6, 3], 'question_list': [['is the photo in color', 'is it a professional photo', 'is it well lit', 'is it daytime', 'does this look like an adults bedroom', 'is the room clean', 'can you tell what kind of computer it is', 'is it a flat screen', "what's the desk made out of", 'is there a computer chair'], ['is this in a park', 'are there others around', 'does she have a collection bucket', 'is her hair long', 'is she wearing a dress', 'does she have shoes on', 'is there grass nearby', 'is it a sunny 

# Testing

In [10]:
sentence = "are there other trees?"

In [12]:
sentence = "is it sunny?"

In [13]:
coref_results = coref_model.predict(sentence)
const_results = const_parser.predict(sentence)


In [14]:
const_results

{'class_probabilities': [[0.9999999403953552,
   5.198524920249747e-18,
   7.491370013212872e-08,
   5.959874165229395e-15,
   7.173022826796994e-19,
   3.7213976475033994e-13,
   1.5704685409111477e-15,
   3.0739150916510305e-14,
   2.795679811783136e-13,
   3.0533648467998456e-17,
   3.5314285878689476e-17,
   1.895718300342493e-17,
   2.302231972770019e-12,
   1.5524157277063168e-14,
   5.222882890065783e-11,
   1.5695545991373239e-12,
   1.6684812643367845e-14,
   1.707952447535776e-19,
   6.068824307465571e-16,
   2.009180365483698e-12,
   2.6219100675295426e-12,
   4.965316894624261e-13,
   1.634040884372634e-12,
   1.2290072193789554e-16,
   6.505281475175165e-14,
   1.131256176194461e-16,
   9.37705706675505e-15,
   2.400986909290026e-10,
   1.1298136905202316e-12,
   2.5141989727119063e-13,
   5.4731241240502104e-12,
   7.24065078727687e-12,
   1.0379492404455348e-11,
   1.6099327588357912e-15,
   3.372206403007283e-10,
   1.39703163317198e-13,
   5.4850011522739744e-14,
   1.

In [81]:
const_results['trees']

'(SQ (VBP are) (NP (EX there)) (NP (NNS trees)) (. ?))'

In [82]:
root = const_results['trees'].replace('(','').split(" ")[0]
if root != "S":
    print(root)

SQ


In [11]:
doc = nlp(sentence)
for token in doc:
    print(token.text, token.pos_)

are VERB
there ADV
other ADJ
trees NOUN
? PUNCT


In [11]:
# from allennlp.predictors.predictor import Predictor
# predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")
# predictor.predict(
#   sentence="If I bring 10 dollars tomorrow, can you buy me lunch?"
# )

In [12]:

doc = nlp("is it other sunny day?")

for token in doc:
    print(token.text, token.pos_)

doc = nlp("It is true that Google has been in acquisition mode")

for token in doc:
    print(token.text, token.pos_)

    

is 
it 
other 
sunny 
day 
? 
It 
is 
true 
that 
Google 
has 
been 
in 
acquisition 
mode 


In [None]:
    def apply_coreference_resolution(self, sentence):
        results = self.coref_model.predict(sentence)
        # clusters => list of list of list
        clusters = results["clusters"]  # [ [[a,b],[c,d]] , [[e,f]] ]
        document = results["document"]
        for cluster_index in range(len(clusters)):
            cluster = clusters[cluster_index]
            entity_start = cluster[0][0]  # First element of cluster's first element
            entity_stop = cluster[0][1] + 1  # Second element of cluster's first element
            entity = " ".join(document[entity_start:entity_stop])
            for index in range(1, len(cluster)):
                cell = cluster[index]
                # Hack to dynamically change the cluster values
                # Make the document start as entity and remove all the other elements
                document[cell[0]] = entity
                for i in range(cell[0] + 1, cell[1] + 1):
                    document[i] = ""


In [11]:
# from spacy.lang.en import English
# nlp = English()
doc = nlp("It is true that Google has been in acquisition mode")
tokens_text = [t.text for t in doc]


for token in doc:
    print(token.text, token.pos_)

    
    
    

It 
is 
true 
that 
Google 
has 
been 
in 
acquisition 
mode 


In [10]:
tokens_text

['It',
 'is',
 'true',
 'that',
 'Google',
 'has',
 'been',
 'in',
 'acquisition',
 'mode']

In [None]:
doc = nlp(question)
for token in doc:
    if token.pos_ == "PRON":
        pronouns += 1
