## 🧐 ZeroRel data inspection

In [1]:
import json
# Load the ZeroRel dump: one JSON object per line (JSON Lines).
# Set MAX_RECORDS to an integer (e.g. 50_000) to read only the first N
# records while iterating quickly; None loads the whole file.
MAX_RECORDS = None

data = []
# Explicit UTF-8: the default encoding of open() is platform dependent and
# the texts contain non-ASCII characters.
with open('data/zero_rel_all.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        if MAX_RECORDS is not None and len(data) >= MAX_RECORDS:
            break
        if not line.strip():
            # Tolerate blank/trailing lines instead of crashing in json.loads.
            continue
        data.append(json.loads(line))

In [2]:
# relation labels in ZeroRel
#
# Builds two lookups over every relation of every record in `data`:
#   relationship_counts:     relation_text -> number of occurrences
#   raw_relationship_string: relation_text -> set of the raw_relation_text
#                            values that were seen together with that label

relationship_counts = {}
raw_relationship_string = {}

for item in data:
    for relation in item['relations']:
        relation_text = relation['relation_text']
        # dict.get / setdefault replace the manual "is the key there yet?" branches.
        relationship_counts[relation_text] = relationship_counts.get(relation_text, 0) + 1
        raw_relationship_string.setdefault(relation_text, set()).add(relation['raw_relation_text'])

# Most frequent labels first.
sorted_relationship_counts = sorted(relationship_counts.items(), key=lambda x: x[1], reverse=True)

print(f"Number of unique labels: {len(sorted_relationship_counts)}")
sorted_relationship_counts[:50]

Number of unique labels: 907613


[('no relation', 6221819),
 ('location of', 568006),
 ('member of', 324213),
 ('located in', 197911),
 ('location in', 192124),
 ('location', 191613),
 ('created by', 140518),
 ('author of', 136418),
 ('hosted by', 132378),
 ('self reference', 122550),
 ('part of', 120329),
 ('worked for', 119371),
 ('reported by', 108792),
 ('produced by', 103547),
 ('supports', 101075),
 ('same person', 94551),
 ('collaborator', 90389),
 ('mentioned in', 87205),
 ('played for', 86813),
 ('child of', 77621),
 ('educated at', 77521),
 ('lives in', 77299),
 ('supported by', 74596),
 ('produces', 71913),
 ('born in', 71197),
 ('written by', 68053),
 ('head of', 66121),
 ('subject of', 65914),
 ('owned by', 65763),
 ('belongs to', 63511),
 ('publisher', 62676),
 ('funding', 62132),
 ('sponsor', 58444),
 ('featured in', 57847),
 ('founder', 56663),
 ('designed by', 56451),
 ('hosts', 55946),
 ('published in', 55660),
 ('performs at', 55081),
 ('leads', 53910),
 ('owner', 53724),
 ('inspired by', 52862),
 (

In [3]:
# Sample of the raw generations that were parsed into the label 'lives in'.
# The values are stored in a set, so which 20 are shown is arbitrary.
[*raw_relationship_string['lives in']][:20]

['    LIVES_IN. This is because the text states that the given monthly income would support',
 "    LIVES_IN. This is because the head 'I' is the speaker and the",
 '    LIVES_IN\n\n    Explanation:\n    The text states that Julie',
 '    LIVES_IN\n\n    Explanation:\n    Ken Oliver is a person',
 '    LIVES_IN\n    OR, COULD_LIVE_IN\n   ',
 '    LIVES_IN\n\n    Explanation:\n    The text describes Brooke',
 '    LIVES_IN\n\n    Explanation:\n    The text states that El',
 "    LIVES_IN. This is incorrect, but it's a common mistake when dealing",
 '    LIVES_IN  # Selvaggia Baros lives in Chicago, but the',
 '    LIVES_IN\n    OR:\n    LOCATED_IN\n    (',
 '    LIVES_IN\n\n    Explanation:\n    Terry is a member of',
 "    LIVES_IN. This is because the text states that 'Bess lives in Boston",
 '    LIVES_IN\n    ORGANIZED_BY\n    ORGAN',
 '"Lives_in"',
 '    LIVES_IN\n\n    Explanation:\n    Megan Buskey lives',
 '    LIVES_IN\n\n    Explanation:\n    Vasilisa lives in',
 '    LIVES_IN

In [4]:
# Labels ordered from the shortest label text to the longest (ascending is
# the default for sorted()), to surface suspiciously short labels first.
sorted_relationship_lengths = sorted(
    relationship_counts.items(),
    key=lambda label_and_count: len(label_and_count[0]),
)
sorted_relationship_lengths[:500]

[('a', 383),
 ('i', 451),
 ('o', 208),
 ('by', 205),
 ('pr', 536),
 ('in', 579),
 ('vs', 822),
 ('at', 215),
 ('is', 272),
 ('eq', 312),
 ('mr', 215),
 ('it', 281),
 ('if', 1032),
 ('dj', 308),
 ('ms', 237),
 ('dr', 330),
 ('vp', 240),
 ('gm', 205),
 ('lp', 262),
 ('the', 51099),
 ('coo', 334),
 ('met', 7660),
 ('saw', 4035),
 ('win', 4121),
 ('age', 1040),
 ('son', 890),
 ('won', 3311),
 ('ceo', 5767),
 ('had', 712),
 ('tom', 249),
 ('tie', 398),
 ('led', 673),
 ('hit', 1244),
 ('see', 430),
 ('via', 224),
 ('foe', 344),
 ('fan', 208),
 ('cfo', 276),
 ('buy', 220),
 ('rand', 5),
 ('self', 41646),
 ('prep', 234),
 ('buys', 4563),
 ('owns', 11465),
 ('from', 1254),
 ('like', 1931),
 ('type', 287),
 ('uses', 15157),
 ('head', 10947),
 ('is a', 12042),
 ('said', 3577),
 ('near', 41752),
 ('vote', 3),
 ('beat', 8766),
 ('call', 156),
 ('used', 272),
 ('sold', 3610),
 ('love', 701),
 ('seed', 3),
 ('says', 1472),
 ('seen', 352),
 ('hmrc', 3),
 ('gift', 1297),
 ('asks', 343),
 ('emer', 1),
 

In [42]:
# Sample of the raw generations that were parsed into the label 'pr'.
# The values are stored in a set, so which 10 are shown is arbitrary.
[*raw_relationship_string['pr']][:10]

['    PR:LOCATION\n    or\n    ORG:HOSTED_BY\n    or',
 '    PR: GOVERNMENT\n    OR: GOVERNOR\n    The relation between',
 '    PR:provided_information',
 '    PR:11_INVESTIGATED_BY',
 '    PR:AT_LOCATION\n    or\n    PR:AT_TIME\n    depending',
 '    PR:PERSON_LOCATION_OF_ORIGIN\n    OR: PR:',
 '    PR:MENTIONED_BY',
 '    PR:EVENT_LOCATION\n    OR:\n    NO_RELATION',
 '    PR:PERFORMER_ORGANIZATION',
 '    PR:associated_with']

In [28]:
# Index into `data` of the sample record inspected by the following cells.
i = 4

In [29]:
# First five NER entries of the selected record.
data[i]['ner'][:5]

[[2, 3, 'PERSON', 'Cassilyn Anderson'],
 [5, 6, 'DATE', 'The day'],
 [7, 7, 'ORG', 'LANY'],
 [13, 15, 'DATE', 'an official holiday'],
 [17, 17, 'GPE', 'Utah']]

In [30]:
# Total number of NER entries in the selected record.
len(data[i]['ner'])

12

In [31]:
# Rebuild the record's text by joining its tokens with single spaces.
print(" ".join(data[i]['tokenized_text']))

By : Cassilyn Anderson 
 The day LANY comes to town should be an official holiday in Utah because it ’s a day that every teen and young adult looks forward to . This last Tuesday , LANY made a stop at The Great Saltair in Magna . Fans had been anxiously waiting outside of the venue since as early as Sunday morning in hopes to be squished against the barrier to be as close as possible to their favorite band . Many fans made new friends with each other as they waited for hours on end in line .


In [32]:
# Last three relations of the selected record that are labelled 'no relation'.
no_relation_rels = [
    rel
    for rel in data[i]['relations']
    if rel['relation_text'] == 'no relation'
]
no_relation_rels[-3:]

[{'head': {'mention': 'hours', 'position': [94, 94], 'type': 'TIME'},
  'tail': {'mention': 'a day', 'position': [21, 22], 'type': 'DATE'},
  'relation_text': 'no relation',
  'raw_relation_text': 'no relation'},
 {'head': {'mention': 'hours', 'position': [94, 94], 'type': 'TIME'},
  'tail': {'mention': 'This last Tuesday',
   'position': [33, 35],
   'type': 'DATE'},
  'relation_text': 'no relation',
  'raw_relation_text': 'no relation'},
 {'head': {'mention': 'hours', 'position': [94, 94], 'type': 'TIME'},
  'tail': {'mention': 'as early as Sunday morning',
   'position': [58, 62],
   'type': 'TIME'},
  'relation_text': 'no relation',
  'raw_relation_text': 'no relation'}]

In [33]:
# Collect the ordered entity pairs of the selected record that have no entry
# in its relations. Self-pairs (an entity paired with itself) are expected to
# show up here.

sample = data[i]

# (head span, tail span) of every relation present in the record, where a
# span is the first two elements of the mention's 'position'.
seen_rels = {
    (
        (rel['head']['position'][0], rel['head']['position'][1]),
        (rel['tail']['position'][0], rel['tail']['position'][1]),
    )
    for rel in sample['relations']
}

# Every ordered pair of NER entries whose spans were not seen above.
exception = [
    (ent1, ent2)
    for ent1 in sample['ner']
    for ent2 in sample['ner']
    if ((ent1[0], ent1[1]), (ent2[0], ent2[1])) not in seen_rels
]

In [34]:
# Peek at the first four entity pairs that have no relation entry.
exception[:4]

[([2, 3, 'PERSON', 'Cassilyn Anderson'],
  [2, 3, 'PERSON', 'Cassilyn Anderson']),
 ([5, 6, 'DATE', 'The day'], [5, 6, 'DATE', 'The day']),
 ([7, 7, 'ORG', 'LANY'], [7, 7, 'ORG', 'LANY']),
 ([13, 15, 'DATE', 'an official holiday'],
  [13, 15, 'DATE', 'an official holiday'])]

In [35]:
# Number of entity pairs that have no relation entry.
len(exception)

12

In [36]:
# A record with n entities has n * n ordered entity pairs (self-pairs included).
num_entities = len(data[i]['ner'])
should_be_rels = num_entities * num_entities
print(f"{should_be_rels} entity-entity pairs")

# The last number is the actual relation count of the record, printed so it
# can be compared by eye with (pairs - uncovered pairs).
print(f"So should be {should_be_rels} (entity pairs) - {len(exception)} (self-pairs) => {len(data[i]['relations'])}")

144 entity-entity pairs
So should be 144 (entity pairs) - 12 (self-pairs) => 132
