In [1]:
import json
import re
import pandas as pd

In [2]:
# Load the JSON document; later cells read its 'abstract' and 'text' keys.
# JSON is UTF-8 by specification, so do not depend on the locale's default
# text encoding when opening the file.
with open('Twitter.json', encoding='utf-8') as fh:
    data = json.load(fh)

In [3]:
# Concatenate abstract and body, trim surrounding whitespace, then remove
# bracketed numeric markers such as "[12]".
# The pattern is a raw string: '\[' and '\d' are invalid escape sequences in
# an ordinary string literal and trigger a warning on modern Python.
text = re.sub(r'\[\d+\]', '', (data['abstract'] + data['text']).strip())

In [4]:
# The entities file holds one JSON object per line, hence lines=True.
entity_df = pd.read_json(
    '../wiki_crawler/entities.json',
    lines=True,
)

In [5]:
# Distinct values of the 'name' column, in order of first appearance.
name_column = entity_df['name']
entity_names = name_column.unique()

In [6]:
# Bare expression: the notebook displays the array of unique entity names.
entity_names

array(['Facebook', 'Summify', 'Snapchat', 'Periscope (app)',
       'Vine (service)', 'Instagram', 'Twitter', 'Mike Schroepfer',
       'Libra (cryptocurrency)', 'Oculus VR', 'WhatsApp',
       'Sheryl Sandberg', 'Vancouver', 'Menlo Park, California',
       'Chris Hughes', 'Dustin Moskovitz', 'Evan Spiegel', 'JavaScript',
       'Scala (programming language)', 'Ruby (programming language)',
       'Omid Kordestani', 'Java (programming language)',
       'Evan Williams (Internet entrepreneur)', 'Biz Stone',
       'Jack Dorsey', 'New York Stock Exchange', 'San Francisco',
       'Snap Inc.', 'Bobby Murphy (businessman)', 'Eduardo Saverin',
       'Mark Zuckerberg', 'Cambridge, Massachusetts', 'Nasdaq',
       'S&P 500 Index', 'S&P 100'], dtype=object)

In [7]:
# Persist the cleaned text to disk (presumably the input for the OpenIE run
# in the following cell - confirm the path that command expects).
with open('twitter.txt', mode='w') as text_file:
    text_file.write(text)

In [8]:
!rm twitter_triples.txt
!python Stanford-OpenIE-Python/main.py -f ../twitter.txt > twitter_triples.txt

In [9]:
# Read the whole extractor output into memory as one string.
with open('twitter_triples.txt') as triples_file:
    triples_raw = triples_file.read()

In [10]:
def filter_entities(cand, names=None):
    """Decide whether a triple candidate is worth keeping.

    Parameters
    ----------
    cand : sequence of str
        Candidate split into parts; index 0 is the source node, index 1 the
        edge (relation) and index 2 the target node.
    names : container of str, optional
        Allowed source-node names. When omitted, the notebook-level
        ``entity_names`` is used.

    Returns
    -------
    bool
        True when the candidate passes every filter, False otherwise.
    """
    # Blank or malformed lines split into fewer than three parts; reject them
    # up front instead of raising IndexError on cand[1] / cand[2] below.
    if len(cand) < 3:
        return False

    if names is None:
        names = entity_names

    # Only consider triples if the source node is in the list of entity names
    if cand[0] not in names:
        return False

    # Remove edges and target nodes that are too long
    if len(cand[1].split()) > 4:
        return False

    if len(cand[2].split()) > 8:
        return False

    # Remove not useful verbs. Inspect the argument itself: the previous code
    # read a notebook-global loop variable here, which only worked while the
    # caller happened to bind that global to the same list.
    if cand[1] in {'is', 'was', 'has', 'be'}:
        return False

    return True

In [11]:
# Print every line of the raw output that survives filter_entities.
# The per-line list is deliberately bound to the name `trip`.
for raw_line in triples_raw.split('\n'):
    trip = raw_line.split(', ')
    if not filter_entities(trip):
        continue
    print(trip)

['Twitter', 'is based in', 'California']
['Twitter', 'is based in', 'San Francisco']
['Twitter', 'was created', 'launched in July']
['Twitter', 'was created in', 'March 2006']
['Twitter', 'launched in', 'July']
['Twitter', 'was created', 'launched']
['Twitter', 'was created by', 'Jack Dorsey']
['Twitter', 'was largest source During', '2016 U.S. election']
['Twitter', 'was largest source During', '2016 U.S. presidential election']
['Twitter', 'was source During', '2016 U.S. election']
['Twitter', 'was source During', '2016 U.S. presidential election']
['Jack Dorsey', 'introduced', 'idea']
['Jack Dorsey', 'introduced', 'idea of individual']
['Jack Dorsey', 'student at', 'New York University']
['Biz Stone', 'formed', 'other members']
['Biz Stone', 'formed', 'members of Odeo']
['Biz Stone', 'formed', 'Dorsey']
['Biz Stone', 'formed', 'other members of Odeo']
['Biz Stone', 'formed Obvious Corporation In', 'October 2006']
['Biz Stone', 'formed', 'Obvious Corporation']
['Biz Stone', 'formed',

['Twitter', 'Followed', 'sharing of images']
['Twitter', 'Followed', 'sharing']
['Twitter', 'announced', 'reporting including blocking mechanism devised']
['Twitter', 'announced', 'reporting including mechanism devised']
['Twitter', 'announced', 'new reporting including mechanism devised by Randi Harper']
['Twitter', 'announced', 'reporting including blocking mechanism devised by Randi Harper']
['Twitter', 'announced', 'new reporting including mechanism devised']
['Twitter', 'announced', 'new reporting including mechanism']
['Twitter', 'announced', 'new reporting including blocking mechanism']
['Twitter', 'announced', 'reporting including blocking mechanism']
['Twitter', 'announced', 'new reporting including blocking mechanism devised']
['Twitter', 'announced', 'new reporting']
['Twitter', 'announced', 'reporting']
['Twitter', 'announced', 'reporting including mechanism']
['Twitter', 'announced', 'reporting including mechanism devised by Randi Harper']
['Twitter', 'lost users as', 'res

In [12]:
# Manual inspection of the printed candidates singled these out as the
# better triples. The bare string literal below is only a note: it is not
# assigned to any name, and the notebook echoes it as the cell's value.
'''
['Twitter', 'was created in', 'March 2006']
['Jack Dorsey', 'student at', 'New York University']
['Twitter', 'acquired', 'Crashlytics']
['Twitter', 'acquired', 'Trendrr']
['Twitter', 'announced', 'partnership with IBM']
['Twitter', 'announced', 'its acquisition of Periscope']
['Twitter', 'is backed by', 'Union Square Ventures']
['Twitter', 'is backed by', 'Digital Garage']
['Twitter', 'is backed by', 'Spark Capital']
['Twitter', 'became', 'available in Arabic']
['Twitter', 'is banned in', 'Iran']
['Twitter', 'been blocked in', 'countries including Egypt']
['Twitter', 'acquiring', 'SecondSync three months']
['Twitter', 'acquired Bluefin Labs for', 'US$ 50 million $ 100 million']
'''

"\n['Twitter', 'was created in', 'March 2006']\n['Jack Dorsey', 'student at', 'New York University']\n['Twitter', 'acquired', 'Crashlytics']\n['Twitter', 'acquired', 'Trendrr']\n['Twitter', 'announced', 'partnership with IBM']\n['Twitter', 'announced', 'its acquisition of Periscope']\n['Twitter', 'is backed by', 'Union Square Ventures']\n['Twitter', 'is backed by', 'Digital Garage']\n['Twitter', 'is backed by', 'Spark Capital']\n['Twitter', 'became', 'available in Arabic']\n['Twitter', 'is banned in', 'Iran']\n['Twitter', 'been blocked in', 'countries including Egypt']\n['Twitter', 'acquiring', 'SecondSync three months']\n['Twitter', 'acquired Bluefin Labs for', 'US$ 50 million $ 100 million']\n"

In [13]:
# Formatted candidates, one row per attribute to add:
#   [entity name, attribute key, value]
# The value is either a single string or a list of strings.
new_entities = [
    ['Twitter', 'created_in', 'March 2006'],
    ['Jack Dorsey', 'studied_at', 'New York University'],
    ['Twitter', 'acquired', ['Crashlytics', 'Trendrr', 'SecondSync', 'Bluefin Labs']],
    ['Twitter', 'partners', ['IBM']],
    ['Twitter', 'backers', ['Union Square Ventures', 'Digital Garage', 'Spark Capital']],
    ['Twitter', 'banned_in', ['Iran', 'Egypt']]
]

In [14]:
# Index the entity records by their 'name' field; the file holds one JSON
# object per line.
entities = {}
with open('../wiki_crawler/entities.json', encoding='utf-8') as fh:
    # Iterate the file lazily instead of materialising it with readlines().
    for entity in fh:
        # Tolerate blank lines (e.g. an empty line at the end of the file)
        # rather than letting json.loads raise on them.
        if not entity.strip():
            continue
        entity_data = json.loads(entity)
        # A later record with the same name replaces the earlier one.
        entities[entity_data['name']] = entity_data

In [15]:
# Attach each curated attribute to the matching entity record (in place).
for new_e in new_entities:
    subject, attribute, value = new_e[0], new_e[1], new_e[2]
    entities[subject][attribute] = value

In [16]:
!rm all_entities.json

In [17]:
# Update file with all entities: one JSON document per line.
with open('all_entities.json', 'w') as out_fh:
    for record in entities.values():
        out_fh.write(json.dumps(record))
        out_fh.write('\n')