In [1]:
import json
import re
import pandas as pd

### Setup

In [2]:
entity_df = pd.read_json('../wiki_crawler/entities.json', lines=True)

In [3]:
entity_names = entity_df['name'].unique()

In [4]:
entity_names

array(['Facebook', 'Summify', 'Periscope (app)', 'Vine (service)',
       'Pinterest', 'Snapchat', 'WeChat', 'Reddit', 'Instagram',
       'Twitter', 'Dom Hofmann', 'Libra (cryptocurrency)', 'Oculus VR',
       'WhatsApp', 'Mike Schroepfer', 'Vancouver', 'Sheryl Sandberg',
       'Chairperson', 'Menlo Park, California', 'Chris Hughes',
       'Evan Spiegel', 'Tencent', 'JavaScript',
       'Scala (programming language)', 'Ruby (programming language)',
       'Java (programming language)', 'Internet',
       'Evan Williams (Internet entrepreneur)', 'Omid Kordestani',
       'Biz Stone', 'Jack Dorsey', 'Multilingualism',
       'React (web framework)', 'Python (programming language)',
       'Web banner', 'Online service provider', 'San Francisco',
       'Chief executive officer', 'Steve Huffman',
       'Cross-platform software', 'Alexis Ohanian', 'Multimedia',
       'Aaron Swartz', 'Instant messaging', 'Image sharing', 'Snap Inc.',
       'Bobby Murphy (businessman)', 'Videotelephony

In [5]:
len(entity_names)

58

In [30]:
# Add "synonyms" to entity_names
entity_names = list(entity_names) + ['Vine', 'Periscope', 'Libra']

In [22]:
def filter_entities(cand):
    # Only consider triples if the source node is in the list of entity names
    if cand[0] not in entity_names:
        return False
    
    # Remove edges and target nodes that are too long    
    if len(cand[1].split()) > 4:
         return False
        
    if len(cand[2].split()) > 8:
        return False
    
    # Remove not useful verbs
    if cand[1] in {'is', 'was', 'has', 'be'}:
        return False
    
    return True

In [50]:
# Generate all potentially useful triple candidates
def gen_triples(triples_raw, filter_flag=True):
    for t in triples_raw.split('\n'):
        trip = t.split(', ')
        if filter_flag:
            if filter_entities(trip):
                print(trip)
        else:
            print(trip)

### Facebook

In [11]:
company = 'Facebook'

In [12]:
with open(f'{company}.json') as fh:
    data = json.load(fh)

In [15]:
text = (re.sub('\[\d+\]', '', data['text'].strip()))
with open(f'{company}.txt', 'w') as fh:
    fh.write(text)

In [16]:
!rm facebook_triples.txt
!python Stanford-OpenIE-Python/main.py -f ../facebook.txt > facebook_triples.txt

rm: facebook_triples.txt: No such file or directory


In [17]:
with open(f'{company}_triples.txt') as fh:
    triples_raw = fh.read()

In [24]:
# Print triple candidates
gen_triples(triples_raw)

['Facebook', 'claimed', 'that']
['Facebook', 'partnered In', '2017']
['Facebook', 'partnered with', 'fact']
['Facebook', 'at', 'IPO']
['Dustin Moskovitz', 'joined', 'Zuckerberg']
['Dustin Moskovitz', 'joined', 'Chris Hughes']
['Facebook', 'expanded to', 'Columbia']
['Facebook', 'expanded In', 'March 2004']
['Facebook', 'hired', 'its intern']
['Facebook', 'hired', 'Julie Zhou']
['Facebook', 'hired', 'its first intern']
['Facebook', 'opened to', 'everyone']
['Facebook', 'opened old to', 'everyone']
['Facebook', 'opened old On', 'September 26']
['Facebook', 'opened On', 'September 26']
['Facebook', 'value of', 'around $ 15 billion']
['Facebook', 'implied value of', '$ 15 billion']
['Facebook', 'total implied value of', '$ 15 billion']
['Facebook', 'implied value of', 'around $ 15 billion']
['Facebook', 'total value of', '$ 15 billion']
['Facebook', 'total value of', 'around $ 15 billion']
['Facebook', 'total implied value of', 'around $ 15 billion']
['Facebook', 'value of', '$ 15 billion'

In [25]:
# Manual inspection revealed these as better triple candidates
'''
['Facebook', 'acquired', 'Instagram']
['Facebook', 'acquired', 'Onavo']
['Facebook', 'founded', 'Libra Networks']
['Facebook', 'won', 'Crunchie Best Startup']
['Facebook', 'exposing', 'data of 50 million users']
['WhatsApp', 'sued', 'Israeli surveillance firm']
['Facebook', 'banned', 'number of commentators including Alex Jones']
['Facebook', 'mishandling by', 'Cambridge Analytica']
['Facebook', 'founded', 'Internet.org']
'''

"\n['Facebook', 'acquired', 'Instagram']\n['Facebook', 'acquired', 'Onavo']\n['Facebook', 'founded', 'Libra Networks']\n['Facebook', 'won', 'Crunchie Best Startup']\n['Facebook', 'exposing', 'data of 50 million users']\n['WhatsApp', 'sued', 'Israeli surveillance firm']\n['Facebook', 'banned', 'number of commentators including Alex Jones']\n['Facebook', 'mishandling by', 'Cambridge Analytica']\n['Facebook', 'founded', 'Internet.org']\n"

In [10]:
# Formatted Candidates
fb_entities = [
    ['Facebook', 'acquired', ['Instagram', 'Onavo']],
    ['Facebook', 'founded', ['Libra Networks', 'Internet.org']],
    ['Facebook', 'associated_with', ['Cambridge Analytica']],
    ['Facebook', 'won', ['Crunchie Best Startup']],
    ['Facebook', 'banned', ['Alex Jones']],
    ['Facebook', 'breached_data', '50 million'],
    ['WhatsApp', 'sued', 'Israeli surveillance firm']
]

### Vine

In [26]:
company = 'Vine'
with open(f'{company}.json') as fh:
    data = json.load(fh)
    
text = (re.sub('\[\d+\]', '', data['text'].strip()))
with open(f'{company}.txt', 'w') as fh:
    fh.write(text)

In [27]:
!rm vine_triples.txt
!python Stanford-OpenIE-Python/main.py -f ../vine.txt > vine_triples.txt

rm: vine_triples.txt: No such file or directory


In [31]:
with open(f'{company}_triples.txt') as fh:
    triples_raw = fh.read()
    
gen_triples(triples_raw)

['Vine', 'through', 'social network']
['Vine', 'competed with', 'social-media services']
['Vine', 'competed with', 'other services']
['Vine', 'competed with', 'services']
['Vine', 'competed with', 'other social-media services']
['Vine', 'had', '200 million users']
['Vine', 'had', '200 million active users']
['Twitter', 'announced On', 'October 27 2016']
['Twitter', 'allowing', 'people']
['Twitter', 'launched', 'Internet archive of Vine videos']
['Vine', "'s successor is", 'V2']
['Vine', 'on', 'successor']
['Vine', 'launched as', 'app for iOS devices']
['Vine', 'launched as', 'free app for iOS devices']
['Vine', 'launched on', 'January 24 2013']
['Vine', 'launched as', 'free app']
['Vine', 'launched as', 'app']
['Vine', 'became video with', 'low adoption of app']
['Vine', 'became', 'used video']
['Vine', 'became video with', 'even low adoption']
['Vine', 'became video with', 'even adoption']
['Vine', 'became video with', 'adoption']
['Vine', 'became video with', 'even low adoption of ap

In [11]:
vine_entities = [
    ['Vine (service)', 'launched', 'Vine Kids'],
    ['Vine (service)', 'won', "Time's '50 Android Apps"]
]

### Snapchat

In [32]:
company = 'Snapchat'
with open(f'{company}.json') as fh:
    data = json.load(fh)
    
text = (re.sub('\[\d+\]', '', data['text'].strip()))
with open(f'{company}.txt', 'w') as fh:
    fh.write(text)

In [33]:
!rm snapchat_triples.txt
!python Stanford-OpenIE-Python/main.py -f ../snapchat.txt > snapchat_triples.txt

rm: snapchat_triples.txt: No such file or directory


In [34]:
with open(f'{company}_triples.txt') as fh:
    triples_raw = fh.read()
    
gen_triples(triples_raw)

['Snapchat', 'places', 'significant emphasis']
['Snapchat', 'has become', 'known']
['Snapchat', 'places', 'emphasis on users interacting with virtual stickers']
['Snapchat', 'places', 'emphasis on users']
['Snapchat', 'places', 'significant emphasis on users']
['Snapchat', 'places', 'significant emphasis on users interacting']
['Snapchat', 'places', 'emphasis']
['Snapchat', 'places', 'emphasis on users interacting']
['Snapchat', 'places', 'significant emphasis on users interacting with virtual stickers']
['Snapchat', 'was experiencing', 'difficulties']
['Snapchat', 'was released as', 'Android app on October 29 2012']
['Snapchat', 'was released as', 'Android app']
['Snapchat', 'Also introduced Snapkidz under', '13 years of age']
['Snapchat', 'introduced Snapkidz under', '13 years']
['Snapchat', 'Also introduced Snapkidz in', 'June 2013']
['Snapchat', 'introduced Snapkidz under', '13 years of age']
['Snapchat', 'Also introduced Snapkidz under', '13 years']
['Snapchat', 'Also introduced',

['Snapchat', 'is increasingly becoming', 'integral part']
['Snapchat', 'part of', 'porn industry']
['Snapchat', 'is becoming', 'integral part of porn industry']
['Snapchat', 'integral part of', 'porn industry']
['Snapchat', 'is becoming', 'integral part of online porn industry']
['Snapchat', 'part of', 'online porn industry']
['Snapchat', 'is increasingly becoming', 'part of online porn industry']
['Snapchat', 'is becoming', 'part of online porn industry']
['Snapchat', 'is increasingly becoming', 'integral part of online porn industry']
['Snapchat', 'is becoming', 'part']
['Snapchat', 'is increasingly becoming', 'integral part of porn industry']
['Snapchat', 'is becoming', 'integral part']
['Snapchat', 'is increasingly becoming', 'part']
['Snapchat', 'is increasingly becoming', 'part of porn industry']
['Snapchat', 'integral part of', 'online porn industry']
['Snapchat', 'is becoming', 'part of porn industry']
['Snapchat', 'was hacked on', 'December 31 2013']
['Snapchat', 'announced On

In [12]:
snap_entities = [
    ['Snapchat', 'raised', '$1.81 billion'],
    ['Snapchat', 'founded', ['Snapkidz', 'Snapcash']],
    ['Snapchat', 'methods', ['AIDA model']],
    ['Snapchat', 'partners', ['Twitch']]
]

### Instagram

In [37]:
company = 'Instagram'
with open(f'{company}.json') as fh:
    data = json.load(fh)
    
text = (re.sub('\[\d+\]', '', data['text'].strip()))
with open(f'{company}.txt', 'w') as fh:
    fh.write(text)

In [38]:
!rm instagram_triples.txt
!python Stanford-OpenIE-Python/main.py -f ../instagram.txt > instagram_triples.txt

rm: instagram_triples.txt: No such file or directory


In [39]:
with open(f'{company}_triples.txt') as fh:
    triples_raw = fh.read()
    
gen_triples(triples_raw)

['Instagram', 'rapidly gained popularity with', 'one million users in two months']
['Instagram', 'gained popularity with', 'one million users']
['Instagram', 'gained popularity After', 'its launch']
['Instagram', 'gained', 'popularity']
['Instagram', 'rapidly gained popularity with', 'one million registered users in two months']
['Instagram', 'gained popularity with', 'one million users in two months']
['Instagram', 'gained popularity After', 'its launch in 2010']
['Instagram', 'rapidly gained popularity with', 'one million users']
['Instagram', 'gained popularity with', 'one million registered users in two months']
['Instagram', 'rapidly gained', 'popularity']
['Instagram', 'rapidly gained popularity After', 'its launch']
['Instagram', 'gained popularity with', 'one million registered users']
['Instagram', 'rapidly gained popularity After', 'its launch in 2010']
['Instagram', 'rapidly gained popularity with', 'one million registered users']
['Facebook', 'acquired service In', 'April 2

In [13]:
ig_entities = [
    ['Instagram', 'launched', ['Videos You Might Like channel', 'Instagram Stories', 
                               'Story Highlights', 'Instagram Direct']],
    ['Instagram', 'won', ['Best Locally Made App']]
]

### Periscope

In [53]:
company = 'Periscope'
with open(f'{company}.json') as fh:
    data = json.load(fh)
    
text = (re.sub('\[\d+\]', '', data['text'].strip()))
with open(f'{company}.txt', 'w') as fh:
    fh.write(text)

In [42]:
!rm periscope_triples.txt
!python Stanford-OpenIE-Python/main.py -f ../periscope.txt > periscope_triples.txt

rm: periscope_triples.txt: No such file or directory


In [54]:
with open(f'{company}_triples.txt') as fh:
    triples_raw = fh.read()
    
gen_triples(triples_raw, filter_flag=False)

['Periscope', 'is', 'live video streaming app for Android']
['Periscope', 'is', 'video streaming app for Android']
['Periscope', 'acquired by', 'Twitter']
['Periscope', 'is', 'video streaming app']
['Periscope', 'is', 'live']
['Periscope', 'is', 'live video streaming app for Android developed by Kayvon Beykpour']
['Periscope', 'is', 'video streaming app for Android developed by Kayvon Beykpour']
['Periscope', 'is', 'live video streaming app']
['Periscope', 'is', 'video streaming app for Android developed']
['Periscope', 'acquired before', 'launch']
['Periscope', 'is', 'live video streaming app for Android developed']
['Periscope', 'acquired before', 'launch in 2015']
['launch', 'is in', '2015']
['Contents Beykpour', 'came', 'Bernstein']
['Contents Beykpour', 'traveling', 'Bernstein']
['Contents Beykpour', 'traveling abroad', 'Bernstein']
['Contents Beykpour', 'traveling abroad in', '2013']
['Contents Beykpour', 'traveling in', '2013']
['He', 'wanted', 'see']
['he', 'turned to', 'Twitte

In [14]:
periscope_entities = [
    ['Periscope (app)', 'launched', ['Apple TV app']]
]

### Summify

In [45]:
company = 'Summify'
with open(f'{company}.json') as fh:
    data = json.load(fh)
    
text = (re.sub('\[\d+\]', '', data['text'].strip()))
with open(f'{company}.txt', 'w') as fh:
    fh.write(text)

In [46]:
!rm summify_triples.txt
!python Stanford-OpenIE-Python/main.py -f ../summify.txt > summify_triples.txt

rm: summify_triples.txt: No such file or directory


In [52]:
with open(f'{company}_triples.txt') as fh:
    triples_raw = fh.read()
    
gen_triples(triples_raw, filter_flag=False)

['Summify', 'was', 'social']
['Summify', 'was', 'social news aggregator']
['Summify', 'was', 'social news aggregator founded']
['Summify', 'was', 'news aggregator']
['Summify', 'was', 'news aggregator founded by Mircea Paşoi']
['Summify', 'was', 'news aggregator founded']
['Summify', 'was', 'social news aggregator founded by Mircea Paşoi']
['service', 'emails', 'its users']
['platform', 'supported', 'Twitter']
['Advisors', 'include', 'CEO of Hootsuite']
['Advisors', 'include', 'Ryan Holmes']
['Advisors', 'include', 'CEO']
['Advisors', 'include', 'CEO']
['Advisors', 'include', 'CEO of Redbeacon']
['Ryan Holmes', 'CEO of', 'Hootsuite']
['Ryan Holmes', 'CEO of', 'Redbeacon']
['Contents', 'is In', '2009']
['Contents', 'created', 'ReadFu']
['ReadFu', 'was', 'accepted']
['ReadFu', 'was accepted In', 'January 2010']
['ReadFu', 'was', 'In January 2010 accepted into Vancouver']
['ReadFu', 'was accepted into', 'Vancouver']
['service', 'was renamed', 'began']
['2010 service', 'was renamed', 'bega

In [15]:
summify_entities = [
    ['Summify', 'advisors', ['Ryan Holmes']]
]

### Combine

In [16]:
# From other notebook
twitter_entities = [
    ['Twitter', 'created_in', 'March 2006'],
#     ['Jack Dorsey', 'studied_at', 'New York University'], (Duplicate)
    ['Twitter', 'acquired', ['Crashlytics', 'Trendrr', 'SecondSync', 'Bluefin Labs']],
    ['Twitter', 'partners', ['IBM']],
    ['Twitter', 'backers', ['Union Square Ventures', 'Digital Garage', 'Spark Capital']],
    ['Twitter', 'banned_in', ['Iran', 'Egypt']]
]

In [17]:
all_entities = twitter_entities + fb_entities + vine_entities + snap_entities \
    + ig_entities + periscope_entities + summify_entities

In [18]:
all_entities

[['Twitter', 'created_in', 'March 2006'],
 ['Twitter',
  'acquired',
  ['Crashlytics', 'Trendrr', 'SecondSync', 'Bluefin Labs']],
 ['Twitter', 'partners', ['IBM']],
 ['Twitter',
  'backers',
  ['Union Square Ventures', 'Digital Garage', 'Spark Capital']],
 ['Twitter', 'banned_in', ['Iran', 'Egypt']],
 ['Facebook', 'acquired', ['Instagram', 'Onavo']],
 ['Facebook', 'founded', ['Libra Networks', 'Internet.org']],
 ['Facebook', 'associated_with', ['Cambridge Analytica']],
 ['Facebook', 'won', ['Crunchie Best Startup']],
 ['Facebook', 'banned', ['Alex Jones']],
 ['Facebook', 'breached_data', '50 million'],
 ['WhatsApp', 'sued', 'Israeli surveillance firm'],
 ['Vine (service)', 'launched', 'Vine Kids'],
 ['Vine (service)', 'won', "Time's '50 Android Apps"],
 ['Snapchat', 'raised', '$1.81 billion'],
 ['Snapchat', 'founded', ['Snapkidz', 'Snapcash']],
 ['Snapchat', 'methods', ['AIDA model']],
 ['Snapchat', 'partners', ['Twitch']],
 ['Instagram',
  'launched',
  ['Videos You Might Like channel

In [30]:
entities = {}
with open('../wiki_crawler/entities.json') as fh:
    for entity in fh.readlines():
        entity_data = json.loads(entity)
        if entity_data['name'] in entities:
            print(entity_data['name'])
        entities[entity_data['name']] = entity_data

Periscope (app)
Instagram
Vine (service)
Facebook
Twitter
WeChat
Reddit
Snapchat
Twitter


In [31]:
for new_e in all_entities:
    source = new_e[0]
    edge = new_e[1]
    if edge == 'acquired':
        edge = 'subsidiaries'
        target = list(set(entities[source]['subsidiaries'] + new_e[2]))
    elif edge == 'founded':
        edge = 'products'
        target = list(set(entities[source].get('products', []) + new_e[2]))
    else:
        target = new_e[2]
    entities[source][edge] = target

In [32]:
!rm all_entities.json

In [33]:
# Update file with all entities
with open('all_entities.json', 'w') as fh:
    for key, entity in entities.items():
        json.dump(entity, fh)
        fh.write('\n')