In [1]:
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd

In [15]:
# Root address of the TV Tropes site. get_page_html() appends the request path
# to this string by plain concatenation, so the trailing slash matters.
BASE_URL = 'https://tvtropes.org/'

In [16]:
# HTTP headers sent with every page request; the User-agent string identifies
# the client as a desktop Chrome browser.
USER_AGENT = {'User-agent':
              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

In [17]:
# URL namespaces (the path segment right after 'pmwiki.php/') that
# type_from_kind() classifies as a 'Work'.
MEDIA = ('Anime', 'ComicBook', 'Fanfic', 'Film', 'Literature', 'Series',
         'Myth', 'TabletopGame', 'Toys', 'Franchise', 'VideoGame',
         'Webcomic', 'AudioPlay', 'WesternAnimation', 'Wrestling',
         'Podcast', 'Music', 'Blog', 'ComicStrip', 'Theatre')

In [27]:
def get_page_html(path, url=BASE_URL, user_agent=USER_AGENT):
    """Download a page and return it parsed as a BeautifulSoup tree.

    Parameters
    ----------
    path : str
        Location of the page relative to ``url``. A leading slash is
        accepted and ignored.
    url : str
        Base address the path is appended to.
    user_agent : dict
        HTTP headers sent with the request.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed document.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within 30 seconds.
    """
    # Join with exactly one slash: the base ends in '/' and callers pass both
    # 'pmwiki/...' and '/pmwiki/...', which used to yield 'tvtropes.org//pmwiki'.
    full_url = url.rstrip('/') + '/' + path.lstrip('/')
    response = requests.get(full_url, headers=user_agent, timeout=30)
    # Fail loudly instead of parsing an error page as if it were an article.
    response.raise_for_status()
    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parser libraries happen to be installed.
    return bs(response.text, 'html.parser')

In [28]:
def get_current_url(page, base_url=BASE_URL):
    """Return the page's own address with the scheme and host removed.

    The address is read from the ``<p id="current_url">`` element of the
    parsed document.

    Parameters
    ----------
    page : bs4.BeautifulSoup
        Parsed page, as returned by ``get_page_html``.
    base_url : str
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    str
        The element's text, whitespace-trimmed and passed through
        ``strip_domain``.

    Raises
    ------
    AttributeError
        If the document has no ``<p id="current_url">`` element.
    """
    # Trim surrounding whitespace so the result can be compared with hrefs.
    current = page.find('p', {'id': 'current_url'}).text.strip()
    return strip_domain(current)

In [29]:
def strip_domain(url):
    """Remove the scheme-and-host prefix (e.g. ``https://tvtropes.org/``) from a URL.

    Only the first ``http(s)://host/`` occurrence is removed and the match
    stops at the first slash after the host, so path segments that happen to
    contain ``.org/`` are left intact. Strings without such a prefix
    (already-relative paths) are returned unchanged.

    Parameters
    ----------
    url : str
        Absolute or relative URL.

    Returns
    -------
    str
        The URL without its scheme and host.
    """
    # [^/\s]+ keeps the match inside the host part; the previous greedy
    # 'http.*.org/' ran on to the LAST '.org/' anywhere in the string.
    return re.sub(r'https?://[^/\s]+/', '', url, count=1)

In [30]:
def get_info_from_url(url):
    """Split a wiki URL into its namespace and page name.

    The namespace is the path segment right after ``php/``; the page name is
    the final path segment.

    Returns
    -------
    tuple of (str, str)
        ``(kind, name)``.

    Raises
    ------
    IndexError
        If the URL does not match the expected layout.
    """
    matches = re.findall('php/([^/]+).*/([^/]+)$', url)
    namespace, title = matches[0]
    return namespace, title

In [31]:
def type_from_kind(kind, media=MEDIA):
    """Map a URL namespace to one of 'Trope', 'Creator', 'Work' or 'Other'.

    Parameters
    ----------
    kind : str
        Namespace taken from the page URL.
    media : container of str
        Namespaces that count as works.
    """
    if kind == 'Main':
        return 'Trope'
    if kind == 'Creator':
        return 'Creator'
    return 'Work' if kind in media else 'Other'

In [32]:
class Page:
    """A wiki page identified by its URL.

    Attributes are derived from the URL on construction:
    ``url`` (domain-stripped), ``kind`` (namespace), ``name`` (page title)
    and ``ptype`` (the classification returned by ``type_from_kind``).
    """

    def __init__(self, url):
        # The stored address is domain-stripped; namespace and title are
        # parsed from the URL exactly as it was passed in.
        self.url = strip_domain(url)
        self.kind, self.name = get_info_from_url(url)
        self.ptype = type_from_kind(self.kind)

    def __repr__(self):
        """Short human-readable form: '<ptype> : <name>'."""
        return f'{self.ptype} : {self.name}'
    

In [33]:
def get_references(page):
    """Collect the wiki pages linked from the list items of ``page``.

    Links are taken from every ``<li>`` inside ``div.folder`` elements or,
    when the page has no folders, from the first ``<ul>`` following the first
    ``<h2>``. Only anchors with class ``twikilink`` are considered.

    Parameters
    ----------
    page : bs4.BeautifulSoup
        Parsed page, as returned by ``get_page_html``.

    Returns
    -------
    list of Page
        One entry per distinct link target, in document order, excluding the
        page itself and targets whose ``ptype`` is 'Other'. Empty when the
        page has neither folders nor an ``<h2>`` followed by a list.
    """
    # Compare paths without host and without leading slash: hrefs look like
    # '/pmwiki/...' while the domain-stripped current URL has no leading
    # slash, so a raw comparison would never detect a self-link.
    own_path = get_current_url(page).strip().lstrip('/')

    folders = page.find_all('div', {'class': 'folder'})
    if folders:
        items = [li for folder in folders for li in folder.find_all('li')]
    else:
        heading = page.find('h2')
        listing = heading.find_next('ul') if heading is not None else None
        items = listing.find_all('li') if listing is not None else []

    # Deduplicate by target rather than by tag object (nested <li> elements
    # yield the same anchor several times), keeping first-seen order so the
    # result is deterministic.
    targets = {}
    for li in items:
        for link in li.find_all('a', {'class': 'twikilink'}):
            href = link.get('href')
            if not href:
                continue  # anchor without a target
            key = strip_domain(href).lstrip('/')
            if key != own_path:
                targets.setdefault(key, href)

    pages = (Page(href) for href in targets.values())
    return [candidate for candidate in pages if candidate.ptype != 'Other']
        

In [34]:
# Fetch and parse the Plague Inc page for interactive inspection.
# NOTE(review): `plague` is not referenced again in the visible cells;
# fill_references() downloads the page itself.
plague = get_page_html('pmwiki/pmwiki.php/VideoGame/PlagueInc')

In [35]:
# Fetch and parse the Pandemic page for interactive inspection.
# NOTE(review): `pandemic` is not referenced again in the visible cells;
# fill_references() downloads the page itself.
pandemic = get_page_html('pmwiki/pmwiki.php/TabletopGame/Pandemic')

In [36]:
# Fetch and parse the Left 4 Dead page for interactive inspection.
# NOTE(review): `l4d` is not referenced again in the visible cells;
# fill_references() downloads the page itself.
l4d = get_page_html('pmwiki/pmwiki.php/VideoGame/Left4Dead')

In [37]:
from neo4j import GraphDatabase, basic_auth

In [38]:
# Open a Bolt connection to the Neo4j server. Credentials go through the
# `auth` keyword, built with the imported `basic_auth` helper; the driver has
# no `basic_auth=` option, so the previous spelling never supplied them.
# NOTE(review): credentials are hard-coded here; move them to environment
# variables or a config file before sharing this notebook.
driver = GraphDatabase.driver("bolt://0.0.0.0:7687",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False)

In [39]:
# Module-level session; the database helper functions below bind it as their
# default `session` argument at definition time.
session = driver.session()

In [40]:
def delete_all(session=session):
    """Delete every node in the graph together with its relationships."""
    session.run('MATCH(n) DETACH DELETE n')

In [41]:
def delete_node(name, session=session):
    """Delete all nodes whose ``name`` property equals ``name``.

    Relationships attached to the matched nodes are removed as well.
    """
    cypher = 'MATCH (p) WHERE p.name = $name DETACH DELETE (p)'
    params = {'name': name}
    session.run(cypher, **params)

In [42]:
def add_node(page_obj, session=session):
    """Create the node for ``page_obj`` unless an identical one already exists.

    The node carries the labels ``Page`` and ``page_obj.ptype`` and the
    properties ``name``, ``kind`` and ``url``.
    """
    # The label is spliced into the query text because only the property
    # values are passed as query parameters.
    cypher = ''.join((
        'MERGE (p:Page:',
        page_obj.ptype,
        '{name:$name, kind: $kind, url: $url})',
    ))
    properties = {
        'name': page_obj.name,
        'kind': page_obj.kind,
        'url': page_obj.url,
    }
    session.run(cypher, **properties)

In [43]:
def add_reference(page_from, page_to, session=session):
    """Create a REFERENCE relationship from one page node to another.

    Both endpoints are looked up by namespace *and* name, so pages that share
    a name across namespaces (for example a film and a video game with the
    same title) are not linked by mistake. The relationship is merged, so
    calling this repeatedly does not create duplicates.

    Parameters
    ----------
    page_from, page_to : Page
        Source and target; both nodes must already exist (see ``add_node``).
    session
        Neo4j session used to run the query.
    """
    query = (
        'MATCH (a:Page {name: $from_name, kind: $from_kind})'
        '\nMATCH (b:Page {name: $to_name, kind: $to_kind})'
        '\nMERGE (a)-[r:REFERENCE{visual:\'-\'}]->(b)'
    )
    session.run(query,
                from_name=page_from.name, from_kind=page_from.kind,
                to_name=page_to.name, to_kind=page_to.kind)

In [44]:
def fill_references(url, session=session):
    """Scrape one wiki page and store it and its outgoing links in Neo4j.

    The page is downloaded, a node is created for it, and for every page
    returned by ``get_references`` a node and a REFERENCE relationship from
    the scraped page are created.

    Parameters
    ----------
    url : str
        Address of the page: a site-relative path (with or without a leading
        slash) or an absolute URL.
    session
        Neo4j session used for every write issued by this call.
    """
    # Reduce the address to a bare relative path before fetching, so that a
    # leading slash or a full URL does not produce a malformed request URL.
    page = get_page_html(strip_domain(url).lstrip('/'))
    current = Page(url)
    # Forward the caller's session; previously it was accepted but ignored
    # and the helpers silently fell back to their own default.
    add_node(current, session=session)
    for reference in get_references(page):
        add_node(reference, session=session)
        add_reference(current, reference, session=session)

In [52]:
# Scrape the Plague Inc page and write it plus its outgoing references to Neo4j.
fill_references('/pmwiki/pmwiki.php/VideoGame/PlagueInc')

In [53]:
# Scrape the Pandemic page and write it plus its outgoing references to Neo4j.
fill_references('/pmwiki/pmwiki.php/TabletopGame/Pandemic')

In [54]:
# Scrape the Contagion page and write it plus its outgoing references to Neo4j.
fill_references('/pmwiki/pmwiki.php/Film/Contagion')

In [55]:
# Scrape the 28 Days Later page and write it plus its outgoing references to Neo4j.
fill_references('/pmwiki/pmwiki.php/Film/TwentyEightDaysLater')

In [56]:
# Scrape the Left 4 Dead page and write it plus its outgoing references to Neo4j.
fill_references('/pmwiki/pmwiki.php/VideoGame/Left4Dead')

In [50]:
# Scrape the Zombie Apocalypse trope page and write it plus its outgoing references to Neo4j.
fill_references('/pmwiki/pmwiki.php/Main/ZombieApocalypse')

In [51]:
# NOTE(review): destructive. This removes every node and relationship from the
# database. It is the last cell in the file, so a top-to-bottom run discards
# everything the fill_references() cells above inserted.
delete_all()