In [1]:
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd

In [2]:
BASE_URL = 'https://tvtropes.org/'

In [3]:
USER_AGENT = {'User-agent':
              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

In [4]:
MEDIA = ('Anime', 'ComicBook', 'Fanfic', 'Film', 'Literature', 'Series',
         'Myth', 'TabletopGame', 'Toys', 'Franchise', 'VideoGame',
         'Webcomic', 'AudioPlay', 'WesternAnimation', 'Wrestling',
         'Podcast', 'Music', 'Blog', 'ComicStrip', 'Theatre')

In [5]:
def get_page_html(path, url=BASE_URL, user_agent=USER_AGENT):
    url = url + path
    html = requests.get(url, headers=user_agent).text
    return bs(html)

In [6]:
def get_current_url(page, base_url=BASE_URL):
    offset = 0
    url = page.find('p', {'id': 'current_url'}).text
    url = strip_domain(url)
    return url

In [7]:
def strip_domain(url):
    return re.sub(r'http.*.org/', '', url)

In [8]:
def get_info_from_url(url):
    kind, name = re.findall('php/([^/]+).*/([^/]+)$', url)[0]
    return kind, name

In [9]:
def type_from_kind(kind, media=MEDIA):
    if kind == 'Main':
        ptype = 'Trope'
    elif kind == 'Creator':
        ptype = 'Creator'
    elif kind in media:
        ptype = 'Work'
    else:
        ptype = 'Other'
    return ptype

In [10]:
class Page(object):
    
    def __init__(self, url):
        self.url = strip_domain(url)
        kind, name = get_info_from_url(url)
        ptype = type_from_kind(kind)
        self.kind = kind
        self.name = name
        self.ptype = ptype
    
    def __repr__(self):
        return f'{self.ptype} : {self.name}'
    

In [11]:
def get_references(page):
    references = []
    url = get_current_url(page)
    folders = page.findAll('div', {'class': 'folder'})
    if folders:
        lis = []
        for folder in folders:
            lis.extend(folder.findAll('li'))
    else:
        lis = page.find('h2').findNext('ul').findAll('li')
    for li in lis:
        links = li.findAll('a', {'class': 'twikilink'})
        references.extend(links)
    references = set([reference for reference in references if reference['href'] != url])
    references = [Page(reference['href']) for reference in references]
    references = [reference for reference in references if reference.ptype != 'Other']
    return references
        

In [12]:
from neo4j import GraphDatabase, basic_auth

In [13]:
driver = GraphDatabase.driver("bolt://0.0.0.0:7687",
                              basic_auth=("neo4j", "neo4j"),
                              encrypted=False)

In [14]:
session = driver.session()

In [15]:
def add_node(page_obj, session=session):
    query = 'MERGE (p:Page:' + page_obj.ptype + '{name:$name, kind: $kind, url: $url})'
    session.run(query, name=page_obj.name, kind=page_obj.kind, url=page_obj.url)

In [16]:
add_node(Page('/pmwiki/pmwiki.php/VideoGame/PlagueInc'))

In [17]:
def list_node_properties(url, session=session):
    query = 'MATCH (n:Page{url: $url}) RETURN n'
    return session.run(query, url=url).value()

In [18]:
list_node_properties('/pmwiki/pmwiki.php/VideoGame/PlagueInc')

[<Node id=1508 labels={'Work', 'Page'} properties={'name': 'PlagueInc', 'kind': 'VideoGame', 'url': '/pmwiki/pmwiki.php/VideoGame/PlagueInc'}>]

In [19]:
def update_name(url, name, session=session):
    query = 'MATCH (n:Page{url: $url}) SET n.name = $name'
    session.run(query, url=url, name=name)

In [20]:
update_name('/pmwiki/pmwiki.php/VideoGame/PlagueInc', 'test')

In [21]:
list_node_properties('/pmwiki/pmwiki.php/VideoGame/PlagueInc')

[<Node id=1508 labels={'Work', 'Page'} properties={'name': 'test', 'kind': 'VideoGame', 'url': '/pmwiki/pmwiki.php/VideoGame/PlagueInc'}>]

In [22]:
def delete_node(url, session=session):
    query = 'MATCH (p:Page{url: $url}) DETACH DELETE (p)'
    session.run(query, url=url)

In [23]:
delete_node('/pmwiki/pmwiki.php/VideoGame/PlagueInc')

In [24]:
list_node_properties('/pmwiki/pmwiki.php/VideoGame/PlagueInc')

[]

In [25]:
def delete_all(session=session):
    query = 'MATCH(n) DETACH DELETE n'
    session.run(query)

In [26]:
delete_all()

In [27]:
def add_reference(page_from, page_to, session=session):
    query = 'MATCH (a:Page) WHERE a.name = $from_node'
    query += '\nMATCH (b) WHERE b.name = $to_node'
    query += '\nMERGE (a)-[r:REFERENCE{visual:\'-\'}]->(b)'
    session.run(query, from_node=page_from.name, to_node=page_to.name)

In [28]:
def fill_references(url, session=session):
    '''
    Create a node and add references.
    '''
    page = get_page_html(url)
    current = Page(url)
    add_node(current)
    references = get_references(page)
    for reference in references:
        add_node(reference)
        add_reference(current, reference)

In [29]:
def get_nodes_n(session=session):
    '''
    Return the total number of nodes in the datebase.
    '''
    query = 'MATCH (n) RETURN count(n) as count'
    return session.run(query).single()['count']

In [30]:
def get_edges_n(session=session):
    '''
    Return the total number of edges in the datebase.
    '''
    query = 'MATCH ()-[r]-() RETURN count(r) as count'
    return session.run(query).single()['count']

In [31]:
def get_node_edges(url, session=session):
    query = 'MATCH ()-[r]-({url: $url}) RETURN count(r)'
    return session.run(query, url=url).value()[0]

In [32]:
def get_urls(session=session):
    '''
    List url of each node it the database.
    '''
    query = 'MATCH (n:Page) RETURN n.url'
    result = session.run(query).values()
    return  [record[0] for record in result]

In [33]:
from collections import Counter

def list_mutual_neigbors(n=10, session=session):
    '''
    Returns n most common tropes referenced by at least two works.
    '''
    query = 'MATCH (a:Work)-[:REFERENCE]->(t:Trope)<-[:REFERENCE]-(b:Work) RETURN t.url'
    records = session.run(query).records()
    records = Counter([record.value() for record in records])
    return records.most_common(n)

[The Common Neighbors algorithm](https://neo4j.com/docs/graph-algorithms/current/labs-algorithms/common-neighbors/)

In [34]:
def common_neighbors(url1, url2, session=session):
    query = 'MATCH (p1:Page{url:$url1})'
    query += 'MATCH (p2:Page{url:$url2})'
    query += 'RETURN algo.linkprediction.commonNeighbors(p1, p2) AS score'
    return session.run(query, url1=url1, url2=url2).value()[0]

Random Walk

In [35]:
def random_walk(url, steps=3, walks=1, session=session):
    query = '''MATCH (home:Page {url: $url})
               CALL algo.randomWalk.stream(id(home), $steps, $walks)
               YIELD nodeIds

               UNWIND nodeIds AS nodeId

               RETURN algo.asNode(nodeId).url AS page'''
    return session.run(query, url=url, steps=steps, walks=walks)

Similarity algorithms utilize node properties which aren't used at the moment.

Graph statistics

In [36]:
def graph_stats(session=session):
    '''
    Returns stats of the graph in the current database.
    '''
    query = '''MATCH (p:Page)
               RETURN avg(apoc.node.degree(p,'REFERENCE')) as average_refs,
               stdev(apoc.node.degree(p,'REFERENCE')) as stdev_refs,
               max(apoc.node.degree(p,'REFERENCE')) as max_refs,
               min(apoc.node.degree(p,'REFERENCE')) as min_refs'''
    records = session.run(query)
    records = [record for record in records.records()][0]
    result = {}
    result['average_refs'] = records['average_refs']
    result['stdev_refs'] = records['stdev_refs']
    result['max_refs'] = records['max_refs']
    result['min_refs'] = records['min_refs']
    return result

Article materials:

In [37]:
fill_references('/pmwiki/pmwiki.php/VideoGame/PlagueInc')

In [38]:
fill_references('/pmwiki/pmwiki.php/TabletopGame/Pandemic')

In [39]:
fill_references('/pmwiki/pmwiki.php/Film/Contagion')

In [40]:
fill_references('/pmwiki/pmwiki.php/Film/TwentyEightDaysLater')

In [41]:
fill_references('/pmwiki/pmwiki.php/VideoGame/Left4Dead')

In [42]:
fill_references('/pmwiki/pmwiki.php/Main/ZombieApocalypse')

In [43]:
list_mutual_neigbors()

[('/pmwiki/pmwiki.php/Main/ZombieApocalypse', 30),
 ('/pmwiki/pmwiki.php/Main/ShoutOut', 30),
 ('/pmwiki/pmwiki.php/Main/ThePlague', 20),
 ('/pmwiki/pmwiki.php/Main/AwesomeButImpractical', 20),
 ('/pmwiki/pmwiki.php/Main/PatientZero', 20),
 ('/pmwiki/pmwiki.php/Main/ApocalypseHow', 20),
 ('/pmwiki/pmwiki.php/Main/LighterAndSofter', 12),
 ('/pmwiki/pmwiki.php/Main/SyntheticPlague', 12),
 ('/pmwiki/pmwiki.php/Main/KillEmAll', 12),
 ('/pmwiki/pmwiki.php/Main/NiceJobBreakingItHero', 12)]

In [44]:
get_urls()[:10]

['/pmwiki/pmwiki.php/Main/TwentyMinutesIntoTheFuture',
 '/pmwiki/pmwiki.php/Main/ArtificialLimbs',
 '/pmwiki/pmwiki.php/Main/BrainwashingForTheGreaterGood',
 '/pmwiki/pmwiki.php/Main/CompositeCharacter',
 '/pmwiki/pmwiki.php/Main/TheBlackDeath',
 '/pmwiki/pmwiki.php/Main/HiroshimaAsAUnitOfMeasure',
 '/pmwiki/pmwiki.php/Main/MarathonLevel',
 '/pmwiki/pmwiki.php/Main/SaveScumming',
 '/pmwiki/pmwiki.php/Main/CrackDefeat',
 '/pmwiki/pmwiki.php/Main/IDidWhatIHadToDo']

Testing statistics functions

Summary of graph stats

In [45]:
graph_stats()

{'average_refs': 3.1379529561347757,
 'stdev_refs': 28.862346199199703,
 'max_refs': 868,
 'min_refs': 1}

Total edges

In [46]:
get_edges_n()

4936

Total nodes

In [47]:
get_nodes_n()

1573

Overall ratio of edges to nodes

In [48]:
get_edges_n()/get_nodes_n()

3.1379529561347743

Number of edges of a particular node

In [49]:
get_node_edges('/pmwiki/pmwiki.php/VideoGame/PlagueInc')

207

Testing Common Neighbors

Adding more nodes for comparison

In [50]:
fill_references('/pmwiki/pmwiki.php/VideoGame/Halo3')

In [51]:
fill_references('/pmwiki/pmwiki.php/VideoGame/ResidentEvil2')

In [52]:
fill_references('/pmwiki/pmwiki.php/VideoGame/CrashBandicoot')

In [53]:
fill_references('/pmwiki/pmwiki.php/Film/Convict13')

In [54]:
fill_references('/pmwiki/pmwiki.php/Film/ThePlayhouse')

In [55]:
common_neighbors('/pmwiki/pmwiki.php/VideoGame/PlagueInc', '/pmwiki/pmwiki.php/TabletopGame/Pandemic')

14.0

In [56]:
common_neighbors('/pmwiki/pmwiki.php/VideoGame/ResidentEvil2', '/pmwiki/pmwiki.php/VideoGame/Halo3')

16.0

In [57]:
common_neighbors('/pmwiki/pmwiki.php/Film/TwentyEightDaysLater', '/pmwiki/pmwiki.php/VideoGame/CrashBandicoot')

13.0

In [58]:
common_neighbors('/pmwiki/pmwiki.php/VideoGame/PlagueInc', '/pmwiki/pmwiki.php/Film/Convict13')

0.0

In [59]:
common_neighbors('/pmwiki/pmwiki.php/Film/ThePlayhouse', '/pmwiki/pmwiki.php/Film/Convict13')

5.0

Testing Random Walk

In [60]:
random_walk('/pmwiki/pmwiki.php/VideoGame/Halo3', steps=5).values()

[['/pmwiki/pmwiki.php/VideoGame/Halo3'],
 ['/pmwiki/pmwiki.php/Main/Tyrannicide'],
 ['/pmwiki/pmwiki.php/VideoGame/Halo3'],
 ['/pmwiki/pmwiki.php/Literature/HaloHuntersInTheDark'],
 ['/pmwiki/pmwiki.php/VideoGame/Halo3'],
 ['/pmwiki/pmwiki.php/VideoGame/NinjaGaiden']]