In [70]:
from bs4 import BeautifulSoup as bs
import requests

Creating tools to parse attributes from individual pages

In [2]:
# Scheme and host of the wiki; page paths are appended to this when fetching.
BASE_URL = "https://allthetropes.fandom.com"

In [3]:
# Request headers sent with every page download: a desktop Chrome User-agent.
USER_AGENT = {
    'User-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/74.0.3729.169 Safari/537.36'
    ),
}

In [4]:
def get_page_html(name):
    """Download a wiki page and parse it into a BeautifulSoup tree.

    Args:
        name: Page name as it appears after ``/wiki/`` in the URL,
            e.g. ``'The_Witcher'``.

    Returns:
        A ``BeautifulSoup`` document built from the response body.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # Module-level constants are only read here, so no ``global`` is needed.
    url = BASE_URL + '/wiki/' + name
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=USER_AGENT, timeout=30)
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, warns about it, and may build different trees per machine.
    return bs(response.text, 'html.parser')

In [5]:
def get_keywords(page):
    """Return the ``content`` of the page's ``<meta name="keywords">`` tag.

    Args:
        page: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The raw ``content`` string of the keywords meta tag, or ``''`` when
        the tag or its ``content`` attribute is missing. The string is
        returned unsplit; ``add_node`` splits it on commas.
    """
    meta = page.find('meta', {'name': 'keywords'})
    if meta is None:
        # Pages without a keywords tag used to raise TypeError here.
        return ''
    return meta.get('content', '')

In [537]:
def parse_references(p):
    """Collect the titled links found inside an element.

    Args:
        p: Element exposing a BeautifulSoup-style ``find_all`` method.

    Returns:
        Dict mapping ``href`` to ``title`` for every ``<a>`` that carries
        both attributes. Anchors missing either attribute are skipped; when
        the same ``href`` occurs more than once the last title wins.
    """
    links = {}
    for anchor in p.find_all('a'):
        title = anchor.get('title')
        href = anchor.get('href')
        # Check explicitly for missing attributes rather than wrapping the
        # lookups in a bare ``except``, which would hide unrelated errors.
        if title is None or href is None:
            continue
        links[href] = title
    return links

In [377]:
def get_examples(page):
    """Map each example's first link to the example's text.

    The examples are the ``<li>`` items found under the first ``<ul>`` inside
    the ``div`` whose id is ``mw-content-text``.

    Args:
        page: Parsed page exposing BeautifulSoup-style ``find``/``find_all``.

    Returns:
        Dict mapping the ``href`` of the first ``<a>`` in each item to the
        item's text (surrounding newlines stripped, non-breaking spaces turned
        into spaces, double quotes removed). Items whose first anchor is
        missing or has no ``href`` are skipped. An empty dict is returned when
        the page has no content div or no list.
    """
    content = page.find('div', id='mw-content-text')
    first_list = content.find('ul') if content is not None else None
    if first_list is None:
        # Nothing to scrape; previously this raised AttributeError/IndexError.
        return {}

    example_refs = {}
    for item in first_list.find_all('li'):
        anchor = item.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href is None:
            continue
        text = item.text.strip('\n').replace('\xa0', ' ').replace('"', '')
        example_refs[href] = text
    return example_refs

In [378]:
def parse_text(page):
    """Extract the description text and titled links from a page's paragraphs.

    Only ``<p>`` elements inside the ``div`` whose id is ``mw-content-text``
    are considered.

    Args:
        page: Parsed page exposing BeautifulSoup-style ``find``/``find_all``.

    Returns:
        A ``(description, references)`` tuple. ``description`` is the cleaned
        text of all paragraphs concatenated; ``references`` is the merged
        result of ``parse_references`` over those paragraphs. ``('', {})`` is
        returned when the page has no content div.
    """
    content = page.find('div', id='mw-content-text')
    if content is None:
        # Previously this raised AttributeError on pages without the div.
        return '', {}

    description = []
    references = {}
    for paragraph in content.find_all('p'):
        cleaned = paragraph.text.strip('\n').replace('\xa0', ' ').replace('"', '')
        description.append(cleaned)
        references.update(parse_references(paragraph))
    # NOTE(review): paragraphs are joined with no separator, so the end of one
    # paragraph runs straight into the start of the next - confirm intended.
    return ''.join(description), references

In [379]:
# Download and parse one sample page.
# NOTE(review): `test` is not read by any later cell visible here.
test = get_page_html('The_Witcher')

In [155]:
def get_page_attributes(name):
    """Fetch a page and return everything the graph import needs from it.

    Args:
        name: Page name as it appears after ``/wiki/`` in the URL.

    Returns:
        A 5-tuple ``(identifier, description, references, examples,
        keywords)`` where ``identifier`` is ``'/wiki/' + name`` and the other
        items come from ``parse_text``, ``get_examples`` and ``get_keywords``
        applied to the downloaded page.
    """
    soup = get_page_html(name)
    page_ref = ''.join(('/wiki/', name))
    summary, links = parse_text(soup)
    return page_ref, summary, links, get_examples(soup), get_keywords(soup)

Connecting to the Neo4j database

In [325]:
from neo4j import GraphDatabase, basic_auth

In [73]:
# Open a Bolt driver for the Neo4j instance.
# Credentials go through the `auth` keyword; `basic_auth` is the helper that
# builds the token, not a keyword argument of `GraphDatabase.driver`.
# NOTE(review): the credentials are hard-coded defaults - consider reading
# them from the environment before sharing this notebook.
driver = GraphDatabase.driver("bolt://0.0.0.0:7687",
                              auth=basic_auth("neo4j", "neo4j"),
                              encrypted=False)

In [327]:
# One session shared by every graph helper below (they read the global `session`).
session = driver.session()

In [429]:
# Scrape the sample page once; the unpacked values are reused by the cells below.
identifier, description, references, examples, keywords = get_page_attributes('The_Witcher')

In [444]:
def add_node(identificator, keywords):
    """Create or update the ``page`` node for one wiki page.

    Args:
        identificator: Value stored in the node's ``ref`` property,
            e.g. ``'/wiki/The_Witcher'``.
        keywords: Comma-separated keyword string; it is split on ``','`` and
            stored on the node as a list in ``p.keywords``.
    """
    # Use the argument itself: the old body read the notebook-level
    # ``identifier`` variable, so the node written did not depend on the
    # value passed in.
    # Values travel as query parameters, so quotes or backslashes in them
    # cannot break (or inject into) the Cypher statement.
    query = 'MERGE (p:page {ref: $ref})\nSET p.keywords = $keywords'
    session.run(query, {'ref': identificator, 'keywords': keywords.split(',')})

In [433]:
# Store the sample page as a node, using the keywords unpacked earlier.
add_node('/wiki/The_Witcher', keywords)

MERGE(p:page{ref:"/wiki/The_Witcher"})
SET p.keywords = ['All The Tropes Wiki', 'allthetropes', 'The Witcher', 'Video Game', 'Shut UP', ' Hannibal', 'Vendor Trash', 'True Neutral', 'The Verse', 'The Witcher 2: Assassins of Kings', 'Karma Houdini', 'Anti-Hero', 'Aristocrats Are Evil', 'Complete Monster']


In [445]:
def add_reference(identifier, references):
    """Link one page node to every page it refers to.

    Args:
        identifier: ``ref`` of the source page node; the node must already
            exist, otherwise the ``MATCH`` finds nothing and nothing is
            written.
        references: Dict mapping the target page's ``ref`` to the text stored
            on the ``REFERS_TO`` relationship.
    """
    # The target node is merged on its own before the relationship. Merging
    # the whole pattern in one clause creates a second copy of the target
    # whenever the relationship does not exist yet, even if a node with that
    # ``ref`` is already in the graph.
    # Values travel as parameters so quotes and newlines in the text cannot
    # break the statement.
    query = (
        'MATCH (a:page {ref: $source})\n'
        'MERGE (b:page {ref: $target})\n'
        'MERGE (a)-[r:REFERS_TO {text: $text}]->(b)'
    )
    for target, text in references.items():
        session.run(query, {'source': identifier, 'target': target, 'text': text})


In [434]:
# Link the sample page to the pages referenced from its paragraphs.
add_reference('/wiki/The_Witcher', references)

MATCH(a:page{ref:"/wiki/The_Witcher"})
MERGE(a)-[r:REFERS_TO{text:"Neverwinter Nights"}]->(b:page{ref:"/wiki/Neverwinter_Nights"})
MATCH(a:page{ref:"/wiki/The_Witcher"})
MERGE(a)-[r:REFERS_TO{text:"The Verse"}]->(b:page{ref:"/wiki/The_Verse"})
MATCH(a:page{ref:"/wiki/The_Witcher"})
MERGE(a)-[r:REFERS_TO{text:"The Witcher 2: Assassins of Kings"}]->(b:page{ref:"/wiki/The_Witcher_2:_Assassins_of_Kings"})
MATCH(a:page{ref:"/wiki/The_Witcher"})
MERGE(a)-[r:REFERS_TO{text:"The Witcher/Characters"}]->(b:page{ref:"/wiki/The_Witcher/Characters"})
MATCH(a:page{ref:"/wiki/The_Witcher"})
MERGE(a)-[r:REFERS_TO{text:"Shut UP, Hannibal"}]->(b:page{ref:"/wiki/Shut_UP,_Hannibal"})


In [435]:
# Link the sample page to the pages named in its example list.
add_reference('/wiki/The_Witcher', examples)

MATCH(a:page{ref:"/wiki/The_Witcher"})
MERGE(a)-[r:REFERS_TO{text:" Action Commands: Sword attacks are chained with correctly timed mouse-clicks."}]->(b:page{ref:"/wiki/Action_Commands"})
MATCH(a:page{ref:"/wiki/The_Witcher"})
MERGE(a)-[r:REFERS_TO{text:" Ambiguously Evil: Possibly Abigail. There's evidence to suggest that the accusations levied against her may have some merit, and she's implied to be a member of the Cult of the Lionheaded Spider. However, Abigail always proves to be helpful to Geralt and appears to be harmless when left alone."}]->(b:page{ref:"/wiki/Ambiguously_Evil"})
MATCH(a:page{ref:"/wiki/The_Witcher"})
MERGE(a)-[r:REFERS_TO{text:" And That's Terrible: Every character (even the addicts, worried about their supply) give significant condemnation at Salamandra taking control of the drug trade. Very out of place in a World Half Empty where rapists are Karma Houdinis.
 However, it is due to the fact that Salamandra is taking over the market with the drug trade as oppos

In [426]:
def delete_node(identifier):
    """Delete the ``page`` node with the given ``ref`` and its relationships.

    Args:
        identifier: ``ref`` property of the node to remove.
    """
    # The ref is passed as a parameter so identifiers containing quotes do
    # not produce a malformed statement.
    query = 'MATCH (p:page {ref: $ref}) DETACH DELETE p'
    print(query)
    session.run(query, {'ref': identifier})
    

In [446]:
# delete_node(identifier)

MATCH(p:page{ref:"/wiki/The_Witcher"}) DETACH DELETE p


In [447]:
def delete_all():
    """Remove every node, together with its relationships, from the graph."""
    cypher = 'MATCH(n) DETACH DELETE n'
    # Echo the statement before it is sent to the shared session.
    print(cypher)
    session.run(cypher)

In [448]:
# delete_all()

MATCH(n) DETACH DELETE n


In [449]:
def page_to_graph(name):
    """Scrape one wiki page and write it, with its outgoing links, to the graph.

    Args:
        name: Page name as it appears after ``/wiki/`` in the URL.
    """
    page_ref, _description, link_refs, example_refs, page_keywords = get_page_attributes(name)
    add_node(page_ref, page_keywords)
    # Paragraph links first, then example links, both from the same source node.
    for targets in (link_refs, example_refs):
        add_reference(page_ref, targets)

In [450]:
# Run the whole scrape-and-store pipeline for the sample page.
page_to_graph('The_Witcher')

In [529]:
def get_references(identifier):
    """List the ``ref`` of every page the given page refers to.

    Args:
        identifier: ``ref`` property of the source ``page`` node.

    Returns:
        List with the ``ref`` of each node reached through an outgoing
        ``REFERS_TO`` relationship, one entry per matched relationship.
    """
    # ``[:REFERS_TO]`` filters by relationship type. Without the colon Cypher
    # treats ``REFERS_TO`` as a variable name and matches any relationship.
    # The ref is passed as a parameter so quotes in it cannot break the query.
    query = 'MATCH (p:page {ref: $ref})-[:REFERS_TO]->(x) RETURN x'
    result = session.run(query, {'ref': identifier})
    print(query)
    return [record['x']['ref'] for record in result]

In [531]:
# Show the first ten pages the sample page links to.
get_references(identifier)[:10]

MATCH (p:page{ref:'/wiki/The_Witcher'})-[REFERS_TO]->(x) return x


['/wiki/Humans_Are_Bastards',
 '/wiki/Hot_Witch',
 '/wiki/Hotter_and_Sexier',
 '/wiki/Heroes_Want_Redheads',
 '/wiki/Heroic_Albino',
 '/wiki/Hannibal_Lecture',
 '/wiki/Black_and_Grey_Morality',
 '/wiki/True_Neutral',
 '/wiki/Grey_and_Gray_Morality',
 '/wiki/Green-Skinned_Space_Babe']

In [535]:
# Import one of the pages listed in the output above.
page_to_graph('Heroic_Albino')

In [538]:
# Import a further page whose name contains a colon.
page_to_graph('The_Witcher_2:_Assassins_of_Kings')

In [541]:
# Show the first ten pages that page links to.
get_references('/wiki/The_Witcher_2:_Assassins_of_Kings')[:10]

MATCH (p:page{ref:'/wiki/The_Witcher_2:_Assassins_of_Kings'})-[REFERS_TO]->(x) return x


['/wiki/Your_Terrorists_Are_Our_Freedom_Fighters',
 '/wiki/You_No_Take_Candle',
 '/wiki/We_Used_to_Be_Friends',
 '/wiki/Walking_Armory',
 '/wiki/Fridge_Brilliance',
 '/wiki/Villain_Forgot_to_Level_Grind',
 '/wiki/Ungrateful_Bastard',
 '/wiki/Undying_Loyalty',
 '/wiki/Tutorial_Failure',
 '/wiki/Cutscene_Power_to_the_Max']