In [1]:
import os

from bs4 import BeautifulSoup as bs
import requests

Creating tools to parse attributes from individual pages

In [2]:
# Root of the wiki being scraped; get_page_html appends '/wiki/<page name>' to it.
BASE_URL = 'https://allthetropes.fandom.com'

In [3]:
# Headers sent with every page request; the value is a desktop Chrome
# User-Agent string.
USER_AGENT = {'User-agent':
              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

In [4]:
def get_page_html(name, timeout=30):
    """Download a wiki page and return it as a parsed BeautifulSoup document.

    Args:
        name: Page name as it appears after ``/wiki/`` in the URL,
            e.g. ``'The_Witcher'``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The page parsed with Python's built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    url = BASE_URL + '/wiki/' + name
    response = requests.get(url, headers=USER_AGENT, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were
    # real wiki content.
    response.raise_for_status()
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    return bs(response.text, 'html.parser')

In [5]:
def get_keywords(page):
    """Return the content of the page's ``<meta name="keywords">`` tag.

    Args:
        page: Parsed page (anything exposing BeautifulSoup's ``find``).

    Returns:
        The raw ``content`` attribute as a string, or ``''`` when the page
        has no keywords meta tag or the tag has no ``content`` attribute.
    """
    tag = page.find('meta', {'name': 'keywords'})
    if tag is None:
        # No keywords tag on this page: report "no keywords" rather than
        # failing with a TypeError on None['content'].
        return ''
    return tag.get('content', '')

In [6]:
def parse_references(p):
    """Collect the titled links found inside an element.

    Args:
        p: Parsed element (anything exposing BeautifulSoup's ``find_all``).

    Returns:
        Dict mapping each link's ``href`` to its ``title``. Anchors missing
        either attribute are skipped; when the same ``href`` occurs more than
        once, the last title wins.
    """
    links = {}
    for anchor in p.find_all('a'):
        title = anchor.get('title')
        href = anchor.get('href')
        # Check for the attributes explicitly rather than wrapping the lookup
        # in a bare ``except``, which would also hide unrelated errors.
        if title is None or href is None:
            continue
        links[href] = title
    return links

In [7]:
def get_examples(page):
    """Extract the entries of the first bulleted list in the page body.

    Only the first ``<ul>`` inside ``div#mw-content-text`` is read. For each
    ``<li>`` the first anchor's ``href`` becomes the key and the item's
    cleaned text the value.

    Args:
        page: Parsed page (anything exposing BeautifulSoup's ``find``).

    Returns:
        Dict mapping ``href`` to item text. Items whose first anchor is
        missing or has no ``href`` are skipped. Returns an empty dict when
        the page has no content div or no list.
    """
    content = page.find('div', id='mw-content-text')
    if content is None:
        return {}
    first_list = content.find('ul')
    if first_list is None:
        return {}

    example_refs = {}
    for item in first_list.find_all('li'):
        anchor = item.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href is None:
            # Nothing to link this item to.
            continue
        # Trim surrounding newlines, turn non-breaking spaces into plain
        # spaces and drop double quotes from the item text.
        text = item.text.strip('\n').replace('\xa0', ' ').replace('"', '')
        example_refs[href] = text
    return example_refs

In [8]:
def parse_text(page):
    """Extract the body text and titled links from the page's paragraphs.

    Args:
        page: Parsed page (anything exposing BeautifulSoup's ``find``).

    Returns:
        Tuple ``(description, references)``. ``description`` is the cleaned
        text of every ``<p>`` in ``div#mw-content-text``, concatenated with
        no separator. ``references`` merges the ``parse_references`` result
        of each paragraph. Returns ``('', {})`` when the page has no content
        div.
    """
    content = page.find('div', id='mw-content-text')
    if content is None:
        # Page without a body: return an empty result instead of failing
        # with AttributeError.
        return '', {}

    description = []
    references = {}
    for paragraph in content.find_all('p'):
        # Trim surrounding newlines, turn non-breaking spaces into plain
        # spaces and drop double quotes.
        cleaned = paragraph.text.strip('\n').replace('\xa0', ' ').replace('"', '')
        description.append(cleaned)
        references.update(parse_references(paragraph))
    return ''.join(description), references

In [9]:
# Fetch and parse the 'The_Witcher' page; `test` is not referenced again in
# the cells visible below.
test = get_page_html('The_Witcher')

In [10]:
def get_page_attributes(name):
    """Fetch one wiki page and gather everything extracted from it.

    Args:
        name: Page name as it appears after ``/wiki/`` in the URL.

    Returns:
        Tuple ``(identifier, description, references, examples, keywords)``
        where ``identifier`` is ``'/wiki/' + name`` and the other items come
        from ``parse_text``, ``get_examples`` and ``get_keywords``.
    """
    soup = get_page_html(name)
    body_text, body_links = parse_text(soup)
    example_links = get_examples(soup)
    meta_keywords = get_keywords(soup)
    return ('/wiki/' + name, body_text, body_links, example_links, meta_keywords)

Connecting to neo4j DB

In [11]:
from neo4j import GraphDatabase, basic_auth

In [12]:
# Connection settings may be overridden through environment variables so real
# credentials never have to be saved in the notebook; the fallbacks are the
# values that were previously hard-coded here.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://0.0.0.0:7687"),
    # The driver takes credentials through the `auth` keyword; `basic_auth`
    # is the helper that builds the token from a user name and password.
    auth=basic_auth(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "neo4j"),
    ),
    encrypted=False,
)

In [13]:
# One module-level session shared by every graph helper defined below; it is
# never closed in the cells visible here.
session = driver.session()

In [14]:
# Scrape The Witcher once and keep the pieces as notebook-level variables;
# several later cells reuse `identifier`, `references`, `examples` and `keywords`.
identifier, description, references, examples, keywords = get_page_attributes('The_Witcher')

In [15]:
def add_node(identifier, keywords):
    """Create or update the ``page`` node for a wiki page.

    Args:
        identifier: Page path stored in the node's ``ref`` property,
            e.g. ``'/wiki/The_Witcher'``.
        keywords: Comma-separated keyword string; it is split on ``','`` and
            stored as a list in the node's ``keywords`` property.
    """
    # Values are sent as query parameters rather than spliced into the Cypher
    # text, so quotes or backslashes in them cannot break or alter the query.
    query = (
        'MERGE (p:page {ref: $ref})\n'
        'SET p.keywords = $keywords'
    )
    session.run(query, {'ref': identifier, 'keywords': keywords.split(',')})

In [16]:
# Create or update the node for The Witcher, using the `keywords` string
# returned by get_page_attributes in an earlier cell.
add_node('/wiki/The_Witcher', keywords)

In [17]:
def add_reference(identifier, references):
    """Link an existing page node to every page it refers to.

    For each ``href -> text`` pair a ``REFERS_TO`` relationship carrying
    ``text`` is merged from the source page to the target page. Nothing is
    written when no node with ``ref == identifier`` exists, because the source
    is looked up with ``MATCH``.

    Args:
        identifier: ``ref`` of the source page node.
        references: Dict mapping target page ``ref`` to the relationship text.
    """
    # The target node is merged on its own first. Merging the whole
    # ``(a)-[r]->(b:page {...})`` pattern in a single clause would create a
    # second copy of the target whenever that page already exists but is not
    # yet linked from the source. Values are passed as parameters so
    # arbitrary link text cannot break the query.
    query = (
        'MATCH (a:page {ref: $source})\n'
        'MERGE (b:page {ref: $target})\n'
        'MERGE (a)-[r:REFERS_TO {text: $text}]->(b)'
    )
    for target, text in references.items():
        session.run(query, {'source': identifier, 'target': target, 'text': text})


In [18]:
# Link The Witcher to the pages in `references`, the paragraph links that
# get_page_attributes returned in an earlier cell.
add_reference('/wiki/The_Witcher', references)

In [19]:
# Link The Witcher to the pages in `examples`, the entries that get_examples
# read from the first bulleted list of the page.
add_reference('/wiki/The_Witcher', examples)

In [20]:
def delete_node(identifier):
    """Delete the page node with the given ``ref`` and all its relationships.

    The query and its parameters are printed before being run.

    Args:
        identifier: ``ref`` of the page node to remove.
    """
    # The identifier is sent as a parameter rather than spliced into the
    # Cypher text, so quotes in it cannot break or alter the query.
    query = 'MATCH (p:page {ref: $ref}) DETACH DELETE p'
    params = {'ref': identifier}
    print(query, params)
    session.run(query, params)
    

In [21]:
# delete_node(identifier)

In [22]:
def delete_all():
    """Remove every node and relationship from the database.

    The Cypher statement is printed before it is sent through the shared
    module-level ``session``.
    """
    wipe_statement = 'MATCH(n) DETACH DELETE n'
    print(wipe_statement)
    session.run(wipe_statement)

In [23]:
# delete_all()

In [24]:
def page_to_graph(name):
    """Scrape one wiki page and store it in the graph.

    Fetches the page attributes, creates or updates the page's node with its
    keywords, then adds the relationships for the paragraph links followed by
    those for the example-list links. The page description is fetched but not
    stored.

    Args:
        name: Page name as it appears after ``/wiki/`` in the URL.
    """
    page_id, _description, body_links, example_links, page_keywords = \
        get_page_attributes(name)
    add_node(page_id, page_keywords)
    for link_set in (body_links, example_links):
        add_reference(page_id, link_set)

In [25]:
# Run the whole fetch-and-store pipeline for The Witcher; this repeats, in one
# call, the add_node / add_reference steps made by hand in earlier cells.
page_to_graph('The_Witcher')

In [26]:
def get_references(identifier):
    """Return the ``ref`` of every page the given page refers to.

    The query text is printed before it is run.

    Args:
        identifier: ``ref`` of the source page node.

    Returns:
        List of ``ref`` values of the nodes reached through an outgoing
        ``REFERS_TO`` relationship, in the order the database returns them.
    """
    # ``[:REFERS_TO]`` filters on the relationship type. Without the colon,
    # ``[REFERS_TO]`` is only a variable name and matches a relationship of
    # any type. The identifier is sent as a parameter so quotes in it cannot
    # break the query.
    query = 'MATCH (p:page {ref: $ref})-[:REFERS_TO]->(x) RETURN x'
    print(query)
    result = session.run(query, {'ref': identifier})
    return [record['x']['ref'] for record in result]

In [27]:
# Show at most ten of the pages linked from `identifier`, which was set to
# '/wiki/The_Witcher' by an earlier cell.
get_references(identifier)[:10]

MATCH (p:page{ref:'/wiki/The_Witcher'})-[REFERS_TO]->(x) return x


['/wiki/Humans_Are_Bastards',
 '/wiki/Hot_Witch',
 '/wiki/Hotter_and_Sexier',
 '/wiki/Heroes_Want_Redheads',
 '/wiki/Heroic_Albino',
 '/wiki/Hannibal_Lecture',
 '/wiki/Black_and_Grey_Morality',
 '/wiki/True_Neutral',
 '/wiki/Grey_and_Gray_Morality',
 '/wiki/Green-Skinned_Space_Babe']

In [28]:
# Scrape and store 'Heroic_Albino', one of the pages listed in the output above.
page_to_graph('Heroic_Albino')

In [29]:
# Scrape and store a page whose name contains a colon.
page_to_graph('The_Witcher_2:_Assassins_of_Kings')

In [30]:
# Show at most ten of the pages linked from the page stored in the previous cell.
get_references('/wiki/The_Witcher_2:_Assassins_of_Kings')[:10]

MATCH (p:page{ref:'/wiki/The_Witcher_2:_Assassins_of_Kings'})-[REFERS_TO]->(x) return x


['/wiki/Your_Terrorists_Are_Our_Freedom_Fighters',
 '/wiki/You_No_Take_Candle',
 '/wiki/We_Used_to_Be_Friends',
 '/wiki/Walking_Armory',
 '/wiki/Fridge_Brilliance',
 '/wiki/Villain_Forgot_to_Level_Grind',
 '/wiki/Ungrateful_Bastard',
 '/wiki/Undying_Loyalty',
 '/wiki/Tutorial_Failure',
 '/wiki/Cutscene_Power_to_the_Max']

In [31]:
# Scrape and store the 'Contractual_Boss_Immunity' page.
page_to_graph('Contractual_Boss_Immunity')