# Exercise 4 Custom: Assembling your own graph from Wikidata

## Introduction

We are now going to assemble our own graph based on our own search terms!  This exercise is really just for fun, so make it your own.  In particular, be sure to adjust your P-codes (the Wikidata property IDs, such as P31 for "instance of") so that they reflect claims that are appropriate to your graph.

The remainder of the course will use our pre-populated graphs; the code here is just the same framework that we used before.  As a reminder, don't get frustrated if Wikidata doesn't cooperate: get a cup of coffee, restart the kernel, and try again.  Have fun!  :)

In [None]:
%matplotlib inline

import json
import re
import urllib
from pprint import pprint
import time
from tqdm import tqdm

from neo4j import GraphDatabase

import numpy as np
import pandas as pd
import wikipedia

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span, Token

from pywikibot.data import api
import pywikibot
import wikipedia

# Report the installed versions of the three main libraries used below.
print(spacy.__version__)
print(pywikibot.__version__)
print(wikipedia.__version__)

In [None]:
# Plain pipeline (no noun-chunk merging); used by remove_stop_words_and_punct.
non_nc = spacy.load('en_core_web_md')

# Second copy of the same model with the 'merge_noun_chunks' component added;
# this is the pipeline applied to the Wikipedia summary text.
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('merge_noun_chunks')

# Show the components of the noun-chunk-merging pipeline.
print(nlp.pipe_names)

In [None]:
# Put your own search term here (the string passed to wikipedia.summary is left
# empty on purpose and must be filled in before running this cell).
# Be sure to check what spacy identifies as the named entities.  Make sure that there are
# enough of them to make for an interesting graph.  But remember that the more you have
# the longer it will take to query Wikidata.

text = wikipedia.summary('')
doc = nlp(text)
text

In [None]:
# Collect the distinct entity texts found by spaCy, in order of first appearance,
# skipping entity labels listed in ent_ignore_ls.
ent_ignore_ls = ['DATE']
ner_list = []

for el in doc.ents:
    # Skip ignored labels and texts that were already collected.
    if el.label_ in ent_ignore_ls or el.text in ner_list:
        continue
    # The original re-ran the whole pipeline on el.text here and discarded the
    # result; that unused (and expensive) call has been removed.
    ner_list.append(el.text)

ner_list[0:5]

In [None]:
def remove_special_characters(text):
    """Return *text* with every newline, carriage return and tab replaced by a space.

    Each matched character becomes exactly one space; runs are not collapsed.
    """
    return re.sub(r'[\n\r\t]', " ", text)


def remove_stop_words_and_punct(text, print_text=False):
    """Drop stop words, punctuation and whitespace tokens from *text*.

    The text is tokenised with the notebook-level ``non_nc`` pipeline and the
    surviving tokens are joined with single spaces.

    Parameters:
        text: the string to clean.
        print_text: when true, print each token with its ``is_stop`` flag
            followed by a separator line.

    Returns:
        The space-joined string of kept tokens (empty if none survive).
    """
    kept = []

    for tok in non_nc(text):
        if print_text:
            print(tok, tok.is_stop)
            print('--------------')
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        kept.append(str(tok))

    return ' '.join(kept)

In [None]:
# Normalise every entity text (control characters -> spaces, then stop words and
# punctuation removed) and keep the distinct, non-empty results in order.
node_text_ls = []

for el in ner_list:
    clean_text = remove_special_characters(el)
    no_sw = remove_stop_words_and_punct(clean_text)
    # An entity made only of stop words / punctuation cleans down to '';
    # skip it so an empty search term is never sent to Wikidata later on.
    if no_sw and no_sw not in node_text_ls:
        node_text_ls.append(no_sw)

node_text_ls

In [None]:
def getItems(site, itemtitle):
    """Submit a 'wbsearchentities' request for *itemtitle* and return the response.

    The request asks for English-language results of type 'item' in JSON
    format and is sent through *site*.
    """
    request = api.Request(
        site=site,
        action='wbsearchentities',
        format='json',
        language='en',
        type='item',
        search=itemtitle,
    )
    return request.submit()

def getItem(site, wdItem, token):
    """Submit a 'wbgetentities' request for the id(s) in *wdItem* and return the response.

    *token* is accepted but not used by this function.
    """
    params = {'action': 'wbgetentities', 'format': 'json', 'ids': wdItem}
    return api.Request(site=site, **params).submit()

def prettyPrint(variable):
    """Pretty-print *variable* to stdout using a 4-space indent."""
    # The notebook binds the name `pprint` to the pprint *function*
    # (`from pprint import pprint`), so `pprint.PrettyPrinter` raises
    # AttributeError.  Import the class directly instead.
    from pprint import PrettyPrinter

    PrettyPrinter(indent=4).pprint(variable)

# Read the Wikidata API token from a local file and create the Wikidata site objects.
# The file is opened in a `with` block so the handle is closed right after reading.
with open('.wiki_api_token') as token_file:
    token = token_file.read()
wikidata = pywikibot.Site('wikidata', 'wikidata')
site = pywikibot.Site("wikidata", "wikidata")

In [None]:
%time

item_ls = []
i = 0

for el in node_text_ls:

    wikidataEntries = getItems(site, el)
    try:
        tup = (wikidataEntries['search'][0]['id'], el)
        item_ls.append(tup)
    except:
        i += 1
        print('Missing ', i,'th entry for ', el)
    
dedup_item_ls = []

for item in item_ls:
    if item not in dedup_item_ls:
        dedup_item_ls.append(item)
        
dedup_item_ls

In [None]:
# Choose some P-codes that are appropriate to your initial search

%%time
p_dc = {
    
       }

full_node_tup_ls = []

for el in tqdm(item_ls):
    itempage = pywikibot.ItemPage(wikidata, el[0])
    itemdata = itempage.get()
    source_node = itemdata['labels']['en']
    #print(el, source_node)

    for key in p_dc.keys():
        #print(source_node, key, p_dc[key])
        #print(itemdata['claims'])
        try:
            for i in itemdata['claims'][key]:
                target = i.getTarget()
                #print(target.id)
                tup = (source_node, el[0], key, p_dc[key], target.labels['en'], target.id)
                if tup not in full_node_tup_ls:
                    full_node_tup_ls.append(tup)
        except:
            continue

In [None]:
# Preview the first five collected relationship tuples.
full_node_tup_ls[:5]

In [None]:
# One row per collected tuple; the column order matches the tuple layout.
df = pd.DataFrame(
    data=full_node_tup_ls,
    columns=[
        'source_name',
        'source_q',
        'rel_p',
        'rel_name',
        'target_name',
        'target_q',
    ],
)
df.head()

In [None]:
# (number of relationship rows, number of columns)
df.shape

In [None]:
class Neo4jConnection:
    """Small wrapper around a Neo4j driver that runs one query per session.

    If the driver cannot be created, the error is printed and the instance is
    left without a driver; ``query`` then raises ``AssertionError``.

    The object can also be used as a context manager, in which case the
    driver is closed on exit.
    """

    def __init__(self, uri, user, pwd):
        """Store the connection settings and try to create the driver."""
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def __enter__(self):
        """Return the connection itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; exceptions propagate."""
        self.close()

    def close(self):
        """Close the driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run *query* in a fresh session and return the records as a list.

        Parameters:
            query: the query text passed to ``session.run``.
            parameters: optional parameters passed to ``session.run``.
            db: optional database name; when ``None`` the session is opened
                without a ``database`` argument.

        Returns:
            A list of the records produced by the query, or ``None`` if
            opening the session or running the query raised (the error is
            printed, not re-raised).

        Raises:
            AssertionError: if no driver was created.
        """
        # Raised explicitly (instead of an `assert` statement) so the guard
        # still fires when Python runs with -O; the exception type is unchanged.
        if self.__driver is None:
            raise AssertionError("Driver not initialized!")
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

In [None]:
# Connection settings: uri and pwd are left blank here and must be filled in.
uri = ''
user = 'neo4j'
pwd = ''

# Open the connection and count the nodes currently in the graph.
conn = Neo4jConnection(uri=uri, user=user, pwd=pwd)
conn.query("MATCH (n) RETURN COUNT(n)")

In [None]:
class Neo4jConnection:
    """Small wrapper around a Neo4j driver that runs one query per session.

    If the driver cannot be created, the error is printed and the instance is
    left without a driver; ``query`` then raises ``AssertionError``.

    The object can also be used as a context manager, in which case the
    driver is closed on exit.
    """

    def __init__(self, uri, user, pwd):
        """Store the connection settings and try to create the driver."""
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def __enter__(self):
        """Return the connection itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; exceptions propagate."""
        self.close()

    def close(self):
        """Close the driver if one was created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run *query* in a fresh session and return the records as a list.

        Parameters:
            query: the query text passed to ``session.run``.
            parameters: optional parameters passed to ``session.run``.
            db: optional database name; when ``None`` the session is opened
                without a ``database`` argument.

        Returns:
            A list of the records produced by the query, or ``None`` if
            opening the session or running the query raised (the error is
            printed, not re-raised).

        Raises:
            AssertionError: if no driver was created.
        """
        # Raised explicitly (instead of an `assert` statement) so the guard
        # still fires when Python runs with -O; the exception type is unchanged.
        if self.__driver is None:
            raise AssertionError("Driver not initialized!")
        session = None
        response = None
        try:
            session = self.__driver.session(database=db) if db is not None else self.__driver.session()
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

In [None]:
# NOTE(review): this cell repeats the connection setup of the earlier cell;
# running it rebinds `conn` to a new Neo4jConnection.
# Connection settings: uri and pwd are left blank here and must be filled in.
uri = ''
user = 'neo4j'
pwd = ''

# Open the connection and count the nodes currently in the graph.
conn = Neo4jConnection(uri=uri, user=user, pwd=pwd)
conn.query("MATCH (n) RETURN COUNT(n)")

In [None]:
# Create (if absent) a constraint named q_value requiring :Node.id to be unique.
# NOTE(review): the `ON ... ASSERT` form is, as far as I know, the older Cypher
# constraint syntax; newer Neo4j releases use `FOR ... REQUIRE` — confirm
# against the server version in use.
conn.query('CREATE CONSTRAINT q_value IF NOT EXISTS ON (n:Node) ASSERT n.id IS UNIQUE')

In [None]:
# Build one (name, id) frame from the source side and one from the target side
# of every relationship, then stack them and drop repeated rows.
source_df = (
    df[['source_name', 'source_q']]
    .drop_duplicates()
    .rename(columns={'source_name': 'name', 'source_q': 'id'})
)
target_df = (
    df[['target_name', 'target_q']]
    .drop_duplicates()
    .rename(columns={'target_name': 'name', 'target_q': 'id'})
)
all_nodes_df = pd.concat([source_df, target_df]).drop_duplicates()
all_nodes_df.shape

In [None]:
def get_p31(row):
    """Return the English label of the first P31 ("instance of") target of an item.

    Parameters:
        row: the value passed to ``pywikibot.ItemPage`` as the item id (the
            notebook maps this function over the ``id`` column).

    Returns:
        The English label of the first P31 claim's target, or ``'Unknown'``
        when the claim, its target, or the English label cannot be obtained.
    """
    itempage = pywikibot.ItemPage(wikidata, row)
    itemdata = itempage.get()
    try:
        target = itemdata['claims']['P31'][0].getTarget()
        target.get()
        return target.labels['en']
    except Exception:
        # `Exception` rather than a bare `except` so that Ctrl-C / kernel
        # interrupt is not swallowed while this runs over many rows.
        return 'Unknown'
    

def add_nodes(rows, batch_size=10000):
    """Add the rows to the Neo4j graph as :Node nodes, in batches.

    Each row supplies ``id``, ``name`` and ``node_label``; the latter is
    stored in the node's ``type`` property.

    Parameters:
        rows: the rows to insert; forwarded unchanged to ``insert_data``.
        batch_size: number of rows sent per query.

    Returns:
        Whatever ``insert_data`` returns for this query.
    """
    # Merge on the id alone and then set the other properties.  Merging on the
    # full {name, id, type} map would try to create a second node whenever an
    # existing id has a different name or type, which conflicts with the
    # uniqueness constraint this notebook creates on :Node(id).
    query = '''UNWIND $rows AS row
               MERGE (n:Node {id: row.id})
               SET n.name = row.name, n.type = row.node_label
               RETURN count(*) as total
    '''
    return insert_data(query, rows, batch_size)


def add_edges(rows, batch_size=50000, edge=None):
    """Create one relationship of type *edge* per row, in batches.

    For every row the nodes with ids ``row.source_q`` and ``row.target_q``
    are matched and a relationship is created from the former to the latter.

    Parameters:
        rows: the rows to process; forwarded unchanged to ``insert_data``.
        batch_size: number of rows sent per query.
        edge: the relationship type.  When omitted, a module-level variable
            named ``edge`` is used if one exists (the original body relied on
            such a global).

    Returns:
        Whatever ``insert_data`` returns for this query.

    Raises:
        ValueError: if no relationship type is available.
    """
    if edge is None:
        # Backward compatibility with callers that set a global `edge`.
        edge = globals().get('edge')
    if not edge:
        raise ValueError(
            "add_edges needs a relationship type: pass edge=... "
            "or define a module-level `edge` variable"
        )

    # Backtick-quote the type so names containing spaces or other special
    # characters still form valid Cypher; embedded backticks are doubled.
    rel_type = str(edge).replace('`', '``')

    query = """UNWIND $rows AS row
               MATCH (src:Node {id: row.source_q}), (tar:Node {id: row.target_q})
               CREATE (src)-[:`%s`]->(tar)
    """ % rel_type

    return insert_data(query, rows, batch_size)


def insert_data(query, rows, batch_size = 10000):
    """Run *query* against Neo4j once per batch of *rows*.

    Each batch is the slice ``rows[start:start + batch_size]`` converted with
    ``.to_dict('records')`` and passed as the ``rows`` query parameter through
    the notebook-level ``conn``.  A progress dict is printed after every batch.

    Parameters:
        query: the query text to run for each batch.
        rows: sliceable collection whose slices provide ``to_dict('records')``.
        batch_size: number of rows per batch.

    Returns:
        A dict with the accumulated ``total``, the number of ``batches`` and
        the elapsed ``time`` in seconds, or ``None`` when *rows* is empty.
    """
    total = 0
    batch = 0
    start = time.time()
    result = None

    for offset in range(0, len(rows), batch_size):
        chunk = rows[offset:offset + batch_size]
        res = conn.query(query, parameters={'rows': chunk.to_dict('records')})
        try:
            total += res[0]['total']
        except (TypeError, IndexError, KeyError):
            # The query failed (res is None), returned no records, or did not
            # return a `total` column: the batch contributes nothing.
            pass
        batch += 1
        result = {"total":total, "batches":batch, "time":time.time()-start}
        print(result)

    return result

In [None]:
%%time
all_nodes_df['node_label'] = all_nodes_df['id'].map(get_p31)
all_nodes_df.head()

In [None]:
# Write all collected nodes (name, id, node_label) to Neo4j.
add_nodes(all_nodes_df)

In [None]:
# Distinct relationship names, in order of first appearance.
edge_ls = df['rel_name'].drop_duplicates().tolist()

In [None]:
# Group :Node nodes by name and, for every name shared by more than one node,
# DETACH DELETE all nodes in that group (no copy is kept).
# NOTE(review): presumably meant to drop ambiguous names entirely — confirm
# that keeping one node per name is not what was intended.
query = """MATCH (n:Node) 
           WITH n.name AS name, COLLECT(n) AS nodes 
           WHERE SIZE(nodes)>1 
           FOREACH (el in nodes | DETACH DELETE el)
"""

conn.query(query)

In [None]:
# Store apoc.convert.toStringList(n.type) in a new `type_ls` property on every :Node.
# NOTE(review): presumably this turns the single type string into a list of
# strings; requires the APOC plugin on the server — confirm.
query = """MATCH (n:Node) 
           SET n.type_ls = apoc.convert.toStringList(n.type)
"""

conn.query(query)

In [None]:
# Call apoc.create.addLabels on every :Node with its `type_ls` property and
# return the resulting nodes.
# NOTE(review): presumably adds each entry of type_ls as an extra node label;
# requires the APOC plugin on the server — confirm.
query = """MATCH (n:Node) 
           CALL apoc.create.addLabels(n, n.type_ls) 
           YIELD node RETURN node
"""

conn.query(query)