## Script S3

Run all cells to generate the relationships between the nodes in your database. No extra modification is required if you want to use the default Wikidata endpoint.

In [1]:
#THIS NOTEBOOK USES AN INTERNET API TO GET A KNOWLEDGE GRAPH OF A GIVEN ENTITY
#IT NEEDS TO CONNECT TO THE DB TO FETCH THE EXISTING TAG LIST
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from neo4j import GraphDatabase, basic_auth
from neo4j.exceptions import Neo4jError
import neo4j.time
import csv
import json
import time
import os
from flask_restful_swagger_2 import Api, swagger, Schema


def cprint(content,module='DEBUG',*args):
    '''
    Print a colour-tagged, timestamped log line to stdout
    Args:
        content: string, message text
        module: string, tag shown in brackets in front of the message
        *args: optional extra values, printed as a tuple after the message
    '''
    line = '\033[1;32;43m [' + module + '] \033[0m ' + content
    if args:
        # extra values are rendered as str() of the tuple, in a second colour
        line = line + '\033[1;35m' + str(args) + ' \033[0m'
    print(line + time.strftime(" |%Y-%m-%d %H:%M:%S|", time.localtime()))

        
# Connection settings for the Neo4j database. Each value can be overridden
# through the environment (NEO4J_USERNAME / NEO4J_PASSWORD / NEO4J_URI) so the
# credentials do not have to be edited into the notebook. The literals are the
# previously hard-coded values and remain the defaults.
# NOTE(review): a real password should not be kept as a committed default;
# prefer setting NEO4J_PASSWORD in the environment.
DATABASE_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
DATABASE_PASSWORD = os.environ.get("NEO4J_PASSWORD", "spade-discounts-switch")
DATABASE_URL = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
DATA_FILE_PATH = 'dev-to-articles.csv'

def db_fetch_all_tags(session):
    '''
    Read every distinct 'Tag' node that has a HAS_TAG relationship
    Args:
        session: db session,driver.session()
    Return:
        list of Cypher records, each holding one node under key 'n'
    '''
    query = '''
        MATCH (n:Tag)-[:HAS_TAG]-(ARTICLE) RETURN DISTINCT n
        '''

    def _read_tags(tx):
        # the result is turned into a list inside the transaction function
        return list(tx.run(query))

    records = session.read_transaction(_read_tags)
    cprint(str(len(records))+'record fetched','DB')
    return records
def serialize_tag(tag):
    '''
    Copy the fields of a Tag node that this notebook uses into a plain dict
    Args:
        tag: mapping-like Tag node, read by key
    Return:
        dict with keys 'name' and 'keywords_for_search'
    '''
    serialized = {}
    for field in ('name', 'keywords_for_search'):
        serialized[field] = tag[field]
    return serialized


def db_create_subsume_relation(session,parent_tag,child_tag):
    '''
    Merge both Tag nodes and the relation (parent_tag)-[SUBSUME]->(child_tag)
    Args:
        session: db session,driver.session()
        parent_tag: string, name of tag
        child_tag: string, name of tag
    Return:
        Cypher result (list of records) holding the relation
    Error Return:
        String with specific info when one of the names is empty
    '''
    # guard clauses: both names must be non-empty, parent is checked first
    for role, tag_value in (('parent', parent_tag), ('child', child_tag)):
        if not tag_value:
            return role + ' tag is empty'

    statement = '''
        MERGE (n:Tag {name:$parent_tag})
        MERGE (m:Tag {name:$child_tag})
        MERGE (n)-[subsume:SUBSUME]-> (m)
        RETURN subsume
        '''

    def _merge_relation(tx, parent_name, child_name):
        bindings = {'parent_tag': parent_name , 'child_tag': child_name}
        return list(tx.run(statement, bindings))

    return session.write_transaction(_merge_relation, parent_tag, child_tag)

def db_add_keywords_to_tag(session,tag_name,keyword):
    '''
    Append a keyword to attribute 'keywords_for_search' of a Tag node.
    The attribute is a single string in which keywords are joined by '+';
    a '+' inside the new keyword is replaced by a space before it is stored.
    Args:
        session: db session,driver.session()
        tag_name: string of tag name
        keyword: string of keyword to add
    Return:
        Cypher result of updated node
    Error Return:
        String with specific info ('keyword is empty', 'tag name is empty',
        'No such Tag node', 'keyword is already logged')
    '''
    if not keyword:
        return 'keyword is empty'
    if not tag_name:
        return 'tag name is empty'
    # '+' separates the stored keywords, so it must not appear inside one
    keyword = keyword.replace("+", " ")

    def _cypher_get_node(tx,tag_name):
        return list(tx.run(
        '''
        MATCH (n:Tag {name:$tag_name})
        RETURN n
        ''',{'tag_name':tag_name}
        ))

    def _cypher_update_node(tx,tag_name,new_keyword):
        return list(tx.run(
        '''
        MATCH (n:Tag {name:$tag_name})
        SET n.keywords_for_search = $keyword
        RETURN n
        ''',{'tag_name' : tag_name,'keyword' : new_keyword}
        ))

    tag_node = session.read_transaction(_cypher_get_node,tag_name)
    if not tag_node:
        return 'No such Tag node'

    old_keywords = tag_node[0]['n'].get('keywords_for_search')
    if old_keywords:
        # Compare against whole entries: a plain substring test would report
        # 'java' as already logged when only 'javascript' is stored.
        if keyword in old_keywords.split('+'):
            return 'keyword is already logged'
        new_keyword = old_keywords+'+'+keyword
    else:
        new_keyword = keyword

    return session.write_transaction(_cypher_update_node,tag_name,new_keyword)

    


    
# Open the driver once; the tag-tree cell further down opens its own session from it.
driver = GraphDatabase.driver(DATABASE_URL, auth=basic_auth(DATABASE_USERNAME, str(DATABASE_PASSWORD)))
# The with-block closes the session on exit, so no explicit close() is needed.
with driver.session() as session:
    tag_list = db_fetch_all_tags(session)
    
#EXPORT VAR : tag_list

[1;32;43m [DB] [0m 18582record fetched |2021-05-24 23:05:18|


In [None]:
#QUERY FOR A SINGLE TAG
#THIS CAN RUN SEPARATELY IF VARIABLE 'tag_list' IS REMAPPED
#INPUT: tag_list, db driver
from SPARQLWrapper import SPARQLWrapper, JSON
def is_all_chinese(strs):
    '''
    Tell whether every character lies in the range U+4E00..U+9FA5
    Args:
        strs: string to check
    Return:
        True if no character falls outside the range (this includes the
        empty string), otherwise False
    '''
    return all('\u4e00' <= single <= '\u9fa5' for single in strs)
def get_query(tag):
    '''
    Build the SPARQL query that looks up an entity by its label and walks up
    to three levels of parent entities, with optional zh-cn labels.
    Args: string, name of entity
    Return: string, SPARQL query text for wikidata

    '''
    cprint('Generating query with:'+tag,'WIKIDATA')
    # the label is matched with language tag zh only when the whole name is Chinese
    language_code = 'zh' if is_all_chinese(tag) else 'en'

    # The tag is placed inside a double-quoted SPARQL literal. Escape the
    # characters that would otherwise end or corrupt that literal, so a tag
    # containing a quote, backslash or line break still gives a valid query.
    literal = tag
    for raw, escaped in (('\\', '\\\\'), ('"', '\\"'), ('\n', '\\n'), ('\r', '\\r')):
        literal = literal.replace(raw, escaped)

    query = '''SELECT ?item ?itemLabel ?P1 ?P1Label ?P2 ?P2Label ?P3 ?P3Label ?item_zh ?P1Label_zh ?P2Label_zh ?P3Label_zh 
       WHERE {?item ?label "'''+literal+'''"@'''+language_code+'''. 
       ?article schema:about ?item .?article schema:inLanguage "en" .
       ?article schema:isPartOf <https://en.wikipedia.org/>. 
       ?item (wdt:P279|wdt:P361|wdt:P101|wdt:P425|wdt:P31|wdt:P277) ?P1.
       OPTIONAL { ?P1 (wdt:P279|wdt:P361|wdt:P101|wdt:P425|wdt:P31|wdt:P277) ?P2. }
       OPTIONAL { ?P2 (wdt:P279|wdt:P361|wdt:P101|wdt:P425|wdt:P31|wdt:P277) ?P3. }
       OPTIONAL {?item rdfs:label ?item_zh filter (lang(?item_zh) = "zh-cn")}.
       OPTIONAL {?P1 rdfs:label ?P1Label_zh filter (lang(?P1Label_zh) = "zh-cn")}.
       OPTIONAL {?P2 rdfs:label ?P2Label_zh filter (lang(?P2Label_zh) = "zh-cn")}.
       OPTIONAL {?P3 rdfs:label ?P3Label_zh filter (lang(?P3Label_zh) = "zh-cn")}.
       SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
      }'''
    return query

def get_results(query, endpoint_url="https://query.wikidata.org/sparql"):
    '''
    Get wikidata query result
    Args:
        query: query script
        endpoint_url : "https://query.wikidata.org/sparql"
    Return:
        The converted JSON response (a dict). The result rows are the list
        res['results']['bindings']; each row maps a column name such as
        'itemLabel', 'P1Label' or 'P1Label_zh' to a dict whose 'value' entry
        holds the text, e.g.
            res['results']['bindings'][0]['P1Label']['value'] == "software developer"
        False is returned when the query fails or yields no rows.
    '''
    user_agent = "WDQS-Example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
  #  user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36"
    #adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    try:
        res = sparql.query().convert()
        row_count = len(res['results']['bindings'])
    except Exception as error:
        # Exception rather than a bare except: KeyboardInterrupt / SystemExit
        # must still be able to stop a long batch run.
        #No distinct exception parameter for HTTPError from urllib2  & EndPointInternalError
        #source code : https://rdflib.dev/sparqlwrapper/doc/1.6.0/SPARQLWrapper.Wrapper-pysrc.html
        #guidance: https://sparqlwrapper.readthedocs.io/en/latest/main.html
        time.sleep(1)
        cprint('Query failed:','WIKIDATA',error)
        return False
    cprint(str(row_count)+' records fetched','WIKIDATA')
    if row_count == 0:
        return False
    return res
    
def file_write_log(path,info):
    '''
    Append one timestamped line to a log file
    Args:
        path: path of the log file, opened in append mode as utf-8
        info: value to log, converted with str()
    '''
    stamp = time.strftime(" |%Y-%m-%d %H:%M:%S|", time.localtime())
    # the with-block closes the file on exit, so no explicit close() is needed
    with open(path, mode='a+',encoding="utf-8") as file_a:
        file_a.write(str(info) + stamp + '\n')


def _binding_value(binding, column):
    '''Return the 'value' of one column of a result row, or None if the column is absent'''
    cell = binding.get(column)
    return cell['value'] if cell else None


def _store_binding(session, tag_name, binding):
    '''Write one wikidata result row (labels, parent chain, zh labels) into the db'''
    #handle itemLabel col
    item_label = _binding_value(binding, 'itemLabel')
    if item_label is not None:
        db_add_keywords_to_tag(session, tag_name, item_label)

    #handle P1Label..P3Label cols: P1 subsumes the tag, P2 subsumes P1, P3 subsumes P2
    parents = [_binding_value(binding, column) for column in ('P1Label', 'P2Label', 'P3Label')]
    child = tag_name
    for parent in parents:
        if parent is not None:
            # a missing child name is passed as None; the db helper then
            # returns its 'child tag is empty' message instead of raising
            db_create_subsume_relation(session, parent, child)
            db_add_keywords_to_tag(session, parent, parent)
        child = parent

    #handle item_zh and P1Label_zh..P3Label_zh cols: each zh label is a keyword of its own node
    zh_columns = ('item_zh', 'P1Label_zh', 'P2Label_zh', 'P3Label_zh')
    for owner, column in zip([tag_name] + parents, zh_columns):
        zh_label = _binding_value(binding, column)
        if zh_label is not None:
            db_add_keywords_to_tag(session, owner, zh_label)


def db_generate_tag_tree(session,tag_list,start=1000):
    '''
    Query wikidata for every tag and store the returned labels and parent
    relations in the db. Tags without a wikidata result are appended to
    'wikidata-error-log.txt' and skipped.
    Args:
        session: db session,driver.session()
        tag_list: return value from db_fetch_all_tags(session)
        start: non-negative index of the first tag to process; earlier tags
               are skipped. Defaults to 1000, the offset that used to be
               hard-coded. Pass 0 to process the whole list.

    Return: tag_list, unchanged

    '''
    tag_number = len(tag_list)
    # counter is the position of the tag in tag_list, so the value printed last
    # can be passed as 'start' to resume an interrupted run
    for counter, tag in enumerate(tag_list[start:], start):
        tag_name = serialize_tag(tag['n'])['name']
        #request wikidata with tag_name
        wiki_results = get_results(get_query(tag_name))
        if not wiki_results:
            file_write_log('wikidata-error-log.txt',tag_name)
            continue
        time.sleep( 5 )
        #init origin tag (once per tag; it does not depend on the result row)
        db_add_keywords_to_tag(session,tag_name,tag_name)
        for res in wiki_results['results']['bindings']:
            _store_binding(session, tag_name, res)
        cprint('Knowledge Graph Built: '+tag_name+' ['+str(counter)+'/'+str(tag_number)+']', 'DB')
        print(counter)
    return tag_list

# Build the tag tree for the fetched tags.
# The with-block closes the session on exit, so no explicit close() is needed.
with driver.session() as session:
    db_generate_tag_tree(session,tag_list)
    


[1;32;43m [WIKIDATA] [0m Generating query with:SumUp |2021-05-25 00:58:33|
[1;32;43m [WIKIDATA] [0m 3 records fetched |2021-05-25 00:58:35|
[1;32;43m [DB] [0m Knowledge Graph Built: SumUp [1000/18582] |2021-05-25 00:58:40|
1000
[1;32;43m [WIKIDATA] [0m Generating query with:Sybil Attack |2021-05-25 00:58:40|
[1;32;43m [WIKIDATA] [0m 0 records fetched |2021-05-25 00:58:41|
[1;32;43m [WIKIDATA] [0m Generating query with:VOTE |2021-05-25 00:58:41|
[1;32;43m [WIKIDATA] [0m 0 records fetched |2021-05-25 00:58:42|
[1;32;43m [WIKIDATA] [0m Generating query with:Online Content |2021-05-25 00:58:42|
[1;32;43m [WIKIDATA] [0m 0 records fetched |2021-05-25 00:58:43|
[1;32;43m [WIKIDATA] [0m Generating query with:Lie Group |2021-05-25 00:58:43|
[1;32;43m [WIKIDATA] [0m 0 records fetched |2021-05-25 00:58:45|
[1;32;43m [WIKIDATA] [0m Generating query with:Continuous Time Optimal Control Problems |2021-05-25 00:58:45|
[1;32;43m [WIKIDATA] [0m 0 records fetched |2021-05-25 00