In [1]:
import numpy as np
import rdflib
from SPARQLWrapper import SPARQLWrapper, JSON
import time
import copy
import logging
import re

# Progress messages in this notebook are emitted with logging.warning, so
# WARNING is the threshold at which they are shown.
logging.basicConfig(level=logging.WARNING)

In [2]:
def query(query, result_format="JSON", wrapper="http://dbpedia.org/sparql"):
    """
    Send a SPARQL query to an endpoint and return the result bindings.

    query: SPARQL query text.
    result_format: when equal to "JSON" the client is asked for JSON results;
        any other value leaves the client's return format untouched.
    wrapper: URL of the SPARQL endpoint to contact.

    Returns whatever is stored under ["results"]["bindings"] in the
    converted response.
    """
    client = SPARQLWrapper(wrapper)
    client.setQuery(query)

    wants_json = result_format == "JSON"
    if wants_json:
        client.setReturnFormat(JSON)

    response = client.query().convert()
    bindings = response["results"]["bindings"]
    return bindings

In [3]:
def clean_string(string, regex=r'[^a-zA-Z\s0-9]+'):
    """
    Strip every match of `regex` from `string` and return the remainder.

    With the default pattern, everything except the letters a-z / A-Z,
    the digits 0-9 and whitespace is removed.
    """
    pattern = re.compile(regex)
    cleaned = pattern.sub('', string)
    return cleaned

In [4]:
def parse_sparql_results(result):
    """
    Extract the last path segment of every `result` binding.

    result: iterable of bindings, each expected to look like
        {'result': {'value': '<text containing "/" separators>'}}.

    Returns a list holding, for each well-formed binding, the text after the
    final '/' of its value. Bindings that lack the expected structure are
    skipped. Returns None when `result` is empty or None.
    """
    if not result:
        return None

    names = []
    for entry in result:
        try:
            names.append(entry['result']['value'].split('/')[-1])
        except (KeyError, IndexError, TypeError, AttributeError) as err:
            # Malformed binding: skip it. Only the lookup errors such a
            # binding can raise are caught, so KeyboardInterrupt, SystemExit
            # and genuine programming errors still propagate.
            logging.debug(' skipping malformed binding %r: %s', entry, err)
    return names

In [5]:
def one_hop(word):
    """
    Run the three one-hop SPARQL lookups for a single resource.

    word: resource identifier that create_query substitutes into each query.

    Returns a list of three lists with the parsed results for the
    'subClassOf', 'broader' and 'category' predicates, in that order. A
    predicate whose query fails twice, or yields no results, contributes [].
    """
    res = []
    for predicate in ['subClassOf', 'broader', 'category']:
        q = create_query(word, predicate)
        try:
            query_res = query(q)
        except Exception:
            # Catch Exception rather than everything so that Ctrl-C and
            # SystemExit can still stop a long-running crawl.
            # First failure: back off, then retry exactly once.
            time.sleep(5)
            try:
                query_res = query(q)
            except Exception as err:
                # Second failure: record an empty result, but leave a trace
                # so missing data can be told apart from "nothing found".
                logging.warning(' query failed twice for {} ({}): {}'.format(word, predicate, err))
                res.append([])
                continue
        result = parse_sparql_results(query_res)
        if result:
            res.append(result)
            # short pause before issuing the next request
            time.sleep(0.2)
        else:
            res.append([])
    return res

In [6]:
def create_query(word, predicate):
    """
    Build the SPARQL query text for one resource and one predicate kind.

    word: resource IRI; it is placed between '<' and '>' in the query.
    predicate: one of 'category', 'subClassOf' or 'broader'.

    Returns the query as a string; every query binds its answers to ?result.

    Raises ValueError when `predicate` is not one of the supported kinds.
    """
    # NOTE(review): `word` is interpolated verbatim inside <...>; a value
    # containing '>' or whitespace would yield a malformed query.
    if predicate == 'category':
        template = """
                    PREFIX dcterms: <http://purl.org/dc/terms/>

                    SELECT ?result
                    WHERE {
                    <%(word)s> dcterms:subject ?result
                    }
                 """
    elif predicate == 'subClassOf':
        template = """
                    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

                    SELECT DISTINCT ?result
                    WHERE {
                      <%(word)s> a ?c1 ; a ?result .
                      ?c1 rdfs:subClassOf? ?result .
                    }
                 """
    elif predicate == 'broader':
        template = """
                    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                    PREFIX dcterms: <http://purl.org/dc/terms/>

                    select ?result
                    WHERE { 
                      <%(word)s> dcterms:subject ?o .
                      ?o skos:broader ?result 
                    }
                 """
    else:
        # Fail with a clear message instead of falling through to an
        # unbound local variable.
        raise ValueError(
            "unsupported predicate {!r}; expected 'category', 'subClassOf' or 'broader'".format(predicate)
        )

    return template % {'word': word}

In [7]:
path = '../data/spotlight_responses.npy'

# prepare_data unwraps this array with .item().items(), i.e. the file holds a
# pickled Python object. numpy >= 1.16.3 refuses to load such files unless
# allow_pickle=True is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data = np.load(path, allow_pickle=True)

In [8]:
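# Features gathered for each event (keys of the per-event dict):
#
#   resources  - resource entries taken from the spotlight responses
#   subClassOf - cleaned results of the 'subClassOf' query for each resource
#   broader    - cleaned results of the 'broader' query for each resource
#   types      - non-empty type strings taken from the spotlight responses
#   categories - cleaned results of the 'category' query for each resource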



In [9]:
def prepare_data(event_data):
    """
    Reduce the loaded responses to the resources and types of each event.

    event_data: object whose .item() returns a mapping from a key to that
        key's response. Only the first element of a response is read; it is
        iterated as a sequence of mappings, and the first value of each
        mapping is indexed at [0] (resource) and [1] (types).

    Returns {key: {'resources': [...], 'types': [...]}}. Keys with a falsy
    response are left out, and an empty-string types entry is not recorded.
    """
    prepared = {}
    for event_key, response in event_data.item().items():
        # nothing to collect for events without a response
        if not response:
            continue

        found_resources = []
        found_types = []
        for annotation in response[0]:
            payload = list(annotation.values())[0]
            resource, resource_types = payload[0], payload[1]

            found_resources.append(resource)
            # keep the types only when some were reported
            if resource_types != '':
                found_types.append(resource_types)

        prepared[event_key] = {'resources': found_resources, 'types': found_types}

    return prepared

In [10]:
# Reduce the loaded responses to {key: {'resources': [...], 'types': [...]}}.
processed_data = prepare_data(data)

In [None]:
def enrich_data(proc_data):
    """
    Add the one-hop SPARQL results to every event.

    proc_data: mapping from a key to a dict that has a 'resources' list.

    For each event, one_hop is called on every resource and the cleaned
    results are merged into three de-duplicated, sorted lists stored under
    'subClassOf', 'broader' and 'categories'.

    The input mapping is modified in place, and the same object is returned.
    """
    logging.warning(' events found: {}'.format(len(proc_data)))

    # number of resources handled since the last progress message
    n_req = 0

    for i, (key, value) in enumerate(proc_data.items()):

        resources = value['resources']
        n_req += len(resources)

        subclass_of = set()
        broader = set()
        categories = set()

        for entry in resources:
            # one_hop returns its results in the order subClassOf, broader, category
            results = one_hop(entry)
            subclass_of.update(clean_string(x) for x in results[0])
            broader.update(clean_string(x) for x in results[1])
            categories.update(clean_string(x) for x in results[2])

        # sorted() instead of list(): set iteration order of strings changes
        # between interpreter runs, which made the output non-reproducible.
        value['subClassOf'] = sorted(subclass_of)
        value['broader'] = sorted(broader)
        value['categories'] = sorted(categories)

        # progress message after every 10th event
        if (i + 1) % 10 == 0:
            logging.warning(' finished {} requests'.format(n_req))
            n_req = 0

    return proc_data

In [None]:
# Runs the SPARQL lookups for every event. enrich_data writes the new keys
# into processed_data itself and returns that same dict, so `bla` aliases it.
bla = enrich_data(processed_data)



In [None]:
# Show the enriched record stored under one key (presumably an event id — confirm).
bla['-KYGWvCQuzPXd6Kb-Gwp']