In [None]:
import networkx as nx
import sparql
import pandas as pd
import re
import numpy as np
from numpy import dot
from numpy.linalg import norm
from Levenshtein import distance as levenshtein_distance

In [None]:
# Query-graph templates, keyed by pattern id. Each template is an adjacency
# list: a key is a source node and its list holds the nodes it points to
# (the class below turns these into directed graphs).
patterns = {
    'p0': {'A': []},                                # A (single node)
    'p1': {'A': ['B']},                             # A -> B
    'p2': {'A': ['B'], 'B': ['C']},                 # A -> B -> C
    'p3': {'A': ['B'], 'C': ['B']},                 # A -> B <- C
    'p4': {'A': ['B', 'C']},                        # B <- A -> C
    'p5': {'A': ['B', 'C', 'D']},                   # A -> B, A -> C, A -> D
    'p6': {'A': ['B', 'C'], 'C': ['D']},            # A -> B, A -> C -> D
    'p7': {'A': ['B'], 'B': ['C'], 'C': ['D']},     # A -> B -> C -> D
    'p8': {'A': ['B'], 'B': ['C'], 'D': ['C']},     # A -> B -> C <- D
    'p9': {'A': ['B'], 'B': ['C'], 'D': ['B']},     # A -> B -> C, D -> B
    'p10': {'A': ['B'], 'B': ['C', 'D']},           # A -> B -> C, B -> D
    'p11': {'A': ['B'], 'C': ['B'], 'D': ['B']},    # A -> B, C -> B, D -> B
}


In [None]:
# Predicate IRIs (already wrapped in angle brackets) that are filtered out of
# the relation queries via a SPARQL `NOT IN (...)` clause.
exclusions = [
    '<http://dbpedia.org/property/wikiPageUsesTemplate>',
    '<http://dbpedia.org/ontology/wikiPageExternalLink>',
    '<http://dbpedia.org/ontology/wikiPageID>',
    '<http://dbpedia.org/ontology/wikiPageRevisionID>',
    '<http://dbpedia.org/ontology/wikiPageLength>',
    '<http://dbpedia.org/ontology/wikiPageWikiLink>',
    '<http://www.w3.org/2000/01/rdf-schema#label>',
    '<http://www.w3.org/2002/07/owl#sameAs>',
    '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>',
    '<http://schema.org/sameAs>',
    '<http://purl.org/dc/terms/subject>',
    '<http://xmlns.com/foaf/0.1/isPrimaryTopicOf>',
    '<http://xmlns.com/foaf/0.1/depiction>',
    '<http://www.w3.org/2000/01/rdf-schema#seeAlso>',
    '<http://www.w3.org/2000/01/rdf-schema#comment>',
    '<http://dbpedia.org/ontology/abstract>',
    '<http://dbpedia.org/ontology/thumbnail>',
    '<http://dbpedia.org/property/caption>',
    '<http://dbpedia.org/property/captionAlign>',
    '<http://dbpedia.org/property/image>',
    '<http://dbpedia.org/property/imageFlag>',
    '<http://www.w3.org/ns/prov#wasDerivedFrom>',
    '<http://dbpedia.org/ontology/wikiPageRedirects>',
    '<http://dbpedia.org/ontology/wikiPageDisambiguates>',
]

In [None]:
class QueryBuilder():
    """Pick the knowledge-graph relation that best matches a question.

    The builder turns a named pattern into a directed graph, queries a SPARQL
    endpoint for the predicates attached to an entity and ranks their labels
    against the question using word embeddings plus edit distance.

    Attributes:
        embeddings: dict mapping a word to its embedding vector.
        exclusions: comma-separated predicate IRIs excluded from queries.
    """

    def __init__(self, embeddings_path=None):
        """Load the embeddings and prepare the predicate exclusion list.

        Args:
            embeddings_path: optional path to the embeddings text file. When
                omitted, the default path of ``__load_embeddings`` is used.
        """
        if embeddings_path is None:
            self.embeddings = self.__load_embeddings()
        else:
            self.embeddings = self.__load_embeddings(embeddings_path)
        self.exclusions = self.__get_exclusions()

    def build(self, question, entity, pattern):
        """Return the relation label that best matches ``question``.

        Args:
            question: natural-language question text.
            entity: entity term as it should appear in the SPARQL query
                (for example a prefixed name).
            pattern: key of the pattern to instantiate.

        Returns:
            The pattern graph if all its nodes and edges already carry
            attributes; otherwise the label of the most relevant relation, or
            ``None`` when no relation was found.
        """
        cn = entity
        # get pattern graph and work on a copy of it
        p = self.__get_pattern(pattern)
        Q = p.copy()
        # get non-intermediate nodes
        NS = self.__get_non_intermediate_nodes(p)

        # nothing left to label
        if self.__is_labeled(Q):
            return Q

        # collect candidate relations in the directions the pattern needs
        frames = []
        if self.__check_if_outgoing(NS):
            frames.append(self.__get_relations(entity=cn, query_type='outgoing'))
        if self.__check_if_incoming(NS):
            frames.append(self.__get_relations(entity=cn, query_type='incoming'))

        # DataFrame.append no longer exists in pandas 2.x; concat once instead
        frames = [frame for frame in frames if not frame.empty]
        if frames:
            R = pd.concat(frames, ignore_index=True)
        else:
            R = pd.DataFrame(columns=['pred', 'label'])

        # TODO: the remaining steps are not implemented yet: assemble the
        # entity (if it occurs in the question) or a variable together with
        # the chosen relation into Q, move NS to the nodes adjacent to the
        # explored structure, and repeat until Q is fully labeled.
        return self.__get_most_relevant_relation(question, R)

    def __get_pattern(self, pattern):
        """Return the directed graph for the pattern key ``pattern``."""
        return nx.from_dict_of_lists(patterns[pattern],
                                     create_using=nx.DiGraph)

    def __get_non_intermediate_nodes(self, p):
        """Map each node with total degree below 2 to its in/out degree."""
        return {node: {'in_degree': p.in_degree(node), 'out_degree': p.out_degree(node)}
                for node in p.nodes if p.out_degree(node) + p.in_degree(node) < 2}

    def __is_labeled(self, Q):
        """Return True when every node and every edge of ``Q`` has attributes."""
        # check if nodes are labeled
        for node in Q.nodes:
            if not Q.nodes[node]:
                return False
        # check if edges are labeled
        for _, _, edge_data in Q.edges(data=True):
            if not edge_data:
                return False
        return True

    def __check_if_outgoing(self, NS):
        """Return True if any node in ``NS`` has an outgoing edge."""
        # any() also copes with an empty NS, where max() would raise
        return any(info['out_degree'] > 0 for info in NS.values())

    def __check_if_incoming(self, NS):
        """Return True if any node in ``NS`` has an incoming edge."""
        return any(info['in_degree'] > 0 for info in NS.values())

    def __get_relations(self, entity, query_type, endpoint = 'http://dbpedia.org/sparql'):
        """Query ``endpoint`` for the predicates attached to ``entity``.

        Args:
            entity: entity term inserted into the query.
            query_type: ``'outgoing'`` or ``'incoming'``.
            endpoint: SPARQL endpoint URL.

        Returns:
            DataFrame with columns ``pred`` and ``label``. Predicates without
            an English label get one derived from their IRI.
        """
        q = self.__get_query(entity, query_type)
        results = sparql.query(endpoint, q)

        # gather plain records and build the frame once
        records = []
        for row in results:
            (pred, label) = sparql.unpack_row(row)
            if not label:
                label = self.__parse_predicate(pred)
            records.append({'pred': pred, 'label': label})

        return pd.DataFrame(records, columns=['pred', 'label'])

    def __get_exclusions(self):
        """Return the excluded predicates joined for a ``NOT IN`` clause."""
        return ', '.join(exclusions)

    def __parse_predicate(self, pred):
        """Derive a readable label from the last segment of a predicate IRI.

        The segment after the last ``/`` or ``#`` is split on camel-case
        boundaries, e.g. ``.../birthPlace`` becomes ``'birth Place'``.
        """
        # splitting on both separators also handles IRIs without any '/'
        last = re.split(r'[/#]', str(pred))[-1]
        splitted = re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)', last)
        return ' '.join(splitted)

    def __get_query(self, entity, query_type):
        """Build the SPARQL query listing predicates around ``entity``.

        Args:
            entity: entity term inserted verbatim into the triple pattern.
            query_type: ``'outgoing'`` (entity as subject) or ``'incoming'``
                (entity as object).

        Returns:
            The query string, selecting ``?pred`` and its English label.

        Raises:
            ValueError: if ``query_type`` is neither supported value.
        """
        if query_type == 'outgoing':
            triple = entity + " ?pred ?obj ."
        elif query_type == 'incoming':
            # use the requested entity, not a fixed resource
            triple = "?subj ?pred " + entity + " ."
        else:
            raise ValueError('query_type has to be either \'incoming\' or \'outgoing\' for value:' + str(query_type))

        # the label stays optional in both directions, so predicates without
        # an English label are still returned
        return ("select distinct ?pred ?pred_label_stripped "
                "where { "
                + triple + " "
                "OPTIONAL { "
                "?pred rdfs:label ?pred_label . "
                "FILTER (lang(?pred_label) = 'en'). "
                "BIND (STR(?pred_label) AS ?pred_label_stripped). "
                "} . "
                "FILTER(?pred NOT IN (" + self.exclusions + ") ). "
                "}")

    def __load_embeddings(self, path='../../data/glove.twitter.27B.200d.txt'):
        """Read word vectors from a whitespace-separated text file.

        Each line holds a word followed by its vector components.

        Returns:
            dict mapping each word to a float32 numpy array.
        """
        embeddings_dict = {}
        print('Loading embeddings...')
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                values = line.split()
                # skip blank lines and lines without vector components
                if len(values) < 2:
                    continue
                embeddings_dict[values[0]] = np.asarray(values[1:], "float32")
        return embeddings_dict

    @staticmethod
    def __cosine_similarity(first, second):
        """Cosine similarity of two vectors; 0.0 if one is missing or zero."""
        if first is None or second is None:
            return 0.0
        denominator = norm(first) * norm(second)
        if denominator == 0:
            return 0.0
        return float(dot(first, second) / denominator)

    def __get_most_relevant_relation(self, question, R, lambda_param=0.4):
        """Return the relation label most relevant to ``question``.

        Every pair of label token and question token contributes
        ``lambda_param * cosine + (1 - lambda_param) / (levenshtein + 1)``.
        Tokens of both sides are compared in lower case.

        Args:
            question: question text.
            R: DataFrame with a ``label`` column.
            lambda_param: weight of the embedding similarity.

        Returns:
            The best-scoring label, or ``None`` if ``R`` has no labels.
        """
        unique_relations = R['label'].unique()
        if len(unique_relations) == 0:
            return None

        # tokenize question; '?' becomes its own token
        question_tokens = question.lower().replace('?', ' ?').split()

        relevances = []
        for relation in unique_relations:
            relevance = 0.0
            # labels are lowercased to match the lowercased question
            for rel_token in str(relation).lower().split():
                rel_vector = self.embeddings.get(rel_token)
                for question_token in question_tokens:
                    # out-of-vocabulary words contribute no cosine term
                    cos_sim = self.__cosine_similarity(
                        rel_vector, self.embeddings.get(question_token))
                    lev_distance = levenshtein_distance(question_token, rel_token)
                    relevance += (lambda_param * cos_sim
                                  + (1 - lambda_param) / (lev_distance + 1))
            relevances.append(relevance)

        return unique_relations[int(np.argmax(relevances))]
                    

In [None]:
# Constructing the builder reads the embeddings file from disk inside
# __init__; the instance is reused by the cells below.
query_builder = QueryBuilder()

In [None]:
# Example run: rank the relations around the entity against the question,
# using the single-edge pattern 'p1'.
s = query_builder.build(
    question='Who is the spouse of Barack Obama?',
    entity='dbr:Barack_Obama',
    pattern='p1',
)