In [None]:
import networkx as nx
import sparql
import pandas as pd
import re
import numpy as np
import string
from numpy import dot
from numpy.linalg import norm
from Levenshtein import distance as levenshtein_distance
from nltk.corpus import stopwords
from sent2vec.vectorizer import Vectorizer
from scipy import spatial

In [None]:
# Graph patterns available to QueryBuilder. Each pattern is an adjacency
# dictionary (node -> list of nodes it points to) that is turned into a
# directed graph with nx.from_dict_of_lists.
patterns = dict(
    p0={'A': []},                               # single node, no edges
    p1={'A': ['B']},                            # A -> B
    p2={'A': ['B'], 'B': ['C']},                # A -> B -> C
    p3={'A': ['B'], 'C': ['B']},                # A -> B <- C
    p4={'A': ['B', 'C']},                       # B <- A -> C
    p5={'A': ['B', 'C', 'D']},                  # A -> B, A -> C, A -> D
    p6={'A': ['B', 'C'], 'C': ['D']},           # A -> B, A -> C -> D
    p7={'A': ['B'], 'B': ['C'], 'C': ['D']},    # A -> B -> C -> D
    p8={'A': ['B'], 'B': ['C'], 'D': ['C']},    # A -> B -> C <- D
    p9={'A': ['B'], 'B': ['C'], 'D': ['B']},    # A -> B -> C, D -> B
    p10={'A': ['B'], 'B': ['C', 'D']},          # A -> B -> C, B -> D
    p11={'A': ['B'], 'C': ['B'], 'D': ['B']},   # A -> B, C -> B, D -> B
)


In [None]:
# Predicate URIs (in SPARQL angle-bracket form) that are never candidate
# relations. QueryBuilder joins them with ', ' and puts the result in the
# "FILTER(?pred NOT IN (...))" clause of its queries, so every item must
# stay a complete '<...>' term.
exclusions = [
    # Wikipedia page bookkeeping
    '<http://dbpedia.org/property/wikiPageUsesTemplate>',
    '<http://dbpedia.org/ontology/wikiPageExternalLink>',
    '<http://dbpedia.org/ontology/wikiPageID>',
    '<http://dbpedia.org/ontology/wikiPageRevisionID>',
    '<http://dbpedia.org/ontology/wikiPageLength>',
    '<http://dbpedia.org/ontology/wikiPageWikiLink>',
    # labels, typing, identity and descriptive text
    '<http://www.w3.org/2000/01/rdf-schema#label>',
    '<http://www.w3.org/2002/07/owl#sameAs>',
    '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>',
    '<http://schema.org/sameAs>',
    '<http://purl.org/dc/terms/subject>',
    '<http://xmlns.com/foaf/0.1/isPrimaryTopicOf>',
    '<http://xmlns.com/foaf/0.1/depiction>',
    '<http://www.w3.org/2000/01/rdf-schema#seeAlso>',
    '<http://www.w3.org/2000/01/rdf-schema#comment>',
    '<http://dbpedia.org/ontology/abstract>',
    # images and captions
    '<http://dbpedia.org/ontology/thumbnail>',
    '<http://dbpedia.org/property/caption>',
    '<http://dbpedia.org/property/captionAlign>',
    '<http://dbpedia.org/property/image>',
    '<http://dbpedia.org/property/imageFlag>',
    # provenance, redirects and leftovers
    '<http://www.w3.org/ns/prov#wasDerivedFrom>',
    '<http://dbpedia.org/ontology/wikiPageRedirects>',
    '<http://dbpedia.org/ontology/wikiPageDisambiguates>',
    '<http://dbpedia.org/property/1namedata>',
]

In [None]:
def load_embeddings(path='../../data/glove.twitter.27B.200d.txt'):
    """
    Load word embeddings stored in the GloVe text format.

    Every line is expected to hold a word followed by the components of its
    vector, separated by ASCII whitespace.

    :param path: path to the embeddings text file (UTF-8 encoded)

    :return: dictionary mapping each word to its float32 numpy vector
    """
    # str.split() with no argument also splits on non-ASCII whitespace
    # (e.g. U+0085). A word made of such a character would vanish, the first
    # number would be stored as the word and the vector would be one
    # component short, so only ASCII whitespace is treated as a separator.
    ascii_whitespace = ' \t\r\n\x0b\x0c'
    split_fields = re.compile(r'[ \t\r\n\x0b\x0c]+').split

    embeddings_dict = {}
    print('Loading embeddings...')
    with open(path, 'r', encoding="utf8") as f:
        for line in f:
            values = split_fields(line.strip(ascii_whitespace))
            # skip blank lines and lines that carry no vector
            if len(values) < 2:
                continue
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict

In [None]:
# Load the word embeddings once at notebook level (default GloVe path) so the
# same dictionary can be handed to QueryBuilder instead of being re-read.
embeddings = load_embeddings()

In [None]:
class QueryBuilder():
    """
    Build the query graph of a natural language question from a graph pattern.

    Candidate relations of the linked entity are fetched from a SPARQL
    endpoint and ranked against the question, either with BERT sentence
    embeddings or with word embeddings, in both cases combined with the
    Levenshtein distance between question tokens and relation tokens.
    """

    def __init__(self, embeddings = None, bert_similarity = True):
        """
        Set up the similarity back-end, the stopword list and the exclusions.

        :param embeddings: dictionary word -> vector, only used when bert_similarity
                           is False; when None or empty the embeddings are loaded
                           from the default path
        :param bert_similarity: True to rank relations with BERT sentence embeddings,
                                False to rank them with word embeddings
        """
        if bert_similarity:
            self.vectorizer = Vectorizer()
        else:
            if not embeddings:
                self.embeddings = self.__load_embeddings()
            else:
                self.embeddings = embeddings
        self.stops = stopwords.words('english')
        self.exclusions = self.__get_exclusions()
        self.bert_similarity = bert_similarity

    def build(self, question, entity, pattern):
        """
        Build query graph.

        The assembling step is still a TODO: once the most relevant relation
        has been selected the method cannot go further.

        :param question: natural language question
        :param entity: entity resource
        :param pattern: graph pattern of the question

        :return: query graph
        :raises ValueError: when no candidate relation can be ranked
        :raises NotImplementedError: when a relation has been selected, because
                                     assembling it in the graph is not written yet
        """
        # TODO: call DBPedia entity extractor/linker, remove entity param
        entities = [entity]
        # TODO: higher score linking
        cn = entities[0]
        # get pattern graph
        p = self.__get_pattern(pattern)
        # work on a copy, the pattern itself is left untouched
        Q = p.copy()
        # get non-intermediate nodes
        NS = self.__get_non_intermediate_nodes(p)

        while True:
            # stop as soon as every node and edge is labeled
            if self.__is_labeled(Q):
                return Q

            # collect the candidate relations for the directions NS requires
            candidates = []
            if self.__check_if_outgoing(NS):
                candidates.append(self.__get_relations(entity=cn, query_type='outgoing'))
            if self.__check_if_incoming(NS):
                candidates.append(self.__get_relations(entity=cn, query_type='incoming'))

            # DataFrame.append no longer exists in pandas 2; empty frames are
            # left out of the concatenation
            candidates = [frame for frame in candidates if not frame.empty]
            if candidates:
                R = pd.concat(candidates, ignore_index=True)
            else:
                R = pd.DataFrame(columns=['pred', 'label', 'direction'])

            # get r, most relevant relation to question
            if self.bert_similarity:
                r = self.__get_most_relevant_relation_bert(question, R)
            else:
                r = self.__get_most_relevant_relation(question, R)

            # check if cn URI is in q
            if cn not in entities:
                # TODO: assemble variable and relation r in Q
                return None

            # TODO: assemble entity and relation r in Q, then update
            #       NS (adjacent node to explored structure) and cn.
            # Failing loudly here: Q is never labeled yet, so going on would
            # loop forever.
            raise NotImplementedError(
                'assembling relation "{}" in the query graph is not implemented yet'.format(r['label']))

    def __get_pattern(self, pattern):
        """
        Get graph pattern for a pattern p.

        :param pattern: key of the pattern in the patterns dictionary

        :return: networkx directed graph of the pattern p
        """
        return nx.from_dict_of_lists(patterns[pattern],
                                     create_using=nx.DiGraph)

    def __get_non_intermediate_nodes(self, p):
        """
        Get non intermediate nodes for a graph pattern p, i.e. the nodes
        whose in-degree plus out-degree is lower than 2.

        :param p: graph pattern

        :return: dict node -> {'in_degree': ..., 'out_degree': ...}
        """
        return {node: {'in_degree': p.in_degree(node), 'out_degree': p.out_degree(node)}
                         for node in p.nodes if p.out_degree(node) + p.in_degree(node) < 2}

    def __is_labeled(self, Q):
        """
        Check if every node and every edge of the graph carries attributes.

        :param Q: graph

        :return: True if completely labeled, False otherwise
        """
        return (all(Q.nodes[node] for node in Q.nodes)
                and all(data for _, _, data in Q.edges(data=True)))

    def __check_if_outgoing(self, NS):
        """
        Check if nodes have outgoing relations.

        :param NS: dict of nodes (see __get_non_intermediate_nodes(p))

        :return: True if they have outgoing relations, False otherwise
                 (also when NS is empty)
        """
        return any(NS[node]['out_degree'] > 0 for node in NS)

    def __check_if_incoming(self, NS):
        """
        Check if nodes have incoming relations.

        :param NS: dict of nodes (see __get_non_intermediate_nodes(p))

        :return: True if they have incoming relations, False otherwise
                 (also when NS is empty)
        """
        return any(NS[node]['in_degree'] > 0 for node in NS)

    def __get_relations(self, entity, query_type, endpoint = 'http://dbpedia.org/sparql'):
        """
        Get outgoing or incoming relations for an entity.

        Labels are lowercased and there is one row per label; when several
        predicates share a label, a predicate of the DBpedia ontology namespace
        is preferred.

        :param entity: entity for which you want to find relations
        :param query_type: 'outgoing' for outgoing relations, 'incoming' for incoming relations
        :param endpoint: SPARQL endpoint

        :return: dataframe of outgoing/incoming relations (pred, label, direction)
        """
        q = self.__get_query(entity, query_type)
        results = sparql.query(endpoint, q)

        # label -> predicate URI; a dict keeps the order of first appearance
        pred_by_label = {}
        for row in results:
            (pred, label) = sparql.unpack_row(row)

            if not label:
                label = self.__parse_predicate(pred)
            else:
                label = label.replace('-', ' ')
            # normalize before looking for duplicates: stored labels are lowercase
            label = label.lower()

            # we keep dbo predicates if multiple with the same label
            if label not in pred_by_label or "http://dbpedia.org/ontology/" in pred:
                pred_by_label[label] = pred

        rows = [{'pred': pred, 'label': label, 'direction': query_type}
                for label, pred in pred_by_label.items()]
        return pd.DataFrame(rows, columns=['pred', 'label', 'direction'])

    def __get_exclusions(self):
        """
        Get predicates to exclude from the query.

        :return: concatenation of all exclusions
        """
        return ', '.join(exclusions)

    def __parse_predicate(self, pred):
        """
        Parse URI to extract a label from its last segment.

        The segment is what follows the last '/' or '#'; it is split on
        camelCase boundaries (digits and other symbols are dropped).

        :param pred: predicate URI

        :return: predicate label
        """
        last = re.split(r'[/#]', pred)[-1]
        splitted = re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)', last)
        return ' '.join(splitted)

    def __get_query(self, entity, query_type):
        """
        Get SPARQL query to get incoming or outgoing relations for an entity.

        :param entity: entity for which you want to find relations
        :param query_type: 'outgoing' for outgoing relations query, 'incoming' for incoming relations query

        :return: SPARQL query
        :raises ValueError: when query_type is neither 'incoming' nor 'outgoing'
        """
        # NOTE(review): in the outgoing query the language FILTER sits outside
        # the OPTIONAL block, so predicates without an English label are
        # presumably filtered out anyway - confirm against the endpoint
        # before relying on the label fallback.
        if query_type == 'outgoing':
            return "select distinct ?pred ?pred_label_stripped \
                    where {  \
                        "+ entity +" ?pred ?obj.  \
                        FILTER (lang(?pred_label) = 'en').  \
                        OPTIONAL {  \
                            ?pred rdfs:label ?pred_label . \
                            BIND (STR(?pred_label)  AS ?pred_label_stripped). \
                        } . \
                        FILTER(?pred NOT IN ("+ self.exclusions +") ). \
                    }"
        elif query_type == 'incoming':
            return "select distinct ?pred ?pred_label_stripped \
                    where { \
                        ?subj ?pred " + entity + ". \
                        ?pred rdfs:label ?pred_label. \
                        FILTER (lang(?pred_label) = 'en').  \
                        OPTIONAL {  \
                            ?pred rdfs:label ?pred_label . \
                            BIND (STR(?pred_label)  AS ?pred_label_stripped). \
                        } . \
                        FILTER(?pred NOT IN (" + self.exclusions + ") ). }"

        else:
            # str() keeps the intended ValueError for non-string values too
            raise ValueError('query_type has to be either \'incoming\' or \'outgoing\' for value:' + str(query_type))

    def __load_embeddings(self, path='../../data/glove.twitter.27B.200d.txt'):
        """
        Load Glove embeddings.

        :param path: path to glove embeddings (UTF-8 text, one word and its vector per line)

        :return: dictionary containing embeddings for each word
        """
        # str.split() with no argument also splits on non-ASCII whitespace
        # (e.g. U+0085): a word made of such a character would vanish and its
        # vector would be stored under a number, so only ASCII whitespace is
        # treated as a separator.
        ascii_whitespace = ' \t\r\n\x0b\x0c'
        split_fields = re.compile(r'[ \t\r\n\x0b\x0c]+').split

        embeddings_dict = {}
        print('Loading embeddings...')
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                values = split_fields(line.strip(ascii_whitespace))
                # skip blank lines and lines that carry no vector
                if len(values) < 2:
                    continue
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                embeddings_dict[word] = vector
        return embeddings_dict

    def __get_most_relevant_relation(self, question, R, lambda_param=0.5):
        """
        Get most relevant relation using given embedding and levenshtein_distance.

        :param question: question in natural language
        :param R: set of candidate relations (dataframe with a 'label' column)
        :param lambda_param: hyperparameter describing the importance of cosine similarity and levenshtein_distance

        :return: row of R holding the most relevant relation
        :raises ValueError: when R is empty
        """
        if len(R) == 0:
            raise ValueError('cannot rank an empty set of candidate relations')

        question = question.lower().replace('?', ' ?')
        # tokenize question, remove stopwords and punctuation tokens
        question_tokens = [token for token in question.split()
                               if token not in self.stops and token not in string.punctuation]

        relevances = []
        for label in R['label']:
            # tokenize label
            relation_tokens = label.split()

            relevance = 0
            for rel_token in relation_tokens:
                for question_token in question_tokens:
                    if rel_token in self.embeddings and question_token in self.embeddings:
                        # compute cosine similarity
                        cos_sim = 1 - spatial.distance.cosine(self.embeddings[rel_token],
                                                              self.embeddings[question_token])
                    else:
                        cos_sim = 0
                    # compute lev distance
                    lev_distance = levenshtein_distance(question_token, rel_token)
                    # sum to previous relevances of relation tokens and question tokens
                    relevance += lambda_param * cos_sim + (1 - lambda_param) * 1/(lev_distance+1)

            # a label without tokens scores 0 instead of dividing by zero
            relevances.append(relevance/len(relation_tokens) if relation_tokens else 0)

        return R.iloc[np.argmax(np.array(relevances))]

    def __get_most_relevant_relation_bert(self, question, R, lambda_param=0.4):
        """
        Get most relevant relation using bert and levenshtein_distance.

        :param question: question in natural language
        :param R: set of candidate relations (dataframe with a 'label' column)
        :param lambda_param: hyperparameter describing the importance of cosine similarity and levenshtein_distance

        :return: row of R holding the most relevant relation
        :raises ValueError: when R is empty
        """
        if len(R) == 0:
            raise ValueError('cannot rank an empty set of candidate relations')

        question_processed = question.lower().replace('?', ' ?')
        # tokenize the *processed* question: with the raw one the case and the
        # trailing '?' defeat the stopword filter and the Levenshtein matching
        question_tokens = [token for token in question_processed.split()
                               if token not in self.stops and token not in string.punctuation]

        # generate sentence embedding for question; the vectorizer returns one
        # vector per sentence and cosine() needs a 1-D input, hence the [0]
        self.vectorizer.bert([question])
        embedding_question = np.asarray(self.vectorizer.vectors)[0]
        # generate sentence embeddings for relations
        self.vectorizer.bert(R['label'].values)
        embeddings_relations = self.vectorizer.vectors

        relevances = []
        for label, rel_embedding in zip(R['label'], embeddings_relations):
            cos_sim = 1 - spatial.distance.cosine(embedding_question, rel_embedding)

            relation_tokens = label.split()

            lev = 0
            for rel_token in relation_tokens:
                for question_token in question_tokens:
                    # compute lev distance
                    lev_distance = levenshtein_distance(question_token, rel_token)
                    # sum to previous relevances of relation tokens and question tokens
                    lev += 1/(lev_distance+1)

            # a label without tokens contributes no Levenshtein score
            lev_score = lev / len(relation_tokens) if relation_tokens else 0
            relevances.append(lambda_param * cos_sim + (1 - lambda_param) * lev_score)

        return R.iloc[np.argmax(np.array(relevances))]
                    

In [None]:
# Reuse the embeddings loaded above and rank relations with word embeddings
# plus Levenshtein distance (bert_similarity=False skips the BERT vectorizer).
query_builder = QueryBuilder(embeddings = embeddings, bert_similarity = False)

In [None]:
# Try the builder on a sample question, using pattern 'p1' (a single A -> B edge)
# and the entity passed explicitly as a prefixed DBpedia resource.
a = query_builder.build(question='Who is the spouse of Barack Obama?', entity="dbr:Barack_Obama", pattern='p1')

In [None]:
# Display the value returned by build() for the sample question.
a

In [None]:
# Scratch table pairing relations with their scores, best scores first.
# NOTE(review): `b` is not defined in any visible cell, so this cell fails on
# a fresh kernel; it presumably dates from an experiment in which relations
# and scores were returned separately - confirm or delete the cell.
prova = pd.DataFrame()
prova['relation'] = a
prova['score'] = b
prova.sort_values('score',ascending=False).head(50)

In [None]:
# Scratch check of the substring test: evaluates to True because 'dbo'
# occurs inside the sample string. Not used by the rest of the notebook.
'dbo' in '/earrmaer/dbo:rektae'