In [None]:
# default_exp dataset_builder

# Dataset Builder
> Builds an optimal dataset with knowledge base relations, from a vanilla dataset.

In [1]:
# !pip install tensorflow
# !pip install tensorflow_hub
# !pip install spacy
# !pip install sklearn
# !python -m spacy download en_core_web_sm

In [1]:
#export 
import random
from rake_nltk import Rake
from kirby.database_proxy import WikiDatabase
import json
import importlib
import spacy
import tensorflow_hub as hub
from sklearn.neighbors import NearestNeighbors

In [3]:
# importlib.reload(kirby.database_proxy)

In [2]:
#export
class DatasetBuilder():
    def __init__(self):
        self.rake = Rake()
        self.db = WikiDatabase()
        self.nlp = spacy.load('en_core_web_sm')
        module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
        self.encoder = hub.load(module_url)
        pass
    
    def build(self, ds, dataset_type='random'):
        "Build a database based a given dataset"
        if dataset_type == 'random':
            ds.map(self.random, batched=False)
        elif dataset_type == 'description':
            pass
        elif dataset_type == 'relevant':
            pass
        
    def keyword(self, x):
        ranked_phrases = self.get_ranked_phrases(x)
        return ranked_phrases[0]
    
    def get_ranked_phrases(self, x):
        self.rake.extract_keywords_from_text(x)
        return self.rake.get_ranked_phrases()
    
    #staticmethod
    def add_to_accepted(self, a_sentences, sentence):
        if len(a_sentences) > 2:
            a_sentences.pop(0)
        a_sentences.append(sentence)
    

    def get_entities_in_text(self, x, random_entities=False):
        accepted_sentence = []
        accepted_entities = []
        print(self.nlp)
        doc = self.nlp(x)
        for entity in doc.ents: 
            if entity.label_ == 'CARDINAL': 
                continue
            print(entity.text, entity.label_)
            result = self.db.get_entity_by_label(entity.text)
            if len(result) == 0:
                continue
            elif len(result) > 1:
                result = self.db.get_entities_by_label_extensive(entity.text)
                if len(accepted_sentence) == 0 or random_entities:
                    q_a_index = random.randint(0, len(result) - 1)
                else:
                    encoded_sentences = self.encoder(accepted_sentence)  # array of sentence vectors

                    proposed_sentences = []
                    for entity_w in result:
                        proposed_sentences.append(entity_w[2])
                    encoded_proposed = self.encoder(proposed_sentences)
                    neigh = NearestNeighbors(n_neighbors=1)
                    neigh.fit(encoded_proposed)
                    closest = neigh.kneighbors(encoded_sentences)
                    q_a_index = closest[1][0][0]
                self.add_to_accepted(accepted_sentence, result[q_a_index][2])
                accepted_entities.append(result[q_a_index])

            else:
                # print('Accepted:', result[0])
                self.add_to_accepted(accepted_sentence, result[0][2])
                accepted_entities.append(result[0]) 
        return accepted_entities

    def entity(self, ranked_phrases):
        "Queries the knowledge base to find the entity and it's relations"
        for phrase in ranked_phrases:
            entity = self.kba.get_entity(phrase)
            if entity is not None:
                return entity
        return entity   
    def get_entity_properties_strings(self, entity_id):
        entity_properties_dict = {}
        for entity_property in self.db.get_entity_properties(entity_id): 
            property_name, related_entity_label = self.db.get_property_string(entity_property[0], entity_property[1])
            entity_properties_dict[property_name] = related_entity_label
        return entity_properties_dict

# Testing

In [3]:
# creation
ds_builder = DatasetBuilder()
assert isinstance(ds_builder, DatasetBuilder)

<sqlite3.Connection object at 0x7f180c09e8a0>


In [4]:
# Test ranked phrases
x = "Stephen Curry is my favorite basketball player."
ds_builder.rake.extract_keywords_from_text(x)
ranked_phrases = ds_builder.rake.get_ranked_phrases()
print(ranked_phrases)

['favorite basketball player', 'stephen curry']


In [5]:
print(ds_builder.db.conn)
print(ds_builder.db.get_entity_by_label('Cristiano Ronaldo'))

<sqlite3.Connection object at 0x7f180c09e8a0>
[['Q11571', 'Cristiano Ronaldo', 'Portuguese association football player']]


In [6]:
# Get random association
x = "Microsoft has bought Bethesda"
random_association = ds_builder.get_entities_in_text(x)
print(random_association)

<spacy.lang.en.English object at 0x7f18bdfc8eb0>
Microsoft ORG
Bethesda ORG
[['Q62079863', 'Microsoft Docs', ''], ['Q4897997', 'Bethesda Presbyterian Church', '']]


In [8]:
ds_builder.get_entity_properties_strings('Q5284')

[['P25', 'Q454928'], ['P19', 'Q5083'], ['P26', 'Q463877'], ['P106', 'Q131524'], ['P27', 'Q30'], ['P108', 'Q655286'], ['P69', 'Q7879362'], ['P910', 'Q7112419'], ['P31', 'Q5'], ['P551', 'Q1506847'], ['P166', 'Q12201445'], ['P1343', 'Q17311605'], ['P735', 'Q12344159'], ['P39', 'Q484876'], ['P734', 'Q16870134'], ['P40', 'Q23011254'], ['P172', 'Q7435494'], ['P463', 'Q463303'], ['P3438', 'Q575937'], ['P1412', 'Q1860'], ['P21', 'Q6581097'], ['P1830', 'Q2283'], ['P1441', 'Q12125162'], ['P6886', 'Q1860'], ['P641', 'Q188966'], ['P552', 'Q789447'], ['P22', 'Q684014'], ['P8017', 'L19333-F1'], ['P6553', 'L485'], ['P3373', 'Q92466067'], ['P1344', 'Q16972891']]
Mary Maxwell Gates mother
Seattle place of birth
Melinda Gates spouse
entrepreneur occupation
United States of America country of citizenship
Bill & Melinda Gates Foundation employer
Lakeside School educated at
Category:Bill Gates topic's main category
human instance of
Medina residence
Knight Commander of the Order of the British Empire award

TypeError: cannot unpack non-iterable NoneType object