In [None]:
# default_exp dataset_builder

# Dataset Builder
> Builds an optimal dataset with knowledge base relations, from a vanilla dataset.

In [1]:
# !pip install tensorflow
# !pip install tensorflow_hub
# !pip install spacy
# !pip install sklearn
# !python -m spacy download en_core_web_sm

In [2]:
#export 
import random
from rake_nltk import Rake
from kirby.database_proxy import WikiDatabase
import json
import importlib
import spacy
import tensorflow_hub as hub
from sklearn.neighbors import NearestNeighbors

In [3]:
# importlib.reload(kirby.database_proxy)

In [21]:
#export
class DatasetBuilder():
    def __init__(self):
        self.rake = Rake()
        self.db = WikiDatabase()
        self.nlp = spacy.load('en_core_web_sm')
        module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
        self.encoder = hub.load(module_url)
        pass
    
    def build(self, ds, dataset_type='random'):
        "Build a database based a given dataset"
        if dataset_type == 'random':
            ds.map(self.random, batched=False)
        elif dataset_type == 'description':
            pass
        elif dataset_type == 'relevant':
            pass
        
    def keyword(self, x):
        ranked_phrases = self.get_ranked_phrases(x)
        return ranked_phrases[0]
    
    def get_ranked_phrases(self, x):
        self.rake.extract_keywords_from_text(x)
        return self.rake.get_ranked_phrases()
    
    #staticmethod
    def add_to_accepted(self, a_sentences, sentence):
        if len(a_sentences) > 2:
            a_sentences.pop(0)
        a_sentences.append(sentence)
    

    def get_entities_in_text(self, x, random_entities=False):
        accepted_sentence = []
        accepted_entities = []
        print(self.nlp)
        doc = self.nlp(x)
        for entity in doc.ents: 
            if entity.label_ == 'CARDINAL': 
                continue
            print(entity.text, entity.label_)
            result = self.db.get_entity_by_label(entity.text)
            if len(result) == 0:
                continue
            elif len(result) > 1:
                result = self.db.get_entities_by_label_extensive(entity.text)
                if len(accepted_sentence) == 0 or random_entities:
                    q_a_index = random.randint(0, len(result) - 1)
                else:
                    encoded_sentences = self.encoder(accepted_sentence)  # array of sentence vectors

                    proposed_sentences = []
                    for entity_w in result:
                        # if entity_w[2] != '':
                        proposed_sentences.append(entity_w[2])
                    encoded_proposed = self.encoder(proposed_sentences)
                    neigh = NearestNeighbors(n_neighbors=1)
                    neigh.fit(encoded_proposed)
                    closest = neigh.kneighbors(encoded_sentences)
                    q_a_index = closest[1][0][0]
                self.add_to_accepted(accepted_sentence, result[q_a_index][2])
                accepted_entities.append(result[q_a_index])

            else:
                # print('Accepted:', result[0])
                self.add_to_accepted(accepted_sentence, result[0][2])
                accepted_entities.append(result[0]) 
        return accepted_entities

    def entity(self, ranked_phrases):
        "Queries the knowledge base to find the entity and it's relations"
        for phrase in ranked_phrases:
            entity = self.kba.get_entity(phrase)
            if entity is not None:
                return entity
        return entity   
    def get_entity_properties_strings(self, entity_id):
        print(self.db.get_entity_properties(entity_id))
        entity_properties_dict = {}
        for entity_property in self.db.get_entity_properties(entity_id): 
            property_name, related_entity_label = self.db.get_property_string(entity_property[0], entity_property[1])
            entity_properties_dict[property_name] = related_entity_label
        return entity_properties_dict

# Testing

In [22]:
# creation
ds_builder = DatasetBuilder()
assert isinstance(ds_builder, DatasetBuilder)

<sqlite3.Connection object at 0x7ff0fc381c60>








In [12]:
# Test ranked phrases
x = "Stephen Curry is my favorite basketball player."
ds_builder.rake.extract_keywords_from_text(x)
ranked_phrases = ds_builder.rake.get_ranked_phrases()
print(ranked_phrases)

['favorite basketball player', 'stephen curry']


In [23]:
print(ds_builder.db.conn)
print(ds_builder.db.get_entity_by_label('Cristiano Ronaldo'))

<sqlite3.Connection object at 0x7ff0fc381c60>
[['Q11571', 'Cristiano Ronaldo', 'Portuguese association football player']]


In [24]:
# Get random association
x = "Microsoft has bought Bethesda"
random_association = ds_builder.get_entities_in_text(x)
print(random_association)

<spacy.lang.en.English object at 0x7ff129cf2ac0>
Microsoft ORG
[['Q11272', 'Microsoft Excel', 'spreadsheet editor, part of Microsoft Office'], ['Q80911', 'Microsoft Outlook', 'personal information manager'], ['Q182496', 'Microsoft Bing', 'Web search engine from Microsoft'], ['Q518877', 'Microsoft BASICA', 'floppy-based BASIC interpreter, used mainly in the 80s, and developed for PC-DOS'], ['Q594763', 'Microsoft Expression Blend', 'Microsoft Software'], ['Q686425', 'Microsoft Plus!', 'software'], ['Q724878', 'Microsoft Office 2013', 'version of Microsoft Office'], ['Q725967', 'Microsoft Azure', 'cloud computing service created by Microsoft'], ['Q843084', 'Microsoft Document Imaging Format', 'file format'], ['Q848985', 'Microsoft Developer Network', ''], ['Q1081057', 'Microsoft Office SharePoint Server', ''], ['Q1574805', 'Microsoft Kin', 'mobile phone line from Microsoft'], ['Q1581373', 'Microsoft Office mobile apps', 'productivity mobile apps'], ['Q1735558', 'Microsoft Combat Flight Si



[['Q3124854', 'Bethesda system', 'classification system for cervical neoplasia, named after the town in Maryland'], ['Q4897965', 'Bethesda', 'unincorporated community in Davidson County, North Carolina'], ['Q4897966', 'Bethesda, Ontario', 'Wikimedia disambiguation page'], ['Q4897974', 'Bethesda-Chevy Chase High School', 'public school in Montgomery County, Maryland, United States, named for two of the towns it serves'], ['Q4897976', 'Bethesda Big Train', 'collegiate summer baseball team based in Bethesda, Maryland, United States'], ['Q4897980', 'Bethesda Hospital', 'Wikimedia disambiguation page'], ['Q4897993', 'Bethesda Presbyterian Church', 'church building in South Carolina, United States of America'], ['Q4897997', 'Bethesda Presbyterian Church', ''], ['Q13126545', 'Bethesda Chapel', 'Calvinistic Methodist chapel in Mold'], ['Q16951378', 'Bethesda Presbyterian Church', ''], ['Q22260503', 'Bethesda Brown', ''], ['Q26540268', 'Bethesda Methodist Church', 'Calderdale, West Yorkshire, H