In [34]:
class EmbeddingModelBuilder:
    """Load pre-trained word embeddings and expose them as ``self.Model``.

    Only the ``'word2vec'`` embedding type is supported; it is loaded through
    gensim's ``KeyedVectors.load_word2vec_format`` in binary mode.
    """

    def __init__(self,
                 embedding_type='word2vec',
                 filename="word_embeddings/word2vec/GoogleNews-vectors-negative300.bin",
                 limit=500000
                 ):
        """Load the embedding file into ``self.Model``.

        Args:
            embedding_type: Kind of embedding to load. Only ``'word2vec'``
                is supported.
            filename: Path of the binary word2vec file.
            limit: Forwarded as ``limit`` to ``load_word2vec_format``.

        Raises:
            ValueError: If ``embedding_type`` is not supported. Previously
                this case silently produced an object without a ``Model``
                attribute, which only failed later at the point of use.
        """
        # Validate before importing so a bad argument fails fast and does not
        # depend on gensim being installed.
        if embedding_type != 'word2vec':
            raise ValueError(
                "Unsupported embedding_type: {!r} (expected 'word2vec')".format(embedding_type)
            )

        # Local import: gensim is only needed when a model is actually loaded.
        from gensim.models import KeyedVectors

        self.Model = KeyedVectors.load_word2vec_format(filename,
                                                       binary=True, limit=limit)

In [37]:
# Build the embedding model with the builder's default arguments and keep
# the loaded vectors in `x`.
x = EmbeddingModelBuilder().Model

In [None]:
class InvertedIndexBuilder:
    """Build an in-memory term index over the tables of a database.

    After ``createInvertedIndex`` runs, two dictionaries are available:

    * ``wordHash[word][table_name][column_name]`` is the list of ``ctid``
      values of the rows whose ``column_name`` value contains ``word``.
    * ``attributeHash[table_name][column_name]`` is a pair
      ``(norm, num_distinct_words)``; ``norm`` is always the placeholder 0.

    The method relies on the module-level names ``sql``, ``string`` and
    ``stw_set`` (the stop-word collection).
    NOTE(review): ``sql`` is presumably ``psycopg2.sql`` -- confirm against
    the file's imports.
    """

    def __init__(self, dbms='psql'):
        """Select the table-listing query for ``dbms`` and reset the indexes.

        Args:
            dbms: Database flavour. Only ``'psql'`` is supported.

        Raises:
            ValueError: If ``dbms`` is not supported.
        """
        if dbms != 'psql':
            # Without this check the query attribute would simply be missing
            # and createInvertedIndex would fail later with AttributeError.
            raise ValueError("Unsupported dbms: {!r} (expected 'psql')".format(dbms))

        self._GET_TABLENAMES_SQL = "SELECT DISTINCT tablename FROM pg_tables WHERE schemaname!='pg_catalog' AND schemaname !='information_schema';"

        self.wordHash = {}
        self.attributeHash = {}

    def createInvertedIndex(self, embeddingModel, cur, showLog=True):
        """Scan every table once and rebuild ``wordHash`` / ``attributeHash``.

        The Term Index is built in a preprocessing step that scans only
        once all the relations over which the queries will be issued.

        Args:
            embeddingModel: Container supporting ``in``. Tables and columns
                whose names are not in it are skipped; a column named
                ``'id'`` is always skipped.
            cur: Database cursor providing ``execute``, ``fetchall`` and
                ``description``.
            showLog: When true (the default), progress messages are printed.
        """
        self.wordHash = {}
        self.attributeHash = {}

        # Get list of tablenames
        cur.execute(self._GET_TABLENAMES_SQL)
        for table in cur.fetchall():
            table_name = table[0]

            if table_name not in embeddingModel:
                if showLog:
                    print('TABLE ', table_name, 'SKIPPED')
                continue

            if showLog:
                print('INDEXING TABLE ', table_name)

            attribute_stats = self.attributeHash[table_name] = {}

            # Get all tuples for this tablename.
            # NOTE: sql.SQL is needed to specify this parameter as table name
            # (can't be passed as execute second parameter).
            cur.execute(
                sql.SQL("SELECT ctid, * FROM {};").format(sql.Identifier(table_name))
            )

            # Column names are the same for every row, so read them once.
            column_names = [description[0] for description in cur.description]

            # column_name -> set of distinct words seen in that column
            distinct_words = {}

            printSkippedColumns = True
            for row in cur.fetchall():
                ctid = row[0]

                # Position 0 is the ctid itself; real columns start at 1.
                for column in range(1, len(row)):
                    column_name = column_names[column]

                    if column_name not in embeddingModel or column_name == 'id':
                        # Skipped columns are reported for the first row only.
                        if printSkippedColumns and showLog:
                            print('\tCOLUMN ', column_name, ' SKIPPED')
                        continue

                    for token in str(row[column]).lower().split():
                        word = token.strip(string.punctuation)

                        # Ignoring STOPWORDS
                        if word in stw_set:
                            continue

                        # Missing levels are created on the fly, then the
                        # location of this occurrence is appended.
                        self.wordHash.setdefault(word, {}) \
                                     .setdefault(table_name, {}) \
                                     .setdefault(column_name, []) \
                                     .append(ctid)

                        distinct_words.setdefault(column_name, set()).add(word)
                printSkippedColumns = False

            # Count words: only the number of distinct words per column is kept.
            for column_name, words in distinct_words.items():
                attribute_stats[column_name] = (0, len(words))

        if showLog:
            print('INVERTED INDEX CREATED')

In [None]:
class Preprocessor:
    """Hold the embedding model and the index dictionaries for preprocessing."""

    # Class-level defaults, kept so the names still resolve on the class.
    # Every instance gets its own dictionaries in __init__.
    EmbeddingModel = None
    WordHash = {}
    AttributeHash = {}

    def __init__(self, dbms='psql'):
        """Select the table-listing query for ``dbms``.

        Args:
            dbms: Database flavour. Only ``'psql'`` is supported.

        Raises:
            ValueError: If ``dbms`` is not supported.
        """
        if dbms != 'psql':
            raise ValueError("Unsupported dbms: {!r} (expected 'psql')".format(dbms))

        self._GET_TABLENAMES_SQL = "SELECT DISTINCT tablename FROM pg_tables WHERE schemaname!='pg_catalog' AND schemaname !='information_schema';"

        # Per-instance containers, so instances never share mutable state.
        self.WordHash = {}
        self.AttributeHash = {}

    def load(self):
        """Load the default embedding model into ``self.EmbeddingModel``."""
        self.EmbeddingModel = EmbeddingModelBuilder().Model
              
        
        
            
    
    


In [26]:
class Tupleset:
    """A table together with the keyword predicates placed on its attributes.

    ``predicates`` maps an attribute name to a pair
    ``(schemaWords, valueWords)`` of sets. A tuple-set without any predicate
    is a "free" tuple-set.
    """

    table = None

    def __init__(self, table):
        self.table = table
        # Created per instance: a class-level dict would be shared by every
        # Tupleset, so a mapping added to one would show up in all of them.
        self.predicates = {}

    def addMapping(self, attribute, schemaWords, valueWords):
        """Merge ``schemaWords`` and ``valueWords`` into ``attribute``'s predicate."""
        schema_set, value_set = self.predicates.setdefault(attribute, (set(), set()))
        schema_set.update(schemaWords)
        value_set.update(valueWords)

    def isFreeTupleset(self):
        """Return True when no predicate has been added."""
        return not self.predicates

    def __repr__(self):
        return repr((self.table, self.predicates))
    
        
# Demo: create a tuple-set for table 'person' and attach predicates to two
# of its attributes (schema words first, value words second).
x = Tupleset('person')
x.addMapping('name',{'name'},{'paulo','rodrigo'})
x.addMapping('age',{'birth age'},{'23'})

# Bare expressions: their values are not stored anywhere. The recorded cell
# output that follows is the value of the last one, x.predicates.
x.isFreeTupleset()
x.predicates

{'age': ({'birth age'}, {'23'}), 'name': ({'name'}, {'paulo', 'rodrigo'})}

In [27]:
def TSFind(Q):
    """Build the tuple-sets for a keyword query.

    Input:  A keyword query Q=[k1, k2, . . . , km]
    Output: Set of non-free and non-empty tuple-sets Rq

    The tuple-set Rki contains the tuples of Ri that contain all
    terms of K and no other keywords from Q

    Side effect: the module-level global ``P`` is rebound to the mapping
    termset -> set of (table, attribute, ctid) computed for this query.
    """
    global P

    # Part 1: Find sets of tuples containing each keyword.
    # Keywords that are not in the module-level wordHash are ignored.
    # NOTE(review): element [1] of a wordHash entry is read as a mapping
    # table -> attribute -> ctids; wordHash is defined outside this block,
    # so confirm its layout.
    P = {}
    for keyword in Q:
        if keyword not in wordHash:
            continue

        P[frozenset([keyword])] = {
            (table, attribute, ctid)
            for table, attributes in wordHash.get(keyword)[1].items()
            for attribute, ctids in attributes.items()
            for ctid in ctids
        }

    # Part 2: Find sets of tuples containing larger termsets
    P = TSInterMartins(P)

    # Part 3: Build tuple-sets, one Tupleset object per located tuple.
    Rq = set()
    for termset, locations in P.items():
        for table, attribute, _ctid in locations:
            tuple_set = Tupleset(table)
            tuple_set.addMapping(attribute, frozenset(), termset)
            Rq.add(tuple_set)
    return Rq


def TSInter(P):
    """Split overlapping tuple-sets into tuple-sets of larger termsets.

    Input: A Set of non-empty tuple-sets for each keyword alone P
    Output: The Set P, but now including larger termsets (process Intersections)

    Termset is any non-empty subset K of the terms of a query Q

    ``P`` maps a termset (frozenset) to a set of tuples. The argument is not
    modified; a new mapping is returned. For every pair of termsets whose
    tuple-sets overlap, the shared tuples move to the union termset and are
    removed from both originals. The newly created termsets are then
    processed recursively among themselves before being merged into the
    result.
    """
    # Work on a private copy so the caller's mapping and sets stay untouched.
    Pprev = copy.deepcopy(P)
    Pcurr = {}

    # combinations() snapshots the keys up front; the loop only reassigns
    # existing keys of Pprev, it never adds new ones.
    for Ki, Kj in itertools.combinations(Pprev, 2):
        Tki = Pprev[Ki]
        Tkj = Pprev[Kj]

        Tx = Tki & Tkj
        if not Tx:
            continue

        Pcurr[Ki | Kj] = Tx
        Pprev[Ki] = Tki - Tx
        Pprev[Kj] = Tkj - Tx

    if Pcurr:
        # The recursive call already returns a fresh structure, so no
        # additional copy is needed here.
        Pcurr = TSInter(Pcurr)

    # Pprev = Pprev U Pcurr
    Pprev.update(Pcurr)
    return Pprev


def TSInterMartins(P):
    """Split overlapping tuple-sets in place until no two of them overlap.

    Input: A Set of non-empty tuple-sets for each keyword alone P
    Output: The Set P, but now including larger termsets (process Intersections)

    Termset is any non-empty subset K of the terms of a query Q

    ``P`` maps a termset (frozenset) to a set of tuples and is modified in
    place; the same object is returned. Whenever two tuple-sets share
    tuples, the shared tuples are stored under the union termset and removed
    from both. Passes over all pairs are repeated until a full pass changes
    nothing.
    """
    while True:
        changed = False

        # Snapshot the pairs first: P gains keys while the pass runs.
        pairs = list(itertools.combinations(P.keys(), 2))
        for first, second in pairs:
            first_tuples = P[first]
            second_tuples = P[second]

            shared = first_tuples & second_tuples
            if not shared:
                continue

            P[first | second] = shared
            P[first] = first_tuples - shared
            P[second] = second_tuples - shared
            changed = True

        if not changed:
            return P