# MatCNGenPy

This is a python implementation of the algorithms described in 
**Efficient Match-Based Candidate Network Generation for Keyword 
Queries over Relational Databases** paper.


## Installation
- Install virtalenv
- Run ```source bin/activate``` to enter in the virtual enviroment
- Run ```pip install -r requirements.txt```
- Run ```python ModCNGen.py

In [1]:
import psycopg2
from psycopg2 import sql
from pprint import pprint as pp
from collections import defaultdict
import string
import itertools
import copy
from math import log1p
from queue import deque
import re
import glob

import nltk 
#nltk.download('wordnet')
#nltk.download('omw')
#nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

stw_set = set() 

In [2]:
# Connect to an existing database
conn = psycopg2.connect("dbname=imdb user=postgres")

# Open a cursor to perform database operations
cur = conn.cursor()

def createInvertedList():
    #Output: wordHash (Term Index) with this structure below
    #map['word'] = [ 'table': ( {column} , ['ctid'] ) ]

    '''
    The Term Index is built in a preprocessing step that scans only
    once all the relations over which the queries will be issued.
    '''
    
    wordHash = {}
    attributeHash = {}
    
    # Get list of tablenames
    cur.execute("SELECT DISTINCT tablename FROM pg_tables WHERE schemaname!='pg_catalog' AND schemaname !='information_schema';")
    for table in cur.fetchall():
        table_name = table[0]
        print('INDEXING TABLE ',table_name)
        
        attributeHash[table_name] = {}
        
        #Get all tuples for this tablename
        cur.execute(
            sql.SQL("SELECT ctid, * FROM {};").format(sql.Identifier(table_name))
            #NOTE: sql.SQL is needed to specify this parameter as table name (can't be passed as execute second parameter)
        )

        for row in cur.fetchall():
            for column in range(1,len(row)):
                column_name = cur.description[column][0]   
                ctid = row[0]

                for word in [word.strip(string.punctuation) for word in str(row[column]).lower().split()]:
                    
                    #Ignoring STOPWORDS
                    if word in stw_set:
                        continue

                    #If word entry doesn't exists, it will be inicialized (setdefault method),
                    #Append the location for this word
                    wordHash.setdefault(word, {})                    
                    wordHash[word].setdefault( table_name , {} )
                    wordHash[word][table_name].setdefault( column_name , [] ).append(ctid)
                    
                    attributeHash[table_name].setdefault(column_name,(0,set()))
                    attributeHash[table_name][column_name][1].add(word)
        
        #Count words
        
        for (column_name,(norm,wordSet)) in attributeHash[table_name].items():
            num_distinct_words = len(wordSet)
            wordSet.clear()
            attributeHash[table_name][column_name] = (norm,num_distinct_words)
        

    print ('INVERTED INDEX CREATED')
    return (wordHash,attributeHash)

(wordHash,attributeHash) = createInvertedList()

INDEXING TABLE  casting
INDEXING TABLE  role
INDEXING TABLE  movie
INDEXING TABLE  char
INDEXING TABLE  person
INVERTED INDEX CREATED


In [None]:
pp(wordHash['denzel'])

In [3]:
def processIAF(wordHash,attributeHash):
    
    total_attributes = sum([len(attribute) for attribute in attributeHash.values()])
    
    for (term, values) in wordHash.items():
        
        attributes_with_this_term = sum([len(attribute) for attribute in wordHash[term].values()])
        
        IAF = log1p(total_attributes/attributes_with_this_term)
                
        wordHash[term] = (IAF,values)
    print('IAF PROCESSED')
processIAF(wordHash,attributeHash)

IAF PROCESSED


In [None]:
wordHash['denzel']

In [None]:
attributeHash['casting']

In [4]:
# Connect to an existing database
conn = psycopg2.connect("dbname=imdb user=postgres")

# Open a cursor to perform database operations
cur = conn.cursor()

def processNormsOfAttributes(wordHash,attributeHash):
  
    # Get list of tablenames
    cur.execute("SELECT DISTINCT tablename FROM pg_tables WHERE schemaname!='pg_catalog' AND schemaname !='information_schema';")
    for table in cur.fetchall():
        table_name = table[0]
        print('PROCESSING TABLE ',table_name)
        
        #Get all tuples for this tablename
        cur.execute(
            sql.SQL("SELECT ctid, * FROM {};").format(sql.Identifier(table_name))
            #NOTE: sql.SQL is needed to specify this parameter as table name (can't be passed as execute second parameter)
        )

        for row in cur.fetchall():
            for column in range(1,len(row)):
                column_name = cur.description[column][0]   
                ctid = row[0]

                for word in [word.strip(string.punctuation) for word in str(row[column]).lower().split()]:
                    
                    #Ignoring STOPWORDS
                    if word in stw_set:
                        continue
                    
                    (prevNorm,num_distinct_words)=attributeHash[table_name][column_name]
                    
                    IAF = wordHash[word][0]
                    
                    Norm = prevNorm + IAF
                    
                    attributeHash[table_name][column_name]=(Norm,num_distinct_words)
                    

    print ('NORMS OF ATTRIBUTES PROCESSED')

processNormsOfAttributes(wordHash,attributeHash)

PROCESSING TABLE  casting
PROCESSING TABLE  role
PROCESSING TABLE  movie
PROCESSING TABLE  char
PROCESSING TABLE  person
NORMS OF ATTRIBUTES PROCESSED


In [None]:
attributeHash['casting']

In [None]:
wordHash.get('denzel')

In [None]:
wordHash.get('denzel')[1]

In [5]:
def TSFind(Q):
    #Input:  A keyword query Q=[k1, k2, . . . , km]
    #Output: Set of non-free and non-empty tuple-sets Rq

    '''
    The tuple-set Rki contains the tuples of Ri that contain all
    terms of K and no other keywords from Q
    '''
    
    #Part 1: Find sets of tuples containing each keyword
    global P
    P = {}
    for keyword in Q:
        tupleset = set()
        for (table,attributes) in wordHash.get(keyword)[1].items():
            for (attribute,ctids) in attributes.items():
                for ctid in ctids:
                    tupleset.add( (table,attribute,ctid) )
        P[frozenset([keyword])] = tupleset
    
    #Part 2: Find sets of tuples containing larger termsets
    P = TSInter(P)

    #Part 3:Build tuple-sets
    Rq = set()
    for keyword , tuples in P.items():
        for (table,attribute,ctid) in tuples:
            Rq.add( (table,attribute,keyword) )
    print ('TUPLE SETS CREATED')
    return Rq


def TSInter(P):
    #Input: A Set of non-empty tuple-sets for each keyword alone P 
    #Output: The Set P, but now including larger termsets (process Intersections)

    '''
    Termset is any non-empty subset K of the terms of a query Q        
    '''
    
    Pprev = {}
    Pprev=copy.deepcopy(P)
    Pcurr = {}

    combinations = [x for x in itertools.combinations(Pprev.keys(),2)]
    for ( Ki , Kj ) in combinations[0:4]:
        Tki = Pprev[Ki]
        Tkj = Pprev[Kj]
        
        X = Ki | Kj
        Tx = Tki & Tkj        
        
        if len(Tx) > 0:            
            Pcurr[X]  = Tx            
            Pprev[Ki] = Tki - Tx         
            Pprev[Kj] = Tkj - Tx
            
    if Pcurr != {}:
        Pcurr = copy.deepcopy(TSInter(Pcurr))
        
    #Pprev = Pprev U Pcurr
    Pprev.update(Pcurr)     
    return Pprev       

Q = ['denzel','washington','gangster']
Rq = TSFind(Q)
pp(Rq)

TUPLE SETS CREATED
{('casting', 'note', frozenset({'denzel'})),
 ('casting', 'note', frozenset({'gangster'})),
 ('casting', 'note', frozenset({'washington'})),
 ('casting', 'note', frozenset({'denzel', 'washington'})),
 ('char', 'name', frozenset({'washington'})),
 ('char', 'name', frozenset({'gangster'})),
 ('char', 'name', frozenset({'denzel'})),
 ('movie', 'title', frozenset({'washington'})),
 ('movie', 'title', frozenset({'gangster'})),
 ('person', 'name', frozenset({'washington'})),
 ('person', 'name', frozenset({'denzel'})),
 ('person', 'name', frozenset({'denzel', 'washington'}))}


In [6]:
#Rq[frozenset({'denzel', 'washington'})]
#Mq = QMGen(Q,Rq)
def QMGen(Q,Rq):
    #Input:  A keyword query Q, The set of non-empty non-free tuple-sets Rq
    #Output: The set Mq of query matches for Q
    
    '''
    Query match is a set of tuple-sets that, if properly joined,
    can produce networks of tuples that fulfill the query. They
    can be thought as the leaves of a Candidate Network.
    
    '''
    
    Mq = []
    for i in range(1,len(Q)+1):
        for subset in itertools.combinations(Rq,i):
            if(MinimalCover(subset,Q)):
                Mq.append(set(subset))
    print (len(Mq),'QUERY MATCHES CREATED')
    return Mq


def MinimalCover(MC, Q):
    #Input:  A subset MC (Match Candidate) to be checked as total and minimal cover
    #Output: If the match candidate is a TOTAL and MINIMAL cover

    '''
    Total:   every keyword is contained in at least one tuple-set of the match
    
    Minimal: we can not remove any tuple-set from the match and still have a
             total cover.    
    '''
    Subset = [termset for table,attribute,termset in MC]
    u = set().union(*Subset)    
    
    isTotal = (u == set(Q))
    for element in Subset:
        
        new_u = list(Subset)
        new_u.remove(element)
        
        new_u = set().union(*new_u)
        
        if new_u == set(Q):
            return False
    
    return isTotal

Mq = QMGen(Q,Rq)

for M in Mq:
    print(M,'\n\n')

42 QUERY MATCHES CREATED
{('casting', 'note', frozenset({'denzel', 'washington'})), ('char', 'name', frozenset({'gangster'}))} 


{('char', 'name', frozenset({'gangster'})), ('person', 'name', frozenset({'denzel', 'washington'}))} 


{('movie', 'title', frozenset({'gangster'})), ('casting', 'note', frozenset({'denzel', 'washington'}))} 


{('movie', 'title', frozenset({'gangster'})), ('person', 'name', frozenset({'denzel', 'washington'}))} 


{('casting', 'note', frozenset({'gangster'})), ('casting', 'note', frozenset({'denzel', 'washington'}))} 


{('casting', 'note', frozenset({'gangster'})), ('person', 'name', frozenset({'denzel', 'washington'}))} 


{('char', 'name', frozenset({'washington'})), ('casting', 'note', frozenset({'denzel'})), ('char', 'name', frozenset({'gangster'}))} 


{('char', 'name', frozenset({'washington'})), ('casting', 'note', frozenset({'denzel'})), ('movie', 'title', frozenset({'gangster'}))} 


{('char', 'name', frozenset({'washington'})), ('casting', 'note'

In [7]:
def getSchemaGraph():
    #Output: A Schema Graph G  with the structure below:
    # G['node'] = edges
    # G['table'] = { 'foreign_table' : (direction, column, foreign_column) }
    
    
    # Connect to an existing database
    conn = psycopg2.connect("dbname=imdb user=postgres")

    # Open a cursor to perform database operations
    cur = conn.cursor()
    
    G = {} 
    cur.execute("SELECT tablename FROM pg_tables WHERE schemaname!='pg_catalog' AND schemaname !='information_schema';")
    for table in cur.fetchall():
        G.setdefault(table[0],{})
    
    sql = "SELECT DISTINCT \
                tc.table_name, kcu.column_name, \
                ccu.table_name AS foreign_table_name, ccu.column_name AS foreign_column_name \
            FROM information_schema.table_constraints AS tc  \
            JOIN information_schema.key_column_usage AS kcu \
                ON tc.constraint_name = kcu.constraint_name \
            JOIN information_schema.constraint_column_usage AS ccu \
                ON ccu.constraint_name = tc.constraint_name \
            WHERE constraint_type = 'FOREIGN KEY'"
    cur.execute(sql)
    relations = cur.fetchall()
    
    for (table,column,foreign_table,foreign_column) in relations:
        G[table][foreign_table] = (1,column, foreign_column)
        G[foreign_table][table] = (-1,foreign_column,column)
    print ('SCHEMA CREATED')
    return G

def MatchGraph(Rq, G, M):
    #Input:  The set of non-empty non-free tuple-sets Rq,
    #        The Schema Graph G,
    #        A Query Match M
    #Output: A Schema Graph Gts  with the structure below:
    # G['node'] = edges
    # G['table'] = { 'foreign_table' : (direction, column, foreign_column) }

    '''
    A Match Subgraph Gts[M] is a subgraph of G that contains:
        The set of free tuple-sets of G
        The query match M
    '''
    
    Gts = copy.deepcopy(G)
    
    tables = set()
    #Insert non-free nodes
    for (table ,attribute, keywords) in M:
        Gts[(table,attribute,keywords)]=copy.deepcopy(Gts[table])
        for foreign_table , (direction,column,foreign_column) in Gts[(table,attribute,keywords)].items():
            Gts[foreign_table][(table,attribute,keywords)] = (direction*(-1),foreign_column,column)

    return Gts 

G = getSchemaGraph()
pp(G)

print ('\nEXAMPLE OF MATCH GRAPH')
Gts = MatchGraph(Rq, G, Mq[0])
pp(Gts)

SCHEMA CREATED
{'casting': {'char': (1, 'person_role_id', 'id'),
             'movie': (1, 'movie_id', 'id'),
             'person': (1, 'person_id', 'id'),
             'role': (1, 'role_id', 'id')},
 'char': {'casting': (-1, 'id', 'person_role_id')},
 'movie': {'casting': (-1, 'id', 'movie_id')},
 'person': {'casting': (-1, 'id', 'person_id')},
 'role': {'casting': (-1, 'id', 'role_id')}}

EXAMPLE OF MATCH GRAPH
{'casting': {'char': (1, 'person_role_id', 'id'),
             'movie': (1, 'movie_id', 'id'),
             'person': (1, 'person_id', 'id'),
             'role': (1, 'role_id', 'id'),
             ('char', 'name', frozenset({'gangster'})): (1,
                                                         'person_role_id',
                                                         'id')},
 'char': {'casting': (-1, 'id', 'person_role_id'),
          ('casting', 'note', frozenset({'denzel', 'washington'})): (-1,
                                                                     'id'

In [8]:
def containsMatch(Ji,M):
    for relation in M:
        if relation not in Ji:
            return False
    return True

def isJNTSound(Gts,Ji):
    if len(Ji)<3:
        return True
    for i in range(len(Ji)-2):
        if (Ji[i],)[0] == (Ji[i+2],)[0]:
            edge_info = Gts[Ji[i]][Ji[i+1]]
            if(edge_info[0] == -1):
                return False
    return True

#CN = SingleCN(Mq[0],Gts,5,Q)
def SingleCN(M,Gts,Tmax,Q):
    from queue import deque
    F = deque()

    first_element = list(M)[0]
    J = [first_element]
    
    if len(M)==1:
        return J
    
    F.append(J)
    while F:
        J = F.pop()
        u = J[-1]
        for (adjacent,edge_info) in Gts[u].items():
            if (type(adjacent) is str) or (adjacent not in J):
                Ji = J + [adjacent]
                if (Ji not in F) and (len(Ji)<Tmax) and (isJNTSound(Gts,Ji)):
                    if(containsMatch(Ji,M)):
                        return Ji
                    else:
                        F.append(Ji)

Cns = []                        
for M in Mq: 
    Gts =  MatchGraph(Rq,G,M)
    Cn = SingleCN(M,Gts,10,Q)
    print('\n\n--------------------------------------------------\nGts\n')
    pp(Gts)
    print('\nM\n')
    pp(M)
    print('\nCN\n')
    pp(Cn)
    if(Cn is not None):
        Cns.append( (Cn,Gts,M) )
    
#Cn=[('movie_info', frozenset({'gangster'})), 'title', ('cast_info', frozenset({'washington', 'denzel'}))] 



--------------------------------------------------
Gts

{'casting': {'char': (1, 'person_role_id', 'id'),
             'movie': (1, 'movie_id', 'id'),
             'person': (1, 'person_id', 'id'),
             'role': (1, 'role_id', 'id'),
             ('char', 'name', frozenset({'gangster'})): (1,
                                                         'person_role_id',
                                                         'id')},
 'char': {'casting': (-1, 'id', 'person_role_id'),
          ('casting', 'note', frozenset({'denzel', 'washington'})): (-1,
                                                                     'id',
                                                                     'person_role_id')},
 'movie': {'casting': (-1, 'id', 'movie_id'),
           ('casting', 'note', frozenset({'denzel', 'washington'})): (-1,
                                                                      'id',
                                                                      'mo

 'role': {'casting': (-1, 'id', 'role_id'),
          ('casting', 'note', frozenset({'gangster'})): (-1, 'id', 'role_id')},
 ('casting', 'note', frozenset({'gangster'})): {'char': (1,
                                                         'person_role_id',
                                                         'id'),
                                                'movie': (1, 'movie_id', 'id'),
                                                'person': (1,
                                                           'person_id',
                                                           'id'),
                                                'role': (1, 'role_id', 'id'),
                                                ('char', 'name', frozenset({'washington'})): (1,
                                                                                              'person_role_id',
                                                                                              'id'),
         

             'movie': (1, 'movie_id', 'id'),
             'person': (1, 'person_id', 'id'),
             'role': (1, 'role_id', 'id'),
             ('movie', 'title', frozenset({'gangster'})): (1, 'movie_id', 'id'),
             ('person', 'name', frozenset({'denzel'})): (1, 'person_id', 'id'),
             ('person', 'name', frozenset({'washington'})): (1,
                                                             'person_id',
                                                             'id')},
 'char': {'casting': (-1, 'id', 'person_role_id')},
 'movie': {'casting': (-1, 'id', 'movie_id')},
 'person': {'casting': (-1, 'id', 'person_id')},
 'role': {'casting': (-1, 'id', 'role_id')},
 ('movie', 'title', frozenset({'gangster'})): {'casting': (-1,
                                                           'id',
                                                           'movie_id')},
 ('person', 'name', frozenset({'denzel'})): {'casting': (-1,
                                          

In [None]:
pp(attributeHash)

In [None]:
pp(Cns[0][1])

In [9]:
   
def CNRank(Cns,mi):
    Ranking = []
    for (Cn,Gts,M) in Cns:
        cosprod = 1
        
        for relation in Cn:
            if(type(relation) is str):
                continue

            (table,attribute,predicates) = relation
            
            (norm_attribute,distinct_terms) = attributeHash[table][attribute]
            
            wsum = 0
            
            for term in predicates:
                
                IAF = wordHash[term][0] 
                
                ctids = wordHash[term][1][table][attribute]
                fkj = len(ctids)
                
                if fkj>0:
                    
                    TF = log1p(fkj) / log1p(distinct_terms)
                    
                    wsum = wsum + TF*IAF
                        
            cos = wsum/norm_attribute
            cosprod *= cos

        score = mi * cosprod * 1/len(Cn)
        Ranking.append((Cn,Gts,M,score))
        
    return sorted(Ranking,key=lambda x: x[-1],reverse=True)

In [10]:
RankedCns=CNRank(Cns,2700000000000)

for (Cn,Gts,M,Score) in RankedCns:
    print(Score)
    print(Cn)


0.8850418213061707
[('char', 'name', frozenset({'gangster'})), 'casting', ('person', 'name', frozenset({'denzel', 'washington'}))]
0.7641225382910043
[('casting', 'note', frozenset({'denzel', 'washington'})), ('char', 'name', frozenset({'gangster'}))]
0.5386933903008395
[('movie', 'title', frozenset({'gangster'})), 'casting', ('person', 'name', frozenset({'denzel', 'washington'}))]
0.46509413549494394
[('movie', 'title', frozenset({'gangster'})), ('casting', 'note', frozenset({'denzel', 'washington'}))]
0.30094221229504214
[('casting', 'note', frozenset({'gangster'})), ('person', 'name', frozenset({'denzel', 'washington'}))]
0.11547814733566725
[('casting', 'note', frozenset({'gangster'})), 'role', ('casting', 'note', frozenset({'denzel', 'washington'}))]
1.354869245431144e-07
[('char', 'name', frozenset({'gangster'})), 'casting', ('person', 'name', frozenset({'washington'})), 'casting', ('person', 'name', frozenset({'denzel'}))]
1.2605606284910073e-07
[('char', 'name', frozenset({'was

In [11]:
def getSQLfromCN(Gts,Cn):
    #print('CN:\n',Cn)
    
    tables = set()
    conditions=[]
    for i in range(len(Cn)):
        if(type(Cn[i]) is str):
            A = Cn[i]
        else:
            (A,attrA,keywords) = Cn[i]           
            
            #tratamento de keywords
            for term in keywords:
                condition = 'CAST('+A + '.' + attrA + ' AS VARCHAR) ILIKE \'%' + term + '%\''
                conditions.append(condition)
                
        tables.add(A)
        
        if(i<len(Cn)-1):
            if(type(Cn[i+1]) is str):
                B = Cn[i+1]
            else:
                (B,attrB,keywords)=Cn[i+1]
            
            edge_info = Gts[Cn[i]][Cn[i+1]]
            
            (direction,joining_attrA,joining_attrB) = edge_info
            
            condition = A + '.' + joining_attrA + ' = ' + B + '.' + joining_attrB         
            conditions.append(condition)
            
    #print('TABLES:\n',tables)
    #print('CONDITIONS:')
    #pp(conditions)
    
    sqlText = 'SELECT * FROM ' + ', '.join(tables)
    sqlText += ' WHERE ' + ' AND '.join(conditions)
    #print('SQL:\n',sql)
    return sqlText

In [12]:
QuerySet = []
with open('querysets/queryset_imdb_coffman.txt') as f:
    for line in f.readlines():
        Q = [word for word in line.split() if word not in stopwords.words('english')]
        QuerySet.append(Q)
        
QuerySet

[['denzel', 'washington'],
 ['clint', 'eastwood'],
 ['john', 'wayne'],
 ['smith'],
 ['harrison', 'ford'],
 ['julia', 'roberts'],
 ['tom', 'hanks'],
 ['johnny', 'depp'],
 ['angelina', 'jolie'],
 ['morgan', 'freeman'],
 ['gone', 'wind'],
 ['star', 'wars'],
 ['casablanca'],
 ['lord', 'rings'],
 ['sound', 'music'],
 ['wizard', 'oz'],
 ['notebook'],
 ['forrest', 'gump'],
 ['princess', 'bride'],
 ['godfather'],
 ['atticus', 'finch'],
 ['indiana', 'jones'],
 ['james', 'bond'],
 ['rick', 'blaine'],
 ['kane'],
 ['hannibal', 'lecter'],
 ['norman', 'bates'],
 ['darth', 'vader'],
 ['nurse', 'ratched'],
 ['looking', 'kid'],
 ['hamill', 'skywalker'],
 ['hanks', '2004'],
 ['russell', 'crowe', 'gladiator'],
 ['brent', 'spiner', 'star', 'trek'],
 ['audrey', 'hepburn', '1951'],
 ['jacques', 'clouseau'],
 ['jack', 'ryan'],
 ['rocky', 'stallone'],
 ['terminator'],
 ['harrison', 'ford', 'george', 'lucas'],
 ['sean', 'connery', 'fleming'],
 ['dean', 'jones', 'herbie']]

In [13]:
def getGoldenStandards():
    goldenStandards = []
    for file in glob.iglob('golden_standards/*.txt', recursive=False):
        with open(file) as f:

            gs = set()

            for line in f.readlines():
              
                line_without_comment =line.split('#')[0]
                
                gs = gs | set(re.findall('\d+', line_without_comment))
            
            gs  = {int(i) for i in gs}
            
            goldenStandards.append(gs)
    return goldenStandards


def isCnRelevant(CnResult,goldenStandard):
    result_search_ids = set()
    for result_tuple in CnResult:
        for value in result_tuple:
            if value in goldenStandard:
                result_search_ids.add(value)
    return result_search_ids == goldenStandard

goldenStandards = getGoldenStandards()
goldenStandards

[{39927668},
 {39172749},
 {39931125},
 {39807078},
 {39214967},
 {40463372},
 {39295438},
 {39141807},
 {40255278},
 {39223764},
 {43846320},
 {44160570, 44182175, 44190586, 44209805, 44215230, 44215530},
 {43757282},
 {44246360, 44282840, 44300268},
 {44308629},
 {44317680},
 {44278650},
 {43843863},
 {44311171},
 {44306726},
 {7900883, 22335633, 44264090},
 {4783411,
  4783412,
  4783414,
  4783449,
  21302298,
  43931406,
  43952269,
  44013047,
  44158312},
 {3324865,
  3324877,
  3324989,
  3324992,
  3860898,
  3860902,
  3860913,
  3860915,
  3860959,
  3861043,
  3861058,
  3959151,
  3959180,
  4068882,
  4068906,
  6454410,
  7328399,
  7328442,
  7328489,
  7328494,
  7328502,
  7328561,
  7328590,
  21253184,
  43756690,
  43820265,
  43874345,
  43888167,
  43888349,
  43904881,
  43905671,
  43912633,
  43923452,
  44028316,
  44034783,
  44063958,
  44066193,
  44083502,
  44103656,
  44201168,
  44226252,
  44230834,
  44230855,
  44260665,
  44306714,
  44319790,
  44

In [14]:
def evaluateCNs(numQ,RankedCns):
    
    evaluationResults = []
    
    # Connect to an existing database
    conn = psycopg2.connect("dbname=imdb user=postgres")

    # Open a cursor to perform database operations
    cur = conn.cursor()
    
    position = 0
    for (Cn,Gts,M,score) in RankedCns:
        if Cn is not None:
            position+=1
            sqlText = getSQLfromCN(Gts,Cn)
            cur.execute(sqlText)
            results = cur.fetchall()

            if(results != []):
                print('-------------------------------------------------------------------------------------\n')
                print(position,'º CANDIDATE NETWORK')              
                print(Cn)
                print('\n',sqlText,'\n')

                if isCnRelevant(results,goldenStandards[numQ]):
                    print('RELEVANTE')
                    top1 = (position == 1)
                    MMR  = (1/position)
                    
                    evaluationResults.append((top1,MMR))
                    
                    print(results)
                    break
                
                print(results)


                
    return evaluationResults

In [None]:
numQ = -1

Er = []

for Q in QuerySet: 
    numQ+=1
    print('===============================================================================================================\n\n')
    print('QUERYSET ',Q)
    Rq = TSFind(Q)
    Mq = QMGen(Q,Rq)
    G = getSchemaGraph()
    Cns = []                        
    for M in Mq: 
        Gts =  MatchGraph(Rq,G,M)
        Cn = SingleCN(M,Gts,10,Q)
        if(Cn is not None):
            Cns.append( (Cn,Gts,M) )


    RankedCns=CNRank(Cns,2700000000000)

    evaluationResults = evaluateCNs(numQ,RankedCns)
    
    Er.append(evaluationResults)

print('|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||\n RESULTADOS')
print(Er)
        

In [68]:
Q= ['julia','roberts','woman']

tudoEstaIndexado = True
for word in Q:
    if wordHash.get(word) is None:
        print("Palavra ",word,"não indexada.")
        tudoEstaIndexado = False
        
if tudoEstaIndexado: 
    Rq = TSFind(Q)
    Mq = QMGen(Q,Rq)
    G = getSchemaGraph()
    Cns = []                        
    for M in Mq: 
        Gts =  MatchGraph(Rq,G,M)
        Cn = SingleCN(M,Gts,10,Q)
        if(Cn is not None):
            Cns.append( (Cn,Gts,M) )
        else:
            print('None Cn')
    RankedCns=CNRank(Cns,2700000000000)
    evaluationResults = evaluateCNs(1,RankedCns)

TUPLE SETS CREATED
79 QUERY MATCHES CREATED
SCHEMA CREATED
-------------------------------------------------------------------------------------

4 º CANDIDATE NETWORK
[('person', 'name', frozenset({'julia', 'roberts'})), 'casting', ('char', 'name', frozenset({'woman'}))]

 SELECT * FROM char, person, casting WHERE CAST(person.name AS VARCHAR) ILIKE '%julia%' AND CAST(person.name AS VARCHAR) ILIKE '%roberts%' AND person.id = casting.person_id AND casting.person_role_id = char.id AND CAST(char.name AS VARCHAR) ILIKE '%woman%' 

[(2156129, 'Spokesperson & Chairwoman of School Bus America', None, None, 'S1216', 'A562', 23355021, 1561193, 'Roberts, Julia', 'I', None, 'R1632', 'J4616', None, 40463372, 11247202, 1561193, 173562, 2156129, None, 7, 2, 13694493)]
-------------------------------------------------------------------------------------

7 º CANDIDATE NETWORK
[('movie', 'title', frozenset({'woman'})), 'casting', ('person', 'name', frozenset({'julia', 'roberts'}))]

 SELECT * FROM mov