# MatCNGenPy

This is a python implementation of the algorithms described in 
**Efficient Match-Based Candidate Network Generation for Keyword 
Queries over Relational Databases** paper.


## Installation
- Install virtualenv
- Run ```source bin/activate``` to enter the virtual environment
- Run ```pip install -r requirements.txt```
- Run ```python ModCNGen.py```

In [1]:
import psycopg2
from psycopg2 import sql
from pprint import pprint as pp
from collections import defaultdict
import string
import itertools
import copy
from math import log1p
from queue import deque
import ast

import nltk 
#nltk.download('wordnet')
#nltk.download('omw')
#nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

# Stopwords are skipped when indexing and when reading the query set.
# 'will' is removed from the stopword list so it stays searchable: the query
# set contains names such as 'will smith' and 'will kane'.
stw_set = set(stopwords.words('english')) - {'will'}

# Connect to an existing database
conn = psycopg2.connect("dbname=imdb user=postgres")

# Open a cursor to perform database operations
# (shared at module level by the indexing and evaluation functions)
cur = conn.cursor()

In [2]:
def createInvertedIndex():
    """Scan every user table once and build the term and attribute indexes.

    Returns:
        A pair ``(wordHash, attributeHash)``:

        * ``wordHash[word][table][column]`` is the list of ctids of the rows
          whose ``column`` value contains ``word`` (one entry per occurrence).
        * ``attributeHash[table][column]`` is ``(0, n)`` where ``n`` is the
          number of distinct indexed words seen in that column; the leading
          0 is a placeholder for the norm.

    Uses the module-level cursor ``cur`` and the stopword set ``stw_set``.
    """
    term_index = {}
    attribute_index = {}

    # Every table outside the PostgreSQL system schemas gets indexed.
    cur.execute("SELECT DISTINCT tablename FROM pg_tables WHERE schemaname!='pg_catalog' AND schemaname !='information_schema';")
    for table_record in cur.fetchall():
        table_name = table_record[0]
        print('INDEXING TABLE ',table_name)

        # A table name cannot be passed as a query parameter, so it is
        # composed into the statement as a quoted identifier.
        cur.execute(
            sql.SQL("SELECT ctid, * FROM {};").format(sql.Identifier(table_name))
        )
        column_names = [description[0] for description in cur.description]

        # column name -> set of distinct words found in that column
        vocabulary = {}

        for record in cur.fetchall():
            ctid = record[0]
            # Position 0 is the ctid itself; the real columns start at 1.
            for position in range(1, len(record)):
                column_name = column_names[position]

                for token in str(record[position]).lower().split():
                    word = token.strip(string.punctuation)

                    if word in stw_set:
                        continue

                    locations = term_index.setdefault(word, {})
                    columns = locations.setdefault(table_name, {})
                    columns.setdefault(column_name, []).append(ctid)

                    vocabulary.setdefault(column_name, set()).add(word)

        # Only the number of distinct words is kept; the sets are dropped.
        attribute_index[table_name] = {
            column_name: (0, len(words))
            for column_name, words in vocabulary.items()
        }

    print ('INVERTED INDEX CREATED')
    return (term_index, attribute_index)

(wordHash,attributeHash) = createInvertedIndex()

INDEXING TABLE  casting


KeyboardInterrupt: 

In [3]:
pp(wordHash['denzel'])

NameError: name 'wordHash' is not defined

In [4]:
def processIAF(wordHash,attributeHash):
    """Attach an inverse attribute frequency (IAF) to every indexed term.

    Every ``wordHash[term]`` is replaced in place by ``(IAF, locations)``,
    where ``locations`` is the previous value and
    ``IAF = log1p(total_attributes / attributes_with_this_term)``.

    Args:
        wordHash: term -> {table -> {column -> ctids}}; modified in place.
        attributeHash: table -> {column -> ...}; only its sizes are read.

    Returns:
        None.
    """
    # Number of (table, column) pairs known to the attribute index.
    attribute_total = sum(len(columns) for columns in attributeHash.values())

    # Only values of existing keys are reassigned, so iterating is safe.
    for term in wordHash:
        locations = wordHash[term]
        attributes_with_term = sum(len(columns) for columns in locations.values())
        wordHash[term] = (log1p(attribute_total/attributes_with_term), locations)

    print('IAF PROCESSED')
processIAF(wordHash,attributeHash)

NameError: name 'wordHash' is not defined

In [5]:
wordHash['denzel']

NameError: name 'wordHash' is not defined

In [6]:
attributeHash['casting']

NameError: name 'attributeHash' is not defined

In [7]:
# Connect to an existing database
conn = psycopg2.connect("dbname=imdb user=postgres")

# Open a cursor to perform database operations
cur = conn.cursor()

def processNormsOfAttributes(wordHash,attributeHash):
    """Accumulate, per attribute, the IAF of every indexed word occurrence.

    Re-scans every user table and, for each non-stopword token found in a
    column, adds that token's IAF (``wordHash[word][0]``) to the first
    element of ``attributeHash[table][column]``. The second element (the
    distinct-word count) is carried over unchanged.

    Args:
        wordHash: term -> (IAF, locations); read only.
        attributeHash: table -> {column -> (norm, distinct_words)};
            updated in place.

    Uses the module-level cursor ``cur`` and the stopword set ``stw_set``.
    """
    # Every table outside the PostgreSQL system schemas is scanned.
    cur.execute("SELECT DISTINCT tablename FROM pg_tables WHERE schemaname!='pg_catalog' AND schemaname !='information_schema';")
    for table_record in cur.fetchall():
        table_name = table_record[0]
        print('PROCESSING TABLE ',table_name)

        # A table name cannot be passed as a query parameter, so it is
        # composed into the statement as a quoted identifier.
        cur.execute(
            sql.SQL("SELECT ctid, * FROM {};").format(sql.Identifier(table_name))
        )
        column_names = [description[0] for description in cur.description]

        for record in cur.fetchall():
            for position, value in enumerate(record):
                if position == 0:
                    # Position 0 holds the ctid, not a real column.
                    continue
                column_name = column_names[position]

                for token in str(value).lower().split():
                    word = token.strip(string.punctuation)

                    if word in stw_set:
                        continue

                    running_norm, distinct_words = attributeHash[table_name][column_name]
                    attributeHash[table_name][column_name] = (
                        running_norm + wordHash[word][0],
                        distinct_words,
                    )

    print ('NORMS OF ATTRIBUTES PROCESSED')

processNormsOfAttributes(wordHash,attributeHash)

NameError: name 'wordHash' is not defined

In [8]:
attributeHash['casting']

NameError: name 'attributeHash' is not defined

In [9]:
wordHash.get('denzel')

NameError: name 'wordHash' is not defined

In [10]:
wordHash.get('denzel')[1]

NameError: name 'wordHash' is not defined

In [11]:
def TSFind(Q):
    """Find the non-free, non-empty tuple-sets for a keyword query.

    The tuple-set for a termset K on an attribute contains the tuples that
    hold all terms of K and no other keyword from Q.

    Args:
        Q: the keyword query, a list of keywords ``[k1, k2, ..., km]``.

    Returns:
        A set of ``(table, attribute, termset)`` triples, where ``termset``
        is a frozenset of keywords.

    Side effects:
        Rebinds the module-level ``P`` (termset -> set of
        ``(table, attribute, ctid)``) and reads the module-level
        ``wordHash``.
    """
    # Part 1: find the set of tuples containing each keyword
    global P
    P = {}
    for keyword in Q:
        entry = wordHash.get(keyword)
        # A keyword that was never indexed has no entry at all; treat it as
        # an empty tuple-set instead of failing on ``None[1]``.
        locations = entry[1] if entry is not None else {}
        P[frozenset([keyword])] = {
            (table, attribute, ctid)
            for table, attributes in locations.items()
            for attribute, ctids in attributes.items()
            for ctid in ctids
        }

    # Part 2: find the sets of tuples containing larger termsets
    P = TSInter(P)

    # Part 3: build the tuple-sets (the ctid is dropped, so all tuples of
    # one attribute collapse into a single entry per termset)
    Rq = {
        (table, attribute, termset)
        for termset, tuples in P.items()
        for (table, attribute, ctid) in tuples
    }
    print ('TUPLE SETS CREATED')
    return Rq


def TSInter(P):
    """Extend single-termset tuple sets with those of larger termsets.

    A termset is any non-empty subset K of the terms of a query Q. For each
    pair of termsets in ``P`` whose tuple sets intersect, the intersection
    is assigned to the union termset and removed from both members of the
    pair. The newly created termsets are then processed recursively.

    All pairs are examined; the previous implementation only looked at the
    first four, which dropped intersections for queries of four or more
    keywords.

    Args:
        P: dict mapping a termset (frozenset) to a set of tuples.

    Returns:
        A new dict with the (possibly reduced) original termsets plus every
        larger termset that has a non-empty tuple set. ``P`` and its sets
        are not modified.
    """
    # One-level copy: the tuple sets are only ever replaced, never mutated.
    Pprev = {termset: set(tuples) for termset, tuples in P.items()}
    Pcurr = {}

    # The pairs are fixed up front, but the tuple sets are read from Pprev
    # at each step so earlier subtractions are visible to later pairs.
    for (Ki, Kj) in itertools.combinations(list(Pprev), 2):
        Tki = Pprev[Ki]
        Tkj = Pprev[Kj]

        Tx = Tki & Tkj
        if Tx:
            Pcurr[Ki | Kj] = Tx
            Pprev[Ki] = Tki - Tx
            Pprev[Kj] = Tkj - Tx

    if Pcurr:
        Pcurr = TSInter(Pcurr)

    # Pprev = Pprev U Pcurr
    Pprev.update(Pcurr)
    return Pprev

Q = ['denzel','washington','gangster']
Rq = TSFind(Q)
pp(Rq)

NameError: name 'wordHash' is not defined

In [12]:
#Rq[frozenset({'denzel', 'washington'})]
#Mq = QMGen(Q,Rq)
def QMGen(Q,Rq):
    """Generate the query matches for a keyword query.

    A query match is a set of tuple-sets that, if properly joined, can
    produce networks of tuples that fulfil the query. They can be thought
    of as the leaves of a Candidate Network.

    Args:
        Q: the keyword query (list of keywords).
        Rq: the set of non-empty, non-free tuple-sets.

    Returns:
        A list of sets; each set is a combination of 1 to ``len(Q)``
        tuple-sets from ``Rq`` accepted by ``MinimalCover``.
    """
    return [
        set(candidate)
        for size in range(1, len(Q) + 1)
        for candidate in itertools.combinations(Rq, size)
        if MinimalCover(candidate, Q)
    ]


def MinimalCover(MC, Q):
    """Tell whether a match candidate is a total and minimal cover of Q.

    Total:   every keyword is contained in at least one tuple-set of the
             match.
    Minimal: no tuple-set can be removed from the match while still having
             a total cover.

    Args:
        MC: iterable of ``(table, attribute, termset)`` triples.
        Q: the keyword query (iterable of keywords).

    Returns:
        True when ``MC`` is both total and minimal, False otherwise.
    """
    keywords = set(Q)
    termsets = [termset for (_table, _attribute, termset) in MC]

    # Not total: it cannot be a cover at all.
    if set().union(*termsets) != keywords:
        return False

    # Minimal: leaving out any single tuple-set must break the cover.
    for skipped in range(len(termsets)):
        remaining = termsets[:skipped] + termsets[skipped + 1:]
        if set().union(*remaining) == keywords:
            return False

    return True

Mq = QMGen(Q,Rq)
print (len(Mq),'QUERY MATCHES CREATED')
for M in Mq:
    print(M,'\n\n')

NameError: name 'Rq' is not defined

In [13]:
def getSchemaGraph():
    """Build the schema graph of the database from its foreign keys.

    Returns:
        A dict ``G`` with one node per user table:
        ``G[table] = {foreign_table: (direction, column, foreign_column)}``.
        ``direction`` is 1 on the referencing table's side of a foreign key
        and -1 on the referenced table's side.

    Opens its own database connection and closes it before returning, so
    repeated calls do not accumulate open connections.
    """
    # Connect to an existing database
    conn = psycopg2.connect("dbname=imdb user=postgres")
    try:
        # Open a cursor to perform database operations
        cur = conn.cursor()

        G = {}
        cur.execute("SELECT tablename FROM pg_tables WHERE schemaname!='pg_catalog' AND schemaname !='information_schema';")
        for table in cur.fetchall():
            G.setdefault(table[0],{})

        # Named fk_query (not ``sql``) so the psycopg2 ``sql`` module
        # imported at file level is not shadowed.
        fk_query = """
            SELECT DISTINCT
                tc.table_name, kcu.column_name,
                ccu.table_name AS foreign_table_name, ccu.column_name AS foreign_column_name
            FROM information_schema.table_constraints AS tc
            JOIN information_schema.key_column_usage AS kcu
                ON tc.constraint_name = kcu.constraint_name
            JOIN information_schema.constraint_column_usage AS ccu
                ON ccu.constraint_name = tc.constraint_name
            WHERE constraint_type = 'FOREIGN KEY'
        """
        cur.execute(fk_query)
        relations = cur.fetchall()
    finally:
        conn.close()

    # Each foreign key yields one edge per direction.
    for (table,column,foreign_table,foreign_column) in relations:
        G[table][foreign_table] = (1,column, foreign_column)
        G[foreign_table][table] = (-1,foreign_column,column)
    print ('SCHEMA CREATED')
    return G

def MatchGraph(Rq, G, M):
    """Build the match graph Gts[M] for one query match.

    The match graph contains every node of the schema graph ``G`` (the
    free tuple-sets) plus one node per tuple-set of the query match ``M``.
    A tuple-set node starts with a copy of the edges its base table has at
    that moment, and each of those neighbours gets the reverse edge back.

    Args:
        Rq: the set of non-empty non-free tuple-sets (not used here).
        G: the schema graph,
            ``G[table] = {foreign_table: (direction, column, foreign_column)}``.
        M: a query match, an iterable of ``(table, attribute, keywords)``.

    Returns:
        A new graph with the same structure as ``G``; ``G`` is not modified.
    """
    match_graph = copy.deepcopy(G)

    # Insert the non-free nodes
    for (table, attribute, keywords) in M:
        node = (table, attribute, keywords)
        match_graph[node] = copy.deepcopy(match_graph[table])

        for neighbour, edge in match_graph[node].items():
            (direction, column, foreign_column) = edge
            # Reverse edge: flipped direction, swapped join columns.
            match_graph[neighbour][node] = (direction*(-1), foreign_column, column)

    return match_graph

G = getSchemaGraph()
pp(G)

print ('\nEXAMPLE OF MATCH GRAPH')
Gts = MatchGraph(Rq, G, Mq[0])
pp(Gts)

SCHEMA CREATED
{'casting': {'char': (1, 'person_role_id', 'id'),
             'movie': (1, 'movie_id', 'id'),
             'name': (1, 'person_id', 'id'),
             'role': (1, 'role_id', 'id')},
 'char': {'casting': (-1, 'id', 'person_role_id')},
 'movie': {'casting': (-1, 'id', 'movie_id')},
 'name': {'casting': (-1, 'id', 'person_id')},
 'role': {'casting': (-1, 'id', 'role_id')}}

EXAMPLE OF MATCH GRAPH


NameError: name 'Rq' is not defined

In [15]:
def containsMatch(Ji,M):
    """Return True when every tuple-set of the match M appears in Ji."""
    return all(relation in Ji for relation in M)

def isJNTSound(Gts,Ji):
    """Check a joining path for an unsound ``X - Y - X`` pattern.

    For every three consecutive nodes where the first and third are the
    same node, the path is rejected when the edge from the first to the
    middle node has direction -1.

    Args:
        Gts: match graph, ``Gts[node][neighbour] = (direction, ...)``.
        Ji: the path, a list of nodes.

    Returns:
        False when such a pattern is found, True otherwise (including for
        paths shorter than three nodes).
    """
    # NOTE(review): the comparison is on whole nodes, so two different
    # tuple-sets over the same table never match here - confirm whether
    # comparing table names was intended.
    for first, middle, third in zip(Ji, Ji[1:], Ji[2:]):
        if first == third and Gts[first][middle][0] == -1:
            return False
    return True

#CN = SingleCN(Mq[0],Gts,5,Q)
def SingleCN(M,Gts,Tmax,Q):
    """Search the match graph for one candidate network covering M.

    Starting from one tuple-set of the match, paths are extended one
    adjacent node at a time until a path contains every tuple-set of ``M``.

    Args:
        M: the query match, a collection of tuple-set nodes of ``Gts``.
        Gts: the match graph, ``Gts[node] = {neighbour: edge_info}``.
        Tmax: size limit; only paths with fewer than ``Tmax`` nodes are kept.
        Q: the keyword query (not used by the search itself).

    Returns:
        The first path (list of nodes) found that contains all of ``M``,
        or None when no such path exists within the size limit.
    """
    # ``deque`` lives in ``collections``; importing it from ``queue`` only
    # worked through that module's internal import.
    from collections import deque

    first_element = list(M)[0]
    J = [first_element]

    if len(M)==1:
        return J

    F = deque([J])
    while F:
        # NOTE(review): pop() takes the most recently added path, so the
        # search is depth-first - confirm that is the intended order.
        J = F.pop()
        u = J[-1]
        for adjacent in Gts[u]:
            # Free tuple-sets (plain table names) may repeat in a path;
            # non-free ones may appear only once.
            if isinstance(adjacent, str) or (adjacent not in J):
                Ji = J + [adjacent]
                if (Ji not in F) and (len(Ji)<Tmax) and (isJNTSound(Gts,Ji)):
                    if containsMatch(Ji,M):
                        return Ji
                    F.append(Ji)

    return None

def MatchCN(Rq,Mq,G,Q=None,Tmax=10):
    """Generate one candidate network per query match, where one exists.

    Args:
        Rq: the set of non-empty non-free tuple-sets.
        Mq: the list of query matches.
        G: the schema graph.
        Q: the keyword query, forwarded to ``SingleCN``. Optional: it used
            to be read from a module-level ``Q``, which is not necessarily
            the query being processed (or defined at all).
        Tmax: size limit forwarded to ``SingleCN`` (previously fixed at 10).

    Returns:
        A list of ``(Cn, Gts, M)`` triples: the candidate network, the match
        graph it was found in, and the query match it covers. Matches for
        which ``SingleCN`` returns None are skipped.
    """
    Cns = []
    for M in Mq:
        Gts = MatchGraph(Rq,G,M)
        Cn = SingleCN(M,Gts,Tmax,Q)
        if Cn is not None:
            Cns.append( (Cn,Gts,M) )
    return Cns

Cns = MatchCN(Rq,Mq,G)

# Each entry of Cns is a (Cn, Gts, M) triple. Unpack it so that the match
# graph and query match printed belong to the candidate network shown,
# instead of whatever Gts and M were last bound at module level.
for (Cn,Gts,M) in Cns:
    print('\n\n--------------------------------------------------\nGts\n')
    pp(Gts)
    print('\nM\n')
    pp(M)
    print('\nCN\n')
    pp(Cn)
    
#Cn=[('movie_info', frozenset({'gangster'})), 'title', ('cast_info', frozenset({'washington', 'denzel'}))] 

NameError: name 'Rq' is not defined

In [16]:
pp(attributeHash)

NameError: name 'attributeHash' is not defined

In [17]:
pp(Cns[0][1])

NameError: name 'Cns' is not defined

In [18]:
   
def CNRank(Cns,mi):
    """Score candidate networks and return them best first.

    For every non-free node ``(table, attribute, predicates)`` of a
    candidate network, a cosine-like value is computed as the sum of
    ``TF * IAF`` over its terms divided by the attribute norm, with
    ``TF = log1p(occurrences) / log1p(distinct_terms)``. The score of the
    network is ``mi`` times the product of those values, divided by the
    number of nodes in the network.

    Args:
        Cns: list of ``(Cn, Gts, M)`` triples.
        mi: constant multiplier applied to every score.

    Returns:
        A list of ``(Cn, Gts, M, score)`` sorted by descending score.

    Reads the module-level ``wordHash`` and ``attributeHash``.
    """
    scored = []
    for (Cn, Gts, M) in Cns:
        cosprod = 1

        for node in Cn:
            # Free tuple-sets are plain table names and carry no keywords.
            if isinstance(node, str):
                continue

            (table, attribute, predicates) = node
            (norm_attribute, distinct_terms) = attributeHash[table][attribute]

            weighted_sum = 0
            for term in predicates:
                (iaf, locations) = wordHash[term]
                occurrences = len(locations[table][attribute])

                if occurrences > 0:
                    tf = log1p(occurrences) / log1p(distinct_terms)
                    weighted_sum = weighted_sum + tf*iaf

            cosprod *= weighted_sum/norm_attribute

        scored.append((Cn, Gts, M, mi * cosprod * 1/len(Cn)))

    scored.sort(key=lambda entry: entry[-1], reverse=True)
    return scored

In [19]:
RankedCns=CNRank(Cns,2700000000000)

for (Cn,Gts,M,Score) in RankedCns:
    print(Score)
    print(Cn)


NameError: name 'Cns' is not defined

In [20]:
def getSQLfromCN(Gts,Cn):
    """Translate a candidate network into a SQL query string.

    Node ``i`` of the network is aliased ``t<i>``. The query selects a row
    of all ``__search_id`` values (``Tuples``), a row of
    ``(__search_id, __search_id)`` pairs for consecutive nodes
    (``Relationships``, only when the network has more than one node) and
    the keyword attribute of every non-free node. Conditions are one
    ``ILIKE '%term%'`` per keyword and one equi-join per consecutive pair.

    Args:
        Gts: match graph; ``Gts[a][b]`` is
            ``(direction, column_of_a, column_of_b)``.
        Cn: candidate network, a list whose items are either a table name
            (free node) or a ``(table, attribute, keywords)`` triple.

    Returns:
        The SQL text.
    """
    selected_attributes = []
    tables = []
    conditions = []
    relationships = []

    last = len(Cn) - 1
    for i, node in enumerate(Cn):
        if isinstance(node, str):
            # Free node: no attribute to select and no keyword to match.
            (table, attribute, keywords) = (node, '', ())
        else:
            (table, attribute, keywords) = node

        alias = 't' + str(i)

        if attribute != '':
            selected_attributes.append(alias + '.' + attribute)

        tables.append(table + ' ' + alias)

        # Keyword conditions. Terms end up inside a SQL string literal, so
        # single quotes are doubled; an apostrophe in a keyword would
        # otherwise terminate the literal and corrupt the statement.
        for term in keywords:
            escaped = term.replace("'", "''")
            conditions.append(
                'CAST(' + alias + '.' + attribute + " AS VARCHAR) ILIKE '%" + escaped + "%'"
            )

        # Join condition with the next node of the network.
        if i < last:
            next_alias = 't' + str(i + 1)
            (_direction, joining_attrA, joining_attrB) = Gts[node][Cn[i + 1]]

            relationships.append((alias, next_alias))
            conditions.append(
                alias + '.' + joining_attrA + ' = ' + next_alias + '.' + joining_attrB
            )

    tables_id = ['t' + str(i) + '.__search_id' for i in range(len(tables))]
    relationshipsText = [
        '(' + a + '.__search_id' + ',' + b + '.__search_id' + ')'
        for (a, b) in relationships
    ]

    parts = ['SELECT ', ' (' + ', '.join(tables_id) + ') AS Tuples ']
    if relationships:
        parts.append(', (' + ', '.join(relationshipsText) + ') AS Relationships')
    parts.append(' , ' + ' , '.join(selected_attributes))
    parts.append(' FROM ' + ', '.join(tables))
    parts.append(' WHERE ' + ' AND '.join(conditions))
    return ''.join(parts)

print('CN:\n',Cns[0][0])
getSQLfromCN(Cns[0][1],Cns[0][0])

NameError: name 'Cns' is not defined

In [21]:
def getQuerySets():
    """Load the keyword queries from ``querysets/queryset_imdb_coffman.txt``.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of whitespace-separated words of that line that are not in the
        module-level stopword set ``stw_set``.
    """
    with open('querysets/queryset_imdb_coffman.txt') as query_file:
        return [
            [word for word in line.split() if word not in stw_set]
            for line in query_file
        ]
        
QuerySet = getQuerySets()

In [22]:
def getGoldenStandards():
    """Load the expected results from ``golden_standards/001.txt`` .. ``050.txt``.

    In each file the text before ``#`` on a line is a Python literal
    describing one relevant result, and the comment on the third line holds
    the keyword query the file belongs to.

    Returns:
        A dict mapping the query (tuple of keywords, after removing
        stopwords and the words excluded from the experiments) to the list
        of relevant results read from its file.

    Reads the module-level stopword set ``stw_set``.
    """
    # Words not used in the OLIVEIRA experiments
    excluded_words = ['title','dr.',"here's",'char','name']

    goldenStandards = {}
    for file_number in range(1,51):
        filename = 'golden_standards/0'+str(file_number).zfill(2) +'.txt'
        with open(filename) as f:

            listOfTuples = []
            Q = ()
            # A separate name for the line index: it used to reuse the
            # file counter ``i``.
            for line_number, line in enumerate(f):

                line_without_comment = line.split('#')[0].strip()

                if line_number == 2:
                    comment_of_line = line.split('#')[1]
                    Q = tuple(
                        word for word in comment_of_line.split()
                        if word not in excluded_words and word not in stw_set
                    )

                # Blank and comment-only lines carry no result.
                if line_without_comment:
                    # literal_eval accepts only Python literals, so file
                    # content is parsed as data and never executed.
                    listOfTuples.append(ast.literal_eval(line_without_comment))

            goldenStandards[Q]=listOfTuples

    return goldenStandards


goldenStandards = getGoldenStandards()
goldenStandards

{('angelina', 'jolie'): [([40255278], [])],
 ('atticus',
  'finch'): [([44264090, 7900883, 22335633],
   [(7900883, 44264090), (7900883, 22335633)])],
 ('audrey',
  'hepburn',
  '1951'): [([40223661, 12011272, 43947185],
   [(12011272, 40223661), (12011272, 43947185)]), ([40223661,
    12011277,
    44086852],
   [(12011277, 40223661), (12011277, 44086852)]), ([40223661,
    12011281,
    44078626],
   [(12011281, 40223661), (12011281, 44078626)]), ([40223661,
    12011282,
    44082975],
   [(12011282, 40223661), (12011282, 44082975)]), ([40223661,
    12011323,
    44267426],
   [(12011323, 40223661), (12011323, 44267426)]), ([40223661,
    12011336,
    44231266],
   [(12011336, 40223661), (12011336, 44231266)])],
 ('brent',
  'spiner',
  'star',
  'trek'): [([39815480, 9103406, 44215275],
   [(9103406, 39815480), (9103406, 44215275)]), ([39815480, 9103405, 44194106],
   [(9103405, 39815480), (9103405, 44194106)]), ([39815480, 9103409, 44202515],
   [(9103409, 39815480), (9103409, 4

In [23]:
def evaluateCN(CnResult,goldenStandard):
    """Tell whether one result row covers one golden-standard row.

    Both arguments are ``(tuples, relationships)`` pairs. The result covers
    the golden row when it contains every golden tuple and every golden
    relationship, where a relationship ``(A, B)`` also matches ``(B, A)``.

    Returns:
        True when the result covers the golden row, False otherwise.
    """
    expected_tuples = set(goldenStandard[0])
    if not set(CnResult[0]).issuperset(expected_tuples):
        return False

    found_relationships = CnResult[1]
    return all(
        (A, B) in found_relationships or (B, A) in found_relationships
        for (A, B) in goldenStandard[1]
    )


def evaluanteResult(Result,Query):
    """Tell whether a result set contains every golden row of a query.

    Args:
        Result: list of normalized ``(tuples, relationships)`` rows.
        Query: the keyword query; ``tuple(Query)`` must be a key of the
            module-level ``goldenStandards``.

    Returns:
        True when each golden row is covered by at least one row of
        ``Result`` (as decided by ``evaluateCN``), False otherwise.
    """
    for expected_row in goldenStandards[tuple(Query)]:
        # Every row is evaluated, as before; no early exit on first match.
        verdicts = [evaluateCN(candidate, expected_row) for candidate in Result]
        if True not in verdicts:
            return False

    return True
            
            
x=[('(39292828,5360667,21231023)', '("(39292828,5360667)","(5360667,21231023)")', 'Hamill, Mark', 'Luke Skywalker'), ('(39292828,5360749,21231023)', '("(39292828,5360749)","(5360749,21231023)")', 'Hamill, Mark', 'Luke Skywalker'), ('(39292828,5360752,21231023)', '("(39292828,5360752)","(5360752,21231023)")', 'Hamill, Mark', 'Luke Skywalker'), ('(39292828,5360753,21231023)', '("(39292828,5360753)","(5360753,21231023)")', 'Hamill, Mark', 'Luke Skywalker')]
q = ['hamill', 'skywalker']

def _parseRelationships(raw):
    """Parse the Relationships column of a result row into a list of pairs.

    Returns an empty list when ``raw`` is not a relationships record (for
    single-node networks the second column is an ordinary attribute value).
    """
    if not isinstance(raw, str):
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return []
    if not isinstance(parsed, tuple) or not parsed:
        return []

    # Several relationships: a record of quoted records,
    # e.g. '("(1,2)","(2,3)")'.
    if all(isinstance(element, str) for element in parsed):
        try:
            return [ast.literal_eval(element) for element in parsed]
        except (ValueError, SyntaxError, TypeError):
            return []

    # Exactly one relationship: a plain record, e.g. '(1,2)'. This case
    # used to fall into the catch-all handler and come back as [].
    if len(parsed) == 2 and not any(isinstance(element, str) for element in parsed):
        return [parsed]

    return []


def normalizeResult(ResultFromDatabase):
    """Convert raw result rows into ``(tuples, relationships)`` pairs.

    Args:
        ResultFromDatabase: rows as returned by the cursor. Column 0 is
            either a single integer id or the text of a record of ids such
            as ``'(1,2,3)'``; column 1, when it is a relationships record,
            is text such as ``'("(1,2)","(2,3)")'`` or ``'(1,2)'``.

    Returns:
        A list with one ``(tuples, relationships)`` pair per row, where
        ``tuples`` is a list or tuple of ids and ``relationships`` is a list
        of id pairs (empty when column 1 is not a relationships record).

    The record text is parsed with ``ast.literal_eval``, so database
    content is treated as data and never executed.
    """
    normalizedResult = []

    for row in ResultFromDatabase:
        if isinstance(row[0], int):
            tuples = [row[0]]
        else:
            tuples = ast.literal_eval(str(row[0]))

        relationships = _parseRelationships(row[1] if len(row) > 1 else None)

        normalizedResult.append( (tuples,relationships) )
    return normalizedResult

normX = normalizeResult(x)

evaluanteResult(normX,q)

True

In [24]:
def getRelevantPosition(RankedCns,Q):
    """Find the rank of the first candidate network with a relevant answer.

    Each candidate network is translated to SQL, executed on the
    module-level cursor ``cur``, and its normalized result is checked
    against the golden standard of ``Q``.

    Args:
        RankedCns: list of ``(Cn, Gts, M, score)`` in ranking order.
        Q: the keyword query.

    Returns:
        The 1-based position of the first relevant candidate network, or
        -1 when none is relevant.
    """
    for (rank, (Cn, Gts, M, score)) in enumerate(RankedCns, start=1):
        cur.execute(getSQLfromCN(Gts, Cn))
        normalized_rows = normalizeResult(cur.fetchall())

        if evaluanteResult(normalized_rows, Q):
            return rank

    return -1

In [29]:
QuerySets = getQuerySets()
QuerySets[0:42]

[['denzel', 'washington'],
 ['clint', 'eastwood'],
 ['john', 'wayne'],
 ['will', 'smith'],
 ['harrison', 'ford'],
 ['julia', 'roberts'],
 ['tom', 'hanks'],
 ['johnny', 'depp'],
 ['angelina', 'jolie'],
 ['morgan', 'freeman'],
 ['gone', 'wind'],
 ['star', 'wars'],
 ['casablanca'],
 ['lord', 'rings'],
 ['sound', 'music'],
 ['wizard', 'oz'],
 ['notebook'],
 ['forrest', 'gump'],
 ['princess', 'bride'],
 ['godfather'],
 ['atticus', 'finch'],
 ['indiana', 'jones'],
 ['james', 'bond'],
 ['rick', 'blaine'],
 ['will', 'kane'],
 ['hannibal', 'lecter'],
 ['norman', 'bates'],
 ['darth', 'vader'],
 ['nurse', 'ratched'],
 ['looking', 'kid'],
 ['hamill', 'skywalker'],
 ['hanks', '2004'],
 ['russell', 'crowe', 'gladiator'],
 ['brent', 'spiner', 'star', 'trek'],
 ['audrey', 'hepburn', '1951'],
 ['jacques', 'clouseau'],
 ['jack', 'ryan'],
 ['rocky', 'stallone'],
 ['terminator'],
 ['harrison', 'ford', 'george', 'lucas'],
 ['sean', 'connery', 'fleming'],
 ['dean', 'jones', 'herbie']]

In [31]:
def preProcessing():
    """Build the module-level indexes used by the query pipeline.

    Rebinds the globals ``wordHash`` and ``attributeHash`` to a freshly
    created inverted index, then fills in the IAF of every term and the
    norm of every attribute.
    """
    global wordHash, attributeHash

    indexes = createInvertedIndex()
    wordHash, attributeHash = indexes

    processIAF(wordHash, attributeHash)
    processNormsOfAttributes(wordHash, attributeHash)
    print('PRE-PROCESSING STAGE FINISHED')
    

def main():
    """Run the whole pipeline over the query set and report the rankings.

    For every query: find the tuple-sets, generate the query matches,
    generate and rank the candidate networks, and look up the position of
    the first candidate network whose result matches the golden standard.

    Returns:
        A list with one position per query (1-based, or -1 when no
        candidate network was relevant), in query-set order.
    """
    QuerySets = getQuerySets()
    # NOTE(review): this binds a local only; evaluanteResult reads the
    # module-level goldenStandards. Confirm whether a refresh of the global
    # was intended.
    goldenStandards = getGoldenStandards()
    QuerySets.remove(['harrison', 'ford', 'george', 'lucas'])

    # The schema graph does not depend on the query, and MatchGraph works
    # on a deep copy of it, so it is built once rather than once per query.
    G = getSchemaGraph()

    EVALUATION_RESULTS = []

    for Q in QuerySets:

        print('QUERY-SET ',Q,'\n')

        print('FINDING TUPLE-SETS')
        Rq = TSFind(Q)
        print(len(Rq),'TUPLE-SETS CREATED')

        print('GENERATING QUERY MATCHES')
        Mq = QMGen(Q,Rq)
        print (len(Mq),'QUERY MATCHES CREATED')

        print('GENERATING CANDIDATE NETWORKS')
        Cns = MatchCN(Rq,Mq,G)
        # Report the networks actually generated, not the number of matches.
        print (len(Cns),'CANDIDATE NETWORKS CREATED')

        print('RANKING CANDIDATE NETWORKS')
        RankedCns=CNRank(Cns,2700000000000)

        print('EVALUATING ANSWER')
        Position = getRelevantPosition(RankedCns,Q)
        print('\nRELEVANT ANSWER IN POSITION ', Position,'\n\n')

        EVALUATION_RESULTS.append(Position)
    return EVALUATION_RESULTS

In [32]:
preProcessing()

INDEXING TABLE  casting
INDEXING TABLE  name
INDEXING TABLE  role
INDEXING TABLE  movie
INDEXING TABLE  char
INVERTED INDEX CREATED
IAF PROCESSED
PROCESSING TABLE  casting
PROCESSING TABLE  name
PROCESSING TABLE  role
PROCESSING TABLE  movie
PROCESSING TABLE  char
NORMS OF ATTRIBUTES PROCESSED
PRE-PROCESSING STAGE FINISHED


In [33]:
main()

QUERY-SET  ['denzel', 'washington'] 

FINDING TUPLE-SETS
TUPLE SETS CREATED
9 TUPLE-SETS CREATED
GENERATING QUERY MATCHES
14 QUERY MATCHES CREATED
GENERATING CANDIDATE NETWORKS
SCHEMA CREATED
14 CANDIDATE NETWORKS CREATED
RANKING CANDIDATE NETWORKS
EVALUATING ANSWER

RELEVANT ANSWER IN POSITION  1 


QUERY-SET  ['clint', 'eastwood'] 

FINDING TUPLE-SETS
TUPLE SETS CREATED
8 TUPLE-SETS CREATED
GENERATING QUERY MATCHES
11 QUERY MATCHES CREATED
GENERATING CANDIDATE NETWORKS
SCHEMA CREATED
11 CANDIDATE NETWORKS CREATED
RANKING CANDIDATE NETWORKS
EVALUATING ANSWER

RELEVANT ANSWER IN POSITION  1 


QUERY-SET  ['john', 'wayne'] 

FINDING TUPLE-SETS
TUPLE SETS CREATED
12 TUPLE-SETS CREATED
GENERATING QUERY MATCHES
20 QUERY MATCHES CREATED
GENERATING CANDIDATE NETWORKS
SCHEMA CREATED
20 CANDIDATE NETWORKS CREATED
RANKING CANDIDATE NETWORKS
EVALUATING ANSWER

RELEVANT ANSWER IN POSITION  1 


QUERY-SET  ['will', 'smith'] 

FINDING TUPLE-SETS
TUPLE SETS CREATED
12 TUPLE-SETS CREATED
GENERATING Q


RELEVANT ANSWER IN POSITION  -1 


QUERY-SET  ['nurse', 'ratched'] 

FINDING TUPLE-SETS
TUPLE SETS CREATED
5 TUPLE-SETS CREATED
GENERATING QUERY MATCHES
1 QUERY MATCHES CREATED
GENERATING CANDIDATE NETWORKS
SCHEMA CREATED
1 CANDIDATE NETWORKS CREATED
RANKING CANDIDATE NETWORKS
EVALUATING ANSWER

RELEVANT ANSWER IN POSITION  -1 


QUERY-SET  ['looking', 'kid'] 

FINDING TUPLE-SETS
TUPLE SETS CREATED
8 TUPLE-SETS CREATED
GENERATING QUERY MATCHES
16 QUERY MATCHES CREATED
GENERATING CANDIDATE NETWORKS
SCHEMA CREATED
16 CANDIDATE NETWORKS CREATED
RANKING CANDIDATE NETWORKS
EVALUATING ANSWER

RELEVANT ANSWER IN POSITION  -1 


QUERY-SET  ['hamill', 'skywalker'] 

FINDING TUPLE-SETS
TUPLE SETS CREATED
5 TUPLE-SETS CREATED
GENERATING QUERY MATCHES
4 QUERY MATCHES CREATED
GENERATING CANDIDATE NETWORKS
SCHEMA CREATED
4 CANDIDATE NETWORKS CREATED
RANKING CANDIDATE NETWORKS
EVALUATING ANSWER

RELEVANT ANSWER IN POSITION  1 


QUERY-SET  ['hanks', '2004'] 

FINDING TUPLE-SETS
TUPLE SETS CREATED
9 

[1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 4,
 -1,
 -1,
 -1]

In [38]:
x = [1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 4,
 -1,
 -1,
 -1]
len(x)


41

In [48]:
QuerySets
x
[QuerySets[i] for i in range(41) if x[i]==-1]

[['atticus', 'finch'],
 ['indiana', 'jones'],
 ['james', 'bond'],
 ['rick', 'blaine'],
 ['will', 'kane'],
 ['hannibal', 'lecter'],
 ['norman', 'bates'],
 ['darth', 'vader'],
 ['nurse', 'ratched'],
 ['looking', 'kid'],
 ['russell', 'crowe', 'gladiator'],
 ['jacques', 'clouseau'],
 ['jack', 'ryan'],
 ['terminator'],
 ['sean', 'connery', 'fleming'],
 ['dean', 'jones', 'herbie']]

In [49]:
pp(attributeHash)

{'casting': {'__search_id': (2946557.1130310143, 812695),
             'id': (2926482.0981843793, 812695),
             'movie_id': (2287363.3144858778, 181706),
             'note': (1988705.117386036, 26275),
             'nr_order': (1087837.3215669103, 332),
             'person_id': (2313027.8276497438, 273034),
             'person_role_id': (1515883.046458141, 206953),
             'role_id': (1318166.6321591807, 11)},
 'char': {'__search_id': (758181.4097895048, 206952),
          'id': (584192.4873340287, 206952),
          'imdb_id': (213784.5215601025, 1),
          'imdb_index': (213784.5215601025, 1),
          'name': (1204219.523615734, 77991),
          'name_pcode_nf': (458154.4252000812, 17351),
          'surname_pcode': (380590.3657178183, 7415)},
 'movie': {'__search_id': (665691.1324713555, 181706),
           'episode_nr': (187705.0247137609, 1),
           'episode_of_id': (187705.0247137609, 1),
           'id': (510658.84283990104, 181706),
           'imdb_id