# MatCNGenPy

This is a python implementation of the algorithms described in 
**Efficient Match-Based Candidate Network Generation for Keyword 
Queries over Relational Databases** paper.


## Installation
- Install virtalenv
- Run ```source bin/activate``` to enter in the virtual enviroment
- Run ```pip install -r requirements.txt```
- Run ```python ModCNGen.py

In [113]:
import psycopg2
from psycopg2 import sql
from pprint import pprint as pp
from collections import defaultdict
import string
import itertools
import copy
from math import log1p
from queue import deque

In [None]:
# Connect to an existing database
conn = psycopg2.connect("dbname=imdb user=postgres")

# Open a cursor to perform database operations
cur = conn.cursor()

def createInvertedList():
    #Output: wordHash (Term Index) with this structure below
    #map['word'] = [ 'table': ( {column} , ['ctid'] ) ]

    '''
    The Term Index is built in a preprocessing step that scans only
    once all the relations over which the queries will be issued.
    '''
    
    wordHash = {}
    wordCount = {}
    
    # Get list of tablenames
    cur.execute("SELECT DISTINCT tablename FROM pg_tables WHERE schemaname!='pg_catalog' AND schemaname !='information_schema';")
    for table in cur.fetchall():
        table_name = table[0]
        print('INDEXING TABLE ',table_name)
        
        wordCount[table_name] = {}
        
        #Get all tuples for this tablename
        cur.execute(
            sql.SQL("SELECT ctid, * FROM {};").format(sql.Identifier(table_name))
            #NOTE: sql.SQL is needed to specify this parameter as table name (can't be passed as execute second parameter)
        )

        for row in cur.fetchall():
            for column in range(1,len(row)):
                column_name = cur.description[column][0]   
                ctid = row[0]

                #NOTE: Need to remove stopwords
                for word in [word.strip(string.punctuation) for word in str(row[column]).lower().split()]:

                    #If word entry doesn't exists, it will be inicialized (setdefault method),
                    #Append the location for this word
                    wordHash.setdefault(word, {})                    
                    wordHash[word].setdefault( table_name , {} )
                    wordHash[word][table_name].setdefault( column_name , [] ).append(ctid)
                    
                    wordCount[table_name].setdefault(column_name,set())
                    wordCount[table_name][column_name].add(word)
        
        #Count words
        for (column_name,wordSet) in wordCount[table_name].items():
            aux = len(wordSet)
            wordSet.clear()
            wordCount[table_name][column_name]=aux
            #print(table_name,'.',column_name,': ',wordCount[table_name][column_name])

    print ('INVERTED INDEX CREATED')
    return (wordHash,wordCount)

(wordHash,wordCount) = createInvertedList()

INDEXING TABLE  char_name
INDEXING TABLE  name
INDEXING TABLE  movie_info
INDEXING TABLE  role_type
INDEXING TABLE  title
INDEXING TABLE  cast_info
INVERTED INDEX CREATED


In [None]:
def processIAF(wordHash,wordCount):
    
    total_attributes = sum([len(relation) for relation in wordCount.values()]) 
    
    for (termo, values) in wordHash.items():
        for (table, attributes) in values.items():
            for (column,ctids) in attributes.items():
                
                num_occurences = len(ctids)
                
                IAF = log1p(total_attributes/num_occurences)
                
                wordHash[termo][table][column] = (IAF,ctids)

processIAF(wordHash,wordCount)

In [98]:
def TSFind(Q):
    #Input:  A keyword query Q=[k1, k2, . . . , km]
    #Output: Set of non-free and non-empty tuple-sets Rq

    '''
    The tuple-set Rki contains the tuples of Ri that contain all
    terms of K and no other keywords from Q
    '''
    
    #Part 1: Find sets of tuples containing each keyword
    global P
    P = {}
    for keyword in Q:
        tupleset = set()
        for (table,attributes) in wordHash.get(keyword).items():
            for (iaf,ctids) in attributes.values():
                for ctid in ctids:
                    tupleset.add( (table,ctid) )
        P[frozenset([keyword])] = tupleset

    #Part 2: Find sets of tuples containing larger termsets
    P = TSInter(P)

    #Part 3:Build tuple-sets
    Rq = set()
    for keyword , tuples in P.items():
        for (table,ctid) in tuples:
            Rq.add( (table,keyword) )
    print ('TUPLE SETS CREATED')
    return Rq


def TSInter(P):
    #Input: A Set of non-empty tuple-sets for each keyword alone P 
    #Output: The Set P, but now including larger termsets (process Intersections)

    '''
    Termset is any non-empty subset K of the terms of a query Q        
    '''
    
    Pprev = {}
    Pprev=copy.deepcopy(P)
    Pcurr = {}

    combinations = [x for x in itertools.combinations(Pprev.keys(),2)]
    for ( Ki , Kj ) in combinations[0:4]:
        Tki = Pprev[Ki]
        Tkj = Pprev[Kj]
        
        X = Ki | Kj
        Tx = Tki & Tkj        
        
        if len(Tx) > 0:            
            Pcurr[X]  = Tx            
            Pprev[Ki] = Tki - Tx         
            Pprev[Kj] = Tkj - Tx
            
    if Pcurr != {}:
        Pcurr = copy.deepcopy(TSInter(Pcurr))
        
    #Pprev = Pprev U Pcurr
    Pprev.update(Pcurr)     
    return Pprev       

Q = ['denzel','washington','gangster']
Rq = TSFind(Q)
pp(Rq)

TUPLE SETS CREATED
{('cast_info', frozenset({'denzel', 'washington'})),
 ('cast_info', frozenset({'gangster'})),
 ('cast_info', frozenset({'denzel'})),
 ('cast_info', frozenset({'washington'})),
 ('char_name', frozenset({'denzel'})),
 ('char_name', frozenset({'washington'})),
 ('char_name', frozenset({'gangster'})),
 ('movie_info', frozenset({'washington'})),
 ('movie_info', frozenset({'denzel'})),
 ('movie_info', frozenset({'denzel', 'washington'})),
 ('movie_info', frozenset({'gangster'})),
 ('movie_info', frozenset({'gangster', 'washington'})),
 ('name', frozenset({'denzel'})),
 ('name', frozenset({'washington'})),
 ('name', frozenset({'denzel', 'washington'})),
 ('title', frozenset({'gangster'})),
 ('title', frozenset({'washington'}))}


In [208]:
#Rq[frozenset({'denzel', 'washington'})]
#Mq = QMGen(Q,Rq)
def QMGen(Q,Rq):
    #Input:  A keyword query Q, The set of non-empty non-free tuple-sets Rq
    #Output: The set Mq of query matches for Q
    
    '''
    Query match is a set of tuple-sets that, if properly joined,
    can produce networks of tuples that fulfill the query. They
    can be thought as the leaves of a Candidate Network.
    
    '''
    
    Mq = []
    for i in range(1,len(Q)+1):
        for subset in itertools.combinations(Rq,i):
            if(MinimalCover(subset,Q)):
                Mq.append(subset)
    print ('QUERY MATCHES CREATED')
    return Mq


def MinimalCover(MC, Q):
    #Input:  A subset MC (Match Candidate) to be checked as total and minimal cover
    #Output: If the match candidate is a TOTAL and MINIMAL cover

    '''
    Total:   every keyword is contained in at least one tuple-set of the match
    
    Minimal: we can not remove any tuple-set from the match and still have a
             total cover.    
    '''
    
    Subset = [termset for table,termset in MC]
    u = set().union(*Subset)    
    
    isTotal = (u == set(Q))
    for element in Subset:
        
        new_u = list(Subset)
        new_u.remove(element)
        
        new_u = set().union(*new_u)
        
        if new_u == set(Q):
            return False
    
    return isTotal

Mq = QMGen(Q,Rq)

for M in Mq:
    print(M,'\n\n')

QUERY MATCHES CREATED
(('movie_info', frozenset({'denzel', 'washington'})), ('movie_info', frozenset({'gangster', 'washington'}))) 


(('movie_info', frozenset({'denzel', 'washington'})), ('title', frozenset({'gangster'}))) 


(('movie_info', frozenset({'denzel', 'washington'})), ('char_name', frozenset({'gangster'}))) 


(('movie_info', frozenset({'denzel', 'washington'})), ('cast_info', frozenset({'gangster'}))) 


(('movie_info', frozenset({'denzel', 'washington'})), ('movie_info', frozenset({'gangster'}))) 


(('char_name', frozenset({'denzel'})), ('movie_info', frozenset({'gangster', 'washington'}))) 


(('movie_info', frozenset({'gangster', 'washington'})), ('name', frozenset({'denzel', 'washington'}))) 


(('movie_info', frozenset({'gangster', 'washington'})), ('cast_info', frozenset({'denzel', 'washington'}))) 


(('movie_info', frozenset({'gangster', 'washington'})), ('name', frozenset({'denzel'}))) 


(('movie_info', frozenset({'gangster', 'washington'})), ('cast_info', froze

In [212]:
def getSchemaGraph():
    #Output: A Schema Graph G  with the structure below:
    # G['node'] = edges
    # G['table'] = { 'foreign_table' : (direction, column, foreign_column) }
    
    G = {} 
    cur.execute("SELECT tablename FROM pg_tables WHERE schemaname!='pg_catalog' AND schemaname !='information_schema';")
    for table in cur.fetchall():
        G.setdefault(table[0],{})
    
    sql = "SELECT DISTINCT \
                tc.table_name, kcu.column_name, \
                ccu.table_name AS foreign_table_name, ccu.column_name AS foreign_column_name \
            FROM information_schema.table_constraints AS tc  \
            JOIN information_schema.key_column_usage AS kcu \
                ON tc.constraint_name = kcu.constraint_name \
            JOIN information_schema.constraint_column_usage AS ccu \
                ON ccu.constraint_name = tc.constraint_name \
            WHERE constraint_type = 'FOREIGN KEY'"
    cur.execute(sql)
    relations = cur.fetchall()
    
    for (table,column,foreign_table,foreign_column) in relations:
        G[table][foreign_table] = (1,column, foreign_column)
        G[foreign_table][table] = (-1,foreign_column,column)
    print ('SCHEMA CREATED')
    return G

def MatchGraph(Rq, G, M):
    #Input:  The set of non-empty non-free tuple-sets Rq,
    #        The Schema Graph G,
    #        A Query Match M
    #Output: A Schema Graph Gts  with the structure below:
    # G['node'] = edges
    # G['table'] = { 'foreign_table' : (direction, column, foreign_column) }

    '''
    A Match Subgraph Gts[M] is a subgraph of G that contains:
        The set of free tuple-sets of G
        The query match M
    '''
    
    Gts = copy.deepcopy(G)
    
    tables = set()
    #Insert non-free nodes
    for (table , keywords) in M:
        Gts[(table,keywords)]=copy.deepcopy(Gts[table])
        for foreign_table , (direction,column,foreign_column) in Gts[(table,keywords)].items():
            Gts[foreign_table][(table,keywords)] = (direction*(-1),foreign_column,column)

    return Gts 

G = getSchemaGraph()
pp(G)

print('\n\nMATCH GRAPH FOR ',Mq[6],'\n')
Gts =  MatchGraph(Rq,G,Mq[6])
pp(Gts)

SCHEMA CREATED
{'cast_info': {'char_name': (1, 'person_role_id', 'id'),
               'name': (1, 'person_id', 'id'),
               'role_type': (1, 'role_id', 'id'),
               'title': (1, 'movie_id', 'id')},
 'char_name': {'cast_info': (-1, 'id', 'person_role_id')},
 'movie_info': {'title': (1, 'movie_id', 'id')},
 'name': {'cast_info': (-1, 'id', 'person_id')},
 'role_type': {'cast_info': (-1, 'id', 'role_id')},
 'title': {'cast_info': (-1, 'id', 'movie_id'),
           'movie_info': (-1, 'id', 'movie_id')}}


MATCH GRAPH FOR  (('movie_info', frozenset({'gangster', 'washington'})), ('name', frozenset({'denzel', 'washington'}))) 

{'cast_info': {'char_name': (1, 'person_role_id', 'id'),
               'name': (1, 'person_id', 'id'),
               'role_type': (1, 'role_id', 'id'),
               'title': (1, 'movie_id', 'id'),
               ('name', frozenset({'denzel', 'washington'})): (1,
                                                               'person_id',
         

In [210]:
def containsMatch(Ji,M):
    for relation in M:
        if relation not in Ji:
            return False
    return True

def isJNTSound(Gts,Ji):
    if len(Ji)<3:
        return True
    for i in range(len(Ji)-2):
        if (Ji[i],)[0] == (Ji[i+2],)[0]:
            edge_info = Gts[Ji[i]][Ji[i+1]]
            if(edge_info[0] == -1):
                return False
    return True

#CN = SingleCN(Mq[0],Gts,5,Q)
def SingleCN(M,Gts,Tmax,Q):
    F = deque()
    J = [M[0]]
    F.append(J)
    while F:
        J = F.pop()
        u = J[-1]
        for (adjacent,edge_info) in Gts[u].items():
            if (type(adjacent) is str) or (adjacent not in J):
                Ji = J + [adjacent]
                if (Ji not in F) and (len(Ji)<Tmax) and (isJNTSound(Gts,Ji)):
                    if(containsMatch(Ji,M)):
                       return Ji
                    else:
                       F.append(Ji)
Cn = SingleCN(Mq[6],Gts,10,Q)
pp(Cn)
#Cn=[('movie_info', frozenset({'gangster'})), 'title', ('cast_info', frozenset({'washington', 'denzel'}))] 

[('movie_info', frozenset({'gangster', 'washington'})),
 'title',
 'cast_info',
 ('name', frozenset({'denzel', 'washington'}))]


In [221]:
for M in Mq: 
    Gts =  MatchGraph(Rq,G,M)
    Cn = SingleCN(M,Gts,10,Q)
    
    # Connect to an existing database
    conn = psycopg2.connect("dbname=imdb user=postgres")

    # Open a cursor to perform database operations
    cur = conn.cursor()
    
    if Cn is not None:
        print('===============================================================================================================================================\n')
        pp(Cn)
        sqlText = getSQLfromCN(Gts,Cn)
        print('\n',sqlText,'\n')
        cur.execute(sqlText)
        results = cur.fetchall()
        print(results)


[('movie_info', frozenset({'denzel', 'washington'})),
 'title',
 ('movie_info', frozenset({'gangster', 'washington'}))]

 SELECT * FROM title, movie_info WHERE movie_info.info ILIKE '%denzel%' AND movie_info.info ILIKE '%washington%' AND movie_info.movie_id = title.id AND title.id = movie_info.movie_id AND movie_info.info ILIKE '%gangster%' AND movie_info.info ILIKE '%washington%' 

[]

[('movie_info', frozenset({'denzel', 'washington'})),
 ('title', frozenset({'gangster'}))]

 SELECT * FROM title, movie_info WHERE movie_info.info ILIKE '%denzel%' AND movie_info.info ILIKE '%washington%' AND movie_info.movie_id = title.id AND title.title ILIKE '%gangster%' 

[]

[('movie_info', frozenset({'denzel', 'washington'})),
 'title',
 'cast_info',
 ('char_name', frozenset({'gangster'}))]

 SELECT * FROM cast_info, title, char_name, movie_info WHERE movie_info.info ILIKE '%denzel%' AND movie_info.info ILIKE '%washington%' AND movie_info.movie_id = title.id AND title.id = cast_info.movie_id AND 

[]

[('char_name', frozenset({'gangster'})),
 'cast_info',
 ('name', frozenset({'denzel', 'washington'}))]

 SELECT * FROM cast_info, name, char_name WHERE char_name.name ILIKE '%gangster%' AND char_name.id = cast_info.person_role_id AND cast_info.person_id = name.id AND name.name ILIKE '%denzel%' AND name.name ILIKE '%washington%' 

[]

[('char_name', frozenset({'gangster'})),
 ('cast_info', frozenset({'denzel', 'washington'}))]

 SELECT * FROM cast_info, char_name WHERE char_name.name ILIKE '%gangster%' AND char_name.id = cast_info.person_role_id AND cast_info.note ILIKE '%denzel%' AND cast_info.note ILIKE '%washington%' 

[]

[('name', frozenset({'denzel', 'washington'})),
 ('cast_info', frozenset({'gangster'}))]

 SELECT * FROM cast_info, name WHERE name.name ILIKE '%denzel%' AND name.name ILIKE '%washington%' AND name.id = cast_info.person_id AND cast_info.note ILIKE '%gangster%' 

[]

[('name', frozenset({'denzel', 'washington'})),
 'cast_info',
 'role_type',
 'cast_info',
 'char

[]

[('char_name', frozenset({'denzel'})),
 'cast_info',
 ('char_name', frozenset({'gangster'})),
 'cast_info',
 ('title', frozenset({'washington'}))]

 SELECT * FROM cast_info, title, char_name WHERE char_name.name ILIKE '%denzel%' AND char_name.id = cast_info.person_role_id AND cast_info.person_role_id = char_name.id AND char_name.name ILIKE '%gangster%' AND char_name.id = cast_info.person_role_id AND cast_info.movie_id = title.id AND title.title ILIKE '%washington%' 

[]

[('char_name', frozenset({'denzel'})),
 ('cast_info', frozenset({'gangster'})),
 ('title', frozenset({'washington'}))]

 SELECT * FROM cast_info, title, char_name WHERE char_name.name ILIKE '%denzel%' AND char_name.id = cast_info.person_role_id AND cast_info.note ILIKE '%gangster%' AND cast_info.movie_id = title.id AND title.title ILIKE '%washington%' 

[]

[('char_name', frozenset({'denzel'})),
 'cast_info',
 ('title', frozenset({'washington'})),
 ('movie_info', frozenset({'gangster'}))]

 SELECT * FROM cast_info,

[]

[('char_name', frozenset({'washington'})),
 'cast_info',
 ('name', frozenset({'denzel'})),
 'cast_info',
 ('char_name', frozenset({'gangster'}))]

 SELECT * FROM cast_info, name, char_name WHERE char_name.name ILIKE '%washington%' AND char_name.id = cast_info.person_role_id AND cast_info.person_id = name.id AND name.name ILIKE '%denzel%' AND name.id = cast_info.person_id AND cast_info.person_role_id = char_name.id AND char_name.name ILIKE '%gangster%' 

[]

[('char_name', frozenset({'washington'})),
 ('cast_info', frozenset({'denzel'})),
 ('char_name', frozenset({'gangster'}))]

 SELECT * FROM cast_info, char_name WHERE char_name.name ILIKE '%washington%' AND char_name.id = cast_info.person_role_id AND cast_info.note ILIKE '%denzel%' AND cast_info.person_role_id = char_name.id AND char_name.name ILIKE '%gangster%' 

[]

[('char_name', frozenset({'washington'})),
 'cast_info',
 ('char_name', frozenset({'gangster'})),
 'cast_info',
 'role_type',
 'cast_info',
 'title',
 ('movie_info'


[('title', frozenset({'gangster'})),
 ('movie_info', frozenset({'denzel'})),
 'title',
 ('movie_info', frozenset({'washington'}))]

 SELECT * FROM title, movie_info WHERE title.title ILIKE '%gangster%' AND title.id = movie_info.movie_id AND movie_info.info ILIKE '%denzel%' AND movie_info.movie_id = title.id AND title.id = movie_info.movie_id AND movie_info.info ILIKE '%washington%' 

[]

[('title', frozenset({'gangster'})),
 'movie_info',
 'title',
 'cast_info',
 ('name', frozenset({'washington'})),
 'cast_info',
 ('name', frozenset({'denzel'}))]

 SELECT * FROM cast_info, title, name, movie_info WHERE title.title ILIKE '%gangster%' AND title.id = movie_info.movie_id AND movie_info.movie_id = title.id AND title.id = cast_info.movie_id AND cast_info.person_id = name.id AND name.name ILIKE '%washington%' AND name.id = cast_info.person_id AND cast_info.person_id = name.id AND name.name ILIKE '%denzel%' 

[(7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', 


[('title', frozenset({'gangster'})),
 ('cast_info', frozenset({'washington'})),
 ('name', frozenset({'denzel'}))]

 SELECT * FROM cast_info, title, name WHERE title.title ILIKE '%gangster%' AND title.id = cast_info.movie_id AND cast_info.note ILIKE '%washington%' AND cast_info.person_id = name.id AND name.name ILIKE '%denzel%' 

[]

[('title', frozenset({'gangster'})),
 ('cast_info', frozenset({'denzel'})),
 ('name', frozenset({'washington'}))]

 SELECT * FROM cast_info, title, name WHERE title.title ILIKE '%gangster%' AND title.id = cast_info.movie_id AND cast_info.note ILIKE '%denzel%' AND cast_info.person_id = name.id AND name.name ILIKE '%washington%' 

[]

[('title', frozenset({'gangster'})),
 ('movie_info', frozenset({'denzel'})),
 'title',
 'cast_info',
 ('name', frozenset({'washington'}))]

 SELECT * FROM cast_info, title, name, movie_info WHERE title.title ILIKE '%gangster%' AND title.id = movie_info.movie_id AND movie_info.info ILIKE '%denzel%' AND movie_info.movie_id = titl

[]

[('char_name', frozenset({'gangster'})),
 ('cast_info', frozenset({'denzel'})),
 'role_type',
 'cast_info',
 'char_name',
 'cast_info',
 'title',
 ('movie_info', frozenset({'washington'}))]

 SELECT * FROM cast_info, movie_info, role_type, title, char_name WHERE char_name.name ILIKE '%gangster%' AND char_name.id = cast_info.person_role_id AND cast_info.note ILIKE '%denzel%' AND cast_info.role_id = role_type.id AND role_type.id = cast_info.role_id AND cast_info.person_role_id = char_name.id AND char_name.id = cast_info.person_role_id AND cast_info.movie_id = title.id AND title.id = movie_info.movie_id AND movie_info.info ILIKE '%washington%' 

[]

[('char_name', frozenset({'gangster'})),
 'cast_info',
 ('name', frozenset({'washington'})),
 'cast_info',
 ('name', frozenset({'denzel'}))]

 SELECT * FROM cast_info, name, char_name WHERE char_name.name ILIKE '%gangster%' AND char_name.id = cast_info.person_role_id AND cast_info.person_id = name.id AND name.name ILIKE '%washington%' AND 

[]

[('cast_info', frozenset({'gangster'})),
 ('name', frozenset({'denzel'})),
 ('cast_info', frozenset({'washington'}))]

 SELECT * FROM cast_info, name WHERE cast_info.note ILIKE '%gangster%' AND cast_info.person_id = name.id AND name.name ILIKE '%denzel%' AND name.id = cast_info.person_id AND cast_info.note ILIKE '%washington%' 

[]

[('cast_info', frozenset({'gangster'})),
 ('name', frozenset({'washington'})),
 ('cast_info', frozenset({'denzel'}))]

 SELECT * FROM cast_info, name WHERE cast_info.note ILIKE '%gangster%' AND cast_info.person_id = name.id AND name.name ILIKE '%washington%' AND name.id = cast_info.person_id AND cast_info.note ILIKE '%denzel%' 

[]

[('cast_info', frozenset({'gangster'})),
 ('name', frozenset({'washington'})),
 'cast_info',
 'role_type',
 'cast_info',
 'char_name',
 'cast_info',
 'title',
 ('movie_info', frozenset({'denzel'}))]

 SELECT * FROM cast_info, movie_info, role_type, title, char_name, name WHERE cast_info.note ILIKE '%gangster%' AND cast_info.

In [145]:
def TF_IAF(term,table_name,column_name,wordHash,wordCount):
    
    (IAF,ctids) = wordHash[term][table_name][column_name]
    
    frequency = len(ctids)
    distinct_terms = wordCount[table_name][column_name]
    TF = log1p(frequency) / log1p(distinct_terms)
    
    return TF*IAF

In [146]:
Gts =  MatchGraph(Rq,G,Mq[0])
Cn = SingleCN(Mq[0],Gts,10,Q)
Cn

[('movie_info', frozenset({'denzel', 'washington'})),
 'title',
 ('movie_info', frozenset({'gangster', 'washington'}))]

In [193]:
def anorm(relation,Bj):
    return wordCount[relation][Bj]
    
def CNRank(Cns,mi):
    Ranking = []
    for (Cn,Gts) in Cns:
        cosprod = 1
        
        if Cn is None:
            continue
        
        for relation in Cn:
            if(type(relation) is str):
                continue

            (table_name,predicates) = relation

            for term in predicates:
                wsum = 0
                for Bj in wordHash[term][table_name].keys():       
                    w=TF_IAF(term,table_name,Bj,wordHash,wordCount)
                    wsum += w

                cos = wsum/anorm(table_name,Bj)
                cosprod *= cos

            score = mi * cosprod * 1/len(Cn)
            Ranking.append((Cn,Gts,score))
        
    return sorted(Ranking,key=lambda x: x[-1],reverse=True)

In [235]:
Cns = []
for M in Mq: 
    Gts =  MatchGraph(Rq,G,M)
    Cn = SingleCN(M,Gts,10,Q)
    Cns.append( (Cn,Gts) )
RankedCns=CNRank(Cns,200000)

for (Cn,Gts,Score) in RankedCns:
    print(Score)
    print(Cn)


0.9921613800900272
[('cast_info', frozenset({'gangster'})), ('name', frozenset({'denzel'})), ('cast_info', frozenset({'washington'}))]
0.9921613800900272
[('cast_info', frozenset({'gangster'})), ('name', frozenset({'washington'})), ('cast_info', frozenset({'denzel'}))]
0.7441210350675204
[('cast_info', frozenset({'gangster'})), ('name', frozenset({'washington'})), 'cast_info', ('name', frozenset({'denzel'}))]
0.5952968280540163
[('cast_info', frozenset({'gangster'})), 'role_type', ('cast_info', frozenset({'washington'})), 'char_name', ('cast_info', frozenset({'denzel'}))]
0.3629354197338176
[('title', frozenset({'gangster'})), ('cast_info', frozenset({'denzel', 'washington'}))]
0.33072046003000904
[('cast_info', frozenset({'gangster'})), ('name', frozenset({'washington'})), 'cast_info', 'role_type', 'cast_info', 'char_name', 'cast_info', 'title', ('movie_info', frozenset({'denzel'}))]
0.33072046003000904
[('cast_info', frozenset({'gangster'})), 'role_type', ('cast_info', frozenset({'wa

In [186]:
def getSQLfromCN(Gts,Cn):
    #print('CN:\n',Cn)
    
    tables = set()
    conditions=[]
    for i in range(len(Cn)):
        if(type(Cn[i]) is str):
            A = Cn[i]
        else:
            (A,keywords) = Cn[i]           
            
            #tratamento de keywords
            for term in keywords:
                attrA= [column_name for column_name in wordHash[term][A].keys()][0]
                condition = A + '.' + attrA + ' ILIKE \'%' + term + '%\''
                conditions.append(condition)
                
        tables.add(A)
        
        if(i<len(Cn)-1):
            if(type(Cn[i+1]) is str):
                B = Cn[i+1]
            else:
                B = Cn[i+1][0]
            
            edge_info = Gts[Cn[i]][Cn[i+1]]
            (direction,attrA,attrB) = edge_info
            
            condition = A + '.' + attrA + ' = ' + B + '.' + attrB         
            conditions.append(condition)
            
    #print('TABLES:\n',tables)
    #print('CONDITIONS:')
    #pp(conditions)
    
    sqlText = 'SELECT * FROM ' + ', '.join(tables)
    sqlText += ' WHERE ' + ' AND '.join(conditions)
    #print('SQL:\n',sql)
    return sqlText

In [217]:
# Connect to an existing database
conn = psycopg2.connect("dbname=imdb user=postgres")

# Open a cursor to perform database operations
cur = conn.cursor()

for (Cn,Gts,score) in RankedCns:
    if Cn is not None:
        sqlText = getSQLfromCN(Gts,Cn)
        cur.execute(sqlText)
        results = cur.fetchall()
        print(results)

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[(7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 43784946, 1023912, 'Washington, Denzel', None, None, 'W2523', 'D5242', None, 39927668, 327208, 31472, 15, "Frank Lucas: I don't care what you call it, put a choke-hold on the motherfucker and call it Blue Dog Shit.", None, 26096287), (7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 43784946, 1023912, 'Washington, Denzel', None, None, 'W2523', 'D5242', None, 39927668, 327207, 31472, 15, "Redtop: What's the matter? Ain't you niggas never seen naked coochie before?::Huey Lucas: Why they all naked?::Frank Lucas: So they can't steal nothin'.", None, 26096286), (7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 437

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[(7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 43784946, 1023912, 'Washington, Denzel', None, None, 'W2523', 'D5242', None, 39927668, 327208, 31472, 15, "Frank Lucas: I don't care what you call it, put a choke-hold on the motherfucker and call it Blue Dog Shit.", None, 26096287), (7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 43784946, 1023912, 'Washington, Denzel', None, None, 'W2523', 'D5242', None, 39927668, 327207, 31472, 15, "Redtop: What's the matter? Ain't you niggas never seen naked coochie before?::Huey Lucas: Why they all naked?::Frank Lucas: So they can't steal nothin'.", None, 26096286), (7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 43784946, 1023912, 'W

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[(7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 43784946, 1023912, 'Washington, Denzel', None, None, 'W2523', 'D5242', None, 39927668, 327208, 31472, 15, "Frank Lucas: I don't care what you call it, put a choke-hold on the motherfucker and call it Blue Dog Shit.", None, 26096287), (7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 43784946, 1023912, 'Washington, Denzel', None, None, 'W2523', 'D5242', None, 39927668, 327207, 31472, 15, "Redtop: What's the matter? Ain't you niggas never seen naked coochie before?::Hue

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[(7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 43784946, 1023912, 'Washington, Denzel', None, None, 'W2523', 'D5242', None, 39927668, 327208, 31472, 15, "Frank Lucas: I don't care what you call it, put a choke-hold on the motherfucker and call it Blue Dog Shit.", None, 26096287), (7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 43784946, 1023912, 'Washington, Denzel', None, None, 'W2523', 'D5242', None, 39927668, 327207, 31472, 15, "Redtop: What's the matter? Ain't you niggas never seen naked coochie before?::Huey Lucas: Why they all naked?::Frank Lucas: So they can't steal nothin'.", None, 26096286), (7425591

[]
[]
[(7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 43784946, 1023912, 'Washington, Denzel', None, None, 'W2523', 'D5242', None, 39927668, 327208, 31472, 15, "Frank Lucas: I don't care what you call it, put a choke-hold on the motherfucker and call it Blue Dog Shit.", None, 26096287), (7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 43784946, 1023912, 'Washington, Denzel', None, None, 'W2523', 'D5242', None, 39927668, 327207, 31472, 15, "Redtop: What's the matter? Ain't you niggas never seen naked coochie before?::Huey Lucas: Why they all naked?::Frank Lucas: So they can't steal nothin'.", None, 26096286), (7425591, 1023912, 31472, 213404, None, 1, 1, 9871867, 31472, 'American Gangster', None, 1, 2007, None, 'A5625', None, None, None, None, 43784946, 1023912, 'Washington, Denzel', None, None, 'W2523', 'D5242'

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


Tutorial
http://www.nltk.org/howto/wordnet.html
http://www.nltk.org/_modules/nltk/corpus/reader/wordnet.html

In [237]:
import nltk 
nltk.download('wordnet')
nltk.download('omw')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pr3ma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw to
[nltk_data]     C:\Users\pr3ma\AppData\Roaming\nltk_data...
[nltk_data]   Package omw is already up-to-date!


In [274]:
pp(wordCount)

{'cast_info': {'__search_id': 812695,
               'id': 812695,
               'movie_id': 181706,
               'note': 26395,
               'nr_order': 332,
               'person_id': 273034,
               'person_role_id': 206953,
               'role_id': 11},
 'char_name': {'__search_id': 206952,
               'id': 206952,
               'imdb_id': 1,
               'imdb_index': 1,
               'name': 78110,
               'name_pcode_nf': 17359,
               'surname_pcode': 7423},
 'movie_info': {'__search_id': 198678,
                'id': 198678,
                'info': 416236,
                'info_type_id': 1,
                'movie_id': 17553,
                'note': 1},
 'name': {'__search_id': 273034,
          'id': 273034,
          'imdb_id': 1,
          'imdb_index': 76,
          'name': 107066,
          'name_pcode_cf': 18183,
          'name_pcode_nf': 16421,
          'surname_pcode': 3751},
 'role_type': {'__search_id': 11, 'id': 11, 'role': 13},

In [340]:
wn.synsets('movie')[0].wup_similarity(wn.synsets('film')[0])

1.0

In [327]:
for syn in wn.synsets('note'):
    print(wn.synsets('movie')[0].wup_similarity(syn),syn)

0.26666666666666666 Synset('note.n.01')
0.631578947368421 Synset('note.n.02')
0.26666666666666666 Synset('note.n.03')
0.2857142857142857 Synset('note.n.04')
0.3076923076923077 Synset('note.n.05')
0.25 Synset('bill.n.03')
0.2857142857142857 Synset('note.n.07')
0.2857142857142857 Synset('eminence.n.01')
0.3076923076923077 Synset('note.n.09')
None Synset('note.v.01')
None Synset('notice.v.02')
None Synset('note.v.03')
None Synset('note.v.04')


In [331]:
for syn in wn.synsets('paper'):
    print(wn.synsets('article')[0].wup_similarity(syn),syn)

0.26666666666666666 Synset('paper.n.01')
0.4 Synset('composition.n.08')
0.5555555555555556 Synset('newspaper.n.01')
0.625 Synset('paper.n.04')
0.9411764705882353 Synset('paper.n.05')
0.2222222222222222 Synset('newspaper.n.02')
0.75 Synset('newspaper.n.03')
None Synset('paper.v.01')
None Synset('wallpaper.v.01')
