In [246]:
import time
from itertools import repeat
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

import conceptnet_lite
from conceptnet_lite import Label, edges_between, edges_for

# ConceptNet Local Search

[Git Repo](https://github.com/ldtoolkit/conceptnet-lite)

In [2]:
## CACHING (!!! execute this cell only once !!!)
#

node_cache_1 = {}
node_cache_2 = {}

rel_cache = {}

#
##

In [325]:
## Globals
src = 'input/fRAT.csv'
rat_frat = 'frat'
check = '3'
###

In [317]:
def get_nodes_frat(node, depth=1):
    """ Given a node, get all the other nodes related to it
        The search is performed by looking at all the edges related to a particular node.

        Returns:
            dictionary of form:
                {"pick_someone's_brain": ['related_to'], 'blindly': ['related_to'], ...  'cross_purpose': ['related_to']}
    """
    if depth == 1:
        if node in node_cache_1.keys():
            return node_cache_1[node]
    elif depth == 2:
        if node in node_cache_2.keys():
            return node_cache_2[node]
    
    try:
        # edges starting from (our node)
        if node in node_cache_1.keys():
            nodes = node_cache_1[node]
        else:
            nodes = []
            for e in edges_for(Label.get(text=node, language='en').concepts, same_language=True):
                if e.start.text not in [node]:
                    nodes.append((e.start.text, e.relation.name))
                if e.end.text not in [node]:
                    nodes.append((e.end.text, e.relation.name))

        if depth == 2:
            nodes2 = []
            for n in nodes:
                for e in edges_for(Label.get(text=n[0], language='en').concepts, same_language=True):
                    if e.start.text not in [n[0]]:
                        nodes2.append((e.start.text, e.relation.name))
                    if e.end.text not in [n[0]]:
                        nodes2.append((e.end.text, e.relation.name))

        result = {}
        for tup in list(set(nodes if depth == 1 else nodes2)):
            if tup[0] not in result:
                result[tup[0]] = list()
                result[tup[0]].append(tup[1])
            else:
                result[tup[0]].append(tup[1])
        
        if depth == 1:
            node_cache_1[node] = result  
        else:
            node_cache_2[node] = result
        return result
    except Exception as e:
        msg = '!!!   No label for the node "{}"... Are you sure the spelling is correct?'.format(node)
        print(msg, e)
        return {}

def get_nodes_rat(word):
    """ Given a word, get all the compound words related to it as well as their relation name
        Compound words are basically being identified by the underscore (_)
    """
    # TODO: refactor the logic to be like frat
    try:
        result = []
        relation = []
        for e in edges_for(Label.get(text=word).concepts, same_language=True):
            if (e.start.text.find('_') != -1) & (e.start.text.find(word) != -1):
                result.append(e.start.text.replace(word, '').strip('_'))
                relation.append(e.relation.name)
            if (e.end.text.find('_') != -1) & (e.end.text.find(word) != -1):
                result.append(e.end.text.replace(word, '').strip('_'))
                relation.append(e.relation.name)

        joint_result = []
        for i in range(len(result)):
            if result[i].find('_') != -1:
                words = result[i].split('_')
                for word in words:
                    if word != '':
                        joint_result.append((word, relation[i]))
            else:
                joint_result.append((result[i], relation[i]))

        final_result = {}
        for tup in list(set(joint_result)):
            if tup[0] not in final_result:
                final_result[tup[0]] = list()
                final_result[tup[0]].append(tup[1])
            else:
                final_result[tup[0]].append(tup[1])
        return final_result
    except Exception as e:
        msg = '!!!   No label for the node "{}"... Are you sure the spelling is correct?'.format(node)
        print(msg, e)
        return {}

In [318]:
def get_conceptnet():
    conceptnet_lite.connect('conceptnet_database.db')

def save_csv(rat_frat, check, output):
    output = pd.DataFrame(list(output))
    f_name = rat_frat + '_' + str(check) + '_conceptnet_search.xlsx'
    dIr = os.path.join('output', rat_frat)
    if not os.path.exists(dIr):
        os.mkdir(dIr)
    output.to_excel(os.path.join(dIr, f_name), index=False)
    print('Saved results in {}'.format(f_name))
    
def check_for(relation_dict, check_for, query):
    results = [set(relation_dict[key].keys()) for key in relation_dict.keys()]
    
    if (len(query) == 2) & ('2' in check_for):
        yield results[0] & results[1], [query[0], query[1]]
    
    if (len(query) == 3) & ('2' in check_for):
        yield results[0] & results[1], [query[0], query[1]]
        yield results[0] & results[2], [query[0], query[2]]
        yield results[1] & results[2], [query[1], query[2]]
    
    if (len(query) == 3) & ('3' in check_for):
        yield results[0] & results[1] & results[2], [query[0], query[1], query[2]]
    

def get_output(solutions, query, relation_dict, ground_solution, has_solution, rel_general=True):
    """ rel_general = True defines whether the relationships should be constructed for all nodes.
        set to False for depth 2 because so many nodes"""
    solutions = list(solutions)    
    relations = [] # both directions
    to_solution = [] # node -> solution
    from_solution = [] # solution -> node
    
    # build a relationship message for: (1) node, (2) relation (3) solution
    # For example:
    #
    # cues: antlers, doe, fawn
    # relation: related_to
    # solution: deer
    # relationship message: antler is related_to deer, doe is related_to to deer, fawn is related_to to deer
    if has_solution & rel_general:
        for node in query:
            for sol in solutions:
                rel = ', '.join(relation_dict[node][sol.strip()]) # get the relationships for each node and solution
                relations.append(node + ' is "'+ rel + '" to ' + sol)

                key = node + ' - ' + sol
                if key in rel_cache.keys():
                    for r in rel_cache[key]:
                        from_solution.append(r)
                else:
                    rel_cache[key] = []                
                    for e in edges_between(Label.get(text=node, language='en').concepts, Label.get(text=sol, language='en').concepts):
                        rel = e.start.text + ' is "' + e.relation.name + '" to ' + e.end.text
                        to_solution.append(rel)
                        rel_cache[key].append(rel)

        for sol in solutions:
            for node in query:
                key = sol + ' - ' + node
                if key in rel_cache.keys():
                    for r in rel_cache[key]:
                        from_solution.append(r)
                else:
                    rel_cache[key] = []                
                    for e in edges_between(Label.get(text=sol, language='en').concepts, Label.get(text=node, language='en').concepts):
                        rel = e.start.text + ' is "' + e.relation.name + '" to ' + e.end.text
                        from_solution.append(rel)
                        rel_cache[key].append(rel)

    return {'FrAt': ', '.join(query),
            'ground solution': ground_solution,
            'solutions': ', '.join(solutions),
            'has_solution': has_solution,
            'relation': ' | '.join(relations),
            'relation_to_solution': ' | '.join(to_solution),
            'relation_from_solution': ' | '.join(from_solution)}

In [323]:
def compute(items):
    index, query, df, args, output, accuracy = items
    get_nodes = get_nodes_rat if args[0] == 'rat' else get_nodes_frat
    solution = df.iloc[index].wans
    relation_dict = {}
    for node in query:
        relation_dict[node] = get_nodes(node)
    
    # the format of the relation_dict at this point would be
    #
    # { 'query_node': {'related_node_1': ['relation_1', 'relation_2'], ..., 'related_node_n': ['relation_1']}
    #  'question': {"pick_someone's_brain": ['related_to'], 'blindly': ['related_to'], ... 'cross_purpose': ['related_to']},
    #  'reply': {'repone': ['related_to'], ... 'sentences': ['related_to']},
    #  'solution': {'solutionism': ['derived_from', 'related_to'],... 'exhibit': ['related_to']}
    # }
    checked = []
    for result, quer in check_for(relation_dict, args[1], query):
        print('Checking triple {}...'.format(quer))
        accuracy['total'] += 1
        has_solution = any(solution.lower().strip() == node for node in result)
        if has_solution:
            accuracy['tp'] += 1
            checked.append(quer)
            output.append(get_output(result, quer, relation_dict, solution, has_solution))
        else:
            relation_dict2 = {}
            print('Checking depth 2 for triple {}...'.format(quer))
            for node in quer:
                relation_dict2[node] = get_nodes(node, 2)
            for result2, quer2 in check_for(relation_dict2, args[1], quer):
                print('Checking depth 2 for tuple {}...'.format(quer2))
                if any((quer2 == q for q in checked)):
                    continue
                
                checked.append(quer2)
                has_solution = any(solution.lower().strip() == node for node in result2)
                if has_solution:
                    accuracy['tp'] += 1
                        
                output.append(get_output(result2, quer2, relation_dict2, solution, has_solution, True))
                        
    return output, accuracy

In [326]:
get_conceptnet()
output = []
accuracy = {
    'total': 0,
    'tp': 0
}
if rat_frat == 'rat': # csvs differ
    df = pd.read_csv(src)
else:
    df = pd.read_csv(src, sep=';')
queries = df.w1 + ' ' + df.w2 + ' ' + df.w3
queries = [list(map(lambda x: x.lower(), filter(len, line.split(' ')))) for line in queries]

# queries format
# [['question', 'reply', 'solution'], ... ['fault', 'incorrect', 'unjust']]
for item in tqdm(zip(range(0, len(queries)),
                queries,
                repeat(df),
                repeat((rat_frat, check)),
                repeat(output),
                repeat(accuracy))):
    output, accuracy = compute(item)

output.append({'Accuracy': str(round(100*accuracy['tp']/accuracy['total'], 2)) + '%'})
save_csv(rat_frat, check, output)

0it [00:00, ?it/s]

Checking triple ['question', 'reply', 'solution']...
Checking triple ['sensitive', 'sob', 'weep']...
Checking triple ['antlers', 'doe', 'fawn']...
Checking triple ['bud', 'dandelion', 'petals']...
Checking triple ['colt', 'mare', 'unicorn']...


6it [00:03,  1.57it/s]

!!!   No label for the node "royaly"... Are you sure the spelling is correct? <Model: Label> instance matching query does not exist:
SQL: SELECT "t1"."id", "t1"."text", "t1"."language_id" FROM "label" AS "t1" WHERE (("t1"."language_id" = ?) AND ("t1"."text" = ?)) LIMIT ? OFFSET ?
Params: [30, 'royaly', 1, 0]
Checking triple ['crown', 'royaly', 'throne']...
Checking triple ['algebra', 'calculus', 'trigonometry']...
Checking triple ['pedal', 'pull', 'shove']...
Checking triple ['clockwise', 'left', 'wrong']...
Checking triple ['flu', 'nauseous', 'virus']...
Checking triple ['astronomy', 'moon', 'twinkle']...
Checking triple ['bait', 'pond', 'tuna']...
Checking triple ['bandaid', 'trim', 'wound']...
Checking triple ['gravity', 'low', 'up']...
Checking triple ['emergency', 'rapid', 'slow']...
Checking triple ['brawl', 'debate', 'soldier']...
Checking triple ['birds', 'frog', 'kite']...
Checking triple ['finger', 'glove', 'palm']...
Checking triple ['bed', 'darkness', 'sedative']...
Checkin

29it [00:04,  2.23it/s]

!!!   No label for the node "yo-yo"... Are you sure the spelling is correct? <Model: Label> instance matching query does not exist:
SQL: SELECT "t1"."id", "t1"."text", "t1"."language_id" FROM "label" AS "t1" WHERE (("t1"."language_id" = ?) AND ("t1"."text" = ?)) LIMIT ? OFFSET ?
Params: [30, 'yo-yo', 1, 0]
Checking triple ['adults', 'development', 'yo-yo']...
Checking triple ['cemetery', 'coma', 'noose']...
Checking triple ['exam', 'scare', 'terror']...
Checking triple ['hand', 'toe', 'trigger']...
Checking triple ['angel', 'church', 'faith']...


48it [00:04, 10.70it/s]

!!!   No label for the node "commamder"... Are you sure the spelling is correct? <Model: Label> instance matching query does not exist:
SQL: SELECT "t1"."id", "t1"."text", "t1"."language_id" FROM "label" AS "t1" WHERE (("t1"."language_id" = ?) AND ("t1"."text" = ?)) LIMIT ? OFFSET ?
Params: [30, 'commamder', 1, 0]
Checking triple ['body', 'commamder', 'scull']...
Checking triple ['cello', 'scalpel', 'trumpet']...
Checking triple ['desk', 'quill', 'stapler']...
Checking triple ['arrest', 'badge', 'deputy']...
Checking triple ['electron', 'inertia', 'zest']...
Checking triple ['diet', 'strain', 'sweat']...
Checking triple ['assault', 'cop', 'murder']...
Checking triple ['drill', 'grave', 'spike']...
Checking triple ['care', 'tactful', 'willing']...
Checking triple ['midnight', 'saturn', 'wolf']...
Checking triple ['bloom', 'opportunity', 'split']...
Checking triple ['accomplished', 'dolphin', 'sly']...
Checking triple ['duck', 'sardine', 'sinker']...
Checking triple ['europe', 'mushroom'




In [247]:
print('Cached {} nodes in the first level, {} in the second and {} relationships'.format(
        len(node_cache_1), len(node_cache_2), len(rel_cache)))

Cached 139 nodes in the first level, 105 in the second and 51236 relationships
