## Create KB from WebQSP Data 
* Load the train,test KB triplets from KGQA WebQSP data 
* Write all the KB Triplets from there

In [1]:
from pathlib import Path 
import os 

root_dir = Path(os.getcwd())

data_dir = root_dir/'datasets/WebQSP/'

In [2]:
import json 
import os 
from pathlib import Path 

def get_ent_ids_qa(path, id_to_ent={}, relations_found=set()):
    """
    Get entid to ent string from question pairs
    
    
    """    
    with open(path, encoding='utf-8') as f: 
        data = json.load(f)

    for question in data['Questions']:
        question_string = question['RawQuestion']
        relation_path = question['Parses'][0]['InferentialChain']
        
        if not relation_path:
            continue 
            
        answer_entities = [row['EntityName'] for row in question['Parses'][0]['Answers']]
        answer_entity_ids = [row['AnswerArgument'] for row in question['Parses'][0]['Answers']]
                
        subject_entities = question['Parses'][0]['TopicEntityName']
        subject_entity_id = question['Parses'][0]['TopicEntityMid']
    
        assert len(answer_entities) == len(answer_entity_ids)
        
        for entid, ent_name in zip(answer_entity_ids, answer_entities):
            if entid not in id_to_ent:
                id_to_ent[entid] = ent_name
                
                
        if subject_entity_id not in id_to_ent:
            id_to_ent[subject_entity_id] = subject_entities
            

            
        relations_found.update(relation_path)
            
    print("Number of entities ", len(id_to_ent))
    print("Number of relations ", len(relations_found))
    
    return id_to_ent, relations_found


root_dir = Path(os.getcwd())


if not os.path.exists(root_dir/'datasets/WebQSP/data_processed'):
    os.mkdir(root_dir/'datasets/WebQSP/data_processed')


path_train = root_dir/'datasets/WebQSP/data/WebQSP.train.json'
path_test = root_dir/'datasets/WebQSP/data/WebQSP.test.json'


id_to_ent, relations_found = get_ent_ids_qa(path_train)
id_to_ent, relations_found = get_ent_ids_qa(path_test, id_to_ent, relations_found)

Number of entities  28111
Number of relations  483
Number of entities  38456
Number of relations  555


In [3]:
kg_dir = data_dir/"data_embedkgqa/EmbedKGQA/data/fbwq_full"

train_kg_path = kg_dir/'train.txt'
test_kg_path = kg_dir/'test.txt'

kg_data = set()
all_entities = set()
all_relations = set()

for kg_path in [train_kg_path, test_kg_path]:
    with open(train_kg_path, encoding='utf-8') as f:
        for line in f: 
            res = line.split('\t')

            headid, relname, tailid = res
            
            headid = headid.strip()
            tailid = tailid.strip()
            relname = relname.strip()
            

            all_entities.add(headid)
            all_entities.add(tailid)
            all_relations.add(relname)
            
            kg_data.add((headid, relname, tailid))
            
            
print('Number of entities in KB', len(all_entities))
print('Number of relations in KB', len(all_relations))
print('Number of triplets in KB', len(kg_data))            

Number of entities in KB 1886681
Number of relations in KB 572
Number of triplets in KB 5778443


In [4]:
missing = set(id_to_ent).difference(all_entities)
missing_rels = relations_found.difference(all_relations)

print('Num Missing Entities', len(missing))
print('Num Missing Relations', len(missing_rels))

Num Missing Entities 141
Num Missing Relations 18


In [5]:
write_dir = data_dir/'data_processed'

with open(write_dir/'qa_ents_missing_in_kb.json', 'w') as f: 
    json.dump(list(missing), f)

with open(write_dir/'qa_relations_missing_in_kb.json', 'w') as f: 
    json.dump(list(missing_rels), f)
    
    
with open(write_dir/'qa_ent_to_id.json', 'w') as f: 
    json.dump(dict(id_to_ent), f)
    
kb_path = write_dir/'kb.txt'

with open(kb_path, 'w', encoding='utf-8') as f: 
    f.write("")

    
fp = open(kb_path, 'a', encoding='utf-8')
for row in kg_data: 
    assert len(row) == 3
    write = "|".join(row)
    
    fp.write(write+'\n')
    
fp.close()