In [1]:
import functools
import json
import sys

sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils import *

In [2]:
# Load the processed round-1 glossary. 'domain' and 'range' are cast to str and
# the literal 'nan' strings that the cast leaves behind are blanked out.
glossary = pd.read_csv('../data/metadata2kg/round1/r1_glossary_processed.csv').assign(
    domain=lambda frame: frame['domain'].astype(str).replace('nan', ''),
    range=lambda frame: frame['range'].astype(str).replace('nan', ''),
)

# Test metadata is stored as JSON Lines (one JSON object per line).
metadata = pd.read_json('../data/metadata2kg/round1/r1_test_metadata.jsonl', lines=True)
# Disabled: prefix each id with its row index to make it unique.
#metadata['id'] = metadata.index.astype(str) + '_' + metadata['id']

## Completion

In [3]:
# One-shot prompt template asking for the top five dbpedia.org/ontology/
# properties for a single column of a table.
# It is filled in with str.format() using three fields: {table_name},
# {col_names} and {label}. Doubled braces ({{ and }}) are str.format escapes
# that produce the literal braces of the JSON-like table description.
# NOTE(review): {col_names} is inserted via str(); if the value is a Python
# list it will render with single quotes, unlike the double-quoted example
# above it -- confirm this is intended.
prompt = """You are given a table in JSON-LD standard and a prompt. Take into account the table_name and table_columns to rank the top five official fine-grained property from dbpedia.org/ontology/ vocabulary for the prompt column. Don't respond in json.

Example:
table: 
{{"table_name": "Museum", "table_columns": ["RANG", "Museum", "Stadt", "Facebook-Fans"]}}

prompt: Stadt

fine-grained property:
1: location
2: locationCity
3: locationName
4: city
5: livingPlace

Task:
table: 
{{"table_name": "{table_name}", "table_columns": {col_names}}}

prompt: {label}

fine-grained property:
"""

In [None]:
import asyncio
import os
import time

def background(f):
    """Decorator that runs the blocking callable ``f`` in the loop's default executor.

    Calling the decorated function does not run ``f`` inline: it schedules
    ``f(*args, **kwargs)`` through ``loop.run_in_executor(None, ...)`` and
    returns the resulting awaitable future straight away.

    Args:
        f: Any synchronous callable.

    Returns:
        A wrapper with the same name/docstring as ``f`` that returns a future
        resolving to ``f``'s return value (or raising its exception).
    """
    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # run_in_executor() only forwards positional arguments; passing
        # **kwargs to it raises TypeError. Bind everything up front instead.
        call = functools.partial(f, *args, **kwargs)
        return asyncio.get_event_loop().run_in_executor(None, call)
    return wrapped

@background
def fn(chunk, output_dir='test_completion_baseline_k_matches_gpt_sc0', max_retries=10):
    """Query the model once per row of ``chunk`` and store each answer as JSON.

    For every row a prompt is built from the row's ``table_name``,
    ``table_columns`` and ``label`` fields, sent through ``message_gpt`` and
    the reply is written to ``<output_dir>/output<row index>.json``. Rows whose
    output file already exists are skipped, so an interrupted run can simply be
    started again.

    Args:
        chunk: DataFrame slice of the metadata; rows are visited with
            ``iterrows()``.
        output_dir: Folder receiving the per-row JSON files. Change it for
            each run (e.g. one folder per self-consistency sample).
        max_retries: Maximum number of attempts per row before giving up.

    Returns:
        None. Because of ``@background`` the caller receives a future.

    Note:
        ``message_gpt`` and ``get_numbered_ids`` are not defined in this
        notebook; presumably they come from ``utils`` -- verify.
    """
    # exist_ok=True: several workers start at the same time, so a separate
    # exists() check followed by makedirs() can fail with FileExistsError.
    os.makedirs(output_dir, exist_ok=True)

    for i, table in chunk.iterrows():
        json_file = output_dir + '/output' + str(i) + '.json'

        if os.path.isfile(json_file):
            continue

        # Derive a per-file seed offset from the characters of the file name.
        seed_offset = sum(ord(c) for c in json_file)

        print(table['id'])
        print('Table: ', i, table['label'])

        for retry in range(max_retries):
            try:
                messages = [
                    {"role": "user", "content": prompt.format(table_name=table['table_name'],
                                                            label=table['label'],
                                                            col_names=table['table_columns'])}
                ]
                # A different seed on every attempt so that a retry is not
                # asked to reproduce the exact same reply.
                m2 = message_gpt(messages, temperature=0.7, seed=retry+seed_offset)
                ranked_ids = get_numbered_ids(m2)
                print(m2)
                print(ranked_ids)

                # Serialise before opening the file: if anything above fails,
                # no empty/truncated file is left behind that a later run
                # would mistake for a finished result.
                payload = json.dumps({
                    'table': table.to_json(),
                    'document': [],
                    'analysis_k_matches': m2,
                    'k_matches': ranked_ids[-10:],
                    '1_match': []
                }, indent=4)

                with open(json_file, 'w') as f:
                    f.write(payload)

                # Success: stop retrying this row.
                break
            except Exception as e:
                print(e)

            print('Retrying: ', json_file)
            time.sleep(2)
        else:
            # Every attempt failed; report it instead of moving on silently.
            print('Giving up: ', json_file)

n_clients = 20


def report_worker_failure(future):
    """Print the exception of a finished worker future, if it raised one."""
    if future.cancelled():
        return
    error = future.exception()
    if error is not None:
        print('Worker failed: ', repr(error))


# Split the metadata round-robin: the row with index r goes to worker
# r % n_clients. fn() returns a future immediately, so keep the futures
# instead of discarding them; otherwise an exception raised inside a worker
# is never inspected.
futures = []
for i in range(n_clients):
    future = fn(metadata[metadata.index%n_clients==i])
    future.add_done_callback(report_worker_failure)
    futures.append(future)

# The workers keep running after this cell returns. To block until all of them
# are done (e.g. before merging), run `await asyncio.gather(*futures)` in a
# notebook cell.

## Merge

In [5]:
# self consistency join
import glob

folders = [] # add folders here
output_folder = 'test_completion_baseline_k_matches'


def accumulate_k_matches(aggregated, ranked_ids):
    """Fold one sample's ranked ids into ``aggregated`` (in place).

    Each occurrence of an id adds 1 to its ``count`` and ``1/(rank+0.01)`` to
    its ``rrf`` score, where ``rank`` is the 0-based position in
    ``ranked_ids``. An id that occurs several times in the same sample is
    aggregated into a single entry rather than listed twice.

    Args:
        aggregated: List of ``{'id', 'count', 'rrf'}`` dicts collected so far.
        ranked_ids: Ids of one sample, best first (assumed hashable, e.g. str).

    Returns:
        ``aggregated``, sorted by ``rrf`` in descending order.
    """
    by_id = {entry['id']: entry for entry in aggregated}
    for rank, match_id in enumerate(ranked_ids):
        weight = 1/(rank+0.01)
        entry = by_id.get(match_id)
        if entry is None:
            entry = {'id': match_id, 'count': 1, 'rrf': weight}
            by_id[match_id] = entry
            aggregated.append(entry)
        else:
            entry['count'] += 1
            entry['rrf'] += weight

    # sort by accumulated rrf score (stable, so ties keep insertion order)
    aggregated.sort(key=lambda entry: entry['rrf'], reverse=True)
    return aggregated


# if directory exists, clear its contents
if os.path.exists(output_folder):
    for file in glob.glob(output_folder + '/*.json'):
        os.remove(file)
else:
    os.makedirs(output_folder)

# Merge in memory, keyed by file name, and write every result once at the end
# instead of re-reading and re-writing the output file for each sample.
merged = {}
for folder in folders:
    for file in glob.glob(folder + '/*.json'):
        with open(file, 'r') as f:
            try:
                data = json.load(f)
            except Exception as e:
                # unreadable sample (e.g. empty or truncated file): skip it
                print(e)
                continue

        name = os.path.basename(file)
        record = merged.get(name)
        if record is None:
            # The first sample seen for this file provides the base record
            # (table, document, original k_matches, ...).
            record = merged[name] = data
            record['n_samples'] = 0
            record['k-matches'] = []
        else:
            # The free-text analysis belongs to a single sample only, so it is
            # dropped as soon as a second sample is merged in.
            record.pop('analysis_k_matches', None)
            record.pop('analysis_1_match', None)

        accumulate_k_matches(record['k-matches'], data['k_matches'])
        record['n_samples'] += 1

# write output to json files
for name, record in merged.items():
    with open(output_folder + '/' + name, 'w') as f:
        f.write(json.dumps(record, indent=4))

In [6]:
# filter ids that are not in the glossary
# Build the lookup once: a set gives constant-time membership tests, whereas
# `x in glossary['id_no_prefix'].values` re-creates the array and scans it
# linearly for every single candidate.
known_ids = set(glossary['id_no_prefix'])

for file in glob.glob(output_folder + '/*.json'):
    with open(file, 'r') as f:
        data = json.load(f)

    # keep only the matches whose id exists in the glossary
    data['k-matches'] = [d for d in data['k-matches'] if d['id'] in known_ids]
    # '1_match' is tolerated as missing and then written back as an empty list
    data['1_match'] = [d for d in data.get('1_match', []) if d['id'] in known_ids]

    # write the filtered record back to the same file
    with open(file, 'w') as f:
        f.write(json.dumps(data, indent=4))

## Eval

In [7]:
import glob

folder = 'test_completion_baseline_k_matches'
files = glob.glob(folder + '/output*.json')

# Load every merged result file.
data = []
for path in files:
    with open(path) as handle:
        data.append(json.load(handle))

# Build one mapping entry per result: the table id plus its candidates,
# ranked by rrf score.
mapping = []
for record in data:
    try:
        # 'table' holds a JSON string; decode it to reach the id
        table_id = json.loads(record['table'])['id']

        ranked = sorted(
            ({'id': hit['id'], 'score': hit['rrf']} for hit in record['k-matches']),
            key=lambda entry: entry['score'],
            reverse=True,
        )

        # scores are only rounded to 3 decimals (no normalisation)
        for entry in ranked:
            entry['score'] = round(entry['score'], 3)

        mapping.append({'id': table_id, 'mappings': ranked})
    except Exception as e:
        print(record)
        print(e)

# Turn the prefix-less ids into full DBpedia ontology URIs.
for item in mapping:
    for entry in item['mappings']:
        entry['id'] = 'http://dbpedia.org/ontology/' + entry['id']

# Write the mapping file, one JSON object per line.
with open('mapping_completion_baseline.jsonl', 'w') as out:
    out.writelines(json.dumps(item) + '\n' for item in mapping)