In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils import *

In [17]:
# Round-1 glossary of candidate properties.
glossary = pd.read_csv('../data/metadata2kg/round1/r1_glossary_processed.csv')
# Cast domain/range to str and blank out the 'nan' text that the cast
# produces for missing cells.
glossary = glossary.assign(
    domain=glossary['domain'].astype(str).replace('nan', ''),
    range=glossary['range'].astype(str).replace('nan', ''),
)
# Test metadata, stored as JSON lines (one record per line).
metadata = pd.read_json('../data/metadata2kg/round1/r1_test_metadata.jsonl', lines=True)
# Disabled: prefix the row index to the id to make it unique.
#metadata['id'] = metadata.index.astype(str) + '_' + metadata['id']

In [19]:
import json

# Read the cleaned glossary descriptions (a JSON mapping keyed by glossary id)
# and attach them to the glossary as the 'emb_desc' column.
with open('glossary_descriptions_cleaned.json', 'r') as desc_file:
    glossary_desc = json.load(desc_file)

glossary['emb_desc'] = glossary['id'].map(glossary_desc)

## Completion

In [18]:
# Prompt template for the ranking request. `fn` fills it with str.format,
# supplying table_name, col_names, label, num and descriptions.
# The model is asked to reply with an analysis followed by five numbered,
# bracketed identifiers; the text ends with the "### Answer" header so the
# completion starts right after it.
prompt = """Given a table '{table_name}' with columns {col_names} what is the best description for the column '{label}'?
I will provide you with {num} triples in the format [domain, property, range] with a description of the property, each indicated by number identifier [].

{descriptions}
You must rank the top five descriptions best matching the column and provide the 5 unique identifiers using the output format [].

### Example
Analysis: <reasoning>

1. [id1]
2. [id2]
3. [id3]
4. [id4]
5. [id5]

### Answer
"""

In [None]:
# Candidate mappings, presumably produced by an earlier re-ranking step.
# `fn` treats this as a list of records, each with an 'id' and a 'mappings'
# list whose items carry an 'id'.
# NOTE(review): read_jsonl_file is not defined in this notebook - presumably
# it comes from `from utils import *`; confirm.
rerank = read_jsonl_file('mapping_rerank.jsonl')

In [None]:
import asyncio
import os
import time

def background(f):
    """Decorator that runs the blocking function ``f`` in the loop's default executor.

    Calling the decorated function does not run ``f`` inline: it schedules
    the call with ``loop.run_in_executor`` and immediately returns the
    resulting awaitable future.

    Args:
        f: Blocking callable to run off the event-loop thread.

    Returns:
        A wrapper accepting the same positional and keyword arguments as
        ``f`` and returning the future for the scheduled call.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # run_in_executor forwards positional arguments only; passing
        # **kwargs to it raises TypeError, so bind everything up front.
        call = functools.partial(f, *args, **kwargs)
        return asyncio.get_event_loop().run_in_executor(None, call)
    return wrapped

@background
def fn(chunk):
    """Ask the model to rank glossary candidates for every row of ``chunk``.

    For each row the top 30 candidate ids are taken from ``rerank``, the
    matching glossary rows are shuffled, and ``prompt`` is sent to the model
    through ``message_gpt``. The answer is stored as one JSON file per row.
    Rows whose output file already exists are skipped, so an interrupted run
    can be resumed by calling this again.

    Because of ``@background`` a call returns a future immediately; an
    exception escaping this function ends the processing of the remaining
    rows of the chunk.

    Args:
        chunk: Slice of ``metadata``. Each row must provide ``id``,
            ``label``, ``table_name`` and ``table_columns``.
    """
    max_tries = 10

    for i, table in chunk.iterrows():
        json_file = 'test_completion_k_matches_gpt_sc0/output' + str(i) + '.json' # change this for each run

        # convert string to seed offset
        seed_offset = sum(ord(c) for c in json_file)

        # Several workers start at the same time; exist_ok avoids the
        # check-then-create race that made all but one of them fail.
        os.makedirs(os.path.dirname(json_file), exist_ok=True)

        # resume support: this row was answered in a previous run
        if os.path.isfile(json_file):
            continue

        column_id = table['id']
        print(column_id)
        candidates = [m['mappings'] for m in rerank if m['id'] == column_id]
        if not candidates:
            # An IndexError here would abort every remaining row of the chunk.
            print('No rerank entry for', column_id, '- skipping')
            continue
        top_k = [m['id'] for m in candidates[-1]][:30]

        document = glossary[glossary['id'].isin(top_k)]
        print(len(document['id'].tolist()), document['id_no_prefix'].tolist())

        # random shuffle the document for 'lost in the middle' problem;
        # seeded like the model call so a rerun uses the same order
        document = document.sample(frac=1, random_state=seed_offset).reset_index(drop=True)

        print('Table: ', i, table['label'])

        for retry in range(max_tries):
            try:
                messages = [
                    {"role": "user", "content": prompt.format(table_name=table['table_name'],
                                                            label=table['label'],
                                                            descriptions=print_descriptions(print_glossary(document).split('\n')),
                                                            num=len(document['desc']),
                                                            col_names=table['table_columns'])}
                ]
                # a different seed per attempt, so a retry can yield a different answer
                m2 = message_gpt(messages, temperature=0.7, seed=retry+seed_offset)
                print(m2)

                # Serialise before opening the file: if parsing the answer
                # fails, no empty output file is left behind to be mistaken
                # for a finished row on the next run.
                payload = json.dumps({
                    'table': table.to_json(),
                    'document': document['id_no_prefix'].tolist(),
                    'analysis_k_matches': m2,
                    'k_matches': document.iloc[extract_identifiers(m2)[-5:]]['id_no_prefix'].tolist(),
                    '1_match': []
                }, indent=4)

                # write output to json file
                with open(json_file, 'w') as f:
                    f.write(payload)

                # success: stop retrying
                break
            except Exception as e:
                # best effort: report the failure and try again
                print(e)

            if retry + 1 < max_tries:
                print('Retrying: ', json_file)
                time.sleep(2)
        else:
            print('Giving up: ', json_file)

n_clients = 10

# Split the rows round-robin over n_clients background workers.
# The futures are kept because they are the only handle for waiting on the
# workers or seeing an exception raised inside one of them. In a notebook,
# `await asyncio.gather(*futures)` in a later cell blocks until all are done.
futures = [fn(metadata[metadata.index % n_clients == i]) for i in range(n_clients)]

## Merge

In [None]:
# self consistency join
import glob
import os

folders = [] # add folders here
output_folder = 'test_completion_k_matches'


def tally_votes(entries, ids, with_rrf=False):
    """Add one sample's ranked ``ids`` to the aggregated vote list ``entries``.

    Args:
        entries: List of ``{'id', 'count'}`` dicts (plus ``'rrf'`` when
            ``with_rrf`` is true); updated in place.
        ids: Ids chosen by one sample, best first.
        with_rrf: Also accumulate the rank weight ``1/(rank+0.01)``.

    Returns:
        ``entries``, for convenience.
    """
    for rank, match_id in enumerate(ids):
        entry = next((e for e in entries if e['id'] == match_id), None)
        if entry is None:
            entry = {'id': match_id, 'count': 0}
            if with_rrf:
                entry['rrf'] = 0
            entries.append(entry)
        entry['count'] += 1
        if with_rrf:
            entry['rrf'] += 1/(rank+0.01)
    return entries


# if directory exists, delete its contents so every run starts from scratch
if os.path.exists(output_folder):
    for file in glob.glob(output_folder + '/*.json'):
        os.remove(file)
else:
    os.makedirs(output_folder)

for folder in folders:
    for file in glob.glob(folder + '/*.json'):
        with open(file, 'r') as f:
            try:
                data = json.load(f)
            except ValueError:
                # unreadable or truncated sample: report it and move on
                print(file)
                continue

        merged_file = output_folder + '/' + os.path.basename(file)

        if os.path.isfile(merged_file):
            # a sample for this output already exists: add to its counts
            with open(merged_file, 'r') as f:
                merged = json.load(f)
            merged['n_samples'] = merged['n_samples'] + 1

            # drop
            merged.pop('analysis_k_matches', None)
            merged.pop('analysis_1_match', None)
        else:
            # First sample for this output. Both vote lists start empty and
            # are filled by tally_votes below, so '1_match' has the same
            # [{id, count}] shape that later samples expect.
            merged = dict(data)
            merged['n_samples'] = 1
            merged['k-matches'] = []
            merged['1_match'] = []

        tally_votes(merged['k-matches'], data['k_matches'], with_rrf=True)
        tally_votes(merged['1_match'], data.get('1_match', []))

        # best candidates first
        merged['k-matches'] = sorted(merged['k-matches'], key=lambda x: x['rrf'], reverse=True)

        # serialise first so a failure cannot leave a truncated file behind
        payload = json.dumps(merged, indent=4)
        with open(merged_file, 'w') as f:
            f.write(payload)

## Eval

In [None]:
import glob
import json

folder = 'test_completion_k_matches'

# sorted so the mapping file lists the entries in the same order on every run
files = sorted(glob.glob(folder + '/output*.json'))

# read all json files
data = []
for file in files:
    with open(file) as f:
        data.append(json.load(f))

# create mapping file
mapping = []
for file, d in zip(files, data):
    try:
        # parse table as json object and get id
        table = json.loads(d['table'])
        id = table['id']

        # the aggregated rrf value becomes the score
        mappings = [{'id': matches['id'], 'score': matches['rrf']} for matches in d['k-matches']]

        # sort by score
        mappings = sorted(mappings, key=lambda m: m['score'], reverse=True)

        # round to 3 decimals
        for match in mappings:
            match['score'] = round(match['score'], 3)

        mapping.append({'id': id, 'mappings': mappings})
    except Exception as e:
        # best effort: skip the malformed file but say which one it was
        print(file, e)

# id_no_prefix -> id lookup, built once (first occurrence wins) instead of
# filtering the whole glossary for every single match
unique_glossary = glossary.drop_duplicates('id_no_prefix')
full_id = dict(zip(unique_glossary['id_no_prefix'], unique_glossary['id']))

# change id_no_prefix to id; an id missing from the glossary raises KeyError
for table in mapping:
    for match in table['mappings']:
        match['id'] = str(full_id[match['id']])

# write mapping file
with open('mapping_completion.jsonl', 'w') as f:
    for m in mapping:
        f.write(json.dumps(m) + '\n')