In [2]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils import *

In [3]:
# Load the enriched glossary (JSON Lines: one record per line).
glossary = pd.read_json('../data/metadata2kg/round1/r1_glossary_enriched.jsonl', lines=True)

# Strip the DBpedia ontology namespace to obtain a short identifier.
# regex=False: the pattern is a literal URL. Under pandas < 2.0 the default
# was regex=True, where every '.' in the URL would match any character.
glossary['id_no_prefix'] = glossary['id'].str.replace(
    'http://dbpedia.org/ontology/', '', regex=False
)

# If id_no_prefix has duplicates, keep the first occurrence untouched and
# append the occurrence number to the later ones (foo, foo1, foo2, ...).
duplicate_rank = glossary.groupby('id_no_prefix').cumcount()
glossary['id_no_prefix'] = glossary['id_no_prefix'] + duplicate_rank.map(
    lambda rank: '' if rank == 0 else str(rank)
)

In [None]:
# Preview the first five rows of the processed glossary.
glossary.head(5)

In [25]:
# Write the processed glossary to CSV, leaving out the row index.
glossary.to_csv(
    path_or_buf='../data/metadata2kg/round1/r1_glossary_processed.csv',
    index=False,
)

## Vocabulary enrichment

In [5]:
# Prompt template for cleaning glossary labels; `{data}` is replaced with the
# numbered list of labels. The literal opens with exactly three quotes so the
# prompt text does not start with a stray '"' character.
glossary_prompt = """Can you clean up the following? Just fix mistakes or translate to english if necessary. Also change obscure words and you must write abbreviations and acronyms in full. Convert obscure names to their type.
### Example
Q: 
0. last win
1. charles
2. Harelbeke

A: 
0. Last win
1. King (Charles)
2. City (Harelbeke)

### Data

{data}
"""

In [6]:
# Accumulator for the cleaned glossary labels, keyed by glossary id.
glossary_desc = dict()

In [None]:
import asyncio

def background(f):
    """Decorator that runs ``f`` in the event loop's default executor.

    Calling the decorated function does not run ``f`` inline; it schedules the
    call through ``loop.run_in_executor(None, ...)`` and returns the resulting
    future, which can be awaited to obtain ``f``'s return value (or exception).

    Parameters
    ----------
    f : callable
        The function to run in the executor.

    Returns
    -------
    callable
        A wrapper with the same call signature as ``f`` that returns a future.
    """
    import functools  # local import keeps this definition self-contained

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # run_in_executor only forwards positional arguments; binding the call
        # in a partial lets keyword arguments through as well.
        bound_call = functools.partial(f, *args, **kwargs)
        return asyncio.get_event_loop().run_in_executor(None, bound_call)
    return wrapped

@background
def fn(groups):
    """Clean the labels of each glossary chunk with the LLM and store the results.

    For every chunk, the 'label' column is formatted into ``glossary_prompt``,
    sent through ``message_gpt`` and the reply parsed with ``extract_content``.
    Each parsed entry is stored in the module-level ``glossary_desc`` dict under
    the matching value of the chunk's 'id' column.

    Parameters
    ----------
    groups : iterable
        Chunks of the glossary; each chunk is indexed by 'id' and 'label' and
        must support ``len()``.

    Notes
    -----
    Because of the ``@background`` decorator, calling ``fn(...)`` returns what
    that decorator produces rather than running this body inline.
    A chunk that is skipped (already processed or length mismatch) no longer
    stops the remaining chunks from being processed.
    """
    for chunk in groups:
        # A chunk counts as done as soon as any of its ids has a stored result.
        if not glossary_desc.keys().isdisjoint(chunk['id']):
            print('Already processed')
            continue

        messages = [
            {"role": "user", "content": glossary_prompt.format(data=print_descriptions(chunk['label'].tolist()))},
        ]

        print(messages[0]['content'])
        m = message_gpt(messages, temperature=0.0, seed=47)

        cleaned = extract_content(m)

        # Without one reply entry per row the id/entry pairing below would be
        # misaligned, so drop this chunk and move on to the next one.
        if len(cleaned) != len(chunk):
            print('Error: Length mismatch')
            continue

        for entry_id, entry in zip(chunk['id'], cleaned):
            glossary_desc[entry_id] = entry

n_clients = 20

# split the glossary into consecutive chunks of 100 rows (by row position)
grouped = glossary.groupby([i//100 for i in range(len(glossary))])

# Deal the chunks out round-robin: worker i gets chunks i, i + n_clients, ...
# The handles returned by fn are kept so completion and worker errors can be
# checked, e.g. `await asyncio.gather(*glossary_futures)` in a notebook cell,
# before glossary_desc is saved.
glossary_futures = []
for i in range(n_clients):
    worker_chunks = [grouped.get_group(j) for j in range(i, len(grouped), n_clients)]
    glossary_futures.append(fn(worker_chunks))

In [None]:
import json

# Serialise the cleaned glossary descriptions to a JSON file.
with open('glossary_descriptions_cleaned.json', 'w') as out_file:
    out_file.write(json.dumps(glossary_desc))

## Metadata enrichment

In [11]:
# Load the test metadata (JSON Lines: one record per line).
metadata = pd.read_json(
    path_or_buf='../data/metadata2kg/round1/r1_test_metadata.jsonl',
    lines=True,
)

In [12]:
# Prompt template for cleaning "label, table name" pairs; `{data}` is replaced
# with the numbered list of pairs. The example answer is numbered 0-2 so that
# it lines up with the example question.
metadata_prompt = """Can you clean up the following? Fix mistakes and put everything in english if not. Write abbreviations or acronyms in full and add the abbreviation in brackets after the word. Convert names to their type. Add a relation e.g. is a, of a or other. You must understand the meaning of the word first to write the relation.

### Example
Q: 
0. year, film
1. developer, videogame
2. harelbeke, country

A: 
0. Year of a film
1. Developer of a videogame
2. City (Harelbeke) of a country

### Data

{data}
"""

In [9]:
# Mapping of metadata id to its cleaned description; starts out empty.
metadata_desc = dict()

In [16]:
# One "label, table name" entry per metadata row (table name lower-cased).
# A plain list is passed to print_descriptions, matching the glossary cell,
# and it is built without a row-wise DataFrame.apply.
metadata_entries = [
    '{label}, {table_name}'.format(label=label, table_name=table_name.lower())
    for label, table_name in zip(metadata['label'], metadata['table_name'])
]

messages = [
    {"role": "user", "content": metadata_prompt.format(data=print_descriptions(metadata_entries))}
]

# Single request covering every metadata row; fixed temperature and seed.
m = message_gpt(messages, temperature=0.0, seed=47)

In [16]:
import json

# Attach the parsed LLM reply to the metadata rows as a new column.
cleaned_descriptions = extract_content(m)
metadata['emb_desc'] = cleaned_descriptions

# Build the id -> cleaned description mapping and write it out as JSON.
metadata_desc = metadata.set_index('id')['emb_desc'].to_dict()
with open('metadata_descriptions_cleaned.json', 'w') as out_file:
    out_file.write(json.dumps(metadata_desc))