In [1]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils import *

In [2]:
# Glossary table, read from the processed round-2 CSV.
glossary = pd.read_csv('../data/metadata2kg/round2/r2_glossary_processed.csv')

# Test metadata, stored as JSON Lines (one JSON object per line).
metadata = pd.read_json('../data/metadata2kg/round2/r2_test_metadata.jsonl', lines=True)

# Prefix every id with its row index ("<index>_<id>") so the ids become unique.
row_prefix = metadata.index.astype(str)
metadata['id'] = row_prefix + '_' + metadata['id']

In [3]:
# Load the cleaned descriptions for the metadata rows (not the glossary) from JSON.
with open('metadata_descriptions_cleaned.json', 'r') as desc_file:
    metadata_desc = json.loads(desc_file.read())

# Look up each (index-prefixed) metadata id in the loaded mapping; ids that are
# missing from the mapping end up as NaN in 'emb_desc'.
metadata['emb_desc'] = metadata['id'].map(metadata_desc)

## Embeddings

In [None]:
from openai import OpenAI
from constants import API_KEY_OPENAI, API_KEY_DEEPINFRA

def _embed_texts(client, texts, model='text-embedding-3-large', batch_size=512):
    """Embed ``texts`` in batches through ``client.embeddings.create``.

    Args:
        client: object exposing ``embeddings.create(input=..., model=...)``.
        texts: list of input texts.
        model: embedding model name passed to the API.
        batch_size: number of texts sent per request.

    Returns:
        ``np.ndarray`` with one row per input text, in input order. Empty
        input yields an empty array instead of raising.
    """
    vectors = []
    for start in range(0, len(texts), batch_size):
        print(start)  # progress: offset of the batch being sent
        response = client.embeddings.create(input=texts[start:start + batch_size], model=model)
        # Each returned item carries its position within the request; order by it
        # so rows are guaranteed to line up with the inputs.
        batch = sorted(response.data, key=lambda item: item.index)
        vectors.extend(item.embedding for item in batch)
    return np.asarray(vectors)


client = OpenAI(api_key=API_KEY_OPENAI)

# Glossary side: embed the 'desc' column.
glossary_embeddings_openai = _embed_texts(client, glossary['desc'].values.tolist())

# Metadata side: embed the 'emb_desc' column.
metadata_embeddings_openai = _embed_texts(client, metadata['emb_desc'].values.tolist())

In [5]:
# Store each embedding array on its DataFrame as plain Python lists
# (one entry per row) under the 'embeddings_openai' column.
for frame, vectors in (
    (glossary, glossary_embeddings_openai),
    (metadata, metadata_embeddings_openai),
):
    frame['embeddings_openai'] = vectors.tolist()

In [6]:
# Rebuild the embedding matrices from the DataFrame columns.
# g: (len(glossary), dim); the width is taken from the data rather than hard-coded.
g = np.asarray(glossary['embeddings_openai'].tolist(), dtype=float)
meta_vecs = np.asarray(metadata['embeddings_openai'].tolist(), dtype=float).reshape(len(metadata), g.shape[1])

# sim_score[i, j] is the dot product of metadata row i with glossary row j,
# computed in one matrix product instead of a Python loop over rows.
# NOTE: a dot product equals cosine similarity only for unit-length vectors;
# OpenAI documents its embeddings as normalized to length 1 -- re-check this
# if the embedding model is ever swapped.
sim_score = meta_vecs @ g.T

In [7]:
# rank_openai: metadata id -> glossary ids ordered from most to least similar.
rank_openai = {}

# Loop-invariant lookup of glossary ids by row position. (The previous version
# tiled this array len(glossary) times per row, although only positions
# < len(glossary) are ever indexed.)
glossary_ids = glossary['id'].values

for row_pos, meta_id in enumerate(metadata['id']):
    # argsort is ascending; reverse it for descending similarity.
    order = np.argsort(sim_score[row_pos, :])[::-1]
    # dict.fromkeys drops repeated glossary ids while keeping first-seen order.
    rank_openai[meta_id] = list(dict.fromkeys(glossary_ids[order]))

## Rerank

In [8]:
# Prompt template for the reranker; filled with `column`, `table_name` and
# `table_columns` via str.format.
query = (
    "What is the best description of column {column} "
    "in the table '{table_name}' with columns {table_columns}?"
)

In [9]:
# Reranked results, keyed by metadata id. Re-running this cell resets the cache.
rerank_openai = dict()

In [None]:
import asyncio
import os
from rank_gpt import sliding_windows

def background(f):
    """Decorator that runs ``f`` in the event loop's default executor.

    Calling the decorated function does not run ``f`` inline; it schedules the
    call via ``loop.run_in_executor(None, ...)`` and returns the resulting
    future, which can be awaited (or gathered) to get ``f``'s return value or
    its exception.
    """
    import functools

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # run_in_executor() only forwards positional arguments, so bind both
        # positional and keyword arguments up front with functools.partial.
        call = functools.partial(f, *args, **kwargs)
        return asyncio.get_event_loop().run_in_executor(None, call)
    return wrapped

@background
def fn(chunk):
    """Rerank the first-stage candidates for every row of ``chunk``.

    For each row whose id is not yet in ``rerank_openai``, the first 150 ids
    from ``rank_openai`` are turned into hits, passed to ``sliding_windows``
    together with the formatted ``query``, and the resulting id order is
    stored in ``rerank_openai``. Because of ``@background`` the call is
    scheduled on an executor and a future is returned.

    Args:
        chunk: DataFrame slice of ``metadata``; rows must provide 'id',
            'label', 'table_columns' and 'table_name'.
    """
    for _, record in chunk.iterrows():
        record_id = record['id']
        # Only rows without a stored result are processed, so interrupted runs can resume.
        if record_id not in rerank_openai:
            print(record_id)
            candidates = rank_openai[record_id][:150]

            prompt = query.format(column=record['label'],
                                  table_columns=record['table_columns'],
                                  table_name=record['table_name'])

            # NOTE(review): `candidates` holds glossary 'id' values but is used
            # positionally through iloc -- this is only right if glossary ids
            # equal row positions; confirm.
            hits = []
            for _, entry in glossary.iloc[candidates].iterrows():
                hits.append({'id': entry['id'], 'content': entry['label'] + ", " + entry['desc']})

            reranked = sliding_windows(
                {'query': prompt, 'hits': hits},
                rank_start=0,
                rank_end=len(candidates),
                window_size=20,
                step=10,
                model_name="meta-llama/Meta-Llama-3-70B-Instruct",
                api_key=API_KEY_DEEPINFRA,
            )
            rerank_openai[record_id] = [hit['id'] for hit in reranked['hits']]

n_clients = 20

# Split the metadata rows into n_clients interleaved chunks (index modulo
# n_clients) and schedule one background worker per chunk.
# The returned futures are kept rather than discarded, so completion and
# worker exceptions can be observed; e.g. run
# `await asyncio.gather(*rerank_futures)` in a notebook cell before the Eval
# step, otherwise rows still being processed are missing from `rerank_openai`.
rerank_futures = [
    fn(metadata[metadata.index % n_clients == i])
    for i in range(n_clients)
]

## Eval

In [None]:
import glob

# Build one output record per metadata row that has a reranked result.
mapping = []
for record in metadata.to_dict('records'):
    # Named `prefixed_id` rather than `id` so the builtin id() is not rebound
    # at notebook scope.
    prefixed_id = record['id']

    # Rows without an entry in rerank_openai are left out of the file.
    if prefixed_id not in rerank_openai:
        continue

    # Every ranked glossary id gets the same constant score; the order of the
    # list carries the ranking.
    mappings = [{'id': str(int(top)), 'score': 1.0} for top in rerank_openai[prefixed_id]]

    # Strip the leading "<row index>_" that was prepended to make ids unique.
    mapping.append({'id': prefixed_id.partition('_')[2], 'mappings': mappings})

for table in mapping:
    print(table)

# Write the mapping file as JSON Lines: one record per line.
with open('mapping_rerank.jsonl', 'w') as f:
    for m in mapping:
        f.write(json.dumps(m) + '\n')