In [22]:
%pip install -U sentence-transformers



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json
from sentence_transformers import SentenceTransformer, util


def extract_info_from_table(file_path):
    ids = []
    labels = []
    table_ids = []
    table_names = []
    table_columns = []

    with open(file_path, 'r') as file:
        for line in file:
            data = json.loads(line.strip())
            ids.append(data['id'])
            labels.append(data['label'])
            table_ids.append(data['table_id'])
            table_names.append(data['table_name'])
            table_columns.append(data['table_columns'])

    return ids, labels, table_ids, table_names, table_columns


def extract_info_from_glossary(file_path):
    ids = []
    labels = []
    descriptions = []

    with open(file_path, 'r') as file:
        for line in file:
            data = json.loads(line.strip())
            ids.append(data['id'])
            labels.append(data['label'])
            descriptions.append(data['desc'])

    return ids, labels, descriptions


def similarity(table_columns:list, glossary_labels:list, model_name = "all-MiniLM-L6-v2"):
    """
    cosine similarity between two lists
    """
    model = SentenceTransformer(model_name)
    embedding_table = model.encode(table_columns)
    embedding_glossary = model.encode(glossary_labels)
    cos_sim = util.cos_sim(embedding_table,embedding_glossary)
    return cos_sim


  from tqdm.autonotebook import tqdm, trange


In [11]:
# Loading Table Metadata
file_path = '/content/drive/MyDrive/sem-tab-2024/round2/r2_sample_metadata.jsonl'
column_ids, column_labels, table_ids, table_names, table_columns = extract_info_from_table(file_path)

# print("IDs:", column_ids)
# print("Labels:", column_labels)
# print("Table IDs:", table_ids)
# print("Table Names:", table_names)
# print("Table Columns:", table_columns)

# Loading Glossary information
glossary_path = '/content/drive/MyDrive/sem-tab-2024/round2/r2_glossary.jsonl'
glossary_ids, glossary_labels, glossary_descriptions = extract_info_from_glossary(glossary_path)
# print(glossary_ids[0], glossary_labels[0], glossary_descriptions[0])

IDs: ['nys-traffic-tickets-issued-four-year-window##Violation Charged Code', 'nys-traffic-tickets-issued-four-year-window##Violation Description', 'nys-traffic-tickets-issued-four-year-window##Violation Year', 'nys-traffic-tickets-issued-four-year-window##Violation Month', 'nys-traffic-tickets-issued-four-year-window##Violation Day of Week', 'nys-traffic-tickets-issued-four-year-window##Age at Violation', 'nys-traffic-tickets-issued-four-year-window##Gender', 'nys-traffic-tickets-issued-four-year-window##State of License', 'nys-traffic-tickets-issued-four-year-window##Police Agency', 'nys-traffic-tickets-issued-four-year-window##Court', 'nys-traffic-tickets-issued-four-year-window##Source']
Labels: ['Violation Charged Code', 'Violation Description', 'Violation Year', 'Violation Month', 'Violation Day of Week', 'Age at Violation', 'Gender', 'State of License', 'Police Agency', 'Court', 'Source']
Table IDs: ['nys-traffic-tickets-issued-four-year-window', 'nys-traffic-tickets-issued-four-

In [5]:
glossary_label_desc = [(label + " " + desc) for label in glossary_labels for desc in glossary_descriptions]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
model = SentenceTransformer("all-MiniLM-L6-v2")
glossary_label_desc = [(label + " " + desc) for label in glossary_labels for desc in glossary_descriptions]
embeddings_glossary_label_desc = model.encode(glossary_label_desc)
# embeddings_column_labels = model.encode(column_labels)



In [21]:
output_ls = []
for i in range(len(column_labels)):
  column_label = column_labels[i]
  embedding_column_label = model.encode(column_label)
  cos_sim = util.cos_sim(embedding_column_label, embeddings_glossary_label_desc)

  # Add all pairs to a list with their cosine similarity score
  all_sentence_combinations = []
  for glossary_labels_idx in range(len(glossary_labels)):
      all_sentence_combinations.append([cos_sim[0][glossary_labels_idx],glossary_labels_idx])

  # Sort list by the highest cosine similarity score
  all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

  # print("\nTop-10 most similar pairs:")
  cos_ls = []
  for score, glossary_labels_idx in all_sentence_combinations[0:10]:
    glossary_label = glossary_labels[glossary_labels_idx]
    glossary_id = glossary_ids[glossary_labels_idx]
    cos_score = cos_sim[0][glossary_labels_idx].item()
    cos_dict = {
        "id": glossary_id,
        "score": cos_score
    }
    cos_ls.append(cos_dict)
    # print("column: {} \t glossary label: {} \t cos: {:.4f} id:{}".format(column_ids[i], glossary_label, cos_score , glossary_id))

  output_dict = {
      "id": column_ids[i],
      "mappings": cos_ls
  }
  output_ls.append(output_dict)
  # print(output_dict)


with open("/content/drive/MyDrive/sem-tab-2024/round2/output.jsonl", 'w') as file:
  for dict in output_ls:
    json_str = json.dumps(dict)
    file.write(json_str + '\n')

In [17]:
import os

# Change the current working directory
new_directory = '/content/drive/MyDrive/sem-tab-2024/round2'
os.chdir(new_directory)

In [20]:
!python evaluate.py -m output.jsonl -g r2_sample_metadata_GT.jsonl

Hit@1: 0.45
Hit@5: 0.73
