In [3]:
import torch
from transformers import BertTokenizer, BertModel
import os
import json
from tqdm import tqdm
# Input and output paths; the entry point at the bottom of the file passes
# both to add_embeddings_to_labels.
output_file = "output.json"  # Existing JSON file that is read as input
embedded_output_file = "output_with_embeddings.json"  # Where the JSON with embeddings is written

# Load the pre-trained BERT tokenizer and model once at module level so that
# compute_bert_embedding can reuse the same objects on every call.
print("Loading BERT model...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Function to compute BERT embeddings
def compute_bert_embedding(text):
    """
    Compute a mean-pooled BERT embedding for a given text.

    The text is tokenized (truncated to at most 512 tokens) and run through
    the module-level ``model`` without gradient tracking. The token vectors
    of the last hidden layer are then averaged, weighting each position by
    the tokenizer's attention mask so that padding positions do not dilute
    the result when several texts of different lengths are passed at once.
    For a single text there is no padding, so this equals a plain mean over
    all tokens.

    Args:
        text: The string to embed. A list of strings is also accepted by the
            tokenizer; in that case one embedding per string is returned.

    Returns:
        The pooled embedding converted to plain Python floats: a flat list
        for a single text, or a list of lists for a batch of several texts
        (size-1 dimensions are squeezed away).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension; 1 = real token, 0 = padding.
        mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_state.dtype)
        token_sum = (last_hidden_state * mask).sum(dim=1)
        # clamp guards against division by zero for a fully-masked row.
        token_count = mask.sum(dim=1).clamp(min=1)
        embedding = (token_sum / token_count).squeeze().tolist()  # Masked mean pooling
    return embedding

# Function to add embeddings to positive and negative labels
def add_embeddings_to_labels(output_file, embedded_output_file, *, limit=10):
    """
    Add BERT embeddings to positive and negative labels in the JSON file.

    Reads ``output_file``, looks up the lists stored under the keys
    ``"positive labels"`` and ``"negative labels"`` (a missing key is treated
    as an empty list), and for each processed entry that has no truthy
    ``"embedding"`` value stores the embedding of its ``"description"`` text.
    The complete, updated document is then written to
    ``embedded_output_file``; the input file itself is not modified.

    Args:
        output_file: Path of the existing JSON file to read.
        embedded_output_file: Path the updated JSON is written to.
        limit: Maximum number of entries to process from the start of each
            label list. Defaults to 10, which matches the previously
            hard-coded behaviour; pass ``None`` to process every entry.

    Raises:
        ValueError: If ``limit`` is negative.
        FileNotFoundError: If ``output_file`` does not exist.
    """
    # A negative slice bound would silently drop entries from the end instead
    # of limiting the count, so reject it explicitly.
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be non-negative or None, got {limit}")

    # Load existing data
    if not os.path.exists(output_file):
        raise FileNotFoundError(f"Output file not found: {output_file}")

    with open(output_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Both label lists are handled identically; only the key and the
    # progress message differ.
    for polarity in ("positive", "negative"):
        print(f"Embedding {polarity} labels...")
        # Slicing with None as the bound keeps the whole list.
        for entry in tqdm(data.get(f"{polarity} labels", [])[:limit]):
            if not entry.get("embedding"):  # Avoid reprocessing
                entry["embedding"] = compute_bert_embedding(entry["description"])

    # Save updated data to a new file
    with open(embedded_output_file, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)

    print(f"Embeddings added and saved to: {embedded_output_file}")

# Entry point: when this file is executed as the main module, run the
# embedding pass with the module-level input and output paths.
if __name__ == "__main__":
    add_embeddings_to_labels(output_file, embedded_output_file)

Loading BERT model...
Embedding positive labels...


100%|██████████| 10/10 [00:00<00:00, 21.02it/s]


Embedding negative labels...


100%|██████████| 10/10 [00:00<00:00, 23.90it/s]


Embeddings added and saved to: output_with_embeddings.json
