# Generating Embeddings for RAG

In [None]:
import pandas as pd
import json

# Load Processed Data
# The CSV provides the text rows; the JSON provides the metadata entries.
combined_data = pd.read_csv('/content/drive/MyDrive/GenAI-CSA/data/combined/combined_data.csv')
# Decode the JSON explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open('/content/drive/MyDrive/GenAI-CSA/data/metadata/metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Report the sizes of both inputs so a mismatch is visible right away.
print(f"Loaded combined_data.csv with {combined_data.shape[0]} rows.")
print(f"Loaded metadata.json with {len(metadata)} entries.")

In [None]:
# Integrating Metadata with Text Data
# Texts and metadata entries are paired purely by position, so a length
# mismatch would make zip() silently drop the surplus items. Fail loudly
# instead of building a truncated document list.
summaries = combined_data['Data_Summary']
if len(summaries) != len(metadata):
    raise ValueError(
        f"Cannot pair texts with metadata: {len(summaries)} Data_Summary rows "
        f"but {len(metadata)} metadata entries."
    )

# The loop variable is called `meta` so it does not reuse the name of the
# `metadata` collection it is drawn from.
documents = [
    {"page_content": content, "metadata": meta}
    for content, meta in zip(summaries, metadata)
]

print(f"Reconstructed {len(documents)} documents for embedding generation.")

In [None]:
from sentence_transformers import SentenceTransformer
import pickle
import time

# Loading Embedding Generation Model
# The model identifier is named once; the timer brackets only the
# SentenceTransformer construction so the printed figure is the load cost.
model_name = 'paraphrase-mpnet-base-v2'
start_time = time.time()
model = SentenceTransformer(model_name)

print(f"Model loaded in {time.time() - start_time:.2f} seconds.")

In [None]:
# Preparing Textual Data for Embeddings
# Keep only the text of each document, in the same order as `documents`.
texts = [document['page_content'] for document in documents]

# Generate Embeddings
# Keyword options passed to model.encode, gathered in one place.
encode_options = {
    'show_progress_bar': True,
    'batch_size': 64,
    'convert_to_numpy': True,
    'normalize_embeddings': True,
}
start_time = time.time()
embeddings = model.encode(texts, **encode_options)

print(f"Embeddings generated in {time.time() - start_time:.2f} seconds.")