In [None]:
import os
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Filesystem layout; every path is resolved against the current working directory.
PROJECT_ROOT = os.getcwd()

# Input: JSON file of chunks with their embeddings, produced by embedder.ipynb.
INPUT_FILE_ALL_EMBEDDINGS = os.path.join(
    PROJECT_ROOT,
    "embeddings",
    "all_chunks_with_embeddings.json",
)

# Output: directory and file used to persist the FAISS index (persisting is optional).
OUTPUT_DIR_VECTOR_STORE = os.path.join(PROJECT_ROOT, "vector_store")
FAISS_INDEX_FILE = os.path.join(OUTPUT_DIR_VECTOR_STORE, "faiss_index.bin")

# Query-side embedding model; keep it identical to the one used in embedder.ipynb.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L12-v2'
embedding_model = None  # Filled in on first use by load_embedding_model_for_query()

def create_output_directory():
    """Make sure the vector store output directory exists, then report it.

    The directory (and any missing parents) is created when absent; an
    already existing directory is left untouched.
    """
    os.makedirs(
        OUTPUT_DIR_VECTOR_STORE,
        exist_ok=True,  # an existing directory is not an error
    )
    print(f"Ensured output directory '{OUTPUT_DIR_VECTOR_STORE}' exists.")

def load_embedding_model_for_query():
    """Return the shared query embedding model, loading it on first use.

    The model is cached in the module-level ``embedding_model`` so repeated
    calls do not reload it.

    Returns:
        The loaded SentenceTransformer instance, or None if loading failed
        (the failure is reported on stdout rather than raised).
    """
    global embedding_model

    # Fast path: a previous call already loaded the model.
    if embedding_model is not None:
        return embedding_model

    print(f"Loading embedding model for queries: {EMBEDDING_MODEL_NAME}...")
    try:
        embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
        print("Query embedding model loaded successfully.")
    except Exception as e:
        print(f"Error loading query embedding model {EMBEDDING_MODEL_NAME}: {e}")
        print("Please ensure you have an active internet connection or the model is cached locally.")
        # Leave the cache empty so a later call can retry the load.
        embedding_model = None

    return embedding_model

class VectorStore:
    """
    A simple in-memory vector store using FAISS for efficient similarity search.

    Attributes:
        index: The FAISS index, or None until one has been built or loaded.
        chunk_data (list): Chunk records kept position-aligned with the index,
            i.e. row ``i`` of the index is the embedding of ``chunk_data[i]``.
        dimension (int): Dimension of the indexed vectors (0 until an index
            has been built or loaded).
    """
    def __init__(self):
        self.index = None
        self.chunk_data = []  # Chunk records (text, metadata), aligned with index rows
        self.dimension = 0

    def load_data_and_build_index(self, json_filepath):
        """
        Loads chunk data with embeddings from a JSON file and builds the FAISS index.

        Only chunks that carry a non-null "embedding" are indexed, and only
        those chunks are kept in ``self.chunk_data`` so that index row numbers
        always map to the right chunk. The store's state is replaced only when
        the whole build succeeds; on any failure the previous state is kept.

        Args:
            json_filepath (str): Path to a JSON file holding a list of chunk
                dictionaries, each with an "embedding" list of numbers.

        Returns:
            bool: True if the index was built, False otherwise (the reason is
                  printed).
        """
        if not os.path.exists(json_filepath):
            print(f"Error: Input file '{json_filepath}' not found. Please ensure embedder.main() has run successfully.")
            return False

        print(f"Loading chunks and embeddings from '{json_filepath}'...")
        try:
            with open(json_filepath, 'r', encoding='utf-8') as f:
                loaded_chunks = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Failed to load JSON from '{json_filepath}': {e}")
            return False
        except Exception as e:
            print(f"An error occurred while reading '{json_filepath}': {e}")
            return False

        if not loaded_chunks:
            print("No chunk data found to build the index.")
            return False

        if not isinstance(loaded_chunks, list):
            print(f"Error: Expected a JSON list of chunks in '{json_filepath}', got {type(loaded_chunks).__name__}.")
            return False

        # Collect chunks and their vectors together so both lists stay aligned:
        # vectors[i] is always the embedding of indexed_chunks[i].
        indexed_chunks = []
        vectors = []
        try:
            for item in loaded_chunks:
                embedding = item.get("embedding") if isinstance(item, dict) else None
                if embedding is None:
                    continue
                vectors.append(np.asarray(embedding, dtype=np.float32))
                indexed_chunks.append(item)
        except (TypeError, ValueError) as e:
            print(f"Error: Could not convert an embedding to a float vector: {e}")
            return False

        if not vectors:
            print("No embeddings found in the data. Cannot build FAISS index.")
            return False

        skipped = len(loaded_chunks) - len(indexed_chunks)
        if skipped:
            print(f"Skipping {skipped} chunk(s) without an embedding.")

        # All embeddings must be non-empty 1-D vectors of the same dimension
        expected_shape = vectors[0].shape
        if (
            len(expected_shape) != 1
            or expected_shape[0] == 0
            or any(vector.shape != expected_shape for vector in vectors)
        ):
            print("Error: Embeddings must be non-empty 1-D vectors that all share the same dimension. Cannot build FAISS index.")
            return False

        dimension = expected_shape[0]
        embeddings_matrix = np.stack(vectors)  # shape: (num_vectors, dimension)

        print(f"Building FAISS index with {len(embeddings_matrix)} vectors of dimension {dimension}...")

        # Using IndexFlatL2 for a simple exact L2 (Euclidean distance) index
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings_matrix)

        # Commit the new state only now that every step has succeeded.
        self.index = index
        self.chunk_data = indexed_chunks
        self.dimension = dimension

        print("FAISS index built successfully.")
        return True

    def search(self, query_text, k=5):
        """
        Performs a similarity search in the vector store for a given query.

        Args:
            query_text (str): The text query to search for.
            k (int): The number of top relevant chunks to retrieve.

        Returns:
            list: A list of dictionaries, where each dictionary contains
                  the original chunk information under "chunk" and the
                  distance reported by the index under "similarity_score".
                  For the IndexFlatL2 built by this class that is a squared
                  L2 distance, so LOWER values mean MORE similar. An empty
                  list is returned when the search cannot be performed.
        """
        if self.index is None:
            print("Vector store index not built. Please load data first.")
            return []
        if k <= 0:
            print(f"Error: k must be a positive integer (got {k}).")
            return []
        model = load_embedding_model_for_query()
        if model is None:
            print("Query embedding model not loaded. Cannot perform search.")
            return []

        print(f"Generating embedding for query: '{query_text}'...")
        query_embedding = model.encode(query_text, convert_to_numpy=True).astype(np.float32)

        # Ensure query embedding has the correct dimension
        if query_embedding.shape[0] != self.dimension:
            print(f"Error: Query embedding dimension ({query_embedding.shape[0]}) does not match index dimension ({self.dimension}).")
            return []

        # Reshape for FAISS: needs to be 2D array (1, dimension)
        query_embedding = query_embedding.reshape(1, -1)

        print(f"Searching for top {k} similar chunks...")
        # distances: lower is more similar for L2; indices: rows of the top-k vectors
        distances, indices = self.index.search(query_embedding, k)

        results = []
        for distance, idx in zip(distances[0], indices[0]):
            if idx == -1:  # FAISS returns -1 if not enough results
                continue
            if idx >= len(self.chunk_data):
                # Can happen when an index was loaded from disk without
                # matching chunk data being present in memory.
                print(f"Warning: No chunk data available for index position {idx}; skipping.")
                continue
            results.append({
                "chunk": self.chunk_data[int(idx)],
                "similarity_score": float(distance)  # Convert numpy float to Python float for JSON serialization
            })
        print(f"Found {len(results)} relevant chunks.")
        return results

    def save_index(self, filepath=FAISS_INDEX_FILE):
        """
        Saves the FAISS index to disk for persistence.

        Only the index is written; ``chunk_data`` is not persisted.

        Args:
            filepath (str): Destination file. Its parent directory is created
                if it does not exist yet.
        """
        if self.index is None:
            print("No index to save.")
            return

        try:
            # Create the directory of the actual target, which may differ
            # from the default output directory when a custom path is given.
            target_dir = os.path.dirname(filepath)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            faiss.write_index(self.index, filepath)
            print(f"FAISS index saved to '{filepath}'")
        except Exception as e:
            print(f"Error saving FAISS index: {e}")

    def load_index(self, filepath=FAISS_INDEX_FILE):
        """
        Loads a FAISS index from disk.

        Only the index itself is restored. ``chunk_data`` is left as it is, so
        it must already hold the chunks the index was built from (in the same
        order) for search results to be meaningful.

        Args:
            filepath (str): Path of the index file to read.

        Returns:
            bool: True if the index was loaded, False otherwise.
        """
        if not os.path.exists(filepath):
            print(f"FAISS index file not found at '{filepath}'")
            return False

        try:
            index = faiss.read_index(filepath)
        except Exception as e:
            print(f"Error loading FAISS index from '{filepath}': {e}")
            return False

        self.index = index
        # Keep the dimension in sync, otherwise search() rejects every query.
        self.dimension = index.d
        print(f"FAISS index loaded from '{filepath}'")
        if len(self.chunk_data) != index.ntotal:
            print(f"Warning: Index holds {index.ntotal} vectors but {len(self.chunk_data)} chunks are loaded; search results may be incomplete.")
        return True

def main():
    """Demo entry point: build the store, run one sample query, save the index.

    Builds the index from INPUT_FILE_ALL_EMBEDDINGS, prints the top 3 hits for
    a sample question, and then writes the index to disk. If the index cannot
    be built, a failure message is printed and nothing else happens.
    """
    create_output_directory()
    store = VectorStore()

    # Nothing can be searched or saved without an index, so bail out early.
    if not store.load_data_and_build_index(INPUT_FILE_ALL_EMBEDDINGS):
        print("Failed to initialize vector store. Check previous errors.")
        return

    # Example usage: run a single search against the freshly built index.
    query = "What was Microsoft's total revenue in 2023?"
    hits = store.search(query, k=3)

    print("\n--- Retrieved Chunks for Sample Query ---")
    if not hits:
        print("No chunks retrieved.")
    else:
        for i, hit in enumerate(hits):
            chunk = hit['chunk']
            score = hit['similarity_score']
            print(f"\nResult {i+1} (Score: {score:.4f}):")
            print(f"  Company: {chunk.get('company', 'N/A')}, Year: {chunk.get('year', 'N/A')}")
            print(f"  Source File: {chunk.get('source_file', 'N/A')}")
            print(f"  Chunk ID: {chunk.get('id', 'N/A')}")
            print(f"  Text (first 200 chars): {chunk.get('text', '')[:200]}...")

    # Optional: persist the index so it can be reused without rebuilding.
    store.save_index()

# Run the demo only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
