# In [None]:
# notebooks/02_chunking_embedding.ipynb

import pandas as pd
import sys
import os

# Add the src directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))

from vector_store_utils import load_embedding_model, create_and_index_faiss, load_faiss_index, semantic_search_faiss
from data_preparation import load_complaint_data # To load the filtered data


# --- Task 2: Text Chunking, Embedding, and Vector Store Indexing ---
#
# Pipeline driven by this cell:
#   1. load the filtered complaints CSV,
#   2. load the sentence-embedding model,
#   3. chunk + embed + index the complaints via `create_and_index_faiss`,
#   4. run one sample query as a smoke test of the resulting index.
# Each stage reports a message and stops the pipeline if it cannot proceed.

# Input/output locations; both are relative to the current working directory
# (expected to be the notebooks/ folder, matching the sys.path setup above).
FILTERED_DATA_PATH = '../data/filtered_complaints.csv'
VECTOR_STORE_DIR = '../vector_store'

# Read the cleaned, filtered complaints written by the preprocessing notebook.
print(f"Loading filtered data from {FILTERED_DATA_PATH}...")
df_cleaned = load_complaint_data(FILTERED_DATA_PATH)

if not df_cleaned.empty:
    print(f"Loaded {len(df_cleaned)} filtered complaints.")

    # Chunking: long narratives embed poorly as a single vector, so the text
    # is split before embedding. The splitting itself is delegated to
    # `create_and_index_faiss`.
    # NOTE(review): the original notes say it uses RecursiveCharacterTextSplitter
    # with chunk_size=500 / chunk_overlap=50 — confirm in vector_store_utils.

    # Embedding model: all-MiniLM-L6-v2 is used as the starting point.
    EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
    model = load_embedding_model(EMBEDDING_MODEL_NAME)

    if model is not None:
        # Embed every chunk and store the vectors, together with their
        # metadata, in a FAISS index under VECTOR_STORE_DIR.
        faiss_index, chunk_metadata = create_and_index_faiss(
            df_cleaned, model, VECTOR_STORE_DIR
        )

        if faiss_index is None or not chunk_metadata:
            # Either no index came back or there is no chunk metadata.
            print("FAISS index creation failed. Check previous logs.")
        else:
            print("\nFAISS Indexing Complete!")
            print(f"Total chunks indexed: {len(chunk_metadata)}")
            print(f"Dimension of embeddings: {faiss_index.d}")

            # --- Optional: smoke-test the search functionality ---
            print("\n--- Testing Semantic Search (Top 3 relevant chunks) ---")
            test_query = "problems with credit report accuracy"
            retrieved_results = semantic_search_faiss(
                test_query, model, faiss_index, chunk_metadata, k=3
            )

            if not retrieved_results:
                print("No results retrieved for the test query.")
            else:
                print(f"Query: '{test_query}'")
                # Ranks are 1-based purely for display.
                for rank, hit in enumerate(retrieved_results, start=1):
                    print(f"\n--- Result {rank} (Distance: {hit['distance']:.4f}) ---")
                    print(f"Complaint ID: {hit['original_complaint_id']}")
                    print(f"Product: {hit['product']}")
                    # Only the first 200 characters are shown to keep output short.
                    preview = hit['chunk_text'][:200]
                    print(f"Chunk Text: {preview}...")

            print("\nFAISS index and metadata successfully saved and tested.")
    else:
        print("Failed to load embedding model. Exiting.")
else:
    print("Filtered data not found. Please run '01_eda_preprocessing.ipynb' first.")