# NYC Landmarks Wikipedia Integration Testing

This notebook tests the integration of Wikipedia articles for NYC landmarks into the vector database. It demonstrates the process of:

1. Fetching landmark information from the CoreDataStore API
2. Retrieving associated Wikipedia articles
3. Processing article content (fetching, cleaning, chunking)
4. Generating embeddings for article chunks
5. Storing embeddings in Pinecone vector database
6. Querying the vector database to retrieve Wikipedia content
7. Analyzing the distribution and quality of Wikipedia content in the vector database

The notebook serves as both a testing tool and a demonstration of the Wikipedia integration capabilities.

## Environment Setup

First, let's set up our environment by checking the Python installation and installing the project and its dependencies if needed.

In [None]:
# Create a python alias for python3 and verify the Python installation
!alias python=python3
!python --version

# Check if the project is installed correctly
!pip list | grep nyc-landmarks-vector-db || echo "Project not installed - install with 'pip install -e .'"

In [None]:
# Install the project in development mode if not already installed
import os

# Check if we're in the right directory structure
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
print(f"Project root directory: {project_root}")

# Check for setup.py to confirm we're in the right place
setup_py_path = os.path.join(project_root, "setup.py")
if os.path.exists(setup_py_path):
    print("setup.py found, installing project in development mode...")
    !cd {project_root} && pip install -e .
else:
    print(f"setup.py not found at {setup_py_path}, please check directory structure")

In [16]:
# Check for environment variables required by the project
import os

# List of potential required environment variables
env_vars = [
    "OPENAI_API_KEY",  # For OpenAI embeddings
    "PINECONE_API_KEY",  # For Pinecone vector DB
    "PINECONE_ENVIRONMENT",  # Pinecone environment
    "PINECONE_INDEX_NAME",  # Pinecone index name
]


def find_missing_env_vars(names, environ=None):
    """Return the entries of ``names`` that are unset or blank in ``environ``.

    Args:
        names: Iterable of environment variable names to check.
        environ: Mapping to look the names up in. Defaults to ``os.environ``.

    Returns:
        List of names (in input order) whose value is missing, empty, or
        whitespace-only.
    """
    env = os.environ if environ is None else environ
    # A variable exported with an empty value (e.g. `export KEY=`) is present
    # in the mapping but unusable, so treat blank values as missing too.
    return [name for name in names if not (env.get(name) or "").strip()]


missing_env_vars = set(find_missing_env_vars(env_vars))

print("Checking environment variables:")
for var in env_vars:
    if var in missing_env_vars:
        print(f"✗ {var} is NOT set")
    else:
        print(f"✓ {var} is set")

Checking environment variables:
✓ OPENAI_API_KEY is set
✓ PINECONE_API_KEY is set
✓ PINECONE_ENVIRONMENT is set
✓ PINECONE_INDEX_NAME is set


## Setup and Imports

Next, let's import the necessary modules, set up logging, and initialize the clients used throughout the notebook.

In [None]:
import logging
import os
import sys
import math
from typing import List

import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd

# Add project root to path to ensure imports work correctly
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from nyc_landmarks.db.db_client import get_db_client
from nyc_landmarks.db.wikipedia_fetcher import WikipediaFetcher
from nyc_landmarks.embeddings.generator import EmbeddingGenerator
from nyc_landmarks.models.wikipedia_models import (
    WikipediaArticleModel,
)
from nyc_landmarks.vectordb.pinecone_db import PineconeDB

# Root logging setup: each record shows timestamp, logger name, level and message
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger()  # no name given, so this is the root logger

# Instantiate the pipeline components shared by the cells below
db_client = get_db_client()  # landmark data is accessed via db_client (not api_client)
wiki_fetcher = WikipediaFetcher()
embedding_generator = EmbeddingGenerator()
pinecone_db = PineconeDB()

## 1. Exploring Landmark Data

Let's start by fetching some landmarks from the CoreDataStore API and exploring the data structure using interactive pagination widgets.

In [33]:
# Ask the data client how many landmark records exist; pagination is derived from this
print("Getting total landmark record count...")
total_records = db_client.get_total_record_count()
print(f"Total landmark records: {total_records}")

# Page-size selector for the interactive landmark browser
page_size_options = [10, 20, 50, 100]
page_size_dropdown = widgets.Dropdown(
    description="Page size:",
    options=page_size_options,
    value=10,
    layout=widgets.Layout(width="200px"),
    disabled=False,
)

# Calculate max page number based on total records and page size
def get_max_page(page_size, total=None):
    """Return the number of pages needed to show ``total`` records.

    Args:
        page_size: Number of records shown per page.
        total: Record count to paginate. When omitted, the notebook-level
            ``total_records`` value is used.

    Returns:
        The page count, never less than 1. The page input widget is created
        with ``min=1``, so a result of 0 (no records) would give it
        inconsistent bounds and a "Page 1 of 0" status.
    """
    if total is None:
        total = total_records
    return max(1, math.ceil(total / page_size))

# Page count for the initial page size; used by both the page input and the status text
initial_max_page = get_max_page(page_size_dropdown.value)

# Page number input, bounded to the valid page range
page_number = widgets.BoundedIntText(
    description="Page:",
    value=1,
    min=1,
    max=initial_max_page,
    step=1,
    layout=widgets.Layout(width="150px"),
    disabled=False,
)

# Navigation buttons
prev_button = widgets.Button(
    description="Previous",
    icon="arrow-left",
    tooltip="Go to previous page",
    button_style="",
    disabled=True,  # we start on page 1, so there is nothing to go back to
)

next_button = widgets.Button(
    description="Next",
    icon="arrow-right",
    tooltip="Go to next page",
    button_style="",
    disabled=False,
)

# Status label showing page info
status_label = widgets.Label(
    value=f"Page 1 of {initial_max_page} (Records: {total_records})"
)

# Output area that the landmark table is rendered into
output_area = widgets.Output()

# Function to fetch and display landmarks
def fetch_and_display_landmarks(page, page_size):
    """Fetch one page of landmarks and render it as a table in ``output_area``.

    Args:
        page: 1-based page number to request.
        page_size: Number of records requested per page.

    Returns:
        The ``results`` of the API response for that page, or ``None`` when
        the page is empty or the request raised an exception.
    """
    with output_area:
        clear_output()
        print(f"Fetching page {page} with {page_size} records per page...")
        try:
            # Fetch the data from the API
            api_response = db_client.get_lpc_reports(page=page, limit=page_size)

            # Nothing came back for this page
            if not api_response.results:
                print(f"No landmarks found on page {page}")
                return None

            # One row per landmark record
            rows = [record.model_dump() for record in api_response.results]
            page_df = pd.DataFrame(rows)

            # Position of this page's records within the full record set
            first_record = (page - 1) * page_size + 1
            last_record = min(
                first_record + len(api_response.results) - 1, total_records
            )

            print(f"Showing records {first_record}-{last_record} of {total_records}")
            display(page_df)

            # Hand the raw records back for further use
            return api_response.results
        except Exception as err:
            # Report the problem inside the output area instead of raising
            print(f"Error fetching landmarks: {str(err)}")
            return None

# Event handlers for widgets
def on_page_change(change):
    """React to a new page number: refresh buttons, status text and the table.

    Args:
        change: Change dictionary delivered by the widget observer; its
            'name', 'old' and 'new' entries are read.
    """
    global landmarks

    # Ignore notifications that are not an actual value change
    if change['name'] != 'value' or change['new'] == change['old']:
        return

    current_page = change['new']
    last_page = get_max_page(page_size_dropdown.value)

    # Navigation is only possible away from the ends of the range
    prev_button.disabled = (current_page <= 1)
    next_button.disabled = (current_page >= last_page)

    status_label.value = f"Page {current_page} of {last_page} (Records: {total_records})"

    # Load and show the requested page
    landmarks = fetch_and_display_landmarks(current_page, page_size_dropdown.value)

def on_page_size_change(change):
    """React to a new page size: re-bound the page input and refresh the view.

    Args:
        change: Change dictionary delivered by the widget observer; its
            'name', 'old' and 'new' entries are read.
    """
    global landmarks

    # Ignore notifications that are not an actual value change
    if change['name'] != 'value' or change['new'] == change['old']:
        return

    # Recalculate max page
    new_page_size = change['new']
    new_max_page = get_max_page(new_page_size)

    page_before = page_number.value

    # Update page number widget range, then pull the current page back into
    # range if the new page size left it beyond the last page
    page_number.max = new_max_page
    if page_number.value > new_max_page:
        page_number.value = new_max_page

    # Update status label and button states
    status_label.value = f"Page {page_number.value} of {new_max_page} (Records: {total_records})"
    prev_button.disabled = (page_number.value <= 1)
    next_button.disabled = (page_number.value >= new_max_page)

    # If the page number changed above, the observer registered on
    # page_number (on_page_change) has already refetched using the new page
    # size; fetching again here would issue a duplicate API request.
    if page_number.value == page_before:
        landmarks = fetch_and_display_landmarks(page_number.value, new_page_size)

def on_prev_button_click(b):
    """Step back one page unless the first page is already shown.

    Args:
        b: The clicked button (unused).
    """
    current_page = page_number.value
    if current_page > 1:
        page_number.value = current_page - 1

def on_next_button_click(b):
    """Advance one page unless the last page is already shown.

    Args:
        b: The clicked button (unused).
    """
    current_page = page_number.value
    if current_page < get_max_page(page_size_dropdown.value):
        page_number.value = current_page + 1

# Register event handlers: value observers first, then button clicks
page_number.observe(on_page_change, names="value")
page_size_dropdown.observe(on_page_size_change, names="value")
prev_button.on_click(on_prev_button_click)
next_button.on_click(on_next_button_click)

# Layout: one row of controls, then the status line, then the table output
controls = widgets.HBox(
    [page_size_dropdown, page_number, prev_button, next_button]
)
dashboard = widgets.VBox(
    [controls, status_label, output_area]
)

# Display the widgets
display(dashboard)

# Initial display: load the starting page so `landmarks` is populated for later cells
landmarks = fetch_and_display_landmarks(
    page_number.value,
    page_size_dropdown.value,
)

Getting total landmark record count...
Total landmark records: 1765


VBox(children=(HBox(children=(Dropdown(description='Page size:', layout=Layout(width='200px'), options=(10, 20…

## 2. Retrieving Wikipedia Articles for Landmarks

Now let's check which landmarks have associated Wikipedia articles and examine their structure.

In [None]:
# Function to check and display Wikipedia articles for a landmark

def check_wikipedia_articles(landmark_id: str) -> List[WikipediaArticleModel]:
    """Look up the Wikipedia articles linked to a landmark and print the count.

    Args:
        landmark_id: ID of the landmark to look up

    Returns:
        The articles returned by the database client for that landmark
    """
    found_articles = db_client.get_wikipedia_articles(landmark_id)
    article_count = len(found_articles)
    print(f"Found {article_count} Wikipedia articles for landmark: {landmark_id}")
    return found_articles

# Check Wikipedia articles for each landmark on the currently loaded page.
# `landmarks` is None when the last fetch returned no results or failed, so
# fall back to an empty list instead of crashing on iteration / len().
current_landmarks = landmarks or []
if not current_landmarks:
    print("No landmarks are loaded - fetch a page of landmarks above first")

landmark_articles = {}
for landmark in current_landmarks:
    landmark_id = landmark.lpNumber
    name = landmark.name
    print(f"Checking {name} ({landmark_id})...")
    articles = check_wikipedia_articles(landmark_id)
    # Only keep landmarks that actually have at least one article
    if articles:
        landmark_articles[landmark_id] = articles
    print("-" * 40)

print(
    f"Found {len(landmark_articles)} landmarks with Wikipedia articles out of {len(current_landmarks)} total"
)

In [31]:
# Display the Wikipedia articles we found
if landmark_articles:
    # Map landmark IDs to names once instead of scanning the list per landmark.
    # setdefault keeps the first occurrence of an ID; `landmarks` may be None
    # if the most recent page fetch failed.
    landmark_names = {}
    for lm in landmarks or []:
        landmark_names.setdefault(lm.lpNumber, lm.name)

    # Extract landmark ID, name, article title, and URL into a list of dictionaries
    articles_data = []
    for landmark_id, articles in landmark_articles.items():
        landmark_name = landmark_names.get(landmark_id, "Unknown")
        for article in articles:
            articles_data.append(
                {
                    "landmark_id": landmark_id,
                    "landmark_name": landmark_name,
                    "article_title": article.title,
                    "article_url": article.url,
                }
            )

    # Create a DataFrame for easier viewing
    articles_df = pd.DataFrame(articles_data)
    # A bare expression is only auto-rendered when it is the last top-level
    # statement of a cell; inside this `if` block it must be displayed explicitly.
    display(articles_df)
else:
    print("No landmarks with Wikipedia articles found in the sample")

## 3. Fetching and Processing Wikipedia Content

Now let's fetch the actual content from a Wikipedia article and process it for embedding. (The fetch-and-chunk steps are exercised as part of the end-to-end test in Section 7 below.)

## 7. End-to-End Wikipedia Processing Test

This section demonstrates a complete end-to-end workflow for processing and querying Wikipedia articles for NYC landmarks.

In [None]:
# Complete end-to-end test for a single landmark
def process_landmark_wikipedia_articles(landmark_id, max_chunks=5):
    """Process Wikipedia articles for a single landmark.

    Looks up the landmark's Wikipedia articles, fetches and chunks their
    content, embeds a limited number of chunks, stores them in Pinecone and
    runs one test query against the stored vectors.

    Args:
        landmark_id: ID of the landmark to process
        max_chunks: Maximum number of chunks to embed and store (keeps the
            test cheap). Pass None to embed every chunk.

    Returns:
        Dictionary with processing results. On failure it contains
        "success": False and a "reason" string; on success it contains
        "success": True, "landmark_id", "articles_processed" (articles whose
        content was fetched and chunked), "chunks_generated",
        "vectors_stored" and "query_results".
    """
    print(f"Processing Wikipedia articles for landmark {landmark_id}")

    # Step 1: Get Wikipedia articles for the landmark
    articles = db_client.get_wikipedia_articles(landmark_id)
    if not articles:
        print(f"No Wikipedia articles found for landmark {landmark_id}")
        return {"success": False, "reason": "No Wikipedia articles found"}

    print(f"Found {len(articles)} Wikipedia articles")

    # Step 2: Process each article
    all_chunks = []
    articles_processed = 0

    for i, article in enumerate(articles):
        print(f"\nProcessing article {i+1}/{len(articles)}: {article.title}")

        # Fetch article content
        content = wiki_fetcher.fetch_wikipedia_content(article.url)
        if not content:
            print(f"Failed to fetch content for article: {article.title}")
            continue

        print(f"Successfully fetched article content ({len(content)} chars)")

        # Chunk the content
        chunks = wiki_fetcher.chunk_wikipedia_text(
            content, chunk_size=1000, chunk_overlap=200
        )

        print(f"Split article into {len(chunks)} chunks")

        # Add article metadata to chunks
        for chunk in chunks:
            chunk["metadata"]["article_title"] = article.title
            chunk["metadata"]["article_url"] = article.url
            chunk["metadata"]["source_type"] = "wikipedia"
            chunk["metadata"]["landmark_id"] = landmark_id

        all_chunks.extend(chunks)
        # Only articles that were actually fetched and chunked count as processed
        articles_processed += 1

    # Stop before touching the vector store when there is nothing to store.
    # store_chunks is called with delete_existing=True below, which presumably
    # removes vectors already stored for this landmark (TODO confirm) - running
    # it with zero chunks could wipe good data and replace it with nothing.
    if not all_chunks:
        print(f"No chunks were produced for landmark {landmark_id}")
        return {
            "success": False,
            "reason": "No content could be fetched or chunked from the Wikipedia articles",
        }

    # Step 3: Generate embeddings (limited to the first max_chunks chunks for testing)
    test_chunks = all_chunks[:max_chunks]
    print(f"\nGenerating embeddings for {len(test_chunks)} chunks")

    chunks_with_embeddings = embedding_generator.process_chunks(test_chunks)
    print(f"Generated embeddings for {len(chunks_with_embeddings)} chunks")

    # Step 4: Store in Pinecone
    print("\nStoring embeddings in Pinecone...")
    vector_ids = pinecone_db.store_chunks(
        chunks=chunks_with_embeddings,
        id_prefix=f"wiki-{landmark_id}-",
        landmark_id=landmark_id,
        use_fixed_ids=True,
        delete_existing=True,
    )

    print(f"Stored {len(vector_ids)} vectors in Pinecone")

    # Step 5: Query the vectors
    # `landmarks` is None when the most recent page fetch failed; fall back to
    # the generic word "landmark" in that case, as for an unknown ID.
    landmark_name = next(
        (lm.name for lm in (landmarks or []) if lm.lpNumber == landmark_id),
        "landmark",
    )
    test_query = f"Tell me about the history of {landmark_name}"
    print(f"\nTest query: '{test_query}'")

    query_embedding = embedding_generator.generate_embedding(test_query)
    results = pinecone_db.query_vectors(
        query_embedding,
        top_k=3,
        filter_dict={"landmark_id": landmark_id, "source_type": "wikipedia"}
    )

    print(f"Found {len(results)} matching results")

    # Return the results
    return {
        "success": True,
        "landmark_id": landmark_id,
        "articles_processed": articles_processed,
        "chunks_generated": len(all_chunks),
        "vectors_stored": len(vector_ids),
        "query_results": results
    }

# Run the end-to-end test on the first landmark known to have Wikipedia articles
if not landmark_articles:
    print("No landmarks with Wikipedia articles available for testing")
else:
    test_landmark_id = next(iter(landmark_articles))
    print(f"Running end-to-end test with landmark: {test_landmark_id}")
    result = process_landmark_wikipedia_articles(test_landmark_id)

    if not result["success"]:
        print(f"Test failed: {result['reason']}")
    else:
        # Summary of what the pipeline did
        print("\nTest completed successfully!")
        print(f"Articles processed: {result['articles_processed']}")
        print(f"Chunks generated: {result['chunks_generated']}")
        print(f"Vectors stored: {result['vectors_stored']}")

        # Show each match with its score and a 200-character text preview
        print("\nQuery results:")
        for rank, match in enumerate(result["query_results"], start=1):
            print(f"\nMatch {rank} - Score: {match['score']:.4f}")
            print(f"Article: {match['metadata'].get('article_title', 'Unknown')}")
            print(f"Text: {match['metadata'].get('text', '')[:200]}...")