## Prerequisites

- Java 8, 11, or 17 installed and configured (`java` available on your `PATH`)
- Apache Spark installed, with `spark-submit` available on your `PATH`
- Python 3.12+ with dependencies installed (`uv sync`)

In [None]:
import os
import sys
import subprocess
import json

# The later cells use project-relative paths (data/, scripts/, spark_job/),
# so step up one level when the kernel was started inside notebooks/.
launched_from_notebooks = os.path.basename(os.getcwd()) == "notebooks"
if launched_from_notebooks:
    os.chdir(os.pardir)
print(f"Current working directory: {os.getcwd()}")

## 1. Check Environment

Verify that Java and Spark are properly configured.

In [None]:
# Check Java version
print("Java Version:")
!java -version

print("\nSpark Version:")
!spark-submit --version 2>&1 | head -n 1

## 2. Generate Data

Run the Python script to generate dummy JSON data locally.

In [None]:
!python3 data_gen/generate_data.py

# Verify data was created
if os.path.exists("data/dummy_data.json"):
    with open("data/dummy_data.json") as f:
        lines = f.readlines()
    print(f"✓ Generated {len(lines)} records")
    print(f"Sample record: {lines[0][:100]}...")
else:
    print("✗ Data generation failed")

## 3. Setup Local Solr

Download and start a local Solr instance on port 8983.

In [None]:
!./scripts/setup_solr.sh

# Verify Solr is running
import time
import requests

time.sleep(5)  # Wait for Solr to fully start

try:
    response = requests.get("http://localhost:8983/solr/admin/info/system")
    if response.status_code == 200:
        print("✓ Solr is running")
        solr_info = response.json()
        print(f"  Version: {solr_info['lucene']['solr-spec-version']}")
    else:
        print("✗ Solr is not responding")
except Exception as e:
    print(f"✗ Cannot connect to Solr: {e}")

## 4. Index Data with Local Spark

Submit the Spark job to index the generated data into local Solr.

In [None]:
# Run the indexing job with spark-submit against a local Spark master
# (`local[*]`), so no cluster is required.
# --packages names the spark-solr connector by its Maven coordinates
# (group:artifact:version); spark-submit resolves it before launching
# spark_job/index_to_solr.py.
!spark-submit \
    --master "local[*]" \
    --packages com.lucidworks.spark:spark-solr:4.0.0 \
    spark_job/index_to_solr.py

## 5. Verify Indexing

Query local Solr to ensure data was indexed successfully.

In [None]:
# Query Solr for document count
try:
    response = requests.get("http://localhost:8983/solr/dummy_data/select?q=*:*&rows=0")
    if response.status_code == 200:
        result = response.json()
        num_docs = result['response']['numFound']
        print(f"✓ Indexed {num_docs} documents in Solr")
    else:
        print("✗ Failed to query Solr")
except Exception as e:
    print(f"✗ Query failed: {e}")

# Show sample documents
print("\nSample documents:")
!curl -s "http://localhost:8983/solr/dummy_data/select?q=*:*&rows=3" | python3 -m json.tool

## 6. Cleanup (Optional)

Stop Solr and clean up generated data.

In [None]:
# Optional teardown: uncomment the three lines below to stop Solr (`stop -all`
# targets every running Solr instance) and delete the generated data file.
# !./solr-dist/solr-8.11.3/bin/solr stop -all
# !rm -rf data/dummy_data.json
# print("✓ Cleaned up")