## Prerequisites

- Google Cloud SDK (`gcloud`) installed
- GCP project with billing enabled
- Dataproc API enabled
- Compute Engine API enabled (for Solr VM)
- IAM permissions to create Cloud Storage buckets, Compute Engine VMs and firewall rules, and Dataproc clusters and jobs

In [None]:
import os
import sys
import subprocess
import json
import time
from google.cloud import storage
from google.cloud import dataproc_v1

# Later cells use paths relative to the project root (data/, data_gen/,
# spark_job/), so step up one level when the kernel started inside notebooks/.
current_dir_name = os.path.basename(os.getcwd())
if current_dir_name == "notebooks":
    os.chdir("..")
print(f"Current working directory: {os.getcwd()}")

## Configuration

Set your GCP project details and cluster configuration.

In [None]:
# --- Google Cloud project settings -------------------------------------------
PROJECT_ID = "your-project-id"  # TODO: Set your GCP project ID
REGION = "us-central1"
ZONE = "us-central1-a"
# Bucket that holds the generated data and the Spark job script.
BUCKET_NAME = PROJECT_ID + "-spark-solr-data"

# --- Dataproc cluster ----------------------------------------------------------
CLUSTER_NAME = "spark-solr-cluster"
DATAPROC_MASTER_TYPE = "n1-standard-4"
DATAPROC_WORKER_TYPE = "n1-standard-4"
DATAPROC_WORKER_COUNT = 2

# --- Solr on a Compute Engine VM ----------------------------------------------
SOLR_VM_NAME = "solr-instance"
SOLR_VM_TYPE = "n1-standard-2"
# Placeholder; the VM-creation cell assigns the real address.
SOLR_EXTERNAL_IP = None

# Echo the key settings so a wrong project or bucket is spotted early.
for _label, _value in (
    ("Project", PROJECT_ID),
    ("Region", REGION),
    ("Bucket", BUCKET_NAME),
):
    print(f"{_label}: {_value}")

## 1. Authenticate with GCP

Log in to Google Cloud Platform and set the active project.

In [None]:
!gcloud auth login
!gcloud config set project {PROJECT_ID}

## 2. Create GCS Bucket

Create a Cloud Storage bucket to store data and scripts.

In [None]:
# Create the bucket used for data and job scripts, or reuse it when it already exists.
storage_client = storage.Client(project=PROJECT_ID)

try:
    bucket = storage_client.create_bucket(BUCKET_NAME, location=REGION)
    print(f"✓ Created bucket: {BUCKET_NAME}")
except Exception as e:
    # google-api-core errors carry the HTTP status in `.code`. Comparing it is
    # safer than searching the message text, which would also match unrelated
    # errors that merely contain "409" (e.g. inside a project number).
    if getattr(e, "code", None) == 409:  # Conflict: bucket already exists
        bucket = storage_client.bucket(BUCKET_NAME)
        print(f"✓ Using existing bucket: {BUCKET_NAME}")
    else:
        print(f"✗ Failed to create bucket: {e}")
        raise

## 3. Generate and Upload Data

Generate dummy data locally and upload to GCS.

In [None]:
# Check if data already exists locally
local_data_exists = False
if os.path.exists("data/dummy_data.json"):
    with open("data/dummy_data.json") as f:
        lines = f.readlines()
    if len(lines) > 0:
        print(f"⏭️  Using existing local data: {len(lines)} records")
        local_data_exists = True

# Generate data if needed
if not local_data_exists:
    print("Generating data...")
    !python3 data_gen/generate_data.py
    with open("data/dummy_data.json") as f:
        lines = f.readlines()
    print(f"✓ Generated {len(lines)} records locally")

# Check if data already exists in GCS
blob = bucket.blob("data/dummy_data.json")
if blob.exists():
    print(f"⏭️  Data already exists in GCS: gs://{BUCKET_NAME}/data/dummy_data.json")
else:
    # Upload to GCS
    blob.upload_from_filename("data/dummy_data.json")
    print(f"✓ Uploaded to gs://{BUCKET_NAME}/data/dummy_data.json")

## 4. Create Solr VM on GCE

Launch a Compute Engine VM and install Solr Cloud.

In [None]:
# Check if Solr VM already exists
vm_exists_cmd = f"gcloud compute instances describe {SOLR_VM_NAME} --zone={ZONE} --format='get(name)' 2>/dev/null"
result = subprocess.run(vm_exists_cmd, shell=True, capture_output=True, text=True)

if result.stdout.strip() == SOLR_VM_NAME:
    print(f"⏭️  Solr VM '{SOLR_VM_NAME}' already exists")
    # Get existing IP
    ip_result = subprocess.run(
        f"gcloud compute instances describe {SOLR_VM_NAME} --zone={ZONE} --format='get(networkInterfaces[0].accessConfigs[0].natIP)'",
        shell=True, capture_output=True, text=True
    )
    SOLR_EXTERNAL_IP = ip_result.stdout.strip()
    print(f"  Solr URL: http://{SOLR_EXTERNAL_IP}:8983")
else:
    # Create Solr VM with startup script
    startup_script = """#!/bin/bash
apt-get update
apt-get install -y openjdk-11-jdk wget

# Download and setup Solr
cd /opt
wget https://archive.apache.org/dist/lucene/solr/8.11.3/solr-8.11.3.tgz
tar xzf solr-8.11.3.tgz
cd solr-8.11.3

# Start Solr in cloud mode
bin/solr start -c -m 2g

# Create collection
bin/solr create -c dummy_data -s 1 -rf 1

echo "Solr started on port 8983"
"""

    # Create VM
    create_vm_cmd = f"""
gcloud compute instances create {SOLR_VM_NAME} \\
    --project={PROJECT_ID} \\
    --zone={ZONE} \\
    --machine-type={SOLR_VM_TYPE} \\
    --image-family=debian-11 \\
    --image-project=debian-cloud \\
    --boot-disk-size=50GB \\
    --tags=solr-server \\
    --metadata=startup-script='{startup_script}'
"""

    print("Creating Solr VM...")
    !{create_vm_cmd}

    # Create firewall rule for Solr
    !gcloud compute firewall-rules create allow-solr \
        --project={PROJECT_ID} \
        --allow=tcp:8983 \
        --target-tags=solr-server \
        --description="Allow Solr traffic" \
        2>/dev/null || echo "Firewall rule already exists"

    # Wait for VM to be ready
    time.sleep(60)

    # Get external IP
    result = subprocess.run(
        f"gcloud compute instances describe {SOLR_VM_NAME} --zone={ZONE} --format='get(networkInterfaces[0].accessConfigs[0].natIP)'",
        shell=True, capture_output=True, text=True
    )
    SOLR_EXTERNAL_IP = result.stdout.strip()
    print(f"✓ Solr VM created with IP: {SOLR_EXTERNAL_IP}")
    print(f"  Solr URL: http://{SOLR_EXTERNAL_IP}:8983")

## 5. Create Dataproc Cluster

Launch a Dataproc cluster for running Spark jobs.

In [None]:
# Check if Dataproc cluster already exists
cluster_exists_cmd = f"gcloud dataproc clusters describe {CLUSTER_NAME} --region={REGION} --format='get(clusterName)' 2>/dev/null"
result = subprocess.run(cluster_exists_cmd, shell=True, capture_output=True, text=True)

if result.stdout.strip() == CLUSTER_NAME:
    print(f"⏭️  Dataproc cluster '{CLUSTER_NAME}' already exists")
else:
    # Create Dataproc cluster
    create_cluster_cmd = f"""
gcloud dataproc clusters create {CLUSTER_NAME} \\
    --project={PROJECT_ID} \\
    --region={REGION} \\
    --master-machine-type={DATAPROC_MASTER_TYPE} \\
    --worker-machine-type={DATAPROC_WORKER_TYPE} \\
    --num-workers={DATAPROC_WORKER_COUNT} \\
    --image-version=2.1-debian11 \\
    --enable-component-gateway \\
    --optional-components=JUPYTER \\
    --max-idle=3600s
"""

    print("Creating Dataproc cluster (this may take 3-5 minutes)...")
    !{create_cluster_cmd}
    print(f"✓ Dataproc cluster '{CLUSTER_NAME}' created")

## 6. Index Data with Dataproc

Submit the Spark job that indexes the data into the Solr instance running on GCP. The step first checks whether the data is already indexed and skips the job if that check passes.

In [None]:
# Check if indexing is already complete
import requests

def check_gcp_indexing_complete():
    """Report whether the Solr collection on GCP already holds the local dataset.

    The number of lines in data/dummy_data.json is compared with the document
    count of the dummy_data collection. When both match and are non-zero, the
    id of the first local record is looked up in Solr as a spot check.

    Returns:
        (True, solr_count) when the counts match and the sample document is
        found; (False, 0) in every other case, including any error (which is
        printed rather than raised).
    """
    not_indexed = (False, 0)
    try:
        # One record per line, so the line count is the local document count.
        with open("data/dummy_data.json") as data_file:
            local_count = sum(1 for _line in data_file)

        count_resp = requests.get(
            f"http://{SOLR_EXTERNAL_IP}:8983/solr/dummy_data/select?q=*:*&rows=0",
            timeout=10
        )
        if count_resp.status_code != 200:
            return not_indexed

        solr_count = count_resp.json()['response']['numFound']
        if local_count != solr_count or solr_count <= 0:
            return not_indexed

        # Counts agree: spot-check that the first local record is present.
        with open("data/dummy_data.json") as data_file:
            doc_id = json.loads(data_file.readline())['id']

        sample_resp = requests.get(
            f"http://{SOLR_EXTERNAL_IP}:8983/solr/dummy_data/select?q=id:{doc_id}&rows=1",
            timeout=10
        )
        if sample_resp.status_code != 200:
            return not_indexed
        if sample_resp.json()['response']['numFound'] > 0:
            return True, solr_count
        return not_indexed
    except Exception as e:
        print(f"Check failed: {e}")
        return not_indexed

already_indexed, doc_count = check_gcp_indexing_complete()

if already_indexed:
    print(f"⏭️  Skipping indexing: {doc_count} documents already indexed and verified in GCP Solr")
else:
    # Upload job to GCS if not already there
    blob = bucket.blob("jobs/index_to_solr_gcp.py")
    if not blob.exists():
        # Modify the Spark job to use GCS paths and remote Solr
        gcp_spark_job = f"""
from pyspark.sql import SparkSession
import os

def main():
    spark = SparkSession.builder \\
        .appName("SolrIndexer-GCP") \\
        .getOrCreate()

    # Read JSON data from GCS
    input_file = "gs://{BUCKET_NAME}/data/dummy_data.json"
    print(f"Reading data from {{input_file}}")
    
    df = spark.read.json(input_file)
    
    print("Schema:")
    df.printSchema()
    
    # Solr configuration - using external Solr VM
    # Note: In production, use internal IP and VPC peering
    zk_host = "{SOLR_EXTERNAL_IP}:9983"
    collection = "dummy_data"
    
    print(f"Indexing to Solr collection '{{collection}}' at ZK '{{zk_host}}'...")
    
    # Write to Solr
    df.write.format("solr") \\
        .option("zkhost", zk_host) \\
        .option("collection", collection) \\
        .option("gen_uniq_key", "true") \\
        .option("commit_within", "1000") \\
        .mode("overwrite") \\
        .save()
        
    print("Indexing complete.")
    spark.stop()

if __name__ == "__main__":
    main()
"""
        # Save and upload job
        with open("spark_job/index_to_solr_gcp.py", "w") as f:
            f.write(gcp_spark_job)
        blob.upload_from_filename("spark_job/index_to_solr_gcp.py")
        print(f"✓ Uploaded job to gs://{BUCKET_NAME}/jobs/index_to_solr_gcp.py")
    
    # Submit job to Dataproc
    submit_job_cmd = f"""
gcloud dataproc jobs submit pyspark \\
    gs://{BUCKET_NAME}/jobs/index_to_solr_gcp.py \\
    --cluster={CLUSTER_NAME} \\
    --region={REGION} \\
    --jars=gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \\
    --properties=spark.jars.packages=com.lucidworks.spark:spark-solr:4.0.0
"""

    print("Submitting Spark job to Dataproc...")
    !{submit_job_cmd}
    print("✓ Job completed")

## 7. Verify Indexing

Query the GCP-hosted Solr instance to verify data was indexed.

In [None]:
import requests

# Query Solr for document count
try:
    response = requests.get(f"http://{SOLR_EXTERNAL_IP}:8983/solr/dummy_data/select?q=*:*&rows=0")
    if response.status_code == 200:
        result = response.json()
        num_docs = result['response']['numFound']
        print(f"✓ Indexed {num_docs} documents in Solr")
    else:
        print("✗ Failed to query Solr")
except Exception as e:
    print(f"✗ Query failed: {e}")

# Show sample documents
print("\nSample documents:")
!curl -s "http://{SOLR_EXTERNAL_IP}:8983/solr/dummy_data/select?q=*:*&rows=3" | python3 -m json.tool

## 8. Cleanup Resources

**Important:** Delete GCP resources to avoid ongoing charges.

In [None]:
# Teardown commands for the resources this notebook creates: the Dataproc
# cluster, the Solr VM, the allow-solr firewall rule, and the GCS bucket with
# everything stored in it. They are left commented out so that running all
# cells never deletes anything by accident; as written, this cell only prints
# a reminder.
# Uncomment to clean up all GCP resources

# # Delete Dataproc cluster
# !gcloud dataproc clusters delete {CLUSTER_NAME} --region={REGION} --quiet
# print("✓ Deleted Dataproc cluster")

# # Delete Solr VM
# !gcloud compute instances delete {SOLR_VM_NAME} --zone={ZONE} --quiet
# print("✓ Deleted Solr VM")

# # Delete firewall rule
# !gcloud compute firewall-rules delete allow-solr --quiet
# print("✓ Deleted firewall rule")

# # Delete GCS bucket
# !gsutil -m rm -r gs://{BUCKET_NAME}
# print("✓ Deleted GCS bucket")

print("To clean up, uncomment the commands above")

## Cost Estimation

- **Dataproc Cluster**: ~$0.50-1.00/hour (2 workers + 1 master)
- **Solr VM**: ~$0.10-0.20/hour (n1-standard-2)
- **Storage**: ~$0.02/GB/month
- **Network Egress**: Variable

**Remember to delete resources when not in use!**