# Update Plant Species Metadata in BigQuery

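This notebook prepares an update of plant species metadata in BigQuery from a CSV file stored in Google Cloud Storage (GCS). In its current form it loads the new CSV, compares it with the existing BigQuery table, and backs the existing table up to GCS; the step that actually writes to BigQuery is not implemented yet (see "Next Steps" at the end).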

## Requirements
- Google Cloud credentials configured
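- Configuration file: copy `config.example.yml` to `config.yml` and fill in your values. The notebook reads `../config.yml`, so the file must sit one directory above the notebook's working directory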
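- Required packages: google-cloud-bigquery, google-cloud-storage, pandas, pyyaml. Depending on your setup you may also need gcsfs (used by pandas to open `gs://` URLs) and db-dtypes (used by BigQuery's `to_dataframe()`)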


In [None]:
# Import required libraries
from datetime import datetime
from pathlib import Path

import pandas as pd
import yaml
from google.cloud import bigquery
from google.cloud import storage
from google.cloud.exceptions import NotFound

print("Libraries imported successfully")


In [None]:
# Load configuration from YAML file.
# The path is relative to the notebook's working directory.
config_path = Path("../config.yml")

if not config_path.exists():
    raise FileNotFoundError(
        f"Configuration file not found: {config_path}\n"
        "Please copy config.example.yml to config.yml and fill in your values."
    )

# Read as UTF-8 explicitly so parsing does not depend on the platform's
# default text encoding.
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# An empty file parses to None, and a missing or non-mapping section would
# otherwise surface below as a bare TypeError/KeyError/AttributeError.
if not isinstance(config, dict):
    raise ValueError(
        f"{config_path} is empty or is not a YAML mapping. "
        "Please copy config.example.yml to config.yml and fill in your values."
    )
for section_name in ('gcs', 'bigquery'):
    if not isinstance(config.get(section_name), dict):
        raise ValueError(f"Please add a '{section_name}' section to config.yml")

gcs_config = config['gcs']
bq_config = config['bigquery']

# Extract configuration values
GCS_CSV_URL = gcs_config.get('csv_url')
BACKUP_BUCKET = gcs_config.get('backup_bucket')
BACKUP_PREFIX = gcs_config.get('backup_prefix')
if BACKUP_PREFIX is None:
    # Key absent or explicitly null: fall back to the default prefix instead
    # of letting the literal text "None" end up in the backup path.
    BACKUP_PREFIX = 'backups'
BQ_TABLE_ID = bq_config.get('table_id')
BQ_PROJECT = bq_config.get('project')

# Verify required config values: they must be non-empty strings and must not
# still carry the placeholder text checked for below.
if not isinstance(GCS_CSV_URL, str) or not GCS_CSV_URL or GCS_CSV_URL.startswith('gs://your-'):
    raise ValueError("Please configure gcs.csv_url in config.yml")
if not isinstance(BQ_TABLE_ID, str) or not BQ_TABLE_ID or 'your-project' in BQ_TABLE_ID:
    raise ValueError("Please configure bigquery.table_id in config.yml")

print("✓ Configuration loaded successfully")
# Only the first 50 characters of a long URL are echoed
print(f"  CSV URL: {GCS_CSV_URL[:50]}..." if len(GCS_CSV_URL) > 50 else f"  CSV URL: {GCS_CSV_URL}")
print(f"  Table ID: {BQ_TABLE_ID}")
print(f"  Backup: gs://{BACKUP_BUCKET}/{BACKUP_PREFIX}" if BACKUP_BUCKET else "  Backup: Not configured")


In [None]:
# Build the BigQuery and Cloud Storage clients.
# A project is passed only when one is configured; otherwise both clients
# are constructed with no arguments.
# NOTE(review): kept as separate no-argument calls rather than
# `project=BQ_PROJECT or None` - confirm each client treats an explicit None
# the same as an omitted argument before simplifying.
if BQ_PROJECT:
    bq_client = bigquery.Client(project=BQ_PROJECT)
    storage_client = storage.Client(project=BQ_PROJECT)
else:
    bq_client = bigquery.Client()
    storage_client = storage.Client()

# Report the project the BigQuery client ended up with
print(
    f"✓ Clients initialized",
    f"  Project: {bq_client.project}",
    sep="\n",
)


In [None]:
# Fetch the new metadata CSV into a DataFrame and report its shape
print("Reading CSV from GCS...")
df_new = pd.read_csv(GCS_CSV_URL)

# A single print call with a newline separator emits the same lines as
# separate calls would.
print(
    f"✓ CSV loaded successfully:",
    f"  Rows: {len(df_new)}",
    f"  Columns: {list(df_new.columns)}",
    f"\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell: rendered as its output
df_new.head()


In [None]:
# Display basic statistics about the new data.
# include="all" makes describe() summarize text columns (count / unique /
# top / freq) alongside numeric ones; by default non-numeric columns are
# left out whenever the frame has at least one numeric column.
print("New Data Info:")
df_new.info()
print("\nNew Data Description:")
df_new.describe(include="all")


## Read Existing BigQuery Table

Load the current data from BigQuery to compare with the new data.


In [None]:
# Read existing data from BigQuery.
# Only a "not found" response is treated as "the table doesn't exist yet".
# Any other failure (credentials, permissions, network, malformed table ID,
# missing pandas extras) propagates, so that later cells cannot mistake it
# for a new-table run and skip the backup.
print(f"Reading existing data from {BQ_TABLE_ID}...")
query = f"SELECT * FROM `{BQ_TABLE_ID}`"

try:
    df_existing = bq_client.query(query).to_dataframe()
except NotFound as e:
    print(f"⚠ Error reading table: {e}")
    print("  This may be expected if the table doesn't exist yet.")
    # Downstream cells test `df_existing is None` to detect a missing table
    df_existing = None
else:
    print("✓ Existing table loaded:")
    print(f"  Rows: {len(df_existing)}")
    print(f"  Columns: {list(df_existing.columns)}")


In [None]:
# Preview the current table contents and their schema summary.
# Nothing is shown when no existing table was loaded.
if df_existing is not None:
    existing_preview = df_existing.head()
    print("Existing data sample:")
    display(existing_preview)
    print("\nExisting Data Info:")
    df_existing.info()


## Compare Differences

Compare the new CSV data with the existing BigQuery table to identify changes. The comparison covers row counts and column names only; individual values are not compared.


In [None]:
# Summarize how the new CSV differs from the existing table.
# Only row counts and column names are compared.
if df_existing is None:
    print("No existing data to compare - this will be a new table creation.")
else:
    print("=== Comparison Summary ===\n")

    # Row counts side by side, with a signed difference
    existing_rows, new_rows = len(df_existing), len(df_new)
    print(f"Row count:")
    print(f"  Existing: {existing_rows}")
    print(f"  New:      {new_rows}")
    print(f"  Diff:     {new_rows - existing_rows:+d}\n")

    # Column names present on each side, and what each side lacks
    existing_cols = set(df_existing.columns)
    new_cols = set(df_new.columns)
    added_cols = new_cols - existing_cols
    removed_cols = existing_cols - new_cols

    if added_cols or removed_cols:
        print("⚠ Column differences detected:")
        if added_cols:
            print(f"  New columns: {added_cols}")
        if removed_cols:
            print(f"  Removed columns: {removed_cols}")
    else:
        print(f"✓ Columns match ({len(new_cols)} columns)")

    print(f"\nColumns: {list(df_new.columns)}")


## Backup Existing Table

Before making any changes, create a backup of the existing table to GCS.


In [None]:
# Backup existing table to GCS.
# Runs only when an existing table was loaded and a backup bucket is
# configured; otherwise it reports why the backup was skipped.
if df_existing is not None and BACKUP_BUCKET:
    # Timestamp (local time) keeps each backup run in its own folder
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Drop leading/trailing slashes from the prefix and skip it when empty,
    # so values like "backups/" or "" don't produce "//" in the object path.
    prefix = (BACKUP_PREFIX or "").strip("/")
    path_parts = [part for part in (BACKUP_BUCKET, prefix, timestamp) if part]
    # The wildcard lets BigQuery split the export across several files
    backup_path = "gs://" + "/".join(path_parts) + "/*.csv"

    print("Creating backup of existing table...")
    print(f"  Destination: {backup_path}")

    # Run the extract job in the location reported for the table itself
    # rather than assuming "US", so tables hosted elsewhere can be exported.
    table_location = bq_client.get_table(BQ_TABLE_ID).location

    # Export table to GCS
    extract_job = bq_client.extract_table(
        BQ_TABLE_ID,
        backup_path,
        location=table_location,
    )

    extract_job.result()  # Wait for job to complete; raises if the job failed

    print("✓ Backup completed successfully")
    print(f"  Files: {backup_path}")
elif df_existing is None:
    print("⚠ No existing table to backup (table doesn't exist yet)")
else:
    # Only remaining case: a table exists but no backup bucket is configured
    print("⚠ Backup bucket not configured in config.yml")
    print("  Set 'gcs.backup_bucket' to enable automatic backups")


## Next Steps

⚠️ **STOP HERE** - Review the comparison before proceeding with updates.

In the next iteration, we'll add:
1. Data validation and cleaning
2. BigQuery table update/replace logic
3. Error handling and logging
4. Rollback capability using backups
