# Update gridVeg Point Intercepts in BigQuery

This notebook appends new point intercept data to BigQuery tables from a CSV file stored in GCS.

**Operation**: APPEND new rows (not replace entire table)

**Target Tables**:
- `gridVeg_point_intercept_vegetation` - vegetation intercept data (4 height layers)
- `gridVeg_point_intercept_ground` - ground cover intercept data

## Requirements
- Google Cloud credentials configured
- Configuration file: copy `config.example.yml` to `config.yml` and fill in your values (the notebook loads it from `../config.yml`, relative to its working directory)
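- Required packages: google-cloud-bigquery, google-cloud-storage, pandas, pyyaml (pandas may additionally need `gcsfs` to read a `gs://` CSV URL, and BigQuery's `to_dataframe()` may need `db-dtypes`)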


In [None]:
# Import required libraries
import yaml  # PyYAML: parses the config.yml settings file
import pandas as pd  # reads the source CSV and holds the transformed tables
from pathlib import Path  # locates the configuration file
from google.cloud import bigquery  # client for the target BigQuery tables
from google.cloud import storage  # client for Cloud Storage
from datetime import datetime  # NOTE(review): unused in the cells shown here — presumably needed by later cells

# Reached only if every import above succeeded
print("Libraries imported successfully")


In [None]:
# Load configuration from YAML file.
# The path is relative to the notebook's working directory.
config_path = Path("../config.yml")

if not config_path.exists():
    raise FileNotFoundError(
        f"Configuration file not found: {config_path}\n"
        "Please copy config.example.yml to config.yml and fill in your values."
    )

# Read explicitly as UTF-8 so parsing does not depend on the platform default encoding.
with open(config_path, 'r', encoding='utf-8') as f:
    # safe_load returns None for an empty file; fall back to an empty mapping
    config = yaml.safe_load(f) or {}

# Missing sections/keys resolve to None here so that the checks below can name
# the exact setting to fill in, instead of failing earlier with a bare KeyError.
_section = config.get('gridveg_point_intercepts') or {}
_gcs_cfg = _section.get('gcs') or {}
_bq_cfg = _section.get('bigquery') or {}

# Extract configuration values for gridVeg point intercepts
GCS_CSV_URL = _gcs_cfg.get('csv_url')
BACKUP_BUCKET = _gcs_cfg.get('backup_bucket')  # optional
BACKUP_PREFIX = _gcs_cfg.get('backup_prefix', 'backups/gridveg_point_intercepts')

BQ_TABLE_VEGETATION = _bq_cfg.get('table_vegetation')
BQ_TABLE_GROUND = _bq_cfg.get('table_ground')
BQ_PROJECT = _bq_cfg.get('project')  # optional

# Verify required config values: reject both missing values and the
# placeholder values ('gs://your-', 'your-project') left unedited.
if not GCS_CSV_URL or GCS_CSV_URL.startswith('gs://your-'):
    raise ValueError("Please configure gridveg_point_intercepts.gcs.csv_url in config.yml")
if not BQ_TABLE_VEGETATION or 'your-project' in BQ_TABLE_VEGETATION:
    raise ValueError("Please configure gridveg_point_intercepts.bigquery.table_vegetation in config.yml")
if not BQ_TABLE_GROUND or 'your-project' in BQ_TABLE_GROUND:
    raise ValueError("Please configure gridveg_point_intercepts.bigquery.table_ground in config.yml")

print("✓ Configuration loaded successfully")
# Long URLs are truncated to their first 60 characters in the summary
print(f"  CSV URL: {GCS_CSV_URL[:60]}..." if len(GCS_CSV_URL) > 60 else f"  CSV URL: {GCS_CSV_URL}")
print(f"  Vegetation Table: {BQ_TABLE_VEGETATION}")
print(f"  Ground Table: {BQ_TABLE_GROUND}")
print(f"  Backup: gs://{BACKUP_BUCKET}/{BACKUP_PREFIX}" if BACKUP_BUCKET else "  Backup: Not configured")


In [None]:
# Create the BigQuery and Cloud Storage clients.
# The project argument is passed only when one is configured; otherwise
# each client is constructed with no arguments.
_client_kwargs = {"project": BQ_PROJECT} if BQ_PROJECT else {}
bq_client = bigquery.Client(**_client_kwargs)
storage_client = storage.Client(**_client_kwargs)

print("✓ Clients initialized")
print(f"  Project: {bq_client.project}")


In [None]:
# Load the new point-intercept rows from the configured CSV location
print("Reading CSV from GCS...")
df_new = pd.read_csv(GCS_CSV_URL)

# Summarize what was read: row count and column names
_csv_summary = [
    "✓ CSV loaded successfully:",
    f"  Rows: {len(df_new)}",
    f"  Columns: {list(df_new.columns)}",
    "\nFirst few rows:",
]
print("\n".join(_csv_summary))

# Last expression of the cell, so the notebook renders this preview
df_new.head()


## Transform CSV Data

The source CSV will be transformed into two separate datasets:

### Vegetation Table Transformations
- Rename columns to match BigQuery schema
- Convert date format from mm/dd/yy to ISO format (YYYY-MM-DD)
- Filter out 2010 records (per requirements)
- Select columns: survey_ID, grid_point, date, year, transect_point, height_intercept_1, intercept_1-4

### Ground Table Transformations
- Rename columns to match BigQuery schema  
- Convert date format from mm/dd/yy to ISO format (YYYY-MM-DD)
- Filter out 2010 records (per requirements)
- Select columns: survey_ID, grid_point, date, year, transect_point, intercept_1, intercept_ground_code


In [None]:
# CSV header → BigQuery column name for the VEGETATION table.
# Insertion order is kept deliberately: the keys are later used, in order,
# to select columns from the CSV frame.
column_mapping_vegetation = {
    'Survey Data::__kp_Survey': 'survey_ID',
    'Survey Data::_kf_Site': 'grid_point',
    'Survey Data::SurveyDate': 'date',
    'Survey Data::SurveyYear': 'year',
    'PointTrans': 'transect_point',
    'Height': 'height_intercept_1',
    # The four hit columns share one pattern: _kf_Hit<N>_serial → intercept_<N>
    **{f'_kf_Hit{n}_serial': f'intercept_{n}' for n in range(1, 5)},
}

# One line per mapping entry, CSV name left-aligned in a 30-character field
print("Vegetation table column mapping:")
print("\n".join(
    f"  {source_name:<30} → {target_name}"
    for source_name, target_name in column_mapping_vegetation.items()
))


In [None]:
# CSV header → BigQuery column name for the GROUND table.
# Insertion order is kept deliberately: the keys are later used, in order,
# to select columns from the CSV frame.
column_mapping_ground = {
    "Survey Data::__kp_Survey": "survey_ID",
    "Survey Data::_kf_Site": "grid_point",
    "Survey Data::SurveyDate": "date",
    "Survey Data::SurveyYear": "year",
    "PointTrans": "transect_point",
    "_kf_Hit1_serial": "intercept_1",
    "GroundCover": "intercept_ground_code",
}

# One line per mapping entry, CSV name left-aligned in a 30-character field
print("Ground table column mapping:")
print("\n".join(
    f"  {source_name:<30} → {target_name}"
    for source_name, target_name in column_mapping_ground.items()
))


In [None]:
# Build the VEGETATION dataset from the raw CSV frame
_banner = "=" * 60
print(_banner)
print("TRANSFORMING DATA FOR VEGETATION TABLE")
print(_banner)

# Keep only the mapped CSV columns (in mapping order) and apply the BigQuery names
df_vegetation = (
    df_new.loc[:, list(column_mapping_vegetation)]
    .copy()
    .rename(columns=column_mapping_vegetation)
)

print("\n✓ Columns renamed")
print(f"  Transformed columns: {list(df_vegetation.columns)}")

# Parse m/d/yy strings, then keep only the calendar date (datetime.date objects)
_parsed_dates = pd.to_datetime(df_vegetation['date'], format='%m/%d/%y')
df_vegetation['date'] = _parsed_dates.dt.date

print("✓ Date format converted to date type")
print(f"  Sample dates: {df_vegetation['date'].head(3).tolist()}")

# Drop every row whose year equals 2010, recording the counts around the filter
rows_before = len(df_vegetation)
df_vegetation = df_vegetation.loc[df_vegetation['year'].ne(2010)].copy()
rows_after = len(df_vegetation)

print("\n✓ Filtered out 2010 records")
print(f"  Rows before: {rows_before}")
print(f"  Rows after:  {rows_after}")
print(f"  Removed:     {rows_before - rows_after}")

print("\nVegetation data preview:")
# Last expression of the cell, so the notebook renders this preview
df_vegetation.head()


In [None]:
# Build the GROUND dataset from the raw CSV frame
_banner = "=" * 60
print(_banner)
print("TRANSFORMING DATA FOR GROUND TABLE")
print(_banner)

# Keep only the mapped CSV columns (in mapping order) and apply the BigQuery names
df_ground = (
    df_new.loc[:, list(column_mapping_ground)]
    .copy()
    .rename(columns=column_mapping_ground)
)

print("\n✓ Columns renamed")
print(f"  Transformed columns: {list(df_ground.columns)}")

# Parse m/d/yy strings, then keep only the calendar date (datetime.date objects)
_parsed_dates = pd.to_datetime(df_ground['date'], format='%m/%d/%y')
df_ground['date'] = _parsed_dates.dt.date

print("✓ Date format converted to date type")
print(f"  Sample dates: {df_ground['date'].head(3).tolist()}")

# Drop every row whose year equals 2010, recording the counts around the filter
rows_before = len(df_ground)
df_ground = df_ground.loc[df_ground['year'].ne(2010)].copy()
rows_after = len(df_ground)

print("\n✓ Filtered out 2010 records")
print(f"  Rows before: {rows_before}")
print(f"  Rows after:  {rows_after}")
print(f"  Removed:     {rows_before - rows_after}")

print("\nGround data preview:")
# Last expression of the cell, so the notebook renders this preview
df_ground.head()


## Read Existing BigQuery Tables

Load the current data from both BigQuery tables to compare with the new data.


In [None]:
# Read existing VEGETATION table from BigQuery.
# Sets df_existing_vegetation to the table contents, or to None when the
# table does not exist yet. Any other failure is allowed to propagate.

# Imported in this cell so it stays self-contained
from google.cloud.exceptions import NotFound

print(f"Reading existing VEGETATION data from {BQ_TABLE_VEGETATION}...")
query = f"SELECT * FROM `{BQ_TABLE_VEGETATION}`"

try:
    df_existing_vegetation = bq_client.query(query).to_dataframe()
except NotFound as e:
    # Only "not found" means there are no existing rows. Other errors
    # (credentials, network, missing dataframe dependencies) must stop the
    # notebook: treating them as an empty table would make the comparison
    # step classify every CSV row as new and append duplicates.
    print(f"⚠ Table not found: {e}")
    print("  This is expected if the table doesn't exist yet.")
    df_existing_vegetation = None
else:
    print(f"✓ Existing vegetation table loaded:")
    print(f"  Rows: {len(df_existing_vegetation)}")
    print(f"  Columns: {list(df_existing_vegetation.columns)}")
    print(f"\nExisting vegetation data preview:")
    display(df_existing_vegetation.head())


In [None]:
# Read existing GROUND table from BigQuery.
# Sets df_existing_ground to the table contents, or to None when the
# table does not exist yet. Any other failure is allowed to propagate.

# Imported in this cell so it stays self-contained
from google.cloud.exceptions import NotFound

print(f"Reading existing GROUND data from {BQ_TABLE_GROUND}...")
query = f"SELECT * FROM `{BQ_TABLE_GROUND}`"

try:
    df_existing_ground = bq_client.query(query).to_dataframe()
except NotFound as e:
    # Only "not found" means there are no existing rows. Other errors
    # (credentials, network, missing dataframe dependencies) must stop the
    # notebook: treating them as an empty table would make the comparison
    # step classify every CSV row as new and append duplicates.
    print(f"⚠ Table not found: {e}")
    print("  This is expected if the table doesn't exist yet.")
    df_existing_ground = None
else:
    print(f"✓ Existing ground table loaded:")
    print(f"  Rows: {len(df_existing_ground)}")
    print(f"  Columns: {list(df_existing_ground.columns)}")
    print(f"\nExisting ground data preview:")
    display(df_existing_ground.head())


## Compare New vs Existing Data

Identify which rows in the new data are not already in the existing tables.

**Unique identifier**: `survey_ID + transect_point` (each survey has multiple transect points)


In [None]:
# Compare VEGETATION datasets.
# Produces df_vegetation_to_append: the CSV rows whose survey_ID|transect_point
# key is not yet in the existing table, or None when there are no such rows.
# If no existing table was loaded, every CSV row is selected.
print("=" * 60)
print("COMPARING VEGETATION DATA")
print("=" * 60)

if df_existing_vegetation is not None:
    print("\nRow count:")
    print(f"  Existing: {len(df_existing_vegetation)}")
    print(f"  New CSV:  {len(df_vegetation)}")

    # Column comparison (informational only; differences do not stop the cell)
    existing_cols = set(df_existing_vegetation.columns)
    new_cols = set(df_vegetation.columns)

    if existing_cols == new_cols:
        print(f"\n✓ Columns match ({len(new_cols)} columns)")
    else:
        print("\n⚠ Column differences detected:")
        if new_cols - existing_cols:
            print(f"  New columns: {new_cols - existing_cols}")
        if existing_cols - new_cols:
            print(f"  Missing columns: {existing_cols - new_cols}")

    # Composite keys are built as standalone Series rather than as a temporary
    # column, so neither DataFrame is modified and re-running this cell gives
    # the same column comparison and the same append frame.
    # NOTE(review): keys compare string renderings, so both sources must render
    # survey_ID / transect_point identically (e.g. '12' vs '12.0' would not
    # match) — confirm the dtypes agree.
    existing_key_series = (
        df_existing_vegetation['survey_ID'].astype(str) + '|' +
        df_existing_vegetation['transect_point'].astype(str)
    )
    new_key_series = (
        df_vegetation['survey_ID'].astype(str) + '|' +
        df_vegetation['transect_point'].astype(str)
    )

    existing_keys = set(existing_key_series)
    new_keys = set(new_key_series)

    # Find records to append
    keys_to_append = new_keys - existing_keys

    if keys_to_append:
        # new_key_series shares df_vegetation's index, so the boolean mask aligns row by row
        df_vegetation_to_append = df_vegetation[new_key_series.isin(keys_to_append)].copy()

        print(f"\n✓ Found {len(df_vegetation_to_append)} new vegetation records to append")

        # Show year breakdown
        print("\nNew records by year:")
        year_counts = df_vegetation_to_append['year'].value_counts().sort_index()
        for year, count in year_counts.items():
            print(f"  {year}: {count} records")
    else:
        df_vegetation_to_append = None
        print("\n⚠ No new records found - all keys already exist in table")

    # Check for duplicates (counted as distinct keys present in both sources)
    duplicate_keys = existing_keys & new_keys
    if duplicate_keys:
        print(f"\n⚠ Warning: {len(duplicate_keys)} records already exist in table")
        print("  These will be skipped during append.")
else:
    # No existing table, all records are new
    df_vegetation_to_append = df_vegetation.copy()
    print(f"✓ No existing table - will create new table with {len(df_vegetation_to_append)} records")


In [None]:
# Compare GROUND datasets.
# Produces df_ground_to_append: the CSV rows whose survey_ID|transect_point
# key is not yet in the existing table, or None when there are no such rows.
# If no existing table was loaded, every CSV row is selected.
print("=" * 60)
print("COMPARING GROUND DATA")
print("=" * 60)

if df_existing_ground is not None:
    print("\nRow count:")
    print(f"  Existing: {len(df_existing_ground)}")
    print(f"  New CSV:  {len(df_ground)}")

    # Column comparison (informational only; differences do not stop the cell)
    existing_cols = set(df_existing_ground.columns)
    new_cols = set(df_ground.columns)

    if existing_cols == new_cols:
        print(f"\n✓ Columns match ({len(new_cols)} columns)")
    else:
        print("\n⚠ Column differences detected:")
        if new_cols - existing_cols:
            print(f"  New columns: {new_cols - existing_cols}")
        if existing_cols - new_cols:
            print(f"  Missing columns: {existing_cols - new_cols}")

    # Composite keys are built as standalone Series rather than as a temporary
    # column, so neither DataFrame is modified and re-running this cell gives
    # the same column comparison and the same append frame.
    # NOTE(review): keys compare string renderings, so both sources must render
    # survey_ID / transect_point identically (e.g. '12' vs '12.0' would not
    # match) — confirm the dtypes agree.
    existing_key_series = (
        df_existing_ground['survey_ID'].astype(str) + '|' +
        df_existing_ground['transect_point'].astype(str)
    )
    new_key_series = (
        df_ground['survey_ID'].astype(str) + '|' +
        df_ground['transect_point'].astype(str)
    )

    existing_keys = set(existing_key_series)
    new_keys = set(new_key_series)

    # Find records to append
    keys_to_append = new_keys - existing_keys

    if keys_to_append:
        # new_key_series shares df_ground's index, so the boolean mask aligns row by row
        df_ground_to_append = df_ground[new_key_series.isin(keys_to_append)].copy()

        print(f"\n✓ Found {len(df_ground_to_append)} new ground records to append")

        # Show year breakdown
        print("\nNew records by year:")
        year_counts = df_ground_to_append['year'].value_counts().sort_index()
        for year, count in year_counts.items():
            print(f"  {year}: {count} records")
    else:
        df_ground_to_append = None
        print("\n⚠ No new records found - all keys already exist in table")

    # Check for duplicates (counted as distinct keys present in both sources)
    duplicate_keys = existing_keys & new_keys
    if duplicate_keys:
        print(f"\n⚠ Warning: {len(duplicate_keys)} records already exist in table")
        print("  These will be skipped during append.")
else:
    # No existing table, all records are new
    df_ground_to_append = df_ground.copy()
    print(f"✓ No existing table - will create new table with {len(df_ground_to_append)} records")
