# Update gridVeg Image Metadata in BigQuery

This notebook appends new image metadata records to the BigQuery table from a CSV file stored in GCS.

**Operation**: APPEND new rows (not replace entire table)

## Requirements
- Google Cloud credentials configured
- Configuration file: copy `config.example.yml` to `config.yml` and fill in your values (this notebook loads it from `../config.yml`, relative to the notebook's working directory)
- Required packages: google-cloud-bigquery, google-cloud-storage, pandas, pyyaml


In [None]:
# Standard-library imports
from datetime import datetime
from pathlib import Path

# Third-party imports: data handling, config parsing and Google Cloud clients
import pandas as pd
import yaml
from google.cloud import bigquery, storage

print("Libraries imported successfully")


## Load Configuration

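**TODO**: Add a `gridveg_image_metadata` section to `config.yml` for this table (required keys: `gcs.csv_url` and `bigquery.table_id`; optional keys: `gcs.backup_bucket`, `gcs.backup_prefix`, `bigquery.project`)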


In [None]:
# Load configuration from YAML file.
# The path is relative to the notebook's working directory.
config_path = Path("../config.yml")

if not config_path.exists():
    raise FileNotFoundError(
        f"Configuration file not found: {config_path}\n"
        "Please copy config.example.yml to config.yml and fill in your values."
    )

# Explicit encoding so the file is decoded the same way on every platform.
with open(config_path, 'r', encoding='utf-8') as f:
    # An empty YAML document loads as None; normalise it to an empty dict so
    # the section check below reports the real problem instead of a TypeError.
    config = yaml.safe_load(f) or {}

# Locate the gridVeg image metadata section and its two sub-sections.
# Fail with an actionable message when any of them is absent, rather than
# surfacing a bare KeyError that only names the missing key.
config_section = config.get('gridveg_image_metadata') if isinstance(config, dict) else None
if not isinstance(config_section, dict):
    raise KeyError(
        "config.yml has no 'gridveg_image_metadata' section; "
        "please add it (with 'gcs' and 'bigquery' sub-sections) to config.yml"
    )

gcs_config = config_section.get('gcs')
bq_config = config_section.get('bigquery')
for subsection_name, subsection in (('gcs', gcs_config), ('bigquery', bq_config)):
    if not isinstance(subsection, dict):
        raise KeyError(
            f"config.yml has no 'gridveg_image_metadata.{subsection_name}' section; "
            "please add it to config.yml"
        )

# Extract configuration values for gridVeg image metadata.
# Required values are read with .get() so that a missing key falls through to
# the same "Please configure ..." error as an empty or placeholder value.
GCS_CSV_URL = gcs_config.get('csv_url')
BACKUP_BUCKET = gcs_config.get('backup_bucket')
BACKUP_PREFIX = gcs_config.get('backup_prefix', 'backups/gridveg_image_metadata')
BQ_TABLE_ID = bq_config.get('table_id')
BQ_PROJECT = bq_config.get('project')

# Verify required config values (reject missing values and template placeholders)
if not GCS_CSV_URL or GCS_CSV_URL.startswith('gs://your-'):
    raise ValueError("Please configure gridveg_image_metadata.gcs.csv_url in config.yml")
if not BQ_TABLE_ID or 'your-project' in BQ_TABLE_ID:
    raise ValueError("Please configure gridveg_image_metadata.bigquery.table_id in config.yml")

# Truncate long URLs in the summary so the output stays readable.
csv_url_display = f"{GCS_CSV_URL[:60]}..." if len(GCS_CSV_URL) > 60 else GCS_CSV_URL

print("✓ Configuration loaded successfully")
print(f"  CSV URL: {csv_url_display}")
print(f"  Table ID: {BQ_TABLE_ID}")
print(f"  Backup: gs://{BACKUP_BUCKET}/{BACKUP_PREFIX}" if BACKUP_BUCKET else "  Backup: Not configured")


In [None]:
# Create the BigQuery and Cloud Storage clients.
# When a project is configured it is passed to both clients explicitly;
# otherwise each client is constructed with no arguments.
if BQ_PROJECT:
    bq_client = bigquery.Client(project=BQ_PROJECT)
    storage_client = storage.Client(project=BQ_PROJECT)
else:
    bq_client = bigquery.Client()
    storage_client = storage.Client()

print(f"✓ Clients initialized")
print(f"  Project: {bq_client.project}")


## Load CSV Data from GCS

Read the source CSV file containing new image metadata records.


In [None]:
# Load the new records from the configured CSV location into a DataFrame
print("Reading CSV from GCS...")
df_new = pd.read_csv(GCS_CSV_URL)

# Report the size and column names of what was loaded
print(f"✓ CSV loaded successfully:")
print(f"  Rows: {df_new.shape[0]}")
print(f"  Columns: {df_new.columns.tolist()}")
print(f"\nFirst few rows:")

# Last expression of the cell: rendered by the notebook as a preview table
df_new.head()


## Transform CSV Data

Apply schema transformations to match BigQuery table:
- Rename columns to match destination schema
- Convert date format from mm/dd/yy to ISO format (YYYY-MM-DD)
- Add image_url column (initially NULL - to be populated separately)
- Clean up Direction field (handle invisible character issue in "North")


In [None]:
# Source CSV column name -> destination BigQuery column name
column_mapping = {
    '__kp_Photos': 'image_ID',
    'Survey Data::__kp_Survey': 'survey_ID',
    'Survey Data::SurveyDate': 'date',
    'Survey Data::SurveyYear': 'year',
    'Survey Data::_kf_Site': 'grid_point',
    'Direction': 'image_direction',
}

# Print the mapping as an aligned two-column listing
print("Column mapping:")
for source_name in column_mapping:
    print(f"  {source_name:35s} → {column_mapping[source_name]}")


In [None]:
# Verify CSV columns against the expected source schema.
# Unexpected extra columns are only reported. A missing source column cannot
# be recovered from (the later steps index the renamed columns by name), so
# stop here with a clear message instead of failing later with a KeyError.
expected_csv_columns = set(column_mapping.keys())
actual_csv_columns = set(df_new.columns)

unexpected_columns = actual_csv_columns - expected_csv_columns
missing_columns = expected_csv_columns - actual_csv_columns

if not unexpected_columns and not missing_columns:
    print("✓ CSV columns match expected schema")
else:
    print("⚠ CSV column differences detected:")
    if unexpected_columns:
        print(f"  Unexpected columns: {unexpected_columns}")
    if missing_columns:
        print(f"  Missing columns: {missing_columns}")

# Always show the actual columns so the diagnostics are visible before any error
print(f"\nCSV columns: {list(df_new.columns)}")

if missing_columns:
    raise ValueError(
        f"CSV is missing required source columns: {sorted(missing_columns)}. "
        "Fix the source file or update column_mapping before continuing."
    )


In [None]:
# Work on a copy so the raw CSV frame (df_new) is left untouched, and rename
# the source columns to their destination names in the same step
df_transformed = df_new.copy().rename(columns=column_mapping)

print("✓ Columns renamed")
print(f"  Transformed columns: {list(df_transformed.columns)}")


In [None]:
# Parse the m/d/yy date strings and keep only the calendar date.
# The format is given explicitly so parsing is consistent and warning-free.
# Note: %y maps 2-digit years 00-68 to 2000-2068 and 69-99 to 1969-1999.
parsed_dates = pd.to_datetime(df_transformed['date'], format='%m/%d/%y')
df_transformed['date'] = parsed_dates.dt.date

print("✓ Date format converted to date type")
print(f"  Sample dates: {df_transformed['date'].head().tolist()}")


In [None]:
# Clean up Direction field.
# The source mentions an "invisible difference in North" that displays as two
# levels. strip() alone only trims leading/trailing whitespace, so characters
# that are invisible but not whitespace (zero-width space/joiners, word joiner,
# BOM, soft hyphen) are removed first, wherever they occur in the value.
# NOTE(review): the exact invisible character is not visible from this
# notebook - confirm against the source data that it is covered by this set.
INVISIBLE_CHARS_PATTERN = r'[\u00ad\u200b\u200c\u200d\u2060\ufeff]'

df_transformed['image_direction'] = (
    df_transformed['image_direction']
    .str.replace(INVISIBLE_CHARS_PATTERN, '', regex=True)
    .str.strip()
)

# Check for unique values and any issues; repr() makes any remaining hidden
# characters visible in the printed counts
print("✓ Direction field cleaned")
print(f"  Unique directions: {sorted(df_transformed['image_direction'].dropna().unique())}")
print(f"  Direction counts:")
for direction, count in df_transformed['image_direction'].value_counts().items():
    print(f"    {repr(direction):12s}: {count}")


In [None]:
# The destination schema has an image_url column that the source CSV lacks.
# Create it here with every value set to None (NULL).
df_transformed = df_transformed.assign(image_url=None)

print("✓ Added image_url column (initially NULL)")
print(f"  Column will be populated separately with actual image URLs")


In [None]:
# Column order of the destination schema
expected_column_order = [
    'image_ID',
    'image_url',
    'survey_ID',
    'date',
    'year',
    'grid_point',
    'image_direction',
]

# Select exactly these columns, in this order
df_transformed = df_transformed.loc[:, expected_column_order]

print("✓ Columns reordered to match destination schema")
print(f"  Final columns: {list(df_transformed.columns)}")


In [None]:
# Summarise the transformed frame (dtypes, non-null counts, memory usage)
print("Transformed Data Info:")
df_transformed.info()

# Last expression of the cell: rendered by the notebook as a preview table
print(f"\nTransformed data preview:")
df_transformed.head(n=5)
