# Fix NA Rows in gridVeg Point Intercept Vegetation

This notebook investigates and fixes NA/NULL rows in the BigQuery table `mpg-data-warehouse.vegetation_point_intercept_gridVeg.gridVeg_point_intercept_vegetation`.

**Operation**: Identify and remove rows that contain a NULL value in any column

## Requirements
- Google Cloud credentials configured
- Configuration file: copy `config.example.yml` to `config.yml` and fill in your values (the notebook reads it from `../config.yml`, one directory above where it runs)
- Required packages: google-cloud-bigquery, pandas, pyyaml, pyarrow (used by `load_table_from_dataframe` to upload the DataFrame)


In [None]:
# Import required libraries
import yaml
import pandas as pd
from pathlib import Path
from google.cloud import bigquery
from datetime import datetime

print("Libraries imported successfully")


## Load Configuration


In [None]:
# Load configuration from YAML file.
# Produces BQ_TABLE_ID (required) plus BQ_PROJECT, BACKUP_BUCKET and
# BACKUP_PREFIX (optional) for the cells below.
config_path = Path("../config.yml")

if not config_path.exists():
    raise FileNotFoundError(
        f"Configuration file not found: {config_path}\n"
        "Please copy config.example.yml to config.yml and fill in your values."
    )

# Read explicitly as UTF-8 so parsing does not depend on the platform's default encoding.
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# yaml.safe_load returns None for an empty file and may return a non-mapping
# for malformed content; fail with a clear message instead of a TypeError.
if not isinstance(config, dict):
    raise ValueError(f"{config_path} must contain a YAML mapping at the top level")

# Missing or empty sections fall back to {} so that optional settings stay
# optional and the required-value check below reports what is actually missing.
_section = config.get('gridveg_point_intercepts') or {}
_bq_settings = _section.get('bigquery') or {}
_gcs_settings = _section.get('gcs') or {}

# Extract configuration values for gridVeg point intercepts
BQ_TABLE_ID = _bq_settings.get('table_vegetation')
BQ_PROJECT = _bq_settings.get('project')
BACKUP_BUCKET = _gcs_settings.get('backup_bucket')
BACKUP_PREFIX = _gcs_settings.get('backup_prefix', 'backups/gridveg_point_intercepts')

# Verify required config values ('your-project' is treated as an unedited placeholder)
if not BQ_TABLE_ID or 'your-project' in BQ_TABLE_ID:
    raise ValueError("Please configure gridveg_point_intercepts.bigquery.table_vegetation in config.yml")

print("✓ Configuration loaded successfully")
print(f"  Table ID: {BQ_TABLE_ID}")
print(f"  Backup: gs://{BACKUP_BUCKET}/{BACKUP_PREFIX}" if BACKUP_BUCKET else "  Backup: Not configured")


In [None]:
# Build the BigQuery client; the project is pinned only when one is configured,
# otherwise the client falls back to its own default project resolution.
client_kwargs = {"project": BQ_PROJECT} if BQ_PROJECT else {}
bq_client = bigquery.Client(**client_kwargs)

print("✓ BigQuery client initialized")
print(f"  Project: {bq_client.project}")


## Investigate Current Table State


In [None]:
# Fetch table metadata, then list every column with its type and nullability
table = bq_client.get_table(BQ_TABLE_ID)

print("Table Schema:")
for column in table.schema:
    # A column is reported as nullable unless its mode is REQUIRED
    is_nullable = column.mode != 'REQUIRED'
    print(f"  {column.name}: {column.field_type} (nullable: {is_nullable})")

row_total = table.num_rows
print(f"\nTotal rows in table: {row_total}")


In [None]:
# Pull the complete table into a DataFrame for local inspection
query = f"SELECT * FROM `{BQ_TABLE_ID}`"

print("Loading current table data...")
query_job = bq_client.query(query)
df_current = query_job.to_dataframe()

loaded_rows = len(df_current)
column_names = list(df_current.columns)
print(f"✓ Data loaded: {loaded_rows} rows")
print(f"  Columns: {column_names}")

# Summarise dtypes and non-null counts
df_current.info()


## Analyze NULL/NA Values


In [None]:
# Report, for every column, how many NULLs it holds and what share of rows that is
separator = "=" * 60
print("NULL Value Analysis:")
print(separator)

null_mask = df_current.isnull()
null_counts = null_mask.sum()
null_percentages = (null_mask.sum() / len(df_current) * 100)

for col, null_count in null_counts.items():
    null_pct = null_percentages[col]
    if null_count > 0:
        report = f"  {col:20s}: {null_count:5d} nulls ({null_pct:5.2f}%)"
    else:
        report = f"  {col:20s}: No nulls"
    print(report)

print("\n" + separator)


In [None]:
# Select every row that has a NULL in at least one column
has_any_null = df_current.isnull().any(axis=1)
rows_with_nulls = df_current[has_any_null]

print(f"Rows with at least one NULL value: {len(rows_with_nulls)}")

if len(rows_with_nulls) > 0:
    # Per-column tally, listing only the columns that actually contain NULLs
    print("\nBreakdown by column with NULL:")
    nulls_per_column = df_current.isnull().sum()
    for col, affected in nulls_per_column.items():
        if affected > 0:
            print(f"  {col}: {affected} rows")

    print("\nSample of rows with NULL values:")
    display(rows_with_nulls.head(20))


## Analysis Summary


In [None]:
# Generate summary report: overall row counts plus the per-column NULL tally.
def _pct(part, whole):
    """Return `part` as a percentage of `whole`, or 0.0 when `whole` is zero."""
    # An empty table would otherwise raise ZeroDivisionError in the prints below.
    return part / whole * 100 if whole else 0.0


total_rows = len(df_current)
null_rows = len(rows_with_nulls)
clean_rows = total_rows - null_rows

print("=" * 60)
print("DATA QUALITY ANALYSIS SUMMARY")
print("=" * 60)

print(f"\nTotal records in table: {total_rows}")
print(f"Records with NULL values: {null_rows} ({_pct(null_rows, total_rows):.2f}%)")
print(f"Clean records: {clean_rows} ({_pct(clean_rows, total_rows):.2f}%)")

print("\nNULL values by column:")
for col in df_current.columns:
    null_count = df_current[col].isnull().sum()
    # Only columns that actually contain NULLs are listed
    if null_count > 0:
        print(f"  {col}: {null_count} ({_pct(null_count, total_rows):.2f}%)")

print("\n" + "=" * 60)


## Backup Existing Table

Before making any changes, create a backup of the existing table to GCS.


In [None]:
# Backup existing table to GCS as sharded CSV files under a timestamped folder.
# Skipped (with a warning) when no backup bucket is configured.
if BACKUP_BUCKET:
    # Generate backup path with timestamp (local time of the machine running the notebook)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Normalise the pieces so a null prefix or stray slashes in config.yml
    # cannot produce paths like "gs://bucket/None/..." or "gs://bucket//...".
    path_parts = [
        str(BACKUP_BUCKET).strip("/"),
        str(BACKUP_PREFIX or "").strip("/"),
        f"fix_na_rows_{timestamp}",
        "*.csv",
    ]
    backup_path = "gs://" + "/".join(part for part in path_parts if part)

    print("Creating backup of existing table...")
    print(f"  Destination: {backup_path}")

    # Run the extract job in the table's own location rather than assuming
    # "US", so datasets hosted in other locations can be backed up too.
    source_location = bq_client.get_table(BQ_TABLE_ID).location

    # Export table to GCS
    extract_job = bq_client.extract_table(
        BQ_TABLE_ID,
        backup_path,
        location=source_location
    )

    extract_job.result()  # Wait for job to complete; raises if the export failed

    print("✓ Backup completed successfully")
    print(f"  Files: {backup_path}")
else:
    print("⚠ Backup bucket not configured in config.yml")
    print("  Set 'gridveg_point_intercepts.gcs.backup_bucket' to enable automatic backups")


## Prepare Clean Data

Remove every row that has a NULL value in at least one column.


In [None]:
# Build the clean dataset: keep only rows with no NULL in any column.
# The copy keeps df_clean independent of df_current.
df_clean = df_current.dropna().copy()

original_count = len(df_current)
clean_count = len(df_clean)
dropped_count = original_count - clean_count

print("Clean Dataset Preparation:")
print(f"  Original rows:    {original_count}")
print(f"  Rows with NULL:   {dropped_count}")
print(f"  Clean rows:       {clean_count}")
print(f"  Rows to remove:   {dropped_count}")

# Verify data integrity: no NULL may survive in the cleaned frame
nulls_remaining = df_clean.isnull().any().any()
print("\nData Integrity Check:")
print(f"  Any NULL values in clean data?: {nulls_remaining}")
print(f"  All rows complete?: {not nulls_remaining}")


## Replace Table with Clean Data

⚠️ **IMPORTANT**: This will REPLACE the entire table with the clean dataset (no NULL rows).

Review the summary above before proceeding.


In [None]:
# Replace table with clean data.
# Overwrites BQ_TABLE_ID with df_clean using WRITE_TRUNCATE, but only when
# there is something to remove and the result would not be an empty table.
rows_before = len(df_current)
rows_after = len(df_clean)
rows_removed = rows_before - rows_after

print("=" * 60)
print("REPLACING BIGQUERY TABLE WITH CLEAN DATA")
print("=" * 60)
print(f"\nTable: {BQ_TABLE_ID}")
print(f"Current rows: {rows_before}")
print(f"New rows (clean): {rows_after}")
print(f"Rows removed: {rows_removed}")
print("Mode: WRITE_TRUNCATE (replace entire table)")

if rows_removed == 0:
    # Nothing to fix: skip the destructive rewrite entirely.
    print("\nNo rows with NULL values found; table left unchanged.")
elif rows_after == 0:
    # Every row has a NULL somewhere; truncating would wipe the whole table.
    raise ValueError(
        "Every row contains at least one NULL value, so the replacement would "
        "empty the table. Aborting; review which columns must be non-NULL."
    )
else:
    print(f"\nStarting replacement at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}...")

    # With WRITE_TRUNCATE and no explicit schema, the load job infers the schema
    # from the DataFrame and overwrites the table's own. Passing the existing
    # schema keeps column types, modes and descriptions unchanged.
    existing_schema = bq_client.get_table(BQ_TABLE_ID).schema

    # Configure job to replace existing table
    job_config = bigquery.LoadJobConfig(
        schema=existing_schema,
        write_disposition="WRITE_TRUNCATE"  # Replace entire table
    )

    # Load clean dataframe to BigQuery
    load_job = bq_client.load_table_from_dataframe(
        df_clean,
        BQ_TABLE_ID,
        job_config=job_config
    )

    # Wait for job to complete; raises if the load failed
    load_job.result()

    print(f"\n✓ Replacement completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"  Rows written: {load_job.output_rows}")
    print(f"  Job ID: {load_job.job_id}")
