"""

NYC Taxi Anomaly Detection Pipeline
Processes each month using previous months' data for training
Stores results in BigQuery anomaly tables

"""

In [None]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping
import warnings
from datetime import datetime, timedelta
import logging
from typing import List, Tuple, Dict

# NOTE(review): no category or module filter is given, so this silences every
# warning raised anywhere in the process, not just noisy library ones.
warnings.filterwarnings('ignore')


# config

In [None]:
# Configuration
# GCP project used for the BigQuery client and in every table reference.
PROJECT_ID = "nyctaxi-467111"
# Dataset holding the `<taxi_type>_<YYYY>_<MM>_hourly` input tables.
DATASET_NAME = "PreMlGold"
ANOMALY_DATASET = "PostMlGold"  # Dataset for anomaly results
# Taxi types processed in turn by main(); each is used as a table-name prefix.
TAXI_TYPES = ["yellow", "green", "fhv", "fhvhv"]

In [None]:
# Setup logging
# Root logger at INFO, formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)

In [None]:
# Initialize BigQuery client
# Single module-level client shared by every query and load job in this file.
# NOTE(review): no credentials are passed here, so this presumably relies on
# ambient/default credentials — confirm for the target environment.
client = bigquery.Client(project=PROJECT_ID)

In [None]:
def get_available_partitions(taxi_type: str) -> List[str]:
    """List the ``YYYY_MM`` partitions that exist for one taxi type.

    Looks up ``INFORMATION_SCHEMA.TABLES`` of the source dataset for tables
    named ``<taxi_type>_<YYYY>_<MM>_hourly`` and pulls the year/month part
    out of each table name.

    Args:
        taxi_type: Table-name prefix, e.g. ``"yellow"``.

    Returns:
        Partition keys formatted as ``"YYYY_MM"``, sorted ascending.
    """
    import re

    query = f"""
    SELECT table_name
    FROM `{PROJECT_ID}.{DATASET_NAME}.INFORMATION_SCHEMA.TABLES`
    WHERE REGEXP_CONTAINS(table_name, r'^{taxi_type}_[0-9]{{4}}_[0-9]{{2}}_hourly$')
    """
    rows = client.query(query).result()

    # The SQL filter already narrowed the names; the local pattern re-checks
    # them and captures the year and month groups.
    table_name_re = re.compile(rf'^{taxi_type}_(\d{{4}})_(\d{{2}})_hourly$')
    matches = (table_name_re.match(row.table_name) for row in rows)
    return sorted("_".join(m.groups()) for m in matches if m)

def get_processed_months(taxi_type: str) -> List[str]:
    """Return the months that already have rows in the anomaly table.

    Reads the ``year_month`` column that ``save_anomalies_to_bq`` writes with
    every row, so the values are in the same ``"YYYY_MM"`` form as the
    partition keys they are compared against.

    The month is deliberately not derived from the ``datetime`` column: that
    column is loaded as TIMESTAMP, which ``FORMAT_DATETIME`` does not accept,
    so such a query fails. Combined with a catch-all handler, that failure
    looked like "nothing processed yet" and caused every month to be
    re-scored and re-appended on each run.

    Args:
        taxi_type: Table-name prefix, e.g. ``"yellow"``.

    Returns:
        Sorted list of processed ``"YYYY_MM"`` keys; an empty list when the
        anomaly table (or its dataset) does not exist yet.

    Raises:
        Any BigQuery error other than "not found" (permissions, quota,
        network) is propagated instead of being reported as "nothing
        processed".
    """
    # Local import keeps this block self-contained; google.cloud is already a
    # dependency of this file via the BigQuery client.
    from google.cloud.exceptions import NotFound

    anomaly_table = f"{PROJECT_ID}.{ANOMALY_DATASET}.{taxi_type}_anomalies"
    query = f"""
    SELECT DISTINCT year_month
    FROM `{anomaly_table}`
    WHERE year_month IS NOT NULL
    ORDER BY year_month
    """

    try:
        rows = client.query(query).result()
    except NotFound:
        # Table doesn't exist yet, so no month has been processed.
        return []

    return [row.year_month for row in rows]

def load_monthly_data(taxi_type: str, partition: str) -> pd.DataFrame:
    """Fetch the hourly trip counts of a single monthly table.

    Args:
        taxi_type: Table-name prefix, e.g. ``"yellow"``.
        partition: Partition key that completes the table name
            ``<taxi_type>_<partition>_hourly``.

    Returns:
        DataFrame indexed by ``datetime`` (assembled in SQL from
        ``pickup_date`` and ``pickup_hour``) with a ``trips`` column, in the
        time order returned by the query.
    """
    table_ref = f"{PROJECT_ID}.{DATASET_NAME}.{taxi_type}_{partition}_hourly"
    query = f"""
    SELECT
        DATETIME(pickup_date, TIME(pickup_hour, 0, 0)) as datetime,
        trips
    FROM `{table_ref}`
    ORDER BY datetime
    """

    hourly = client.query(query).to_dataframe()
    return hourly.set_index('datetime')

def load_training_data(taxi_type: str, partitions: List[str]) -> pd.DataFrame:
    """Stack several monthly tables into one time-sorted training frame.

    Args:
        taxi_type: Table-name prefix, e.g. ``"yellow"``.
        partitions: Partition keys to load, one monthly table each.

    Returns:
        The row-wise concatenation of all monthly frames, sorted by index.

    Raises:
        ValueError: If ``partitions`` is empty (nothing to concatenate).
    """
    monthly_frames = [load_monthly_data(taxi_type, key) for key in partitions]
    stacked = pd.concat(monthly_frames, axis=0)
    return stacked.sort_index()

def create_sequences(data: np.ndarray, window_size: int) -> np.ndarray:
    """Slice a series into every overlapping window of ``window_size`` items.

    Consecutive windows are shifted by one element. When ``data`` is shorter
    than ``window_size`` there are no windows and an empty array is returned.

    Args:
        data: Sequence to slice.
        window_size: Number of elements per window.

    Returns:
        Array with one window per row.
    """
    n_windows = len(data) - window_size + 1
    windows = [data[start:start + window_size] for start in range(n_windows)]
    return np.array(windows)

def build_autoencoder(window_size: int):
    """Build the dense autoencoder used for anomaly detection.

    Layer widths are ``window_size -> 64 -> 32 -> 16 -> 32 -> 64 ->
    window_size``; hidden layers use ReLU and the model is compiled with the
    Adam optimizer and mean-squared-error loss.

    The output layer is linear. The windows fed to this model in this file are
    standardized with ``StandardScaler`` and therefore contain negative values
    and values above 1; a sigmoid output is confined to (0, 1) and could never
    reconstruct them, which inflates the reconstruction error of perfectly
    normal windows.

    Args:
        window_size: Length of each input window (and of the reconstruction).

    Returns:
        The compiled Keras model.
    """
    input_layer = Input(shape=(window_size,))

    # Encoder: compress the window down to a 16-unit bottleneck.
    encoded = Dense(64, activation='relu')(input_layer)
    encoded = Dense(32, activation='relu')(encoded)
    encoded = Dense(16, activation='relu')(encoded)

    # Decoder: mirror of the encoder.
    decoded = Dense(32, activation='relu')(encoded)
    decoded = Dense(64, activation='relu')(decoded)
    # Unbounded output so standardized (signed) inputs can be reproduced.
    output_layer = Dense(window_size, activation='linear')(decoded)

    autoencoder = Model(input_layer, output_layer)
    autoencoder.compile(optimizer='adam', loss='mse')

    return autoencoder

def detect_anomalies_for_month(
    train_data: pd.DataFrame,
    test_data: pd.DataFrame,
    window_size: int = 24 * 7  # 1 week
) -> pd.DataFrame:
    """Detect anomalies for a specific month using an autoencoder.

    The ``trips`` column of ``train_data`` is standardized and cut into
    overlapping windows of ``window_size`` rows, an autoencoder is trained to
    reconstruct those windows, and each window of ``test_data`` is then scored
    by its mean squared reconstruction error. A window is flagged when its
    error exceeds ``mean + 2.5 * std`` of the training-window errors.

    Each result row is attributed to the last row of its window, so the first
    ``window_size - 1`` rows of ``test_data`` have no result row.

    Args:
        train_data: Frame with a ``trips`` column, used to fit the scaler and
            the model.
        test_data: Frame with a ``trips`` column; its index supplies the
            ``datetime`` values of the result.
        window_size: Number of consecutive rows per window.

    Returns:
        DataFrame with columns ``datetime``, ``trips``,
        ``reconstruction_error``, ``threshold``, ``is_anomaly`` and
        ``anomaly_score`` (error divided by threshold). An empty DataFrame is
        returned when fewer than 100 training windows are available or when
        ``test_data`` is shorter than one window.
    """
    # Cheap size checks first, so no scaler or model is built for inputs that
    # cannot produce a result (this also covers empty frames).
    n_train_windows = len(train_data) - window_size + 1
    if n_train_windows < 100:
        logger.warning("Insufficient training data for robust anomaly detection")
        return pd.DataFrame()
    if len(test_data) < window_size:
        # Not even one complete window to score.
        return pd.DataFrame()

    # Fit the scaler on training data only and reuse it for the test month.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_data[['trips']].values)
    test_scaled = scaler.transform(test_data[['trips']].values)

    train_sequences = create_sequences(train_scaled.flatten(), window_size)
    test_sequences = create_sequences(test_scaled.flatten(), window_size)

    # Build and train model
    autoencoder = build_autoencoder(window_size)

    # A validation split is held out below, so stop on (and restore the
    # weights of) the best validation loss rather than the training loss.
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    autoencoder.fit(
        train_sequences, train_sequences,
        epochs=50,
        batch_size=32,
        verbose=0,
        callbacks=[early_stop],
        validation_split=0.1
    )

    # Threshold from the distribution of reconstruction errors on training windows.
    train_predictions = autoencoder.predict(train_sequences, verbose=0)
    train_mse = np.mean((train_sequences - train_predictions) ** 2, axis=1)
    threshold = np.mean(train_mse) + 2.5 * np.std(train_mse)

    # Reconstruction error of every test window.
    predictions = autoencoder.predict(test_sequences, verbose=0)
    mse = np.mean((test_sequences - predictions) ** 2, axis=1)

    # Window k ends at row k + window_size - 1, hence the offset on the
    # index and trips columns.
    return pd.DataFrame({
        'datetime': test_data.index[window_size-1:],
        'trips': test_data['trips'].iloc[window_size-1:].values,
        'reconstruction_error': mse,
        'threshold': threshold,
        'is_anomaly': mse > threshold,
        'anomaly_score': mse / threshold  # Normalized score
    })

def save_anomalies_to_bq(results: pd.DataFrame, taxi_type: str, year_month: str):
    """Append anomaly detection results to the per-taxi-type BigQuery table.

    Adds the ``taxi_type``, ``year_month`` and ``processing_timestamp``
    metadata columns to a copy of ``results`` and loads it into
    ``<PROJECT_ID>.<ANOMALY_DATASET>.<taxi_type>_anomalies`` with
    ``WRITE_APPEND``, waiting for the load job to finish.

    Args:
        results: Detection output with the columns listed in the load schema
            below (minus the three metadata columns). Left unmodified.
        taxi_type: Taxi type, stored in every row and used in the table name.
        year_month: Partition key of the processed month, stored in every row.

    Returns:
        None. Logs a warning and does nothing when ``results`` is empty.
    """
    # Local import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    if results.empty:
        logger.warning(f"No results to save for {taxi_type} {year_month}")
        return

    # Add metadata on a copy so the caller's frame does not grow extra columns.
    output = results.copy()
    output['taxi_type'] = taxi_type
    output['year_month'] = year_month
    # Timezone-aware UTC: a naive local time would be stored in the TIMESTAMP
    # column as if it were UTC.
    output['processing_timestamp'] = datetime.now(timezone.utc)

    # Table name
    table_id = f"{PROJECT_ID}.{ANOMALY_DATASET}.{taxi_type}_anomalies"

    # Configure job
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
        schema=[
            bigquery.SchemaField("datetime", "TIMESTAMP"),
            bigquery.SchemaField("trips", "INTEGER"),
            bigquery.SchemaField("reconstruction_error", "FLOAT"),
            bigquery.SchemaField("threshold", "FLOAT"),
            bigquery.SchemaField("is_anomaly", "BOOLEAN"),
            bigquery.SchemaField("anomaly_score", "FLOAT"),
            bigquery.SchemaField("taxi_type", "STRING"),
            bigquery.SchemaField("year_month", "STRING"),
            bigquery.SchemaField("processing_timestamp", "TIMESTAMP"),
        ]
    )

    # Load data
    job = client.load_table_from_dataframe(output, table_id, job_config=job_config)
    job.result()  # Wait for job to complete

    logger.info(f"Saved {len(output)} records to {table_id}")

def process_taxi_type(
    taxi_type: str,
    min_training_months: int = 3,
    max_training_months: int = 6,
):
    """Process all unprocessed months for a taxi type.

    Walks the available monthly partitions in ascending order. A month is
    skipped when it is already present in the anomaly table or when fewer than
    ``min_training_months`` earlier partitions exist. Otherwise the model is
    trained on the most recent ``max_training_months`` earlier partitions, the
    month is scored, and the results are saved to BigQuery.

    A failure while handling one month is logged with its traceback and does
    not stop the remaining months.

    Args:
        taxi_type: Table-name prefix, e.g. ``"yellow"``.
        min_training_months: Minimum number of earlier partitions required
            before a month is scored.
        max_training_months: Upper bound on how many of the most recent
            earlier partitions are loaded for training (kept small for
            efficiency).

    Raises:
        ValueError: If ``max_training_months`` is less than 1.
    """
    if max_training_months < 1:
        raise ValueError("max_training_months must be at least 1")

    logger.info(f"Processing {taxi_type} taxi...")

    # Get available and processed partitions
    available_partitions = get_available_partitions(taxi_type)
    # Set for O(1) membership checks inside the loop.
    processed_partitions = set(get_processed_months(taxi_type))

    logger.info(f"Available partitions: {len(available_partitions)}")
    logger.info(f"Already processed: {len(processed_partitions)}")

    # At least one earlier month is always needed, whatever the caller passes.
    required_history = max(1, min_training_months)

    # Process each month
    for i, test_partition in enumerate(available_partitions):
        if test_partition in processed_partitions:
            logger.info(f"Skipping {test_partition} - already processed")
            continue

        if i < required_history:
            logger.info(f"Skipping {test_partition} - insufficient training history")
            continue

        # Most recent earlier months only; the log below reports the number
        # actually used for training.
        train_partitions = available_partitions[max(0, i - max_training_months):i]

        logger.info(f"Processing {test_partition} using {len(train_partitions)} months for training")

        try:
            # Load data
            train_data = load_training_data(taxi_type, train_partitions)
            test_data = load_monthly_data(taxi_type, test_partition)

            # Detect anomalies
            results = detect_anomalies_for_month(train_data, test_data)

            if results.empty:
                logger.warning(f"No results generated for {test_partition}")
                continue

            # Save to BigQuery
            save_anomalies_to_bq(results, taxi_type, test_partition)
            logger.info(f"Successfully processed {test_partition}: {results['is_anomaly'].sum()} anomalies found")

        except Exception as e:
            # Per-month boundary: record the traceback and move on.
            logger.exception(f"Error processing {test_partition}: {e}")
            continue

def create_summary_view(taxi_type: str):
    """(Re)create the per-month anomaly summary view for one taxi type.

    The view ``<taxi_type>_anomaly_summary`` aggregates the
    ``<taxi_type>_anomalies`` table by ``year_month``: total hours, anomaly
    count and percentage, average and maximum trips, and the average anomaly
    score of the flagged rows.

    Args:
        taxi_type: Table-name prefix, e.g. ``"yellow"``.
    """
    dataset_ref = f"{PROJECT_ID}.{ANOMALY_DATASET}"
    view_ref = f"{dataset_ref}.{taxi_type}_anomaly_summary"
    source_ref = f"{dataset_ref}.{taxi_type}_anomalies"

    ddl = f"""
    CREATE OR REPLACE VIEW `{view_ref}` AS
    SELECT
        year_month,
        COUNT(*) as total_hours,
        SUM(CAST(is_anomaly AS INT64)) as anomaly_count,
        ROUND(AVG(CAST(is_anomaly AS INT64)) * 100, 2) as anomaly_percentage,
        AVG(trips) as avg_trips,
        MAX(trips) as max_trips,
        AVG(CASE WHEN is_anomaly THEN anomaly_score END) as avg_anomaly_score
    FROM `{source_ref}`
    GROUP BY year_month
    ORDER BY year_month
    """

    # Block until the DDL statement has finished before reporting success.
    ddl_job = client.query(ddl)
    ddl_job.result()
    logger.info(f"Created summary view for {taxi_type}")

In [None]:
def main():
    """Main pipeline execution.

    For every entry in ``TAXI_TYPES``: processes all unprocessed months and
    (re)creates the summary view. A failure for one taxi type is logged with
    its traceback and the loop continues with the next type.

    Afterwards prints a summary report with, per taxi type, the number of
    distinct processed months and the total number of anomalies read from the
    anomaly table, or ``No data`` when that cannot be determined.
    """
    logger.info("Starting NYC Taxi Anomaly Detection Pipeline")

    for taxi_type in TAXI_TYPES:
        logger.info(f"\n{'='*50}")
        logger.info(f"Processing {taxi_type.upper()} taxi")
        logger.info(f"{'='*50}")

        try:
            # Process all unprocessed months
            process_taxi_type(taxi_type)

            # Create summary view
            create_summary_view(taxi_type)

            logger.info(f"Completed processing for {taxi_type}")

        except Exception as e:
            # Top-level boundary: one failing taxi type must not stop the rest.
            logger.exception(f"Failed to process {taxi_type}: {e}")
            continue

    logger.info("\nPipeline completed!")

    # Print summary
    print("\n\nSUMMARY REPORT")
    print("="*60)
    for taxi_type in TAXI_TYPES:
        query = f"""
            SELECT
                COUNT(DISTINCT year_month) as months_processed,
                SUM(CAST(is_anomaly AS INT64)) as total_anomalies
            FROM `{PROJECT_ID}.{ANOMALY_DATASET}.{taxi_type}_anomalies`
            """

        try:
            result = next(iter(client.query(query).result()))
        except Exception as e:
            # Typically the anomaly table does not exist yet. Interrupts
            # (KeyboardInterrupt/SystemExit) are deliberately not caught.
            logger.debug(f"Summary unavailable for {taxi_type}: {e}")
            print(f"{taxi_type.upper():>6}: No data")
            continue

        if result.total_anomalies is None:
            # SUM over an empty table is NULL: the table exists but has no rows.
            print(f"{taxi_type.upper():>6}: No data")
            continue

        print(f"{taxi_type.upper():>6}: {result.months_processed} months, {result.total_anomalies:,} anomalies")

In [None]:
# Run the pipeline only when executed directly (notebook cell or script), not
# when this code is imported from elsewhere.
if __name__ == "__main__":
    main()