# =====================================================================
# TLC NYC Taxi Data Ingestion Notebook (Google Cloud Platform)
# =====================================================================

In [None]:
import os
import re
import logging
import requests
import pandas as pd
from io import BytesIO
from datetime import datetime
from dateutil.relativedelta import relativedelta
from google.cloud import storage

# ------------------------------
# CONFIGURATION
# ------------------------------

In [None]:
# Name of the GCS bucket that receives the parquet files and the run log.
BUCKET_NAME = "nyc_raw_data_bucket"
# Dataset prefixes; each is used both in the source file name and as the
# top-level folder inside the bucket.
TAXI_TYPES = ["fhv", "green", "yellow", "fhvhv"]
# First month of the window that is checked for missing files.
START_DATE = datetime(2023, 1, 1)   # <-- change to 2019,2,1 for full history
# Last month of the window: day 1 of the current month moved back two months.
# The time-of-day returned by today() is kept (only the day is replaced).
# NOTE(review): the two-month lag presumably allows for publication delay of
# the source files -- confirm.
END_DATE = datetime.today().replace(day=1) - relativedelta(months=2)

# GCS client

In [None]:
# Shared client and bucket handle; the helper functions and the log upload
# below read these module-level names directly.
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

# ------------------------------
# LOGGING SETUP
# ------------------------------

In [None]:
# Timestamp of this run (local time), used to give each run its own log file.
run_time = datetime.now().strftime("%Y%m%d_%H%M%S")
log_filename = f"tlc_ingestion_{run_time}.log"
local_log_path = f"/tmp/{log_filename}"     # where the file handler writes during the run
gcs_log_path = f"logs/{log_filename}"       # object name the log is uploaded to in the bucket
# Echo the destination so it is visible in the notebook output.
print(gcs_log_path)

In [None]:
# Configure the root logger: every record at INFO or above goes to both the
# per-run log file and the console.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Detach and close handlers left over from a previous run of this cell.
# Clearing the list alone would drop the old FileHandler without closing it,
# leaking one open file descriptor per re-run.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

# file handler: persists the run log locally so it can be uploaded afterwards
fh = logging.FileHandler(local_log_path)
fh.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
fh.setFormatter(formatter)
logger.addHandler(fh)

# console handler: mirrors the same records in the notebook output
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)


# ------------------------------
# HELPERS
# ------------------------------

In [None]:
# --- Modified Functions to use logger instead of print ---
def process_and_upload_data_by_period(taxi_type, start_year_month, end_year_month):
    """Download monthly TLC parquet files and copy them into the GCS bucket.

    Walks every month from ``start_year_month`` to ``end_year_month``
    (inclusive). Each file is downloaded into memory, parsed with pandas as a
    sanity check, and then uploaded unchanged to
    ``{taxi_type}/{year}/{taxi_type}_tripdata_{year}-{month}.parquet`` in the
    module-level ``bucket``.

    A failure for one month is logged and does not stop the remaining months.

    Args:
        taxi_type: Dataset prefix used in the file name, e.g. ``"yellow"``.
        start_year_month: First month to process, formatted ``"YYYY-MM"``.
        end_year_month: Last month to process, formatted ``"YYYY-MM"``.

    Returns:
        A list of ``(year_month, error_message)`` tuples, one for each month
        that could not be ingested. The list is empty when every month
        succeeded.

    Raises:
        ValueError: If either bound is not of the form ``"YYYY-MM"``.
    """
    start_year, start_month = map(int, start_year_month.split('-'))
    end_year, end_month = map(int, end_year_month.split('-'))

    failures = []
    current_year, current_month = start_year, start_month

    while (current_year, current_month) <= (end_year, end_month):
        year_str = str(current_year)
        month_str = str(current_month).zfill(2)
        year_month = f"{year_str}-{month_str}"
        # Build the source URL for the parquet file
        url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{taxi_type}_tripdata_{year_str}-{month_str}.parquet"
        # Build the destination path in GCS (organized by taxi_type/year/)
        destination_blob_name = f"{taxi_type}/{year_str}/{taxi_type}_tripdata_{year_str}-{month_str}.parquet"

        try:
            logger.info(f"Downloading data from: {url}")
            # Download the parquet file into memory. The (connect, read)
            # timeout keeps a stalled connection from hanging the whole run;
            # a timeout surfaces as a RequestException and is handled below.
            response = requests.get(url, timeout=(10, 300))
            response.raise_for_status()

            # Wrap bytes in a memory buffer
            parquet_file_in_memory = BytesIO(response.content)
            # Validate the file by loading into pandas
            df = pd.read_parquet(parquet_file_in_memory)
            logger.info(f"Successfully loaded {len(df)} rows for {year_str}-{month_str}")
            # The frame was only needed for validation; release it before the
            # upload so the parsed copy is not held alongside the raw bytes.
            del df

            # Reset buffer cursor before uploading the same bytes to GCS
            parquet_file_in_memory.seek(0)
            # Create a blob handle and upload the file as-is to GCS
            blob = bucket.blob(destination_blob_name)
            blob.upload_from_file(parquet_file_in_memory, content_type='application/octet-stream')

            logger.info(f" Uploaded {destination_blob_name} to gs://{BUCKET_NAME}")

        except requests.exceptions.RequestException as e:
            logger.error(f" Error downloading {url}: {e}")
            failures.append((year_month, str(e)))
        except Exception as e:
            # Best-effort: keep going with the next month, but report it.
            logger.error(f" Unexpected error for {url}: {e}")
            failures.append((year_month, str(e)))

        # Advance to the next calendar month, rolling over the year.
        if current_month == 12:
            current_month = 1
            current_year += 1
        else:
            current_month += 1

    return failures

In [None]:
def automate_missing_downloads(missing_summary):
    """Ingest every missing month and report a per-month status.

    Args:
        missing_summary: Mapping of taxi type to a list of missing months,
            each formatted ``"YYYY-MM"``.

    Returns:
        A list of dicts with keys ``taxi_type``, ``year_month`` and ``status``
        (``"SUCCESS"`` or ``"ERROR"``); ``ERROR`` records also carry an
        ``error`` message.
    """
    status_records = []

    for taxi_type, missing in missing_summary.items():
        if not missing:
            logger.info(f" No missing months for {taxi_type.upper()}")
            continue

        logger.info(f" Processing missing months for {taxi_type.upper()} ({len(missing)} months)")

        for ym in missing:
            try:
                failures = process_and_upload_data_by_period(taxi_type, ym, ym)
            except Exception as e:
                logger.error(f" Error processing {taxi_type} {ym}: {e}")
                error = str(e)
            else:
                # The ingestion helper handles download/upload errors
                # internally, so "no exception" alone does not prove success.
                # A non-empty collection of failures returned by it marks the
                # month as failed; None or an empty result counts as success.
                if failures:
                    messages = [
                        str(item[-1]) if isinstance(item, (tuple, list)) else str(item)
                        for item in failures
                    ]
                    error = "; ".join(messages)
                else:
                    error = None

            if error is None:
                status_records.append({
                    "taxi_type": taxi_type,
                    "year_month": ym,
                    "status": "SUCCESS"
                })
            else:
                status_records.append({
                    "taxi_type": taxi_type,
                    "year_month": ym,
                    "status": "ERROR",
                    "error": error
                })

    logger.info("\n DOWNLOAD STATUS SUMMARY")
    for rec in status_records:
        if rec["status"] == "SUCCESS":
            logger.info(f" {rec['taxi_type']} {rec['year_month']} - SUCCESS")
        else:
            logger.error(f" {rec['taxi_type']} {rec['year_month']} - ERROR: {rec['error']}")
    return status_records

In [None]:


def list_available_year_months(bucket_name, taxi_type):
    """Collect the (year, month) pairs already stored for one taxi type.

    Lists every object under ``{taxi_type}/`` in ``bucket_name`` and keeps
    those whose name ends in ``YYYY-MM.parquet``; other objects are ignored.
    """
    name_re = re.compile(r"(\d{4})-(\d{2})\.parquet$")
    listing = storage_client.list_blobs(bucket_name, prefix=f"{taxi_type}/")

    # Lazily match each object name; non-matching names yield None.
    hits = (name_re.search(item.name) for item in listing)
    return {(int(hit.group(1)), int(hit.group(2))) for hit in hits if hit}

def build_expected_range(start_date, end_date):
    """Return the (year, month) tuples from start_date to end_date, inclusive.

    The range is evaluated at month granularity: the months containing
    ``start_date`` and ``end_date`` are both included regardless of the day
    or time-of-day of either bound.

    Args:
        start_date: Date/datetime inside the first month of the range.
        end_date: Date/datetime inside the last month of the range.

    Returns:
        A chronologically ordered list of ``(year, month)`` tuples; empty when
        ``start_date`` is later than ``end_date``.
    """
    if start_date > end_date:
        return []

    months = []
    year, month = start_date.year, start_date.month
    last = (end_date.year, end_date.month)
    # Compare (year, month) pairs rather than full datetimes so that a start
    # day later than the end day cannot drop the final month.
    while (year, month) <= last:
        months.append((year, month))
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1
    return months

# # --- Run checks with logging ---
# expected = build_expected_range(START_DATE, END_DATE)

# logger.info(f"Checking data availability from {START_DATE:%Y-%m} to {END_DATE:%Y-%m}")

# missing_summary = {}
# for taxi_type in TAXI_TYPES:
#     logger.info(f"Checking available files for {taxi_type.upper()}...")
#     available = list_available_year_months(BUCKET_NAME, taxi_type)
#     missing = [f"{y}-{str(m).zfill(2)}" for (y,m) in expected if (y,m) not in available]
#     missing_summary[taxi_type] = missing

#     if missing:
#         logger.warning(f" {taxi_type.upper()} missing months: {len(missing)} -> {', '.join(missing)}")
#     else:
#         logger.info(f" {taxi_type.upper()} has all months available.")

# logger.info("Finished missing months check.")

# # --- Run ingestion for missing months ---
# automate_missing_downloads(missing_summary)


# ------------------------------
# STEP 1: Identify Missing Months
# ------------------------------

In [None]:
# Every (year, month) that should exist in the bucket for the configured window.
expected = build_expected_range(START_DATE, END_DATE)
# taxi type -> list of "YYYY-MM" strings that are not yet in the bucket
missing_summary = {}

logger.info(f"Checking availability from {START_DATE:%Y-%m} to {END_DATE:%Y-%m}")
for taxi_type in TAXI_TYPES:
    logger.info(f"Checking {taxi_type.upper()}...")
    # Months already present in GCS for this taxi type.
    available = list_available_year_months(BUCKET_NAME, taxi_type)
    # Expected months with no matching object, in chronological order.
    missing = [f"{y}-{str(m).zfill(2)}" for (y, m) in expected if (y, m) not in available]
    missing_summary[taxi_type] = missing
    if missing:
        logger.warning(f"   Missing {len(missing)} months -> {', '.join(missing)}")
    else:
        logger.info(f"   All months available ")

# ------------------------------
# STEP 2: Ingest Missing Files
# ------------------------------

In [None]:
# Download and upload every missing month; `records` holds one status dict
# (taxi_type, year_month, status, and error on failure) per attempted month.
records = automate_missing_downloads(missing_summary)


# ------------------------------
# STEP 3: Upload Logs to GCS
# ------------------------------

In [None]:
# ------------------------------
# STEP 3: Upload Logs to GCS
# ------------------------------
# Upload the local run log to the bucket. Failures here are only printed
# (not raised) so a log-upload problem does not fail the notebook run.
try:
    # ensure file is written
    # Flush and close every handler on the logger so buffered records reach
    # the local file before it is uploaded.
    for handler in logger.handlers:
        handler.flush()
        # NOTE(review): logging.Handler defines close(), so this check is
        # expected to be true for every handler -- confirm before removing.
        if hasattr(handler, "close"):
            handler.close()

    if os.path.exists(local_log_path):
        log_blob = bucket.blob(gcs_log_path)
        log_blob.upload_from_filename(local_log_path)
        # print() rather than logger: the handlers were closed above.
        print(f" Logs uploaded to gs://{BUCKET_NAME}/{gcs_log_path}")
    else:
        print(f" Log file not found at {local_log_path}")
except Exception as e:
    print(f" Failed to upload logs to GCS: {e}")
