In [3]:
import sys
from pathlib import Path
import logging
import pandas as pd
from typing import List

# Setup logging early
logging.basicConfig(
    level=logging.INFO, 
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# ---------------------------------------------------
# Set up paths so local modules work
# ---------------------------------------------------
sys.path.append(str(Path(__file__).resolve().parent))

# ---------------------------------------------------
# Import local modules
# ---------------------------------------------------
from data.extract import initialize_bigquery_client, extract_data
from data.bigquery_queries import get_marketing_data, get_dps_data
from data.transform import apply_cleanup
from data.cuped import apply_cuped_adjustment
from data.store import store_data_cloud
from utils.dates import get_iso_week_mondays
from utils.summary_stats import summarize_columns

# ---------------------------------------------------
# Pipeline function
# ---------------------------------------------------

def run_pipeline(project_id: str,
                 entities: List[str],
                 weeks: List[pd.Timestamp],
                 restaurant_flag: str = 'IN',
                 pre_post_metric_pairs: List = [("orders_pre", "orders_post"), 
                                                ("analytical_profit_pre", "analytical_profit_post")],
                 save_cloud: bool = False,
                 save_local: bool = False) -> pd.DataFrame:
    """
    Full holdout CUPED analysis pipeline.

    Args:
        project_id (str): GCP project ID.
        entities (List[str]): List of entity IDs.
        weeks (List[pd.Timestamp]): List of weeks to iterate.
        restaurant_flag (str): 'IN' or 'NOT IN' for restaurant filtering.
        pre_post_metric_pairs (List[tuple]): Pre/post metrics for CUPED.
        save_cloud (bool): If True, save output to GCS.
        save_local (bool): If True, also save weekly parquet locally.

    Returns:
        pd.DataFrame: Final combined DataFrame after CUPED adjustment.
    """

    logger.info("Starting pipeline...")
    client = initialize_bigquery_client(project_id)

    final_df = pd.DataFrame()

    for week in weeks:
        logger.info(f"Processing week: {week}")

        mkt_query = get_marketing_data(entities, week, restaurant_flag=restaurant_flag)
        dps_query = get_dps_data(entities, week, restaurant_flag=restaurant_flag)

        # Extract
        raw_data = extract_data(client, mkt_query, dps_query)
        raw_data["as_of_date"] = week

        # Transform
        cleaned_data = apply_cleanup(raw_data)

        final_df = pd.concat([final_df, cleaned_data], ignore_index=True)

    # CUPED
    logger.info("Applying CUPED adjustment...")
    cuped_data = apply_cuped_adjustment(final_df, pre_post_metric_pairs=pre_post_metric_pairs)

    # Store results
    if save_cloud or save_local:
        logger.info("Saving data...")
        store_data_cloud(
            df=cuped_data,
            week_dates=weeks,
            save_cloud_storage=save_cloud,
            save_local=save_local
        )

    logger.info("Pipeline complete.")
    return cuped_data



NameError: name '__file__' is not defined

In [9]:
import logging
from datetime import date

import sys
from pathlib import Path

from historical_pipeline import store_data_historically

# Set up logging globally
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def main():
    
    project_id = "logistics-customer-staging"
    entities = ('FP_PK', 'PY_DO')
    
    year = 2025

    min_date = date(2025, 2,2)    
    max_date = date(2025, 2, 3) 

    store_data_historically(
        project_id=project_id,
        entities=entities,
        year=year,
        min_date=min_date,
        max_date=max_date,
        restaurant_flag='IN',
        save_local=True
    )

if __name__ == "__main__":
    main()


2025-05-14 23:24:40,355 - INFO - Initializing BigQuery client for project: logistics-customer-staging
2025-05-14 23:24:41,457 - INFO - BigQuery client initialized for project: logistics-customer-staging
2025-05-14 23:24:41,458 - INFO - Starting historical storage for 2025 with 1 weeks.
2025-05-14 23:24:41,460 - INFO - Processing week: 2025-02-03
2025-05-14 23:24:41,460 - INFO - Starting data extraction...
2025-05-14 23:26:18,987 - INFO - Data extraction completed in 97.53 seconds.
2025-05-14 23:26:22,158 - INFO - apply_cleanup(): 0 rows removed. Final dataset size: 28855358


NameError: name 'convert_dtypes' is not defined