In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the project's `src` package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json
import os
import sys
from pathlib import Path

from loguru import logger

# Put the parent of the working directory on sys.path so the `src` package can
# be imported from this notebook. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
module_path = os.path.abspath("..")
if module_path not in sys.path:
    sys.path.append(module_path)

from src.base.utilities import generate_query, read_yaml
from src.utils.notebooks import patch_kfp

# Apply the project's KFP patch before the `src.components` imports that follow.
# NOTE(review): presumably this lets the components be called as plain
# functions inside the notebook — confirm in src.utils.notebooks.
patch_kfp()

from src.components.bigquery import execute_query
from src.components.data import get_data_version
from src.components.helpers import get_current_time

# Drop loguru's existing handlers (including the default one) and register a
# single stderr sink that emits messages at DEBUG level and above.
logger.remove()
logger.add(sys.stderr, level="DEBUG")

In [None]:
# Load the "data" section of the dev training payload; it supplies the dataset
# settings read below. JSON is read with an explicit UTF-8 encoding so the
# result does not depend on the platform's default locale.
with open("../src/pipelines/training/payloads/dev.json", encoding="utf-8") as f:
    payload = json.load(f)["data"]

# Fail fast when the project is not configured: a missing variable would
# otherwise yield None, which is later formatted into the table names as the
# literal text "None".
project_id = os.environ.get("VERTEX_PROJECT_ID")
if not project_id:
    raise RuntimeError(
        "Environment variable VERTEX_PROJECT_ID must be set before running "
        "this notebook."
    )

dataset_id = payload["dataset_id"]
dataset_location = payload["dataset_location"]
data_version = payload["data_version"]
create_replace_tables = payload["create_replace_tables"]

In [None]:
# Read the pipeline parameters from the shared YAML configuration.
config_params = read_yaml("../src/pipelines/configuration/params.yaml")
# Render the configured feature names as a backtick-quoted list separated by
# ",\n", ready to be substituted into the SQL template. The names are joined
# directly; wrapping them in a pass-through generator added nothing.
features = "`" + "`,\n`".join(config_params["features"]) + "`"

In [None]:
# Folder holding the training pipeline's query files, located relative to the
# parent of the current working directory.
queries_folder = Path.cwd().parent.joinpath("src", "pipelines", "training", "queries")

In [None]:
# Capture the value returned by the project's get_current_time helper component.
# NOTE(review): not referenced in the cells visible here — presumably consumed
# further down the notebook; confirm.
current_timestamp = get_current_time()

In [None]:
# Call get_data_version with the payload's version and the dataset coordinates;
# the result replaces `data_version`. Presumably this resolves the concrete
# version to use — confirm in src.components.data.
# NOTE(review): because the name is rebound, re-running this cell passes the
# previously returned value back in as payload_data_version.
data_version = get_data_version(
    payload_data_version=data_version,
    project_id=project_id,
    dataset_id=dataset_id,
    dataset_location=dataset_location,
)

In [None]:
# Fully qualified name of the versioned dataset, followed by the table names
# handed to the preprocessing query template.
dataset_name = f"{project_id}.{dataset_id}_{data_version}"
transactions_table, users_table, cards_table, holidays_table = (
    f"{dataset_name}.{table}"
    for table in ("transactions", "users", "cards", "holidays")
)
preprocessed_table = f"{dataset_name}.preprocessed_local"

# Job configuration is passed to execute_query as a JSON string.
query_job_config = json.dumps({"use_query_cache": True})

# Fill the q_preprocessing.sql template; the fraud delay configured in days is
# converted to seconds for the query.
preprocessing_query = generate_query(
    queries_folder / "q_preprocessing.sql",
    transactions_table=transactions_table,
    users_table=users_table,
    cards_table=cards_table,
    holidays_table=holidays_table,
    preprocessed_table=preprocessed_table,
    fraud_delay_seconds=config_params["fraud_delay_days"] * 24 * 60 * 60,
    features=features,
    create_replace_table=create_replace_tables,
)

# Kept as the cell's final expression so the notebook still displays the result.
execute_query(
    query=preprocessing_query,
    bq_client_project_id=project_id,
    query_job_config=query_job_config,
)