# Imports

In [77]:
import json
import logging
from datetime import datetime, timedelta, timezone

import boto3
import kfp
import pandas as pd
from google.cloud import aiplatform
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from kfp import dsl, compiler, components
from kfp.dsl import component

# Variables

In [78]:
# Temporary secrets manager: all settings and credentials are read from a
# local JSON file. NOTE(review): keep configs.json out of version control.
with open("configs.json", "r") as secrets_file:
    configs = json.load(secrets_file)

# Passed to the pipeline as its `max_objects` parameter.
max_objects = 1

# Generate timestamp in a Vertex-compatible format.
# datetime.utcnow() is deprecated; an aware UTC datetime formats identically.
TIMESTAMP = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S")

VAI_GCP_PROJECT_ID = configs.get("VAI_GCP_PROJECT_ID")
VAI_GCP_PROJECT_LOCATION = configs.get("VAI_GCP_PROJECT_LOCATION")
VAI_GCP_PIPELINE_BUCKET = configs.get("VAI_GCP_PIPELINE_BUCKET")

# Base pipeline name. It is also used further down for the compiled template
# file name, so it must be defined here for Restart & Run All to work.
VAI_GCP_PIPELINE_NAME = configs.get("VAI_GCP_PIPELINE_NAME")
VAI_GCP_PIPELINE_RUN_NAME = f"{VAI_GCP_PIPELINE_NAME}-{TIMESTAMP}"
GCP_PIPELINE_ROOT = f"gs://{VAI_GCP_PIPELINE_BUCKET}/{VAI_GCP_PIPELINE_RUN_NAME}"

VAI_AWS_ACCESS_KEY = configs.get("VAI_AWS_ACCESS_KEY")
VAI_AWS_SECRET_KEY = configs.get("VAI_AWS_SECRET_KEY")

VAI_S3_ANALYSIS_BUCKET = configs.get("VAI_S3_ANALYSIS_BUCKET")
VAI_S3_TRANSCRIPTS_LOCATION = configs.get("VAI_S3_TRANSCRIPTS_LOCATION")

  TIMESTAMP = datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")


In [79]:
# Display the generated run name (pipeline name plus the UTC timestamp suffix)
VAI_GCP_PIPELINE_RUN_NAME

'VAI_PIPELINE_RUN-2025-03-02-17-00-46'

# Set up Logging

In [80]:
# Logging configuration shared by the notebook-level helpers
def setup_logger():
    """Configure basic INFO-level logging and return this module's logger."""
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO)
    module_logger = logging.getLogger(__name__)
    return module_logger


logger = setup_logger()

# Set up Error Handling

In [81]:
def handle_exception(
    file_id: str,
    vai_gcs_bucket: str,
    run_folder: str,
    error_folder: str,
    error_message: str
):
    """
    Log a processing error and append it to the run's error-tracking CSV in GCS.

    The CSV is stored at
    ``gs://{vai_gcs_bucket}/{error_folder}/{run_folder}_errors.csv`` with the
    columns ``File_ID`` and ``Error_Message``; it is created on first use.
    Recording is best-effort: a failure to update the CSV is logged (with
    traceback) and never raised to the caller. No notification is sent.

    Args:
        file_id: Identifier of the file that failed.
        vai_gcs_bucket: Name of the GCS bucket holding the error CSV.
        run_folder: Run name used as the prefix of the CSV file name.
        error_folder: Folder (object prefix) inside the bucket for the CSV.
        error_message: Human-readable description of the failure.
    """
    # Log first so the error is visible even if the GCS update below fails.
    logger.error(f"Error processing file {file_id}: {error_message}")

    try:
        error_df_path = f"{error_folder}/{run_folder}_errors.csv"
        error_uri = f"gs://{vai_gcs_bucket}/{error_df_path}"
        new_row = pd.DataFrame([{"File_ID": file_id, "Error_Message": error_message}])

        gcs_client = storage.Client()
        blob = gcs_client.bucket(vai_gcs_bucket).blob(error_df_path)

        if blob.exists():
            error_df = pd.concat([pd.read_csv(error_uri), new_row], ignore_index=True)
        else:
            # Start the file from the new row directly; concatenating with an
            # empty placeholder frame triggers a pandas FutureWarning.
            error_df = new_row

        error_df.to_csv(error_uri, index=False)
        logger.info(f"Logged error for file {file_id} in {error_df_path}")

    except Exception as e:
        # logger.exception keeps the traceback, which the plain message loses.
        logger.exception(f"Failed to write to error tracking file: {e}")

# Component: Listing new Transcripts

In [82]:
# pipeline_run_name=VAI_GCP_PIPELINE_RUN_NAME
# aws_access_key=VAI_AWS_ACCESS_KEY
# aws_secret_key=VAI_AWS_SECRET_KEY
# s3_analysis_bucket=VAI_S3_ANALYSIS_BUCKET
# s3_transcript_location=VAI_S3_TRANSCRIPTS_LOCATION
# vai_gcs_bucket=VAI_GCP_PIPELINE_BUCKET
# max_objects=max_objects

In [83]:
@dsl.component(
    base_image="us-central1-docker.pkg.dev/dev-posigen/dev-voice-ai/voice-ai-docker-image:latest"
)
def list_s3_files_to_gcs(
    pipeline_run_name: str,
    aws_access_key: str,
    aws_secret_key: str,
    s3_analysis_bucket: str,
    s3_transcript_location: str,
    vai_gcs_bucket: str,
    max_objects: int
):
    """
    List transcript analysis JSON files in S3 and write the listing to GCS as a CSV.

    Creates the ``{pipeline_run_name}/Stagging`` and ``{pipeline_run_name}/Errored``
    placeholder folders in the GCS bucket, lists every ``.json`` object under the
    S3 prefix, parses the call id and timestamp out of each file name, and writes
    the listing (newest file timestamp first) to
    ``gs://{vai_gcs_bucket}/{pipeline_run_name}/Stagging/{pipeline_run_name}_S3_Transcripts_fetched.csv``.

    On failure the error is appended to
    ``{pipeline_run_name}/Errored/{pipeline_run_name}_errors.csv`` and re-raised
    so the pipeline step is reported as failed.

    Args:
        pipeline_run_name: Run name; used as the top-level folder in the bucket.
        aws_access_key: AWS access key id for the S3 client.
        aws_secret_key: AWS secret access key for the S3 client.
        s3_analysis_bucket: S3 bucket that holds the transcript analysis files.
        s3_transcript_location: Key prefix to list within the S3 bucket.
        vai_gcs_bucket: GCS bucket that receives the folders and the CSV.
        max_objects: Currently unused by this component.
            NOTE(review): presumably meant to cap how many transcripts are
            processed downstream - confirm before wiring it in.
    """
    # Imports stay inside the function: a KFP lightweight component runs from
    # its own source only, so notebook-level names are not available to it.
    import boto3
    import pandas as pd
    import logging
    from google.cloud import storage
    from datetime import datetime

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Generate pipeline folder paths (outside the try so the error handler can
    # always rely on them).
    staging_folder = f"{pipeline_run_name}/Stagging"
    errored_folder = f"{pipeline_run_name}/Errored"

    listing_columns = [
        'File', 'ID',
        'File_TIMESTAMP', 'File_Date', 'File_Time',
        'S3_TIMESTAMP', 'S3_Date', 'S3_Time',
    ]

    def _record_error(error_message: str) -> None:
        """Best-effort: log the error and append it to the run's error CSV in GCS."""
        logger.error(f"Error processing file N/A: {error_message}")
        try:
            error_df_path = f"{errored_folder}/{pipeline_run_name}_errors.csv"
            error_uri = f"gs://{vai_gcs_bucket}/{error_df_path}"
            new_row = pd.DataFrame([{"File_ID": "N/A", "Error_Message": error_message}])

            error_blob = storage.Client().bucket(vai_gcs_bucket).blob(error_df_path)
            if error_blob.exists():
                error_df = pd.concat([pd.read_csv(error_uri), new_row], ignore_index=True)
            else:
                error_df = new_row

            error_df.to_csv(error_uri, index=False)
            logger.info(f"Logged error for file N/A in {error_df_path}")
        except Exception as write_error:
            logger.exception(f"Failed to write to error tracking file: {write_error}")

    try:
        # Initialize GCS Client
        gcs_client = storage.Client()
        bucket = gcs_client.bucket(vai_gcs_bucket)

        # Create empty "folder" placeholder objects
        for folder in [staging_folder, errored_folder]:
            blob = bucket.blob(f"{folder}/")
            blob.upload_from_string("", content_type="application/x-www-form-urlencoded")

        logger.info(f"Created folders: {staging_folder} and {errored_folder} in GCS.")

        # Initialize S3 Client
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key
        )

        logger.info("Fetching New Transcripts to process")
        # A single list_objects_v2 call returns at most 1000 keys; paginate so
        # larger prefixes are listed completely.
        paginator = s3_client.get_paginator('list_objects_v2')
        pages = paginator.paginate(Bucket=s3_analysis_bucket, Prefix=s3_transcript_location)

        all_files = []
        for page in pages:
            for obj in page.get('Contents', []):
                file_path = obj['Key']
                if not file_path.endswith('.json'):
                    continue

                s3_ts = obj['LastModified']
                call_id = file_path.split('/')[-1].split("_analysis_")[0]

                # File names carry the timestamp after "analysis_", with "_"
                # standing in for ":" in the time portion.
                raw_ts = file_path.split('analysis_')[-1].split('.')[0].replace('_', ':')
                try:
                    file_ts = datetime.strptime(raw_ts, '%Y-%m-%dT%H:%M:%SZ')
                except ValueError:
                    # One malformed name must not abort the whole listing.
                    logger.warning(f"Skipping {file_path}: no parsable timestamp in file name")
                    continue

                all_files.append({
                    'File': file_path,
                    'ID': call_id,
                    'File_TIMESTAMP': file_ts,
                    'File_Date': file_ts.strftime('%Y-%m-%d'),
                    'File_Time': file_ts.strftime('%H:%M:%S'),
                    'S3_TIMESTAMP': s3_ts,
                    'S3_Date': s3_ts.strftime('%Y-%m-%d'),
                    'S3_Time': s3_ts.strftime('%H:%M:%S')
                })

        # Explicit columns keep sort_values working (and the CSV header intact)
        # when no transcript was found.
        files_sorted = pd.DataFrame(all_files, columns=listing_columns).sort_values(
            ['File_TIMESTAMP'], ascending=False, ignore_index=True
        )

        # Write DataFrame to the correct GCS path
        csv_path = f"gs://{vai_gcs_bucket}/{staging_folder}/{pipeline_run_name}_S3_Transcripts_fetched.csv"
        files_sorted.to_csv(csv_path, index=False)

        logger.info(f"Written Transcripts to GCS Bucket: {csv_path}")

    except Exception as e:
        _record_error(str(e))
        # Re-raise so the step is marked failed instead of silently succeeding
        # without having produced the listing CSV.
        raise

# Component: Process Audio Files

# Define the Pipeline

In [84]:
@dsl.pipeline(
    name=VAI_GCP_PIPELINE_RUN_NAME,
    description="Proces Amazon Audio Transcripts to KPIs"
)
def vai_audio_to_kpi_pipeline(
    pipeline_run_name: str,
    aws_access_key: str,
    aws_secret_key: str,
    s3_analysis_bucket: str,
    s3_transcript_location: str,
    vai_gcs_bucket: str,
    max_objects: int
):
    """
    Pipeline that lists new S3 transcripts and stages the listing in GCS.

    Every argument is forwarded unchanged to the ``list_s3_files_to_gcs``
    component, so the values supplied in ``parameter_values`` at submission
    time are the ones actually used.

    NOTE(review): the AWS credentials travel as plain pipeline parameters and
    are therefore visible in the job's parameter list; consider a secret store.
    """
    # Forward the pipeline parameters rather than notebook globals: globals
    # would be baked into the compiled YAML (credentials included) and the
    # runtime parameter values would be ignored.
    fetch_transcripts = list_s3_files_to_gcs(
        pipeline_run_name=pipeline_run_name,
        aws_access_key=aws_access_key,
        aws_secret_key=aws_secret_key,
        s3_analysis_bucket=s3_analysis_bucket,
        s3_transcript_location=s3_transcript_location,
        vai_gcs_bucket=vai_gcs_bucket,
        max_objects=max_objects
    )

# Compile the Pipeline

In [85]:
# Compile the pipeline to "<pipeline name>.yaml". The name is read from
# `configs` because no VAI_GCP_PIPELINE_NAME variable is guaranteed to exist
# on a fresh kernel.
compiler.Compiler().compile(
    pipeline_func=vai_audio_to_kpi_pipeline,
    package_path=f"{configs.get('VAI_GCP_PIPELINE_NAME')}.yaml",
)

# Run the Pipeline 

## Run in Vertex AI

In [86]:
# Initialize Vertex AI
aiplatform.init(project=VAI_GCP_PROJECT_ID, location=VAI_GCP_PROJECT_LOCATION)

# Create pipeline job
# NOTE(review): the AWS credentials are passed as plain parameter values and
# will be visible on the job; consider a secret store instead.
job = pipeline_jobs.PipelineJob(
    display_name=VAI_GCP_PIPELINE_RUN_NAME.lower(),
    job_id=f"vai-pipeline-run-{TIMESTAMP}".lower(),
    # Must point at the file written by the compile step; the name is read
    # from `configs` so this cell does not depend on leftover kernel state.
    template_path=f"{configs.get('VAI_GCP_PIPELINE_NAME')}.yaml",
    pipeline_root=f"gs://{VAI_GCP_PIPELINE_BUCKET}",
    project=VAI_GCP_PROJECT_ID,
    location=VAI_GCP_PROJECT_LOCATION,
    enable_caching=False,
    parameter_values={
        "pipeline_run_name": VAI_GCP_PIPELINE_RUN_NAME,
        "aws_access_key": VAI_AWS_ACCESS_KEY,
        "aws_secret_key": VAI_AWS_SECRET_KEY,
        "s3_analysis_bucket": VAI_S3_ANALYSIS_BUCKET,
        "s3_transcript_location": VAI_S3_TRANSCRIPTS_LOCATION,
        "vai_gcs_bucket": VAI_GCP_PIPELINE_BUCKET,
        # Defined once in the Variables cell.
        "max_objects": max_objects,
    },
)

In [87]:
# Submit the pipeline job to Vertex AI; the job's state updates are printed below
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/275963620760/locations/us-central1/pipelineJobs/vai-pipeline-run-2025-03-02-17-00-46
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/275963620760/locations/us-central1/pipelineJobs/vai-pipeline-run-2025-03-02-17-00-46')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/vai-pipeline-run-2025-03-02-17-00-46?project=275963620760
PipelineJob projects/275963620760/locations/us-central1/pipelineJobs/vai-pipeline-run-2025-03-02-17-00-46 current state:
3
PipelineJob projects/275963620760/locations/us-central1/pipelineJobs/vai-pipeline-run-2025-03-02-17-00-46 current state:
3
PipelineJob projects/275963620760/locations/us-central1/pipelineJobs/vai-pipeline-run-2025-03-02-17-00-46 current state:
3
PipelineJob projects/275963620760/locations/us-central1/pipelineJobs/vai-pipeline-run-2025-03-02-17-00-46 current state:
3
PipelineJob proje