In [1]:
import json
import logging
import os
import tempfile
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta, timezone, UTC

import boto3
import pandas as pd
from google.cloud import logging as cloud_logging
from google.cloud import secretmanager
from google.cloud import storage

import warnings
warnings.filterwarnings("ignore", message="Skipping checksum validation")
warnings.filterwarnings("ignore", category=UserWarning)

# Function: Fetch Secrets

In [16]:
def fetch_secrets(
    project_id,
    secret_id,
    version_id="latest"
):
    """
    Read one version of a secret from Google Secret Manager and parse it as JSON.

    Args:
        project_id: Google Cloud project ID that owns the secret.
        secret_id: The ID of the secret to access.
        version_id: The version of the secret to read (default: "latest").

    Returns:
        The parsed JSON payload, i.e. whatever ``json.loads`` produces for
        the UTF-8 decoded secret data.

    Raises:
        ValueError: If the payload is not valid JSON. The underlying
            ``json.JSONDecodeError`` is chained as ``__cause__``.
    """
    # Create the Secret Manager client
    client = secretmanager.SecretManagerServiceClient()

    # Build the resource name of the secret version
    name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"

    # Access the secret version
    response = client.access_secret_version(request={"name": name})

    # Decode the raw bytes before parsing
    secret_payload = response.payload.data.decode("UTF-8")

    try:
        return json.loads(secret_payload)
    except json.JSONDecodeError as exc:
        # Keep the original parse error attached so the position of the
        # malformed JSON is not lost.
        raise ValueError("The secret payload is not a valid JSON") from exc

# Util Functions

### Function: Setup Logger

In [None]:
def setup_logger(log_file):
    """
    Configure the shared "vertex_pipeline_logger" to write to a file and the console.

    The logger is process-wide, so calling this function again with the same
    ``log_file`` is a no-op. Calling it with a different ``log_file`` (for
    example when the notebook is re-run with a new pipeline run name) replaces
    the previous handlers so records land in the newly requested file.

    Args:
        log_file (str): Path of the log file.

    Returns:
        logger: Configured logger instance, or None if setup failed (the
        failure is printed to stdout).
    """
    try:
        logger = logging.getLogger("vertex_pipeline_logger")
        logger.setLevel(logging.INFO)
        logger.propagate = False  # Prevent duplicate logs via the root logger

        # FileHandler stores its target as an absolute path in baseFilename,
        # so compare against the absolute form of the requested path.
        target_path = os.path.abspath(os.fspath(log_file))
        if any(
            isinstance(handler, logging.FileHandler) and handler.baseFilename == target_path
            for handler in logger.handlers
        ):
            return logger  # Already writing to this file; avoid duplicate handlers

        formatter = logging.Formatter(
            '%(asctime)s [%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S'
        )

        # Build the new handlers first: if the file cannot be opened, the
        # handlers that are currently attached stay untouched.
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)

        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)

        # Drop handlers left over from an earlier call that targeted another file.
        for stale_handler in list(logger.handlers):
            logger.removeHandler(stale_handler)
            stale_handler.close()

        logger.addHandler(file_handler)
        logger.addHandler(console_handler)

        return logger

    except Exception as e:
        print(f"Failed to initialize logger: {e}")
        return None

### Function: Handle Exception

In [None]:
# Serializes the read-modify-write of the error CSV. handle_exception is
# called from ThreadPoolExecutor workers, and without the lock two concurrent
# failures can read the same CSV and overwrite each other's row.
_ERROR_CSV_LOCK = threading.Lock()


def handle_exception(
    file_id,
    vai_gcs_bucket,
    run_folder,
    error_folder,
    error_message
):
    """
    Log an error and append it to the run's error-tracking CSV in GCS.

    The CSV lives at ``gs://{vai_gcs_bucket}/{error_folder}/{run_folder}_errors.csv``
    and has the columns ``File_ID`` and ``Error_Message``. It is created on
    the first error and rewritten with one more row on every later error.

    Args:
        file_id: Identifier of the file being processed ("N/A" is used by
            callers for errors that are not tied to one file).
        vai_gcs_bucket: Name of the GCS bucket holding the pipeline folders.
        run_folder: Pipeline run name; used as the CSV file-name prefix.
        error_folder: Folder (within the bucket) that holds the error CSV.
        error_message: Text to record for this error.

    Returns:
        None. Any failure while writing the CSV is logged and swallowed so
        that error reporting never raises into the caller.

    Note:
        Relies on the module-level ``logger`` being set up beforehand. The
        lock only protects threads of this process, not other processes
        writing to the same object.
    """
    try:
        error_df_path = f"{error_folder}/{run_folder}_errors.csv"

        logger.error(f"Error processing file {file_id}: {error_message}")

        gcs_client = storage.Client()
        bucket = gcs_client.bucket(vai_gcs_bucket)
        blob = bucket.blob(error_df_path)

        new_row = pd.DataFrame([{"File_ID": file_id, "Error_Message": error_message}])

        with _ERROR_CSV_LOCK:
            if blob.exists():
                existing_errors = pd.read_csv(f"gs://{vai_gcs_bucket}/{error_df_path}")
                error_df = pd.concat([existing_errors, new_row], ignore_index=True)
            else:
                # First error of the run: the new row is the whole file.
                error_df = new_row

            error_df.to_csv(f"gs://{vai_gcs_bucket}/{error_df_path}", index=False)

        logger.info(f"Logged error for file {file_id} in {error_df_path}")

    except Exception as e:
        logger.error(f"Failed to write to error tracking file: {e}")

# Functions:

## Function: Generate GCS Folders

In [None]:
def generate_gcs_folders(
    pipeline_run_name,
    vai_gcs_bucket
):
    """
    Create the folder layout of one pipeline run in GCS.

    A zero-byte placeholder object named ``<folder>/`` is uploaded for every
    folder so that each one exists before any data is written into it.

    Args:
        pipeline_run_name: Name of the run; used as the root folder.
        vai_gcs_bucket: Name of the GCS bucket to create the folders in.

    Returns:
        dict mapping the keys ``gcs_staging_folder``,
        ``gcs_intra_call_dfs_folder``, ``gcs_inter_call_dfs_folder``,
        ``gcs_transcripts_folder``, ``gcs_errored_folder`` and
        ``gcs_logs_folder`` to their paths inside the bucket. If anything
        fails, the error is passed to ``handle_exception`` and None is
        returned implicitly.
    """
    try:
        # Log through the pipeline logger (the one setup_logger configures)
        # rather than the root logger, so these messages reach the run's log
        # file like those of every other step.
        pipeline_logger = logging.getLogger("vertex_pipeline_logger")
        pipeline_logger.info("Started: generating GCS pipeline folders.")

        # NOTE: "Stagging" is spelled this way on purpose here; it is part of
        # the stored paths, so it is reproduced unchanged.
        gcs_folders = {
            'gcs_staging_folder': f"{pipeline_run_name}/Stagging",
            'gcs_intra_call_dfs_folder': f"{pipeline_run_name}/Stagging/IntraCallDFs",
            'gcs_inter_call_dfs_folder': f"{pipeline_run_name}/Stagging/InterCallDFs",
            'gcs_transcripts_folder': f"{pipeline_run_name}/Transcripts",
            'gcs_errored_folder': f"{pipeline_run_name}/Errored",
            'gcs_logs_folder': f"{pipeline_run_name}/Logs",
        }

        # Initialize GCS Client
        gcs_client = storage.Client()
        bucket = gcs_client.bucket(vai_gcs_bucket)

        # Create empty placeholder objects, one per folder
        for folder in gcs_folders.values():
            blob = bucket.blob(f"{folder}/")
            blob.upload_from_string("", content_type="application/x-www-form-urlencoded")
            pipeline_logger.info(f"Created folder: {folder}")

        pipeline_logger.info("Completed: generating GCS pipeline folders.")
        return gcs_folders

    except Exception as e:
        handle_exception("N/A", vai_gcs_bucket, pipeline_run_name, f"{pipeline_run_name}/Errored", str(e))

## Function: Generate S3 Folder Prefix

In [None]:
def generate_s3_folder_prefix(
    pipeline_run_name,
    vai_gcs_bucket,
    gcs_errored_folder
):
    """
    Build the ``YYYY/MM/DD/`` date prefix used to list transcripts in S3.

    The date is taken in UTC so that it agrees with the listing step, which
    parses file timestamps as UTC and compares them with the current UTC
    time. A run during the first hour of the day (00:00-00:59 UTC) uses the
    previous day's folder.

    Args:
        pipeline_run_name: Name of the run (only used for error reporting).
        vai_gcs_bucket: GCS bucket name (only used for error reporting).
        gcs_errored_folder: GCS folder of the error CSV (only used for
            error reporting).

    Returns:
        str: Prefix of the form ``"YYYY/MM/DD/"``. If anything fails, the
        error is passed to ``handle_exception`` and None is returned
        implicitly.
    """
    try:
        logger.info("Started: generating S3 folder prefix.")
        # Current date and time in UTC (timezone-aware), independent of the
        # host's local timezone.
        # NOTE(review): assumes the S3 date folders follow the UTC date of the
        # "...Z" file timestamps - confirm against the bucket layout.
        current_datetime = datetime.now(timezone.utc)

        # A run just after midnight still looks at the previous day's folder
        if current_datetime.hour == 0:
            adjusted_datetime = current_datetime - timedelta(days=1)
        else:
            adjusted_datetime = current_datetime

        # Zero-padded month and day to match the folder naming
        year = str(adjusted_datetime.year)
        month = f"{adjusted_datetime.month:02d}"
        day = f"{adjusted_datetime.day:02d}"

        # Construct the prefix for S3 listing
        prefix = f"{year}/{month}/{day}/"
        logger.info(f"Completed: generating S3 folder prefix {prefix}.")

        return prefix

    except Exception as e:
        handle_exception("N/A", vai_gcs_bucket, pipeline_run_name, gcs_errored_folder, str(e))

## Function: Get List of Calls to Process

In [None]:
def get_list_calls_to_process(
    pipeline_run_name,
    vai_gcs_bucket,
    gcs_staging_folder,
    gcs_errored_folder,
    aws_access_key,
    aws_secret_key,
    s3_analysis_bucket,
    s3_transcripts_location,
    s3_prefix,
    time_interval
):
    """
    List the transcript files in S3 that this run should process.

    Lists every object under ``{s3_transcripts_location}/{s3_prefix}``, keeps
    the ``.json`` files whose file-name timestamp passes the time filter,
    narrows them to the most recent 2-hour bin and writes the resulting list
    to ``gs://{vai_gcs_bucket}/{gcs_staging_folder}/{pipeline_run_name}_transcripts_to_process.csv``.

    Args:
        pipeline_run_name: Name of the run; used in the CSV file name.
        vai_gcs_bucket: GCS bucket that receives the CSV.
        gcs_staging_folder: Folder inside the bucket that receives the CSV.
        gcs_errored_folder: GCS folder of the error CSV (error reporting).
        aws_access_key: AWS access key ID for the S3 client.
        aws_secret_key: AWS secret access key for the S3 client.
        s3_analysis_bucket: S3 bucket to list.
        s3_transcripts_location: Key prefix of the transcripts in the bucket.
        s3_prefix: Date prefix (``YYYY/MM/DD/``) appended to the location.
        time_interval: Number of hours subtracted from the current UTC time
            to obtain the time threshold.

    Returns:
        pandas.DataFrame with the columns File, Call_ID, File_Timestamp,
        File_Date, File_Time, S3_Timestamp, S3_Date, S3_Time and Time_Bin;
        an empty DataFrame when no file qualifies. If anything fails, the
        error is passed to ``handle_exception`` and None is returned
        implicitly.
    """
    try:
        logger.info(f"Started: listing calls from: {s3_transcripts_location}/{s3_prefix}")
        # Initialize S3 Client
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key
        )

        paginator = s3_client.get_paginator('list_objects_v2')
        pages = paginator.paginate(Bucket=s3_analysis_bucket, Prefix=f"{s3_transcripts_location}/{s3_prefix}")

        # Current UTC time (timezone-aware). Only its time-of-day is used
        # below, so the date part does not influence the selection.
        current_time = datetime.now(timezone.utc)

        # Time threshold: time_interval hours before the current time
        time_threshold = current_time - timedelta(hours=time_interval)
        logger.info(f"Fetching Calls between: {time_threshold.time()} and {current_time.time()}")

        all_files = []

        for page in pages:
            for obj in page.get('Contents', []):
                file_path = obj['Key']
                s3_ts = obj['LastModified']

                # Skip non-JSON files
                if not file_path.endswith('.json'):
                    continue

                try:
                    # File names look like "<call_id>_analysis_<timestamp>Z.json"
                    call_id = file_path.split('/')[-1].split("_analysis_")[0]
                    call_timestamp = pd.to_datetime(
                        file_path.split('analysis_')[-1].split('.')[0].replace('Z', ""), utc=True
                    )

                    # NOTE(review): only the time-of-day is compared, and files at
                    # or BEFORE the threshold are kept, whereas the log message above
                    # says "between" threshold and now - confirm the intended window.
                    if call_timestamp.time() <= time_threshold.time():
                        all_files.append({
                            'File': file_path,
                            'Call_ID': call_id,
                            'File_Timestamp': call_timestamp,    # Amazon Connect Application Timestamp
                            'File_Date': call_timestamp.date().strftime('%Y-%m-%d'),
                            'File_Time': call_timestamp.time().strftime('%H:%M:%S'),
                            'S3_Timestamp': s3_ts,               # Amazon S3 LastModified Timestamp
                            'S3_Date': s3_ts.strftime('%Y-%m-%d'),
                            'S3_Time': s3_ts.strftime('%H:%M:%S')
                        })
                except Exception as e:
                    logger.warning(f"Skipping file {file_path} due to timestamp parsing error: {e}")
                    continue

        if all_files:
            df_calls_list = pd.DataFrame(all_files).sort_values(['File_Timestamp'], ascending=False)
            # NOTE(review): the bin width is fixed at 2 hours and does not
            # follow time_interval - confirm whether the two should be linked.
            df_calls_list['Time_Bin'] = df_calls_list['File_Timestamp'].dt.floor('2h')
            # Subset the DataFrame for only the most recent 2 hours bin
            df_calls_list = df_calls_list[df_calls_list['Time_Bin'] == df_calls_list['Time_Bin'].max()]
            logger.info(f"Files to process for the last 2 hours: {len(df_calls_list)}")

            # Write the DataFrame to GCS, using the staging folder passed in
            # as an argument instead of a notebook-level global.
            csv_path = f"gs://{vai_gcs_bucket}/{gcs_staging_folder}/{pipeline_run_name}_transcripts_to_process.csv"
            df_calls_list.to_csv(csv_path, index=False)
            logger.info(f"Written Transcripts list to GCS: {csv_path}")
            logger.info(f"Completed: listing calls to process Calls#: {len(df_calls_list)}")

            return df_calls_list

        else:
            logger.info(f"0 Files fetched.")
            return pd.DataFrame()

    except Exception as e:
        handle_exception("N/A", vai_gcs_bucket, pipeline_run_name, gcs_errored_folder, str(e))

## Function: Download Transcripts to GCS

In [21]:
def download_transcripts_to_gcs(
    file,
    pipeline_run_name,
    vai_gcs_bucket,
    gcs_staging_folder,
    gcs_errored_folder,
    gcs_transcripts_folder,
    s3_client,
    s3_analysis_bucket
):
    """
    Copy one transcript from S3 to GCS via a short-lived local file.

    The object is downloaded into a private temporary directory, uploaded to
    ``{gcs_transcripts_folder}/<file name>`` in ``vai_gcs_bucket``, and the
    local copy is removed again whether or not the transfer succeeded.

    Args:
        file: S3 key of the transcript to copy.
        pipeline_run_name: Name of the run (used for error reporting).
        vai_gcs_bucket: Destination GCS bucket.
        gcs_staging_folder: Not used by this function; kept so the call
            signature stays unchanged.
        gcs_errored_folder: GCS folder of the error CSV (error reporting).
        gcs_transcripts_folder: Destination folder inside the GCS bucket.
        s3_client: S3 client exposing ``download_file(bucket, key, path)``.
        s3_analysis_bucket: Source S3 bucket.

    Returns:
        tuple: ``(file, None)`` on success, ``(None, file)`` on failure. On
        failure the error is logged and recorded via ``handle_exception``.
    """
    file_name = file.split('/')[-1]
    gcs_blob_path = f"{gcs_transcripts_folder}/{file_name}"

    try:
        # Created inside the try so that a client failure is attributed to
        # this file instead of escaping from the worker thread.
        gcs_bucket = storage.Client().bucket(vai_gcs_bucket)

        # A private directory per call: the local copy is always cleaned up,
        # and concurrent downloads of files sharing a base name cannot collide.
        with tempfile.TemporaryDirectory() as scratch_dir:
            local_file_path = os.path.join(scratch_dir, file_name)

            # Download file from S3
            s3_client.download_file(s3_analysis_bucket, file, local_file_path)

            # Upload to GCS
            blob = gcs_bucket.blob(gcs_blob_path)
            blob.upload_from_filename(local_file_path, checksum=None)

        return file, None

    except Exception as e:
        logger.error(f"Error: Failed to process {file} -> {str(e)}")
        handle_exception(file, vai_gcs_bucket, pipeline_run_name, gcs_errored_folder, str(e))
        return None, file

# Main Function

In [None]:
# ========================================================
# Variables
# ========================================================
# NOTE(review): pipeline_run_name, project_id, secret_id and version_id are
# not assigned anywhere in the visible cells - presumably injected as
# notebook/pipeline parameters; confirm they exist on a fresh kernel.
log_file = f"{pipeline_run_name}.logs"
logger = setup_logger(log_file)

logger.info("============================================================================")
logger.info("COMPONENT: Fetch Transcripts from S3 into GCS.")
logger.info("============================================================================")

# Fetch Configs
configs = fetch_secrets(
    project_id,
    secret_id,
    version_id
)

time_interval = 2  # hours subtracted from "now" to get the listing threshold
vai_gcs_bucket = configs.get("VAI_GCP_PIPELINE_BUCKET")
aws_access_key = configs.get("VAI_AWS_ACCESS_KEY")
aws_secret_key = configs.get("VAI_AWS_SECRET_KEY")
s3_analysis_bucket = configs.get("VAI_S3_ANALYSIS_BUCKET")
s3_transcripts_location = configs.get("VAI_S3_TRANSCRIPTS_LOCATION")

# Generate required GCS folder paths
gcs_folders = generate_gcs_folders(pipeline_run_name, vai_gcs_bucket)

# generate_gcs_folders returns None when it fails; stop with a clear message
# instead of an opaque "'NoneType' object is not subscriptable".
if gcs_folders is None:
    raise RuntimeError("Could not create the GCS pipeline folders; see the logged error for details.")

gcs_staging_folder = gcs_folders["gcs_staging_folder"]
gcs_transcripts_folder = gcs_folders["gcs_transcripts_folder"]
gcs_errored_folder = gcs_folders["gcs_errored_folder"]
gcs_logs_folder = gcs_folders["gcs_logs_folder"]

# Generate S3 Prefix
s3_prefix = generate_s3_folder_prefix(
    pipeline_run_name, vai_gcs_bucket, gcs_errored_folder
)

# ========================================================
# Fetch Calls List from S3
# ========================================================
df_calls_list = get_list_calls_to_process(
    pipeline_run_name,
    vai_gcs_bucket,
    gcs_staging_folder,
    gcs_errored_folder,
    aws_access_key,
    aws_secret_key,
    s3_analysis_bucket,
    s3_transcripts_location,
    s3_prefix,
    time_interval
)

# get_list_calls_to_process returns None when listing failed (the error has
# already been logged and recorded); treat that as "nothing to download" so
# the run log below is still uploaded.
call_count = len(df_calls_list) if df_calls_list is not None else 0

if call_count > 0:
    files_list = df_calls_list.File.to_list()
    s3_client = boto3.client(
        "s3", aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key
    )

    success_downloads = []
    failed_downloads = []

    # Start Multithreaded Download
    with ThreadPoolExecutor(max_workers=5) as executor:
        logger.info(f"Started: Bulk download to GCS transcripts#: {call_count}")

        # Map every future back to its S3 key so a crash in a worker can be
        # attributed to the right file.
        future_to_file = {
            executor.submit(
                download_transcripts_to_gcs,
                file,
                pipeline_run_name,
                vai_gcs_bucket,
                gcs_staging_folder,
                gcs_errored_folder,
                gcs_transcripts_folder,
                s3_client,
                s3_analysis_bucket
            ): file for file in files_list
        }

        for future in as_completed(future_to_file):
            try:
                success, failed = future.result()  # Get results

                if success:
                    success_downloads.append(success)
                if failed:
                    failed_downloads.append(failed)

            except Exception as e:
                # The worker raised instead of returning: record the error
                # against the actual file and count it as a failure.
                crashed_file = future_to_file[future]
                logger.error(f"Unexpected Error: {str(e)}")
                handle_exception(crashed_file, vai_gcs_bucket, pipeline_run_name, gcs_errored_folder, str(e))
                failed_downloads.append(crashed_file)

    logger.info(
        f"Completed: Bulk download to GCS transcripts, "
        f"Success#: {len(success_downloads)}, Failed#: {len(failed_downloads)}"
    )

else:
    logger.info("No Calls to Process.")

# Upload the local run log to the run's Logs folder in GCS
gcs_bucket = storage.Client().bucket(vai_gcs_bucket)
blob = gcs_bucket.blob(f"{gcs_logs_folder}/{log_file}")
blob.upload_from_filename(log_file, checksum=None)