# Imports

In [1]:
import kfp
from kfp import dsl, compiler, components
from kfp.dsl import component

import json
import boto3
import logging
import pandas as pd
from datetime import datetime, timedelta, timezone

from google.cloud import storage
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs
from google.cloud import secretmanager

# Variables

In [2]:
def access_secret(project_id, secret_id, version_id="latest"):
    """
    Access a secret from Google Secret Manager
    
    Args:
        project_id: Your Google Cloud project ID
        secret_id: The ID of the secret to access
        version_id: The version of the secret (default: "latest")
    
    Returns:
        The secret payload as a string
    """
    # Create the Secret Manager client
    client = secretmanager.SecretManagerServiceClient()
    
    # Build the resource name of the secret version
    name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"
    
    # Access the secret version
    response = client.access_secret_version(request={"name": name})
    
    # Decode and parse the JSON payload
    secret_payload = response.payload.data.decode("UTF-8")
    
    try:
        return json.loads(secret_payload)  # Convert string to JSON
    except json.JSONDecodeError:
        raise ValueError("The secret payload is not a valid JSON")

In [3]:
project_id = "dev-posigen"
secret_id = "dev-cx-voiceai"
version_id="1"
configs = access_secret(project_id, secret_id)
# configs

In [4]:
max_objects = 1

# Generate timestamp in a Vertex-compatible format
TIMESTAMP = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S")

VAI_GCP_PROJECT_ID = configs.get("VAI_GCP_PROJECT_ID")
VAI_GCP_PROJECT_LOCATION = configs.get("VAI_GCP_PROJECT_LOCATION")
VAI_GCP_PIPELINE_BUCKET = configs.get("VAI_GCP_PIPELINE_BUCKET")

VAI_GCP_PIPELINE_NAME = "fetch_file_from_s3"
VAI_GCP_PIPELINE_RUN_NAME = f"{configs.get("VAI_GCP_PIPELINE_NAME")}-{TIMESTAMP}"
GCP_PIPELINE_ROOT = f"gs://{VAI_GCP_PIPELINE_BUCKET}/{VAI_GCP_PIPELINE_RUN_NAME}"

VAI_AWS_ACCESS_KEY = configs.get("VAI_AWS_ACCESS_KEY")
VAI_AWS_SECRET_KEY = configs.get("VAI_AWS_SECRET_KEY")

VAI_S3_ANALYSIS_BUCKET = configs.get("VAI_S3_ANALYSIS_BUCKET")
VAI_S3_TRANSCRIPTS_LOCATION = configs.get("VAI_S3_TRANSCRIPTS_LOCATION")

# Set up Logging

In [5]:
# Set up logging
def setup_logger():
    """Set up a logger for the pipeline run."""
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    return logging.getLogger(__name__)

logger = setup_logger()

# Set up Error Handling

In [6]:
def handle_exception(
    file_id,
    vai_gcs_bucket,
    run_folder,
    error_folder,
    error_message
):
    """
    Logs the error, appends the file_id to error tracking CSV, and triggers a notification.
    """
    try:
        error_df_path = f"{error_folder}/{run_folder}_errors.csv"

        logger.error(f"Error processing file {file_id}: {error_message}")

        gcs_client = storage.Client()
        bucket = gcs_client.bucket(vai_gcs_bucket)
        blob = bucket.blob(error_df_path)

        if blob.exists():
            error_df = pd.read_csv(f"gs://{vai_gcs_bucket}/{error_df_path}")
        else:
            error_df = pd.DataFrame(columns=["File_ID", "Error_Message"])

        error_df = pd.concat([error_df, pd.DataFrame([{"File_ID": file_id, "Error_Message": error_message}])], ignore_index=True)
        error_df.to_csv(f"gs://{vai_gcs_bucket}/{error_df_path}", index=False)
        logger.info(f"Logged error for file {file_id} in {error_df_path}")

    except Exception as e:
        logger.error(f"Failed to write to error tracking file: {e}")

# Component: Listing new Transcripts

In [73]:
# @dsl.component(
#     base_image=f"us-central1-docker.pkg.dev/dev-posigen/dev-voice-ai/voice-ai-docker-image:latest"
# )
# def list_s3_files_to_gcs(
#     pipeline_run_name: str,
#     aws_access_key: str,
#     aws_secret_key: str,
#     s3_analysis_bucket: str,
#     s3_transcript_location: str,
#     vai_gcs_bucket: str,  
#     max_objects: int
# ):
"""
Fetch audio file from S3 and return it as a BytesIO object
"""
import boto3
import pandas as pd
import logging
from google.cloud import storage
from datetime import datetime, timedelta

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    # Generate pipeline folder paths
    staging_folder = f"{pipeline_run_name}/Stagging"
    transcripts_folder = f"{staging_folder}/Transcripts"
    max_objects = f"{pipeline_run_name}/Errored"

    # Initialize GCS Client
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(vai_gcs_bucket)

    # Create empty folders directly
    for folder in [staging_folder, transcripts_folder, max_objects]:
        blob = bucket.blob(f"{folder}/")
        blob.upload_from_string("", content_type="application/x-www-form-urlencoded")

    logging.info(f"Created folders: {staging_folder}, {transcripts_folder}, and {max_objects} in GCS.")

    # Initialize S3 Client
    s3_client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key
    )

    all_files = []
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=s3_analysis_bucket, Prefix=s3_transcript_location)

    # Get current timestamp and calculate 2-hour window
    current_time = datetime.utcnow()
    time_threshold = current_time - timedelta(hours=2)

    for page in pages:
        for obj in page.get('Contents', []):
            file_path = obj['Key']

            # Skip non-JSON files
            if not file_path.endswith('.json'):
                continue

            call_id = file_path.split('/')[-1].split("_analysis_")[0]

            # Extract timestamp from filename
            try:
                call_timestamp = pd.to_datetime(file_path.split('analysis_')[-1].split('.')[0].replace('Z', ""))
            except Exception as e:
                logger.warning(f"Skipping file {file_path} due to timestamp parsing error: {e}")
                continue

            # Check if the file falls within the last 2 hours
            if call_timestamp >= time_threshold:
                all_files.append({
                    'File': file_path,
                    'ID': call_id,
                    'File_Timestamp': call_timestamp,
                    'File_Date': call_timestamp.date().strftime('%Y-%m-%d'),
                    'File_Time': call_timestamp.time().strftime('%H:%M:%S')
                })

    if all_files:
        df_calls_list = pd.DataFrame(all_files).sort_values(['File_Timestamp'], ascending=False)
    else:
        df_calls_list = pd.DataFrame()

    # Write the DataFrame to GCS
    csv_path = f"gs://{vai_gcs_bucket}/{staging_folder}/{pipeline_run_name}_s3_Transcripts_fetched.csv"
    df_calls_list.to_csv(csv_path, index=False)
    logger.info(f"Written Transcripts list to GCS: {csv_path}")

    # Bulk Download filtered files
    if not df_calls_list.empty:
        for _, row in df_calls_list.iterrows():
            file_key = row['File']
            local_file_path = f"/tmp/{file_key.split('/')[-1]}"  # Temporary local storage
            gcs_blob_path = f"{transcripts_folder}/{file_key.split('/')[-1]}"

            try:
                # Download file from S3
                s3_client.download_file(s3_analysis_bucket, file_key, local_file_path)

                # Upload to GCS
                blob = bucket.blob(gcs_blob_path)
                blob.upload_from_filename(local_file_path)
                logger.info(f"Downloaded and uploaded: {file_key} -> {gcs_blob_path}")

            except Exception as e:
                logger.error(f"Failed to download {file_key}: {e}")

except Exception as e:
    handle_exception("N/A", vai_gcs_bucket, pipeline_run_name, max_objects, str(e))

# Component: Process Audio Files

### Initiate Master Dataframes

In [74]:
def initiate_master_dataframes(
    stagging_folder,
    bucket_name
):
    """
    Checks for 'df_intra_calls_data.csv' and 'df_inter_calls_data.csv' in GCS.
    If files exist, loads them into Pandas DataFrames; otherwise, creates empty DataFrames.
    """
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    # Define file paths in GCS
    intra_calls_path = f"{stagging_folder}/df_intra_calls_data.csv"
    inter_calls_path = f"{stagging_folder}/df_inter_calls_data.csv"

    # Load or create intra_calls DataFrame
    if storage.Blob(bucket=bucket, name=stagging_folder).exists(client):
        logger.info(f"df_intra_calls_data.csv exists in GCS.")
        blob = bucket.blob(intra_calls_path)
        df_intra_calls_data = pd.read_csv(blob.open("r"))
        df_intra_calls_data["contact_id"] = df_intra_calls_data["contact_id"].astype('string')
    else:
        logger.info(f"df_intra_calls_data.csv does not exist in GCS. Creating an empty DataFrame.")
        df_intra_calls_data = pd.DataFrame()

    # Load or create inter_calls DataFrame
    if storage.Blob(bucket=bucket, name=stagging_folder).exists(client):
        logger.info(f"df_inter_calls_data.csv exists in GCS.")
        blob = bucket.blob(inter_calls_path)
        df_inter_calls_data = pd.read_csv(blob.open("r"))
        df_inter_calls_data["contact_id"] = df_inter_calls_data["contact_id"].astype('string')
    else:
        logger.info(f"df_inter_calls_data.csv does not exist in GCS. Creating an empty DataFrame.")
        df_inter_calls_data = pd.DataFrame()

    return df_intra_calls_data, df_inter_calls_data

### Fetch Transcript from S3

In [75]:
def fetch_transcript_from_s3(
    contact_id,
    aws_access_key,
    aws_secret_key,
    s3_analysis_bucket,
    file_key
):
    """
    Read Transcript JSON content from a specific file in S3.
    
    :param bucket_name: Name of the S3 bucket
    :param file_key: Full path/key of the JSON file
    :return: Parsed JSON content
    """
    s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key
        )
    
    try:
        # Download the file
        response = s3_client.get_object(Bucket=s3_analysis_bucket, Key=file_key)    
        # Read the content
        json_content = response['Body'].read().decode('utf-8')    
        # Parse JSON
        return json.loads(json_content)
    
    except Exception as e:
        handle_exception(contact_id, vai_gcs_bucket, pipeline_run_name, max_objects, str(e))

### Create Intra Call DataFrame

In [76]:
def mask_pii_in_captions(
    contact_id,
    df,
    project_id
):
    """
    Masks PII data in the 'caption' column of a pandas DataFrame using Google Cloud DLP API.
    
    Args:
        df (pandas.DataFrame): DataFrame with a 'caption' column to process
        project_id (str): Your Google Cloud project ID
        location_id (str, optional): GCP location ID. Defaults to "global".
        
    Returns:
        pandas.DataFrame: DataFrame with masked PII in the 'caption' column
    """
    logger.info(f"{contact_id}: Masking PII Data")
    # Create a copy of the DataFrame to avoid modifying the original
    masked_df = df.copy()
    
    # Concatenate all captions for bulk processing
    all_captions = "\n".join(df['caption'].astype(str).tolist())
    
    # Initialize DLP client
    dlp_client = dlp_v2.DlpServiceClient()
    
    # Specify the parent resource name
    parent = f"projects/{project_id}"
    
    # Configure inspection config
    inspect_config = {
        "info_types": [
            {"name": "CREDIT_CARD_NUMBER"},
            {"name": "STREET_ADDRESS"},
            {"name": "IP_ADDRESS"},
            {"name": "ORGANIZATION_NAME"}
        ],
        # Custom exclusion rules
        "rule_set": [
            {
                "info_types": [{"name": "ORGANIZATION_NAME"}],
                "rules": [
                    {
                        "exclusion_rule": {
                            "dictionary": {
                                "word_list": {
                                    "words": ["posigen", "Posigen", "PosiGen", "POSIGEN"]
                                }
                            },
                            "matching_type": dlp_v2.MatchingType.MATCHING_TYPE_FULL_MATCH,
                        }
                    }
                ]
            }
        ]
    }
    
    # Configure deidentification
    # Important: Use different transformations for different info types
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                # Mask credit cards, addresses, IPs, organizations (except posigen)
                {
                    "info_types": [
                        {"name": "CREDIT_CARD_NUMBER"},
                        {"name": "STREET_ADDRESS"},
                        {"name": "IP_ADDRESS"},
                        {"name": "ORGANIZATION_NAME"}
                    ],
                    "primitive_transformation": {
                        "character_mask_config": {
                            "masking_character": "*",
                            "number_to_mask": 100  # Mask all characters
                        }
                    }
                }
            ]
        }
    }
    
    # Create deidentify request
    item = {"value": all_captions}
    
    # Call the DLP API
    response = dlp_client.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": item,
        }
    )
    
    # Split the processed content back into separate captions
    processed_captions = response.item.value.split("\n")
    
    # Additional processing to mask all digits with asterisks
    # (while preserving the DLP API masking for specific PII types)
    def mask_digits(text):
        # Function to replace digits with asterisks, preserving DLP masked content
        return re.sub(r'(?<!\*)\d(?!\*)', '*', text)
    
    # Apply the digit masking function to each processed caption
    masked_captions = [mask_digits(caption) for caption in processed_captions]
    
    # Update the DataFrame with masked captions
    masked_df['caption'] = masked_captions[:len(masked_df)]
    
    logger.info(f"{contact_id}: Completed Masking PII Data")
    return masked_df

    
def get_sentiment_label(row):
    # Check conditions in order of priority (Positive > Negative > Neutral)
    if row['positive'] > row['negative'] and row['positive'] > row['neutral']:
        return 'Positive'
    elif row['negative'] > row['positive'] and row['negative'] > row['neutral']:
        return 'Negative'
    else:
        return 'Neutral'

def get_different_times(intra_call):
    # Apply formatting to both time columns
    intra_call['start_time_second'] = (intra_call['Begin_Offset'] / 1000).astype(int)
    intra_call['end_time_second'] = (intra_call['End_Offset'] / 1000).astype(int)
    intra_call['time_spoken_second'] = intra_call['end_time_second'] - intra_call['start_time_second']
    intra_call['time_spoken_second'] = intra_call['time_spoken_second'].where(intra_call['time_spoken_second'] >= 0, 0)
    intra_call['time_spoken_second'] = intra_call['time_spoken_second'].fillna(0).astype(int)
    intra_call['time_silence_second'] = intra_call['start_time_second'].shift(-1) - intra_call['end_time_second']
    intra_call['time_silence_second'] = intra_call['time_silence_second'].where(intra_call['time_silence_second'] >= 0, 0)
    intra_call['time_silence_second'] = intra_call['time_silence_second'].fillna(0).astype(int)
    intra_call['load_date'] = datetime.now()

    # Dropping time formatted columns
    intra_call = intra_call.drop(['Begin_Offset', 'End_Offset'], axis=1)

    return intra_call

def get_sentiment_scores(
    contact_id,
    text_list
):
    logger.info(f"{contact_id}: Calculating Caption Sentiments.")
    dict_sentiments = []
    for text in text_list:
        encoded_input = tokenizer(text, return_tensors='pt')
        output = model_sentiment(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = np.round(np.multiply(softmax(scores), 100), 2)
        merged_dict = dict(zip(list(config.id2label.values()), list(scores)))
        dict_sentiments.append(merged_dict)

    df_dict_sentiments = pd.DataFrame(dict_sentiments)
    df_dict_sentiments['sentiment_lable'] = df_dict_sentiments[['positive','negative','neutral']].apply(get_sentiment_label, axis=1)
    logger.info(f"{contact_id}: Completed calculating Caption Sentiments.")
    
    return df_dict_sentiments

def process_transcript(
    contact_id,
    transcript_data,
    tokenizer
):
    """
    Pre-process the transcript loaded from S3 Buckets:
    1. Load the transcript as Pandas Dataframe.
    2. Select only the necessary columns ['BeginOffsetMillis', 'EndOffsetMillis', 'ParticipantId', 'Content', 'Sentiment', 'LoudnessScore'].
    3. Format the time in minutes and seconds.
    4. Rename the columns for better understanding.
    """
    logger.info(f"{contact_id}: Loading the Transcript as Pandas Dataframe.")
    transcript_df = pd.json_normalize(transcript_data['Transcript'])

    # Select the relevant Columns
    columns_to_select = [
        'BeginOffsetMillis',
        'EndOffsetMillis',
        'ParticipantId',
        'Content'
    ]
    formatted_df = transcript_df[columns_to_select].copy()
    
    # Optionally rename columns to reflect their new format
    formatted_df = formatted_df.rename(columns={
        'BeginOffsetMillis': 'Begin_Offset',
        'EndOffsetMillis': 'End_Offset',
        'Content': 'caption',
        'Sentiment': 'sentiment_label',
        'ParticipantId': 'speaker_tag'
    })

    # Inserting the Call ID:
    formatted_df.insert(loc=0, column='contact_id', value=contact_id)
    formatted_df['call_language'] = transcript_data['LanguageCode']

    logger.info(f"{contact_id}: Returning formated DataFrame.")
    return formatted_df
    
def create_intra_call_df(
    contact_id,
    vai_gcs_bucket,
    pipeline_run_name,
    max_objects,
    transcript_data,
    tokenizer
):
    intra_call = process_transcript(contact_id, transcript_data, tokenizer)
    df_sentiment_scores = get_sentiment_scores(contact_id, intra_call.caption.to_list())
    intra_call = pd.concat([intra_call, df_sentiment_scores], axis=1)    
    intra_call = get_different_times(intra_call)
    intra_call = mask_pii_in_captions(contact_id, intra_call, gcp_project_id)
    
    return intra_call

### Create Inter Call Dataframe

In [77]:
def dict_to_newline_string(data):
    """Converts a dictionary into a new-line formatted string."""
    formatted_str = ""
    for key, value in data.items():
        formatted_str += f"{key}:\n"
        for item in value:
            formatted_str += f"  - {item}\n"
    return formatted_str.strip()
    

class CategoryValidator:
    def __init__(self, catsubcat_conn_params, snf_catsubcat_view):
        """
        Initialize with category mapping from a Snowflake View.
        :param snowflake_conn_paramsionary containing Snowflake connection details.
        :param view_name: Name of the Snowflake View containing category mappings.
        """
        self.catsubcat_conn_params = catsubcat_conn_params
        self.snf_catsubcat_view = snf_catsubcat_view
        self.category_mapping = self._fetch_category_mapping_from_snowflake()
        self.valid_categories = set(self.category_mapping['CATEGORY'].unique())
        self.category_subcategory_map = self._create_category_mapping()

    def _fetch_category_mapping_from_snowflake(self):
        """Fetch Category-Subcategory mapping from a Snowflake View and return as DataFrame."""
        try:
            conn = sc.connect(**self.catsubcat_conn_params)
            query = f"SELECT CATEGORY, SUBCATEGORY FROM {self.snf_catsubcat_view}"
            df = pd.read_sql(query, conn)
            conn.close()
            return df
        except Exception as e:
            raise RuntimeError(f"Error fetching category mapping from Snowflake: {e}")

    def _create_category_mapping(self):
        """Create category to subcategory mapping."""
        mapping = {}
        for _, row in self.category_mapping.iterrows():
            if row['CATEGORY'] not in mapping:
                mapping[row['CATEGORY']] = set()
            mapping[row['CATEGORY']].add(row['SUBCATEGORY'])
        return mapping

    def validate_category(self, category):
        """Check if category is valid."""
        return category in self.valid_categories

    def validate_subcategory(self, category, subcategory):
        """Check if subcategory is valid for given category."""
        return category in self.category_subcategory_map and subcategory in self.category_subcategory_map[category]

    def get_valid_subcategories(self, category):
        """Get valid subcategories for a category."""
        return self.category_subcategory_map.get(category, set())

class CallSummary(BaseModel):
    summary: str = Field(..., max_length=500)
    # key_points: List[str] = Field(..., max_items=5)
    # outcome = Field(..., max_length=200)
    # follow_up_recommendations: List[str] = Field(..., max_items=3)

class CallTopic(BaseModel):
    primary_topic: str = Field(..., max_length=100)
    category: str = Field(..., max_length=100)
    sub_category: str = Field(..., max_length=100)

    def validate_category_mapping(self, category_validator: CategoryValidator):
        """Validate category and subcategory against mapping"""
        if not category_validator.validate_category(self.category):
            logger.error(f"Invalid category: {self.category}")
        if not category_validator.validate_subcategory(self.category, self.sub_category):
            logger.error(f"Invalid subcategory '{self.sub_category}' for category '{self.category}'")

class AgentCoaching(BaseModel):
    strengths: List[str] = Field(..., max_items=3)
    improvement_areas: List[str] = Field(..., max_items=3)
    specific_recommendations: List[str] = Field(..., max_items=4)
    skill_development_focus: List[str] = Field(..., max_items=3)

class TranscriptAnalysis(BaseModel):
    call_summary: CallSummary
    call_topic: CallTopic
    agent_coaching: AgentCoaching

class KPIExtractor:
    def __init__(self, project_id, location):
        vertexai.init(project=project_id, location=location)
        self.model = GenerativeModel("gemini-1.5-flash-002")
        self.category_validator = CategoryValidator(catsubcat_conn_params, snf_catsubcat_view)
        
        self.generation_config = {
            "temperature": 0.3,
            "max_output_tokens": 1024,
            "top_p": 0.8,
            "top_k": 40,
            "response_format": "json"
        }
        
        self.safety_settings = {
            generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        }

    def get_categories_prompt(self) -> str:
        """Create prompt section for valid categories and subcategories, handling null values"""
        categories_prompt = []
        
        for category, subcategories in self.category_validator.category_subcategory_map.items():
            if category is None:  # Skip if category is None
                continue
            
            # Ensure subcategories are valid (remove None values)
            valid_subcategories = [subcat for subcat in subcategories if subcat is not None]
            
            if valid_subcategories:
                subcats = ', '.join(sorted(valid_subcategories))
            else:
                subcats = "No defined subcategories"
            
            categories_prompt.append(f"Category '{category}' can have subcategories: {subcats}")
        
        return '\n'.join(categories_prompt)

    def create_prompt(self, transcript):
        """Create structured prompt with category guidance"""
        categories_guidance = self.get_categories_prompt()
        
        return f"""
        Analyze this call transcript and provide a structured analysis in the exact JSON format specified below.
        Keep responses concise, specific, and actionable.

        Guidelines:
        - Call summary should be factual and highlight key interactions
        - Topics and categories MUST match the following valid mappings:
        {categories_guidance}
        - Coaching points should be specific and actionable
        - All responses must follow the exact structure specified
        - Ensure all lists have the specified maximum number of items
        - All text fields must be clear, professional, and free of fluff

        Transcript:
        {transcript}

        Required Output Structure:
        {{
            "call_summary": {{
                "summary": "3-4 line overview of the call"
            }},
            "call_topic": {{
                "primary_topic": "Main topic of discussion",
                "category": "MUST BE ONE OF THE VALID CATEGORIES LISTED ABOVE",
                "sub_category": "MUST BE A VALID SUB-CATEGORY FOR THE CHOSEN CATEGORY"
            }},
            "agent_coaching": {{
                "strengths": ["Strength 1", "Strength 2", "Strength 3"],
                "improvement_areas": ["Area 1", "Area 2", "Area 3"],
                "specific_recommendations": ["Rec 1", "Rec 2", "Rec 3", "Rec 4"],
                "skill_development_focus": ["Skill 1", "Skill 2", "Skill 3"]
            }}
        }}

        Rules:
        1. Maintain exact JSON structure
        2. No additional fields or comments
        3. No markdown formatting
        4. Ensure all arrays have the exact number of items specified
        5. Keep all text concise and professional
        6. Do not mention any PII information such as Customer Name etc.
        7. STRICTLY use only the categories and subcategories from the provided mapping
        """

    def extract_json(self, response):
        """Extract valid JSON from response"""
        match = re.search(r'```json\s*([\s\S]*?)\s*```', response)
        if match:
            json_str = match.group(1)
        else:
            json_str = response.strip()
        
        try:
           return json.loads(json_str)
        except json.JSONDecodeError:
           logger.error("Invalid JSON response")
           pass

    def validate_response(self, response_json, contact_id = None):
        """Validate response using Pydantic models and category mapping"""
        try:
            # First validate basic structure with Pydantic
            analysis = TranscriptAnalysis(**response_json)
            
            # Then validate category mapping
            analysis.call_topic.validate_category_mapping(self.category_validator)
            
            return analysis
        except ValidationError as e:
            logger.error(f"{contact_id if contact_id else ''}: Pydantic validation error - {e}")
            pass
        except ValueError as e:
            logger.error(f"{contact_id if contact_id else ''}: Category validation error - {e}")
            pass

    def extract_genai_kpis(self, transcript, contact_id = None):
        """Extract KPIs from transcript with validation"""
        try:
            # Generate prompt
            prompt = self.create_prompt(transcript)
            
            # Get response from Gemini
            response = self.model.generate_content(
                prompt
                # generation_config=self.generation_config,
                # safety_settings=self.safety_settings
            )

            
            logger.debug(f"Gemini API Response: {response}")
            
            # Parse JSON response
            response_json = self.extract_json(response.text)
            
            # Validate response structure and categories
            validated_response = self.validate_response(response_json, contact_id)
            
            return validated_response.model_dump()
            
        except Exception as e:
            logger.error(f"{contact_id if contact_id else ''}: Error extracting KPIs: {str(e)}")
            pass

    
def create_inter_call_df(
    contact_id,
    vai_gcs_bucket,
    vai_gcs_folder_stagging,
    max_objects,
    pipeline_run_name,
    transcript_data,
    ac_last_modified_date,
    df_intra_call,
    gcp_project_id,
    gcp_project_location,
    snf_account,
    snf_user,
    snf_private_key_file,
    snf_private_key_pwd,
    snf_warehouse,
    snf_catsubcat_databse,
    snf_catsubcat_schema,
    snf_catsubcat_view
):
    try:
        logger.info(f"{contact_id}: Extracting KPIs from Gemini")      
        extractor = KPIExtractor(gcp_project_id, gcp_project_location)
        transcript = " ".join(df_intra_call.caption)
        call_gen_kpis = extractor.extract_genai_kpis(transcript)
        logger.info(f"{contact_id}: Completed Extracting KPIs from Gemini") 
        
        logger.info(f"{contact_id}: Creating Inter Call df")
        inter_call_dict = {}
        inter_call_dict['contact_id'] = str(df_intra_call['contact_id'][0])
        inter_call_dict['call_text'] = " ".join(df_intra_call.caption)
        inter_call_dict['call_summary'] = call_gen_kpis['call_summary']['summary']
        inter_call_dict['topic'] = call_gen_kpis['call_topic']['primary_topic']
        inter_call_dict['category'] = call_gen_kpis['call_topic']['category']
        inter_call_dict['category_generated'] = call_gen_kpis['call_topic']['category']
        inter_call_dict['sub_category'] = call_gen_kpis['call_topic']['sub_category']
        inter_call_dict['sub_category_generated'] = call_gen_kpis['call_topic']['sub_category']
        inter_call_dict['agent_coaching'] = dict_to_newline_string(call_gen_kpis['agent_coaching'])
        df_inter_call = pd.DataFrame(pd.Series(inter_call_dict)).T
        
        logger.info(f"{contact_id}:  Add metadata from AWS")
        df_inter_call['agent_speech_speed'] = transcript_data['ConversationCharacteristics']['TalkSpeed']['DetailsByParticipant']['AGENT']['AverageWordsPerMinute']
        df_inter_call['customer_speech_speed'] = transcript_data['ConversationCharacteristics']['TalkSpeed']['DetailsByParticipant']['CUSTOMER']['AverageWordsPerMinute']
        df_inter_call['total_talktime_agent_second'] = int(transcript_data['ConversationCharacteristics']['TalkTime']['DetailsByParticipant']['AGENT']['TotalTimeMillis']/1000)
        df_inter_call['total_talktime_customer_second'] = int(transcript_data['ConversationCharacteristics']['TalkTime']['DetailsByParticipant']['CUSTOMER']['TotalTimeMillis']/1000)
        df_inter_call['total_talktime_call_second'] = int(transcript_data['ConversationCharacteristics']['TalkTime']['TotalTimeMillis']/1000)
        df_inter_call['total_duration_call_second'] = int(transcript_data['ConversationCharacteristics']['TotalConversationDurationMillis']/1000)
        df_inter_call['total_dead_air_call_second'] = df_inter_call['total_duration_call_second'] - df_inter_call['total_talktime_call_second']
        # df_inter_call['customer_instance_id'] = transcript_data['CustomerMetadata']['InstanceId']
        # df_inter_call['call_job_status'] = transcript_data['JobStatus']
        df_inter_call['call_language'] = transcript_data['LanguageCode']
        df_inter_call['call_s3_uri'] = transcript_data['CustomerMetadata']['InputS3Uri']
        df_inter_call['ac_last_modified_date'] = ac_last_modified_date
        logger.info(f"{contact_id}: Completed creating Inter Call df")
        
        return df_inter_call

    except Exception as e:
        handle_exception(contact_id, vai_gcs_bucket, pipeline_run_name, max_objects, str(e))

### Run Process Audio Files

In [78]:
@dsl.component(
    base_image=f"us-central1-docker.pkg.dev/dev-posigen/dev-voice-ai/voice-ai-docker-image:latest"
)
def process_files(
    pipeline_run_name: str,
    vai_gcs_bucket: str,
    vai_gcs_folder_stagging: str,
    vai_gcs_folder_errored: str,
    gcp_project_id: str,
    gcp_project_location: str,
    snf_account: str,
    snf_user: str,
    snf_private_key_file: str,
    snf_private_key_pwd: str,
    snf_warehouse: str,
    snf_catsubcat_databse: str,
    snf_catsubcat_schema: str,
    snf_catsubcat_view: str,
    max_objects: int
):
    """
    Reads CSV from staging, processes files, and writes processed data back.
    """
    import pandas as pd
    import numpy as np
    from scipy.special import softmax
    import logging
    import re, os
    from datetime import datetime
    from typing import List, Dict
    
    from pydantic import BaseModel, Field, ValidationError
    
    from google.cloud import storage
    from google.cloud import dlp_v2
    import vertexai
    import vertexai.preview.generative_models as generative_models
    from vertexai.generative_models import GenerativeModel, GenerationConfig, Part
    
    # Sentiments
    from transformers import pipeline
    from transformers import AutoTokenizer, AutoConfig
    from transformers import AutoModelForSequenceClassification
    
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    
    # Initialize GCS Client
    storage_client = storage.Client()
    
    # Download the private key from GCS
    bucket_name, blob_name = snf_private_key_file.replace("gs://", "").split("/", 1)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    private_key_path = "/tmp/snowflake_key.p8"  # Local temp storage
    blob.download_to_filename(private_key_path)
    
    catsubcat_conn_params = {
        'account': snf_account,
        'user': snf_user,
        'private_key_file': private_key_path,
        'private_key_file_pwd': snf_private_key_pwd,
        'warehouse': snf_warehouse,
        'database': snf_catsubcat_databse,
        'schema': snf_catsubcat_schema
    }
    
    MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    config = AutoConfig.from_pretrained(MODEL)
    model_sentiment = AutoModelForSequenceClassification.from_pretrained(MODEL)
    
    stagging_folder = f"gs://{vai_gcs_bucket}/{pipeline_run_name}/{pipeline_run_name}/{vai_gcs_folder_stagging}"
    logger.info(f"Setting Stagging Folder: {stagging_folder}")
    
    logger.info("Initiating Master DataFrames: df_intra_calls_data, df_inter_calls_data")
    df_intra_calls_data, df_inter_calls_data = initiate_master_dataframes(stagging_folder, vai_gcs_bucket)
    
    s3_Transcripts_fetched = f"gs://{vai_gcs_bucket}/{pipeline_run_name}/{vai_gcs_folder_stagging}/{pipeline_run_name}_s3_Transcripts_fetched.csv"
    transcripts = pd.read_csv(s3_Transcripts_fetched)
    
    logger.info("--------------")
    logger.info(f"Transcripts to process: {transcripts.shape[0]}")
    logger.info("--------------")
    
    transcript = transcripts.File[0]
    for transcript in transcripts.File[:max_objects]:
        try:
            transcript = transcripts.File[0]
            contact_id = transcript.split('/')[-1].split('.')[0].split('analysis')[0].strip('_')
            ac_last_modified_date = datetime.strptime(transcript.split('analysis_')[-1].split('.')[0].replace('_', ':'), '%Y-%m-%dT%H:%M:%SZ')
            
            if (len(df_intra_calls_data) > 0 and contact_id in df_intra_calls_data.CONTACT_ID.unique()) and (len(df_inter_calls_data) > 0 and contact_id in df_inter_calls_data.CONTACT_ID.unique()):
                logger.info(f"{contact_id}: Call already Processed.")
                logger.info("")
                logger.info("")
            else:
                logger.info(f"{contact_id}: Processing")
            
                logger.info(f"{contact_id}: Fetching Transcript from S3")
                transcript_data = fetch_transcript_from_s3(contact_id, aws_access_key, aws_secret_key, s3_analysis_bucket, transcript)
                logger.info(f"{contact_id}: Completed fetching Transcript from S3")
            
                logger.info(f"{contact_id}: Creating df_intra_call ")
                df_intra_call = create_intra_call_df(
                    contact_id,
                    vai_gcs_bucket,
                    pipeline_run_name,
                    max_objects,
                    transcript_data,
                    tokenizer
                )
                logger.info(f"{contact_id}: Successfully created df_intra_call ")
            
                logger.info(f"{contact_id}: Creating df_inter_call ")
                df_inter_call = create_inter_call_df(
                    contact_id,
                    vai_gcs_bucket,
                    vai_gcs_folder_stagging,
                    max_objects,
                    pipeline_run_name,
                    transcript_data,
                    ac_last_modified_date,
                    df_intra_call,
                    gcp_project_id,
                    gcp_project_location,
                    snf_account,
                    snf_user,
                    snf_private_key_file,
                    snf_private_key_pwd,
                    snf_warehouse,
                    snf_catsubcat_databse,
                    snf_catsubcat_schema,
                    snf_catsubcat_view
                )
                logger.info(f"{contact_id}: Successfully created df_inter_call ")
    
    
                ###============================================================###
                if not df_intra_call.empty and not df_inter_call.empty:
                    df_intra_call.columns = df_intra_call.columns.str.upper()  # Capitalising Column names for Snowflake
                    df_intra_calls_data = pd.concat([df_intra_calls_data, df_intra_call], ignore_index=True)
                    df_intra_calls_data.to_csv(f"{stagging_folder}/df_intra_calls_data.csv", index=False)
                    logger.info(f"{contact_id}: Persisted df_intra_calls_data to CSV.")
                    
                    # Appending to Inter-calls Master DataFrame
                    df_inter_call.columns = df_inter_call.columns.str.upper()  # Capitalising Column names for Snowflake
                    df_inter_calls_data = pd.concat([df_inter_calls_data, df_inter_call], ignore_index=True)
                    df_inter_calls_data.to_csv(f"{stagging_folder}/df_inter_calls_data.csv", index=False)
                    logger.info(f"{contact_id}: Persisted df_intra_calls_data to CSV.")
                    logger.info(f"{contact_id}: Processing Complete")
                    logger.info("")
                    logger.info("")
        
        except Exception as e:
            logger.error("--------------------------")
            handle_exception(contact_id, vai_gcs_bucket, pipeline_run_name, max_objects, str(e))
            logger.error("--------------------------")
            continue

# Component: Write Data to Snowflake

# Define the Pipeline

In [79]:
@dsl.pipeline(
    name=VAI_GCP_PIPELINE_RUN_NAME,
    description="Proces Amazon Audio Transcripts to KPIs"
)
def vai_audio_to_kpi_pipeline(
    pipeline_run_name: str,
    aws_access_key: str,
    aws_secret_key: str,
    s3_analysis_bucket: str,
    s3_transcript_location: str,
    vai_gcs_bucket: str,
    max_objects: int
):
    fetch_transcripts = list_s3_files_to_gcs(
        pipeline_run_name=VAI_GCP_PIPELINE_RUN_NAME,
        aws_access_key=VAI_AWS_ACCESS_KEY,
        aws_secret_key=VAI_AWS_SECRET_KEY,
        s3_analysis_bucket=VAI_S3_ANALYSIS_BUCKET,
        s3_transcript_location=VAI_S3_TRANSCRIPTS_LOCATION,
        vai_gcs_bucket=VAI_GCP_PIPELINE_BUCKET,
        max_objects=max_objects
    )

ValueError: Constant argument inputs must be one of type ['String', 'Integer', 'Float', 'Boolean', 'List', 'Dict'] Got: None of type <class 'NoneType'>.

# Compile the Pipeline

In [None]:
compiler.Compiler().compile(vai_audio_to_kpi_pipeline, f'{VAI_GCP_PIPELINE_NAME}.yaml')

# Run the Pipeline 

In [None]:
# Initialize Vertex AI
aiplatform.init(project=VAI_GCP_PROJECT_ID, location=VAI_GCP_PROJECT_LOCATION)

max_objects = 1

# # Create pipeline job
# job = aiplatform.PipelineJob(
#     display_name = f"{VAI_GCP_PIPELINE_RUN_NAME}".lower(),
#     job_id = f"vai-pipeline-run--{TIMESTAMP}".lower(),
#     template_path = f"{VAI_GCP_PIPELINE_NAME}.yaml",
#     pipeline_root = f"gs://{VAI_GCP_PIPELINE_BUCKET}",
#     project = VAI_GCP_PROJECT_ID,
#     location = VAI_GCP_PROJECT_LOCATION,
#     enable_caching = False,
#     parameter_values={
#         "pipeline_run_name": VAI_GCP_PIPELINE_RUN_NAME,
#         "vai_gcs_bucket": VAI_GCP_PIPELINE_BUCKET,
#         "vai_gcs_folder_stagging": VAI_GCP_PIPELINE_FOLDER_STAGGING,
#         "vai_gcs_folder_errored": VAI_GCP_PIPELINE_FOLDER_ERRORED,
#         "gcp_project_id": VAI_GCP_PROJECT_ID,
#         "gcp_project_location": VAI_GCP_PROJECT_LOCATION,
        
#         "aws_access_key":VAI_AWS_ACCESS_KEY,
#         "aws_secret_key":VAI_AWS_SECRET_KEY,
#         "s3_analysis_bucket":VAI_S3_ANALYSIS_BUCKET,
#         "s3_transcript_location":VAI_S3_TRANSCRIPTS_LOCATION,

#         "snf_account": VAI_SNF_ACCOUNT,
#         "snf_user": VAI_SNF_USER,
#         "snf_private_key_file": VAI_SNF_PRIVATE_KEY_FILE,
#         "snf_private_key_pwd": VAI_SNF_PRIVATE_KEY_PWD,
#         "snf_warehouse": VAI_SNF_WAREHOUS,
#         "snf_catsubcat_databse": VAI_SNF_CATSUBCAT_DATABASE,
#         "snf_catsubcat_schema": VAI_SNF_CATSUBCAT_SCHEMA,
#         "snf_catsubcat_view": VAI_SNF_CATSUBCAT_VIEW,
        
#         "max_objects":max_objects
#     }
# )
# Create pipeline job
job = pipeline_jobs.PipelineJob(
    display_name = f"{VAI_GCP_PIPELINE_RUN_NAME}".lower(),
    job_id = f"vai-pipeline-run-{TIMESTAMP}".lower(),
    template_path = f"{VAI_GCP_PIPELINE_NAME}.yaml",
    pipeline_root = f"gs://{VAI_GCP_PIPELINE_BUCKET}",
    project = VAI_GCP_PROJECT_ID,
    location = VAI_GCP_PROJECT_LOCATION,
    enable_caching = False,
    parameter_values={
        "pipeline_run_name":VAI_GCP_PIPELINE_RUN_NAME,
        "aws_access_key":VAI_AWS_ACCESS_KEY,
        "aws_secret_key":VAI_AWS_SECRET_KEY,
        "s3_analysis_bucket":VAI_S3_ANALYSIS_BUCKET,
        "s3_transcript_location":VAI_S3_TRANSCRIPTS_LOCATION,
        "vai_gcs_bucket":VAI_GCP_PIPELINE_BUCKET,
        "max_objects":max_objects
    }
)

In [None]:
job.run()