# Variables

In [1]:
# from kfp import dsl
# from kfp.v2 import compiler
from google.cloud import aiplatform
from typing import List, Dict
import boto3, logging
from botocore.exceptions import ClientError
import json, os, ast, re
from datetime import datetime, timedelta
import pandas as pd, numpy as np
from scipy.special import softmax
from pydantic import BaseModel, Field, ValidationError

# import scrubadub, scrubadub_spacy

import snowflake.connector as sc
from snowflake.connector.pandas_tools import write_pandas

import vertexai
import vertexai.preview.generative_models as generative_models
from vertexai.generative_models import GenerativeModel, GenerationConfig, Part

# # Sentiments
# from transformers import pipeline
# from transformers import AutoTokenizer, AutoConfig
# from transformers import AutoModelForSequenceClassification

In [8]:
# Temporary secrets manager
with open("configs.json", 'r') as secrets_file:
    configs = json.load(secrets_file)
aws_access_key = configs.get("aws_access_key")
aws_secret_key = configs.get("aws_secret_key")

# AWS
s3_source_bucket = configs.get('s3_source_bucket')
s3_transcripts_location = configs.get('s3_transcripts_location')

# GCP
gcp_project_id=configs.get('gcp_project_id')
gcp_prjct_location=configs.get('gcp_prjct_location')

# Snowflake
private_key_file = configs.get('snowflakegcp_rsa_key')
private_key_file_pwd = configs.get('snf_ssh_key_pass')

conn_params = {
    'account': configs.get('snf_account'),
    'user': configs.get('snf_user'),
    'private_key_file': configs.get('snf_private_key_file'),
    'private_key_file_pwd':configs.get('snf_private_key_pwd'),
    'warehouse': configs.get('snf_warehouse'),
    'database': configs.get('snf_database'),
    'schema': configs.get('snf_schema')
}

# # # Sentiment Scores
# MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
# tokenizer = AutoTokenizer.from_pretrained(MODEL)
# config = AutoConfig.from_pretrained(MODEL)
# model_sentiment = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Gemini Prompt

In [85]:
# Pydantic models for validation
class CallSummary(BaseModel):
    summary: str = Field(..., max_length=500)
    key_points: List[str] = Field(..., max_items=5)
    outcome: str = Field(..., max_length=200)
    follow_up_recommendations: List[str] = Field(..., max_items=3)

class CallTopic(BaseModel):
    primary_topic: str = Field(..., max_length=100)
    category: str = Field(..., max_length=100)
    sub_category: str = Field(..., max_length=100)

class AgentCoaching(BaseModel):
    strengths: List[str] = Field(..., max_items=3)
    improvement_areas: List[str] = Field(..., max_items=3)
    specific_recommendations: List[str] = Field(..., max_items=4)
    skill_development_focus: List[str] = Field(..., max_items=3)

class TranscriptAnalysis(BaseModel):
    call_summary: CallSummary
    call_topic: CallTopic
    agent_coaching: AgentCoaching

class KPIExtractor:
    def __init__(self, project_id: str, location: str):
        vertexai.init(project=project_id, location=location)
        self.model = GenerativeModel("gemini-1.5-flash-002")
        self.generation_config = {
            "temperature": 0.3,
            "max_output_tokens": 1024,
            "top_p": 0.8,
            "top_k": 40,
            "response_format": "json"
        }
        self.safety_settings = {
            generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        }
        
    def create_prompt(self, transcript: str) -> str:
        """Create a structured prompt for KPI extraction"""
        return f"""
        Analyze this call transcript and provide a structured analysis in the exact JSON format specified below.
        Keep responses concise, specific, and actionable.

        Guidelines:
        - Call summary should be factual and highlight key interactions
        - Topics and categories should match standard business taxonomies
        - Coaching points should be specific and actionable
        - All responses must follow the exact structure specified
        - Ensure all lists have the specified maximum number of items
        - All text fields must be clear, professional, and free of fluff

        Transcript:
        {transcript}

        Required Output Structure:
        {{
            "call_summary": {{
                "summary": "3-4 line overview of the call",
                "key_points": ["Point 1", "Point 2", "Point 3", "Point 4", "Point 5"],
                "outcome": "Clear statement of call resolution",
                "follow_up_recommendations": ["Rec 1", "Rec 2", "Rec 3"]
            }},
            "call_topic": {{
                "primary_topic": "Main topic of discussion",
                "category": "Business category",
                "sub_category": "Specific sub-category"
            }},
            "agent_coaching": {{
                "strengths": ["Strength 1", "Strength 2", "Strength 3"],
                "improvement_areas": ["Area 1", "Area 2", "Area 3"],
                "specific_recommendations": ["Rec 1", "Rec 2", "Rec 3", "Rec 4"],
                "skill_development_focus": ["Skill 1", "Skill 2", "Skill 3"]
            }}
        }}

        Rules:
        1. Maintain exact JSON structure
        2. No additional fields or comments
        3. No markdown formatting
        4. Ensure all arrays have the exact number of items specified
        5. Keep all text concise and professional
        6. Do not mention any PII information such as Customer Name etc.
        """
    
    
    def extract_json(self, response: str):
        """Extracts valid JSON from a response that may contain extra characters like ```json."""
        match = re.search(r'```json\s*([\s\S]*?)\s*```', response)
        if match:
            json_str = match.group(1)  # Extract JSON content
        else:
            json_str = response.strip()  # If no markdown, assume raw JSON
        
        try:
            return json.loads(json_str)  # Convert to dictionary
        except json.JSONDecodeError:
            raise ValueError("Invalid JSON response")
    
            
    def validate_response(self, response_json: Dict) -> TranscriptAnalysis:
        """Validate the response using Pydantic models"""
        try:
            return TranscriptAnalysis(**response_json)
        except ValidationError as e:
            print(f"Skipping call {i + 1}: Error extracting KPIs - {e}")

    def extract_genai_kpis(self, transcript: str):
        """
        Extract KPIs from transcript using Gemini API
        
        Args:
            transcript (str): Call transcript text
            
        Returns:
            Dict: Structured KPI data or None if extraction fails
        """
        try:
            # Generate prompt
            prompt = self.create_prompt(transcript)
            
            # Get response from Gemini
            response = self.model.generate_content(prompt)
            
            # Parse JSON response
            response_json = self.extract_json(response.text)
            
            # Validate response structure
            validated_response = self.validate_response(response_json)
            
            return validated_response.model_dump()
            
        except Exception as e:
            print(f"Error extracting KPIs: {str(e)}")
            return None

def dict_to_newline_string(data: dict) -> str:
    """Converts a dictionary into a new-line formatted string."""
    formatted_str = ""
    for key, value in data.items():
        formatted_str += f"{key}:\n"
        for item in value:
            formatted_str += f"  - {item}\n"
    return formatted_str.strip()

# AWS

In [15]:
import boto3, json
import os
from urllib.parse import quote

In [16]:
# Temporary secrets manager
with open("configs.json", 'r') as secrets_file:
    secrets = json.load(secrets_file)
aws_access_key = secrets.get("aws_access_key")
aws_secret_key = secrets.get("aws_secret_key")

In [29]:
aws_access_key_id="AKIA5XPRFL2ZMKPR6QJ3"
aws_secret_access_key="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"
s3_source_bucket="amazon-connect-39f6aa5d9242"
s3_transcripts_location="Analysis/Voice/2025/03/03/"

In [30]:
# s3://amazon-connect-39f6aa5d9242/Analysis/Voice/2025/03/03/

In [31]:
# Create an S3 client using the credentials
s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

In [32]:
# Example: List objects in your S3 bucket
# bucket_name = secrets.get("s3_source_bucket")
# custom_location = secrets.get("s3_transcripts_location")

# List objects from the specified location (prefix) in the S3 bucket
response = s3.list_objects_v2(Bucket=s3_source_bucket, Prefix=s3_transcripts_location)

# Set a counter to limit the number of objects printed
count = 0
max_objects = 5  # Limit to 5 objects

for obj in response.get('Contents', []):
    print(f"Object: {obj['Key']}")
    count += 1
    if count >= max_objects:
        break  # Exit the loop after printing 5 objects

Object: Analysis/Voice/2025/03/03/0072de6e-fab3-446c-bb16-7e2c9e7719d6_analysis_2025-03-03T21:32:13Z.json
Object: Analysis/Voice/2025/03/03/007be69d-fef9-4f59-b773-63e287d0fd91_analysis_2025-03-03T16:22:17Z.json
Object: Analysis/Voice/2025/03/03/0093b0bc-dcbd-4555-a62a-682083914a7a_analysis_2025-03-03T16:09:31Z.json
Object: Analysis/Voice/2025/03/03/00ada971-89b5-4f30-be80-7ca212e4ffa7_analysis_2025-03-03T19:39:23Z.json
Object: Analysis/Voice/2025/03/03/00ae8e87-e44f-4f5d-a022-950ba78bffd6_analysis_2025-03-03T18:49:13Z.json


In [36]:
response = s3.get_object(Bucket=s3_source_bucket, Key="Analysis/Voice/2025/03/03/0072de6e-fab3-446c-bb16-7e2c9e7719d6_analysis_2025-03-03T21:32:13Z.json")    
# Read the content
json_content = response['Body'].read().decode('utf-8')

In [37]:
json_content

'{"AccountId":"943787957938","Categories":{"MatchedCategories":[],"MatchedDetails":{}},"Channel":"VOICE","ContentMetadata":{"Output":"Raw"},"ConversationCharacteristics":{"Interruptions":{"InterruptionsByInterrupter":{},"TotalCount":0,"TotalTimeMillis":0},"NonTalkTime":{"Instances":[],"TotalTimeMillis":0},"Sentiment":{"OverallSentiment":{"AGENT":0,"CUSTOMER":0},"SentimentByPeriod":{"QUARTER":{"AGENT":[{"BeginOffsetMillis":0,"EndOffsetMillis":0,"Score":0},{"BeginOffsetMillis":0,"EndOffsetMillis":0,"Score":0},{"BeginOffsetMillis":0,"EndOffsetMillis":0,"Score":0},{"BeginOffsetMillis":0,"EndOffsetMillis":0,"Score":0}],"CUSTOMER":[{"BeginOffsetMillis":0,"EndOffsetMillis":204,"Score":0},{"BeginOffsetMillis":204,"EndOffsetMillis":409,"Score":0},{"BeginOffsetMillis":409,"EndOffsetMillis":614,"Score":0},{"BeginOffsetMillis":614,"EndOffsetMillis":819,"Score":0}]}}},"TalkSpeed":{"DetailsByParticipant":{"AGENT":{"AverageWordsPerMinute":0},"CUSTOMER":{"AverageWordsPerMinute":220}}},"TalkTime":{"Det

In [43]:
def get_s3_signed_url(bucket_name, object_key, aws_access_key, aws_secret_key, expiration=3600):
    """
    Generate a pre-signed URL for an S3 object
    
    Args:
        bucket_name (str): Name of the S3 bucket
        object_key (str): Path to the object in the bucket
        aws_access_key_id (str): AWS access key ID
        aws_secret_access_key (str): AWS secret access key
        expiration (int): URL expiration time in seconds (default 1 hour)
        
    Returns:
        str: Pre-signed URL for the S3 object
    """
    try:
        # Create S3 client with credentials
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key
        )
        
        # Generate pre-signed URL
        url = s3_client.generate_presigned_url(
            'get_object',
            Params={
                'Bucket': bucket_name,
                'Key': object_key
            },
            ExpiresIn=expiration
        )
        
        # URL encode the signed URL to ensure it's properly formatted
        encoded_url = quote(url, safe=':/?=&')
        return encoded_url
    
    except Exception as e:
        raise Exception(f"Error generating pre-signed URL: {str(e)}")

In [51]:
audio_aws_uri = get_s3_signed_url(bucket_name, object_key, aws_access_key, aws_secret_key)

In [52]:
audio_aws_uri

'https://amazon-connect-dev-poc.s3.amazonaws.com/connect-audio-files/3c2ddea2-d6e3-4acd-8abd-6eb98c192bd8.wav?AWSAccessKeyId=AKIARFBDD7MVMPCPRNFB&Signature=vf1tgErgZkSl0nCparMF74ip5hY%253D&Expires=1737516232'

# Snowflake

In [44]:
import snowflake.connector, os, pathlib
import snowflake.connector as sc
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization

In [66]:
private_key_file = 'snowflakegcp_rsa_key.p8'
private_key_file_pwd = '$07@rF0r@77!'

conn_params = {
    # 'account': 'FGIXSNB.HP20007',
    'account': 'XV37144.us-central1.gcp',
    'user': 'GCP_INTEGRATION',
    'private_key_file': private_key_file,
    'private_key_file_pwd':private_key_file_pwd,
    'warehouse': 'DATAPLATR',
    'database': 'POSIGEN_DEV',
    'schema': 'SRC_GCP'
}

ctx = sc.connect(**conn_params)
cs = ctx.cursor()

# Query to fetch data
query = "SELECT * FROM SRC_GCP_INTER_CALLS LIMIT 10;"  # Modify as needed

# Fetch data into a Pandas DataFrame
try:
    cs.execute(query)
    df = cs.fetch_pandas_all()  # Fetch results as Pandas DataFrame
    # print(df.head())  # Display first few rows
finally:
    cs.close()
    ctx.close()  # Close connection

In [67]:
df.head()

Unnamed: 0,SNF_ID,CONTACT_ID,CALL_SUMMARY,TOPIC,CATEGORY,SUB_CATEGORY,AGENT_COACHING,ACCOUNT_ID,AGENT_SPEECH_SPEED,CUSTOMER_SPEECH_SPEED,...,TOTAL_TALKTIME_CALL,TOTAL_DURATION_CALL_SECOND,TOTAL_DURATION_CALL,TOTAL_DEAD_AIR_CALL_SECOND,TOTAL_DEAD_AIR_CALL,CUSTOMER_CONTACT_ID,CUSTOMER_INSTANCE_ID,CALL_JOB_STATUS,CALL_LANGUAGE,LOAD_DATE
0,1,4a0a35d3-6221-4ccd-be93-83938c24a544,Maryetta called Mariana Ando to discuss procee...,Transaction Processing,Customer Service,Account Management,{'strengths': ['Successfully guided the conver...,943787957938,184,179,...,01:47:00,118739,01:58,11352,00:11,4a0a35d3-6221-4ccd-be93-83938c24a544,553e079f-40b6-4dc2-969a-9c0e6ca357f2,COMPLETED,en-US,2025-01-30 11:17:50.435000+00:00
1,2,309a0db8-2735-4537-988a-a66bff37c159,Ashton from Passage and Solar called Jose Aval...,Solar Panel Installation,Sales,Appointment Confirmation/Details,"{'strengths': ['Professional introduction', 'C...",943787957938,221,155,...,15:48:00,1044130,17:24,95244,01:35,309a0db8-2735-4537-988a-a66bff37c159,553e079f-40b6-4dc2-969a-9c0e6ca357f2,COMPLETED,en-US,2025-01-30 11:17:50.435000+00:00
2,3,4abe04de-5d7d-4eb9-a999-99d1c6ef54c1,The provided transcript is incomplete and lack...,Unknown,Unknown,Unknown,"{'strengths': [], 'improvement_areas': [], 'sp...",943787957938,144,178,...,13:47:00,1423500,23:43,595559,09:55,4abe04de-5d7d-4eb9-a999-99d1c6ef54c1,553e079f-40b6-4dc2-969a-9c0e6ca357f2,COMPLETED,en-US,2025-01-30 11:17:50.435000+00:00
3,4,4d8deee2-8062-40e8-b580-d45d79e94abe,The call transcript is incomplete and lacks co...,Unknown,Unknown,Unknown,{'strengths': ['Polite and professional greeti...,943787957938,226,192,...,03:53:00,199509,03:19,-34316,-1:26,4d8deee2-8062-40e8-b580-d45d79e94abe,553e079f-40b6-4dc2-969a-9c0e6ca357f2,COMPLETED,en-US,2025-01-30 11:17:50.435000+00:00
4,5,515b317f-21d0-4aae-b88b-b90f00240a13,Mary from Passagegen Solar called Jennifer to ...,Lease Agreement Review and Payment Confirmation,Customer Service,Account Management,{'strengths': ['Persistent in obtaining necess...,943787957938,199,155,...,02:05:00,134080,02:14,8346,00:08,515b317f-21d0-4aae-b88b-b90f00240a13,553e079f-40b6-4dc2-969a-9c0e6ca357f2,COMPLETED,en-US,2025-01-30 11:17:50.435000+00:00


Unnamed: 0,SNF_ID,CONTACT_ID,CALL_SUMMARY,TOPIC,CATEGORY,SUB_CATEGORY,AGENT_COACHING,ACCOUNT_ID,AGENT_SPEECH_SPEED,CUSTOMER_SPEECH_SPEED,...,TOTAL_TALKTIME_CALL,TOTAL_DURATION_CALL_SECOND,TOTAL_DURATION_CALL,TOTAL_DEAD_AIR_CALL_SECOND,TOTAL_DEAD_AIR_CALL,CUSTOMER_CONTACT_ID,CUSTOMER_INSTANCE_ID,CALL_JOB_STATUS,CALL_LANGUAGE,LOAD_DATE
0,1,4a0a35d3-6221-4ccd-be93-83938c24a544,Maryetta called Mariana Ando to discuss procee...,Transaction Processing,Customer Service,Account Management,{'strengths': ['Successfully guided the conver...,943787957938,184,179,...,01:47:00,118739,01:58,11352,00:11,4a0a35d3-6221-4ccd-be93-83938c24a544,553e079f-40b6-4dc2-969a-9c0e6ca357f2,COMPLETED,en-US,2025-01-30 11:17:50.435000+00:00
1,2,309a0db8-2735-4537-988a-a66bff37c159,Ashton from Passage and Solar called Jose Aval...,Solar Panel Installation,Sales,Appointment Confirmation/Details,"{'strengths': ['Professional introduction', 'C...",943787957938,221,155,...,15:48:00,1044130,17:24,95244,01:35,309a0db8-2735-4537-988a-a66bff37c159,553e079f-40b6-4dc2-969a-9c0e6ca357f2,COMPLETED,en-US,2025-01-30 11:17:50.435000+00:00
2,3,4abe04de-5d7d-4eb9-a999-99d1c6ef54c1,The provided transcript is incomplete and lack...,Unknown,Unknown,Unknown,"{'strengths': [], 'improvement_areas': [], 'sp...",943787957938,144,178,...,13:47:00,1423500,23:43,595559,09:55,4abe04de-5d7d-4eb9-a999-99d1c6ef54c1,553e079f-40b6-4dc2-969a-9c0e6ca357f2,COMPLETED,en-US,2025-01-30 11:17:50.435000+00:00
3,4,4d8deee2-8062-40e8-b580-d45d79e94abe,The call transcript is incomplete and lacks co...,Unknown,Unknown,Unknown,{'strengths': ['Polite and professional greeti...,943787957938,226,192,...,03:53:00,199509,03:19,-34316,-1:26,4d8deee2-8062-40e8-b580-d45d79e94abe,553e079f-40b6-4dc2-969a-9c0e6ca357f2,COMPLETED,en-US,2025-01-30 11:17:50.435000+00:00
4,5,515b317f-21d0-4aae-b88b-b90f00240a13,Mary from Passagegen Solar called Jennifer to ...,Lease Agreement Review and Payment Confirmation,Customer Service,Account Management,{'strengths': ['Persistent in obtaining necess...,943787957938,199,155,...,02:05:00,134080,02:14,8346,00:08,515b317f-21d0-4aae-b88b-b90f00240a13,553e079f-40b6-4dc2-969a-9c0e6ca357f2,COMPLETED,en-US,2025-01-30 11:17:50.435000+00:00


In [34]:
PRIVATE_KEY_PATH = 'snowflakegcp_rsa_key.p8'
# Read and load the private key
def load_private_key(file_path):
    """ Load the private key in the required format """
    with open(file_path, "rb") as key_file:
        private_key = key_file.read()
    return private_key

p_key = load_private_key(PRIVATE_KEY_PATH)

In [40]:
# Establish Snowflake connection
conn = snowflake.connector.connect(
    user='GCP_INTEGRATION',
    account='XV37144',
    private_key=private_key,
    warehouse='POSIGEN_DEV',
    database='SRC_GCP',
    schema='SRC_GCP_INTER_CALLS'
)

ProgrammingError: 251008: Failed to load private key: Password was not given but private key is encrypted
Please provide a valid unencrypted rsa private key in DER format as bytes object

# GCP

In [114]:
project=secrets.get("gcp_project_id")
region=secrets.get("gcp_prjct_location")

## Transcription: Speech to Text

Needs the audio files to be in GCS bucket or local to transcribe

In [79]:
from google.cloud import speech
from datetime import datetime, timedelta

In [100]:
# # Initialize Speech-to-Text client
client = speech.SpeechClient()

audio = speech.RecognitionAudio(uri=gcs_uri)

# Create recognition config
diarization_config = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization=True,
    min_speaker_count=2,
    max_speaker_count=4,
)

config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        audio_channel_count = 2,
        diarization_config=diarization_config,
        # # A model must be specified to use enhanced model. This incurs more cost
        # use_enhanced=True,
        # model="phone_call",
    )

# Detects speech in the audio file
response = client.long_running_recognize(config=config, audio=audio).result()

In [101]:
# Process transcription results
for result in response.results:
        print(f"Transcript: {result.alternatives[0].transcript}")

Transcript: hello
Transcript:  this is her
Transcript:  yeah because okay there's a big problems with my roof and ever since they put it up and made all the holes which started the problem and then the tree came down which finished the problems I'm finding out now that my roof should a never even a bid eligible for solar one because the roof was it old and I never even knew I have a rubber roof on the flat part I didn't know all this stuff I had no idea so now my insurance company send me a check to get a new roof put on I'm waiting for it
Transcript:  and solar said that they were going to remove them of no cost today and so I could get the roof done and then put it back on but I'm like saying I should have never had solar in the beginning and some my contractor wants to know if solar had permits to do this because this should have been done
Transcript: yes
Transcript:  now I was told that your roof is old so there's not supposed to put them on how come you guys did it when I had prob

## Transcription: Gemini 2.0 flash

In [112]:
import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig, Part

In [107]:
vertexai.init(project=project, location=region)

prompt_transcript="""
Transcribe this audio file, in the comma separated format: start,end,speaker,speaker_label,caption.
00:05,00:07,speaker name,System,caption
00:05,00:07,speaker name,Employee,caption
00:05,00:07,speaker name,Agent,caption
00:05,00:07,speaker name,Employee,caption
"""

In [115]:
model = GenerativeModel("gemini-1.5-flash-002")
audio_file = Part.from_uri(mime_type="audio/wav", uri=gcs_uri)
contents = [audio_file, prompt_transcript]

In [116]:
response = model.generate_content(contents, generation_config=GenerationConfig(audio_timestamp=True))

In [117]:
print(response.text)

Here's a transcription of the provided audio in the requested format:

start,end,speaker,speaker_label,caption
00:00,00:01,Denise,Customer,Hello
00:01,00:03,Faren,Employee,Hi, can I speak to Miss Denise?
00:03,00:04,Denise,Customer,This is her.
00:05,00:17,Faren,Employee,Hi Miss Denise. This is Faren calling from Paradise Sun Solar on a recorded line. Um, I was reaching out to you in regards to the SMS that you uh sent over to our team requesting for the permits for your home.
00:19,00:50,Denise,Customer,Yeah, because um, okay. There's big problems with my roof. And ever since solar put it up and made all the holes, which started the problems. And then the tree came down, which finished the problems. I'm finding out now that my roof shouldn't have never even been eligible for solar. One, because the roof was it old. And I never even knew I have a rubber roof on the flat part. I didn't know all this stuff. I had no idea.
00:51,01:09,Denise,Customer,So now my insurance company sent me a 

## Cloud Data Loss Protection

In [3]:
# Temporary secrets manager
with open("3c2ddea2-d6e3-4acd-8abd-6eb98c192bd8_analysis_2025-01-06T18_29_29Z.json", 'r') as audio_metadata_file:
    audio_metadata = json.load(audio_metadata_file)
audio_trnscrpt_df = pd.json_normalize(audio_metadata['Transcript']).drop(['Id'], axis=1)
audio_trnscrpt_df.head()

Unnamed: 0,BeginOffsetMillis,Content,EndOffsetMillis,LoudnessScore,ParticipantId,Sentiment,IssuesDetected,OutcomesDetected
0,0,Thank you for calling Positgen. This is Lakesh...,4389,"[83.22, 87.92, 83.88, 85.01, 68.77]",AGENT,NEUTRAL,,
1,5539,"Yes, this is Shirley or I need to make a payme...",10720,"[76.24, 76.25, 76.64, 77.15, 80.61, 80.88]",CUSTOMER,NEUTRAL,"[{'CharacterOffsets': {'BeginOffsetChar': 24, ...",
2,11670,99.,12750,"[74.46, 74.98]",CUSTOMER,NEUTRAL,,
3,13039,"OK, and what's the address listed on your acco...",15800,"[81.74, 82.69, 79.54]",AGENT,NEUTRAL,,
4,17280,"7520 Lady Great Street, New Orleans.",21469,"[79.95, 78.87, 82.1, 77.21, 75.05]",CUSTOMER,NEUTRAL,,


# Fetching the files to Process

-- Fetch all the files from last 2 days from today<br>
-- Sort them in descending order of timestamp<br>
-- Add last modified datetime as well

In [47]:
aws_access_key="AKIA5XPRFL2ZMKPR6QJ3"
aws_secret_key="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"
s3_source_bucket="amazon-connect-39f6aa5d9242"
s3_transcripts_location="Analysis/Voice"

In [48]:
import re
import boto3
import pandas as pd
from datetime import datetime, timedelta

In [49]:
s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key
        )

print(s3_source_bucket)
print(s3_transcripts_location)

amazon-connect-39f6aa5d9242
Analysis/Voice


In [60]:
"""
Date Handling:

Automatically gets current date and previous day
Handles month transitions (e.g., from March 1 to February 28/29)
Handles year transitions (e.g., from January 1 to December 31)
"""
# Get current date
current_date = datetime.now()

# Get dates for last 2 days
num_days = 6
dates_to_check = []
for i in range(num_days):
    check_date = current_date - timedelta(days=i)
    dates_to_check.append(check_date)

# Process each date
folders = []
for date in dates_to_check:
    year = str(date.year)
    month = f"{date.month:02d}"
    day = f"{date.day:02d}"
    
    # Construct the prefix for S3 listing
    prefix = f"{s3_transcripts_location}/{year}/{month}/{day}/"
    folders.append(prefix)

# folders=['connect-audio-files/2025/02/11/', 'connect-audio-files/2025/02/12/']
folders

TypeError: list indices must be integers or slices, not tuple

In [57]:
all_files = []
    
# Fetch files from both folders
for folder in folders:
    response = s3_client.list_objects_v2(
        Bucket=s3_source_bucket,
        Prefix=folder
    )
    
    for obj in response.get('Contents', []):
        if obj['Key'].endswith('.json'):
          # all_files.append([obj['LastModified'], obj['Key']])
            file_path = obj['Key']
            s3_ts = obj['LastModified']
            # Skip if not a JSON file
            if not file_path.endswith('.json'):
                continue
        
            call_id = file_path.split('/')[-1].split("_analysis_")[0] #match.group(1)
            timestamp = datetime.strptime(file_path.split('analysis_')[-1].split('.')[0].replace('_', ':'), '%Y-%m-%dT%H:%M:%SZ') #match.group(2)    
            all_files.append({
                'File': file_path,
                'ID': call_id,
                'File_Timestamp': timestamp,
                'File_Date': timestamp.strftime('%Y-%m-%d'),
                'File_Time': timestamp.strftime('%H:%M:%S'),
                'S3_Timestamp': s3_ts,
                'S3_Date': s3_ts.strftime('%Y-%m-%d'),
                'S3_Time': s3_ts.strftime('%H:%M:%S')
            })

# all_files

In [53]:
all_files[:2]

[{'File': 'Analysis/Voice/2025/03/04/1db2be8d-1a6c-44f0-8567-dc9968d3efe3_analysis_2025-03-04T00:04:07Z.json',
  'ID': '1db2be8d-1a6c-44f0-8567-dc9968d3efe3',
  'File_Timestamp': datetime.datetime(2025, 3, 4, 0, 4, 7),
  'File_Date': '2025-03-04',
  'File_Time': '00:04:07',
  'S3_Timestamp': datetime.datetime(2025, 3, 4, 0, 17, 33, tzinfo=tzutc()),
  'S3_Date': '2025-03-04',
  'S3_Time': '00:17:33'},
 {'File': 'Analysis/Voice/2025/03/04/6d13b9ef-55f3-49ab-bde0-905a210f32d1_analysis_2025-03-04T00:07:07Z.json',
  'ID': '6d13b9ef-55f3-49ab-bde0-905a210f32d1',
  'File_Timestamp': datetime.datetime(2025, 3, 4, 0, 7, 7),
  'File_Date': '2025-03-04',
  'File_Time': '00:07:07',
  'S3_Timestamp': datetime.datetime(2025, 3, 4, 0, 16, 59, tzinfo=tzutc()),
  'S3_Date': '2025-03-04',
  'S3_Time': '00:16:59'}]

In [58]:
files_sorted = pd.DataFrame(all_files).sort_values(['File_Timestamp'], ascending=False)
files_sorted.shape

(21616, 8)

In [55]:
files_sorted.shape

(4069, 8)

# Spanish Language POCs

## Without Translation

In [91]:
en_transcript = """
Humanity is at an inflection point. From 2012 onwards, developments in building AI systems (using deep neural networks) accelerated so that by the end of the decade, they yielded the first software system able to write
articles indiscernible from those written by humans. This system was an AI model called Generative Pre-trained Transformer 2, or GPT-2. 2022 marked the release of ChatGPT, which demonstrated how profoundly this
technology was poised to revolutionize how we interact with technology and information. Reaching one million active users in five days and then one hundred million active users in two months, the new breed of AI
models started out as human-like chatbots but quickly evolved into a monumental shift in our approach to common tasks, like translation, text generation, summarization, and more. It became an invaluable tool for
programmers, educators, and researchers. The success of ChatGPT was unprecedented and popularized more research into the technology behind it, namely large language models (LLMs). Both proprietary and public models
were being released at a steady pace, closing in on, and eventually catching up to the performance of ChatGPT. It is not an exaggeration to state that almost all attention was on LLMs.As a result, 2023 will always
be known, at least to us, as the year that drastically changed our field, Language Artificial Intelligence (Language AI), a field characterized by the development of systems capable of understanding and generating
human language. However, LLMs have been around for a while now and smaller models are still relevant to this day. LLMs are much more than just a single model and there are many other techniques and models in the 
field of language AI that are worth exploring. In this book, we aim to give readers a solid understanding of the fundamentals of both LLMs and the field of Language AI in general. This chapter serves as the 
scaffolding for the rest of the book and will introduce concepts and terms that we will use throughout the chapters.
"""

extractor = KPIExtractor(gcp_project_id, gcp_prjct_location)
en_call_gen_kpis = extractor.extract_genai_kpis(en_transcript)
print(f"Creating Inter Call df")
en_inter_call_dict = {}
# inter_call_dict['contact_id'] = str(df_intra_call['contact_id'][0])
en_inter_call_dict['call_text'] = en_transcript
en_inter_call_dict['call_summary'] = en_call_gen_kpis['call_summary']['summary']
en_inter_call_dict['topic'] = en_call_gen_kpis['call_topic']['primary_topic']
en_inter_call_dict['category'] = "Static Category TBD"
en_inter_call_dict['category_generated'] = en_call_gen_kpis['call_topic']['category']
en_inter_call_dict['sub_category'] = "Static Sub-Category TBD"
en_inter_call_dict['sub_category_generated'] = en_call_gen_kpis['call_topic']['sub_category']
en_inter_call_dict['agent_coaching'] = dict_to_newline_string(en_call_gen_kpis['agent_coaching'])
en_df_inter_call = pd.DataFrame(pd.Series(en_inter_call_dict)).T

Creating Inter Call df


In [92]:
en_df_inter_call.head()

Unnamed: 0,call_text,call_summary,topic,category,category_generated,sub_category,sub_category_generated,agent_coaching
0,\nHumanity is at an inflection point. From 201...,This transcript details the rapid evolution of...,Language AI and LLMs,Static Category TBD,Technology,Static Sub-Category TBD,Artificial Intelligence,strengths:\n - N/A\n - N/A\n - N/A\nimprove...


In [89]:
sp_transcript = """
La humanidad se encuentra en un punto de inflexión. A partir de 2012, los avances en la construcción de sistemas de IA (utilizando redes neuronales profundas) se aceleraron
de tal manera que, a finales de la década, se obtuvo el primer sistema de software capaz de escribir artículos que no se diferenciaban de los escritos por humanos. Este 
sistema era un modelo de IA llamado Generative Pre-trained Transformer 2 o GPT-2. En 2022 se lanzó ChatGPT, que demostró hasta qué punto esta tecnología estaba preparada 
para revolucionar la forma en que interactuamos con la tecnología y la información. Alcanzando un millón de usuarios activos en cinco días y luego cien millones de usuarios
activos en dos meses, la nueva generación de modelos de IA comenzó como chatbots similares a los humanos, pero rápidamente evolucionó hasta convertirse en un cambio monumental
en nuestro enfoque de tareas comunes, como la traducción, la generación de textos, los resúmenes y más. Se convirtió en una herramienta invaluable para programadores, 
educadores e investigadores. El éxito de ChatGPT no tuvo precedentes y popularizó más investigaciones sobre la tecnología que lo sustentaba, es decir, los grandes modelos
lingüísticos (LLM). Tanto los modelos propietarios como los públicos se estaban lanzando a un ritmo constante, acercándose y finalmente alcanzando el rendimiento de 
ChatGPT. No es una exageración afirmar que casi toda la atención se centró en los LLM. Como resultado, 2023 siempre será conocido, al menos para nosotros, como el año que 
cambió drásticamente nuestro campo, la Inteligencia Artificial del Lenguaje (IA del Lenguaje), un campo caracterizado por el desarrollo de sistemas capaces de entender y 
generar lenguaje humano. Sin embargo, los LLM ya existen desde hace un tiempo y los modelos más pequeños siguen siendo relevantes hasta el día de hoy. Los LLM son mucho más
que un solo modelo y hay muchas otras técnicas y modelos en el campo de la IA del lenguaje que vale la pena explorar. En este libro, nuestro objetivo es brindarles a los 
lectores una comprensión sólida de los fundamentos tanto de los LLM como del campo de la IA del lenguaje en general. Este capítulo sirve como andamiaje para el resto del 
libro y presentará conceptos y términos que utilizaremos a lo largo de los capítulos.
"""

extractor = KPIExtractor(gcp_project_id, gcp_prjct_location)
sp_call_gen_kpis = extractor.extract_genai_kpis(sp_transcript)
print(f"Creating Inter Call df")
sp_inter_call_dict = {}
# inter_call_dict['contact_id'] = str(df_intra_call['contact_id'][0])
sp_inter_call_dict['call_text'] = sp_transcript
sp_inter_call_dict['call_summary'] = sp_call_gen_kpis['call_summary']['summary']
sp_inter_call_dict['topic'] = sp_call_gen_kpis['call_topic']['primary_topic']
sp_inter_call_dict['category'] = "Static Category TBD"
sp_inter_call_dict['category_generated'] = sp_call_gen_kpis['call_topic']['category']
sp_inter_call_dict['sub_category'] = "Static Sub-Category TBD"
sp_inter_call_dict['sub_category_generated'] = sp_call_gen_kpis['call_topic']['sub_category']
sp_inter_call_dict['agent_coaching'] = dict_to_newline_string(sp_call_gen_kpis['agent_coaching'])
sp_df_inter_call = pd.DataFrame(pd.Series(sp_inter_call_dict)).T

Creating Inter Call df


In [90]:
sp_df_inter_call.head()

Unnamed: 0,call_text,call_summary,topic,category,category_generated,sub_category,sub_category_generated,agent_coaching
0,\nLa humanidad se encuentra en un punto de inf...,The transcript describes the evolution of Larg...,Large Language Models (LLMs),Static Category TBD,Technology,Static Sub-Category TBD,Artificial Intelligence,strengths:\n - Concisely summarized complex i...


# Google Secrets Manager

In [16]:
from google.cloud import secretmanager

In [17]:
def access_secret(project_id, secret_id, version_id="latest"):
    """
    Access a secret from Google Secret Manager
    
    Args:
        project_id: Your Google Cloud project ID
        secret_id: The ID of the secret to access
        version_id: The version of the secret (default: "latest")
    
    Returns:
        The secret payload as a string
    """
    # Create the Secret Manager client
    client = secretmanager.SecretManagerServiceClient()
    
    # Build the resource name of the secret version
    name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"
    
    # Access the secret version
    response = client.access_secret_version(request={"name": name})
    
    # Return the decoded payload
    return response.payload.data.decode("UTF-8")

In [18]:
project_id = "dev-posigen"
secret_id = "dev-voice-ai"
version_id="1"

In [19]:
resp = access_secret(project_id, secret_id)

In [20]:
resp

'{\n "access_key": "AKIARFBDD7MVMPCPRNFB"\n "security_key": "hGwjxLO0LW1tdI5c2k3TAJTdfRff4+eRp/NoGyth"\n}'

# Handling PII data

In [5]:
# # Example Usage
# sample_text = """
#     Customer Information:
#     Name: John Smith
#     Email: john.smith@example.com
#     Phone: (555) 123-4567
#     Address: 123 Main Street, Anytown, CA 94043
#     Date of Birth: 01/15/1980
#     Organization: Acme Corp
    
#     Payment Details:
#     Credit Card: 4111 1111 1111 1111
#     Expiration: 12/25
#     CVV: 123
    
#     From customer at Posigen with IP address 192.168.1.1
#     """
# redacted_text = redact_pii_text(gcp_project_id, sample_text)
# print(redacted_text)

In [3]:
from google.cloud import dlp_v2

In [15]:
def mask_pii_in_dataframe(df, column_name, project_id, delimiter="\n"):
    """
    Mask PII data in a specified DataFrame column using GCP DLP.
    
    Args:
        df (pandas.DataFrame): DataFrame containing data to be masked
        column_name (str): Column name containing text to process
        project_id (str): GCP project ID
        delimiter (str): Delimiter to join/split text (default: newline)
        
    Returns:
        pandas.DataFrame: DataFrame with PII masked in the specified column
    """
    # Create a client
    dlp_client = dlp_v2.DlpServiceClient()
    
    # Create inspect configuration
    # Configure what to unmask (these will be identified but not transformed)
    unmask_info_types = [
        "PERSON_NAME",
        "EMAIL_ADDRESS",
        "PHONE_NUMBER"
    ]
    
    # Configure what to mask
    mask_info_types = [
        "CREDIT_CARD_NUMBER",
        "CREDIT_CARD_CVV",
        "STREET_ADDRESS",
        "LOCATION",
        "IP_ADDRESS",
        "ORGANIZATION_NAME"
    ]
    
    # Create all info types list for inspection
    all_info_types = [{"name": info_type} for info_type in unmask_info_types + mask_info_types]
    
    # Configure the inspection
    inspect_config = {
        "info_types": all_info_types,
        "min_likelihood": dlp_v2.Likelihood.POSSIBLE,
        "include_quote": True,
    }
    
    # Set up deidentification configuration for masking
    # Create transformation for each info type that should be masked
    transformations = []
    
    # Add masking for the info types we want to mask
    for info_type in mask_info_types:
        # Exception for organizations - don't mask 'Posigen'
        if info_type == "ORGANIZATION_NAME":
            # Create a condition to exclude 'Posigen'
            condition = {
                "expressions": {
                    "conditions": {
                        "conditions": [
                            {
                                "operator": "NOT_EQUALS",
                                "field": {
                                    "name": "value"
                                },
                                "value": {
                                    "string_value": "Posigen"
                                }
                            }
                        ]
                    }
                }
            }
            
            transformations.append({
                "info_types": [{"name": info_type}],
                "primitive_transformation": {
                    "replace_with_info_type_config": {}
                },
                "condition": condition
            })
        else:
            # Standard masking for other info types
            transformations.append({
                "info_types": [{"name": info_type}],
                "primitive_transformation": {
                    "replace_with_info_type_config": {}
                }
            })
    
    # Create deidentify configuration
    deidentify_config = {
        "info_type_transformations": {
            "transformations": transformations
        }
    }
    
    # Create a copy of the DataFrame to avoid modifying the original
    df_masked = df.copy()
    
    # Process in batches to handle large datasets
    batch_size = 100
    total_rows = len(df)
    
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        print(f"Processing rows {start_idx} to {end_idx-1}...")
        
        # Get the subset of rows for this batch
        batch_df = df.iloc[start_idx:end_idx]
        
        # Process each row in the batch
        for idx, row in batch_df.iterrows():
            # Get the text to process
            text_to_process = row[column_name]
            
            # Skip if text is None or empty
            if pd.isna(text_to_process) or text_to_process == "":
                continue
            
            # Handle list-like data (e.g., captions or transcripts)
            if isinstance(text_to_process, list):
                # Join the list items with the delimiter
                joined_text = delimiter.join(text_to_process)
                
                # Set up the content item
                content_item = {"value": joined_text}
                
                # Call the DLP API
                parent = f"projects/{project_id}"
                response = dlp_client.deidentify_content(
                    request={
                        "parent": parent,
                        "deidentify_config": deidentify_config,
                        "inspect_config": inspect_config,
                        "item": content_item,
                    }
                )
                
                # Get the deidentified text and split it back into a list
                deidentified_text = response.item.value
                df_masked.at[idx, column_name] = deidentified_text.split(delimiter)
            else:
                # Process a single string
                content_item = {"value": text_to_process}
                
                # Call the DLP API
                parent = f"projects/{project_id}"
                response = dlp_client.deidentify_content(
                    request={
                        "parent": parent,
                        "deidentify_config": deidentify_config,
                        "inspect_config": inspect_config,
                        "item": content_item,
                    }
                )
                
                # Get the deidentified text
                df_masked.at[idx, column_name] = response.item.value
    
    return df_masked

In [16]:
data = {
    "id": [1, 2, 3],
    "captions": [
        ["My name is John Doe and my email is john@example.com", 
         "My credit card is 4111-1111-1111-1111 with CVV 123", 
         "I work for Posigen at 123 Main St, New York, NY 10001"],
        ["Jane Smith from Acme Corp can be reached at 555-123-4567", 
         "Our IP address is 192.168.1.1"],
        ["Posigen employee Mike Johnson with card ending in 7890"]
    ]
}

df = pd.DataFrame(data)

In [17]:
# Process the DataFrame
masked_df = mask_pii_in_dataframe(df, "captions", gcp_project_id)

Processing rows 0 to 2...


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

# Misc