# Imports

In [1]:
# from kfp import dsl
# from kfp.v2 import compiler
from google.cloud import aiplatform
from typing import List, Dict
import boto3
from botocore.exceptions import ClientError
import json, os
from datetime import datetime, timedelta
import pandas as pd, numpy as np
from scipy.special import softmax

import vertexai
import vertexai.preview.generative_models as generative_models
from vertexai.generative_models import GenerativeModel, GenerationConfig, Part


# Sentiments
from transformers import pipeline
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification

# Variables

In [2]:
# Temporary secrets manager
with open("secrets.json", 'r') as secrets_file:
    secrets = json.load(secrets_file)
aws_access_key = secrets.get("aws_access_key")
aws_secret_key = secrets.get("aws_secret_key")

# AWS
s3_source_bucket = secrets.get('s3_source_bucket')
s3_transcripts_location = secrets.get('s3_transcripts_location')

# GCP
gcp_project_id=secrets.get('gcp_project_id')
gcp_prjct_location=secrets.get('gcp_prjct_location')

# Snowflake
private_key_file = secrets.get('snowflakegcp_rsa_key')
private_key_file_pwd = secrets.get('snf_ssh_key_pass')

conn_params = {
    'account': secrets.get('snf_account'),
    'user': secrets.get('snf_user'),
    'private_key_file': secrets.get('snf_private_key_file'),
    'private_key_file_pwd':secrets.get('snf_private_key_pwd'),
    'warehouse': secrets.get('snf_warehouse'),
    'database': secrets.get('snf_database'),
    'schema': secrets.get('snf_schema')
}

# # Sentiment Scores
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model_sentiment = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Util Functions

## Misc Utils

### Initiate Master Inter and Intra Dataframes

In [41]:
def initiate_master_dataframes():
    if os.path.isfile("df_intra_calls_data.csv"):
        df_intra_calls_data = pd.read_csv("df_intra_calls_data.csv")
        df_intra_calls_data.call_id = df_intra_calls_data.call_id.astype('string')
    else:
        df_intra_calls_data = pd.DataFrame()

    if os.path.isfile("df_inter_calls_data.csv"):
        df_inter_calls_data = pd.read_csv("df_inter_calls_data.csv")
        df_inter_calls_data.call_id = df_inter_calls_data.call_id.astype('string')
    else:
        df_inter_calls_data = pd.DataFrame()

    return df_intra_calls_data, df_inter_calls_data

## Function: Listing Transcripts

In [82]:
def list_new_transcripts(aws_access_key_id: str, aws_secret_key: str, source_bucket: str, custom_location: str, max_objects: int):
    """
    Fetch audio file from S3 and return it as a BytesIO object
    """
    try:
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key
        )
        
        # List files in the folder
        response = s3_client.list_objects_v2(Bucket=s3_source_bucket, Prefix=s3_transcripts_location)
        
        """
        Sample Obj
        {'Key': 'connect-audio-files/biz-suggested-jsons/309a0db8-2735-4537-988a-a66bff37c159_analysis_2025-01-17T21_06_34Z.json',
         'LastModified': datetime.datetime(2025, 1, 22, 11, 1, 8, tzinfo=tzutc()),
         'ETag': '"7aaa0f61f096f29a4fb5a334d2febe5d"',
         'ChecksumAlgorithm': ['CRC64NVME'],
         'ChecksumType': 'FULL_OBJECT',
         'Size': 60598,
         'StorageClass': 'STANDARD'}
        """
        
        list_transcripts = []
        for obj in response.get('Contents', []):
            if obj['Key'].endswith('.json'):
                list_transcripts.append([obj['Key'], obj['LastModified']])
                print(f"{str(obj['LastModified']) +": "+ obj['Key']}")
                if len(list_transcripts) >= max_objects:
                    break  # Exit the loop after printing max objects
    
        return list_transcripts

    except ClientError as e:
        print(f"Error accessing S3: {e}")
        raise

## Function: Read Transcripts

In [92]:
def read_new_transcripts(aws_access_key_id: str, aws_secret_key: str, source_bucket: str, file_key):
    """
    Read Transcript JSON content from a specific file in S3.
    
    :param bucket_name: Name of the S3 bucket
    :param file_key: Full path/key of the JSON file
    :return: Parsed JSON content
    """
    s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key
        )
    
    try:
        # Download the file
        response = s3.get_object(Bucket=bucket_name, Key=file_key)
        
        # Read the content
        json_content = response['Body'].read().decode('utf-8')
        
        # Parse JSON
        return json.loads(json_content)
    
    except Exception as e:
        print(f"Error reading JSON file {file_key}: {e}")
        return None


# Create Intra-call Dataframe

In [50]:
def millis_to_mmss(millis):
    """Convert milliseconds to mm:ss format"""
    total_seconds = int(millis / 1000)
    minutes = total_seconds // 60
    seconds = total_seconds % 60
    return f"{minutes:02d}:{seconds:02d}"

def process_transcript(
    aws_access_key_id: str,
    aws_secret_key: str,
    transcript_file: str
):
    """
    Pre-process the transcript loaded from S3 Buckets:
    1. Load the transcript as Pandas Dataframe.
    2. Select only the necessary columns ['BeginOffsetMillis', 'EndOffsetMillis', 'ParticipantId', 'Content', 'Sentiment', 'LoudnessScore'].
    3. Format the time in minutes and seconds.
    4. Rename the columns for better understanding.
    """
    with open(transcript_file, 'r') as audio_metadata_file:
        audio_metadata = json.load(audio_metadata_file)

    # Load the Transcript as Pandas Dataframe
    transcript_df = pd.json_normalize(audio_metadata['Transcript'])

    # Select the relevant Columns
    columns_to_select = [
        'BeginOffsetMillis',
        'EndOffsetMillis',
        'ParticipantId',
        'Content'
    ]
    formatted_df = transcript_df[columns_to_select].copy()
    
    # Apply formatting to both time columns
    formatted_df['BeginOffsetMillis'] = formatted_df['BeginOffsetMillis'].apply(millis_to_mmss)
    formatted_df['EndOffsetMillis'] = formatted_df['EndOffsetMillis'].apply(millis_to_mmss)
    
    # Optionally rename columns to reflect their new format
    formatted_df = formatted_df.rename(columns={
        'BeginOffsetMillis': 'start_time',
        'EndOffsetMillis': 'end_time',
        'Content': 'caption',
        'Sentiment': 'sentiment_label'
    })

    # Inserting the Contact ID:
    formatted_df.insert(loc=0, column='call_id', value=str(transcript_file.split('.')[0]))

    return audio_metadata, formatted_df

def convert_to_seconds(time_str):
    try:
        # Parse time string using datetime
        time_obj = datetime.strptime(time_str, '%M:%S')
        # Convert to timedelta and extract total seconds
        total_seconds = time_obj.minute * 60 + time_obj.second
        return total_seconds
    except ValueError:
        return None

def get_sentiment_label(row):
    # Check conditions in order of priority (Positive > Negative > Neutral)
    if row['positive'] > row['negative'] and row['positive'] > row['neutral']:
        return 'Positive'
    elif row['negative'] > row['positive'] and row['negative'] > row['neutral']:
        return 'Negative'
    else:
        return 'Neutral'

def get_sentiment_scores(text_list):
    dict_sentiments = []
    for text in text_list:
        encoded_input = tokenizer(text, return_tensors='pt')
        output = model_sentiment(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = np.round(np.multiply(softmax(scores), 100), 2)
        merged_dict = dict(zip(list(config.id2label.values()), list(scores)))
        dict_sentiments.append(merged_dict)

    df_dict_sentiments = pd.DataFrame(dict_sentiments)
    df_dict_sentiments['sentiment_lable'] = df_dict_sentiments[['positive','negative','neutral']].apply(get_sentiment_label, axis=1)
    
    return df_dict_sentiments

def get_different_times(intra_call):
    intra_call['start_time_second'] = intra_call.start_time.apply(convert_to_seconds)
    intra_call['end_time_second'] = intra_call.end_time.apply(convert_to_seconds)
    intra_call['time_spoken_second'] = intra_call['end_time_second'] - intra_call['start_time_second']
    intra_call['time_spoken_second'] = intra_call['time_spoken_second'].where(intra_call['time_spoken_second'] >= 0, 0)
    intra_call['time_spoken_second'] = intra_call['time_spoken_second'].fillna(0)
    intra_call['time_silence_second'] = intra_call['start_time_second'].shift(-1) - intra_call['end_time_second']
    intra_call['time_silence_second'] = intra_call['time_silence_second'].where(intra_call['time_silence_second'] >= 0, 0)
    intra_call['time_silence_second'] = intra_call['time_silence_second'].fillna(0)

    return intra_call
    
def create_intra_call_df(aws_access_key, aws_secret_key, transcript_file):
    call_id = transcript_file.split('.')[0]
    
    # get the relevant columns from the loaded transcript
    audio_metadata, intra_call = process_transcript(aws_access_key, aws_secret_key, transcript_file)
    
    df_sentiment_scores = get_sentiment_scores(intra_call.caption.to_list())
    intra_call = pd.concat([intra_call, df_sentiment_scores], axis=1)

    intra_call = get_different_times(intra_call)

    return audio_metadata, intra_call

# Create Inter-call Dataframe

In [61]:
class VoiceCallAnalyzer_genKPI:
    def __init__(self, project_id: str, location: str):
        vertexai.init(project=project_id, location=location)
        self.model = GenerativeModel("gemini-1.5-flash-002")
        self.generation_config = {
            "temperature": 0.3,
            "max_output_tokens": 1024,
            "top_p": 0.8,
            "top_k": 40,
        }
        self.safety_settings = {
            generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        }
       
    def extract_all_kpis(self, transcript: str):
        """
        Comprehensive KPI extraction with a single model call
        """
        prompt = f"""
        Perform a comprehensive analysis of the following call transcript. 
        Extract the following KPIs in a structured JSON format:

        1. Call Summary:
        - Concise overview in 3 to 4 lines

        2. Call Topic and Categorization:
        - Call Topic
        - Call Category 
        - Call Sub-Category

        3. Agent Coaching Points:
        - Communication strengths
        - Areas of improvement
        - Specific recommendations
        - Skill development focus

        Transcript: {transcript}
        
        Output Format (JSON):
        {{
            "call_summary": {{
                "summary": "",
                "key_points": [],
                "outcome": "",
                "follow_up_recommendations": []
            }},
            "call_topic": {{
                "primary_topic": "",
                "category": "",
                "sub_category": ""
            }},
            "agent_coaching": {{
                "strengths": [],
                "improvement_areas": [],
                "specific_recommendations": [],
                "skill_development_focus": []
            }}
        }}
        """
        
        response = self.model.generate_content(
            prompt,
            generation_config=self.generation_config,
            safety_settings=self.safety_settings
        )

        input_string = response.text
        kpis_json = json.loads(input_string.strip('`').replace('json\n', '').replace('```', ''))
        
        return kpis_json
        
def create_inter_call_df(
    gcp_project_id: str,
    gcp_prjct_location: str,
    df_intra_call: pd.DataFrame,
    audio_metadata: dict
):
    analyzer_gen = VoiceCallAnalyzer_genKPI(gcp_project_id, gcp_prjct_location)
    call_gen_kpis = analyzer_gen.extract_all_kpis(df_intra_call.caption)
    
    inter_call_dict = {}
    inter_call_dict['call_id'] = str(df_intra_call['call_id'][0])
    # inter_call_dict['call_text'] = " ".join(df_intra_call.caption)
    inter_call_dict['call_summary'] = call_gen_kpis['call_summary']['summary']
    inter_call_dict['topic'] = call_gen_kpis['call_topic']['primary_topic']
    inter_call_dict['category'] = call_gen_kpis['call_topic']['category']
    inter_call_dict['sub_category'] = call_gen_kpis['call_topic']['sub_category']
    inter_call_dict['agent_coaching'] = call_gen_kpis['agent_coaching']

    df_inter_call = pd.DataFrame(pd.Series(inter_call_dict)).T

    # Add metadata from AWS
    df_inter_call['account_id'] = audio_metadata['AccountId']
    df_inter_call['agent_speech_speed'] = audio_metadata['ConversationCharacteristics']['TalkSpeed']['DetailsByParticipant']['AGENT']['AverageWordsPerMinute']
    df_inter_call['customer_speech_speed'] = audio_metadata['ConversationCharacteristics']['TalkSpeed']['DetailsByParticipant']['CUSTOMER']['AverageWordsPerMinute']
    df_inter_call['total_talktime_agent_second'] = audio_metadata['ConversationCharacteristics']['TalkTime']['DetailsByParticipant']['AGENT']['TotalTimeMillis']
    df_inter_call['total_talktime_agent'] = millis_to_mmss(df_inter_call['total_talktime_agent_second'])
    df_inter_call['total_talktime_customer_second'] = audio_metadata['ConversationCharacteristics']['TalkTime']['DetailsByParticipant']['CUSTOMER']['TotalTimeMillis']
    df_inter_call['total_talktime_customer'] = millis_to_mmss(df_inter_call['total_talktime_customer_second'])
    df_inter_call['total_talktime_call_second'] = audio_metadata['ConversationCharacteristics']['TalkTime']['TotalTimeMillis']
    df_inter_call['total_talktime_call'] = millis_to_mmss(df_inter_call['total_talktime_call_second'])
    df_inter_call['total_duration_call_second'] = audio_metadata['ConversationCharacteristics']['TotalConversationDurationMillis']
    df_inter_call['total_duration_call'] = millis_to_mmss(df_inter_call['total_duration_call_second'])
    df_inter_call['total_dead_air_call_second'] = metadata_dict['total_duration_call'] - metadata_dict['total_talktime_call']
    df_inter_call['total_dead_air_call'] = millis_to_mmss(df_inter_call['total_dead_air_call_second'])
    df_inter_call['customer_contact_id'] = audio_metadata['CustomerMetadata']['ContactId']
    df_inter_call['customer_instance_id'] = audio_metadata['CustomerMetadata']['InstanceId']
    df_inter_call['call_job_status'] = audio_metadata['JobStatus']
    df_inter_call['call_language'] = audio_metadata['LanguageCode']
    
    return df_inter_call

# Main Function

In [45]:
os.listdir('transcripts')

['515b317f-21d0-4aae-b88b-b90f00240a13_analysis_2025-01-18T21_53_50Z.json',
 '4abe04de-5d7d-4eb9-a999-99d1c6ef54c1_analysis_2024-12-19T17_43_08Z.json',
 'bedd3484-4bbd-4628-8465-42c80d78903d_analysis_2024-11-14T16_16_37Z.json',
 '4a0a35d3-6221-4ccd-be93-83938c24a544_analysis_2025-01-17T19_04_29Z.json',
 '309a0db8-2735-4537-988a-a66bff37c159_analysis_2025-01-17T21_06_34Z.json',
 'fbe2b590-261d-4e07-8417-a4294693001e_analysis_2025-01-16T20_38_40Z.json',
 '.ipynb_checkpoints',
 '4d8deee2-8062-40e8-b580-d45d79e94abe_analysis_2025-01-15T22_57_07Z.json']

In [46]:
# Fetch the Transcripts list to be processed
# list_transcripts = list_new_transcripts(aws_access_key, aws_secret_key, s3_source_bucket, s3_transcripts_location)

# Begin processing the transcripts for KPIs
transcript_file = "transcripts/4a0a35d3-6221-4ccd-be93-83938c24a544_analysis_2025-01-17T19_04_29Z.json"

In [51]:
audio_metadata, df_intra_call = create_intra_call_df(aws_access_key, aws_secret_key, transcript_file)
df_intra_call.head()

Unnamed: 0,call_id,start_time,end_time,ParticipantId,caption,negative,neutral,positive,sentiment_lable,start_time_second,end_time_second,time_spoken_second,time_silence_second
0,transcripts/4a0a35d3-6221-4ccd-be93-83938c24a5...,00:00,00:01,CUSTOMER,Hello?,4.48,79.400002,16.110001,Neutral,0,1,1,0.0
1,transcripts/4a0a35d3-6221-4ccd-be93-83938c24a5...,00:01,00:07,AGENT,Good afternoon. My name is Maryetta. I'm calli...,1.45,81.830002,16.719999,Neutral,1,7,6,0.0
2,transcripts/4a0a35d3-6221-4ccd-be93-83938c24a5...,00:07,00:09,CUSTOMER,Mariana Ando speaking.,1.67,90.510002,7.83,Neutral,7,9,2,1.0
3,transcripts/4a0a35d3-6221-4ccd-be93-83938c24a5...,00:10,00:14,AGENT,"Yes, sir. Uh, the reason why I was giving you ...",1.13,88.169998,10.71,Neutral,10,14,4,1.0
4,transcripts/4a0a35d3-6221-4ccd-be93-83938c24a5...,00:15,00:15,CUSTOMER,Yes.,7.42,58.110001,34.459999,Neutral,15,15,0,1.0


In [62]:
df_inter_call = create_inter_call_df(gcp_project_id, gcp_prjct_location, df_intra_call, audio_metadata)
df_inter_call

  total_seconds = int(millis / 1000)
  total_seconds = int(millis / 1000)
  total_seconds = int(millis / 1000)
  total_seconds = int(millis / 1000)
  total_seconds = int(millis / 1000)


Unnamed: 0,call_id,call_summary,topic,category,sub_category,agent_coaching,account_id,agent_speech_speed,customer_speech_speed,total_talktime_agent_second,...,total_talktime_call_second,total_talktime_call,total_duration_call_second,total_duration_call,total_dead_air_call_second,total_dead_air_call,customer_contact_id,customer_instance_id,call_job_status,call_language
0,transcripts/4a0a35d3-6221-4ccd-be93-83938c24a5...,Maryetta called Mariana Ando to discuss procee...,Transaction Inquiry,Sales/Customer Service,Order Processing/Account Management,{'strengths': ['Successfully guided the custom...,943787957938,184,179,68808,...,107387,01:47,118739,01:58,11352,00:11,4a0a35d3-6221-4ccd-be93-83938c24a544,553e079f-40b6-4dc2-969a-9c0e6ca357f2,COMPLETED,en-US


In [83]:
# Fetch the Transcripts list to be processed
max_objects = 10
list_transcripts = list_new_transcripts(aws_access_key, aws_secret_key, s3_source_bucket, s3_transcripts_location, max_objects)

2025-01-22 11:01:08+00:00: connect-audio-files/biz-suggested-jsons/309a0db8-2735-4537-988a-a66bff37c159_analysis_2025-01-17T21_06_34Z.json
2025-01-22 11:01:02+00:00: connect-audio-files/biz-suggested-jsons/4a0a35d3-6221-4ccd-be93-83938c24a544_analysis_2025-01-17T19_04_29Z.json
2025-01-22 11:01:10+00:00: connect-audio-files/biz-suggested-jsons/4abe04de-5d7d-4eb9-a999-99d1c6ef54c1_analysis_2024-12-19T17_43_08Z.json
2025-01-22 11:01:05+00:00: connect-audio-files/biz-suggested-jsons/4d8deee2-8062-40e8-b580-d45d79e94abe_analysis_2025-01-15T22_57_07Z.json
2025-01-22 11:01:04+00:00: connect-audio-files/biz-suggested-jsons/515b317f-21d0-4aae-b88b-b90f00240a13_analysis_2025-01-18T21_53_50Z.json
2025-01-22 11:01:03+00:00: connect-audio-files/biz-suggested-jsons/bedd3484-4bbd-4628-8465-42c80d78903d_analysis_2024-11-14T16_16_37Z.json
2025-01-22 11:01:06+00:00: connect-audio-files/biz-suggested-jsons/fbe2b590-261d-4e07-8417-a4294693001e_analysis_2025-01-16T20_38_40Z.json
