In [3]:
import pandas as pd
import json
import logging
from transformers import AutoTokenizer
import re

In [4]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [5]:
def process_transcript(
    contact_id,
    transcript_data,
    tokenizer
):
    """
    Pre-process the transcript loaded from S3 Buckets:
    1. Load the transcript as Pandas Dataframe.
    2. Select only the necessary columns ['BeginOffsetMillis', 'EndOffsetMillis', 'ParticipantId', 'Content'].
    3. Format the time in minutes and seconds.
    4. Rename the columns for better understanding.
    """
    try:
        logger.info(f"{contact_id}: Loading the Transcript as Pandas Dataframe.")
        
        # Load the transcript into a DataFrame
        transcript_df = pd.json_normalize(transcript_data['Transcript'])

        # Select the relevant Columns
        columns_to_select = [
            'BeginOffsetMillis',
            'EndOffsetMillis',
            'ParticipantId',
            'Content'
        ]
        formatted_df = transcript_df[columns_to_select].copy()

        # Rename columns
        formatted_df = formatted_df.rename(columns={
            'BeginOffsetMillis': 'Begin_Offset',
            'EndOffsetMillis': 'End_Offset',
            'Content': 'caption',
            'ParticipantId': 'speaker_tag'
        })

        # Inserting the Call ID and Language Code:
        formatted_df.insert(loc=0, column='contact_id', value=contact_id)
        formatted_df['call_language'] = transcript_data.get('LanguageCode', 'Unknown')

        logger.info(f"{contact_id}: Returning formatted DataFrame.")
        return formatted_df

    except Exception as e:
        logger.error(f"Error processing transcript: {str(e)}")
        raise

In [6]:
with open('868375c1-2111-4990-b9d2-36693c7bad46_analysis_2025-01-06T18_29_18Z.json', 'r') as f:
    transcript_data = json.load(f)

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

contact_id = '868375c1-2111-4990-b9d2-36693c7bad46'
result = process_transcript(contact_id, transcript_data, tokenizer)

INFO:__main__:868375c1-2111-4990-b9d2-36693c7bad46: Loading the Transcript as Pandas Dataframe.
INFO:__main__:868375c1-2111-4990-b9d2-36693c7bad46: Returning formatted DataFrame.


In [7]:
project_id = "dev-posigen"
secret_id = "dev-cx-voiceai"

In [19]:
def mask_pii_in_captions(
    contact_id,
    df,
    project_id
):
    logger.info(f"{contact_id}: Masking PII Data")
    masked_df = df.copy()
    masked_df['original_index'] = masked_df.index
    masked_df['marked_caption'] = masked_df.index.astype(str) + "|||SEPARATOR|||" + masked_df['caption'].astype(str)
    all_captions = "\n===RECORD_BOUNDARY===\n".join(masked_df['marked_caption'])
    dlp_client = dlp_v2.DlpServiceClient()
    parent = f"projects/{project_id}/locations/global"
    posigen_dictionary = {
        "info_type": {"name": "CUSTOM_DICTIONARY_POSIGEN"},
        "dictionary": {
            "word_list": {
                "words": ["posigen", "Posigen", "PosiGen", "POSIGEN"]
            }
        }
    }
    custom_cvv_detector = {
        "info_type": {"name": "CUSTOM_CVV"},
        "regex": {"pattern": r"(?:CVV|CVC|code|behind the card|near the sign bar)[^\d]*(\d{3,4})"}
    }
    inspect_config = {
        "info_types": [
            {"name": "CREDIT_CARD_NUMBER"},
            {"name": "CREDIT_CARD_EXPIRATION_DATE"},
            {"name": "STREET_ADDRESS"},
            {"name": "IP_ADDRESS"},
            {"name": "DATE_OF_BIRTH"}
        ],
        "min_likelihood": dlp_v2.Likelihood.POSSIBLE,
        "custom_info_types": [posigen_dictionary, custom_cvv_detector],
    }
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "info_types": [
                        {"name": "CREDIT_CARD_NUMBER"},
                        {"name": "CREDIT_CARD_EXPIRATION_DATE"},
                        {"name": "STREET_ADDRESS"},
                        {"name": "IP_ADDRESS"},
                        {"name": "DATE_OF_BIRTH"}
                    ],
                    "primitive_transformation": {
                        "replace_config": {
                            "new_value": {"string_value": "[REDACTED]"}
                        }
                    }
                }
            ]
        }
    }
    item = {"value": all_captions}
    try:
        response = dlp_client.deidentify_content(
            request={
                "parent": parent,
                "deidentify_config": deidentify_config,
                "inspect_config": inspect_config,
                "item": item,
            }
        )
    except Exception as e:
        logger.error(f"{contact_id}: Error in DLP API call: {e}")
        return df
    processed_content = response.item.value
    processed_records = processed_content.split("\n===RECORD_BOUNDARY===\n")
    processed_dict = {}
    for record in processed_records:
        parts = record.split("|||SEPARATOR|||", 1)
        if len(parts) == 2:
            idx, content = parts
            processed_dict[int(idx)] = content
    masked_df['caption'] = masked_df.apply(
        lambda row: processed_dict.get(row['original_index'], row['caption']), 
        axis=1
    )
    def mask_consecutive_card_numbers(df):
        concatenated_number = ""
        indices_to_mask = []
        for i, row in df.iterrows():
            cleaned_caption = row['caption'].replace(" ", "").replace("-", "")
            if cleaned_caption.isdigit():
                concatenated_number += cleaned_caption
                indices_to_mask.append(i)
                if 13 <= len(concatenated_number) <= 19:
                    for idx in indices_to_mask:
                        df.at[idx, 'caption'] = '[REDACTED]'
                    concatenated_number = ""
                    indices_to_mask = []
            else:
                concatenated_number = ""
                indices_to_mask = []
        return df
    masked_df = mask_consecutive_card_numbers(masked_df)
    masked_df.drop(['original_index', 'marked_caption'], axis=1, inplace=True)
    logger.info(f"{contact_id}: Completed Masking PII Data")
    return masked_df

In [20]:
masked_df=mask_pii_in_captions(
    contact_id,
    result,
    project_id
)

INFO:__main__:868375c1-2111-4990-b9d2-36693c7bad46: Masking PII Data
INFO:__main__:868375c1-2111-4990-b9d2-36693c7bad46: Completed Masking PII Data


In [21]:
masked_df

Unnamed: 0,contact_id,Begin_Offset,End_Offset,speaker_tag,caption,call_language
0,868375c1-2111-4990-b9d2-36693c7bad46,0,4190,AGENT,Good afternoon. Thank you for calling Positron...,en-US
1,868375c1-2111-4990-b9d2-36693c7bad46,5230,8060,CUSTOMER,"Yes ma'am, this is Regal Liby. How are you doing?",en-US
2,868375c1-2111-4990-b9d2-36693c7bad46,8000,10760,AGENT,"I'm well, thank you for asking, Ms. Liamby. Ho...",en-US
3,868375c1-2111-4990-b9d2-36693c7bad46,11000,12109,CUSTOMER,"Yeah, I'm OK.",en-US
4,868375c1-2111-4990-b9d2-36693c7bad46,12310,14029,AGENT,Mhm. How can I help you today?,en-US
...,...,...,...,...,...,...
76,868375c1-2111-4990-b9d2-36693c7bad46,324250,328359,AGENT,"Alright, Ms. Liby, well thank you so much for ...",en-US
77,868375c1-2111-4990-b9d2-36693c7bad46,328779,330850,CUSTOMER,You too and happy New Year to you.,en-US
78,868375c1-2111-4990-b9d2-36693c7bad46,330170,332630,AGENT,"Happy New Year to you as well, ma'am. Thank you.",en-US
79,868375c1-2111-4990-b9d2-36693c7bad46,332820,333880,CUSTOMER,OK bye bye.,en-US


In [22]:
masked_df['caption'][40:52]

40             So that's gonna be every month from now?
41    Yes, that, so I can tell you your monthly paym...
42    So according to your statement, it's $55.12 bu...
43    Oh, it looks like you had a credit and then $4...
44                So the regular payment would be 5512.
45                                          Yes, ma'am.
46                            OK, alright, card number?
47                                           All right.
48                           I'm ready. Yep, I'm ready.
49                                           [REDACTED]
50                                           [REDACTED]
51                                           [REDACTED]
Name: caption, dtype: object

In [23]:
masked_df['caption'][51:59]

51                                           [REDACTED]
52    Alright, I'm gonna just repeat it back just to...
53                                        That's right.
54                            Alright, expiration date?
55                                            March 25.
56      All right, and then the CVV code for that card.
57                                                 914.
58    Alright, do you want me to save this card to t...
Name: caption, dtype: object