# Imports

In [106]:
import pandas as pd
import json
import logging
import re
from transformers import AutoTokenizer
from google.cloud import dlp_v2
import warnings
warnings.filterwarnings("ignore", message="Skipping checksum validation")

# Variables

In [107]:
project_id = "dev-posigen"
secret_id = "dev-cx-voiceai"

In [108]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Util Functions

In [162]:
def process_transcript(
    contact_id,
    transcript_data
):
    """
    Pre-process the transcript loaded from S3 Buckets:
    1. Load the transcript as Pandas Dataframe.
    2. Select only the necessary columns ['BeginOffsetMillis', 'EndOffsetMillis', 'ParticipantId', 'Content'].
    3. Format the time in minutes and seconds.
    4. Rename the columns for better understanding.
    """
    try:
        logger.info(f"{contact_id}: Loading the Transcript as Pandas Dataframe.")
        
        # Load the transcript into a DataFrame
        transcript_df = pd.json_normalize(transcript_data['Transcript'])

        # Select the relevant Columns
        columns_to_select = [
            'BeginOffsetMillis',
            'EndOffsetMillis',
            'ParticipantId',
            'Content'
        ]
        formatted_df = transcript_df[columns_to_select].copy()

        # Rename columns
        formatted_df = formatted_df.rename(columns={
            'BeginOffsetMillis': 'Begin_Offset',
            'EndOffsetMillis': 'End_Offset',
            'Content': 'caption',
            'ParticipantId': 'speaker_tag'
        })

        # Inserting the Call ID and Language Code:
        formatted_df.insert(loc=0, column='contact_id', value=contact_id)
        formatted_df['call_language'] = transcript_data.get('LanguageCode', 'Unknown')

        logger.info(f"{contact_id}: Returning formatted DataFrame.")
        return formatted_df

    except Exception as e:
        logger.error(f"Error processing transcript: {str(e)}")
        raise

In [163]:
with open('868375c1-2111-4990-b9d2-36693c7bad46_analysis_2025-01-06T18_29_18Z.json', 'r') as f:
    transcript_data = json.load(f)

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

contact_id = '868375c1-2111-4990-b9d2-36693c7bad46'
result = process_transcript(contact_id, transcript_data)

INFO:__main__:868375c1-2111-4990-b9d2-36693c7bad46: Loading the Transcript as Pandas Dataframe.
INFO:__main__:868375c1-2111-4990-b9d2-36693c7bad46: Returning formatted DataFrame.


In [164]:
result.head()

Unnamed: 0,contact_id,Begin_Offset,End_Offset,speaker_tag,caption,call_language
0,868375c1-2111-4990-b9d2-36693c7bad46,0,4190,AGENT,Good afternoon. Thank you for calling Positron. This is Brittany on a recorded line. How may I assist you today?,en-US
1,868375c1-2111-4990-b9d2-36693c7bad46,5230,8060,CUSTOMER,"Yes ma'am, this is Regal Liby. How are you doing?",en-US
2,868375c1-2111-4990-b9d2-36693c7bad46,8000,10760,AGENT,"I'm well, thank you for asking, Ms. Liamby. How are you doing today?",en-US
3,868375c1-2111-4990-b9d2-36693c7bad46,11000,12109,CUSTOMER,"Yeah, I'm OK.",en-US
4,868375c1-2111-4990-b9d2-36693c7bad46,12310,14029,AGENT,Mhm. How can I help you today?,en-US


In [220]:
result[['caption']]

# output = None

Unnamed: 0,caption
0,Good afternoon. Thank you for calling Positron. This is Brittany on a recorded line. How may I assist you today?
1,"Yes ma'am, this is Regal Liby. How are you doing?"
2,"I'm well, thank you for asking, Ms. Liamby. How are you doing today?"
3,"Yeah, I'm OK."
4,Mhm. How can I help you today?
5,"Um,"
6,"Well, I'm sure I owe something."
7,"OK, alright, so can you confirm your service address so that I can pull up your account?"
8,911
9,"8 Walter Butte Street Jeanette, Louisiana 70544"


In [166]:
unmasked_caltions = "\n".join(result['caption'])

In [167]:
unmasked_caltions

"Good afternoon. Thank you for calling Positron. This is Brittany on a recorded line. How may I assist you today?\nYes ma'am, this is Regal Liby. How are you doing?\nI'm well, thank you for asking, Ms. Liamby. How are you doing today?\nYeah, I'm OK.\nMhm. How can I help you today?\nUm,\nWell, I'm sure I owe something.\nOK, alright, so can you confirm your service address so that I can pull up your account?\n911\n8 Walter Butte Street Jeanette, Louisiana 70544\nAll right, let's take a look.\nAlright, so yes, you were due on the 1st and the payment amount is $50.72.\nAlright, are you, do you want me to use the Mastercard that we have on file or do you wanna use your bank account?\nNo, ma'am.\nNo ma'am, I'm gonna give you a new card. You said it's $50?\nOK.\n$50.72. Yes, ma'am. 4.\nThey only have one account, right?\nLet me take a look because it did pop up when you called to accounts, um, hold on.\nWell, that other account, I sold my house. I sold it.\nYou make\nOK, so that's probably wh

# Masking Function

## working cvv masking

In [168]:
# def mask_cvv(text, previous_text):
#     """
#     Masks non-standard CVV patterns with [REDACTED].
#     Also masks standalone 3-digit responses after an agent's CVV request.
#     """
#     if not isinstance(text, str):
#         return text

#     # Standard and non-standard CVV patterns
#     cvv_patterns = [
#         r'\b(?:cvv|cvv number|security code|verification code|number behind the card|sign bar number|digits on the back)\s*[:-]?\s*\d{3}\b',
#         r'\b(?:three digits on the back|last 3 numbers|security code behind)\s*[:-]?\s*\d{3}\b',
#         r'\b(?:cvv|security code|verification code)\s*[:-]?\s*(\d\s\d\s\d)\b'
#     ]

#     # Mask CVV if directly mentioned
#     for pattern in cvv_patterns:
#         text = re.sub(pattern, "[REDACTED]", text, flags=re.IGNORECASE)

#     # Context-aware masking: If the previous message was an agent asking for CVV
#     if previous_text and re.search(r'\b(?:cvv|security code|digits on the back|card verification)\b', previous_text, re.IGNORECASE):
#         text = re.sub(r'\b\d{3}\b', "[REDACTED]", text)  # Mask standalone 3-digit numbers

#     return text

# def preprocess_text(text, previous_text):
#     """
#     Applies all placeholder masking functions before passing to DLP.
#     """
#     return mask_cvv(text, previous_text)

# def mask_pii_in_captions(contact_id, df, project_id):
#     """
#     Masks PII data in the 'caption' column of a pandas DataFrame using Google Cloud DLP API.
#     """
#     logger.info(f"{contact_id}: Masking PII Data")

#     masked_df = df.copy()
#     masked_df['original_index'] = masked_df.index
#     masked_df['marked_caption'] = masked_df.index.astype(str) + "|||SEPARATOR|||" + masked_df['caption'].astype(str)

#     # Context-aware preprocessing: Check previous row's text for CVV-related questions
#     masked_df['previous_caption'] = masked_df['caption'].shift(1)  # Shift for context
#     masked_df['marked_caption'] = masked_df.apply(lambda row: preprocess_text(row['marked_caption'], row['previous_caption']), axis=1)
    
#     all_captions = "\n===RECORD_BOUNDARY===\n".join(masked_df['marked_caption'])

#     # Initialize DLP client
#     dlp_client = dlp_v2.DlpServiceClient()
#     parent = f"projects/{project_id}/locations/global"

#     inspect_config = {
#         "info_types": [
#             {"name": "CREDIT_CARD_NUMBER"},
#             {"name": "CREDIT_CARD_EXPIRATION_DATE"},
#             {"name": "STREET_ADDRESS"},
#             {"name": "IP_ADDRESS"},
#             {"name": "DATE_OF_BIRTH"}
#         ],
#         "min_likelihood": dlp_v2.Likelihood.POSSIBLE
#     }

#     deidentify_config = {
#         "info_type_transformations": {
#             "transformations": [
#                 {
#                     "info_types": [
#                         {"name": "CREDIT_CARD_NUMBER"},
#                         {"name": "CREDIT_CARD_EXPIRATION_DATE"},
#                         {"name": "STREET_ADDRESS"},
#                         {"name": "IP_ADDRESS"},
#                         {"name": "DATE_OF_BIRTH"}
#                     ],
#                     "primitive_transformation": {
#                         "replace_config": {"new_value": {"string_value": "[REDACTED]"}}
#                     }
#                 }
#             ]
#         }
#     }

#     # Call the DLP API
#     try:
#         response = dlp_client.deidentify_content(
#             request={"parent": parent, "deidentify_config": deidentify_config, "inspect_config": inspect_config, "item": {"value": all_captions}}
#         )
#     except Exception as e:
#         logger.error(f"{contact_id}: Error in DLP API call: {e}")
#         return df  # Return original DataFrame if masking fails

#     processed_content = response.item.value
#     processed_records = processed_content.split("\n===RECORD_BOUNDARY===\n")

#     processed_dict = {int(parts[0]): parts[1] for record in processed_records if (parts := record.split("|||SEPARATOR|||", 1)) and len(parts) == 2}

#     masked_df['caption'] = masked_df.apply(lambda row: processed_dict.get(row['original_index'], row['caption']), axis=1)
    
#     # Drop temporary columns
#     masked_df.drop(['original_index', 'marked_caption', 'previous_caption'], axis=1, inplace=True)

#     logger.info(f"{contact_id}: Completed Masking PII Data")
#     return masked_df


## working card number masking but not cvv

In [180]:
# def mask_pii_in_captions(
#     contact_id,
#     df,
#     project_id
# ):
#     logger.info(f"{contact_id}: Masking PII Data")
#     masked_df = df.copy()
#     masked_df['original_index'] = masked_df.index
#     masked_df['marked_caption'] = masked_df.index.astype(str) + "|||SEPARATOR|||" + masked_df['caption'].astype(str)
#     all_captions = "\n===RECORD_BOUNDARY===\n".join(masked_df['marked_caption'])
#     dlp_client = dlp_v2.DlpServiceClient()
#     parent = f"projects/{project_id}/locations/global"
#     posigen_dictionary = {
#         "info_type": {"name": "CUSTOM_DICTIONARY_POSIGEN"},
#         "dictionary": {
#             "word_list": {
#                 "words": ["posigen", "Posigen", "PosiGen", "POSIGEN"]
#             }
#         }
#     }
#     custom_cvv_detector = {
#         "info_type": {"name": "CUSTOM_CVV"},
#         "regex": {"pattern": r"(?:CVV|CVC|code|behind the card|near the sign bar)[^\d]*(\d{3,4})"}
#     }
#     inspect_config = {
#         "info_types": [
#             {"name": "CREDIT_CARD_NUMBER"},
#             {"name": "CREDIT_CARD_EXPIRATION_DATE"},
#             {"name": "STREET_ADDRESS"},
#             {"name": "IP_ADDRESS"},
#             {"name": "DATE_OF_BIRTH"}
#         ],
#         "min_likelihood": dlp_v2.Likelihood.POSSIBLE,
#         "custom_info_types": [posigen_dictionary, custom_cvv_detector],
#     }
#     deidentify_config = {
#         "info_type_transformations": {
#             "transformations": [
#                 {
#                     "info_types": [
#                         {"name": "CREDIT_CARD_NUMBER"},
#                         {"name": "CREDIT_CARD_EXPIRATION_DATE"},
#                         {"name": "STREET_ADDRESS"},
#                         {"name": "IP_ADDRESS"},
#                         {"name": "DATE_OF_BIRTH"}
#                     ],
#                     "primitive_transformation": {
#                         "replace_config": {
#                             "new_value": {"string_value": "[REDACTED]"}
#                         }
#                     }
#                 }
#             ]
#         }
#     }
#     item = {"value": all_captions}
#     try:
#         response = dlp_client.deidentify_content(
#             request={
#                 "parent": parent,
#                 "deidentify_config": deidentify_config,
#                 "inspect_config": inspect_config,
#                 "item": item,
#             }
#         )
#     except Exception as e:
#         logger.error(f"{contact_id}: Error in DLP API call: {e}")
#         return df
#     processed_content = response.item.value
#     processed_records = processed_content.split("\n===RECORD_BOUNDARY===\n")
#     processed_dict = {}
#     for record in processed_records:
#         parts = record.split("|||SEPARATOR|||", 1)
#         if len(parts) == 2:
#             idx, content = parts
#             processed_dict[int(idx)] = content
#     masked_df['caption'] = masked_df.apply(
#         lambda row: processed_dict.get(row['original_index'], row['caption']), 
#         axis=1
#     )
#     def mask_consecutive_card_numbers(df):
#         concatenated_number = ""
#         indices_to_mask = []
#         for i, row in df.iterrows():
#             cleaned_caption = row['caption'].replace(" ", "").replace("-", "")
#             if cleaned_caption.isdigit():
#                 concatenated_number += cleaned_caption
#                 indices_to_mask.append(i)
#                 if 13 <= len(concatenated_number) <= 19:
#                     for idx in indices_to_mask:
#                         df.at[idx, 'caption'] = '[REDACTED]'
#                     concatenated_number = ""
#                     indices_to_mask = []
#             else:
#                 concatenated_number = ""
#                 indices_to_mask = []
#         return df
#     masked_df = mask_consecutive_card_numbers(masked_df)
#     masked_df.drop(['original_index', 'marked_caption'], axis=1, inplace=True)
#     logger.info(f"{contact_id}: Completed Masking PII Data")
#     return masked_df

## trial to achieve both

In [213]:
def mask_pii_in_captions(
    contact_id,
    df,
    project_id
):
    logger.info(f"{contact_id}: Masking PII Data")
    masked_df = df.copy()
    masked_df['original_index'] = masked_df.index
    masked_df['marked_caption'] = masked_df.index.astype(str) + "|||SEPARATOR|||" + masked_df['caption'].astype(str)
    all_captions = "\n===RECORD_BOUNDARY===\n".join(masked_df['marked_caption'])
    dlp_client = dlp_v2.DlpServiceClient()
    parent = f"projects/{project_id}/locations/global"
    posigen_dictionary = {
        "info_type": {"name": "CUSTOM_DICTIONARY_POSIGEN"},
        "dictionary": {
            "word_list": {
                "words": ["posigen", "Posigen", "PosiGen", "POSIGEN"]
            }
        }
    }
    custom_cvv_detector = {
        "info_type": {"name": "CUSTOM_CVV"},
        "regex": {"pattern": r"(?:CVV|CVC|code|behind the card|near the sign bar)[^\d]*(\d{3,4})"}
    }
    inspect_config = {
        "info_types": [
            {"name": "CREDIT_CARD_NUMBER"},
            {"name": "CREDIT_CARD_EXPIRATION_DATE"},
            {"name": "STREET_ADDRESS"},
            {"name": "IP_ADDRESS"},
            {"name": "DATE_OF_BIRTH"}
        ],
        "min_likelihood": dlp_v2.Likelihood.POSSIBLE,
        "custom_info_types": [posigen_dictionary, custom_cvv_detector],
    }
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "info_types": [
                        {"name": "CREDIT_CARD_NUMBER"},
                        {"name": "CREDIT_CARD_EXPIRATION_DATE"},
                        {"name": "STREET_ADDRESS"},
                        {"name": "IP_ADDRESS"},
                        {"name": "DATE_OF_BIRTH"},
                        {"name": "CUSTOM_CVV"}
                    ],
                    "primitive_transformation": {
                        "replace_config": {
                            "new_value": {"string_value": "[REDACTED]"}
                        }
                    }
                }
            ]
        }
    }
    item = {"value": all_captions}
    try:
        response = dlp_client.deidentify_content(
            request={
                "parent": parent,
                "deidentify_config": deidentify_config,
                "inspect_config": inspect_config,
                "item": item,
            }
        )
    except Exception as e:
        logger.error(f"{contact_id}: Error in DLP API call: {e}")
        return df
    processed_content = response.item.value
    processed_records = processed_content.split("\n===RECORD_BOUNDARY===\n")
    processed_dict = {}
    for record in processed_records:
        parts = record.split("|||SEPARATOR|||", 1)
        if len(parts) == 2:
            idx, content = parts
            processed_dict[int(idx)] = content
    masked_df['caption'] = masked_df.apply(
        lambda row: processed_dict.get(row['original_index'], row['caption']), 
        axis=1
    )
    def mask_consecutive_card_numbers(df):
        concatenated_number = ""
        indices_to_mask = []
        for i, row in df.iterrows():
            cleaned_caption = row['caption'].replace(" ", "").replace("-", "")
            if cleaned_caption.isdigit():
                concatenated_number += cleaned_caption
                indices_to_mask.append(i)
                if 13 <= len(concatenated_number) <= 19:
                    for idx in indices_to_mask:
                        df.at[idx, 'caption'] = '[REDACTED]'
                    concatenated_number = ""
                    indices_to_mask = []
            else:
                concatenated_number = ""
                indices_to_mask = []
        return df
    masked_df = mask_consecutive_card_numbers(masked_df)
    masked_df.drop(['original_index', 'marked_caption'], axis=1, inplace=True)
    logger.info(f"{contact_id}: Completed Masking PII Data")
    return masked_df


# masking Utils

In [214]:
masked_df=mask_pii_in_captions(
    contact_id,
    result,
    project_id
)
# # Concatenate all captions for bulk processing
# all_captions = "\n".join(result['caption'].astype(str).tolist())
# all_captions

INFO:__main__:868375c1-2111-4990-b9d2-36693c7bad46: Masking PII Data
INFO:__main__:868375c1-2111-4990-b9d2-36693c7bad46: Completed Masking PII Data


In [215]:
# masked_df[['caption']]
masked_df['caption'][56:59]

56              All right, and then the CVV code for that card.
57                                                         914.
58    Alright, do you want me to save this card to the account?
Name: caption, dtype: object

In [216]:
"\n".join(masked_df['caption'][56:59])

'All right, and then the CVV code for that card.\n914.\nAlright, do you want me to save this card to the account?'

In [217]:
# result[54:59]
masked_df[47:54]

Unnamed: 0,contact_id,Begin_Offset,End_Offset,speaker_tag,caption,call_language
47,868375c1-2111-4990-b9d2-36693c7bad46,209600,210529,AGENT,All right.,en-US
48,868375c1-2111-4990-b9d2-36693c7bad46,211050,212970,AGENT,"I'm ready. Yep, I'm ready.",en-US
49,868375c1-2111-4990-b9d2-36693c7bad46,213330,215940,CUSTOMER,[REDACTED],en-US
50,868375c1-2111-4990-b9d2-36693c7bad46,216750,221940,CUSTOMER,[REDACTED],en-US
51,868375c1-2111-4990-b9d2-36693c7bad46,223190,225330,CUSTOMER,[REDACTED],en-US
52,868375c1-2111-4990-b9d2-36693c7bad46,226039,235850,AGENT,"Alright, I'm gonna just repeat it back just to make sure I have it correct. I have [REDACTED].",en-US
53,868375c1-2111-4990-b9d2-36693c7bad46,236330,237309,CUSTOMER,That's right.,en-US


In [218]:
masked_df[55:59]

Unnamed: 0,contact_id,Begin_Offset,End_Offset,speaker_tag,caption,call_language
55,868375c1-2111-4990-b9d2-36693c7bad46,239360,241639,CUSTOMER,March 25.,en-US
56,868375c1-2111-4990-b9d2-36693c7bad46,242449,245550,AGENT,"All right, and then the CVV code for that card.",en-US
57,868375c1-2111-4990-b9d2-36693c7bad46,246350,247919,CUSTOMER,914.,en-US
58,868375c1-2111-4990-b9d2-36693c7bad46,248479,251470,AGENT,"Alright, do you want me to save this card to the account?",en-US


In [219]:
"\n".join(masked_df['caption'])

"Good afternoon. Thank you for calling Positron. This is Brittany on a recorded line. How may I assist you today?\nYes ma'am, this is Regal Liby. How are you doing?\nI'm well, thank you for asking, Ms. Liamby. How are you doing today?\nYeah, I'm OK.\nMhm. How can I help you today?\nUm,\nWell, I'm sure I owe something.\nOK, alright, so can you confirm your service address so that I can pull up your account?\n911\n[REDACTED]\nAll right, let's take a look.\nAlright, so yes, you were due on the 1st and the payment amount is $50.72.\nAlright, are you, do you want me to use the Mastercard that we have on file or do you wanna use your bank account?\nNo, ma'am.\nNo ma'am, I'm gonna give you a new card. You said it's $50?\nOK.\n$50.72. Yes, ma'am. 4.\nThey only have one account, right?\nLet me take a look because it did pop up when you called to accounts, um, hold on.\nWell, that other account, I sold my house. I sold it.\nYou make\nOK, so that's probably why it came up um under your name, but 

In [187]:
unmasked_caltions

"Good afternoon. Thank you for calling Positron. This is Brittany on a recorded line. How may I assist you today?\nYes ma'am, this is Regal Liby. How are you doing?\nI'm well, thank you for asking, Ms. Liamby. How are you doing today?\nYeah, I'm OK.\nMhm. How can I help you today?\nUm,\nWell, I'm sure I owe something.\nOK, alright, so can you confirm your service address so that I can pull up your account?\n911\n8 Walter Butte Street Jeanette, Louisiana 70544\nAll right, let's take a look.\nAlright, so yes, you were due on the 1st and the payment amount is $50.72.\nAlright, are you, do you want me to use the Mastercard that we have on file or do you wanna use your bank account?\nNo, ma'am.\nNo ma'am, I'm gonna give you a new card. You said it's $50?\nOK.\n$50.72. Yes, ma'am. 4.\nThey only have one account, right?\nLet me take a look because it did pop up when you called to accounts, um, hold on.\nWell, that other account, I sold my house. I sold it.\nYou make\nOK, so that's probably wh