In [31]:
import json
import logging
import re

import pandas as pd
from google.cloud import dlp_v2
from transformers import AutoTokenizer

In [28]:
# Google Cloud project id; passed to mask_pii_in_captions, which uses it to
# build the DLP parent resource path.
project_id = "dev-posigen"
# NOTE(review): not referenced in the visible cells -- presumably the name of a
# secret used elsewhere in the notebook; confirm or remove.
secret_id = "dev-cx-voiceai"

In [2]:
# Configure root logging at INFO level so the progress messages emitted by the
# helper functions below show up in the cell output.
logging.basicConfig(level=logging.INFO)
# Notebook-wide logger used by process_transcript and mask_pii_in_captions.
logger = logging.getLogger(__name__)

# Util Functions

In [3]:
def process_transcript(
    contact_id,
    transcript_data
):
    """
    Convert a raw call-transcript payload into a tidy pandas DataFrame.

    Steps:
    1. Flatten ``transcript_data['Transcript']`` into a DataFrame.
    2. Keep only 'BeginOffsetMillis', 'EndOffsetMillis', 'ParticipantId' and 'Content'.
    3. Rename them to 'Begin_Offset', 'End_Offset', 'speaker_tag' and 'caption'.
       Offset values are passed through unchanged (no time formatting is applied).
    4. Add 'contact_id' as the first column and 'call_language' as the last one
       (taken from ``transcript_data['LanguageCode']``, 'Unknown' when absent).

    Args:
        contact_id: Identifier of the call; stored in every row and used as log prefix.
        transcript_data (dict): Parsed transcript JSON with a 'Transcript' list and an
            optional 'LanguageCode' entry.

    Returns:
        pandas.DataFrame: Columns ['contact_id', 'Begin_Offset', 'End_Offset',
        'speaker_tag', 'caption', 'call_language'], one row per transcript segment.
        An empty 'Transcript' list yields an empty DataFrame with these columns.

    Raises:
        KeyError: If 'Transcript' is missing, or a non-empty transcript lacks one of
            the required fields. Any exception is logged and re-raised.
    """
    # Source fields to keep, in output order
    columns_to_select = [
        'BeginOffsetMillis',
        'EndOffsetMillis',
        'ParticipantId',
        'Content'
    ]

    try:
        logger.info(f"{contact_id}: Loading the Transcript as Pandas Dataframe.")

        # Load the transcript into a DataFrame
        transcript_df = pd.json_normalize(transcript_data['Transcript'])

        # A transcript without segments normalizes to a frame with no columns at all;
        # give it the expected schema instead of failing on the column selection below.
        if len(transcript_df.index) == 0:
            logger.warning(f"{contact_id}: Transcript has no segments; returning an empty DataFrame.")
            transcript_df = pd.DataFrame(columns=columns_to_select)

        # Select the relevant columns (copy so later edits never touch transcript_df)
        formatted_df = transcript_df[columns_to_select].copy()

        # Rename columns
        formatted_df = formatted_df.rename(columns={
            'BeginOffsetMillis': 'Begin_Offset',
            'EndOffsetMillis': 'End_Offset',
            'Content': 'caption',
            'ParticipantId': 'speaker_tag'
        })

        # Insert the call id (first column) and the language code (last column)
        formatted_df.insert(loc=0, column='contact_id', value=contact_id)
        formatted_df['call_language'] = transcript_data.get('LanguageCode', 'Unknown')

        logger.info(f"{contact_id}: Returning formatted DataFrame.")
        return formatted_df

    except Exception as e:
        # Prefix with the contact id, like every other log line, so failures are traceable
        logger.error(f"{contact_id}: Error processing transcript: {str(e)}")
        raise

In [None]:
# (Scratch cell removed: it evaluated contact_id / transcript_data before the
# following cell defines them, which raises NameError on Restart & Run All.)

In [7]:
# Call whose analysis file is loaded and processed in this notebook
contact_id = '868375c1-2111-4990-b9d2-36693c7bad46'

# The analysis file name starts with the contact id, so derive it instead of
# repeating the id. Read explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding.
transcript_path = f"{contact_id}_analysis_2025-01-06T18_29_18Z.json"
with open(transcript_path, 'r', encoding='utf-8') as f:
    transcript_data = json.load(f)

# Hugging Face model id and its tokenizer
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Formatted transcript: one row per segment
result = process_transcript(contact_id, transcript_data)

INFO:__main__:868375c1-2111-4990-b9d2-36693c7bad46: Loading the Transcript as Pandas Dataframe.
INFO:__main__:868375c1-2111-4990-b9d2-36693c7bad46: Returning formatted DataFrame.


In [35]:
# print(result)

In [27]:
# Ensure all rows and columns are displayed
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Prevent column wrapping
pd.set_option('display.max_colwidth', None)

# Print the entire DataFrame
# print(result)
result.head()

Unnamed: 0,contact_id,Begin_Offset,End_Offset,speaker_tag,caption,call_language
0,868375c1-2111-4990-b9d2-36693c7bad46,0,4190,AGENT,Good afternoon. Thank you for calling Positron. This is Brittany on a recorded line. How may I assist you today?,en-US
1,868375c1-2111-4990-b9d2-36693c7bad46,5230,8060,CUSTOMER,"Yes ma'am, this is Regal Liby. How are you doing?",en-US
2,868375c1-2111-4990-b9d2-36693c7bad46,8000,10760,AGENT,"I'm well, thank you for asking, Ms. Liamby. How are you doing today?",en-US
3,868375c1-2111-4990-b9d2-36693c7bad46,11000,12109,CUSTOMER,"Yeah, I'm OK.",en-US
4,868375c1-2111-4990-b9d2-36693c7bad46,12310,14029,AGENT,Mhm. How can I help you today?,en-US


# Masking Function

In [49]:
def mask_pii_in_captions(
    contact_id,
    df,
    project_id,
    mask_all_digits=False
):
    """
    Masks PII data in the 'caption' column of a pandas DataFrame using Google Cloud DLP API.

    All captions are joined into one text payload (each prefixed with its row
    position), de-identified in a single DLP request, then split back and written
    to the 'caption' column of a copy of ``df``. Findings of the configured
    infoTypes are replaced with "[REDACTED]".

    Args:
        contact_id: Identifier for logging purposes
        df (pandas.DataFrame): DataFrame with a 'caption' column to process
        project_id (str): Your Google Cloud project ID
        mask_all_digits (bool): When True, additionally replace every remaining
            digit with '*' (the "[REDACTED]" markers are left intact). Defaults to
            False, which matches the previous behaviour.

    Returns:
        pandas.DataFrame: DataFrame with masked PII in the 'caption' column.
        If the DLP call fails, the error is logged and the ORIGINAL, unmasked
        ``df`` is returned -- callers that must never forward raw PII need to
        check the logs or compare the result against the input.
    """
    logger.info(f"{contact_id}: Masking PII Data")

    # Create a copy of the DataFrame to avoid modifying the original
    masked_df = df.copy()

    # Nothing to mask: skip the API round-trip entirely
    if len(masked_df.index) == 0:
        logger.info(f"{contact_id}: Completed Masking PII Data")
        return masked_df

    record_boundary = "\n===RECORD_BOUNDARY===\n"
    marker_separator = "|||SEPARATOR|||"
    redaction_token = "[REDACTED]"

    # Prefix each caption with its row POSITION (not its index label) so records can
    # be matched after processing even when the index is non-integer or has duplicates.
    original_captions = masked_df['caption'].tolist()
    marked_captions = [
        f"{position}{marker_separator}{str(caption)}"
        for position, caption in enumerate(original_captions)
    ]

    # Concatenate all captions for bulk processing
    # NOTE(review): the whole transcript goes out in one request; confirm long calls
    # stay within the DLP content size limit.
    all_captions = record_boundary.join(marked_captions)

    # Initialize DLP client
    dlp_client = dlp_v2.DlpServiceClient()

    # Specify the parent resource name
    parent = f"projects/{project_id}/locations/global"

    # InfoTypes to detect and redact; shared by the inspect and de-identify configs
    pii_info_types = [
        {"name": "CREDIT_CARD_NUMBER"},
        {"name": "CREDIT_CARD_EXPIRATION_DATE"},
        {"name": "STREET_ADDRESS"},
        {"name": "IP_ADDRESS"},
        {"name": "DATE_OF_BIRTH"}
    ]

    # Company-name spellings that must never be redacted
    company_name_variants = ["posigen", "Posigen", "PosiGen", "POSIGEN"]

    # Inspection config. The exclusion rule is attached to the infoTypes being
    # redacted: any of their findings whose full text is a company-name variant is
    # dropped. (Attaching it to a custom dictionary detector, as before, only
    # cancelled that detector and protected nothing.)
    inspect_config = {
        "info_types": pii_info_types,
        "min_likelihood": dlp_v2.Likelihood.POSSIBLE,
        "rule_set": [
            {
                "info_types": pii_info_types,
                "rules": [
                    {
                        "exclusion_rule": {
                            "matching_type": dlp_v2.MatchingType.MATCHING_TYPE_FULL_MATCH,
                            "dictionary": {
                                "word_list": {
                                    "words": company_name_variants
                                }
                            }
                        }
                    }
                ]
            }
        ]
    }

    # Configure deidentification to use "[REDACTED]" instead of asterisks
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "info_types": pii_info_types,
                    "primitive_transformation": {
                        "replace_config": {
                            "new_value": {"string_value": redaction_token}
                        }
                    }
                }
            ]
        }
    }

    # Create deidentify request
    item = {"value": all_captions}

    # Call the DLP API
    try:
        response = dlp_client.deidentify_content(
            request={
                "parent": parent,
                "deidentify_config": deidentify_config,
                "inspect_config": inspect_config,
                "item": item,
            }
        )
    except Exception as e:
        logger.error(f"{contact_id}: Error in DLP API call: {e}")
        return df  # Return original DataFrame if masking fails

    # Split the processed payload back into records and map position -> masked caption
    processed_by_position = {}
    for record in response.item.value.split(record_boundary):
        marker, found_separator, content = record.partition(marker_separator)
        if not found_separator:
            continue
        try:
            processed_by_position[int(marker)] = content
        except ValueError:
            # Marker was altered by the service; the row is reported as unmatched below
            continue

    # Rows that could not be recovered keep their original (unmasked) text -- say so loudly
    unmatched = [
        position
        for position in range(len(original_captions))
        if position not in processed_by_position
    ]
    if unmatched:
        logger.warning(
            f"{contact_id}: {len(unmatched)} caption(s) could not be matched in the "
            f"DLP response; keeping their original text"
        )

    # Update the DataFrame with redacted content
    masked_df['caption'] = [
        processed_by_position.get(position, caption)
        for position, caption in enumerate(original_captions)
    ]

    # Optional extra pass: mask all remaining digits with asterisks
    if mask_all_digits:
        masked_df['caption'] = masked_df['caption'].apply(
            lambda text: _mask_digits(text, redaction_token)
        )

    logger.info(f"{contact_id}: Completed Masking PII Data")
    return masked_df


def _mask_digits(text, redaction_token="[REDACTED]"):
    """Replaces digits with asterisks while preserving the redaction markers."""
    if not isinstance(text, str):
        return text
    return redaction_token.join(
        re.sub(r'\d', '*', part) for part in text.split(redaction_token)
    )

# Apply Masking

In [51]:
# Run the DLP masking over the formatted transcript; `result` itself is left untouched
masked_df = mask_pii_in_captions(contact_id=contact_id, df=result, project_id=project_id)

INFO:__main__:868375c1-2111-4990-b9d2-36693c7bad46: Masking PII Data
INFO:__main__:868375c1-2111-4990-b9d2-36693c7bad46: Completed Masking PII Data


In [53]:
# Show the masked captions as a one-column frame for visual inspection
masked_df.loc[:, ['caption']]

0                                                                                                                                                                                                                                                                                Good afternoon. Thank you for calling Positron. This is Brittany on a recorded line. How may I assist you today?
1                                                                                                                                                                                                                                                                                                                                               Yes ma'am, this is Regal Liby. How are you doing?
2                                                                                                                                                                                                                                   