In [43]:
import json
import os
import random
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, Markdown
from openai import OpenAI
from sklearn.model_selection import train_test_split
from tqdm import tqdm
%matplotlib inline

In [31]:
# Shared OpenAI client used by the PII-check cells further down.
# NOTE(review): built with no arguments, so credentials presumably come from
# the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

# Step 1: Explore the data

Explore the data in full

## 1. Metadata CSV file

In [3]:
# Load the call metadata and preview its first rows and its schema.
df_call_metadata = pd.read_csv('/Users/sohammistri/Documents/Prodigal-Take-Home/sentiment-take-home/call_metadata.csv')
print(df_call_metadata.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
df_call_metadata.info()

                                    _id     duration            disposition  \
0  1558137f-4ab5-409e-8159-109c2e9358d5  3927.771375    No Pay - Bankruptcy   
1  05c54daf-9bca-4bd7-b833-b3ba8c1e06e0   146.520000  No Pay - Cant pay now   
2  84fb4fec-b36f-43bc-a4ef-a27ba322c146   749.635875       No Pay - Dispute   
3  2133927a-0d11-4d54-98d6-cd21c71417f0   448.104375       No Pay - Dispute   
4  048518c6-c17c-4524-8ff3-6a0b2735863c   108.432000       No Pay - Dispute   

       type  
0  negative  
1  negative  
2  negative  
3  negative  
4  negative  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   _id          2000 non-null   object 
 1   duration     2000 non-null   float64
 2   disposition  2000 non-null   object 
 3   type         2000 non-null   object 
dtypes: float64(1), object(3)
memory usage: 62.6+ KB
None


In [4]:
# Number of rows per `type` value.
print(df_call_metadata['type'].value_counts())

type
negative    1000
positive    1000
Name: count, dtype: int64


In [5]:
# Number of rows per `disposition` value.
print(df_call_metadata['disposition'].value_counts())

disposition
No Pay - Dispute                    439
Promise - Payment in full           313
No Pay - Not right now              296
Promise - Payment plan              256
No Pay - Cant pay now               182
Promise - Settlement in full        163
Promise - One time payment          104
Payment Plan Modification           101
Promise - Settlement in payments     63
No Pay - Can't pay now               42
No Pay - Bankruptcy                  30
No Pay - Cancel payment plan         11
Name: count, dtype: int64


In [6]:
# Cross-tabulate disposition against type: one row per disposition, one column
# per type value, with 0 where a combination never occurs.
type_counts_per_disposition = df_call_metadata.groupby('disposition')['type'].value_counts()
disposition_type_counts = type_counts_per_disposition.unstack(fill_value=0)
print(disposition_type_counts)

type                              negative  positive
disposition                                         
No Pay - Bankruptcy                     30         0
No Pay - Can't pay now                  42         0
No Pay - Cancel payment plan            11         0
No Pay - Cant pay now                  182         0
No Pay - Dispute                       439         0
No Pay - Not right now                 296         0
Payment Plan Modification                0       101
Promise - One time payment               0       104
Promise - Payment in full                0       313
Promise - Payment plan                   0       256
Promise - Settlement in full             0       163
Promise - Settlement in payments         0        63


In [8]:
# Dispositions that appear at least once under each type value.
has_positive_rows = disposition_type_counts['positive'] > 0
has_negative_rows = disposition_type_counts['negative'] > 0
positive_dispositions = set(disposition_type_counts.loc[has_positive_rows].index)
negative_dispositions = set(disposition_type_counts.loc[has_negative_rows].index)

print("Positive disposition types:", positive_dispositions)
print("Negative disposition types:", negative_dispositions)

Positive disposition types: {'Promise - Settlement in full', 'Promise - One time payment', 'Promise - Payment plan', 'Promise - Settlement in payments', 'Promise - Payment in full', 'Payment Plan Modification'}
Negative disposition types: {'No Pay - Not right now', 'No Pay - Cant pay now', 'No Pay - Bankruptcy', 'No Pay - Dispute', 'No Pay - Cancel payment plan', "No Pay - Can't pay now"}


In [9]:
# Number of distinct dispositions collected for each type value.
len(positive_dispositions), len(negative_dispositions)

(6, 6)

### Split dataset into train-val-test

We do an 80-10-10 train-val-test split, ensuring there is no class imbalance in any of the splits.

In [10]:
# train_test_split is already imported in the notebook's top import cell, so
# it is not re-imported here.

# Separate positive and negative samples so each class is split on its own
df_positive = df_call_metadata[df_call_metadata['type'] == 'positive']
df_negative = df_call_metadata[df_call_metadata['type'] == 'negative']

# Target fractions for the train / validation / test splits
train_size = 0.8
val_size = 0.1
test_size = 0.1
# Guard against a typo in the fractions above (tolerance covers float rounding)
assert abs(train_size + val_size + test_size - 1.0) < 1e-9, "split fractions must sum to 1"

def split_df(df, train_size, val_size, test_size, random_state=42):
    """Shuffle ``df`` and split it into train, validation and test parts.

    The split is done in two passes of ``train_test_split``: the first carves
    off the training part, the second divides the remainder between
    validation and test.

    Args:
        df: Data to split; passed straight to ``train_test_split``.
        train_size: Forwarded as ``train_size`` to the first
            ``train_test_split`` call.
        val_size: Validation share. Only its ratio to ``test_size`` is used,
            to divide whatever the first pass left over.
        test_size: Test share. Only its ratio to ``val_size`` is used.
        random_state: Seed used for both passes. Defaults to 42, the value
            that was previously hard-coded.

    Returns:
        Tuple ``(df_train, df_val, df_test)``.

    Raises:
        ValueError: If ``val_size + test_size`` is not positive, since the
            remainder could not be divided between validation and test.
    """
    holdout_share = val_size + test_size
    if holdout_share <= 0:
        raise ValueError("val_size + test_size must be positive")

    df_train, df_temp = train_test_split(df, train_size=train_size, random_state=random_state, shuffle=True)
    # Fraction of the remainder that goes to validation; the rest is test.
    relative_val_size = val_size / holdout_share
    df_val, df_test = train_test_split(df_temp, train_size=relative_val_size, random_state=random_state, shuffle=True)
    return df_train, df_val, df_test

def _combine_and_shuffle(pos_part, neg_part):
    """Stack the two class parts, shuffle the rows (seed 42) and renumber the index."""
    return pd.concat([pos_part, neg_part]).sample(frac=1, random_state=42).reset_index(drop=True)


# Split each class separately with the same fractions
pos_train, pos_val, pos_test = split_df(df_positive, train_size, val_size, test_size)
neg_train, neg_val, neg_test = split_df(df_negative, train_size, val_size, test_size)

# Recombine the per-class parts into the final shuffled splits
train_df = _combine_and_shuffle(pos_train, neg_train)
val_df = _combine_and_shuffle(pos_val, neg_val)
test_df = _combine_and_shuffle(pos_test, neg_test)

# Report the size and class counts of every split
for split_name, split_frame in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    print(f"{split_name} shape: {split_frame.shape}, Positive: {(split_frame['type'] == 'positive').sum()}, Negative: {(split_frame['type'] == 'negative').sum()}")

Train shape: (1600, 4), Positive: 800, Negative: 800
Val shape: (200, 4), Positive: 100, Negative: 100
Test shape: (200, 4), Positive: 100, Negative: 100


In [11]:
# Column, non-null-count and dtype summary of the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   _id          1600 non-null   object 
 1   duration     1600 non-null   float64
 2   disposition  1600 non-null   object 
 3   type         1600 non-null   object 
dtypes: float64(1), object(3)
memory usage: 50.1+ KB


In [12]:
# Write each split to CSV (without the row index), in the same folder as the
# source metadata file.
split_outputs = (
    (train_df, '/Users/sohammistri/Documents/Prodigal-Take-Home/sentiment-take-home/train_df.csv'),
    (val_df, '/Users/sohammistri/Documents/Prodigal-Take-Home/sentiment-take-home/val_df.csv'),
    (test_df, '/Users/sohammistri/Documents/Prodigal-Take-Home/sentiment-take-home/test_df.csv'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)

## 2. Conversations

In [13]:
# `os` is imported in the notebook's top import cell.
conversation_folder = '/Users/sohammistri/Documents/Prodigal-Take-Home/sentiment-take-home/conversations'
# os.listdir returns entries in arbitrary order; sorting keeps the file list
# (and everything derived from it) stable across runs and machines.
txt_files = sorted(f for f in os.listdir(conversation_folder) if f.endswith('.txt'))
print(f"Number of text files: {len(txt_files)}")

Number of text files: 2000


In [16]:
# File stems (name without the '.txt' extension) of every transcript
txt_basenames = {os.path.splitext(fname)[0] for fname in txt_files}

# All _id values listed in the metadata
df_ids = set(df_call_metadata['_id'].tolist())

# The transcripts and the metadata must cover exactly the same ids
assert txt_basenames == df_ids

In [15]:
# Each metadata row must carry a distinct _id.
assert df_call_metadata['_id'].is_unique, "Not all _id values are unique in df_call_metadata"

In [22]:
# Pick one transcript at random and render it as Markdown for a visual check
random_txt = random.choice(txt_files)
txt_path = os.path.join(conversation_folder, random_txt)
with open(txt_path) as f:
    content = f.read()

preview_markdown = f"### {random_txt}\n\n{content}"
display(Markdown(preview_markdown))

### 89cd6569-3d4e-4c45-84e8-c6941f8f624d.txt

agent 00:06-00:16
* * * this call is being recorded and may be monitored for quality assurance purposes and by continuing you're providing consent may i please have your first and last name

borrower 00:19-00:20
* *

agent 00:21-00:24
okay thank you and do you happen to have your agency XYZ account number

borrower 00:25-00:26
no i don't

agent 00:27-00:30
okay no worries did you receive an email text or a letter from us

borrower 00:31-00:33
email text one letter sorry

agent 00:34-00:37
okay got it let me see if i can locate your account what is the year of your birthday please the year

borrower 00:39-00:41
* * * * *

agent 00:42-00:48
okay let me see here if you received the letter let me try to search it that way what is your address to where we send that letter to

borrower 00:50-00:56
* * * * * * * * * * * * *

agent 00:57-01:22
okay thank you okay got it i do see an account so very quickly agency XYZ is a debt collection agency this is an attempt to collect debt and any information obtained will be used for that purpose alright so i do see the account with lvnv funding llc originally with via inc and it has a balance of four hundred ninety dollars and nineteen cents did you wanna pay that in full today

borrower 01:24-01:32
i already sent you a check today certified you guys will be signing to get the cashier check it's coming so you should receive it by tomorrow

agent 01:33-02:07
alright i'll go ahead and make the note on the account and as soon as we receive it we'll send you an email confirmation of the account being closed okay perfect alrighty now a couple of things just to make sure we send it to the correct email we have * your last name * * * is that a good email twelve phone mills perfect so we'll send conns confirmation there and the phone number we have on file is it * * * * * * * * * * is that also accurate

borrower 01:56-01:57
yep yep

borrower 02:07-02:09
yep yep

agent 02:09-02:12
perfect alrighty for now was there anything else i can assist you with

borrower 02:14-02:14
nope

agent 02:15-02:19
perfect alright well i do thank you for calling agency XYZ and i hope you have a great rest of your day

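### Convert every transcript to a structured format

Each conversation is converted into an ordered list of turns, `[{"speaker": "agent", "text": <text>}, {"speaker": "borrower", "text": <text>}, ...]`, and saved as a JSON file next to the original `.txt` file.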

In [27]:
def format_conv_from_content(content):
    """Parse a raw transcript into an ordered list of speaker turns.

    A turn starts at a header line of the form ``agent MM:SS-MM:SS`` or
    ``borrower MM:SS-MM:SS`` (case-insensitive). All non-blank lines up to the
    next header are joined with single spaces to form the turn's text.

    Args:
        content: Full text of one transcript.

    Returns:
        List of ``{"speaker": <lower-cased role>, "text": <joined text>}``
        dicts in transcript order. Headers with no text after them, and any
        text that appears before the first header, are left out.
    """
    header_pattern = re.compile(r'^(agent|borrower)\s+\d{2}:\d{2}-\d{2}:\d{2}$', re.IGNORECASE)
    turns = []
    current_speaker = None
    current_lines = []

    def _flush():
        # Emit the turn collected so far, if it has both a speaker and text.
        if current_speaker and current_lines:
            turns.append({"speaker": current_speaker, "text": " ".join(current_lines)})

    for raw_line in content.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        if header_pattern.match(stripped) is None:
            current_lines.append(stripped)
            continue
        # A new header closes the previous turn and opens the next one.
        _flush()
        current_speaker = stripped.split()[0].lower()
        current_lines = []

    _flush()
    return turns


In [29]:
# Convert every transcript and write the result as <stem>.json in the same
# folder as the .txt file.
for txt_file in txt_files:
    txt_path = os.path.join(conversation_folder, txt_file)
    with open(txt_path) as f:
        content = f.read()
    formatted = format_conv_from_content(content)
    json_stem, _txt_ext = os.path.splitext(txt_path)
    json_path = json_stem + '.json'
    with open(json_path, 'w') as jf:
        jf.write(json.dumps(formatted, indent=2))

# Step 2: Remove Potential PII data

For each conversation, flag potential PII in the call logs. The data mostly looked clean, but it is important to do a sanity check on a sample.

In [61]:
# Prompt template for the PII check. `{conv}` is the only placeholder and is
# filled with the raw transcript via str.format, so any literal braces added
# to this text later must be doubled. The "- Analysis of the conversation:",
# "- Identified PII:" and "- Score:" labels in the Output Format section are
# the markers extract_sections() searches for — keep the two in sync.
pii_filter_prompt = """
# Context:
"You are an AI assistant tasked with identifying and flagging customer Personally Identifiable Information (PII) within a given text. Your role is to act as a data protection filter.

You will be given a conversation between an AI agent and a customer as input. Here is how you should operate:
1.  Analyze the following conversation content provided between an AI agent and a customer.
2.  Your primary objective is to determine if any customer PII is present in the text. PII includes, but is not limited to:
    *   Names (full names, first names, last names)
    *   Email addresses
    *   Phone numbers
    *   Physical addresses (street, city, state, zip code)
    *   Social Security numbers or other government-issued identification numbers
    *   Credit card numbers or financial account information
    *   Dates of birth
    *   Login credentials (usernames, passwords)
    *   Medical information
    *   IP addresses or other unique device identifiers
3. Crucially, you must **ignore any data that has been masked**. Masked data is information that has been partially or fully obscured. Common examples of masking techniques to ignore include:
    - Redaction/Masking Out: Replacing parts of the data with characters like 'X', '*', or '#'. For instance, XXX-XX-1234 or john.doe@******.com.
    - Substitution/Replacement: Replacing sensitive data with fictitious yet realistic-looking data. For example, a real name might be replaced with a randomly generated one.
    - Tokenization/Encryption: Replacing data with a non-sensitive placeholder or an encrypted string.
    - Nulling Out: Replacing sensitive information with null values or blanks.
4. **Ignore Agent Names**: You must differentiate between the agent and the customer and ignore the agent's name. The agent's name may be stated explicitly (e.g., "My name is Alex," "This is SupportBot," or "You're speaking with Sarah."). Your focus is exclusively on identifying the **customer's PII**.
5.  After your analysis, you will provide a binary output:
    *   Return **1** if you identify any potential PII in the conversation that is unmasked.
    *   Return **0** if you do not find any PII in the conversation or **all** potential PII is masked out

# Output Format:
- Analysis of the conversation: <Detailed analysis of the conversation between the AI agent and customer wrt finding potential PII>
- Identified PII: <List of potential customer PII data identified in the conversation. Return None if the input is customer PII free>
- Score: <0 if there is no customer PII in the conversation or all customer PII is masked, 1 otherwise>

IMPORTANT: Remember to always adhere to this output format for consistency

# Input
Input Content: {conv} 

# Output
"""

In [63]:
# Try the PII prompt on a single transcript before running it on every file.
# The transcript is pinned by name so the rendered output is reproducible.
sample_txt_name = 'f8832c53-8b6c-4f73-bcce-b06c4f89b0f3.txt'
txt_path = os.path.join(conversation_folder, sample_txt_name)
with open(txt_path, 'r') as f:
    content = f.read()

# str.format returns a new string and leaves the template untouched, so the
# template does not need to be copied first.
p = pii_filter_prompt.format(conv=content)
response = client.responses.create(
    model="gpt-4.1",
    input=p
)

display(Markdown(response.output_text))


- Analysis of the conversation:  
Upon reviewing the conversation, I analyzed all exchanges between the agent and the borrower (customer). All possible PII fields such as names ("miss *", etc.), email addresses, and other sensitive data were indicated with masking ("*," "* *," etc.) and are therefore not to be considered as actual unmasked PII under the provided guidelines. Additionally, a statement regarding the customer's health status ("i'm going through cancer right now i i'm i'm in radiation") is mentioned, which is medical information and constitutes PII; however, it is not masked and thus counts as unmasked PII. No unmasked names, contact details, or financial/account information are present—the rest is either masked or relates to public company/agent information.

- Identified PII: 
    - Medical information: The customer discloses undergoing cancer treatment and radiation ("i'm going through cancer right now i i'm i'm in radiation i just got a radiation").

- Score: 1

In [65]:
def extract_sections(text):
    """Pull the analysis, identified-PII and score sections out of a model reply.

    Each section is located by its ``- <label>:`` marker. The analysis runs up
    to the ``- Identified PII:`` marker (or the end of the text), the PII
    section up to the ``- Score:`` marker (or the end of the text), and the
    score is the run of digits following ``- Score:``.

    Args:
        text: Raw reply text.

    Returns:
        Tuple ``(analysis, pii, score)``. ``analysis`` and ``pii`` are
        stripped strings and ``score`` is an int; any section whose marker is
        not found is ``None``.
    """
    def _capture(pattern, flags=0):
        # First capture group, whitespace-trimmed, or None when nothing matches.
        found = re.search(pattern, text, flags)
        return found.group(1).strip() if found else None

    analysis = _capture(r'- Analysis of the conversation:\s*(.*?)(?:- Identified PII:|$)', re.DOTALL)
    pii = _capture(r'- Identified PII:\s*(.*?)(?:- Score:|$)', re.DOTALL)
    score_digits = _capture(r'- Score:\s*(\d+)')
    score = None if score_digits is None else int(score_digits)

    return analysis, pii, score

def get_pii_score(prompt, file_path, model="gpt-4.1"):
    """Run the PII prompt on one transcript file and parse the reply.

    The call is best-effort: any failure (unreadable file, request error,
    formatting error) is reported together with the file path and turned into
    ``None``, so one bad file does not abort a batch run.

    Args:
        prompt: Template containing a ``{conv}`` placeholder, filled with the
            file's text via ``str.format``.
        file_path: Path of the transcript to check.
        model: Model name passed to ``client.responses.create``. Defaults to
            ``"gpt-4.1"``, the value that was previously hard-coded.

    Returns:
        Dict with keys ``"file"``, ``"analysis"``, ``"pii"`` and ``"score"``
        (the last three as returned by ``extract_sections``, so each may be
        ``None`` if its section is missing from the reply), or ``None`` if
        anything raised.
    """
    try:
        with open(file_path, 'r') as f:
            content = f.read()

        # str.format builds a new string, so the template needs no copy.
        p = prompt.format(conv=content)
        response = client.responses.create(
            model=model,
            input=p
        )
        analysis, pii, score = extract_sections(response.output_text)
        return {
            "file": file_path,
            "analysis": analysis,
            "pii": pii,
            "score": score
        }
    except Exception as e:
        # Name the file so entries dropped downstream can be traced and retried.
        print(f"get_pii_score failed for {file_path}: {e!r}")
        return None

In [66]:
# Full path of every transcript file; the bare expression below shows how many there are.
txt_paths = [os.path.join(conversation_folder, fname) for fname in txt_files]
len(txt_paths)

2000

In [67]:
# Run the PII check over all transcripts with 10 worker threads. Results are
# appended as each call finishes; entries are None where get_pii_score failed.
results = []

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = []
    for file_path in txt_paths:
        futures.append(executor.submit(get_pii_score, pii_filter_prompt, file_path))
    finished = tqdm(as_completed(futures), total=len(futures), desc="Processing Files")
    for future in finished:
        results.append(future.result())

Processing Files: 100%|██████████| 2000/2000 [09:12<00:00,  3.62it/s]


In [68]:
# Drop the None entries (get_pii_score returns None when a call raised)
# before building the results frame.
results_fil = [res for res in results if res is not None]
results_df = pd.DataFrame(results_fil)

In [69]:
# Number of conversations per parsed score value.
print(results_df['score'].value_counts())

score
0    1894
1     106
Name: count, dtype: int64


In [76]:
# Pick a random position among the rows flagged with score == 1.
# random.randint includes its upper bound, so randint(0, n) can return n and
# make .iloc[i] go out of range; randrange(n) draws from 0..n-1.
flagged_count = len(results_df[results_df['score'] == 1])
i = random.randrange(flagged_count)
i

64

In [77]:
# Render the analysis text of the i-th row among those with score == 1.
display(Markdown(results_df[results_df['score'] == 1].iloc[i]["analysis"]))

The conversation between the agent and the customer involves a debt collection call. The customer is addressed by the name "axel" by the agent, which may indicate the customer's first name is unmasked and present. The agent also attempts to verify sensitive customer information: date of birth, mailing address, and email address; however, in all these instances, the information is masked with asterisks ("* * * * *" or similar formats). There are references to accounts and a transaction amount, but no unmasked account numbers, social security numbers, or other direct PII. The only potential PII that is not masked is the given name "axel," which the agent uses to address the customer, and which the customer does not correct or dispute. All other potentially sensitive information (date of birth, address, email) is completely masked.