# Profanity Detection

1. Identify all the call ids where collection agents have used profane language.
2. Identify all the call ids where borrowers have used profane language.

In [None]:
import json
import os
import re
from collections import defaultdict
import yaml

# PROFANE_WORDS_PATH = 'artifacts\\profane-words.json'
# CONVERSATIONS_FOLDER = 'C:/Users/Rounak/Desktop/OneDrive/College/Projects/prodigal-solutions-engineer/artifacts/All_Conversations'

In [40]:
def load_config(path='config.yaml'):
    with open(path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

In [22]:
def load_profane_words(path):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [11]:
def load_all_conversations(folder_path):
    conversations = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                conversations[filename] = json.load(f)
    return conversations

In [12]:
def build_profanity_regex(profane_words):
    # Escape special characters and join into a regex pattern
    pattern = r'\b(?:' + '|'.join(re.escape(word.lower()) for word in profane_words) + r')\b'
    return re.compile(pattern, flags=re.IGNORECASE)

def detect_profanity(conversation, pattern):
    agent_count = 0
    customer_count = 0

    for utterance in conversation:
        speaker = utterance.get("speaker", "").lower()
        text = utterance.get("text", "").lower()

        matches = pattern.findall(text)

        if speaker == "agent":
            agent_count += len(matches)
        elif speaker == "customer" or speaker == "borrower":
            customer_count += len(matches)

    return {
        "agent": {
            "count": agent_count
        },
        "customer": {
            "count": customer_count
        }
    }

In [52]:
import csv

def main():
    config = load_config()
    config_paths = config['ProfaneWords']

    # root_path = 

    profane_words = load_profane_words(config_paths['PROFANE_WORDS_PATH'])
    pattern = build_profanity_regex(profane_words)
    
    all_convos = load_all_conversations(config_paths['CONVERSATIONS_FOLDER'])

    results = []

    for file_id, convo in all_convos.items():
        result = detect_profanity(convo, pattern)
        agent_count = result["agent"]["count"]
        customer_count = result["customer"]["count"]

        results.append({
            "Call ID": file_id.split('.')[0],
            "Agent Profane": agent_count > 0,
            "Customer Profane": customer_count > 0,
            "Agent Profane Count": agent_count,
            "Customer Profane Count": customer_count
        })

    # Write to CSV
    with open("profanity_summary.csv", "w", newline="", encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=results[0].keys())
        writer.writeheader()
        writer.writerows(results)

    # Optional: Print
    for r in results:
        print(r)

if __name__ == "__main__":
    main()

{'Call ID': '00be25b0-458f-4cbf-ae86-ae2ec1f7fba4', 'Agent Profane': False, 'Customer Profane': False, 'Agent Profane Count': 0, 'Customer Profane Count': 0}
{'Call ID': '019b9e97-9575-459e-9893-b59d8c99acef', 'Agent Profane': False, 'Customer Profane': False, 'Agent Profane Count': 0, 'Customer Profane Count': 0}
{'Call ID': '02b08433-58e0-46af-961e-221ba94cb8df', 'Agent Profane': False, 'Customer Profane': False, 'Agent Profane Count': 0, 'Customer Profane Count': 0}
{'Call ID': '02f1cff9-7d47-4168-b1f9-d9fba10847a0', 'Agent Profane': False, 'Customer Profane': False, 'Agent Profane Count': 0, 'Customer Profane Count': 0}
{'Call ID': '03132d8f-6a7d-47d0-9540-c6a0ac32d946', 'Agent Profane': False, 'Customer Profane': False, 'Agent Profane Count': 0, 'Customer Profane Count': 0}
{'Call ID': '04bec80f-8614-484b-8ba2-831ff9dd03ef', 'Agent Profane': True, 'Customer Profane': False, 'Agent Profane Count': 3, 'Customer Profane Count': 0}
{'Call ID': '05731aab-0e72-46c1-8fd3-dc50fb20c758', '

## LLM - GROQ Approach

In [None]:
API_KEY = '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'
from groq import Groq

In [None]:
import os
import json
import yaml
from pathlib import Path
from groq import Groq
from tqdm import tqdm

# Load config.yaml
def load_config(path="config.yaml"):
    with open(path, "r") as f:
        return yaml.safe_load(f)

# Initialize Groq Client
def get_groq_client():
    return Groq(api_key=API_KEY)

# Format conversation
def format_conversation(json_data):
    return "\n".join([f"{x['speaker']}: {x['text']}" for x in json_data])

# Profanity detection
def check_profanity(convo_text, client, model="llama3-70b-8192"):
    prompt = f"""Does this conversation contain any profanity or abusive language? 
Give the answer strictly in the following Python list format:

[
  (True if Agent used any profane words else False),
  (True if Customer used any profane words else False),
  [List of profane words used by Agent, empty list if none],
  [List of profane words used by Customer, empty list if none]
]

Here is the conversation:
{convo_text}
"""
    try:
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
        )
        reply = response.choices[0].message.content.strip()
        return eval(reply)
    except Exception as e:
        print("Error:", e)
        return [False, False, [], []]

# Main processing function
def process_all_to_dict(folder_path, client):
    results = {}

    for file in tqdm(Path(folder_path).glob("*.json")):
        try:
            with open(file, "r", encoding="utf-8") as f:
                convo_json = json.load(f)

            convo_text = format_conversation(convo_json)
            result = check_profanity(convo_text, client)

            results[file.stem] = {
                "agent_profane": result[0],
                "customer_profane": result[1],
                "agent_words": result[2],
                "customer_words": result[3],
                "agent_word_count": len(result[2]),
                "customer_word_count": len(result[3])
            }

        except Exception as e:
            print(f"Failed on {file.name}: {e}")

    return results

# Run
if __name__ == "__main__":
    config = load_config()
    folder_path = config["ProfaneWords"]["CONVERSATIONS_FOLDER"]
    client = get_groq_client()

    profane_data = process_all_to_dict(folder_path, client)
    
    # You can now use this dictionary however you want
    print("✅ Profanity dictionary created. Sample:\n")
    sample_key = next(iter(profane_data))
    print(sample_key, ":", profane_data[sample_key])


In [70]:
profane_data

{'00be25b0-458f-4cbf-ae86-ae2ec1f7fba4': {'agent_profane': False,
  'customer_profane': False,
  'agent_words': [],
  'customer_words': [],
  'agent_word_count': 0,
  'customer_word_count': 0},
 '019b9e97-9575-459e-9893-b59d8c99acef': {'agent_profane': False,
  'customer_profane': False,
  'agent_words': [],
  'customer_words': [],
  'agent_word_count': 0,
  'customer_word_count': 0},
 '02b08433-58e0-46af-961e-221ba94cb8df': {'agent_profane': False,
  'customer_profane': False,
  'agent_words': [],
  'customer_words': [],
  'agent_word_count': 0,
  'customer_word_count': 0},
 '02f1cff9-7d47-4168-b1f9-d9fba10847a0': {'agent_profane': False,
  'customer_profane': False,
  'agent_words': [],
  'customer_words': [],
  'agent_word_count': 0,
  'customer_word_count': 0},
 '03132d8f-6a7d-47d0-9540-c6a0ac32d946': {'agent_profane': False,
  'customer_profane': False,
  'agent_words': [],
  'customer_words': [],
  'agent_word_count': 0,
  'customer_word_count': 0},
 '04bec80f-8614-484b-8ba2-831f

In [83]:
def profane_llm_implementation(convo_text, client, model="llama3-70b-8192"):
    results = []
    convo_text = open(convo_text, 'r', encoding='utf-8')
    convo_text = json.load(convo_text)
    convo_text = format_conversation(convo_text)
    print(convo_text)
    
    prompt = f"""Does this conversation contain any profanity or abusive language? 
                Give the answer strictly in the following Python list format:

                [
                (True if Agent used any profane words else False),
                (True if Customer used any profane words else False),
                [List of profane words used by Agent, empty list if none],
                [List of profane words used by Customer, empty list if none]
                ]

                Here is the conversation:
                {convo_text}
                """
    try:
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
        )
        reply = response.choices[0].message.content.strip()
        reply = eval(reply)
        print(reply)
        results.append({
            "Call ID": convo_text.split('.')[0],
            "Agent Profane": reply[0],
            "Customer Profane": reply[1],
            "Agent Profane Words": reply[2],
            "Customer Profane Words": reply[3],
            "Agent Profane Count": len(reply[2]),
            "Customer Profane Count": len(reply[3])
        })
        return results[0]

    except Exception as e:
        print("Error:", e)
        return [False, False, [], []]
    
convo_text = "artifacts/All_Conversations/0b6979e4-8c05-49e1-b7a7-94d85a627df5.json"
client = get_groq_client()
profane_llm_implementation(convo_text, client)

Agent: Hello, this is Mark from XYZ Collections. Am I speaking with Jessica? fuck
Customer: Yes, this is Jessica. What is this about?
Agent: I’m calling regarding an outstanding balance on your account. Could you please verify your address for me?
Customer: Sure, it's 123 Elm Street, Springfield.
Agent: Thank you. I see here that you have a balance of $450 due. When do you plan to make a payment?
Customer: I didn't know I had a balance. Can you provide more details?
Agent: Sure! This balance is from services rendered last month. Can I assist you with the payment plan?
Customer: I need to check my finances first before committing.
Agent: Of course, take your time. Just to confirm, do you want to set a follow-up call next week?
Customer: Yes, that sounds good.
Agent: Alright, I will call you next Wednesday. Thank you for your time, Jessica.
Customer: Thank you, Mark.
Agent: Goodbye!
[True, False, ['fuck'], []]


{'Call ID': 'Agent: Hello, this is Mark from XYZ Collections',
 'Agent Profane': True,
 'Customer Profane': False,
 'Agent Profane Words': ['fuck'],
 'Customer Profane Words': [],
 'Agent Profane Count': 1,
 'Customer Profane Count': 0}