# Privacy and Compliance Violation
- Identify all the call IDs where agents have shared sensitive information (balance
or account details) without identity verification (i.e., without verifying the customer's
date of birth, address, or Social Security Number).


In [None]:
import re
import json
from pathlib import Path
from tqdm import tqdm

def get_sensitive_patterns():
    """Return the compiled regexes used to spot sensitive data in an utterance.

    Returns:
        dict[str, re.Pattern]: Maps a category name (e.g. ``"ssn"``,
        ``"phone"``) to a compiled pattern. A category counts as present
        when its pattern finds a match anywhere in the text.
    """
    # A dict literal silently keeps only the LAST value of a repeated key, so
    # the alternative "phone" and "address" formats are merged into a single
    # pattern each instead of being listed twice under the same key.
    return {
        "ssn": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
        "phone": re.compile(
            r"\b(?:\+91[-\s]?|0)?\d{10}\b"  # 10 digits, optional +91 / 0 prefix
            r"|\b\d{3}-\d{3}-\d{4}\b"  # dash-separated 3-3-4 digits
        ),
        "email": re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}\b"),
        "credit_card": re.compile(r"\b(?:\d{4}[-\s]?){3}\d{4}\b"),
        "address": re.compile(
            r"\b\d{1,5}\s\w+\s(?:Street|St|Avenue|Ave|Road|Rd|Lane|Ln)\b"
            r"|\d+\s\w+\s\w+,\s\w+",  # "<number> <word> <word>, <word>"
            re.I,
        ),
        "pan": re.compile(r"\b[A-Z]{5}\d{4}[A-Z]\b"),
        "aadhaar": re.compile(r"\b\d{4}\s\d{4}\s\d{4}\b"),
        "passport": re.compile(r"\b[A-Z]{1}[0-9]{7}\b"),
        "bank_account": re.compile(r"\b\d{9,18}\b"),
        "dob": re.compile(r"\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b"),
        "email_address": re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'),
    }

def get_verification_keywords():
    """Return the lowercase phrases treated as signs of identity verification."""
    keywords = [
        "date of birth",
        "dob",
        "address",
        "ssn",
        "social security number",
        "verification",
    ]
    return keywords

def contains_verification(text):
    """Tell whether ``text`` mentions any verification keyword.

    The comparison is a case-insensitive substring check against the phrases
    from ``get_verification_keywords()``.
    """
    lowered = text.lower()
    for keyword in get_verification_keywords():
        if keyword in lowered:
            return True
    return False

def contains_sensitive(text, patterns):
    """List the names of every pattern that matches somewhere in ``text``.

    Args:
        text: The utterance to scan.
        patterns: Mapping of category name to compiled regex.

    Returns:
        list[str]: Matching category names, in the mapping's iteration order.
    """
    return [label for label, regex in patterns.items() if regex.search(text)]

def check_privacy_violation(convo_json):
    """Find sensitive data an agent disclosed before any identity verification.

    Args:
        convo_json: Iterable of utterance dicts, each providing a ``"speaker"``
            and a ``"text"`` entry (presumably in conversation order).

    Returns:
        list[str]: Unique names of the sensitive-data categories matched in
        agent utterances that precede the first agent utterance mentioning a
        verification keyword, in first-seen order. Empty if nothing matched.
    """
    patterns = get_sensitive_patterns()
    violations = []

    for line in convo_json:
        if line["speaker"].lower() != "agent":
            continue

        text = line["text"]
        # Verification applies to the whole call, not to a single utterance:
        # once the agent has raised it, later disclosures are not violations.
        # (The flag used to be reset after every line, so verification never
        # carried over to the following turns.)
        if contains_verification(text):
            break
        violations.extend(contains_sensitive(text, patterns))

    # dict.fromkeys de-duplicates while keeping a deterministic order,
    # unlike list(set(...)).
    return list(dict.fromkeys(violations))

def process_conversations(folder_path):
    """Run the regex-based privacy check on every ``*.json`` file in a folder.

    Args:
        folder_path: Directory containing one conversation JSON file per call.

    Returns:
        dict: Maps each file's stem to a dict with ``"violation"`` (bool) and
        ``"violation_types"`` (list of category names). Files that fail to
        load or process are reported on stdout and left out of the result.
    """
    results = {}

    for convo_path in tqdm(Path(folder_path).glob("*.json")):
        try:
            convo = json.loads(convo_path.read_text(encoding="utf-8"))
            found = check_privacy_violation(convo)
            results[convo_path.stem] = {
                "violation": bool(found),
                "violation_types": found,
            }
        except Exception as e:
            # Best-effort batch run: report the bad file and keep going.
            print(f"Error in file {convo_path.name}: {e}")

    return results

if __name__ == "__main__":
    # Folder holding the conversation JSON files; adjust to your machine.
    CONVO_FOLDER = "C:/Users/Rounak/Desktop/OneDrive/College/Projects/prodigal-solutions-engineer/artifacts/All_Conversations"

    result = process_conversations(CONVO_FOLDER)

    # Print the first five calls as a quick sanity check.
    for idx, (call_id, info) in enumerate(result.items()):
        if idx >= 5:
            break
        print(f"{call_id}: {info}")


250it [00:00, 2717.56it/s]

00be25b0-458f-4cbf-ae86-ae2ec1f7fba4: {'violation': False, 'violation_types': []}
019b9e97-9575-459e-9893-b59d8c99acef: {'violation': False, 'violation_types': []}
02b08433-58e0-46af-961e-221ba94cb8df: {'violation': False, 'violation_types': []}
02f1cff9-7d47-4168-b1f9-d9fba10847a0: {'violation': True, 'violation_types': ['Amount']}
03132d8f-6a7d-47d0-9540-c6a0ac32d946: {'violation': True, 'violation_types': ['Amount', 'phone']}





# LLM - GROQ Approach

In [133]:
import ast
import json
import os
from pathlib import Path

import yaml
from groq import Groq
from tqdm import tqdm

# Read the Groq API key from the environment instead of hard-coding it: a key
# committed in source is exposed to everyone who can read the file and should
# be revoked. Set GROQ_API_KEY before running this cell.
API_KEY = os.environ.get("GROQ_API_KEY")

def get_groq_client():
    """Build a Groq client configured with the module-level ``API_KEY``."""
    groq_client = Groq(api_key=API_KEY)
    return groq_client

def load_config(path="config.yaml"):
    """Parse the YAML file at ``path`` with ``yaml.safe_load`` and return it.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's contents.
    """
    with open(path, "r") as config_file:
        settings = yaml.safe_load(config_file)
    return settings
    
# Format conversation
def format_conversation(json_data):
    """Render a conversation as text, one ``speaker: text`` line per utterance.

    Args:
        json_data: Iterable of dicts with ``"speaker"`` and ``"text"`` entries.

    Returns:
        str: The rendered lines joined by newlines; empty for no utterances.
    """
    rendered = []
    for turn in json_data:
        rendered.append(f"{turn['speaker']}: {turn['text']}")
    return "\n".join(rendered)


def check_sensitive_information(convo_text, client, model="llama3-70b-8192"):
    """Ask the chat model whether the agent disclosed data without verification.

    Args:
        convo_text: The conversation to judge; it is interpolated into the
            prompt as-is.
        client: Client object exposing ``chat.completions.create``.
        model: Name of the chat model to query.

    Returns:
        list | None: The model's verdict as a list (the prompt asks for
        ``[True]`` or ``[False]``), or ``None`` when the request fails or the
        reply is not a valid Python literal. Failures are printed to stdout.
    """

    prompt = f"""
    I am giving you a conversation between a customer and an agent in JSON format.
    Check strictly if agent have shared balance in any currency or account details of customer without the identity verification of customer (i.e. without verification of date of birth, address, or Social Security Number).
    Strictly give output in this Python list format: 

    [True if shared else False] 

    NOTE: I do not want python code. I do not want any explaination.

    Here is the conversation between the customer and agent in JSON format:
    {convo_text}
    """

    try:
        # API call to the model
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
        )

        # Get the response content
        reply = response.choices[0].message.content.strip()

        # SECURITY: the reply is untrusted model output. It used to be passed
        # to eval(), which executes arbitrary code; literal_eval only accepts
        # plain Python literals and raises on anything else.
        parsed = ast.literal_eval(reply)

        # Tolerate a bare literal such as "True" by wrapping it, so callers
        # can always index the result.
        if not isinstance(parsed, (list, tuple)):
            parsed = [parsed]
        return list(parsed)

    except Exception as e:
        print(f"Error occurred: {e}")
        return None


# Main processing function
def process_all_to_dict(folder_path, client):
    """Run the LLM check on every ``*.json`` conversation file in a folder.

    Args:
        folder_path: Directory containing the conversation JSON files.
        client: Chat client passed through to ``check_sensitive_information``.

    Returns:
        dict: Maps each file's stem to the first element of the model's
        verdict list. Files that cannot be read, or for which no usable
        verdict came back, are reported on stdout and left out.
    """
    results = {}

    for file in tqdm(Path(folder_path).glob("*.json")):
        try:
            with open(file, "r", encoding="utf-8") as f:
                convo_json = json.load(f)

            # The parsed JSON is handed over as-is rather than flattened to
            # text; the prompt describes the conversation as JSON.
            result = check_sensitive_information(convo_json, client)

            # The helper returns None on failure; an empty list has no
            # verdict either. Fail with a clear message instead of letting
            # result[0] raise an opaque TypeError/IndexError.
            if not result:
                raise ValueError("model returned no usable verdict")
            results[file.stem] = result[0]

        except Exception as e:
            # Best-effort batch run: report the file and keep going.
            print(f"Failed on {file.name}: {e}")

    return results

In [134]:
if __name__ == "__main__":
    config = load_config()
    # NOTE(review): the "ProfaneWords" config section and the profanity
    # wording below look carried over from a profanity-detection task; here
    # the values are per-call privacy-violation verdicts. Names are left
    # unchanged because the config file and any later cells may rely on them.
    folder_path = config["ProfaneWords"]["CONVERSATIONS_FOLDER"]
    client = get_groq_client()

    profane_data = process_all_to_dict(folder_path, client)

    # You can now use this dictionary however you want
    print("✅ Profanity dictionary created. Sample:\n")
    # An empty result (no files, or every call failed) would make a bare
    # next() raise StopIteration, so fall back to None and say so.
    sample_key = next(iter(profane_data), None)
    if sample_key is None:
        print("(no conversations were processed)")
    else:
        print(sample_key, ":", profane_data[sample_key])

0it [00:00, ?it/s]

250it [25:06,  6.02s/it]

✅ Profanity dictionary created. Sample:

00be25b0-458f-4cbf-ae86-ae2ec1f7fba4 : False



