In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv('downloads/combined_emails_with_natural_pii.csv')  # Change the filename if needed
print(df.head())  # See first few emails


                                               email      type
0  Subject: Unvorhergesehener Absturz der Datenan...  Incident
1  Subject: Customer Support Inquiry\n\nSeeking i...   Request
2  Subject: Data Analytics for Investment\n\nI am...   Request
3  Subject: Krankenhaus-Dienstleistung-Problem\n\...  Incident
4  Subject: Security\n\nDear Customer Support, I ...   Request


In [10]:
import re

def mask_entity(text, pattern, entity_type, replacement, entities_list):
    for match in re.finditer(pattern, text):
        start, end = match.span()
        original = match.group()
        text = text[:start] + replacement + text[end:]
        entities_list.append({
            "position": [start, start + len(replacement)],
            "classification": entity_type,
            "entity": original
        })
    return text, entities_list

def mask_pii(email_text):
    masked_entities = []

    # Mask Full Name (Simple Pattern: after "my name is ...")
    name_regex = r'\bmy name is ([A-Za-z\s]+)\b'
    matches = re.finditer(name_regex, email_text, re.IGNORECASE)
    for match in matches:
        start, end = match.span(1)
        original = match.group(1)
        email_text = email_text[:start] + '[full_name]' + email_text[end:]
        masked_entities.append({
            "position": [start, start + len('[full_name]')],
            "classification": "full_name",
            "entity": original
        })


    # Mask Email Address
    email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    email_text, masked_entities = mask_entity(email_text, email_regex, 'email', '[email]', masked_entities)

    # Mask Phone Number (10 digits)
    phone_regex = r'\b\d{10}\b'
    email_text, masked_entities = mask_entity(email_text, phone_regex, 'phone_number', '[phone_number]', masked_entities)

    # Mask Aadhaar
    aadhaar_regex = r'\b\d{4}\s\d{4}\s\d{4}\b'
    email_text, masked_entities = mask_entity(email_text, aadhaar_regex, 'aadhar_num', '[aadhar_num]', masked_entities)

    # Mask Card Number
    card_regex = r'\b(?:\d[ -]*?){13,16}\b'
    email_text, masked_entities = mask_entity(email_text, card_regex, 'credit_debit_no', '[credit_debit_no]', masked_entities)

    # Mask CVV
    cvv_regex = r'\b\d{3}\b'
    email_text, masked_entities = mask_entity(email_text, cvv_regex, 'cvv_no', '[cvv_no]', masked_entities)

    # Mask Expiry Date (MM/YY)
    expiry_regex = r'\b(0[1-9]|1[0-2])\/\d{2}\b'
    email_text, masked_entities = mask_entity(email_text, expiry_regex, 'expiry_no', '[expiry_no]', masked_entities)

    # Mask Date of Birth (DD/MM/YYYY)
    dob_regex = r'\b(0[1-9]|[12][0-9]|3[01])\/(0[1-9]|1[012])\/\d{4}\b'
    email_text, masked_entities = mask_entity(email_text, dob_regex, 'dob', '[dob]', masked_entities)

    return email_text, masked_entities


In [5]:
import os

# Create folder if it does not exist
if not os.path.exists('saved_models'):
    os.makedirs('saved_models')


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import joblib

# Prepare inputs and outputs
X = df['email']
y = df['type']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Train model
model.fit(X_train, y_train)

# Save model
joblib.dump(model, 'saved_models/email_classifier.pkl')
print("✅ Model trained and saved.")


✅ Model trained and saved.


In [15]:
from sklearn.metrics import classification_report

# Predict on the test set
y_pred = model.predict(X_test)

# Generate precision, recall, f1-score
report = classification_report(y_test, y_pred)

print("✅ Classification Report:\n")
print(report)


✅ Classification Report:

              precision    recall  f1-score   support

      Change       0.97      0.07      0.13       479
    Incident       0.61      0.99      0.75      1920
     Problem       0.38      0.01      0.02      1009
     Request       0.78      0.91      0.84      1392

    accuracy                           0.67      4800
   macro avg       0.68      0.50      0.44      4800
weighted avg       0.65      0.67      0.56      4800



In [12]:
# Load trained model
model = joblib.load('saved_models/email_classifier.pkl')

def classify_email(email_text):
    masked_email, masked_entities = mask_pii(email_text)
    predicted_category = model.predict([masked_email])[0]

    return {
        "input_email_body": email_text,
        "list_of_masked_entities": masked_entities,
        "masked_email": masked_email,
        "category_of_the_email": predicted_category
    }


In [13]:
sample_email = """
Hello Support Team,

My name is Rahul Sharma. My email is rahul123@gmail.com and phone number is 9876543210. 
My card number is 1234 5678 9012 3456, CVV 123, and expiry 06/24. My Aadhaar is 1234 5678 9123.

Please help me with a billing issue.

Thanks,
Rahul
"""

result = classify_email(sample_email)
print(result)


{'input_email_body': '\nHello Support Team,\n\nMy name is Rahul Sharma. My email is rahul123@gmail.com and phone number is 9876543210. \nMy card number is 1234 5678 9012 3456, CVV 123, and expiry 06/24. My Aadhaar is 1234 5678 9123.\n\nPlease help me with a billing issue.\n\nThanks,\nRahul\n', 'list_of_masked_entities': [{'position': [33, 44], 'classification': 'full_name', 'entity': 'Rahul Sharma'}, {'position': [58, 65], 'classification': 'email', 'entity': 'rahul123@gmail.com'}, {'position': [86, 100], 'classification': 'phone_number', 'entity': '9876543210'}, {'position': [121, 133], 'classification': 'aadhar_num', 'entity': '1234 5678 9012'}, {'position': [183, 195], 'classification': 'aadhar_num', 'entity': '1234 5678 9123'}, {'position': [144, 152], 'classification': 'cvv_no', 'entity': '123'}, {'position': [165, 176], 'classification': 'expiry_no', 'entity': '06/24'}], 'masked_email': '\nHello Support Team,\n\nMy name is [full_name]. My email is [email] and phone number is [pho

In [14]:
sample_email = """
Hello Team,

My name is Suresh Kumar. My email is suresh.kumar@example.com and my contact number is 9876543210.
My card number is 1234 5678 9101 1121, CVV 456, expiry 09/25, and Aadhaar 1234 5678 9012.

Please solve my billing issue.

Thanks,
Suresh
"""

# Run the pipeline
result = classify_email(sample_email)

# Pretty print JSON output
import json
print(json.dumps(result, indent=4))


{
    "input_email_body": "\nHello Team,\n\nMy name is Suresh Kumar. My email is suresh.kumar@example.com and my contact number is 9876543210.\nMy card number is 1234 5678 9101 1121, CVV 456, expiry 09/25, and Aadhaar 1234 5678 9012.\n\nPlease solve my billing issue.\n\nThanks,\nSuresh\n",
    "list_of_masked_entities": [
        {
            "position": [
                25,
                36
            ],
            "classification": "full_name",
            "entity": "Suresh Kumar"
        },
        {
            "position": [
                50,
                57
            ],
            "classification": "email",
            "entity": "suresh.kumar@example.com"
        },
        {
            "position": [
                83,
                97
            ],
            "classification": "phone_number",
            "entity": "9876543210"
        },
        {
            "position": [
                117,
                129
            ],
            "classification": "a