In [53]:
import numpy as np
import pandas as pd

# Load Dataset

In [54]:
# Created another dataset using a sample from original enron dataset
df = pd.read_csv('emails_sample.csv')

In [55]:
df.shape

(10000, 2)

In [56]:
df.head()

Unnamed: 0,file,message
0,shackleton-s/sent/1912.,Message-ID: <21013688.1075844564560.JavaMail.e...
1,farmer-d/logistics/1066.,Message-ID: <22688499.1075854130303.JavaMail.e...
2,parks-j/deleted_items/202.,Message-ID: <27817771.1075841359502.JavaMail.e...
3,stokley-c/chris_stokley/iso/client_rep/41.,Message-ID: <10695160.1075858510449.JavaMail.e...
4,germany-c/all_documents/1174.,Message-ID: <27819143.1075853689038.JavaMail.e...


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   file     10000 non-null  object
 1   message  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


# Inspect Data

In [58]:
for i in range(5):
    print(df.iloc[0,0], df.iloc[0,1])
    print('-'*30)

shackleton-s/sent/1912. Message-ID: <21013688.1075844564560.JavaMail.evans@thyme>
Date: Tue, 29 Aug 2000 01:26:00 -0700 (PDT)
From: sara.shackleton@enron.com
To: william.bradford@enron.com
Subject: Re: Credit Derivatives
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Sara Shackleton
X-To: William S Bradford
X-cc: 
X-bcc: 
X-Folder: \Sara_Shackleton_Dec2000_June2001_1\Notes Folders\Sent
X-Origin: SHACKLETON-S
X-FileName: sshackle.nsf

Bill:  Thanks for the info.   I also spoke with Jeff about how 
EnronCredit.com Ltd. was going to work since Dennis O'Connell (London lawyer) 
is responsible for that group.  Maybe you will be able to clarify which of 
Jeff's "positions" will be hedges and which will be backed to EnronCredit.  
Maybe Rod will be handling most of Jeff's credit.  I'd appreciate an update.  
Sara



	William S Bradford
	08/29/2000 07:24 AM
		
		 To: Sara Shackleton/HOU/ECT@ECT
		 cc: Mark Taylor/HOU/ECT@ECT, Paul Radous/Co

# Solution

In [36]:
from openai import OpenAI
import re
from faker import Faker
from dotenv import load_dotenv
import os
from datetime import datetime

In [45]:
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
fake = Faker('en_IN')  # Indian context
Faker.seed(42)

In [49]:
fake.name()

'Udant Dewan'

In [52]:
fake.city()

'Patna'

In [59]:
def mask_pii(text):
    # Mask emails, phones, dates
    text = re.sub(r'\S+@enron\.com', '[EMAIL]', text)
    text = re.sub(r'\(\d{3}\) \d{3}-\d{4}', '[PHONE]', text)
    text = re.sub(r'\d{4}-\d{2}-\d{2}', '[DATE]', text)
    return text

In [41]:
client = OpenAI(
    api_key = openai.api_key
)

In [42]:
def generate_synthetic_email(original_email):
    prompt = f"""
    Rewrite this email as if from Agriculture India Pvt. Ltd. Replace all PII, company names, and industry terms. Use Indian names, INR currency, and agriculture context. Retain email structure and intent. 

    Original Email:
    {original_email}
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

In [43]:
print(df.iloc[0,1])

Message-ID: <21013688.1075844564560.JavaMail.evans@thyme>
Date: Tue, 29 Aug 2000 01:26:00 -0700 (PDT)
From: sara.shackleton@enron.com
To: william.bradford@enron.com
Subject: Re: Credit Derivatives
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Sara Shackleton
X-To: William S Bradford
X-cc: 
X-bcc: 
X-Folder: \Sara_Shackleton_Dec2000_June2001_1\Notes Folders\Sent
X-Origin: SHACKLETON-S
X-FileName: sshackle.nsf

Bill:  Thanks for the info.   I also spoke with Jeff about how 
EnronCredit.com Ltd. was going to work since Dennis O'Connell (London lawyer) 
is responsible for that group.  Maybe you will be able to clarify which of 
Jeff's "positions" will be hedges and which will be backed to EnronCredit.  
Maybe Rod will be handling most of Jeff's credit.  I'd appreciate an update.  
Sara



	William S Bradford
	08/29/2000 07:24 AM
		
		 To: Sara Shackleton/HOU/ECT@ECT
		 cc: Mark Taylor/HOU/ECT@ECT, Paul Radous/Corp/Enron@ENRON, Rod 
Nel

In [32]:
original_email = df.iloc[0,1]

In [33]:
masked_email = mask_pii(original_email)
print(masked_email)

Message-ID: <21013688.1075844564560.JavaMail.evans@thyme>
Date: Tue, 29 Aug 2000 01:26:00 -0700 (PDT)
From: [EMAIL]
To: [EMAIL]
Subject: Re: Credit Derivatives
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Sara Shackleton
X-To: William S Bradford
X-cc: 
X-bcc: 
X-Folder: \Sara_Shackleton_Dec2000_June2001_1\Notes Folders\Sent
X-Origin: SHACKLETON-S
X-FileName: sshackle.nsf

Bill:  Thanks for the info.   I also spoke with Jeff about how 
EnronCredit.com Ltd. was going to work since Dennis O'Connell (London lawyer) 
is responsible for that group.  Maybe you will be able to clarify which of 
Jeff's "positions" will be hedges and which will be backed to EnronCredit.  
Maybe Rod will be handling most of Jeff's credit.  I'd appreciate an update.  
Sara



	William S Bradford
	08/29/2000 07:24 AM
		
		 To: Sara Shackleton/HOU/ECT@ECT
		 cc: Mark Taylor/HOU/ECT@ECT, Paul Radous/Corp/Enron@ENRON, Rod 
Nelson/LON/ECT@ECT
		 Subject: Re: Credi

In [44]:
synthetic_email = generate_synthetic_email(masked_email)
print(synthetic_email)

Message-ID: <21013688.1075844564560.JavaMail.ravi@thyme>
Date: Tue, 29 Aug 2020 01:26:00 -0700 (PDT)
From: [EMAIL]
To: [EMAIL]
Subject: Re: Crop Insurance Policies
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Suresh Ranganathan
X-To: Vinod Sharma
X-cc: 
X-bcc: 
X-Folder: \Suresh_Ranganathan_Dec2020_June2021_1\Notes Folders\Sent
X-Origin: RANGANATHAN-S
X-FileName: srangan.nsf

Vinod: Thanks for the info. I also spoke with Jagdish about how 
KrishiBima.com Pvt. Ltd. was going to work since Ramesh Patel (Mumbai lawyer) 
is responsible for that group. Maybe you will be able to clarify which of 
Jagdish's "policies" will be hedges and which will be backed to KrishiBima.  
Maybe Rajesh will be handling most of Jagdish's credit.  I'd appreciate an update.  
Suresh



	Vinod Sharma
	08/29/2020 07:24 AM
		
		 To: Suresh Ranganathan/HYD/AIPL@AIPL
		 cc: Prashant Kumar/HYD/AIPL@AIPL, Anil Mehta/Corp/KrishiBima@KRISHIBIMA, Rajesh 
Kumar/MUM/A

# Findings and Further improvements

1. Masking needs to be improved
2. Faker is not used but can be included for masking PII
3. Nested replies are handled well
4. "credit derivatives" were changed to "crop insurance policies," which makes sense for an agriculture context
5. "total return bond trade" became "total return bond trade" again, which might not fit the agriculture theme
6. Relying solely on the LLM for all replacements might not be reliable. Maybe a hybrid approach where regex and Faker handle more PII before the LLM step would be better.
7. Then, the LLM prompt can focus on contextual changes rather than basic PII replacement, which should already be handled by the masking function.