In [None]:
#Load raw Enron JSON
import json
import os
import re

import pandas as pd

file_path = "../data/raw/cleaned_enron_emails.json"

# Parse the whole JSON document into memory.
with open(file_path, mode="r", encoding="utf-8") as f:
    data = json.load(f)

# The export may wrap the records in a {"root": ...} envelope; unwrap it if so,
# otherwise use the parsed document as-is.
wrapped = isinstance(data, dict) and "root" in data
data = data["root"] if wrapped else data

df = pd.DataFrame(data)

# Preview the first rows as the cell's output.
df.head()


Unnamed: 0,From,To,Subject,Date,Body,ThreadKey,Filename
0,,,,,,::,.DS_Store
1,,,,,,::,.DS_Store
2,msagel@home.com,jarnold@enron.com,Status,"Thu, 16 Nov 2000 09:30:00 -0800",John:\n?\nI'm not really sure what happened be...,"status::Thu, 16 Nov 2000 09:30:00 -0800",36.
3,slafontaine@globalp.com,john.arnold@enron.com,re:summer inverses,"Fri, 08 Dec 2000 05:05:00 -0800",i suck-hope youve made more money in natgas la...,"summer inverses::Fri, 08 Dec 2000 05:05:00 -0800",19.
4,iceoperations@intcx.com,"icehelpdesk@intcx.com, internalmarketing@intcx...",The WTI Bullet swap contracts,"Tue, 15 May 2001 09:43:00 -0700","Hi,\n\n\nFollowing the e-mail you have receive...","the wti bullet swap contracts::Tue, 15 May 200...",50.


In [3]:
# Data cleaning (noise removal)
# Remove macOS junk files
is_junk_file = df["Filename"] == ".DS_Store"

# Remove empty bodies. Missing bodies (None/NaN) are treated as empty too:
# a bare `df["Body"].str.strip() != ""` evaluates to True for NaN, which would
# keep rows that have no text at all.
has_body = df["Body"].fillna("").astype(str).str.strip() != ""

# Filter and renumber in one expression so `df` is rebound to a new frame
# instead of being mutated in place.
df = df[~is_junk_file & has_body].reset_index(drop=True)


In [4]:
#Date normalization (timezone-safe)
# Parse the raw date strings as timezone-aware UTC timestamps; values that
# cannot be parsed become NaT (errors="coerce") instead of raising.
parsed_dates = pd.to_datetime(
    df["Date"],
    errors="coerce",
    utc=True
)

# Build new frames with assign() rather than setting columns on a frame that
# may itself be the result of a filter; this sidesteps pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(Date=parsed_dates).dropna(subset=["Date"])

# NOTE: year/month are read from the UTC-converted timestamp, so they can
# differ from the sender's local calendar date near month/year boundaries.
df = df.assign(
    year=df["Date"].dt.year,
    month=df["Date"].dt.month,
)


In [5]:
# Text preprocessing 
def preprocess_text(text):
    """Normalize free text to lowercase alphanumeric words separated by single spaces.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN are treated as missing; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        Lowercased text containing only ``a-z``, ``0-9`` and single spaces,
        with no leading or trailing whitespace. Missing input yields ``""``.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Strip punctuation/symbols first ...
    text = re.sub(r"[^a-z0-9\s]", "", text)
    # ... then collapse whitespace, so the gap left by a removed character
    # (e.g. "a - b") does not survive as a double space.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Normalized copies of the body and subject; a missing subject is treated as
# an empty string before normalization.
df["clean_body"] = df["Body"].map(preprocess_text)
df["clean_subject"] = df["Subject"].fillna("").map(preprocess_text)


In [6]:
# Feature engineering
def _count_recipients(to_field):
    """Return the number of non-empty, comma-separated entries in a To value.

    Missing values (None / NaN) and empty strings count as 0 recipients, and
    empty segments produced by stray commas are ignored.
    """
    # NaN is the only float that is not equal to itself.
    if to_field is None or (isinstance(to_field, float) and to_field != to_field):
        return 0
    return sum(1 for part in str(to_field).split(",") if part.strip())


# Length, in characters, of the normalized body text.
df["email_length"] = df["clean_body"].apply(len)

df["num_recipients"] = df["To"].apply(_count_recipients)

# Everything after the last "@" of the sender value; "unknown" when the value
# is not a string or contains no "@".
df["sender_domain"] = df["From"].apply(
    lambda x: x.split("@")[-1] if isinstance(x, str) and "@" in x else "unknown"
)


In [7]:
# Weak supervision (label creation)
# Lowercase terms whose presence in an email marks it as "important".
IMPORTANT_KEYWORDS = [
    "meeting",
    "deadline",
    "urgent",
    "contract",
    "invoice",
    "report",
    "approval",
    "schedule",
]

def label_importance(body, subject, keywords=None):
    """Return 1 if the subject or body contains any keyword, else 0.

    Matching is a case-insensitive substring test, so a keyword such as
    "report" also matches "reports" or "reporter".

    Parameters
    ----------
    body : str
        Email body text.
    subject : str
        Email subject text.
    keywords : iterable of str, optional
        Terms to look for. Defaults to the module-level IMPORTANT_KEYWORDS.

    Returns
    -------
    int
        1 when at least one keyword occurs in "<subject> <body>", otherwise 0.
    """
    if keywords is None:
        keywords = IMPORTANT_KEYWORDS
    # Lowercase both sides so the check also works on text that has not been
    # normalized beforehand.
    combined = f"{subject} {body}".lower()
    return int(any(kw.lower() in combined for kw in keywords))

# Label every email from its cleaned body and subject. Iterating the two
# columns directly avoids DataFrame.apply(axis=1), which constructs a Series
# object for each row and is slow on large frames.
df["label"] = [
    label_importance(body, subject)
    for body, subject in zip(df["clean_body"], df["clean_subject"])
]


In [8]:
# Class balance of the weak labels: number of emails labelled 1 vs 0.
df["label"].value_counts()

label
0    320189
1    196604
Name: count, dtype: int64

In [9]:
# Save ML-ready dataset
output_path = "../data/processed/emails_with_labels.csv"

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

print("Saved preprocessed dataset to:", output_path)


Saved preprocessed dataset to: ../data/processed/emails_with_labels.csv
