# Data Loading

In [1]:
import tarfile
import email
import pandas as pd
from email import policy
from email.parser import BytesParser

In [2]:
# Accumulators that are filled while the archive is scanned
count = 0
emails = list()

# Location of the gzipped mail archive to read
archive_path = "enron_mail_20150507.tar.gz"

# Parse plain text from email object
def _decode_text_payload(part):
    """Decode one message part's payload to ``str``, or return ``None``.

    ``get_payload(decode=True)`` undoes the Content-Transfer-Encoding and
    yields bytes; it yields ``None`` for multipart containers. Anything that
    is not bytes is reported as ``None`` so the caller can skip it.
    """
    payload = part.get_payload(decode=True)
    if not isinstance(payload, bytes):
        return None

    # Honour the charset declared by the part. latin1 is the fallback because
    # it maps every byte to a character and therefore can never fail.
    charset = part.get_content_charset() or 'latin1'
    try:
        return payload.decode(charset)
    except (LookupError, ValueError):
        # Unknown / non-text codec name, or bytes that are invalid for the
        # declared charset (e.g. 8-bit data labelled us-ascii).
        return payload.decode('latin1')


def get_text_from_email(msg):
    """Return the plain-text content of an email message as a single string.

    Parameters
    ----------
    msg : email.message.Message
        Parsed message, e.g. the result of ``email.message_from_bytes``.

    Returns
    -------
    str
        For multipart messages: every ``text/plain`` part found by
        ``msg.walk()``, decoded and concatenated without a separator.
        For single-part messages: the decoded payload, whatever its content
        type. Parts whose payload cannot be obtained are skipped, so the
        result may be the empty string.
    """
    if msg.is_multipart():
        candidates = [
            part for part in msg.walk()
            if part.get_content_type() == 'text/plain'
        ]
    else:
        candidates = [msg]

    parts = []
    for part in candidates:
        text = _decode_text_payload(part)
        if text is not None:
            parts.append(text)
    return ''.join(parts)

In [3]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every message to `emails`.
emails = []
count = 0
skipped = []  # (member name, repr of the error) for members that failed

# Headers copied verbatim into each record, in output column order.
HEADER_FIELDS = [
    "Message-ID", "Date", "From", "To", "Subject",
    "Mime-Version", "Content-Type", "Content-Transfer-Encoding",
    "X-From", "X-To", "X-cc", "X-bcc",
    "X-Folder", "X-Origin", "X-FileName",
]

with tarfile.open(archive_path, "r:gz") as tar:
    # Iterating the TarFile streams members one by one instead of loading the
    # full member list up front.
    for member in tar:
        if not member.isfile() or member.name.endswith('/'):
            continue

        try:
            # Close each extracted handle as soon as its bytes are read.
            with tar.extractfile(member) as f:
                raw_email = f.read()
            msg = email.message_from_bytes(raw_email)

            email_data = {"file": member.name}
            for header in HEADER_FIELDS:
                email_data[header] = msg.get(header)  # None when the header is absent
            email_data["content"] = get_text_from_email(msg)
        except Exception as exc:
            # Loading stays best-effort, but failures are recorded instead of
            # vanishing silently.
            skipped.append((member.name, repr(exc)))
            continue

        emails.append(email_data)
        count += 1

if skipped:
    print(f"Skipped {len(skipped)} unreadable archive members; first: {skipped[0]}")

In [4]:
# Assemble the parsed records into a single table
df_emails = pd.DataFrame(data=emails)

# Report (rows, columns), then preview the first records
n_rows, n_cols = df_emails.shape
print((n_rows, n_cols))
df_emails.head()

(517401, 17)


Unnamed: 0,file,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content
0,maildir/blair-l/personnel___promotions/1.,<8012132.1075853083164.JavaMail.evans@thyme>,"Fri, 14 Sep 2001 14:05:43 -0700 (PDT)",fran.fagan@enron.com,lynn.blair@enron.com,FW: Promotions and Transfers- Gas Logistics,1.0,text/plain; charset=us-ascii,7bit,"Fagan, Fran </O=ENRON/OU=NA/CN=RECIPIENTS/CN=F...","Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=L...",,,"\LBLAIR (Non-Privileged)\Blair, Lynn\Personnel...",Blair-L,LBLAIR (Non-Privileged).pst,\n\n\n\nName\t\t\tNew Title\t\t\t\tEffective D...
1,maildir/blair-l/meetings___nng_customer_mtg/8.,<15791711.1075853024135.JavaMail.evans@thyme>,"Mon, 10 Sep 2001 10:33:15 -0700 (PDT)",maggie.matheson@enron.com,"randy.bryan@enron.com, robert.benningfield@enr...",Customer Training,1.0,text/plain; charset=us-ascii,7bit,"Matheson, Maggie </O=ENRON/OU=NA/CN=RECIPIENTS...","Bryan, Randy </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...","Holmes, Bradley </O=ENRON/OU=NA/CN=RECIPIENTS/...",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Meetings ...",Blair-L,LBLAIR (Non-Privileged).pst,\n\nFirst I want to thank you for all of your ...
2,maildir/blair-l/meetings___nng_customer_mtg/16.,<15061964.1075853024388.JavaMail.evans@thyme>,"Mon, 20 Aug 2001 12:34:19 -0700 (PDT)",sharon.brown@enron.com,"e..anderson@enron.com, vicki.berg@enron.com, l...",Winter Ops - KC Hotel Information,1.0,text/plain; charset=us-ascii,7bit,"Brown, Sharon </O=ENRON/OU=NA/CN=RECIPIENTS/CN...","Anderson, Gary E. </O=ENRON/OU=NA/CN=RECIPIENT...","Armstrong, Julie </O=ENRON/OU=NA/CN=RECIPIENTS...",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Meetings ...",Blair-L,LBLAIR (Non-Privileged).pst,\t\nBelow listed is the contact information fo...
3,maildir/blair-l/meetings___nng_customer_mtg/13.,<4937308.1075853024299.JavaMail.evans@thyme>,"Thu, 5 Jul 2001 08:35:29 -0700 (PDT)",lynn.blair@enron.com,sharon.brown@enron.com,Winter Ops Meeting,1.0,text/plain; charset=us-ascii,7bit,"Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=L...","Brown, Sharon </O=ENRON/OU=NA/CN=RECIPIENTS/CN...","Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=L...",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Meetings ...",Blair-L,LBLAIR (Non-Privileged).pst,"\t\t\n\tSharon, could add the following compan..."
4,maildir/blair-l/meetings___nng_customer_mtg/14.,<1237601.1075853024323.JavaMail.evans@thyme>,"Thu, 26 Jul 2001 08:16:14 -0700 (PDT)",sharon.brown@enron.com,"larry.steward@enron.com, toby.kuehl@enron.com,...",Draft - Winter Ops Agenda,1.0,text/plain; charset=us-ascii,7bit,"Brown, Sharon </O=ENRON/OU=NA/CN=RECIPIENTS/CN...","Steward, Larry </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Winters, Ricki </O=ENRON/OU=NA/CN=RECIPIENTS/C...",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Meetings ...",Blair-L,LBLAIR (Non-Privileged).pst,\nAttached is a draft of the agenda for the Wi...


# Data Cleaning

In [5]:
# Step 1: strip the trailing parenthesised timezone label, e.g. " (PDT)"
tz_label_pattern = r'\s+\(.*\)'
df_emails['clean_date'] = df_emails['Date'].str.replace(tz_label_pattern, '', regex=True)

# Step 2: parse into UTC timestamps; values that cannot be parsed become NaT
df_emails['Date'] = pd.to_datetime(df_emails['clean_date'], utc=True, errors='coerce')

# Remove the MIME bookkeeping headers together with the temporary helper column
columns_to_drop = ['Mime-Version', 'Content-Type', 'Content-Transfer-Encoding', 'clean_date']
df_emails = df_emails.drop(columns=columns_to_drop)

df_emails.head()

Unnamed: 0,file,Message-ID,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content
0,maildir/blair-l/personnel___promotions/1.,<8012132.1075853083164.JavaMail.evans@thyme>,2001-09-14 21:05:43+00:00,fran.fagan@enron.com,lynn.blair@enron.com,FW: Promotions and Transfers- Gas Logistics,"Fagan, Fran </O=ENRON/OU=NA/CN=RECIPIENTS/CN=F...","Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=L...",,,"\LBLAIR (Non-Privileged)\Blair, Lynn\Personnel...",Blair-L,LBLAIR (Non-Privileged).pst,\n\n\n\nName\t\t\tNew Title\t\t\t\tEffective D...
1,maildir/blair-l/meetings___nng_customer_mtg/8.,<15791711.1075853024135.JavaMail.evans@thyme>,2001-09-10 17:33:15+00:00,maggie.matheson@enron.com,"randy.bryan@enron.com, robert.benningfield@enr...",Customer Training,"Matheson, Maggie </O=ENRON/OU=NA/CN=RECIPIENTS...","Bryan, Randy </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...","Holmes, Bradley </O=ENRON/OU=NA/CN=RECIPIENTS/...",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Meetings ...",Blair-L,LBLAIR (Non-Privileged).pst,\n\nFirst I want to thank you for all of your ...
2,maildir/blair-l/meetings___nng_customer_mtg/16.,<15061964.1075853024388.JavaMail.evans@thyme>,2001-08-20 19:34:19+00:00,sharon.brown@enron.com,"e..anderson@enron.com, vicki.berg@enron.com, l...",Winter Ops - KC Hotel Information,"Brown, Sharon </O=ENRON/OU=NA/CN=RECIPIENTS/CN...","Anderson, Gary E. </O=ENRON/OU=NA/CN=RECIPIENT...","Armstrong, Julie </O=ENRON/OU=NA/CN=RECIPIENTS...",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Meetings ...",Blair-L,LBLAIR (Non-Privileged).pst,\t\nBelow listed is the contact information fo...
3,maildir/blair-l/meetings___nng_customer_mtg/13.,<4937308.1075853024299.JavaMail.evans@thyme>,2001-07-05 15:35:29+00:00,lynn.blair@enron.com,sharon.brown@enron.com,Winter Ops Meeting,"Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=L...","Brown, Sharon </O=ENRON/OU=NA/CN=RECIPIENTS/CN...","Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=L...",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Meetings ...",Blair-L,LBLAIR (Non-Privileged).pst,"\t\t\n\tSharon, could add the following compan..."
4,maildir/blair-l/meetings___nng_customer_mtg/14.,<1237601.1075853024323.JavaMail.evans@thyme>,2001-07-26 15:16:14+00:00,sharon.brown@enron.com,"larry.steward@enron.com, toby.kuehl@enron.com,...",Draft - Winter Ops Agenda,"Brown, Sharon </O=ENRON/OU=NA/CN=RECIPIENTS/CN...","Steward, Larry </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Winters, Ricki </O=ENRON/OU=NA/CN=RECIPIENTS/C...",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Meetings ...",Blair-L,LBLAIR (Non-Privileged).pst,\nAttached is a draft of the agenda for the Wi...


In [6]:
import re

def extract_display_name(x):
    """Extract a human-readable display name from an address-style header.

    The name is the text that precedes the first ``<`` or ``@``. Matching
    surrounding quotes are removed, and a two-part ``"Last, First"`` name is
    flipped to ``"First Last"``.

    Parameters
    ----------
    x : object
        Header value. Anything that is not a ``str`` (e.g. ``None``/NaN for a
        missing header) yields ``""``.

    Returns
    -------
    str
        The display name. When the value starts with ``<`` or ``@`` (nothing
        precedes the address), the stripped input is returned unchanged.
        Only the first name is returned when the header lists several
        recipients.
    """
    if not isinstance(x, str):
        return ""

    match = re.match(r"^([^<@]+)", x)
    if not match:
        return x.strip()

    name = match.group(1).strip()

    # Drop one pair of matching surrounding quotes ('"Smith, John"') so they
    # neither leak into the result nor break the "Last, First" flip below.
    if len(name) >= 2 and name[0] == name[-1] and name[0] in "\"'":
        name = name[1:-1].strip()

    # If it's "Last, First", flip it
    if ',' in name:
        parts = [p.strip() for p in name.split(',')]
        if len(parts) == 2:
            # strip() avoids a stray space when one side is empty ("Smith,")
            return f"{parts[1]} {parts[0]}".strip()
    return name

In [7]:
# Derive a Name-* column from each of the X-* address headers
for suffix in ('From', 'To', 'cc', 'bcc'):
    df_emails['Name-' + suffix] = df_emails['X-' + suffix].apply(extract_display_name)

In [8]:
import re

def clean_email_body(text):
    """Flatten an email body to a single line restricted to a safe character set.

    Parameters
    ----------
    text : object
        Raw body text. Anything that is not a ``str`` yields ``""``.

    Returns
    -------
    str
        The text with every whitespace run (newlines, tabs, ...) collapsed to
        one space, every character outside ``a-zA-Z0-9.,!?$%:;/@#'"()-`` and
        space removed, and leading/trailing spaces stripped.

    Notes
    -----
    The character filter is lossy: accented letters and symbols such as
    ``&``, ``_`` or ``+`` are dropped.
    """
    if not isinstance(text, str):
        return ""
    # \s already covers '\n' and '\t', so one pass replaces the line-break and
    # tab handling as well as the whitespace normalisation.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9.,!?$%:;/@#\'\"()\- ]', '', text)  # optional: remove weird characters
    # Removing a character that sat between two spaces ("a * b") leaves a
    # double space behind, so collapse space runs once more after filtering.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# Normalise every message body
cleaned_bodies = df_emails['content'].apply(clean_email_body)
df_emails['content'] = cleaned_bodies

# Drop noisy columns if needed
source_only_columns = [
    'file',
    'X-From', 'X-To', 'X-cc', 'X-bcc',
    'X-Folder', 'X-Origin', 'X-FileName',
]
df_emails = df_emails.drop(columns=source_only_columns)

In [9]:
import pandas as pd

# Step 3: keep only the rows whose date is not null
has_valid_date = df_emails['Date'].notnull()
df_emails = df_emails[has_valid_date].copy()

# Step 4: remove the timezone information, make sure the dtype is datetime,
# then keep only the calendar date of each timestamp
naive_timestamps = df_emails['Date'].dt.tz_localize(None)
df_emails['Date'] = pd.to_datetime(naive_timestamps).dt.date

In [10]:
# Discard messages whose body is longer than one million characters
MAX_CONTENT_CHARS = 1000000
body_lengths = df_emails['content'].str.len()
df_emails = df_emails[body_lengths <= MAX_CONTENT_CHARS]

In [11]:
# Keep only the first row seen for every Message-ID
df_emails = df_emails.drop_duplicates(subset='Message-ID', keep='first')

In [12]:
# Clean columns for parquet export - convert any non-string objects to strings
def clean_column_for_parquet(series):
    """Return `series` with every value rendered as a string.

    Parameters
    ----------
    series : pandas.Series
        Column to convert; may hold mixed Python objects and missing values.

    Returns
    -------
    pandas.Series
        Same index; each non-missing value is ``str(value)``. Missing values
        (``None``/NaN/NaT) become the empty string ``""`` rather than the
        literal text ``"None"``/``"nan"`` that a plain ``astype(str)`` would
        produce, matching how the other cleaning helpers in this notebook
        represent absent text.
    """
    as_text = series.astype(str)
    # Keep the converted text where the original had a value, "" elsewhere.
    return as_text.where(series.notna(), "")

# Clean problematic columns (only those actually present in the frame)
columns_to_clean = ['To', 'From', 'Subject', 'Message-ID', 'Date']
for col in [name for name in columns_to_clean if name in df_emails.columns]:
    df_emails[col] = df_emails[col].pipe(clean_column_for_parquet)

# Also clean any remaining object columns
object_columns = df_emails.select_dtypes(include=['object']).columns
for col in object_columns:
    df_emails[col] = df_emails[col].pipe(clean_column_for_parquet)

# Summarise the result: frame shape first, then the dtype of every column
print("DataFrame shape after cleaning:", df_emails.shape)
print("DataFrame dtypes:", df_emails.dtypes, sep="\n")

DataFrame shape after cleaning: (517124, 10)
DataFrame dtypes:
Message-ID    object
Date          object
From          object
To            object
Subject       object
content       object
Name-From     object
Name-To       object
Name-cc       object
Name-bcc      object
dtype: object


In [13]:
# Take a random sample (fixed seed so the same rows are drawn on every run)
SAMPLE_SIZE = 100000
SAMPLE_SEED = 42
df_sample = df_emails.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

# Persist the sample for downstream use
df_sample.to_parquet("enron_sample.parquet")