In [2]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw email dump from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="enron_emails.csv")

In [5]:
from email import parser

def parse_email(raw_email):
    """Parse a raw email message and extract structured features.

    Args:
        raw_email: The full message text (headers, blank line, body) as a str.

    Returns:
        dict: One entry per extracted header (empty string when the header is
        absent), followed by a 'body' entry. 'body' is always a string:
        for a single-part message it is the payload as parsed; for a
        multipart message it is the text/plain leaf parts joined with
        newlines (non-text parts such as HTML or attachments are left out).
    """
    email_msg = parser.Parser().parsestr(raw_email)

    # Output key -> header name, in the order the columns should appear.
    header_fields = (
        ('message_id', 'Message-ID'),
        ('date', 'Date'),
        ('from', 'From'),
        ('to', 'To'),
        ('subject', 'Subject'),
        ('mime_version', 'Mime-Version'),
        ('content_type', 'Content-Type'),
        ('content_transfer_encoding', 'Content-Transfer-Encoding'),
        ('x_from', 'X-From'),
        ('x_to', 'X-To'),
        ('x_cc', 'X-cc'),
        ('x_bcc', 'X-bcc'),
        ('x_folder', 'X-Folder'),
        ('x_origin', 'X-Origin'),
        ('x_filename', 'X-FileName'),
    )
    parsed_data = {key: email_msg.get(header, '') for key, header in header_fields}

    if email_msg.is_multipart():
        # get_payload() on a multipart message returns a list of Message
        # objects rather than text, which would break any downstream string
        # handling of 'body'. Collect the plain-text leaves instead.
        text_parts = [
            part.get_payload()
            for part in email_msg.walk()
            if not part.is_multipart() and part.get_content_type() == 'text/plain'
        ]
        parsed_data['body'] = '\n'.join(text_parts)
    else:
        parsed_data['body'] = email_msg.get_payload()

    return parsed_data

In [6]:
# Sanity-check the parser on one message (row label 11) and show each
# extracted field, cutting string values off at 100 characters.
test_parsed = parse_email(df['message'][11])
print("Extracted features:")
for key, value in test_parsed.items():
    # Slicing a string shorter than the limit returns it unchanged, so no
    # separate length check is needed.
    preview = value[:100] if isinstance(value, str) else value
    print(f"{key}: {preview}")

Extracted features:
message_id: <25459584.1075855687536.JavaMail.evans@thyme>
date: Fri, 13 Oct 2000 06:45:00 -0700 (PDT)
from: phillip.allen@enron.com
to: stagecoachmama@hotmail.com
subject: 
mime_version: 1.0
content_type: text/plain; charset=us-ascii
content_transfer_encoding: 7bit
x_from: Phillip K Allen
x_to: stagecoachmama@hotmail.com
x_cc: 
x_bcc: 
x_folder: \Phillip_Allen_Dec2000\Notes Folders\'sent mail
x_origin: Allen-P
x_filename: pallen.nsf
body: Lucy,

 Here are the rentrolls:



 Open them and save in the rentroll folder.  Follow these steps s


In [7]:
# Run the parser over every raw message, then expand the resulting dicts
# into one column per extracted field.
print("Parsing all emails...")
parsed_emails = df['message'].apply(parse_email)
df_structured = pd.DataFrame(list(parsed_emails))

# Report how the frame changed shape and which columns were produced.
print(
    f"Original shape: {df.shape}",
    f"Structured shape: {df_structured.shape}",
    f"\nColumns: {list(df_structured.columns)}",
    sep="\n",
)
df_structured.head()

Parsing all emails...
Original shape: (517401, 2)
Structured shape: (517401, 16)

Columns: ['message_id', 'date', 'from', 'to', 'subject', 'mime_version', 'content_type', 'content_transfer_encoding', 'x_from', 'x_to', 'x_cc', 'x_bcc', 'x_folder', 'x_origin', 'x_filename', 'body']


Unnamed: 0,message_id,date,from,to,subject,mime_version,content_type,content_transfer_encoding,x_from,x_to,x_cc,x_bcc,x_folder,x_origin,x_filename,body
0,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n
1,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...
2,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!
3,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s..."
4,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.


In [8]:
# Display the column labels of the parsed frame.
df_structured.columns

Index(['message_id', 'date', 'from', 'to', 'subject', 'mime_version',
       'content_type', 'content_transfer_encoding', 'x_from', 'x_to', 'x_cc',
       'x_bcc', 'x_folder', 'x_origin', 'x_filename', 'body'],
      dtype='object')

In [9]:
# Dump the untruncated body of the first 100 parsed emails, each preceded
# by an 80-character banner carrying its position in the slice.
for i, body in enumerate(df_structured['body'].iloc[:100]):
    print(
        f"\n{'='*80}",
        f"EMAIL #{i}",
        f"{'='*80}",
        body,
        sep="\n",
    )


EMAIL #0
Here is our forecast

 

EMAIL #1
Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.

As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.  

My suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.


EMAIL #2
test successful.  way to go!!!

EMAIL #3
Randy,

 Can you send me a schedule of the salary and level of everyone in the 
scheduling group.  Plus your thoughts 