### Data Processing for Model Training And Evaluation

In [7]:
# Imports Required libraries
import pandas as pd
import numpy as np
import os
import email
from email.parser import Parser
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# Sets the dataset directory path
dataset_dir = "../data/raw/enron_dataset_test"

['36.', '313.', '264.', '166.', '96.', '149.', '100.', '19.', '50.', '3.', '202.', '208.', '336.', '241.', '9.', '185.', '13.', '143.', '287.', '125.', '350.', '319.', '227.', '75.', '165.', '35.', '310.', '267.', '53.', '248.', '201.', '95.', '103.', '109.', '140.', '284.', '335.', '242.', '59.', '186.', '10.', '353.', '224.', '76.', '126.', '112.', '84.', '42.', '210.', '259.', '24.', '276.', '348.', '.DS_Store', '174.', '137.', '235.', '342.', '67.', '253.', '324.', '197.', '48.', '151.', '118.', '295.', '41.', '213.', '111.', '158.', '87.', '177.', '27.', '275.', '236.', '308.', '341.', '64.', '134.', '152.', '296.', '250.', '327.', '219.', '194.', '23.', '271.', '306.', '238.', '173.', '298.', '115.', '83.', '45.', '217.', '329.', '254.', '323.', '190.', '156.', '292.', '130.', '179.', '232.', '345.', '60.', '29.', '170.', '139.', '20.', '69.', '272.', '305.', '46.', '199.', '214.', '116.', '80.', '155.', '291.', '257.', '320.', '193.', '231.', '346.', '278.', '63.', '133.', '91.'

In [14]:
def parse_email(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
            msg = Parser().parsestr(content)

            # Extract body
            body = []
            if msg.is_multipart():
                for part in msg.walk():
                    if part.get_content_type() == 'text/plain':
                        decoded_part = part.get_payload(decode=True)
                        if decoded_part:
                            decoded_text = decoded_part.decode('utf-8', errors='ignore')
                            body.append(decoded_text)
            else:
                payload = msg.get_payload(decode=True)
                if payload:
                    body.append(payload.decode('utf-8', errors='ignore'))
            
            # Extract recipients
            recipients = []
            for field in ['to', 'cc', 'bcc']:
                field_value = msg.get(field)
                if field_value:
                    # Split and clean recipient addresses
                    field_recipients = [r.strip() for r in field_value.split(',')]
                    recipients.extend(field_recipients)
            
            # Extract headers
            headers = {
                header.lower(): value 
                for header, value in msg.items()
            }

            return {
                'message_id': msg.get('message-id', ''),
                'subject': msg.get('subject', ''),
                'body': '\n'.join(body) if body else '',
                'sender': msg.get('from', ''),
                'date': msg.get('date', ''),
                'recipients': recipients,
                'headers': headers
                
            }
    except Exception as e:
        print(f"Error parsing {file_path}: {str(e)}")
        return None

In [39]:
def process_emails(directory):
    emails = []
    total_files = len(os.listdir(directory))

    print(f"Starting to process {total_files} emails")

    for i, filename in enumerate(os.listdir(directory), 1):
        file_path = os.path.join(directory, filename)
        email_data = parse_email(file_path)

        if email_data:
            emails.append(email_data)

        if i % 100 == 0:
            print(f"Processed {i}/{total_files} emails...")

    print(f"Successfully processed {len(emails)} email(s)")
    return pd.DataFrame(emails)

df = process_emails(dataset_dir)

df.to_pickle('../data/processed/processed_emails.pkl')
print("\nData saved to '../data/processed/processed_emails.pkl'")

df[['message_id', 'subject', 'sender', 'date']].head(10)

Starting to process 343 emails
Processed 100/343 emails...
Processed 200/343 emails...
Processed 300/343 emails...
Successfully processed 343 email(s)

Data saved to '../data/processed/processed_emails.pkl'


Unnamed: 0,message_id,subject,sender,date
0,<33004025.1075853131914.JavaMail.evans@thyme>,RE: Subsidiary Equity or Phantom Equity,nancy.corbet@enron.com,"Thu, 11 Oct 2001 08:59:26 -0700 (PDT)"
1,<5434593.1075862052937.JavaMail.evans@thyme>,RE: A few questions about the scripts,rick.johnson@enron.com,"Mon, 26 Nov 2001 09:07:58 -0800 (PST)"
2,<637032.1075862051693.JavaMail.evans@thyme>,WARN issues,fmackin@aol.com,"Sun, 18 Nov 2001 10:27:39 -0800 (PST)"
3,<2906173.1075855361040.JavaMail.evans@thyme>,RE: Employee Trust Documentation,david.oxley@enron.com,"Thu, 13 Dec 2001 06:04:55 -0800 (PST)"
4,<33527112.1075853133594.JavaMail.evans@thyme>,New Agreement,ann.hill@enron.com,"Mon, 29 Oct 2001 18:05:15 -0800 (PST)"
5,<9945897.1075855360639.JavaMail.evans@thyme>,Rescind Offers/Termination Letters,frank.de@enron.com,"Fri, 7 Dec 2001 08:45:27 -0800 (PST)"
6,<2903650.1075853133701.JavaMail.evans@thyme>,Document1,jpack1@columbus.rr.com,"Sun, 19 Aug 2001 19:34:26 -0700 (PDT)"
7,<1001303.1075853131482.JavaMail.evans@thyme>,RE: lake worth project,mike.indivero@nepco.com,"Fri, 5 Oct 2001 08:53:48 -0700 (PDT)"
8,<23276038.1075853132477.JavaMail.evans@thyme>,Management Committee Presentation Draft,andrea.yowman@enron.com,"Thu, 18 Oct 2001 17:53:58 -0700 (PDT)"
9,<31284243.1075853131083.JavaMail.evans@thyme>,FW: Arthur Anderson LLP,george.wasaff@enron.com,"Fri, 7 Sep 2001 07:34:47 -0700 (PDT)"
