In [1]:
import mailbox
import os
from bs4 import BeautifulSoup

In [None]:
def initialize_output_folder(output_folder):
    os.makedirs(output_folder, exist_ok=True)
    print(f"Output folder created: {output_folder}")

In [None]:
def open_mbox_file(input_file):
    print("Opening the mbox file...")
    source_mbox = mailbox.mbox(input_file)
    print(f"Total emails in the mbox file: {len(source_mbox)}")
    return source_mbox

In [None]:
def process_email_body(message):
    try:
        if message.is_multipart():
            parts = []
            for part in message.walk():
                content_type = part.get_content_type()
                content_disposition = str(part.get("Content-Disposition"))
                if content_type == "text/plain" and "attachment" not in content_disposition:
                    parts.append(part.get_payload(decode=True).decode(errors="ignore"))
                elif content_type == "text/html" and "attachment" not in content_disposition:
                    html_body = part.get_payload(decode=True).decode(errors="ignore")
                    soup = BeautifulSoup(html_body, "html.parser")
                    parts.append(soup.get_text(separator="\n").strip())
            return "\n\n".join(parts).strip() if parts else "No Body Content"
        else:
            content_type = message.get_content_type()
            if content_type == "text/plain":
                return message.get_payload(decode=True).decode(errors="ignore").strip()
            elif content_type == "text/html":
                html_body = message.get_payload(decode=True).decode(errors="ignore")
                soup = BeautifulSoup(html_body, "html.parser")
                return soup.get_text(separator="\n").strip()
            else:
                return "Unsupported content type: " + content_type
    except Exception as e:
        print(f"Error processing email body: {e}")
        return "Failed to extract body content."


In [None]:
def save_email_to_file(email_file, subject, from_email, to_email, date, body):
    try:
        with open(email_file, "w", encoding="utf-8") as f:
            f.write(f"Subject: {subject}\n")
            f.write(f"From: {from_email}\n")
            f.write(f"To: {to_email}\n")
            f.write(f"Date: {date}\n\n")
            f.write(body)
        print(f"  Saved email to {email_file}")
    except Exception as e:
        print(f"Error saving email to file: {e}")

In [None]:
def process_and_save_emails(source_mbox, output_folder, top_n):
    email_count = 0
    for i, message in enumerate(source_mbox):
        if i >= top_n:
            break
        print(f"Processing email {i + 1}...")
        subject = message['subject'] or "No Subject"
        from_email = message['from'] or "Unknown Sender"
        to_email = message['to'] or "Unknown Recipient"
        date = message['date'] or "Unknown Date"
        print(f"  Subject: {subject}")
        print(f"  From: {from_email}")

        email_file = os.path.join(output_folder, f"email_{i + 1}.txt")
        body = process_email_body(message)
        save_email_to_file(email_file, subject, from_email, to_email, date, body)

        email_count += 1
        if email_count % 10 == 0 or email_count == top_n:
            print(f"Processed {email_count}/{top_n} emails.")

    print(f"Successfully processed and saved {email_count} emails.")

In [None]:
def save_emails_as_text(input_file, output_folder, top_n=100):
    try:
        print(f"Starting to process the mbox file: {input_file}")
        initialize_output_folder(output_folder)
        source_mbox = open_mbox_file(input_file)
        process_and_save_emails(source_mbox, output_folder, top_n)
    except Exception as e:
        print(f"An error occurred: {e}")

In [None]:
input_mbox = "All mail Including Spam and Trash.mbox"
output_folder = "data"
save_emails_as_text(input_mbox, output_folder, top_n=5000)

Starting to process the mbox file: All mail Including Spam and Trash.mbox
Output folder created: data
Opening the mbox file...
Total emails in the mbox file: 80072
Processing email 1...
  Subject: Gaurav Surtani Sent You a Gift Card for Airbnb
  From: "Amazon.com Gift Cards" <gc-orders@gc.email.amazon.com>
  Saved email to data\email_1.txt
Processing email 2...
  Subject: Everything on sale & 50% off Doorbusters!
  From: "Columbia" <email@e-mail.columbia.com>
  Saved email to data\email_2.txt
Processing email 3...
  Subject: Please read: Your dream home is waiting for you.
  From: SoFi <no-reply@r.sofi.com>
  Saved email to data\email_3.txt
Processing email 4...
  Subject: =?UTF-8?Q?The_future_of_finance_is_here:_D?=
 =?UTF-8?Q?on=E2=80=99t_miss_'Crypto_ki_Paathshala'?=
  From: The Economic Times <newsletter@economictimesnews.com>
  Saved email to data\email_4.txt
Processing email 5...
  Subject: =?utf-8?B?VGhlcmUgd29u4oCZdCBiZSBhIGJldHRlciBkZWFsIOKAlCBvbmx5?=
	=?utf-8?B?ICQxLjM5L21vbn