In [None]:
import mailbox  # read the mbox file
import re  # pattern matching
from datetime import datetime  # date/time handling
from email.utils import parseaddr, parsedate_to_datetime  # RFC 5322 address and date parsing

import pandas as pd

# Function to parse the MBOX file and extract details
# Column order of the DataFrame returned by parse_mbox_with_date_components.
_EMAIL_COLUMNS = (
    "Date",
    "Sender",
    "Name",
    "Email",
    "Subject",
    "Day",
    "Month",
    "Year",
    "Weekday",
)


def _split_sender(sender):
    """Split a ``From`` header value into ``(display_name, address)``.

    Either element is ``"Unknown"`` when it cannot be determined.
    """
    name, address = parseaddr(sender)
    if "@" not in address and "<" not in sender:
        # No address syntax at all (e.g. the "Unknown" placeholder or a bare
        # display name): treat the whole text as the name, not as an address.
        name, address = sender.strip().strip('"'), ""
    return name or "Unknown", address or "Unknown"


def _date_components(date_str):
    """Return ``(day, month_name, year, weekday_name)`` for a ``Date`` header.

    The components are read from the timestamp as written in the header (no
    time-zone conversion). An unparseable value yields
    ``(None, None, None, "Unknown")``.
    """
    try:
        parsed = parsedate_to_datetime(date_str)
    except (TypeError, ValueError, IndexError, OverflowError):
        # Invalid or missing dates must not abort the whole mailbox parse.
        return None, None, None, "Unknown"
    return parsed.day, parsed.strftime("%B"), parsed.year, parsed.strftime("%A")


def parse_mbox_with_date_components(file_path):
    """Parse an MBOX file into a DataFrame with one row per message.

    Args:
        file_path: Path to an existing MBOX file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Date``, ``Sender``,
        ``Name``, ``Email``, ``Subject``, ``Day``, ``Month``, ``Year`` and
        ``Weekday``. Missing headers are reported as ``"Unknown"``; for an
        unparseable date ``Day``/``Month``/``Year`` are ``None`` and
        ``Weekday`` is ``"Unknown"``.

    Raises:
        mailbox.NoSuchMailboxError: If ``file_path`` does not exist.
    """
    # create=False: a mistyped path should fail loudly instead of silently
    # creating an empty mailbox file and returning an empty frame.
    mbox = mailbox.mbox(file_path, create=False)
    columns = {column: [] for column in _EMAIL_COLUMNS}

    try:
        for message in mbox:
            # Header values may be email.header.Header objects (e.g. when a
            # header contains raw 8-bit bytes); normalise them to plain str.
            date = str(message.get('Date', 'Unknown'))
            sender = str(message.get('From', 'Unknown'))
            subject = str(message.get('Subject', 'Unknown'))

            name, email = _split_sender(sender)
            day, month, year, weekday = _date_components(date)

            row = (date, sender, name, email, subject, day, month, year, weekday)
            for column, value in zip(_EMAIL_COLUMNS, row):
                columns[column].append(value)
    finally:
        # Release the underlying file handle even if parsing fails midway.
        mbox.close()

    return pd.DataFrame(columns)

# Path to the MBOX file (in the "Datasets" directory one level up).
# Raw strings keep the Windows backslashes literal instead of relying on
# unrecognised escape sequences, which newer Python versions warn about.
mbox_file_path = r"..\Datasets\All mail Including Spam and Trash"

# Parse the MBOX file
email_data = parse_mbox_with_date_components(mbox_file_path)

# Save the extracted data to a CSV file in the same "Datasets" directory
output_csv_path = r"..\Datasets\parced_email_data.csv"
email_data.to_csv(output_csv_path, index=False)

# Notify the user
print(f"Data saved to {output_csv_path}")


Data saved to ..\Datasets\All mail Including Spam and Trash.mbox
