In [1]:
import os
import imaplib
import email
from email import policy
import pandas as pd
from bs4 import BeautifulSoup
import time

# ---------------------------
# Email Configuration
# ---------------------------
EMAIL_USER = 'salacjamesrhode23@gmail.com'
EMAIL_PASSWORD = os.getenv('RECIPIENT_EMAIL_APP_PASSWORD')
IMAP_URL = 'imap.gmail.com'
BATCH_SIZE = 5  # Number of emails to process per batch
BATCH_DELAY = 1   # Delay in seconds between batches

# ---------------------------
# Helper Functions
# ---------------------------
def connect_to_mailbox(user, password, imap_url='imap.gmail.com'):
    """Connect to the Gmail inbox via IMAP and return the mailbox object."""
    mail = imaplib.IMAP4_SSL(imap_url)
    mail.login(user, password)
    mail.select('Inbox')
    return mail

def fetch_email_ids(mail, subject_filter):
    """Fetch all email IDs filtered by subject."""
    status, data = mail.search(None, 'SUBJECT', subject_filter)
    return data[0].split()[:10]

def get_email_body(message):
    """Extract the plain text or HTML body from an email message."""
    if message.is_multipart():
        for part in message.iter_parts():
            content_type = part.get_content_type()
            if content_type in ["text/plain", "text/html"]:
                return part.get_content()
    else:
        return message.get_content()

def parse_order_email(body):
    """Parse an order confirmation email and return extracted data."""
    soup = BeautifulSoup(body, 'html.parser')

    # Extract common order information
    customer = soup.find(text="Customer:").parent.next_sibling.strip()
    order_date = soup.find(text="Order Date:").parent.next_sibling.strip()
    total_amount = soup.find_all('tr')[-1].find_all('td')[-1].text.strip()

    # Extract payment details
    payment_method = soup.find(text="Payment Method:").parent.next_sibling.strip()
    payment_ref = soup.find(text="Payment Reference:").parent.next_sibling.strip()
    payment_date = soup.find(text="Payment Date:").parent.next_sibling.strip()

    # Extract line items
    line_items = []
    for tr in soup.find_all('tr')[1:-1]:  # Skip header and total row
        tds = [td.text.strip() for td in tr.find_all('td')]
        line_items.append({
            'customer': customer,
            'product': tds[0],
            'sku': tds[1],
            'qty': tds[2],
            'price': tds[3],
            'line_total': tds[4],
            'total_amount': total_amount,
            'payment_method': payment_method,
            'payment_reference': payment_ref,
            'order_date': order_date,
            'payment_date': payment_date
        })
    return line_items

# ---------------------------
# Main Script
# ---------------------------
# Connect to mailbox
mailbox = connect_to_mailbox(EMAIL_USER, EMAIL_PASSWORD)

# Fetch all email IDs matching the subject
subject_filter = '"[demo-store] Order Confirmation for"'
email_ids = fetch_email_ids(mailbox, subject_filter)

# Process emails in batches
all_orders = []

for i in range(0, len(email_ids), BATCH_SIZE):
    batch_ids = email_ids[i:i + BATCH_SIZE]
    print(f"Processing batch {i // BATCH_SIZE + 1} ({len(batch_ids)} emails)...")

    for email_id in batch_ids:
        status, data = mailbox.fetch(email_id, '(RFC822)')
        for _, raw_msg in (part for part in data if isinstance(part, tuple)):
            msg = email.message_from_bytes(raw_msg, policy=policy.default)
            body = get_email_body(msg)
            all_orders.extend(parse_order_email(body))
    
    time.sleep(BATCH_DELAY)  # Pause between batches

# Convert collected data to DataFrame
orders_df = pd.DataFrame(all_orders)
len(orders_df)

Processing batch 1 (5 emails)...


  customer = soup.find(text="Customer:").parent.next_sibling.strip()
  order_date = soup.find(text="Order Date:").parent.next_sibling.strip()
  payment_method = soup.find(text="Payment Method:").parent.next_sibling.strip()
  payment_ref = soup.find(text="Payment Reference:").parent.next_sibling.strip()
  payment_date = soup.find(text="Payment Date:").parent.next_sibling.strip()


Processing batch 2 (5 emails)...


32

In [2]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
import pandas as pd
import imaplib
import email
import os
from bs4 import BeautifulSoup

In [None]:
# Configure email credentials
email_user = 'salacjamesrhode23@gmail.com'
email_password = os.getenv('RECIPIENT_EMAIL_APP_PASSWORD')

# URL for imap connection
imap_url = 'imap.gmail.com'

# Connection with gmail using SSL
my_mail = imaplib.IMAP4_SSL(imap_url)

# Login using the email credentials
my_mail.login(email_user, email_password)

# Select the Inbox to fetch message
my_mail.select('Inbox')

# Define key and value for email search
key = 'SUBJECT'
value = '"[demo-store] Order Confirmation for"'
_, data = my_mail.search(None, key, value)

mail_id_list = data[0].split()[:5]

In [None]:
import email
from email import policy

emails = []  # this will store parsed emails

def get_email_body(message):
    if message.is_multipart():
        for part in message.iter_parts():
            content_type = part.get_content_type()
            if content_type == "text/plain":
                return part.get_content()
            elif content_type == "text/html":
                return part.get_content()
    else:
        return message.get_content()




In [None]:
rows = []

for num in mail_id_list:
    typ, data = my_mail.fetch(num, '(RFC822)')
    
    for _, raw_msg in (part for part in data if isinstance(part, tuple)):
        msg = email.message_from_bytes(raw_msg, policy=policy.default)
        body = get_email_body(msg)

        # --- Parse HTML and extract data ---
        soup = BeautifulSoup(body, 'html.parser')

        # Common order information
        customer = soup.find(text="Customer:").parent.next_sibling.strip()
        order_date = soup.find(text="Order Date:").parent.next_sibling.strip()
        total_amount = soup.find_all('tr')[-1].find_all('td')[-1].text.strip()

        # Payment details
        payment_method = soup.find(text="Payment Method:").parent.next_sibling.strip()
        payment_ref = soup.find(text="Payment Reference:").parent.next_sibling.strip()
        payment_date = soup.find(text="Payment Date:").parent.next_sibling.strip()

        # Line items
        for tr in soup.find_all('tr')[1:-1]:  # Skip header and total row
            tds = [td.text.strip() for td in tr.find_all('td')]
            rows.append({
                'customer': customer,
                'product': tds[0],
                'sku': tds[1],
                'qty': tds[2],
                'price': tds[3],
                'line_total': tds[4],
                'total_amount': total_amount,
                'payment_method': payment_method,
                'payment_reference': payment_ref,
                'order_date': order_date,
                'payment_date': payment_date
            })


In [None]:
orders_df = pd.DataFrame(rows)
orders_df.head()