# Reading an mbox file (downloaded from Google)

The `MboxReader` class below is taken from this Stack Overflow answer: https://stackoverflow.com/questions/59681461/read-a-big-mbox-file-with-python/59682472#59682472

In [4]:
import pandas as pd
import email
from email.policy import default
from tqdm import tqdm
from bs4 import BeautifulSoup #to clean the payload

In [None]:
!pip install beautifulsoup4
!pip install pandas
!pip install tqdm

In [5]:


class MboxReader:
    """Read the messages of an mbox file one at a time.

    The file is consumed line by line, so it never has to fit in memory.
    Every line that starts with ``b'From '`` is treated as the separator that
    begins a new message; the lines in between are parsed with
    ``email.message_from_bytes`` using the ``default`` policy.

    The reader is its own iterator and a context manager::

        with MboxReader(path) as mbox:
            for message in mbox:
                ...
    """

    def __init__(self, filename):
        """Open ``filename`` and consume its leading ``From `` separator line.

        Args:
            filename: Path of the mbox file to read.

        Raises:
            AssertionError: If the first line does not start with ``b'From '``.
        """
        self.handle = open(filename, 'rb')
        # Set once end-of-file has been reached so later calls stop cleanly.
        self._exhausted = False
        try:
            # Raised explicitly instead of via ``assert`` so the check still
            # runs under ``python -O``; the exception type stays the same.
            if not self.handle.readline().startswith(b'From '):
                raise AssertionError(
                    "not an mbox file: first line does not start with 'From '"
                )
        except BaseException:
            # Do not leak the open handle when construction fails.
            self.handle.close()
            raise

    def __enter__(self):
        """Return the reader itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Close the underlying file handle when the ``with`` block ends."""
        self.handle.close()

    def __iter__(self):
        """Return the reader itself; messages are produced by ``__next__``."""
        return self

    def __next__(self):
        """Return the next message in the file.

        Lines are collected until the next ``From `` separator or end-of-file.
        The collected bytes are then parsed into a message object.

        Raises:
            StopIteration: When the previous call already reached end-of-file.
        """
        if self._exhausted:
            raise StopIteration
        lines = []
        while True:
            line = self.handle.readline()
            if line == b'':
                # End of file: hand out what was collected, then stop next time.
                self._exhausted = True
                break
            if line.startswith(b'From '):
                # Separator of the following message; the current one is complete.
                break
            lines.append(line)
        return email.message_from_bytes(b''.join(lines), policy=default)

Header keys that can appear on a message (as listed by `message.keys()`):

```python
['X-GM-THRID', 'X-Gmail-Labels', 'Delivered-To', 'Received', 'X-Received',
 'ARC-Seal', 'ARC-Message-Signature', 'ARC-Authentication-Results',
 'Return-Path', 'Received', 'Received-SPF', 'Authentication-Results',
 'DKIM-Signature', 'X-Google-DKIM-Signature', 'X-Gm-Message-State',
 'X-Google-Smtp-Source', 'MIME-Version', 'X-Received', 'Date', 'Reply-To',
 'X-Google-Id', 'Precedence', 'List-Unsubscribe', 'Feedback-ID', 'List-Id',
 'X-Notifications', 'X-Notifications-Bounce-Info', 'Message-ID', 'Subject',
 'From', 'To', 'Content-Type']
```

In [6]:
# Header names seen on a message, in the order they were listed.
# Repeated names ('Received', 'X-Received') are kept on purpose.
possible_keys = [
    'X-GM-THRID',
    'X-Gmail-Labels',
    'Delivered-To',
    'Received',
    'X-Received',
    'ARC-Seal',
    'ARC-Message-Signature',
    'ARC-Authentication-Results',
    'Return-Path',
    'Received',
    'Received-SPF',
    'Authentication-Results',
    'DKIM-Signature',
    'X-Google-DKIM-Signature',
    'X-Gm-Message-State',
    'X-Google-Smtp-Source',
    'MIME-Version',
    'X-Received',
    'Date',
    'Reply-To',
    'X-Google-Id',
    'Precedence',
    'List-Unsubscribe',
    'Feedback-ID',
    'List-Id',
    'X-Notifications',
    'X-Notifications-Bounce-Info',
    'Message-ID',
    'Subject',
    'From',
    'To',
    'Content-Type',
]

In [7]:
path = r".\All mail Including Spam and Trash.mbox"

# Stop after this many messages with a decodable payload have been collected.
MAX_EMAILS = 100
current_mails = 0

mail_from_arr, mail_date_arr, mail_body_arr = [], [], []

# The context manager closes the mbox file handle even if the loop raises
# or stops early at MAX_EMAILS.
with MboxReader(path) as mbox:
    for idx, message in tqdm(enumerate(mbox)):
        # print(message.keys())
        # A missing header comes back as None and is stored as the text "None".
        mail_from = str(message['From']).replace('"', '').replace('\n', '').strip()
        mail_date = str(message['Date']).replace('"', '').replace('\n', '').strip()
        # get_payload(decode=True) returns None for multipart messages, so
        # those (and messages with an empty payload) are skipped.
        payload = message.get_payload(decode=True)
        if not payload:
            continue
        current_mails += 1
        if current_mails > MAX_EMAILS:
            break
        soup = BeautifulSoup(payload, 'html.parser')
        # NOTE(review): newlines and tabs are removed without inserting a space
        # and '\r' is left in place — confirm this is the intended cleaning.
        body_text = soup.get_text().replace('"', '').replace("\n", "").replace("\t", "").strip()
        mail_from_arr.append(mail_from)
        mail_date_arr.append(mail_date)
        mail_body_arr.append(body_text)

# Build the concatenated text once instead of growing a string inside the
# loop; every body is followed by a single space, as before.
all_mail_contents = "".join(body + " " for body in mail_body_arr)

  soup = BeautifulSoup(payload, 'html.parser')
560it [00:04, 120.06it/s]


In [8]:
# One row per kept message; the frame is pickled so it can be reloaded later.
mail_columns = {
    'From': mail_from_arr,
    'Date': mail_date_arr,
    'Body': mail_body_arr,
}
df = pd.DataFrame(mail_columns)
df.to_pickle("df_mail.pkl")


In [9]:
# Save the concatenated mail bodies as a UTF-8 text file
with open("all_mail_contents.txt", mode="w", encoding="utf-8") as f:
    f.write(all_mail_contents)

In [10]:
# Read the concatenated mail bodies back from the text file
with open("all_mail_contents.txt", encoding="utf-8") as f:
    all_mail_contents = f.read()

In [11]:
# Show which columns the DataFrame has.
df.columns

Index(['From', 'Date', 'Body'], dtype='object')

In [12]:
# Plain Python list holding the "Body" column values, one entry per row.
body_column = df["Body"]
texts = body_column.to_list()


In [13]:
# Preview the first 20 rows of the DataFrame.
# NOTE(review): the rendered output shows real sender names and addresses —
# consider clearing it before sharing the notebook.
df.head(20)

Unnamed: 0,From,Date,Body
0,suh@eie.noSusanna Hjelset / EIE Frogner & Aker...,"Wed, 06 Mar 2024 21:52:55 +0100",Hei Eivind Kjosbakken.Takk for din interesse f...
1,noreply@skatteetaten.no,"Mon, 11 Mar 2024 12:32:48 +0100",Skattemeldingen din er nå tilgjengelig. Logg i...
2,DNB <noreply@info.dnb.no>,"Tue, 27 Feb 2024 11:35:35 +0100",DNBFå 20 - 30 % rabatt på kjente merkevarer\r ...
3,Digipost <no-reply@digipost.no>,"Sat, 03 Feb 2024 10:04:05 +0000","Hei, Eivind\r\rDu har fått en ny melding i Dig..."
4,DNB <noreply@info.dnb.no>,"Mon, 11 Mar 2024 21:43:10 +0100",DNBBli oppdatert på ett minutt. Se siste marke...
5,no-reply@ntnu.no,"Thu, 04 Jan 2024 12:08:09 +0100",Dette er et automatisk varsel om at vi nå har ...
6,Gjensidige <gjensidige.privat@gjensidige.no>,"Thu, 25 Jan 2024 19:12:43 +0100",Gjensidige Nytt år og nye tips og råd fra oss ...
7,DNB <noreply@info.dnb.no>,"Tue, 12 Mar 2024 10:56:52 +0100",DNBFå 20 - 30 % rabatt på kjente merkevarer\r ...
8,DNB <noreply@info.dnb.no>,"Sun, 07 Jan 2024 13:00:11 +0100","DNBDu som er så god til å spare, bør ta en tit..."
9,Microsoft Learn <Learn@mails.microsoft.com>,"Tue, 13 Feb 2024 12:44:25 -0800",Skilling-XLifecycle-ModernCredentialsProjectSp...


In [38]:
# Display one extracted body in full to see what the cleaning step leaves in it.
texts[9]

'Skilling-XLifecycle-ModernCredentialsProjectSpike-1-A-EM-ProEarn your Microsoft Applied Skills credential to demonstrate your proficiency.\r͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u200c \xa0 ͏ \u20