### Imports

In [1]:
import pandas as pd
import datetime as dt
from dateutil import parser as date_parser
import os
import sys

import json
import imaplib
import email
import mimetypes

print(sys.version)
print(sys.getdefaultencoding())

3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]
utf-8


### Functions

In [2]:
def auth(path_credentials = "F:/Data/DataBases/creds.json"):
    '''
    
    Automatisches Anmelden an einer Imap-Emailinbox.
    Input:   Pfad mit den Credentials (json-File)
    Output:  Connection Objekt für weitere Abfragen an die Mailbox
    
    '''
    import json
    import imaplib
    
    pcreds = path_credentials    
    with open(pcreds, encoding="utf-8-sig") as f:
        creds = json.load(f)
    ID, SERVER, EMAIL, PASSCODE = creds["ID"], creds["host"], creds["users"], creds["pw"]
    con = imaplib.IMAP4_SSL(SERVER)
    con.login(EMAIL, PASSCODE)
    print(f"Mailbox mit ID {ID} connected.")
    return con

In [3]:
def get_mail_ids(con, filter='ALL'):
    '''
    Auslesen der Email-IDs eines vorab selektieren Postfaches 
    Input:   Connection Objekt (default="mail"), Filterdefinition (default="All")
    Output:  Liste mit Email-IDS: mail_ids
    ''' 
    status, data = con.search(None, filter)
    mail_ids = []
    for block in data:
        mail_ids += block.split()
    print(f"{len(mail_ids)} Mail IDs ausgelesen")
    return mail_ids

In [4]:
def get_emails(con, mail_ids, ):                  # get_emails(result_bytes)
    '''
    Auslesen der Emails (Messages) anhand der Email-IDs eines vorab selektieren Postfaches 
    Input:   Connection Objekt (default="mail"), Liste der Email-IDs
    Output:  Liste mit Emails (Messages)
    '''    
    messages = []
    for mid in mail_ids:
        typ, data = mail.fetch(mid, '(RFC822)')
        messages.append(data)
    print(f"{len(mail_ids)} Emails ausgelesen")
    return messages

In [5]:
def search(key,value,con):
    '''
    Abfrage der selektierten Email-Postfachs mit Suchbegriffe / Selektion
    Input:   key: Suchbegriff , Value: Selektionsbegriff, con: Connection-Objekt
    Output:  Selektierte Mail-IDs
    '''
    result, data  = con.search(None, key, '"{}"'.format(value))
    return data

In [6]:
def get_header(msg):
    '''
    Extrahiert den Email-Header einer Email-Message (email-message Objekt).
    # funktioniert noch nicht: Und ruft Decoding (utf-8) und Cleaning Funktion auf
    Input:   email-message Objekt
    Output:  Dekodierter und gecleanter Email-Body
    '''
    header_dict = {}
    header_dict["to"] = msg["to"]
    header_dict["from"] = msg["from"]
    header_dict["cc"] = msg["cc"]
    header_dict["bcc"] = msg["bcc"]
    header_dict["sub"] = msg["subject"]
    header_dict["date_raw"] = msg["date"]
    header_dict["date"] =  date_parser.parse(header_dict['date_raw']).strftime ("%d-%m-%Y")
    header_dict["time"] =  date_parser.parse(header_dict['date_raw']).strftime ("%H:%M:%S")
        
#   FUNKTIONIERT NICHT  header_dict = {k: v.decode("utf-8") for k,v in header_dict.items()}
    return header_dict    

In [7]:
def decodeAndclean(raw):
    '''
    Funktionsaufruf durch andere Funktion (z.B. get_body).
    Decoding (utf-8) und Cleaning Funktion auf dem übergebenen Email-Text
    Input:   email-message Objekt
    Output:  Dekodierter und gecleanter Email-Text
    '''
    rawDecoded = raw.decode()
    msgClean = rawDecoded.replace("\r\n", "<NL>")
    return msgClean
    

In [84]:
def get_attachments(msg, save2disk=True):
    '''
    Liest die Attachments einer zugeführten Raw-Email-Message (byte-Codierung) aus und speichert diese ab
    Input:   email-message Objekt
    Output:  Bei save2disk = True: wird jeder Anhang im <save_path_> Ordner abgespeichert
    '''
    
    import mimetypes
    email_message = msg
    attachments = []
    
    counter = 1
    for part in email_message.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        
        filename = part.get_filename()
        content_type = part.get_content_type()
        
        if not filename:
            ext = mimetypes.guess_extension(part.get_content_type())
            if 'text' in content_type:
                ext = '.txt'
            if not ext:
                ext = '.bin'
            
            elif 'html' in content_type:
                ext = '.html'
                
            filename = 'msg-part-%08d%s' %(counter, ext)
        counter += 1
        
        attachments.append(filename)
        print(filename)
        
        # Save File
        today = dt.date.today().strftime("%d%m%Y")
        filename = today+'__'+m_id.decode()+'__'+filename
        save_path = os.path.join(os.getcwd(), "attachment_downloads", today,) #date_
        
        if save2disk == False:
            return print("\n\n\nAttachments wurden NICHT abgespeichert"), attachments
        
        else:  
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            
            with open(os.path.join(save_path, filename), 'wb') as fp:
                fp.write(part.get_payload(decode=True))
                
            return print(f"{len(attachments)} Attachments in {save_path} abgespeichert"), attachments 
    

In [79]:
def get_body(msg):
    '''
    Extrahiert den Email-Body einer Email-Message (email-message Objekt).
    # Funktioniert nicht: Und ruft Decoding (utf-8) und Cleaning Funktion auf
    Input:   email-message Objekt
    Output:  Dekodierter und gecleanter Email-Body
    '''
    if msg.is_multipart():
        raw =  get_body(msg.get_payload(0, decode=False))
    else:
        raw =  msg.get_payload(None,True)
        
#     return decodeAndclean(raw)  # return cleaned body text
    return raw


### Execute

In [120]:
mail = auth()
folder = 'Mailtest'
mail.select(folder)
mail_ids = get_mail_ids(mail)
# mail.list()
# msgs = get_emails((get_mail_ids()))
print(mail_ids)
[mid.decode() for mid in mail_ids]



Mailbox mit ID 999 connected.
14 Mail IDs ausgelesen
[b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'10', b'11', b'12', b'13', b'14']


['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']

In [None]:
 header_dict["to"] = msg["to"]
    header_dict["from"] = msg["from"]
    header_dict["cc"] = msg["cc"]
    header_dict["bcc"] = msg["bcc"]
    header_dict["sub"] = msg["subject"]
    header_dict["date_raw"] = msg["date"]
    header_dict["date"] =  date_parser.parse(header_dict['date_raw']).strftime ("%d-%m-%Y")
    header_dict["time"] =  date_parser.parse(header_dict['date_raw']).strftime ("%H:%M:%S")

In [105]:
[ m_id, header["date"], header['time'], header["from"],
                                 header['to'], header['cc'], header['bcc'], header['sub'],]

[b'1',
 '30-03-2017',
 '16:57:37',
 '=?utf-8?Q?S=C3=B6nke_Mail?= <017662099018@o2online.de>',
 'Andreas Barth <barth.andreas@web.de>',
 None,
 None,
 'Re: Zweites Standbein?']

In [125]:
# mail = auth()
# folder = 'Mailtest'
# mail.select(folder)
# mail_ids = get_mail_ids(mail)

# TODAY = dt.date.today().strftime("%d%m%Y")
# SAVE_PATH = os.path.join(os.getcwd(), "attachment_downloads", TODAY,)

# cols="MID DATE TIME FROM TO CC BCC SUBJECT ".split(", ")
cols="MID DATE TIME FROM TO CC BCC SUBJECT BODY #FILES FILES".split()
MAILCONTAINER = pd.DataFrame(columns=cols)

for m_id in mail_ids:  
    result, data = mail.fetch(m_id,'(RFC822)')
    raw_msg = email.message_from_bytes(data[0][1])
    
    idx = m_id.decode()
    
    header = get_header(raw_msg)
    body = decodeAndclean(get_body(raw_msg)) #"DUMMY"
    # p, files = get_attachments(raw_msg, save2disk=False) 
    nbrFiles = 99 #sum([1 for file in files])
    files = []
    
    MAILCONTAINER.loc[idx, :] = [ m_id, header["date"], header['time'], header["from"],
                                 header['to'], header['cc'], header['bcc'], header['sub'],
                                 body, nbrFiles, files]

MAILCONTAINER

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf6 in position 40: invalid start byte

In [None]:
MAILCONTAINER

### Aufruf der einzelnen Email

In [41]:
result, data = mail.fetch(b'11','(RFC822)')
raw_msg = email.message_from_bytes(data[0][1])

#### Header auslesen

In [42]:
header = get_header(raw_msg)
[i for i in header.values()]

['"Andreas Barth" <barth.andreas@web.de>',
 '<barth@strategiepilot.de>',
 None,
 None,
 'Testmail mit Anhang jpeg',
 'Wed, 10 Jun 2020 12:24:40 +0200',
 '10-06-2020',
 '12:24:40']

#### Body auslesen

In [64]:
# get_body(raw_msg).decode()

print(raw_msg.get_charsets())

print()

decodeAndclean(get_body(raw_msg))

# get_body(raw_msg).decode()

[None, None, 'utf-8', 'utf-8', None, None]



'<NL><NL>Sent from on the go .... Please excuse brevity!<NL><NL>Anfang der weitergeleiteten Nachricht:<NL><NL>Von: "Lochmann, Heike" <lochmann@hs-albsig.de><NL>Datum: 10. Juni 2020 um 12:32:02 MESZ<NL>An: HS_Studierende_Albstadt <HS_Studierende_Albstadt@hs-albsig.de>, HS_Studierende_Sigmaringen <HS_Studierende_Sigmaringen@hs-albsig.de><NL>Kopie: rektorat <rektorat@hs-albsig.de>, "Leu, Sabine" <leu@hs-albsig.de>, "Sick, Christina" <sick@hs-albsig.de>, "Pfefferle, Hannah" <pfefferle@hs-albsig.de>, "Brandauer, Heidi" <brandauer@hs-albsig.de>, "Nell, Katrin" <nell@hs-albsig.de>, "Limbeck, Ulrike" <limbeck@hs-albsig.de><NL>Betreff: Gremienwahl 2020: Wahlbekanntmachung<NL><NL>\ufeff<NL>Sehr geehrte Damen und Herren,<NL><NL>bitte beachten Sie die beigefügte Wahlbekanntmachung.<NL>Bei Fragen dürfen Sie sich gerne melden.<NL><NL>Mit freundlichen Grüßen<NL><NL>Lochmann<NL>Wahlleitung<NL><NL>'

In [56]:
[mid.decode() for mid in mail_ids]

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']

In [44]:
os.getcwd()

'C:\\Users\\barth\\Data\\DataScienceUni\\60100'

In [83]:
for m_id in mail_ids:
    result, data = mail.fetch(m_id,'(RFC822)')
    raw_msg = email.message_from_bytes(data[0][1])
    print(get_header(raw_msg)['from'])
    get_attachments(raw_msg, save_attachm=True)  
    
    print('-'*50+"  Nächste Email:  "+"-"*50)

=?utf-8?Q?S=C3=B6nke_Mail?= <017662099018@o2online.de>
msg-part-00000001.txt
1 Attachments in C:\Users\barth\Data\DataScienceUni\60100\attachment_downloads\15062020 abgespeichert
--------------------------------------------------  Nächste Email:  --------------------------------------------------
Jon Banks <Jon@thepitchagency.com>
msg-part-00000001.txt
1 Attachments in C:\Users\barth\Data\DataScienceUni\60100\attachment_downloads\15062020 abgespeichert
--------------------------------------------------  Nächste Email:  --------------------------------------------------
Kim.Boestam@creata.com
msg-part-00000001.txt
1 Attachments in C:\Users\barth\Data\DataScienceUni\60100\attachment_downloads\15062020 abgespeichert
--------------------------------------------------  Nächste Email:  --------------------------------------------------
=?utf-8?B?R3LDvG5ld2FsZCwgQW5kcmVhcw==?= <ag@fivv.de>
msg-part-00000001.txt
1 Attachments in C:\Users\barth\Data\DataScienceUni\60100\attachment_downloads\150

Parkplatz

In [None]:
mail = auth()
folder = 'Mailtest'
mail.select(folder)

In [None]:
# tutorial:
# Day 30: Login with IMAP in Python Programming: https://youtu.be/Gql_NQv3ND4

mail = auth()
folder = 'Mailtest'
mail.select(folder)

result, data = mail.uid('search', None, 'All')
folder_item_list = data[0].split()
print(folder_item_list)


from bs4 import BeautifulSoup
import mimetypes

for item in folder_item_list: 
    
    res, email_data = mail.uid('fetch', item, '(RFC822)')
    
#     raw_email = email_data[0][1].decode('utf-8')
#     email_message = email.message_from_string(raw_email)
    raw_email = email_data[0][1]
    email_message = email.message_from_bytes(raw_email)
    
    
    to_  = email_message["To"]
    from_ = email_message["From"]
    subject_ = email_message["Subject"]
    date_ = date_parser.parse(email_message['date']).strftime ("%d-%m-%Y")
    print(f"Email vom {date_}, Betreff: {subject_}")
    
    counter = 1
    for part in email_message.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        
        filename = part.get_filename()
        content_type = part.get_content_type()
        
        if not filename:
            ext = mimetypes.guess_extension(part.get_content_type())
            if 'text' in content_type:
                ext = '.txt'
            if not ext:
                ext = '.bin'
            
            elif 'html' in content_type:
                ext = '.html'
                
            filename = 'msg-part-%08d%s' %(counter, ext)
        counter += 1
        print(filename)
        
    # Save File
    save_path = os.path.join(os.getcwd(), "emails", date_,)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    with open(os.path.join(save_path, filename), 'wb') as fp:
        fp.write(part.get_payload(decode=True))
    
 
    # print(subject_)
    print(content_type)    
    
    if "plain" in content_type:
#       print(part.get_payload())
        pass
    
    elif "html" in content_type:
        html_ = part.get_payload()
        soup = BeautifulSoup(html_, "html.parser")
        text = soup.get_text()
        # print(text)

    else:
        pass #print(content_type)





            
        
#     email_message.get_payload()

In [None]:
os.getcwd()

In [None]:
# tutorial:
# Day 30: Login with IMAP in Python Programming: https://youtu.be/Gql_NQv3ND4

mail = auth()
folder = 'Mailtest'
mail.select(folder)

result, data = mail.uid('search', None, 'All')
folder_item_list = data[0].split()
res, email_data = mail.uid('fetch', b'1', '(RFC822)')
raw_email = email_data[0][1].decode('utf-8')
email_message = email.message_from_string(raw_email)
email_message["from"]

In [None]:
msg = MIMEText("döner","utf-8")

print(msg)

In [None]:
from email.mime.text import MIMEText
txt = MIMEText(raw_msg["From"], 'utf-8', 'latin-1')
print(txt)
# msg = MIMEText("döner","utf-8")

In [None]:
import email

# we'll search using the ALL criteria to retrieve
# every message inside the inbox
# it will return with its status and a list of ids
status, data = mail.search(None, 'ALL')
# the list returned is a list of bytes separated
# by white spaces on this format: [b'1 2 3', b'4 5 6']
# so, to separate it first we create an empty list
mail_ids = []
# then we go through the list splitting its blocks
# of bytes and appending to the mail_ids list
for block in data:
    # the split function called without parameter
    # transforms the text or bytes into a list using
    # as separator the white spaces:
    # b'1 2 3'.split() => [b'1', b'2', b'3']
    mail_ids += block.split()

# now for every id we'll fetch the email
# to extract its content
for i in mail_ids:
    # the fetch function fetch the email given its id
    # and format that you want the message to be
    status, data = mail.fetch(i, '(RFC822)')

    # the content data at the '(RFC822)' format comes on
    # a list with a tuple with header, content, and the closing
    # byte b')'
    for response_part in data:
        # so if its a tuple...
        if isinstance(response_part, tuple):
            # we go for the content at its second element
            # skipping the header at the first and the closing
            # at the third
            message = email.message_from_bytes(response_part[1])
         
            # with the content we can extract the info about
            # who sent the message and its subject
            mail_from = message['from']
            mail_subject = message['subject']

            # then for the text we have a little more work to do
            # because it can be in plain text or multipart
            # if its not plain text we need to separate the message
            # from its annexes to get the text
            if message.is_multipart():
                mail_content = ''

                # on multipart we have the text message and
                # another things like annex, and html version
                # of the message, in that case we loop through
                # the email payload
                for part in message.get_payload():
                    # if the content type is text/plain
                    # we extract it
                    if part.get_content_type() == 'text/plain':
                        mail_content += part.get_payload()
            else:
                # if the message isn't multipart, just extract it
                mail_content = message.get_payload()

            # and then let's show its result
            print("#"*40+" NEXT EMAIL "+"#"*40)
            print(f'From: {mail_from}')
            print(f'Subject: {mail_subject}')
            print(f'Content: {mail_content}')
            