In [None]:
from __future__ import print_function
import os.path

# decoding email data
import base64

# regular expression
import re

# using nan for data that cannot be decoded
import numpy as np

# parsing date 
from dateutil import parser

# removing html tags from decoded email
from bs4 import BeautifulSoup
from datetime import datetime

# required to deal with google stuff :)
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# OAuth scopes requested from the user; this one is the Gmail read-only scope.
# If modifying these scopes, delete the file token.json: main() reloads the
# credentials cached in that file, and they were issued for the scopes that
# were in effect when it was written.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

def get_emails(service, max_emails=1500, output_path='email_dataset.csv'):
    """Export inbox messages to a CSV dataset.

    Each exported row has the columns ``date, from, to, subject, body``.
    Rows are appended to ``output_path``; the header row is written only when
    the file is new or empty, so repeated runs do not duplicate it.

    Only messages whose top-level payload carries inline body data are
    exported. Messages without it (for example multipart messages, whose
    content lives in ``payload['parts']``) are skipped.

    Args:
        service: Gmail API service object, as returned by
            ``googleapiclient.discovery.build('gmail', 'v1', ...)``.
        max_emails: Upper bound on the number of inbox messages to fetch.
        output_path: Path of the CSV file to append to.

    Returns:
        None. Progress is reported on stdout.
    """
    # Function-scope import keeps this block self-contained; csv is stdlib.
    import csv

    # Each entry is a dict holding the message id and threadId.
    emails = _list_inbox_messages(service, max_emails)
    print("{} emails retrieved".format(len(emails)))

    # The file is opened in append mode, so only a new/empty file needs a header.
    write_header = (
        not os.path.exists(output_path) or os.path.getsize(output_path) == 0
    )

    # newline='' is what the csv module requires for files it writes to; the
    # context manager guarantees the file is closed even if an API call fails.
    with open(output_path, 'a', encoding='utf-8', newline='') as dataset_file:
        # csv.writer quotes fields containing commas, quotes or newlines,
        # which sender names, subjects and bodies routinely do.
        writer = csv.writer(dataset_file, lineterminator='\n')
        if write_header:
            writer.writerow(["date", "from", "to", "subject", "body"])

        # Retrieve every message and decode it.
        for email in emails:
            encoded_mail = service.users().messages().get(
                userId='me', id=email['id']).execute()

            payload = encoded_mail['payload']
            body = payload.get('body') or {}
            header_details = _extract_headers(payload.get('headers') or [])

            # Normalise the Date header; keep the raw value if it is unparseable.
            if header_details['date']:
                try:
                    header_details['date'] = str(parser.parse(header_details['date']))
                except (ValueError, OverflowError):
                    pass

            print(header_details)

            # 'data' holds the message content; some messages do not have it
            # at the top level and are skipped.
            if 'data' not in body:
                continue

            try:
                body_text = _decode_body(body['data'])
            except Exception as e:
                # Deliberate best-effort: one undecodable message must not
                # abort the export. Record nan for its body instead.
                print("Failed to decode message {}: {}".format(email['id'], e))
                body_text = np.nan

            writer.writerow([
                header_details['date'],
                header_details['from'],
                header_details['to'],
                header_details['subject'],
                body_text,
            ])

    print('Completed')


def _list_inbox_messages(service, limit):
    """Return up to ``limit`` inbox message stubs, following result pages."""
    page_size_cap = 500  # largest page size the messages.list endpoint accepts
    messages = []
    page_token = None
    while len(messages) < limit:
        response = service.users().messages().list(
            userId='me',
            labelIds=["INBOX"],
            includeSpamTrash=True,
            maxResults=min(limit - len(messages), page_size_cap),
            pageToken=page_token,
        ).execute()
        # The 'messages' key is absent when there are no matching messages.
        messages.extend(response.get('messages', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            break
    return messages[:limit]


def _extract_headers(headers):
    """Return the date/from/to/subject header values, '' for any that are absent."""
    details = dict.fromkeys(("date", "from", "to", "subject"), "")
    for header in headers:
        name = header['name'].lower()
        if name in details:
            details[name] = header['value']
    return details


def _decode_body(data):
    """Decode base64url body data, strip HTML tags and collapse whitespace."""
    decoded_body = base64.urlsafe_b64decode(data).decode()
    removed_html = BeautifulSoup(decoded_body, 'lxml').text
    # split() with no argument drops every run of whitespace, including
    # leading/trailing whitespace and non-printable separators.
    return " ".join(removed_html.split())
                

def main():
    """Authorise against the Gmail API and run the inbox export.

    Credentials cached in token.json are reused when present. If they are
    missing or invalid they are either refreshed (expired, with a refresh
    token) or obtained through the local-server OAuth flow driven by
    credentials.json, and then written back to token.json. The resulting
    Gmail service is handed to get_emails; an HttpError from the API is
    reported on stdout instead of propagating.
    """
    token_path = 'token.json'

    # token.json holds the user's access and refresh tokens; it is created
    # the first time the authorisation flow completes.
    creds = (
        Credentials.from_authorized_user_file(token_path, SCOPES)
        if os.path.exists(token_path)
        else None
    )

    needs_login = not creds or not creds.valid
    if needs_login:
        can_refresh = creds and creds.expired and creds.refresh_token
        if can_refresh:
            creds.refresh(Request())
        else:
            # No usable credentials: let the user log in interactively.
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)

        # Persist the credentials for the next run.
        with open(token_path, 'w') as token:
            token.write(creds.to_json())

    try:
        # Build the Gmail client and export the inbox.
        service = build('gmail', 'v1', credentials=creds)
        get_emails(service)
    except HttpError as error:
        print(f'An error occurred: {error}')


# Run the export only when this code is executed directly, not when imported.
if __name__ == '__main__':
    main()


In [None]:
pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

In [None]:
import email
import imaplib



In [None]:
[â€Œ, Itâ€™s, â€™ : ',  âœ”ï, âž¡ï, â‚¹30, ðŸ‘‰, ðŸ’«, ðŸ’«ðŸŽ‡, ðŸ‘‹]