In [2]:
import os
from dotenv import load_dotenv
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
load_dotenv()

# SCOPES: Gmail read-only
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
token_path = os.getenv("token_path")
credentials_path = os.getenv("credentials_path")

In [3]:
def authenticate_gmail():
    creds = None
    flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
    creds = flow.run_local_server(port=0)
    service = build('gmail', 'v1', credentials=creds)
    return service

#service = authenticate_gmail()

In [4]:
def get_message_metadata(service, user_id='me', max_results=5):
    results = service.users().messages().list(userId=user_id, maxResults=max_results, q="after:2025/08/08").execute()
    messages = results.get('messages', [])

    for _, msg in enumerate(messages):
        msg_data = service.users().messages().get(userId=user_id, id=msg['id'], format='metadata', metadataHeaders=['Subject', 'From', 'Date']).execute()
        headers = msg_data.get('payload', {}).get('headers', [])

        email_info = {header['name']: header['value'] for header in headers if header['name'] in ['Subject', 'From', 'Date']}
        print(f"From: {email_info.get('From')}")
        print(f"Subject: {email_info.get('Subject')}")
        print(f"Date: {email_info.get('Date')}")

In [5]:
def get_full_messages(service, message_ids):
    messages = []
    for msg in message_ids:
        message = service.users().messages().get(userId="me", id=msg['id']).execute()
        messages.append(message)
    return messages
x = get_full_messages(service, messages)

NameError: name 'service' is not defined

In [7]:
import time, threading
from datetime import  datetime, timedelta

def last_7_days_generator(start_date: datetime.date):
    for i in range(1, 8):
        yield start_date - timedelta(days=i)

def get_ids(i, gen_1, gen_2):
    start_time = time.time()
    creds = Credentials.from_authorized_user_file(token_path, SCOPES)
    service = build('gmail', 'v1', credentials=creds)
    results = service.users().messages().list(userId='me', q=f"after:{next(gen_2)} before:{next(gen_1)}").execute()
    #time.sleep(0.3)
    messages = results.get('messages', [])
    print(f"[Thread-{i}]. Time taken: {time.time() - start_time:.4f} sec")
    print(len(messages))
    
fetch_threads = []
x = datetime.now().date()  # just date
y = x - timedelta(days=1)
gen_1 =  last_7_days_generator(x)
gen_2 =  last_7_days_generator(y)
for i in range(7):
    t = threading.Thread(target=get_ids, args=(i+1, gen_1, gen_2))
    t.start()
    fetch_threads.append(t)

for t in fetch_threads:
    t.join()

print("All threads finished.")       

[Thread-4]. Time taken: 1.6214 sec[Thread-2]. Time taken: 1.6235 sec
35
[Thread-5]. Time taken: 1.6214 sec
30

34
[Thread-7]. Time taken: 1.6204 sec
38
[Thread-1]. Time taken: 1.6235 sec
18
[Thread-3]. Time taken: 1.6224 sec
23
[Thread-6]. Time taken: 1.6563 sec
23
All threads finished.


In [24]:
import queue
def create_dates() -> list[tuple]:
    today = datetime.now().date()
    ranges = []
    for i in range(1, 8):
        after_date = today - timedelta(days=i)
        before_date = today - timedelta(days=i - 1)
        ranges.append((after_date.strftime("%Y/%m/%d"), before_date.strftime("%Y/%m/%d")))
    return ranges


def get_ids(i: int, q: queue.Queue, token_path: str, after: str, before: str) -> None: 
    start_time = time.time()
    SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
    creds = Credentials.from_authorized_user_file(token_path, SCOPES)
    service = build('gmail', 'v1', credentials=creds)
    results = service.users().messages().list(userId='me', q=f"after:{after} before:{before}").execute()
    #time.sleep(0.2)
    messages = results.get('messages', [])
    print(f"[Thread-{i}] >> Time taken: {time.time() - start_time:.4f} sec.")
    print(len(messages))
    q.put(messages)

def threaded_get_ids() -> list[dict]:    
    dates =  create_dates()
    threads_2=[]
    q = queue.Queue()   
    for i, (after, before) in enumerate(dates, start=1):
            t = threading.Thread(target=get_ids, args=(i, q, token_path, after, before))
            threads_2.append(t)
            t.start()
            time.sleep(0.1)
            
    for t in threads_2:
        t.join()
    print("All threads finished.")
    
    return [q.get() for _ in range(q.qsize())]   
#l = threaded_get_ids()

In [22]:
from functools import partial
batch_size = 7          #Get better
dates = create_dates() 

def get_ids_pool(q: queue.Queue, token_path: str, date: tuple) -> None: 
    start_time = time.time()
    SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
    creds = Credentials.from_authorized_user_file(token_path, SCOPES)
    service = build('gmail', 'v1', credentials=creds)
    results = service.users().messages().list(userId='me', q=f"after:{date[0]} before:{date[1]}").execute()
    messages = results.get('messages', [])
    q.put(messages)
    print(f"[Thread] >> Time taken: {time.time() - start_time:.4f} sec.")
    print(len(messages))  
    #time.sleep(0.2)

In [23]:
from concurrent.futures import ThreadPoolExecutor
qq = queue.Queue()    
partial_function = partial(get_ids_pool, qq, token_path)

with ThreadPoolExecutor(max_workers=7) as executor:   #multiple workers for faster calling/extracting
     executor.map(partial_function, dates)
        
l = [qq.get() for _ in range(qq.qsize())]
qq.task_done()

[Thread] >> Time taken: 1.5693 sec.
23
[Thread] >> Time taken: 1.5722 sec.
18
[Thread] >> Time taken: 1.5870 sec.
29
[Thread] >> Time taken: 1.5834 sec.
35
[Thread] >> Time taken: 1.6032 sec.
23
[Thread] >> Time taken: 1.6403 sec.
38
[Thread] >> Time taken: 1.6615 sec.
30


In [12]:
print(l)

[[{'id': '1988691375a9beb7', 'threadId': '1988691375a9beb7'}, {'id': '198863ac43083174', 'threadId': '198863ac43083174'}, {'id': '198862a53f1f8273', 'threadId': '198862a53f1f8273'}, {'id': '19885fd9b18c5065', 'threadId': '19885fd9b18c5065'}, {'id': '1988571a8df57281', 'threadId': '1988571a8df57281'}, {'id': '19884e1a5a79d487', 'threadId': '19884e1a5a79d487'}, {'id': '19884da798c9f277', 'threadId': '19884da798c9f277'}, {'id': '19884d6cb429e40f', 'threadId': '19884d6cb429e40f'}, {'id': '19884a794e87153f', 'threadId': '19884a794e87153f'}, {'id': '19884a16f6fd5793', 'threadId': '19884a16f6fd5793'}, {'id': '198849ff4af8c706', 'threadId': '198849ff4af8c706'}, {'id': '19884757e8c8ca3e', 'threadId': '19884757e8c8ca3e'}, {'id': '1988441f9389d9e0', 'threadId': '1988441f9389d9e0'}, {'id': '198843c96012890b', 'threadId': '198843c96012890b'}, {'id': '19884051ffcad32c', 'threadId': '19884051ffcad32c'}, {'id': '19883e41a841ed0a', 'threadId': '19883e41a841ed0a'}, {'id': '19883e30d2ec7636', 'threadId':

In [13]:
from utils import extract_headers, decode_body

In [15]:
# import json
# d={}
# for msg in messages:
#     msg_data = service.users().messages().get(userId="me", id=msg["id"], format="full").execute()

#     # payload = msg_data.get("payload", {})
#     # subject, sender = extract_headers(payload)
#     # body_text = decode_body(payload, prefer_plain=True).strip()

#     # print(f"\n--- Email ---")
#     # print(f"From: {sender}")
#     # print(f"Subject: {subject}")
#     # print(f"Body:\n{body_text}")
#     d[msg["id"]] = msg_data

# with open(f"email.json", "w") as f:
#     json.dump(d, f)


In [16]:
import sys

size_in_bytes=sys.getsizeof(l)

size_in_mb=size_in_bytes / (1024 * 1024)
print(f"Size in MB: {size_in_mb}")

Size in MB: 0.00011444091796875


In [None]:
len(l)

7