In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

# OAuth scope requested from Gmail: read-only access.
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

# Values of the `token_path` / `credentials_path` environment variables;
# each one is None when its variable is not set.
token_path = os.environ.get("token_path")
credentials_path = os.environ.get("credentials_path")

In [None]:
def get_message_metadata(service, user_id='me', max_results=5, query="after:2025/08/08"):
    """Print the From, Subject and Date headers of messages matching a search.

    Args:
        service: Client object exposing ``users().messages()`` with ``list``
            and ``get`` request builders whose results provide ``execute()``.
        user_id: Value sent as ``userId`` on both requests.
        max_results: Value sent as ``maxResults`` on the list request.
        query: Search string sent as ``q`` on the list request. The default
            keeps the filter that used to be hard-coded in the body.

    Returns:
        None. Three lines (From, Subject, Date) are printed per message; a
        header that is absent from the response prints as ``None``.
    """
    wanted = ('From', 'Subject', 'Date')
    wanted_lower = {name.lower() for name in wanted}

    listing = service.users().messages().list(
        userId=user_id, maxResults=max_results, q=query
    ).execute()

    # The 'messages' key may be missing from the list response; treat that as
    # "nothing to show".
    for msg in listing.get('messages', []):
        msg_data = service.users().messages().get(
            userId=user_id,
            id=msg['id'],
            format='metadata',
            metadataHeaders=list(wanted),
        ).execute()
        headers = msg_data.get('payload', {}).get('headers', [])

        # Header field names are case-insensitive, so key the lookup on the
        # lower-cased name; otherwise a message carrying e.g. "subject" or
        # "DATE" would be reported as None.
        email_info = {
            header['name'].lower(): header['value']
            for header in headers
            if header['name'].lower() in wanted_lower
        }
        for label in wanted:
            print(f"{label}: {email_info.get(label.lower())}")

In [None]:
import gzip, msgspec
import pandas as pd
from dags.utils import extract_headers, decode_body 
# NOTE(review): absolute, machine-specific path — consider moving it to configuration.
path = "C:/airflow-docker/data/15-08-2025-10-41.json.gz"

# Read the gzip-compressed file fully into memory, then decode the bytes as msgpack.
# NOTE(review): the file name ends in .json.gz while the decoder is msgpack —
# confirm the on-disk format matches.
with gzip.open(path, 'rb') as f:
    decompressed_bytes = f.read()
decompressed_data = msgspec.msgpack.decode(decompressed_bytes)

df = pd.DataFrame(decompressed_data)

# Select the row-3 payload as a scalar. Selecting with a list of columns and
# taking `.values` would wrap it in a one-element NumPy array, which has no
# `.get`, while a later cell calls `payload.get("headers", [])`.
payload = df.loc[3, "Payload"]

# Show the first two rows as the cell output.
df.head(2)

Unnamed: 0,Id,Payload
0,1988691375a9beb7,"{'partId': '', 'mimeType': 'multipart/alternat..."
1,19885fd9b18c5065,"{'partId': '', 'mimeType': 'multipart/mixed', ..."


In [2]:
# extract_headers yields a (subject, sender) pair per payload; hold the pairs
# in a standalone Series rather than a temporary DataFrame column.
header_pairs = df["Payload"].apply(extract_headers)
df["Body"] = df["Payload"].apply(decode_body)

# Unpack the pair into one column per element.
df["Subject"] = header_pairs.apply(lambda pair: pair[0])
df["Sender"] = header_pairs.apply(lambda pair: pair[1])

# The raw payload is no longer needed once the flat columns exist.
df.drop(columns=["Payload"], inplace=True)

In [3]:
# Display the first ten rows of the frame as the cell output.
df.head(10)

Unnamed: 0,Id,Body,Subject,Sender
0,1988691375a9beb7,satwik ready to take your career to the next l...,Enhance Your Career with Braintrust's Career Help,Braintrust
1,19885fd9b18c5065,weekly digest hi leetcoder a leetcoder has rec...,LeetCode Weekly Digest,LeetCode
2,1988571a8df57281,competition launch open model redteaming chall...,Competition Launch: Open Model Red-Teaming Cha...,Kaggle
3,19884a794e87153f,,Status of your job application has changed,Naukri
4,198863ac43083174,premium members are 26x more likely to get hir...,"Methari, enjoy 50% off LinkedIn Premium for 2 ...",LinkedIn
5,19884d6cb429e40f,blackstraw is hiring job alert data engineeryo...,Data Engineer at Blackstraw and 12 more jobs i...,Glassdoor Jobs
6,19884da798c9f277,bytesar technologies is hiring job alert machi...,Ai/ml Excutive at Wellorgs Infotech Pvt. Ltd. ...,Glassdoor Jobs
7,198862a53f1f8273,jobs similar to data scientist at tallentoai m...,New jobs similar to Data Scientist at Tallento.ai,LinkedIn
8,19884a16f6fd5793,vcars auto private limited is hiring job alert...,Data Scientist at Hanumant Technology Pvt. Ltd...,Glassdoor Jobs
9,19884e1a5a79d487,posted on 862025 methari satwik top job picks ...,Dimitra Technology is hiring a Junior (Satelli...,LinkedIn


In [None]:
# Count how many rows carry each distinct value of the "Sender" column.
df["Sender"].value_counts()

In [None]:
# Print every entry stored under the "headers" key of `payload`, one per line
# (an empty list is used when the key is absent, so nothing is printed).
# NOTE(review): `.get` requires `payload` to be a mapping — confirm what the
# cell that assigns `payload` actually stores in it.
x = payload.get("headers", [])
for h in x:
    print(h)

In [None]:
from dags.utils import extract_headers, decode_body 
# The [:1] slice limits the walk to at most the first stored payload.
for payload in decompressed_data["Payload"][:1]:
    subject, sender = extract_headers(payload)
    body_text = decode_body(payload, prefer_plain=True).strip()

    # A single print with a newline separator emits the banner, sender and
    # subject lines exactly as three consecutive prints would.
    print("\n--- Email ---", f"From: {sender}", f"Subject: {subject}", sep="\n")
    #print(f"Body:\n{body_text}")

In [None]:
from dags.utils import preprocess_email_body
# Example: pass the body text decoded in the previous cell through
# preprocess_email_body, then print the length before and after on one line.
modified_text = preprocess_email_body(body_text)
print(f"{len(body_text)} {len(modified_text)}")