In [1]:
import os
import csv
import base64
import pandas as pd
import time
import re
from googleapiclient.discovery import build
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request


# Scrape sent emails using the Gmail API:

In [None]:
# NOTE: authentication needs either a previously saved token.json or the OAuth
# client-secrets file credentials.json in the working directory; token.json is
# (re)written by authenticate_gmail() after a successful login or refresh.

# Read-only Gmail scope requested during the OAuth flow.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

# All relative paths in this notebook (token.json, credentials.json, the CSV
# outputs) resolve against this directory. Set the GMAIL_SCRAPE_DIR environment
# variable to run on another machine without editing the cell; the default is
# the original hard-coded location.
os.chdir(os.environ.get('GMAIL_SCRAPE_DIR', '/Users/ianduke/Desktop/LLMs'))

def authenticate_gmail():
    """Return Gmail credentials for SCOPES, logging in or refreshing if needed.

    Looks for ``token.json`` in the working directory. When the stored
    credentials are missing or not valid they are either refreshed (expired
    with a refresh token) or obtained through the local-server OAuth flow
    driven by ``credentials.json``. In both of those cases the resulting
    credentials are written back to ``token.json``.

    Returns:
        The credentials object to pass to ``build(...)``.
    """
    token_path = 'token.json'

    # Reuse the cached token from a previous run when there is one.
    stored = None
    if os.path.exists(token_path):
        stored = Credentials.from_authorized_user_file(token_path, SCOPES)

    # Fast path: cached credentials are still good, nothing to write.
    if stored and stored.valid:
        return stored

    if stored and stored.expired and stored.refresh_token:
        # Expired but refreshable: no browser interaction required.
        stored.refresh(Request())
    else:
        # No usable token: run the interactive consent flow on a free local port.
        oauth_flow = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES)
        stored = oauth_flow.run_local_server(port=0)

    # Persist the new/refreshed credentials for the next run.
    with open(token_path, 'w') as token_file:
        token_file.write(stored.to_json())
    return stored

def _iter_leaf_parts(part):
    """Yield every non-container MIME part below ``part``, depth-first."""
    if 'parts' in part:
        for child in part['parts']:
            yield from _iter_leaf_parts(child)
    else:
        yield part


def get_email_content(message):
    """Concatenate the decoded text of every text/plain and text/html part.

    Args:
        message: Gmail API message resource (a dict with a ``'payload'`` key).

    Returns:
        str: The decoded bodies of all text parts in document order, or ``""``
        when the message has no text part with inline data.
    """
    email_body = ""
    # Walk the MIME tree to any depth; a fixed two-level walk misses text nested
    # inside e.g. multipart/mixed > multipart/related > multipart/alternative.
    for part in _iter_leaf_parts(message['payload']):
        if part.get('mimeType') not in ('text/plain', 'text/html'):
            continue
        data = (part.get('body') or {}).get('data')
        if not data:
            continue
        # urlsafe_b64decode rejects input whose length is not a multiple of 4,
        # so restore any '=' padding that was stripped.
        padded = data + '=' * (-len(data) % 4)
        # errors='replace': one message in a legacy charset must not abort the
        # whole scrape with UnicodeDecodeError.
        email_body += base64.urlsafe_b64decode(padded).decode('utf-8', errors='replace')

    return email_body

def get_sent_emails(service):
    """Fetch every message labelled SENT and return one row per message.

    Args:
        service: Gmail API client, as returned by ``build('gmail', 'v1', ...)``.

    Returns:
        list: One ``[date, to, subject, body]`` list per message, in the order
        the API listed them. A header that is not present stays ``""``; the
        body comes from ``get_email_content``.
    """
    # Collect the message ids page by page. pageToken is only added once the
    # API has handed one back, so the first request is the plain listing.
    list_kwargs = {'userId': 'me', 'labelIds': ['SENT']}
    messages = []
    while True:
        results = service.users().messages().list(**list_kwargs).execute()
        messages.extend(results.get('messages', []))
        next_page_token = results.get('nextPageToken')
        if not next_page_token:
            break
        list_kwargs['pageToken'] = next_page_token

    email_data = []

    for message in messages:
        msg = service.users().messages().get(userId='me', id=message['id']).execute()
        payload = msg['payload']

        # Header names are case-insensitive, so compare in lower case. If a
        # header repeats, the last occurrence wins. `or []` covers a payload
        # with no headers at all.
        fields = {'date': "", 'to': "", 'subject': ""}
        for header in payload.get("headers") or []:
            key = (header.get("name") or "").lower()
            if key in fields:
                fields[key] = header.get("value")

        email_body = get_email_content(msg)

        email_data.append([fields['date'], fields['to'], fields['subject'], email_body])

        # To avoid hitting the API rate limit, add a short delay
        time.sleep(0.1)

    return email_data

def save_to_csv(emails, path='sent_emails.csv'):
    """Write the scraped rows to a UTF-8 CSV file with a header row.

    Args:
        emails: Iterable of ``[date, to, subject, body]`` rows.
        path: Destination file. Defaults to ``sent_emails.csv`` in the current
            working directory; an existing file is overwritten.
    """
    # newline='' hands line-ending control to the csv module so multi-line
    # bodies are quoted and round-trip intact.
    with open(path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Date", "To", "Subject", "Body"])
        writer.writerows(emails)

def main():
    """Authenticate, download all sent messages, and write them to CSV."""
    gmail_service = build('gmail', 'v1', credentials=authenticate_gmail())
    sent_rows = get_sent_emails(gmail_service)
    save_to_csv(sent_rows)


# Run the scrape when this cell/script is executed directly.
if __name__ == '__main__':
    main()

# Clean the scraped email bodies:

In [4]:
# Remove rows where I introduce myself ('... name is Ian', case-insensitive);
# rows with a missing Body are kept (na=False).
# NOTE(review): `ian_emails` is not defined in any cell visible here — presumably
# it was loaded from the scraped CSV in a cell that has since been removed.
# Restore that cell so the notebook survives Restart Kernel -> Run All.
ian_emails = ian_emails[~ian_emails['Body'].str.contains('name is Ian', case=False, na=False)]
# drop=True discards the old row labels instead of adding them as an extra
# 'index' column, which would otherwise end up in the saved CSV and make
# re-running this cell fail once the inserted column names collide.
ian_emails = ian_emails.reset_index(drop=True)

In [None]:
# Lazily matches from the start of the body up to AND INCLUDING the first of:
# a sign-off ("Ian", optionally preceded by "," or "-"), or a quoted-reply stamp
# such as "On Mon, Jan 1, 2024". DOTALL lets the match run across line breaks.
OWN_TEXT_PATTERN = re.compile(
    r'(.*?(?:,?\s*Ian|-\s*Ian|On\s+\w{3},\s+\w{3}\s+\d{1,2},\s+\d{4}))', re.DOTALL)


def trim_to_own_text(raw_body):
    """Return the leading part of a body up to its first sign-off/reply marker.

    Args:
        raw_body: Cell value from the 'Body' column; it is stringified first,
            so missing values are handled like any other text.

    Returns:
        str: The matched prefix (marker included), or the sentinel ``'NONE'``
        when no marker is found or the text contains ``"content="``
        (presumably HTML markup — confirm).
    """
    text = str(raw_body)
    match = OWN_TEXT_PATTERN.search(text)
    if match is not None and "content=" not in text:
        return match.group(1)
    return 'NONE'


emails_copy = ian_emails.copy()
# One column-wide assignment replaces the per-row chained writes
# (emails_copy['Body'][i] = ...), which pandas warns about and which stop
# updating the frame when Copy-on-Write is enabled.
emails_copy['Body'] = emails_copy['Body'].map(trim_to_own_text)

In [None]:
# Discard the rows flagged with the 'NONE' sentinel and renumber the rest from 0.
emails_copy = emails_copy.loc[emails_copy['Body'].ne('NONE')].reset_index(drop=True)

In [None]:
# Finally, remove the quoted-reply date stamps (e.g. "On Mon, Jan 5, 2024") from the body.
# The day may have one or two digits and the pieces may be separated by any
# whitespace (including a line wrap).
pattern = r'On\s+\w{3},\s+\w{3}\s+\d{1,2},\s+\d{4}'

def replace_date_strings(text):
    """Return ``text`` with every "On <Day>, <Mon> <d>, <yyyy>" stamp removed.

    Args:
        text: The email body as a string.

    Returns:
        str: ``text`` with all matches of ``pattern`` deleted; surrounding
        whitespace is left untouched.
    """
    return re.sub(pattern, '', text)

# Apply the clean-up to the whole column at once; values are stringified first,
# exactly as the per-row version did. Unlike a positional `.loc[i, ...]` loop
# this does not depend on the frame having a 0..n-1 index.
emails_copy['Body'] = emails_copy['Body'].map(lambda body: replace_date_strings(str(body)))

# Save to CSV for future fine-tuning:

In [None]:
# Write the cleaned frame to the working directory. The row index is written
# too (pandas' default), so it shows up as the first, unnamed column of the file.
emails_copy.to_csv(path_or_buf='sent_emails_usf_cleaned.csv', index=True)