In [None]:
from atproto import Client
from datetime import datetime
import os
import pandas as pd
import pytz
import time

# write a chunk of post-data to CSV
def write_chunk(df: pd.DataFrame, bsky_username: str):
    """Write one chunk of post data to a CSV file under ``usr_<bsky_username>/``.

    The file is named
    ``<bsky_username>.bsky.social_<earliest>_to_<latest>.csv`` where the two
    timestamps are the minimum and maximum of the ``post_created_at`` column,
    formatted as ``YYYY-MM-DD_HH-MM-SS``.

    Args:
        df: Post data; must contain a datetime-like ``post_created_at`` column.
        bsky_username: Name used for the output directory and filename prefix.
    """
    # An empty frame has no min/max timestamp to name the file after.
    if df.empty:
        return

    stamp_format = '%Y-%m-%d_%H-%M-%S'
    created_at = df['post_created_at']
    start = created_at.min().to_pydatetime().strftime(stamp_format)
    end = created_at.max().to_pydatetime().strftime(stamp_format)

    out_dir = f"usr_{bsky_username}"
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): ".bsky.social" is always appended, even if bsky_username
    # is already a full handle -- confirm this is the intended naming.
    filename = os.path.join(out_dir, f"{bsky_username}.bsky.social_{start}_to_{end}.csv")
    df.to_csv(filename)

# column names (and CSV column order) of one chunk of post data
_POST_COLUMNS = (
    'content_id',
    'post_uri',
    'like_count',
    'quote_count',
    'reply_count',
    'repost_count',
    'post_created_at',
    'text',
    'tags',
    'embedded_link_title',
    'embedded_link_description',
    'embedded_link_uri',
    'author_username',
    'author_displayname',
    'author_account_created_at',
)

# a chunk is flushed to CSV once its DataFrame reaches this many MiB
_MAX_CHUNK_MB = 100

# how long to wait before re-requesting a page that came back without a cursor
_RETRY_WAIT_SECONDS = 300


# build a brand-new, empty column store (fresh lists every call)
def _empty_post_columns() -> dict:
    """Return a dict mapping every post column name to a new empty list."""
    return {column: [] for column in _POST_COLUMNS}


# turn an API timestamp string into a timezone-aware UTC datetime
def _parse_utc_timestamp(raw):
    """Parse a timestamp string into a UTC-aware ``datetime``.

    The ``YYYY-MM-DDTHH:MM:SS.ffffffZ`` form is tried first; any other form
    (e.g. no fractional seconds, or a numeric offset) is handed to pandas.
    Returns ``None`` when ``raw`` is ``None``.
    """
    if raw is None:
        return None
    try:
        naive = datetime.strptime(raw, '%Y-%m-%dT%H:%M:%S.%fZ')
    except ValueError:
        # Normalise to the same tzinfo object as the fast path so a column
        # never ends up with mixed timezone implementations.
        return pd.to_datetime(raw, utc=True).to_pydatetime().astimezone(pytz.UTC)
    return pytz.UTC.localize(naive)


# copy the fields of a single feed item into the column store
def _append_post(data: dict, item) -> None:
    """Append one feed item's post, embed, and author fields to ``data``."""
    post = item.post
    record = post.record
    author = post.author

    data['content_id'].append(post.cid)
    data['post_uri'].append(post.uri)
    data['like_count'].append(post.like_count)
    data['quote_count'].append(post.quote_count)
    data['reply_count'].append(post.reply_count)
    data['repost_count'].append(post.repost_count)
    data['post_created_at'].append(_parse_utc_timestamp(record.created_at))
    data['text'].append(record.text)
    data['tags'].append(record.tags)

    # post may or may not have external links; missing pieces become 'null'
    external = getattr(getattr(record, 'embed', None), 'external', None)
    data['embedded_link_title'].append(getattr(external, 'title', 'null'))
    data['embedded_link_description'].append(getattr(external, 'description', 'null'))
    data['embedded_link_uri'].append(getattr(external, 'uri', 'null'))

    data['author_username'].append(author.handle)
    data['author_displayname'].append(author.display_name)
    data['author_account_created_at'].append(_parse_utc_timestamp(author.created_at))


# ingest all posts for a specific BlueSky user as a collection of CSVs
def stash_feed(bsky_client: Client, bsky_did: str, bsky_username: str):
    """Page through a user's author feed and store it as CSV chunks.

    Posts are accumulated in memory; whenever the accumulated DataFrame
    reaches ``_MAX_CHUNK_MB`` it is written with ``write_chunk`` and a fresh,
    empty buffer is started. Whatever remains after the last page is written
    as a final chunk.

    Args:
        bsky_client: Logged-in client used for ``get_author_feed`` requests.
        bsky_did: DID of the account whose feed is retrieved.
        bsky_username: Name used in progress messages and passed to
            ``write_chunk`` for output naming.
    """
    data = _empty_post_columns()

    csr = None
    pages_remain = True
    page_num = 0

    # Iterate through every post in their account's post history
    while pages_remain:

        # check if the current buffer is already "full"; if it is, stash it
        # as CSV and start over with a NEW empty buffer (not an alias of the
        # old one, which would keep every previous post in later chunks)
        df = pd.DataFrame(data)
        if df.memory_usage(deep=True).sum() / (1024*1024) >= _MAX_CHUNK_MB:
            print("\nCondition `df.memory_usage(deep=True).sum() / (1024*1024) >= 100` TRIGGERED")
            write_chunk(df, bsky_username)
            data = _empty_post_columns()
        del df  # no need to lock up memory while the next page is fetched

        # Retrieve a paginated post-feed for a specific bluesky user
        page_num += 1
        resp = bsky_client.get_author_feed(actor=bsky_did, cursor=csr)
        print(f"Retrieving post data from page {page_num} for user @{bsky_username}.bsky.social...", end='\r')
        for item in resp.feed:
            _append_post(data, item)

        if not resp.cursor:
            print("\nCondition `not resp.cursor` TRIGGERED")
            # A missing cursor might be throttling rather than the real end of
            # the feed: wait, ask for the same page again, and only stop if
            # the cursor is STILL missing.
            # NOTE(review): the retry response's feed is not ingested; this
            # assumes it is the same page that was just processed -- confirm.
            print("\nWaiting 5 min before initiating the next request...")
            time.sleep(_RETRY_WAIT_SECONDS)
            resp = bsky_client.get_author_feed(actor=bsky_did, cursor=csr)
            if not resp.cursor:
                print("Still did not get a new cursor after waiting for 5 minutes")
                pages_remain = False
        csr = resp.cursor        # advance cursor when another page of posts is available

    # Build the final frame AFTER the loop so the last page's posts are
    # included and the name is always bound.
    df = pd.DataFrame(data)
    if len(df) > 0:
        print("\nCondition `len(df) > 0` TRIGGERED")
        write_chunk(df, bsky_username)

def ingest_posts_by_user(bsky_username: str):
    """Log in to BlueSky and stash every post of ``bsky_username`` as CSV.

    Credentials are read from the ``BSY_USR`` (login handle, lower-cased
    before use) and ``BSY_KEY`` environment variables.

    Args:
        bsky_username: Handle that is resolved to a DID and then ingested.

    Raises:
        RuntimeError: If ``BSY_USR`` or ``BSY_KEY`` is unset or empty.
    """
    usr = os.getenv('BSY_USR')
    key = os.getenv('BSY_KEY')

    # Fail early with a clear message instead of an AttributeError from
    # calling .lower() on None when a variable is not set.
    missing = [name for name, value in (('BSY_USR', usr), ('BSY_KEY', key)) if not value]
    if missing:
        raise RuntimeError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    # Instantiate a BlueSky session
    bsky_client = Client()
    bsky_client.login(usr.lower(), key)

    # user DID cannot change for a given account, but username can
    # therefore, use DID and not the handle when retrieving posts from a specific BlueSky user
    bsky_did = bsky_client.com.atproto.identity.resolve_handle({'handle': bsky_username}).did

    stash_feed(bsky_client, bsky_did, bsky_username)

    # NOTE(review): bsky_username is passed to resolve_handle unchanged, yet
    # ".bsky.social" is appended here -- confirm which form callers supply.
    print(f"\nPost Ingestion for user @{bsky_username}.bsky.social Complete!")

In [18]:
# Scratch cell: log in, resolve one handle to its DID, and display the first
# page of that account's author feed.
# Credentials come from the BSY_USR / BSY_KEY environment variables.
# NOTE: os.getenv returns None for an unset variable, so .lower() raises
# AttributeError if BSY_USR is not set.
USR = os.getenv('BSY_USR').lower()
KEY = os.getenv('BSY_KEY')

# Instantiate a BlueSky session
bsky_client = Client()
bsky_client.login(USR, KEY)

# Resolve the handle to a DID, then request the feed without a cursor
bsky_did = bsky_client.com.atproto.identity.resolve_handle({'handle': 'politico.com'}).did
resp = bsky_client.get_author_feed(actor = bsky_did)
# bare expression so the notebook displays the response object
resp

Response(feed=[FeedViewPost(post=PostView(author=ProfileViewBasic(did='did:plc:yf6hctt2ug3qyfty4in64yob', handle='politico.com', associated=None, avatar='https://cdn.bsky.app/img/avatar/plain/did:plc:yf6hctt2ug3qyfty4in64yob/bafkreid3rrarj36xsctgqpt7wlr2iv7yuzo3vixvtgxk4ryzx2pdnunfwu@jpeg', created_at='2023-04-28T22:40:12.228Z', display_name='Politico', labels=[], viewer=ViewerState(blocked_by=False, blocking=None, blocking_by_list=None, followed_by=None, following='at://did:plc:pk3ngv5kofsjoet7gs4o2o7f/app.bsky.graph.follow/3lkqfbz3jg522', known_followers=None, muted=False, muted_by_list=None, py_type='app.bsky.actor.defs#viewerState'), py_type='app.bsky.actor.defs#profileViewBasic'), cid='bafyreigzftastiaq6facatw3s2crsovn7ygz62pul4dpqprpmzlfrc27hi', indexed_at='2025-03-20T22:47:01.359Z', record=Record(created_at='2025-03-20T22:46:59.942Z', text="Republicans fear Gavin Newsom's podcast could help him in a potential 2028 presidential bid.", embed=Main(external=External(description='It’