In [None]:
from atproto import Client
from datetime import datetime
import os
import pandas as pd
import pytz
import time
 
# write a chunk of post-data to CSV
def write_chunk(df: pd.DataFrame, target_user: str) -> None:
    """Write one chunk of post data to a CSV file under ``usr_<target_user>/``.

    The file name embeds the earliest and latest ``post_created_at`` values
    found in the chunk:
    ``<target_user>.bsky.social_from_<start>_to_<end>.csv``.

    Args:
        df: Post data. Must contain a ``post_created_at`` column whose
            ``min()``/``max()`` are pandas Timestamps.
        target_user: Handle the posts belong to; used for both the output
            directory name and the file name.

    Returns:
        None. An empty ``df`` is skipped without touching the filesystem,
        because there is no date range to name the file after.
    """
    if df.empty:
        return

    stamp_format = '%Y-%m-%d_%H-%M-%S'
    created = df['post_created_at']
    start = created.min().to_pydatetime().strftime(stamp_format)
    end = created.max().to_pydatetime().strftime(stamp_format)

    # exist_ok=True replaces the separate exists() check, so a directory that
    # appears between the check and the create can no longer raise.
    out_dir = f"usr_{target_user}"
    os.makedirs(out_dir, exist_ok=True)

    filename = f"{out_dir}/{target_user}.bsky.social_from_{start}_to_{end}.csv"
    df.to_csv(filename)

# ingest all posts for a specific BlueSky user as a collection of CSVs
def ingest_posts_by_user(target_user: str, chunk_size_mb: float = 100, page_delay_s: float = 2) -> None:
    """Download every post in a BlueSky user's feed and save it as CSV chunks.

    Logs in with the ``BSY_USR`` / ``BSY_KEY`` environment variables, pages
    through ``get_author_feed`` until the response carries no cursor, and
    hands the accumulated rows to ``write_chunk`` whenever they reach
    ``chunk_size_mb`` of in-memory size. Whatever is left after the last page
    is written as a final chunk.

    Args:
        target_user: Handle of the account whose feed is ingested.
        chunk_size_mb: In-memory DataFrame size (MiB) at which the rows
            collected so far are flushed to a CSV file.
        page_delay_s: Seconds to wait between page requests, to limit the
            query rate.

    Raises:
        RuntimeError: If ``BSY_USR`` or ``BSY_KEY`` is not set.
    """
    # Retrieve login info from local env; fail with a clear message instead of
    # an AttributeError from calling .lower() on None.
    usr = os.getenv('BSY_USR')
    key = os.getenv('BSY_KEY')
    if not usr or not key:
        raise RuntimeError(
            "BSY_USR and BSY_KEY must both be set in the environment to log in to BlueSky"
        )

    # Instantiate a BlueSky session
    cli = Client()
    cli.login(usr.lower(), key)

    # A fresh dict of fresh lists per chunk. Re-binding to a shared template
    # dict would keep appending to the same lists and duplicate earlier rows
    # into every later chunk.
    data = _empty_post_columns()
    page_num = 0
    csr = None

    while True:
        # Retrieve a paginated post-feed for a specific bluesky user
        page_num += 1
        resp = cli.get_author_feed(target_user, cursor=csr)
        print(f"Retrieving post data from page {page_num} for user @{target_user}.bsky.social...", end='\r')
        for item in resp.feed:
            _append_post(data, item.post)

        # Size check happens AFTER the page is appended, so the rows of the
        # most recent page are always part of whatever gets written.
        if data['content_id']:
            df = pd.DataFrame(data)
            if df.memory_usage(deep=True).sum() / (1024 * 1024) >= chunk_size_mb:
                write_chunk(df, target_user)
                data = _empty_post_columns()
            del df  # no need to hold a second copy while the next page downloads

        csr = resp.cursor
        if not csr:
            break                 # final page of posts reached
        time.sleep(page_delay_s)  # limit query rate to avoid throttling, etc

    # Flush whatever has accumulated since the last full chunk.
    if data['content_id']:
        write_chunk(pd.DataFrame(data), target_user)
    print(f"\nPost Ingestion for user @{target_user}.bsky.social Complete!")


# Column names of every CSV chunk, in output order.
_POST_COLUMNS = (
    'content_id',
    'post_uri',
    'like_count',
    'quote_count',
    'reply_count',
    'repost_count',
    'post_created_at',
    'text',
    'tags',
    'embedded_link_title',
    'embedded_link_description',
    'embedded_link_uri',
    'author_username',
    'author_displayname',
    'author_account_created_at',
)


def _empty_post_columns() -> dict:
    """Return a new column-name -> empty-list mapping for one chunk."""
    return {name: [] for name in _POST_COLUMNS}


def _parse_utc_timestamp(raw):
    """Parse a timestamp string into a timezone-aware UTC pandas Timestamp.

    Timestamps are the one field stored as real timestamps rather than raw
    strings, so that ``write_chunk`` can use ``.min()`` / ``.max()`` to name
    each file. ``pd.to_datetime`` is used instead of a single fixed strptime
    format so that values without fractional seconds, or with a numeric UTC
    offset instead of ``Z``, do not abort the whole ingestion.
    """
    return pd.to_datetime(raw, utc=True)


def _external_link_field(record, field: str):
    """Return one attribute of the record's external-link embed, or 'null' if the post has none."""
    try:
        return getattr(record.embed.external, field)
    except AttributeError:
        return 'null'


def _append_post(data: dict, post) -> None:
    """Append one feed post's fields to the column lists in ``data``."""
    record = post.record
    author = post.author
    # Build the full row first so a failure while reading any field cannot
    # leave the column lists with unequal lengths.
    row = {
        'content_id': post.cid,
        'post_uri': post.uri,
        'like_count': post.like_count,
        'quote_count': post.quote_count,
        'reply_count': post.reply_count,
        'repost_count': post.repost_count,
        'post_created_at': _parse_utc_timestamp(record.created_at),
        'text': record.text,
        'tags': record.tags,
        # post may or may not have external links
        'embedded_link_title': _external_link_field(record, 'title'),
        'embedded_link_description': _external_link_field(record, 'description'),
        'embedded_link_uri': _external_link_field(record, 'uri'),
        'author_username': author.handle,
        'author_displayname': author.display_name,
        'author_account_created_at': _parse_utc_timestamp(author.created_at),
    }
    for name in _POST_COLUMNS:
        data[name].append(row[name])

In [42]:
# Run the ingestion for the 'politico.com' account. This logs in, pages through
# the author's feed over the network, and writes CSV output under usr_politico.com/.
ingest_posts_by_user('politico.com')

Post Ingestion for user @politico.com.bsky.social Complete!ky.social...


In [35]:
# Scratch check: log in and look at the author display name on the first feed item.
# Retrieve login info from local env
USR = os.getenv('BSY_USR').lower()  # os.getenv returns None when BSY_USR is unset, so .lower() would raise
KEY = os.getenv('BSY_KEY')
# Instantiate a BlueSky session
cli = Client()
cli.login(USR, KEY)

# No cursor is passed, so this requests the feed from its start.
resp = cli.get_author_feed('politico.com')
# Last expression of the cell: rendered as the cell output.
resp.feed[0].post.author.display_name

'Politico'

In [None]:
# Sanity check: len() of a DataFrame built from a dict of equal-length lists
# is its number of rows (3 here), which is what a `len(df) > 0` test relies on.
data = {'foo': [1,2,3], 'bar': ['a', 'b', 'c']}
len(pd.DataFrame(data))


3