In [None]:
from atproto import Client
from datetime import datetime
import os
import pandas as pd
import pytz
import time

def parse_timestamp(timestamp_str, return_timezone: str='UTC'):
    if '-' in timestamp_str or '+' in timestamp_str:
        try:
            return datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S%z')
        except ValueError:
            pass
    for fmt in ('%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%SZ'):
        try:
            ts = datetime.strptime(timestamp_str, fmt)
            return pytz.timezone(return_timezone).localize(ts) 
        except ValueError:
            continue
    raise ValueError(f"Timestamp format not recognized: {timestamp_str}")

# Instantiate a BlueSky session
def bluesky_login():
    USR = os.getenv('BSY_USR').lower()
    KEY = os.getenv('BSY_KEY')
    bsky_client = Client()
    bsky_client.login(USR, KEY)
    return bsky_client, USR

def get_did(bsky_client: Client, bksy_handle: str) -> str:
    return bsky_client.com.atproto.identity.resolve_handle({'handle': bksy_handle}).did

def get_followers(bsky_client: Client, bsky_handle: str) -> list:
    bsky_did = get_did(bsky_client, bsky_handle)
    return bsky_client.get_follows(actor=bsky_did).follows

# write a chunk of post-data to CSV
def write_chunk(df: str, bsky_username: str) -> None:
    # filename format is <username>_<earliest_contained_post_date>_<latest_contained_post_date>
    start = df['post_created_timestamp'].min().to_pydatetime().strftime('%Y-%m-%d_%H-%M-%S')
    end   = df['post_created_timestamp'].max().to_pydatetime().strftime('%Y-%m-%d_%H-%M-%S')
    dir   = f"output_data/usr_{bsky_username}"
    if not os.path.exists(dir):
        os.makedirs(dir)
    filename = f"{dir}/{bsky_username}.bsky.social_{start}_to_{end}.csv"
    df.to_csv(filename)

# ingest all posts for a specific BlueSky user as a collection of CSVs
def stash_feed(bsky_client: Client, bsky_did: str, bsky_username: str) -> None:
    schema = {'content_id':                       []
             ,'post_uri':                         []
             ,'like_count':                       []
             ,'quote_count':                      []
             ,'reply_count':                      []
             ,'repost_count':                     []
             ,'post_created_timestamp':           []
             ,'text':                             []
             ,'tags':                             []
             ,'embedded_link_title':              []
             ,'embedded_link_description':        []
             ,'embedded_link_uri':                []
             ,'author_username':                  []
             ,'author_displayname':               []
             ,'author_account_created_timestamp': []
             ,'record_captured_timestamp':        []
             }
    data = schema 
    
    csr          = None
    pages_remain = True
    filenum      = 0
    page_num     = 0
    
    # Iterate through every post in their account's post history
    while pages_remain:
        
        # # check if the current file is already "full" (larger than 100 MB)
        # # if it is, stash the current data object as CSV and reset a new empty one
        df = pd.DataFrame(data)
        if df.memory_usage(deep=True).sum() / (1024*1024) >= 100:
            print("\nCondition `df.memory_usage(deep=True).sum() / (1024*1024) >= 100` TRIGGERED")
            filenum+=1
            write_chunk(df, bsky_username)
            data = schema
            del df # No need to lock up memory while the next instance of `data` is filling up...
        
        # Retrieve a paginated post-feed for a specific bluesky user
        page_num += 1  
        resp      = bsky_client.get_author_feed(actor=bsky_did, cursor=csr)
        feed      = resp.feed
        print(f"Retrieving post data from page {page_num} for user @{bsky_username}.bsky.social...", end='\r')
        for item in feed:
            # i drink your data! i DRINK IT UP ლಠ益ಠ)ლ
            data['content_id'].append(item.post.cid)
            data['post_uri'].append(item.post.uri)
            data['like_count'].append(item.post.like_count)
            data['quote_count'].append(item.post.quote_count)
            data['reply_count'].append(item.post.reply_count)
            data['repost_count'].append(item.post.repost_count)

            # extract timestamp strings as actual timestamps, including timezone
            # ts = datetime.strptime(item.post.record.created_at, '%Y-%m-%dT%H:%M:%S.%fZ')
            ts = parse_timestamp(item.post.record.created_at)
            data['post_created_timestamp'].append(ts)   

            data['text'].append(item.post.record.text)
            data['tags'].append(item.post.record.tags)
            
            # post may or may not have external links
            try:
                data['embedded_link_title'].append(item.post.record.embed.external.title)
            except AttributeError:
                data['embedded_link_title'].append('null')
            try:
                data['embedded_link_description'].append(item.post.record.embed.external.description)
            except AttributeError:
                data['embedded_link_description'].append('null')
            try:
                data['embedded_link_uri'].append(item.post.record.embed.external.uri)
            except AttributeError:
                data['embedded_link_uri'].append('null')
            data['author_username'].append(item.post.author.handle)
            data['author_displayname'].append(item.post.author.display_name)

            # extract timestamp strings as actual timestamps, including timezone
            # ts = datetime.strptime(item.post.author.created_at, '%Y-%m-%dT%H:%M:%S.%fZ')
            ts = parse_timestamp(item.post.author.created_at)
            data['author_account_created_timestamp'].append(ts) 

            ts = datetime.now(pytz.timezone('America/New_York')).astimezone(pytz.timezone('UTC'))
            data['record_captured_timestamp'].append(ts) 
            
        if not resp.cursor:
            print("\nCondition `not resp.cursor` TRIGGERED")
            pages_remain = False
        csr = resp.cursor        # reset cursor when another page of posts is available
    
    df = pd.DataFrame(data)
    if len(df) > 0:
        print("\nCondition `len(df) > 0` TRIGGERED")
        write_chunk(df, bsky_username)


def ingest_posts_by_user(bsky_client: Client, bsky_handle: str) -> None:
    bsky_did = get_did(bsky_client, bsky_handle)

    # user DID cannot change for a given account, but username can
    # therefore, use DID and not the handle when retrieving posts from a specific BlueSky user
    stash_feed(bsky_client, bsky_did, bsky_handle)
    
    print(f"\nPost Ingestion for user @{bsky_handle}.bsky.social Complete!")

In [62]:
cli, usr = bluesky_login()
fol = get_followers(cli, usr)
followers = {item.handle: [item.did, item.display_name] for item in fol}

for follower in followers:
    ingest_posts_by_user(cli, follower)

Retrieving post data from page 215 for user @mydoggyruss.bsky.social.bsky.social...

ValueError: Timestamp format not recognized: 2024-11-14T01:10:06+00:00

In [69]:
s = '2024-11-14T01:10:06+00:00'
datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')

datetime.datetime(2024, 11, 14, 1, 10, 6, tzinfo=datetime.timezone.utc)