In [273]:
import duckdb

# connect to DuckDB database file
con = duckdb.connect("all_posts.duckdb")

# export first post date and body per user, deduplicated
con.execute("""
    COPY (
        SELECT author_did, created_at AS first_post_created_at, body AS first_post_body
        FROM (
            SELECT
                author_did,
                created_at,
                body,
                ROW_NUMBER() OVER (PARTITION BY author_did ORDER BY created_at ASC) AS rn
            FROM all_posts
        )
        WHERE rn = 1
    ) TO 'first_posts_per_user.csv' (HEADER, DELIMITER ',');
""")

con.close()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [274]:
import pandas as pd

# load csvs
users_df = pd.read_csv("sample_users.csv")
first_posts_df = pd.read_csv("first_posts_per_user.csv")

# merge on did
merged_df = users_df.merge(
    first_posts_df,
    how="inner",
    left_on="did",
    right_on="author_did"
)

#  cleanup
merged_df = merged_df.drop(columns=["author_did"])

# Save final result
merged_df.to_csv("sample_users_with_first_posts.csv", index=False)
print("CSV written: sample_users_with_first_posts.csv")


CSV written: sample_users_with_first_posts.csv


In [275]:
import pandas as pd

df = pd.read_csv("sample_users_with_first_posts.csv")

print(df.head())


                                did                      handle  \
0  did:plc:ooehugjick2vkzwlr64lephm        haleaziz.bsky.social   
1  did:plc:qy3kg6jvtlhenv5ojqurynwh     joshuacroke.bsky.social   
2  did:plc:ibqwicj6ersmvw36yldikozx  inspiringtimmy.bsky.social   
3  did:plc:f4ultoamz3y2m4drcjvhoecu  wilmonstouches.bsky.social   
4  did:plc:yzhdirwmonvaezndwnssoosd      kerrieneet.bsky.social   

                                                 bio               created_at  \
0  I cover DHS and immigration policy for the New...  2024-11-18 15:39:52.581   
1  Founder of 🏳️‍🌈 queerforcities.com / 🌈loveyour...  2024-11-15 11:45:12.223   
2  Best known for watching the Queen's Gambit fro...  2024-11-06 08:11:15.881   
3  same name but not rosh‘s ex | #OMAR : ”oh, tha...  2024-11-15 17:01:25.026   
4  Wild places in the SW and beyond. Where to go?...  2024-11-20 01:32:25.972   

        first_post_created_at  \
0  2024-11-16 20:04:59.812-05   
1  2024-11-15 07:36:53.045-05   
2  2024-11-

In [276]:
len(df)

23335

In [277]:
df

Unnamed: 0,did,handle,bio,created_at,first_post_created_at,first_post_body
0,did:plc:ooehugjick2vkzwlr64lephm,haleaziz.bsky.social,I cover DHS and immigration policy for the New...,2024-11-18 15:39:52.581,2024-11-16 20:04:59.812-05,If Trump-Miller-Homan want to shut down the ri...
1,did:plc:qy3kg6jvtlhenv5ojqurynwh,joshuacroke.bsky.social,Founder of 🏳️‍🌈 queerforcities.com / 🌈loveyour...,2024-11-15 11:45:12.223,2024-11-15 07:36:53.045-05,A they/them here for good trouble and out to m...
2,did:plc:ibqwicj6ersmvw36yldikozx,inspiringtimmy.bsky.social,Best known for watching the Queen's Gambit fro...,2024-11-06 08:11:15.881,2024-11-14 10:18:29.817-05,Foundations day 3\n\nlive now!
3,did:plc:f4ultoamz3y2m4drcjvhoecu,wilmonstouches.bsky.social,"same name but not rosh‘s ex | #OMAR : ”oh, tha...",2024-11-15 17:01:25.026,2024-11-15 17:20:49.418-05,it‘s cuz the nice ones switched to here and th...
4,did:plc:yzhdirwmonvaezndwnssoosd,kerrieneet.bsky.social,Wild places in the SW and beyond. Where to go?...,2024-11-20 01:32:25.972,2024-11-21 08:01:57.791-05,I would love to be added
...,...,...,...,...,...,...
23330,did:plc:67yljp3x2ggfmtouqhzm23ax,quigonejinn.bsky.social,,2023-07-25 19:58:17.493,2021-08-13 10:03:56-04,"and yeah, I believe, deeply, that’s the kind o..."
23331,did:plc:64vgrlkpc25gq7dzgv4wk4fz,raywillmott.bsky.social,Some Welsh Brand Manager in games @rawfury.bsk...,2023-07-23 23:19:36.113,2023-07-26 18:24:00.221-04,Made a curry tonight. One mouthful later and I...
23332,did:plc:ld6amztvyqa6dkyfpuw7agz3,madrigal.bsky.social,"I have a shiny, smooth coat and a precise, int...",2023-07-04 16:56:13.032,2023-05-04 10:50:34.133-04,Being an earthworm would be awesome. If you ge...
23333,did:plc:ifcposy3e6frphvetobi7mva,foxpopvli.bsky.social,"🔸 Anthros, Cars and Nostalgia\n🔸 100% Natural ...",2023-07-19 23:27:01.452,2023-07-04 08:53:40.821-04,The fox waifu <3\n\n#art #furry


In [278]:
df_features = df.copy()

In [279]:
print(df_features['first_post_created_at'].dtype)


object


In [280]:
# convert to string
df_features['first_post_created_at_str'] = df_features['first_post_created_at'].astype(str)

# parse to datetime, preserve timezone if present
df_features['first_post_created_at_parsed'] = pd.to_datetime(
    df_features['first_post_created_at_str'],
    errors='coerce'
)

# strip timezone without changing local time
df_features['first_post_created_at_parsed'] = df_features['first_post_created_at_parsed'].apply(
    lambda x: x.replace(tzinfo=None) if pd.notna(x) else pd.NaT
)

# format to milliseconds
df_features['first_post_created_at_formatted'] = (
    df_features['first_post_created_at_parsed']
    .dt.strftime('%Y-%m-%d %H:%M:%S.%f')
    .str.slice(0, 23)
)

# check results
parsed = df_features['first_post_created_at_parsed'].notna().sum()
total = len(df_features)
print(f"Parsed and stripped timezone: {parsed}/{total} rows ({parsed / total:.2%})")


  df_features['first_post_created_at_parsed'] = pd.to_datetime(


Parsed and stripped timezone: 22465/23335 rows (96.27%)


In [281]:
# Replace original column with formatted values
df_features['first_post_created_at'] = df_features['first_post_created_at_formatted']

# Drop intermediate columns
df_features = df_features.drop(
    columns=[
        'first_post_created_at_formatted',
        'first_post_created_at_parsed',
        'first_post_created_at_str'
    ],
    errors='ignore'  # just in case one is missing
)


In [282]:
df_features['first_post_created_at'] = df_features['first_post_created_at'].replace('NaT', pd.NA)

# Confirm NaNs now work
print("True NaNs:", df_features['first_post_created_at'].isna().sum())


✅ True NaNs: 870


In [283]:
df_features

Unnamed: 0,did,handle,bio,created_at,first_post_created_at,first_post_body
0,did:plc:ooehugjick2vkzwlr64lephm,haleaziz.bsky.social,I cover DHS and immigration policy for the New...,2024-11-18 15:39:52.581,2024-11-16 20:04:59.812,If Trump-Miller-Homan want to shut down the ri...
1,did:plc:qy3kg6jvtlhenv5ojqurynwh,joshuacroke.bsky.social,Founder of 🏳️‍🌈 queerforcities.com / 🌈loveyour...,2024-11-15 11:45:12.223,2024-11-15 07:36:53.045,A they/them here for good trouble and out to m...
2,did:plc:ibqwicj6ersmvw36yldikozx,inspiringtimmy.bsky.social,Best known for watching the Queen's Gambit fro...,2024-11-06 08:11:15.881,2024-11-14 10:18:29.817,Foundations day 3\n\nlive now!
3,did:plc:f4ultoamz3y2m4drcjvhoecu,wilmonstouches.bsky.social,"same name but not rosh‘s ex | #OMAR : ”oh, tha...",2024-11-15 17:01:25.026,2024-11-15 17:20:49.418,it‘s cuz the nice ones switched to here and th...
4,did:plc:yzhdirwmonvaezndwnssoosd,kerrieneet.bsky.social,Wild places in the SW and beyond. Where to go?...,2024-11-20 01:32:25.972,2024-11-21 08:01:57.791,I would love to be added
...,...,...,...,...,...,...
23330,did:plc:67yljp3x2ggfmtouqhzm23ax,quigonejinn.bsky.social,,2023-07-25 19:58:17.493,,"and yeah, I believe, deeply, that’s the kind o..."
23331,did:plc:64vgrlkpc25gq7dzgv4wk4fz,raywillmott.bsky.social,Some Welsh Brand Manager in games @rawfury.bsk...,2023-07-23 23:19:36.113,2023-07-26 18:24:00.221,Made a curry tonight. One mouthful later and I...
23332,did:plc:ld6amztvyqa6dkyfpuw7agz3,madrigal.bsky.social,"I have a shiny, smooth coat and a precise, int...",2023-07-04 16:56:13.032,2023-05-04 10:50:34.133,Being an earthworm would be awesome. If you ge...
23333,did:plc:ifcposy3e6frphvetobi7mva,foxpopvli.bsky.social,"🔸 Anthros, Cars and Nostalgia\n🔸 100% Natural ...",2023-07-19 23:27:01.452,2023-07-04 08:53:40.821,The fox waifu <3\n\n#art #furry


In [258]:
# Check how many failed
num_failed = df_features['first_post_created_at'].isna().sum()
print(f"Rows that failed to parse: {num_failed}")

# See some of them
bad_rows = df_features[df_features['first_post_created_at'].isna()]
print(bad_rows['first_post_created_at'].head(10))


Rows that failed to parse: 870
113    NaN
223    NaN
294    NaN
315    NaN
341    NaN
348    NaN
365    NaN
380    NaN
429    NaN
444    NaN
Name: first_post_created_at, dtype: object


In [284]:
# bio length
df_features['bio length'] = df_features['bio'].str.len()

In [285]:
# first post body length
df_features['first post body length'] = df_features['first_post_body'].str.len()

In [286]:
# bio_has_emoji, bio_emoji_count, post_has_emoji, post_emoji_count

import pandas as pd
import re

# emoji pattern (covers most common emoji ranges apparently)
emoji_pattern = re.compile(
    "[\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U00002700-\U000027BF"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters
    "]+", flags=re.UNICODE
)

# function to count emojis
def count_emojis(text):
    if pd.isnull(text):
        return 0
    return len(emoji_pattern.findall(text))

# function to check if an emoji exiss
def has_emoji(text):
    if pd.isnull(text):
        return 0
    return 1 if emoji_pattern.search(text) else 0

# Apply to bio
df_features['bio_has_emoji'] = df_features['bio'].apply(has_emoji)
df_features['bio_emoji_count'] = df_features['bio'].apply(count_emojis)

# Apply to first_post_body
df_features['post_has_emoji'] = df_features['first_post_body'].apply(has_emoji)
df_features['post_emoji_count'] = df_features['first_post_body'].apply(count_emojis)


In [287]:
df_features

Unnamed: 0,did,handle,bio,created_at,first_post_created_at,first_post_body,bio length,first post body length,bio_has_emoji,bio_emoji_count,post_has_emoji,post_emoji_count
0,did:plc:ooehugjick2vkzwlr64lephm,haleaziz.bsky.social,I cover DHS and immigration policy for the New...,2024-11-18 15:39:52.581,2024-11-16 20:04:59.812,If Trump-Miller-Homan want to shut down the ri...,102.0,299.0,0,0,0,0
1,did:plc:qy3kg6jvtlhenv5ojqurynwh,joshuacroke.bsky.social,Founder of 🏳️‍🌈 queerforcities.com / 🌈loveyour...,2024-11-15 11:45:12.223,2024-11-15 07:36:53.045,A they/them here for good trouble and out to m...,219.0,300.0,1,9,1,6
2,did:plc:ibqwicj6ersmvw36yldikozx,inspiringtimmy.bsky.social,Best known for watching the Queen's Gambit fro...,2024-11-06 08:11:15.881,2024-11-14 10:18:29.817,Foundations day 3\n\nlive now!,230.0,28.0,0,0,0,0
3,did:plc:f4ultoamz3y2m4drcjvhoecu,wilmonstouches.bsky.social,"same name but not rosh‘s ex | #OMAR : ”oh, tha...",2024-11-15 17:01:25.026,2024-11-15 17:20:49.418,it‘s cuz the nice ones switched to here and th...,126.0,71.0,0,0,0,0
4,did:plc:yzhdirwmonvaezndwnssoosd,kerrieneet.bsky.social,Wild places in the SW and beyond. Where to go?...,2024-11-20 01:32:25.972,2024-11-21 08:01:57.791,I would love to be added,255.0,24.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
23330,did:plc:67yljp3x2ggfmtouqhzm23ax,quigonejinn.bsky.social,,2023-07-25 19:58:17.493,,"and yeah, I believe, deeply, that’s the kind o...",,256.0,0,0,0,0
23331,did:plc:64vgrlkpc25gq7dzgv4wk4fz,raywillmott.bsky.social,Some Welsh Brand Manager in games @rawfury.bsk...,2023-07-23 23:19:36.113,2023-07-26 18:24:00.221,Made a curry tonight. One mouthful later and I...,143.0,124.0,0,0,1,1
23332,did:plc:ld6amztvyqa6dkyfpuw7agz3,madrigal.bsky.social,"I have a shiny, smooth coat and a precise, int...",2023-07-04 16:56:13.032,2023-05-04 10:50:34.133,Being an earthworm would be awesome. If you ge...,186.0,129.0,0,0,0,0
23333,did:plc:ifcposy3e6frphvetobi7mva,foxpopvli.bsky.social,"🔸 Anthros, Cars and Nostalgia\n🔸 100% Natural ...",2023-07-19 23:27:01.452,2023-07-04 08:53:40.821,The fox waifu <3\n\n#art #furry,101.0,29.0,1,3,0,0


In [290]:
# time_to_first_post_days

# convert to datetime  and strip timezones
df_features['created_at'] = pd.to_datetime(df_features['created_at'], errors='coerce').dt.tz_localize(None)
df_features['first_post_created_at'] = pd.to_datetime(df_features['first_post_created_at'], errors='coerce').dt.tz_localize(None)


# subtract
df_features['time_to_first_post_days'] = (
    df_features['first_post_created_at'] - df_features['created_at']).dt.total_seconds() / (60 * 60 * 24)


In [291]:
df_features

Unnamed: 0,did,handle,bio,created_at,first_post_created_at,first_post_body,bio length,first post body length,bio_has_emoji,bio_emoji_count,post_has_emoji,post_emoji_count,time_to_first_post_days
0,did:plc:ooehugjick2vkzwlr64lephm,haleaziz.bsky.social,I cover DHS and immigration policy for the New...,2024-11-18 15:39:52.581,2024-11-16 20:04:59.812,If Trump-Miller-Homan want to shut down the ri...,102.0,299.0,0,0,0,0,-1.815889
1,did:plc:qy3kg6jvtlhenv5ojqurynwh,joshuacroke.bsky.social,Founder of 🏳️‍🌈 queerforcities.com / 🌈loveyour...,2024-11-15 11:45:12.223,2024-11-15 07:36:53.045,A they/them here for good trouble and out to m...,219.0,300.0,1,9,1,6,-0.172444
2,did:plc:ibqwicj6ersmvw36yldikozx,inspiringtimmy.bsky.social,Best known for watching the Queen's Gambit fro...,2024-11-06 08:11:15.881,2024-11-14 10:18:29.817,Foundations day 3\n\nlive now!,230.0,28.0,0,0,0,0,8.088356
3,did:plc:f4ultoamz3y2m4drcjvhoecu,wilmonstouches.bsky.social,"same name but not rosh‘s ex | #OMAR : ”oh, tha...",2024-11-15 17:01:25.026,2024-11-15 17:20:49.418,it‘s cuz the nice ones switched to here and th...,126.0,71.0,0,0,0,0,0.013477
4,did:plc:yzhdirwmonvaezndwnssoosd,kerrieneet.bsky.social,Wild places in the SW and beyond. Where to go?...,2024-11-20 01:32:25.972,2024-11-21 08:01:57.791,I would love to be added,255.0,24.0,0,0,0,0,1.270507
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23330,did:plc:67yljp3x2ggfmtouqhzm23ax,quigonejinn.bsky.social,,2023-07-25 19:58:17.493,NaT,"and yeah, I believe, deeply, that’s the kind o...",,256.0,0,0,0,0,
23331,did:plc:64vgrlkpc25gq7dzgv4wk4fz,raywillmott.bsky.social,Some Welsh Brand Manager in games @rawfury.bsk...,2023-07-23 23:19:36.113,2023-07-26 18:24:00.221,Made a curry tonight. One mouthful later and I...,143.0,124.0,0,0,1,1,2.794723
23332,did:plc:ld6amztvyqa6dkyfpuw7agz3,madrigal.bsky.social,"I have a shiny, smooth coat and a precise, int...",2023-07-04 16:56:13.032,2023-05-04 10:50:34.133,Being an earthworm would be awesome. If you ge...,186.0,129.0,0,0,0,0,-61.253922
23333,did:plc:ifcposy3e6frphvetobi7mva,foxpopvli.bsky.social,"🔸 Anthros, Cars and Nostalgia\n🔸 100% Natural ...",2023-07-19 23:27:01.452,2023-07-04 08:53:40.821,The fox waifu <3\n\n#art #furry,101.0,29.0,1,3,0,0,-15.606489


In [292]:
# bio_has_hashtag, bio_hashtag_count, post_has_hashtag, post_hashtag_count


import re

# hashtag pattern: matches # followed by letters, numbers, or underscores
hashtag_pattern = re.compile(r"#\w+")

# check if any hashtags
def has_hashtag(text):
    if pd.isnull(text):
        return 0
    return 1 if hashtag_pattern.search(text) else 0

# count how many hashtags
def count_hashtags(text):
    if pd.isnull(text):
        return 0
    return len(hashtag_pattern.findall(text))

# apply to bio
df_features['bio_has_hashtag'] = df_features['bio'].apply(has_hashtag)
df_features['bio_hashtag_count'] = df_features['bio'].apply(count_hashtags)

# apply to first_post_body
df_features['post_has_hashtag'] = df_features['first_post_body'].apply(has_hashtag)
df_features['post_hashtag_count'] = df_features['first_post_body'].apply(count_hashtags)


In [293]:
df_features

Unnamed: 0,did,handle,bio,created_at,first_post_created_at,first_post_body,bio length,first post body length,bio_has_emoji,bio_emoji_count,post_has_emoji,post_emoji_count,time_to_first_post_days,bio_has_hashtag,bio_hashtag_count,post_has_hashtag,post_hashtag_count
0,did:plc:ooehugjick2vkzwlr64lephm,haleaziz.bsky.social,I cover DHS and immigration policy for the New...,2024-11-18 15:39:52.581,2024-11-16 20:04:59.812,If Trump-Miller-Homan want to shut down the ri...,102.0,299.0,0,0,0,0,-1.815889,0,0,0,0
1,did:plc:qy3kg6jvtlhenv5ojqurynwh,joshuacroke.bsky.social,Founder of 🏳️‍🌈 queerforcities.com / 🌈loveyour...,2024-11-15 11:45:12.223,2024-11-15 07:36:53.045,A they/them here for good trouble and out to m...,219.0,300.0,1,9,1,6,-0.172444,1,1,1,1
2,did:plc:ibqwicj6ersmvw36yldikozx,inspiringtimmy.bsky.social,Best known for watching the Queen's Gambit fro...,2024-11-06 08:11:15.881,2024-11-14 10:18:29.817,Foundations day 3\n\nlive now!,230.0,28.0,0,0,0,0,8.088356,0,0,0,0
3,did:plc:f4ultoamz3y2m4drcjvhoecu,wilmonstouches.bsky.social,"same name but not rosh‘s ex | #OMAR : ”oh, tha...",2024-11-15 17:01:25.026,2024-11-15 17:20:49.418,it‘s cuz the nice ones switched to here and th...,126.0,71.0,0,0,0,0,0.013477,1,1,0,0
4,did:plc:yzhdirwmonvaezndwnssoosd,kerrieneet.bsky.social,Wild places in the SW and beyond. Where to go?...,2024-11-20 01:32:25.972,2024-11-21 08:01:57.791,I would love to be added,255.0,24.0,0,0,0,0,1.270507,1,7,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23330,did:plc:67yljp3x2ggfmtouqhzm23ax,quigonejinn.bsky.social,,2023-07-25 19:58:17.493,NaT,"and yeah, I believe, deeply, that’s the kind o...",,256.0,0,0,0,0,,0,0,0,0
23331,did:plc:64vgrlkpc25gq7dzgv4wk4fz,raywillmott.bsky.social,Some Welsh Brand Manager in games @rawfury.bsk...,2023-07-23 23:19:36.113,2023-07-26 18:24:00.221,Made a curry tonight. One mouthful later and I...,143.0,124.0,0,0,1,1,2.794723,0,0,0,0
23332,did:plc:ld6amztvyqa6dkyfpuw7agz3,madrigal.bsky.social,"I have a shiny, smooth coat and a precise, int...",2023-07-04 16:56:13.032,2023-05-04 10:50:34.133,Being an earthworm would be awesome. If you ge...,186.0,129.0,0,0,0,0,-61.253922,0,0,0,0
23333,did:plc:ifcposy3e6frphvetobi7mva,foxpopvli.bsky.social,"🔸 Anthros, Cars and Nostalgia\n🔸 100% Natural ...",2023-07-19 23:27:01.452,2023-07-04 08:53:40.821,The fox waifu <3\n\n#art #furry,101.0,29.0,1,3,0,0,-15.606489,0,0,1,2


In [294]:
df_features

Unnamed: 0,did,handle,bio,created_at,first_post_created_at,first_post_body,bio length,first post body length,bio_has_emoji,bio_emoji_count,post_has_emoji,post_emoji_count,time_to_first_post_days,bio_has_hashtag,bio_hashtag_count,post_has_hashtag,post_hashtag_count
0,did:plc:ooehugjick2vkzwlr64lephm,haleaziz.bsky.social,I cover DHS and immigration policy for the New...,2024-11-18 15:39:52.581,2024-11-16 20:04:59.812,If Trump-Miller-Homan want to shut down the ri...,102.0,299.0,0,0,0,0,-1.815889,0,0,0,0
1,did:plc:qy3kg6jvtlhenv5ojqurynwh,joshuacroke.bsky.social,Founder of 🏳️‍🌈 queerforcities.com / 🌈loveyour...,2024-11-15 11:45:12.223,2024-11-15 07:36:53.045,A they/them here for good trouble and out to m...,219.0,300.0,1,9,1,6,-0.172444,1,1,1,1
2,did:plc:ibqwicj6ersmvw36yldikozx,inspiringtimmy.bsky.social,Best known for watching the Queen's Gambit fro...,2024-11-06 08:11:15.881,2024-11-14 10:18:29.817,Foundations day 3\n\nlive now!,230.0,28.0,0,0,0,0,8.088356,0,0,0,0
3,did:plc:f4ultoamz3y2m4drcjvhoecu,wilmonstouches.bsky.social,"same name but not rosh‘s ex | #OMAR : ”oh, tha...",2024-11-15 17:01:25.026,2024-11-15 17:20:49.418,it‘s cuz the nice ones switched to here and th...,126.0,71.0,0,0,0,0,0.013477,1,1,0,0
4,did:plc:yzhdirwmonvaezndwnssoosd,kerrieneet.bsky.social,Wild places in the SW and beyond. Where to go?...,2024-11-20 01:32:25.972,2024-11-21 08:01:57.791,I would love to be added,255.0,24.0,0,0,0,0,1.270507,1,7,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23330,did:plc:67yljp3x2ggfmtouqhzm23ax,quigonejinn.bsky.social,,2023-07-25 19:58:17.493,NaT,"and yeah, I believe, deeply, that’s the kind o...",,256.0,0,0,0,0,,0,0,0,0
23331,did:plc:64vgrlkpc25gq7dzgv4wk4fz,raywillmott.bsky.social,Some Welsh Brand Manager in games @rawfury.bsk...,2023-07-23 23:19:36.113,2023-07-26 18:24:00.221,Made a curry tonight. One mouthful later and I...,143.0,124.0,0,0,1,1,2.794723,0,0,0,0
23332,did:plc:ld6amztvyqa6dkyfpuw7agz3,madrigal.bsky.social,"I have a shiny, smooth coat and a precise, int...",2023-07-04 16:56:13.032,2023-05-04 10:50:34.133,Being an earthworm would be awesome. If you ge...,186.0,129.0,0,0,0,0,-61.253922,0,0,0,0
23333,did:plc:ifcposy3e6frphvetobi7mva,foxpopvli.bsky.social,"🔸 Anthros, Cars and Nostalgia\n🔸 100% Natural ...",2023-07-19 23:27:01.452,2023-07-04 08:53:40.821,The fox waifu <3\n\n#art #furry,101.0,29.0,1,3,0,0,-15.606489,0,0,1,2


In [295]:
# Replace NaN in 'bio' with empty string
df_features['bio'] = df_features['bio'].fillna('')

# Replace NaN in 'bio length' with 0
df_features['bio length'] = df_features['bio length'].fillna(0)

# Replace NaN in 'first_post_body' with empty string
df_features['first_post_body'] = df_features['first_post_body'].fillna('')

# Replace NaN in 'first post body length' with 0
df_features['first post body length'] = df_features['first post body length'].fillna(0)



In [296]:
df_features.to_csv("df_features.csv", index=False)

In [297]:
# Count NaNs in each column
nan_counts = df_features.isna().sum()

# Print columns with at least one NaN
print(nan_counts[nan_counts > 0])



first_post_created_at      870
time_to_first_post_days    870
dtype: int64


In [266]:
missing_bio_df = df_features[df_features['bio'].isna()]
print(missing_bio_df)


                                    did                      handle  bio  \
111    did:plc:7a47nphhiy37rrori2vrrvo5                      rds.ca  NaN   
153    did:plc:iizajxbr7psnyb4yxtyzy6vp  simplyaworm333.bsky.social  NaN   
176    did:plc:tktqqgnnntqhrsx56pta35tb  wacvconference.bsky.social  NaN   
194    did:plc:o5quctzp4mhdeluu3bsl5kxs            dankennett.co.uk  NaN   
206    did:plc:gm5h24qfhu5545pkbcxteyur         yungyej.bsky.social  NaN   
...                                 ...                         ...  ...   
23056  did:plc:m5kwuwzdlv67mgzu2bculzyh    lightblanket.bsky.social  NaN   
23072  did:plc:cbl4mzfbhc2miabft5qmww2q       colodraws.bsky.social  NaN   
23144  did:plc:qncshwndve6fujdqveysakp6       quoronter.bsky.social  NaN   
23248  did:plc:jbhnrxwr7pb25w4427nqmtej      mjerickson.bsky.social  NaN   
23330  did:plc:67yljp3x2ggfmtouqhzm23ax     quigonejinn.bsky.social  NaN   

                   created_at    first_post_created_at  \
111   2024-11-14 15:37:16.539