In [14]:
import pandas as pd
import numpy as np
from datetime import date
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import seaborn as sns

import re
import duckdb

In [11]:
user_labels_df = pd.read_csv("kmeans_cluster_labels.csv")
users_df = pd.read_csv("sample_users.csv")

In [13]:
user_labels_df.shape[0], users_df.shape[0]

(23335, 98421)

In [5]:
with duckdb.connect("file.db") as conn:
    first_post_sql = """
    with ranked_posts as (
        select
            author_did,
            body,
            created_at,
            row_number() over (partition by author_did order by created_at) as r
        from
            all_posts
    ) select * from ranked_posts where r = 1;

    """
    first_post_df = conn.execute(first_post_sql).df()

In [6]:
first_post_df

Unnamed: 0,author_did,body,created_at,r
0,did:plc:kmaleldosvavjfjjo7wl6e6a,Mr Krabs from SpongeBob has been arrested in B...,2023-09-07 11:20:12.715000-04:00,1
1,did:plc:le7k5zwsli5545o5habpizir,me when they say to put my figures at the end,2023-04-19 07:17:27-04:00,1
2,did:plc:ljbhqcwg5hougnhecdmmkmeb,If you're also new here and wondering how long...,2023-09-20 05:53:29.665000-04:00,1
3,did:plc:lndyvhazyfzvxf374avdchx6,"Anyone, any entity, anywhere in the world can ...",2024-11-25 17:28:34.480000-05:00,1
4,did:plc:lv26i2oswc5r6a22q2mh4dpi,Absolutely thrilled to share my postdoc work i...,2023-10-14 11:58:40.573000-04:00,1
...,...,...,...,...
23330,did:plc:hzd5xbub4sreswusx4dfr3ih,🚲🌸🍁,2024-11-06 08:34:44.579000-05:00,1
23331,did:plc:ifgltyxgrdfqv4hwwrg4bwt4,Three little kids are stomping around the bloc...,2018-09-01 09:31:15-04:00,1
23332,did:plc:it3dgoo6w6xo53ndjiku56fd,Let's try this BlueSky !\n\nA dedicated accoun...,2023-10-16 09:16:42.475000-04:00,1
23333,did:plc:jdfkui5fng3yg6f5qsamalkz,"EcoEvo folks, this is a fun game! Figure out e...",2023-08-26 07:18:11.899000-04:00,1


In [24]:
users_df = users_df.rename(columns={"did":"author_did"})
users_df

Unnamed: 0,author_did,handle,bio,created_at
0,did:plc:ooehugjick2vkzwlr64lephm,haleaziz.bsky.social,I cover DHS and immigration policy for the New...,2024-11-18 15:39:52.581
1,did:plc:qy3kg6jvtlhenv5ojqurynwh,joshuacroke.bsky.social,Founder of 🏳️‍🌈 queerforcities.com / 🌈loveyour...,2024-11-15 11:45:12.223
2,did:plc:ibqwicj6ersmvw36yldikozx,inspiringtimmy.bsky.social,Best known for watching the Queen's Gambit fro...,2024-11-06 08:11:15.881
3,did:plc:czowt24mxhkxcvxqffdg2t6u,quetzallicortez.bsky.social,,2024-11-24 18:10:16.146
4,did:plc:f4ultoamz3y2m4drcjvhoecu,wilmonstouches.bsky.social,"same name but not rosh‘s ex | #OMAR : ”oh, tha...",2024-11-15 17:01:25.026
...,...,...,...,...
98416,did:plc:iwguck4nqz7ky3snlfop2gdj,annshiro.bsky.social,== illustrator｜中/日/Eng 🆗 | 📫annshiro@163.com｜\...,2024-03-03 05:16:06.510
98417,did:plc:a4ndyz2v2sw3l4zyogvfmrxs,ruchikapose.bsky.social,女の子ポーズ&衣装資料ならおまかせ！ \n絵描きによる絵描き目線の作画資料\n🎀中の人るちか...,2024-03-22 12:05:20.239
98418,did:plc:b25v5fuvq4q374w5jxvfgdwl,anggia.bsky.to,,2024-06-09 22:34:26.421
98419,did:plc:s3ha2cfuskferixapyqztime,itsourqueen.bsky.social,,2024-06-08 05:35:12.293


In [32]:
HAS_MENTIONS_REGEX = r".*(@\w+)"

with open("url_regex.txt", "r") as f:
    URL_REGEX = f.read()

# Has mentions in first post Boolean
def has_mentions_in_bio(bio):
    return True if re.match(HAS_MENTIONS_REGEX, bio) else False

# Has mentions in first post Boolean
def has_mentions_in_first_post(first_post):
    return True if re.match(HAS_MENTIONS_REGEX, first_post) else False

# Has URL in bio
def has_url_in_bio(bio):
    return True if re.match(URL_REGEX, bio) else False
# Has URL in first post
def has_url_in_first_post(first_post):
    return True if re.match(URL_REGEX, first_post) else False



In [34]:
transformations = [has_mentions_in_bio, has_mentions_in_first_post, has_url_in_bio, has_url_in_first_post]

df = pd.merge(first_post_df, users_df, how="left", on="author_did")

for transformation in transformations:
    name = transformation.__name__
    if name.endswith("bio"):
        df[name] = df["bio"].apply(lambda x:transformation(x) if isinstance(x, str) else "")
    if name.endswith("post"):
        df[name] = df["body"].apply(lambda x:transformation(x) if isinstance(x, str) else "")



In [37]:
final_df = df[["author_did"] + [i.__name__ for i in transformations]]
final_df

Unnamed: 0,author_did,has_mentions_in_bio,has_mentions_in_first_post,has_url_in_bio,has_url_in_first_post
0,did:plc:kmaleldosvavjfjjo7wl6e6a,False,False,False,False
1,did:plc:le7k5zwsli5545o5habpizir,False,False,False,False
2,did:plc:ljbhqcwg5hougnhecdmmkmeb,False,False,False,False
3,did:plc:lndyvhazyfzvxf374avdchx6,False,False,False,False
4,did:plc:lv26i2oswc5r6a22q2mh4dpi,False,False,False,False
...,...,...,...,...,...
23330,did:plc:hzd5xbub4sreswusx4dfr3ih,False,False,False,False
23331,did:plc:ifgltyxgrdfqv4hwwrg4bwt4,False,False,False,False
23332,did:plc:it3dgoo6w6xo53ndjiku56fd,False,False,False,False
23333,did:plc:jdfkui5fng3yg6f5qsamalkz,False,False,False,False


In [39]:
final_df[final_df["has_url_in_bio"] == True]

Unnamed: 0,author_did,has_mentions_in_bio,has_mentions_in_first_post,has_url_in_bio,has_url_in_first_post
58,did:plc:5niusquxbyjcu7xladonnxym,False,False,True,False
220,did:plc:lghkt7kri3ouxsggv3jscxkp,False,False,True,False
227,did:plc:opbenuyiz7vnbopkomegs6ac,False,False,True,False
366,did:plc:vbmszr2gffpr254j33mezwig,False,False,True,False
404,did:plc:qjvqocu6gx4vmcqsjiu7px3z,False,False,True,False
...,...,...,...,...,...
22378,did:plc:2bnru3l4lbnxwbpdpywsmtnq,False,False,True,False
22524,did:plc:gdal6xjdupuuugf7u4c6zxyr,False,False,True,False
22647,did:plc:jg2b5vutlufg3mm7uwyzcbee,False,False,True,False
22719,did:plc:d6dkjstsyw3zqthf2qtqkjp2,True,False,True,False


In [None]:
final_df.to_csv("prithaj_features.csv", index=False)