In [1]:
import json
import dotenv
from twon_lss.utility import LLM, Message, Chat
import random
from tqdm import tqdm
import pandas as pd
import re

In [2]:
ENV = dotenv.dotenv_values("../" * 3 + ".env")

In [3]:
df = pd.read_csv("../data/FolloweeIDs2_tweets_df_AugustPull.csv")


Columns (2,3,6,9) have mixed types. Specify dtype option on import or set low_memory=False.



In [4]:
# Comprehensive regex including figures, organizations, and variants
# Grouped logically for maintenance, compiled once for performance.
covid_expanded_pattern = re.compile(
    r'\b('
    # 1. Core Virus Identifiers
    r'covid(-?19)?|corona(virus)?|sars-?cov-?2|n?cov(-?19|2019)?|'
    
    # 2. Key Variants
    r'omicron|delta|alpha|beta|ba\.\d+|xbb|'
    
    # 3. Medical & Vaccine Manufacturers
    r'pfizer|moderna|astrazeneca|biontech|j&j|johnson & johnson|'
    r'novavax|sinovac|sputnik v|'
    
    # 4. Slang & Colloquial
    r'the\s?rona|miss\s?rona|covidiot|vax(xed)?|antivax(xer)?|'
    
    # 5. High-Signal Context Specifics
    r'quarantine|lockdown|pandemic|epidemic|'
    r'social distanc(ing|e)|herd immunity|'
    r'wuhan (lab|market)|'
    r'super-?spreader|long covid'
    r')\b',
    re.IGNORECASE
)

def is_covid_relevant(tweet_text):
    """
    Returns True if the tweet contains any major COVID-19 keywords, 
    key figures, or organizations associated with the pandemic.
    """
    if not tweet_text:
        return False
    return bool(covid_expanded_pattern.search(tweet_text))

In [5]:
# Load the profiler prompts
with open("../data/profiler.bio.txt", "r") as f:
    BIO_PROFILER_PROMPT = f.read()

with open("../data/profiler.cognition.txt", "r") as f:
    COGNITION_PROFILER_PROMPT = f.read()

# Load the instructions
with open("../data/agents.instructions.json", "r") as f:
    INSTRUCTIONS = json.load(f)

In [6]:
def determine_posts_per_day(df):
    num_days = (pd.to_datetime(df["created_at"]).max() - pd.to_datetime(df["created_at"]).min()).days
    total_posts = len(df)
    posts_per_day = total_posts / num_days if num_days > 0 else 0
    return posts_per_day 


def df_to_history_string(df) -> str:
    """Parse the history from the json format to the Message format."""
    history_string = ""
    for i, message in df.iterrows():
        history_string += f">Tweet written by you: {message['full_text']}\n"
    return history_string


def df_to_llm_history(df) -> list[Message]:
    """Parse the history from the json format to the Message format."""
    parsed_history = []

    for i, message in df.iterrows():
            parsed_history.append({"role": "user", "content": f"{INSTRUCTIONS['actions']['post_prompt']}"})
            parsed_history.append({"role": "assistant", "content": f"{message['full_text']}"})

    return parsed_history

In [7]:
# Setup the Profiler LLM
AGENT_LLM = LLM(api_key=ENV["HF_TOKEN"], model="Qwen/Qwen3-235B-A22B-Instruct-2507:cerebras")

In [8]:
# Load existing agent personas
with open("../data/agents.personas.json", "r") as f:
    PERSONA_PROFILES = json.load(f)

with open("../data/agents.personas_covid.json", "r") as f:
    PERSONA_PROFILES_COVID = json.load(f)

In [9]:
for screen_name in tqdm(df["screen_name"].unique().tolist()):

    if len([persona for persona in PERSONA_PROFILES_COVID if persona["covid_flag"] == True]) >= 2000:
        break

    if any(persona.get("screen_name") == screen_name for persona in PERSONA_PROFILES_COVID):
        continue

    if any(persona.get("screen_name") == screen_name for persona in PERSONA_PROFILES):
        # get history of existing persona
        for persona in PERSONA_PROFILES:
            if persona.get("screen_name") == screen_name:
                history = persona.get("history", [])
                profile = persona
                break
        
        covid_flag = False
        if is_covid_relevant("".join(message["content"].lower() for message in history)):
            covid_flag = True

        if not covid_flag:
            # update user prompt in history 
            for message in history:
                if message["role"] == "user":
                    message["content"] = INSTRUCTIONS["actions"]["post_prompt"]
                    
            profile["covid_flag"] = False
            profile["covid_tweets"] = []
            PERSONA_PROFILES_COVID.append(profile)
            continue

    persona_dict = {"screen_name": screen_name}

    # Filter to original tweets only
    filtered_df = df[df["screen_name"] == screen_name].drop_duplicates(subset=["full_text"])

    filtered_df["reply_to_user"] = filtered_df["reply_to_user"].astype(str)
    filtered_df = filtered_df[filtered_df["reply_to_user"] == "nan"]

    filtered_df["retweeted_user_ID"] = filtered_df["retweeted_user_ID"].astype(str)
    filtered_df = filtered_df[filtered_df["retweeted_user_ID"] == "nan"]

    filtered_df = filtered_df.sort_values(by="created_at").reset_index(drop=True)

    # Calculate posts per day
    posts_per_day = determine_posts_per_day(filtered_df)

    # Drop any posts that contains an URL
    filtered_df = filtered_df[~filtered_df["full_text"].str.contains("http")].reset_index(drop=True)
    if len(filtered_df) < 5:
        continue

    # Identify COVID-related tweets and drop them from the dataframe
    covid_tweets = []
    for i, row in filtered_df.iterrows():
        tweet_text = row["full_text"].lower()
        if is_covid_relevant(tweet_text):
            covid_tweets.append(row["full_text"])

    # !!! Currently only process users with COVID tweets - remove this condition to profile all users
    if not covid_tweets:
        continue

    filtered_df = filtered_df[~filtered_df["full_text"].apply(lambda x: is_covid_relevant(x.lower()))].reset_index(drop=True)

    bio = AGENT_LLM.generate(
        Chat([
            Message(role="user", content=BIO_PROFILER_PROMPT.format(history=df_to_history_string(filtered_df))),
        ])
    )

    cognition = AGENT_LLM.generate(
        Chat([
            Message(role="user", content=COGNITION_PROFILER_PROMPT.format(history=df_to_history_string(filtered_df), bio=bio))
        ])
    )

    persona_dict["bio"] = bio
    persona_dict["cognition"] = cognition
    persona_dict["history"] = df_to_llm_history(filtered_df)
    persona_dict["posts_per_day"] = posts_per_day
    persona_dict["covid_flag"] = True if covid_tweets else False
    persona_dict["covid_tweets"] = covid_tweets

    PERSONA_PROFILES_COVID.append(persona_dict)

    # Save as JSON
    with open("../data/agents.personas_covid.json", "w") as f:
        json.dump(PERSONA_PROFILES_COVID, f, indent=4)

 43%|████▎     | 14803/34696 [57:02<1:54:39,  2.89it/s] ERROR:root:Failed to query LLM: 'choices'
 48%|████▊     | 16509/34696 [1:14:27<5:06:13,  1.01s/it] ERROR:root:Failed to query LLM: 'choices'
 50%|████▉     | 17323/34696 [1:23:14<9:56:08,  2.06s/it]  ERROR:root:Failed to query LLM: 'choices'
 54%|█████▎    | 18632/34696 [1:39:22<1:32:26,  2.90it/s]  ERROR:root:Failed to query LLM: 'choices'
 58%|█████▊    | 20241/34696 [1:57:15<1:23:44,  2.88it/s] ERROR:root:Failed to query LLM: 'choices'
 60%|█████▉    | 20809/34696 [2:05:05<2:46:07,  1.39it/s] ERROR:root:Failed to query LLM: 'choices'
 60%|█████▉    | 20816/34696 [2:06:21<15:05:38,  3.91s/it]ERROR:root:Failed to query LLM: 'choices'
 62%|██████▏   | 21354/34696 [2:12:55<4:57:50,  1.34s/it] ERROR:root:Failed to query LLM: 'choices'
 62%|██████▏   | 21431/34696 [2:14:35<1:17:13,  2.86it/s] ERROR:root:Failed to query LLM: 'choices'
 62%|██████▏   | 21633/34696 [2:17:22<1:15:11,  2.90it/s] ERROR:root:Failed to query LLM: 'choices'
