In [1]:
import json
import dotenv
from twon_lss.utility import LLM, Message, Chat
import random
from tqdm import tqdm
import pandas as pd

In [2]:
ENV = dotenv.dotenv_values("../" * 3 + ".env")

In [3]:
df = pd.read_csv("../data/FolloweeIDs2_tweets_df_AugustPull.csv")


Columns (2,3,6,9) have mixed types. Specify dtype option on import or set low_memory=False.



In [4]:
# Load the profiler prompts
with open("../data/profiler.bio.txt", "r") as f:
    BIO_PROFILER_PROMPT = f.read()

with open("../data/profiler.cognition.txt", "r") as f:
    COGNITION_PROFILER_PROMPT = f.read()

# Load the instructions
with open("../data/agents.instructions.json", "r") as f:
    INSTRUCTIONS = json.load(f)

In [5]:
def determine_posts_per_day(df):
    num_days = (pd.to_datetime(df["created_at"]).max() - pd.to_datetime(df["created_at"]).min()).days
    total_posts = len(df)
    posts_per_day = total_posts / num_days if num_days > 0 else 0
    return posts_per_day 


def df_to_history_string(df) -> str:
    """Parse the history from the json format to the Message format."""
    history_string = ""
    for i, message in df.iterrows():
        history_string += f">Tweet written by you: {message['full_text']}\n"
    return history_string


def df_to_llm_history(df) -> list[Message]:
    """Parse the history from the json format to the Message format."""
    parsed_history = []

    for i, message in df.iterrows():
            parsed_history.append({"role": "user", "content": f"{INSTRUCTIONS['actions']['post_prompt']}"})
            parsed_history.append({"role": "assistant", "content": f"{message['full_text']}"})

    return parsed_history

In [6]:
# Setup the Profiler LLM
AGENT_LLM = LLM(api_key=ENV["HF_TOKEN"], model="Qwen/Qwen3-235B-A22B-Instruct-2507:cerebras")

In [7]:
# Load existing agent personas
with open("../data/agents.personas_dummy.json", "r") as f:
    PERSONA_PROFILES = json.load(f)

#PERSONA_PROFILES = []

In [8]:
for screen_name in tqdm(df["screen_name"].unique().tolist()):
    if any(persona.get("screen_name") == screen_name for persona in PERSONA_PROFILES):
        continue

    persona_dict = {"screen_name": screen_name}

    # Filter to original tweets only
    filtered_df = df[df["screen_name"] == screen_name].drop_duplicates(subset=["full_text"])

    filtered_df["reply_to_user"] = filtered_df["reply_to_user"].astype(str)
    filtered_df = filtered_df[filtered_df["reply_to_user"] == "nan"]

    filtered_df["retweeted_user_ID"] = filtered_df["retweeted_user_ID"].astype(str)
    filtered_df = filtered_df[filtered_df["retweeted_user_ID"] == "nan"]

    filtered_df = filtered_df.sort_values(by="created_at").reset_index(drop=True)

    # Calculate posts per day
    posts_per_day = determine_posts_per_day(filtered_df)

    # Drop any posts that contains an URL
    filtered_df = filtered_df[~filtered_df["full_text"].str.contains("http")].reset_index(drop=True)
    if len(filtered_df) < 5:
        continue

    bio = AGENT_LLM.generate(
        Chat([
            Message(role="user", content=BIO_PROFILER_PROMPT.format(history=df_to_history_string(filtered_df))),
        ])
    )

    cognition = AGENT_LLM.generate(
        Chat([
            Message(role="user", content=COGNITION_PROFILER_PROMPT.format(history=df_to_history_string(filtered_df), bio=bio))
        ])
    )

    persona_dict["bio"] = bio
    persona_dict["cognition"] = cognition
    persona_dict["history"] = df_to_llm_history(filtered_df)
    persona_dict["posts_per_day"] = posts_per_day

    PERSONA_PROFILES.append(persona_dict)

    # Save as JSON
    with open("../data/agents.personas_dummy.json", "w") as f:
        json.dump(PERSONA_PROFILES, f, indent=4)

 20%|█▉        | 6826/34696 [20:02<32:16:09,  4.17s/it]ERROR:root:Failed to query LLM: 'choices'
 21%|██        | 7365/34696 [35:56<26:17:05,  3.46s/it] ERROR:root:Failed to query LLM: 'choices'
 22%|██▏       | 7690/34696 [47:58<19:23:16,  2.58s/it] ERROR:root:Failed to query LLM: 'choices'
 24%|██▍       | 8398/34696 [1:12:49<19:52:50,  2.72s/it]ERROR:root:Failed to query LLM: 'choices'
 24%|██▍       | 8417/34696 [1:14:44<3:53:22,  1.88it/s]  


KeyboardInterrupt: 