# Installation

In [None]:
# pip install -r requirements.txt

# Tool usage test

In [1]:
import logging

# Root logger at INFO so messages emitted by the tools below are visible.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Attach a console handler only when the root logger has none yet, so that
# re-running this cell does not stack handlers and duplicate log lines.
if len(logger.handlers) == 0:
    formatter = logging.Formatter('%(levelname)s - %(message)s')
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

Persona Extractor

In [1]:
from src.tools.persona_extractor_gpt import get_dialogpt_persona_extractor_tool

# Build the DialoGPT-backed persona extractor tool for user 'user_91' and the
# 'Lifestyle Optimization' task.
tool1 = get_dialogpt_persona_extractor_tool('user_91', 'Lifestyle Optimization')

# Run the tool on a single sentence. Per the log output below, the extracted
# fact is also written to the persona DB, so this cell has a side effect.
result1 = tool1.invoke({"sentence": "I also like to eat pasta for dinner."})
print(result1)

Device set to use cuda:0
INFO:persona_util:SQLitePersonaDB initialized
INFO:src.tools.persona_extractor_gpt:[START] DialogGPT extract() called with: I also like to eat pasta for dinner.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
INFO:src.tools.persona_extractor_gpt:[EXTRACTED] relation='favorite_food', object='also pasta', topic='Skill', task='Lifestyle Optimization'
INFO:persona_util:Persona fact inserted for user user_91, task Lifestyle Optimization
INFO:src.tools.persona_extractor_gpt:[SAVED TO DB] for username='user_91'


[EXTRACTED] relation='favorite_food', object='also pasta', topic='Skill', task='Lifestyle Optimization'


In [2]:
from src.tools.persona_extractor import get_persona_extractor_tool

# Build the persona extractor tool for the same user and task.
# Note: this rebinds `tool1` / `result1`, which the DialoGPT cell above also uses.
tool1 = get_persona_extractor_tool('user_91', 'Lifestyle Optimization')

# Same input sentence as the DialoGPT cell, so the two extractors can be
# compared. Per the log output below, the fact is also saved to the DB.
result1 = tool1.invoke({"sentence": "I also like to eat pasta for dinner."})
print(result1)

Device set to use cuda:0
INFO:persona_util:SQLitePersonaDB initialized
INFO:src.tools.persona_extractor:[EXTRACTED] relation='favorite_food', object='pasta', topic='Skill', task='Lifestyle Optimization'
INFO:persona_util:Persona fact inserted for user user_91, task Lifestyle Optimization
INFO:src.tools.persona_extractor:[SAVED TO DB] for username='user_91'


[EXTRACTED] relation='favorite_food', object='pasta', topic='Skill', task='Lifestyle Optimization'


In [3]:
# Reuses whichever `tool1` is currently bound in the kernel (the cell directly
# above binds the non-DialoGPT extractor) on a profession-related sentence.
result2 = tool1.invoke({"sentence": "I like to work as a data scientist."})
print(result2)

INFO:src.tools.persona_extractor:[EXTRACTED] relation='has_profession', object='scientist', topic='Career', task='Lifestyle Optimization'
INFO:persona_util:Persona fact inserted for user user_91, task Lifestyle Optimization
INFO:src.tools.persona_extractor:[SAVED TO DB] for username='user_91'


[EXTRACTED] relation='has_profession', object='scientist', topic='Career', task='Lifestyle Optimization'


Community-based Recommender:

In [2]:
from src.tools.community_recommender import create_community_recommender_tool

# Build the community recommender tool for user 'user_41' on the
# 'Content Consumption' task.
tool2 = create_community_recommender_tool(username="user_41", task="Content Consumption")

# Ask for recommendations; the tool returns a text report (see output below).
# Note: this rebinds `result2`, which the persona-extractor cell also uses.
result2 = tool2.invoke({"sentence": "Can you provide community recommendations?"})
print(result2)

INFO - SQLitePersonaDB initialized
INFO - [TOOL INVOKED] Community recommender triggered with input: Can you provide community recommendations?


[TOOL RESULT]
Type: Recommendation:
Community suggestions for Content Consumption (from other users):
- Book:
  . occasionally reads news articles: liked by 10 users
  . reading graphic novels: liked by 5 users
  . reading science fiction: liked by 5 users
- Game:
  . card games: liked by 6 users
  . strategy games: liked by 6 users
  . arcade games: liked by 5 users
- Movie:
  . watching animated movies: liked by 6 users
  . watching fantasy movies: liked by 6 users
  . watching historical dramas: liked by 6 users
- Music:
  . hip hop music: liked by 6 users
  . indie music: liked by 6 users
  . reggae music: liked by 6 users


# Agent Testing

In [None]:
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [1]:
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [2]:
# LLM = "DialoGPT-small"             # "Phi-3.5-mini-instruct", "DialoGPT-small"
# model_id = f"microsoft/{LLM}"           # f"meta-llama/{LLM}"

In [3]:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import os

# # Define model ID and local paths

# MODEL_PATH = f"LLMs/{LLM}"
# TOKENIZER_PATH = f"Tokenizers/{LLM}"

# # Function to check if directory is already populated
# def is_downloaded(directory):
#     return os.path.exists(directory) and any(os.scandir(directory))

# # Create folders if necessary
# os.makedirs(MODEL_PATH, exist_ok=True)
# os.makedirs(TOKENIZER_PATH, exist_ok=True)

# # Download tokenizer if not already present
# if not is_downloaded(TOKENIZER_PATH):
#     print("⬇️ Downloading tokenizer...")
#     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
#     tokenizer.save_pretrained(TOKENIZER_PATH)
#     print(f"✅ Tokenizer saved to: {TOKENIZER_PATH}")
# else:
#     print(f"✅ Tokenizer already exists at: {TOKENIZER_PATH}")

# # Download model if not already present
# if not is_downloaded(MODEL_PATH):
#     print("⬇️ Downloading model...")
#     model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
#     model.save_pretrained(MODEL_PATH)
#     print(f"✅ Model saved to: {MODEL_PATH}")
# else:
#     print(f"✅ Model already exists at: {MODEL_PATH}")

In [2]:
import logging

# Root logger at INFO so messages from the agent and its tools are shown.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Add the console handler only if the root logger has none yet. Without this
# guard every re-run of the cell attaches another StreamHandler and each log
# record is printed once per attached handler.
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

In [3]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (returned True in the recorded output).
# NOTE(review): presumably this supplies the API key used by the OpenAI-backed
# agent below — confirm which variables are required.
load_dotenv() 

True

In [4]:
from src.utils.persona_util import SQLitePersonaDB

# Handle on the persona database used by the rest of this section.
db = SQLitePersonaDB()

USERNAME = "user_02"

# Uncomment to create the user first if it does not exist yet:
# db.create_user(USERNAME, 'Sali A', 28, 'male', 'user')

USER = db.get_user(USERNAME)

# Names of all tasks known to the DB; the first one is the task under test.
task_list = [format(entry['name']) for entry in db.get_all_tasks()]

TASK = task_list[0]
print(TASK)

# Persona summary for this user, restricted to the selected task.
USER_PERSONA = db.get_user_persona_summary_by_task(USERNAME, TASK)
# USER_PERSONA = USER_PERSONA.replace('user_01', USERNAME)

USER_PERSONA

INFO - SQLitePersonaDB initialized


Content Consumption


'User user_02 preferences for Content Consumption: | Book: like_read 1984, 1984, 1984; likes reading biographies | Career: like_read 1984, 1984 | Game: plays board games | Movie: likes watching comedies | Music: likes pop music'

## OpenAI Agent

In [5]:
from src.agents.PersoAgent import PersoAgent

# Build the agent from the user record, task name and persona summary prepared
# in the previous cell (USER, TASK and USER_PERSONA must already be defined).
agent = PersoAgent(
    user = USER,
    task = TASK,
    prev_personas = USER_PERSONA
)

INFO - Initializing PersoAgent for user: user_02 with task: Content Consumption and previous personas: User user_02 preferences for Content Consumption: | Book: like_read 1984, 1984, 1984; likes reading biographies | Career: like_read 1984, 1984 | Game: plays board games | Movie: likes watching comedies | Music: likes pop music
INFO - Initializing PersoAgent for user: {'username': 'user_02', 'full_name': 'Bob Johnson', 'age': 32, 'gender': 'male', 'role': 'user'} with task: Content Consumption
Device set to use cuda:0
INFO - SQLitePersonaDB initialized
INFO - SQLitePersonaDB initialized


In [6]:
# Send a preference statement to the agent. In the recorded run it routed this
# to the PersonaExtractor tool, which stored a new persona fact in the DB.
# NOTE(review): the recorded response contains the JSON payload twice — check
# whether the agent's output should be de-duplicated.
response = agent.handle_task("I just finished reading the 1984 book, and I loved it.")

print(response)



[1m> Entering new AgentExecutor chain...[0m


INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m
Invoking: `PersonaExtractor` with `{'sentence': 'I just finished reading the 1984 book, and I loved it.'}`


[0m

INFO - [EXTRACTED] relation='like_read', object='1984', topic='Book', task='Content Consumption'
INFO - Persona fact inserted for user user_02, task Content Consumption
INFO - [SAVED TO DB] for username='user_02'


[36;1m[1;3m[TOOL RESULT] relation='like_read', object='1984', topic='Book', task='Content Consumption'[0m

INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "response": "It's great to hear that you loved reading the book 1984!",
    "reason": "User expressing his preference for the book 1984",
    "used_tool": "PersonaExtractor"
}
{
    "response": "It's great to hear that you loved reading the book 1984!",
    "reason": "User expressing his preference for the book 1984",
    "used_tool": "PersonaExtractor"
}[0m

[1m> Finished chain.[0m
{
    "response": "It's great to hear that you loved reading the book 1984!",
    "reason": "User expressing his preference for the book 1984",
    "used_tool": "PersonaExtractor"
}
{
    "response": "It's great to hear that you loved reading the book 1984!",
    "reason": "User expressing his preference for the book 1984",
    "used_tool": "PersonaExtractor"
}


In [7]:
# Ask for community recommendations. In the recorded run the agent invoked the
# CommunityRecommender tool and answered with the book entries only.
response = agent.handle_task("Give me some book recommendation from the community, then.")

print(response)



[1m> Entering new AgentExecutor chain...[0m


INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO - [TOOL INVOKED] Community recommender triggered with input: Can you provide community recommendations?


[32;1m[1;3m
Invoking: `CommunityRecommender` with `{'sentence': 'Can you provide community recommendations?'}`
responded: {
    "response": "I will need to use the CommunityRecommender tool to provide you with book recommendations from the community.",
    "reason": "User requested community recommendations for books.",
    "used_tool": "CommunityRecommender"
}

[0m[33;1m[1;3m[TOOL RESULT]
Type: Recommendation:
Here are community recommendations for Content Consumption:
\n• Book:\n  - occasionally reads news articles: liked by 10 users
\n  - reading graphic novels: liked by 5 users
\n  - reading science fiction: liked by 5 users
\n• Game:\n  - card games: liked by 6 users
\n  - strategy games: liked by 6 users
\n  - arcade games: liked by 5 users
\n• Movie:\n  - watching animated movies: liked by 6 users
\n  - watching fantasy movies: liked by 6 users
\n  - watching historical dramas: liked by 6 users
\n• Music:\n  - hip hop music: liked by 6 users
\n  - indie music: liked by 6 us

INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "response": "Here are some book recommendations from the community:\n- occasionally reads news articles: liked by 10 users\n- reading graphic novels: liked by 5 users\n- reading science fiction: liked by 5 users",
    "reason": "Community recommendations for books based on user's request.",
    "used_tool": "CommunityRecommender"
}[0m

[1m> Finished chain.[0m
{
    "response": "Here are some book recommendations from the community:\n- occasionally reads news articles: liked by 10 users\n- reading graphic novels: liked by 5 users\n- reading science fiction: liked by 5 users",
    "reason": "Community recommendations for books based on user's request.",
    "used_tool": "CommunityRecommender"
}


In [17]:
# Ask a general question that needs no tool. In the recorded run the agent
# answered directly ("used_tool": "None").
response = agent.handle_task("So, which movies should I watch tonight?")

print(response)



[1m> Entering new AgentExecutor chain...[0m


INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "response": "You can consider watching 'Inception' or 'The Shawshank Redemption'. Both are highly acclaimed movies that are worth watching.",
    "reason": "General movie recommendation based on popular choices.",
    "used_tool": "None"
}[0m

[1m> Finished chain.[0m
{
    "response": "You can consider watching 'Inception' or 'The Shawshank Redemption'. Both are highly acclaimed movies that are worth watching.",
    "reason": "General movie recommendation based on popular choices.",
    "used_tool": "None"
}


# Labeling Assistant

In [None]:
# Configure the PyTorch CUDA caching allocator before torch is imported below.
# `expandable_segments:True` is an allocator option, not a hard memory limit.
# NOTE(review): the variable is presumably only read when CUDA is first
# initialised, so it may have no effect if torch was already used in this
# kernel — confirm by running this cell on a fresh kernel.
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch
from transformers import GPT2Tokenizer, AutoModelForCausalLM
import json
import re

class LabelingAssistant:
    """Score how well a user persona matches a classification description.

    Wraps a locally stored Phi-4-mini-instruct checkpoint and asks it for an
    answer of the form ``{"score": <0.0-1.0>, "reasoning": "<text>"}``.
    """

    def __init__(self):
        """Load tokenizer and model from the local checkpoint directories."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # set_per_process_memory_fraction needs a usable CUDA device, so it is
        # only called when one is present; otherwise the CPU fallback chosen
        # above could never be reached.
        if self.device == "cuda":
            # Use 75% of GPU memory (9GB out of 12GB)
            torch.cuda.set_per_process_memory_fraction(0.75)
            torch.cuda.empty_cache()

        # NOTE(review): GPT2Tokenizer is used with a Phi-4 checkpoint — confirm
        # it matches the saved tokenizer files (AutoTokenizer may be safer).
        self.tokenizer = GPT2Tokenizer.from_pretrained("./Tokenizers/Phi-4-mini-instruct")

        self.model = AutoModelForCausalLM.from_pretrained(
            "./LLMs/Phi-4-mini-instruct",
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )
        # Reuse EOS as the padding token so padding=True works in the tokenizer call.
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def get_alignment_score(self, user_persona: str, classification_description: str) -> dict:
        """Ask the model how well ``user_persona`` fits the classification.

        Args:
            user_persona: Text summary of the user's persona.
            classification_description: Text describing the classification.

        Returns:
            A dict with at least ``"score"`` (float clamped to [0.0, 1.0]) and
            ``"reasoning"`` (str). When the model reply contains no usable
            number the score defaults to 0.5.
        """
        prompt = f"""<|user|>
Analyze alignment between user persona and classification. Return JSON with score (0.0-1.0) and reasoning.

User Persona: {user_persona}
Classification: {classification_description}

Response format: {{"score": 0.85, "reasoning": "explanation"}}
<|end|>
<|assistant|>"""

        inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.3,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                use_cache=False
            )

        # Decode only the newly generated tokens (everything after the prompt).
        response = self.tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

        # Drop the tensors before asking the allocator to release cached blocks.
        del outputs, inputs
        torch.cuda.empty_cache()

        return self._parse_response(response)

    @staticmethod
    def _parse_response(response: str) -> dict:
        """Turn the raw model reply into ``{"score": float, "reasoning": str}``."""
        reasoning = response.strip()

        # Preferred path: a flat JSON object that mentions "score".
        json_match = re.search(r'\{[^}]*"score"[^}]*\}', response)
        if json_match:
            try:
                parsed = json.loads(json_match.group())
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                try:
                    score = float(parsed.get("score"))
                except (TypeError, ValueError):
                    score = None
                if score is not None:
                    result = dict(parsed)
                    result["score"] = min(1.0, max(0.0, score))
                    # Callers index result['reasoning'], so always provide it.
                    result.setdefault("reasoning", reasoning)
                    return result

        # Fallback: first well-formed number in the reply. The pattern cannot
        # match a lone "." (ordinary sentence punctuation), which float() rejects.
        score_match = re.search(r'\d+(?:\.\d+)?|\.\d+', response)
        score = float(score_match.group()) if score_match else 0.5
        return {"score": min(1.0, max(0.0, score)), "reasoning": reasoning}

    def reset_memory(self):
        """Lift the per-process GPU memory cap again and release cached blocks."""
        # No-op without CUDA, where the memory-fraction call is not usable.
        if torch.cuda.is_available():
            torch.cuda.set_per_process_memory_fraction(1.0)
            torch.cuda.empty_cache()

# Usage: score one hand-written persona against one classification description.
assistant = LabelingAssistant()

user_persona = (
    "Content Consumption: | Book: likes reading novels | Game: plays video games "
    "| Movie: likes watching sci-fi | Music: likes classical music"
)
classification = "Classify users based on their passion for reading books and literature."

result = assistant.get_alignment_score(user_persona, classification)
print("Score: {}".format(result['score']))
print("Reasoning: {}".format(result['reasoning']))

# Restore the default GPU memory fraction once the demo is done.
assistant.reset_memory()


  _ = torch.tensor([0], device=i)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Score: 0.5
Reasoning: The user persona shows a strong interest in reading novels, which aligns well with the classification's focus on reading books and literature. However, the user also enjoys other forms of media such as video games, sci-fi movies, and classical music, which are not directly related to the classification's focus on reading. Therefore, while there is a significant overlap, the user's diverse interests prevent a perfect score.


In [1]:
import logging

# Root logger at INFO so messages from the DB helper and tools are shown.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Add the console handler only if the root logger has none yet. Without this
# guard every re-run of the cell attaches another StreamHandler and each log
# record is printed once per attached handler.
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

In [2]:
from src.utils.persona_util import SQLitePersonaDB

db = SQLitePersonaDB()

# Draw up to 5 candidate usernames for the given classification task; the
# method name indicates the selection is random, so results vary between runs.
res = db.get_random_candidate_usernames(classification_task_name="Content Consumption", count=5)
res

INFO - SQLitePersonaDB initialized
INFO - Found 5 candidate users for classification task 'Content Consumption'


['user_36', 'user_19', 'user_02', 'user_06', 'user_34']

In [3]:
from src.tools.labeling_assistant import LabelingAssistant

# NOTE: this LabelingAssistant comes from src.tools.labeling_assistant and is
# called with (pool size, classification task name), unlike the class defined
# inline earlier in this notebook, which takes (persona text, description).
assistant = LabelingAssistant()
pool_size = 5
clf_task_name = "Camping Enthusiast"
# Per the recorded output, this returns a list of per-candidate dicts that
# include 'score' and 'reasoning'.
result = assistant.get_alignment_score(pool_size, clf_task_name)

  _ = torch.tensor([0], device=i)
INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

INFO - SQLitePersonaDB initialized
INFO - Found 5 candidate users for classification task 'Camping Enthusiast'
INFO - Found 5 candidates for task 'Camping Enthusiast'
INFO - Persona Summary for user_31: 
User user_31 preferences: || Career Development - Career: prefers_path academia | Education: studied_at Columbia | Skill: has_skill writing || Content Consumption - Book: likes reading mysteries | Game: plays role-playing games | Movie: likes watching adventure movies | Music: likes blues music || Lifestyle Optimization - Exercise: has_hobby climbing | Mental Health: focuses_on reading self-help | Outdoor: likes hiking and sleeping in tents
INFO - Persona Summary for user_13: 
User user_13 preferences: || Career Development - Career: prefers_path industry | Education: studied_at Yale | Skill: has_skill teaching || Content Consumption - Book: likes reading philosophy | Game: plays arcade games | Movie: likes watching fantasy movies | Music: likes indie music || Lifestyle Optimization - 

In [4]:
# Display the per-candidate results returned by get_alignment_score above.
result

[{'username': 'user_32',
  'full_name': 'Indigo Underwood',
  'age': 39,
  'gender': 'female',
  'score': 0.85,
  'reasoning': '```json\n{\n  "score": 0.85,\n  "reasoning": "User has a hobby in camping, indicating a strong interest in outdoor activities, which aligns well with the classification criteria."\n}\n``` <|end|>'},
 {'username': 'user_39',
  'full_name': 'Finley Nash',
  'age': 23,
  'gender': 'male',
  'score': 0.0,
  'reasoning': '```json\n{\n  "score": 0.0,\n  "reasoning": "No topics in the user persona relate to camping and outdoor activities."\n}\n``` <|end|>'},
 {'username': 'user_08',
  'full_name': 'Hannah Adams',
  'age': 24,
  'gender': 'female',
  'score': 0.7,
  'reasoning': '```json\n{\n  "score": 0.7,\n  "reasoning": "User has a hobby in mountain biking, which is related to outdoor activities. However, no direct interest in camping or broader outdoor activities is mentioned."\n}\n``` <|end|>'},
 {'username': 'user_20',
  'full_name': 'Quinn Vega',
  'age': 19,
 

# Classification

In [2]:
import sys
# Make the modules under src/ importable as top-level packages (e.g. `utils`).
sys.path.append('src')
from utils.persona_util import SQLitePersonaDB

db = SQLitePersonaDB()
# NOTE(review): _create_tables() is a private method (leading underscore);
# presumably it ensures the schema exists — confirm whether the constructor
# already does this, in which case the call can be dropped.
db._create_tables()
# Stored predictions for classification task 2 (an empty list in the recorded run).
pred = db.get_predictions(classification_task_id= 2)
pred

INFO:persona_util:SQLitePersonaDB initialized


[]

In [1]:
# Simple test for classification functions
import sys
# Avoid appending the same entry again each time the cell is re-run.
if 'src' not in sys.path:
    sys.path.append('src')
from utils.persona_util import SQLitePersonaDB

db = SQLitePersonaDB()

# Get classification task name
task_info = db.get_classification_task(2)
print(f"Classification Task: {task_info['name']}")

# Show the filtered persona summary of the first two responders. Slicing
# (instead of indexing [0] and [1]) keeps the cell from failing with an
# IndexError when the task has fewer than two responses.
responses = db.get_classification_task_responses(task_info['name'])
for position, response in enumerate(responses[:2], start=1):
    username = response['username']
    print(f"\nUser {position}: {username}")
    summary = db.get_user_persona_summary(username)
    filtered = db.filter_persona_summary_from_topics(summary)
    print(f"Filtered: {filtered}")


INFO:persona_util:SQLitePersonaDB initialized


Classification Task: Camping Enthusiast

User 1: user_27
Filtered:  | prefers_path academia | studied_at Oxford | has_skill marketing | likes reading children's books | plays mobile games | likes watching thrillers | likes jazz music | has_hobby hiking | focuses_on journaling | frequently_visits national parks

User 2: user_38
Filtered:  | prefers_path industry | studied_at Oxford | has_skill marketing | likes reading biographies | plays arcade games | likes watching thrillers | likes jazz music | has_hobby hiking | focuses_on journaling | has_experience_with campfire cooking


In [3]:
# Test PersonaClassifier
import sys
sys.path.append('src')
from tools.persona_classifier import PersonaClassifier

classifier = PersonaClassifier()

# Resolve the task name for classification task ID 2.
task_info = classifier.db.get_classification_task(2)
task_name = task_info['name']
print("Classification Task: {}".format(task_name))

# Fit the classifier for that task and report the training summary.
result = classifier.train(task_name)
print("Training result: {}".format(result))

if result.get("success"):
    # Draw candidate users, then predict for them.
    candidates = classifier.db.get_random_candidate_usernames(task_name, 10)
    if not candidates:
        print("No candidates found")
    else:
        predictions = classifier.predict(candidates, 5)
        print("\nPredictions:")
        for pred in predictions:
            print("  {}: {} (confidence: {})".format(pred['username'], pred['prediction'], pred['confidence']))


INFO:persona_util:SQLitePersonaDB initialized
INFO:persona_util:Found 10 candidate users for classification task 'Camping Enthusiast'


Classification Task: Camping Enthusiast
Training result: {'success': True, 'training_size': 12, 'accepted': 6, 'declined': 6}

Predictions:
  user_33: declined (confidence: 0.231)
  user_16: accepted (confidence: 0.095)
  user_39: declined (confidence: 0.076)
  user_02: accepted (confidence: 0.042)
  user_01: accepted (confidence: 0.023)


In [3]:
# Test updated classifier
import sys
sys.path.append('src')
from tools.persona_classifier import PersonaClassifier

classifier = PersonaClassifier()

# Train and predict
classification_task_id = 2
task_info = classifier.db.get_classification_task(classification_task_id)
result = classifier.train(task_info['name'])

# Only predict when training reported success. Per the log output below,
# predict_and_save also persists the returned predictions for this task.
# NOTE(review): min_confidence=0.6 presumably filters out predictions below
# that confidence — confirm. Also, `predictions` is left unset (or stale from
# an earlier cell) when training fails, which affects the display cell below.
if result.get("success"):
    predictions = classifier.predict_and_save(classification_task_id, count=30, min_confidence=0.6)
    print(predictions)


INFO:persona_util:SQLitePersonaDB initialized
INFO:persona_util:Found 30 candidate users for classification task 'Camping Enthusiast'
INFO:persona_util:Saved 8 predictions for classification task 2


{'classification_task': {'id': 2, 'name': 'Camping Enthusiast'}, 'predictions': [{'username': 'user_22', 'predicted_label': 'Camping Enthusiast', 'confidence': np.float64(0.637)}, {'username': 'user_10', 'predicted_label': 'Camping Enthusiast', 'confidence': np.float64(0.667)}, {'username': 'user_25', 'predicted_label': 'Camping Enthusiast', 'confidence': np.float64(0.64)}, {'username': 'user_08', 'predicted_label': 'Camping Enthusiast', 'confidence': np.float64(0.649)}, {'username': 'user_12', 'predicted_label': 'Not Enthusiast', 'confidence': np.float64(0.63)}, {'username': 'user_36', 'predicted_label': 'Not Enthusiast', 'confidence': np.float64(0.709)}, {'username': 'user_33', 'predicted_label': 'Not Enthusiast', 'confidence': np.float64(0.663)}, {'username': 'user_40', 'predicted_label': 'Not Enthusiast', 'confidence': np.float64(0.602)}]}


In [4]:
# Display the dict returned by predict_and_save above (pretty-printed by the notebook).
predictions

{'classification_task': {'id': 2, 'name': 'Camping Enthusiast'},
 'predictions': [{'username': 'user_22',
   'predicted_label': 'Camping Enthusiast',
   'confidence': np.float64(0.637)},
  {'username': 'user_10',
   'predicted_label': 'Camping Enthusiast',
   'confidence': np.float64(0.667)},
  {'username': 'user_25',
   'predicted_label': 'Camping Enthusiast',
   'confidence': np.float64(0.64)},
  {'username': 'user_08',
   'predicted_label': 'Camping Enthusiast',
   'confidence': np.float64(0.649)},
  {'username': 'user_12',
   'predicted_label': 'Not Enthusiast',
   'confidence': np.float64(0.63)},
  {'username': 'user_36',
   'predicted_label': 'Not Enthusiast',
   'confidence': np.float64(0.709)},
  {'username': 'user_33',
   'predicted_label': 'Not Enthusiast',
   'confidence': np.float64(0.663)},
  {'username': 'user_40',
   'predicted_label': 'Not Enthusiast',
   'confidence': np.float64(0.602)}]}