In [8]:
!pip install praw beautifulsoup4 transformers torch tqdm



In [9]:
import torch

# Select the torch device once: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The device name is only queried on the CUDA branch of the conditional expression.
print(
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if device.type == "cuda"
    else "GPU not available. Using CPU."
)

GPU is available: Tesla T4


In [10]:
import praw
from transformers import pipeline
from tqdm import tqdm
from getpass import getpass
import os

# Prompt (without echo) for each Reddit API setting and store it in the process
# environment, where init_reddit() reads it back with os.getenv().
for _env_name, _field in (
    ("REDDIT_CLIENT_ID", "client_id"),
    ("REDDIT_CLIENT_SECRET", "client_secret"),
    ("REDDIT_USER_AGENT", "user_agent"),
):
    os.environ[_env_name] = getpass(f"Enter Reddit {_field}: ")

# Initialize Reddit with your app credentials
def init_reddit():
    """Build a ``praw.Reddit`` client from the ``REDDIT_*`` environment variables.

    Reads ``REDDIT_CLIENT_ID``, ``REDDIT_CLIENT_SECRET`` and ``REDDIT_USER_AGENT``
    with ``os.getenv`` and passes them as the ``client_id``, ``client_secret`` and
    ``user_agent`` keyword arguments. A variable that is not set is passed as
    ``None``.

    Returns:
        The object returned by ``praw.Reddit(...)``.
    """
    env_to_kwarg = {
        "REDDIT_CLIENT_ID": "client_id",
        "REDDIT_CLIENT_SECRET": "client_secret",
        "REDDIT_USER_AGENT": "user_agent",
    }
    settings = {kwarg: os.getenv(env_name) for env_name, kwarg in env_to_kwarg.items()}
    return praw.Reddit(**settings)

Enter Reddit client_id: ··········
Enter Reddit client_secret: ··········
Enter Reddit user_agent: ··········


In [11]:
# Scrape Reddit user posts and comments
def scrape_user_data(username, reddit, limit=100):
    """Collect a Redditor's newest submissions and comments.

    Args:
        username: Reddit account name, passed to ``reddit.redditor``.
        reddit: Client object exposing ``redditor(name)``, e.g. the value
            returned by ``init_reddit()``.
        limit: Forwarded unchanged as ``limit=`` to both
            ``user.submissions.new`` and ``user.comments.new``. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        A ``(posts, comments)`` tuple. ``posts`` is a list of dicts with keys
        ``'title'``, ``'body'`` (the submission's ``selftext``) and ``'url'``.
        ``comments`` is a list of dicts with keys ``'body'`` and ``'link'``
        (``https://reddit.com`` + the comment's ``permalink``).
    """
    user = reddit.redditor(username)

    posts = [
        {'title': submission.title, 'body': submission.selftext, 'url': submission.url}
        for submission in user.submissions.new(limit=limit)
    ]
    comments = [
        {'body': comment.body, 'link': f"https://reddit.com{comment.permalink}"}
        for comment in user.comments.new(limit=limit)
    ]

    return posts, comments

In [12]:
# Use HuggingFace summarizer
def summarize_text(text_chunks):
    """Summarize each text chunk with the distilbart-cnn summarization pipeline.

    Args:
        text_chunks: Sequence of strings to summarize.

    Returns:
        A list with one summary string per chunk, in input order. An empty
        input returns ``[]`` immediately, without constructing the pipeline.
    """
    # Nothing to do: skip building the pipeline (and the model load it
    # triggers) when there is no text to summarize.
    if not text_chunks:
        return []

    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

    summaries = []
    for chunk in tqdm(text_chunks):
        # truncation=True asks the pipeline's tokenizer to cut over-long
        # inputs instead of forwarding them to the model unmodified.
        result = summarizer(
            chunk,
            max_length=60,
            min_length=20,
            do_sample=False,
            truncation=True,
        )
        summaries.append(result[0]['summary_text'])
    return summaries

# Build user persona from summaries
def build_user_persona(posts, comments):
    """Summarize a user's longer posts/comments and bucket the summaries.

    Args:
        posts: Iterable of dicts with ``'title'`` and ``'body'`` keys and an
            optional ``'url'`` key (missing ``'url'`` becomes ``''``).
        comments: Iterable of dicts with ``'body'`` and ``'link'`` keys.

    Returns:
        A dict with the keys ``'Behavior Traits'``, ``'Interests'``,
        ``'Goals'``, ``'Needs'`` and ``'Motivations'`` (in that order), each
        mapping to a list of ``(summary, source)`` tuples.

    Only texts longer than 100 characters are used, and each is cut to its
    first 1024 characters before being handed to ``summarize_text``.
    """
    # (chunk, source) pairs, posts first, then comments.
    kept = []

    for entry in posts:
        text = f"{entry['title']}\n{entry['body']}"
        if len(text) <= 100:
            continue
        kept.append((text[:1024], entry.get('url', '')))

    for entry in comments:
        text = entry['body']
        if len(text) <= 100:
            continue
        kept.append((text[:1024], entry['link']))

    summaries = summarize_text([chunk for chunk, _ in kept])

    persona = {
        label: []
        for label in ('Behavior Traits', 'Interests', 'Goals', 'Needs', 'Motivations')
    }

    # Ordered rules: the first rule with a matching keyword wins.
    # NOTE: matching is by plain substring on the lower-cased summary, so
    # e.g. 'like' also matches inside 'likely'.
    rules = (
        (('want', 'goal'), 'Goals'),
        (('need',), 'Needs'),
        (('enjoy', 'like'), 'Interests'),
        (('feel', 'think'), 'Behavior Traits'),
    )

    for position, summary in enumerate(summaries):
        origin = kept[position][1]
        lowered = summary.lower()
        bucket = next(
            (label for needles, label in rules if any(needle in lowered for needle in needles)),
            'Motivations',  # fallback when no keyword is present
        )
        persona[bucket].append((summary, origin))

    return persona

In [13]:
# Save persona to text file
def save_persona(username, persona):
    """Write the persona to ``output/<username>_persona.txt`` as plain text.

    Args:
        username: Used to build the output file name.
        persona: Mapping of category name to a list of ``(value, source)``
            pairs, as returned by ``build_user_persona``.

    Returns:
        The path of the written file, as a string.

    The ``output`` directory is created if it does not exist. Each category
    gets a ``== <category> ==`` header followed by one entry per pair.
    """
    os.makedirs("output", exist_ok=True)
    filepath = f"output/{username}_persona.txt"

    with open(filepath, "w", encoding="utf-8") as out:
        for category, entries in persona.items():
            out.write(f"\n== {category} ==\n")
            out.writelines(
                f"- {text}\n  [Source] {origin}\n" for text, origin in entries
            )

    print(f"\n Persona saved to {filepath}")
    return filepath

# Extract username from Reddit profile URL
def extract_username_from_url(url):
    """Extract the Reddit username from a profile URL (or a bare username).

    The URL's path is examined and the segment that follows ``user`` or ``u``
    is returned, so query strings, fragments, trailing sub-pages such as
    ``/comments/`` and surrounding whitespace do not affect the result. When
    no such marker is present, the last non-empty path segment is returned,
    which keeps bare usernames working.

    Args:
        url: Profile URL such as ``https://www.reddit.com/user/<name>/``,
            or a plain username.

    Returns:
        The username as a string, or ``''`` when the path has no segments.
    """
    # Local import so this cell does not depend on edits to the import cell.
    from urllib.parse import urlparse

    path = urlparse(url.strip()).path
    segments = [segment for segment in path.split("/") if segment]

    # Look for ".../user/<name>" or ".../u/<name>"; the marker cannot be the
    # final segment because a name has to follow it.
    for index, segment in enumerate(segments[:-1]):
        if segment.lower() in ("user", "u"):
            return segments[index + 1]

    # Fallback: last non-empty path segment (covers bare usernames).
    return segments[-1] if segments else ""

In [14]:
# Driver cell: run the whole pipeline for one Reddit profile.
# Requires the earlier cells to have been run first (credentials set in the
# environment and all helper functions defined).

# Input your Reddit profile URL
reddit_url = "https://www.reddit.com/user/Hungry-Move-6603/"  # Change this to any valid Reddit user
# Derive the account name from the profile URL.
username = extract_username_from_url(reddit_url)

print(f"Extracting data for: {username}")
# Create the PRAW client from the REDDIT_* environment variables, then fetch
# the user's newest submissions and comments.
reddit = init_reddit()
posts, comments = scrape_user_data(username, reddit)

print("Generating persona...")
# Summarizes the longer posts/comments (this constructs the summarization
# pipeline) and sorts the summaries into persona categories.
persona = build_user_persona(posts, comments)

# Write the persona to output/<username>_persona.txt; the path is returned.
filepath = save_persona(username, persona)


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Extracting data for: Hungry-Move-6603


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Generating persona...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cuda:0

  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:04<00:20,  4.09s/it][A
 33%|███▎      | 2/6 [00:06<00:13,  3.26s/it][AYour max_length is set to 60, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)

 50%|█████     | 3/6 [00:07<00:06,  2.27s/it][AYour max_length is set to 60, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)

 67%|██████▋   | 4/6 [00:08<00:03,  1.64s/it][AYour max_length is set to 60, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_lengt


 Persona saved to output/Hungry-Move-6603_persona.txt



