In [None]:
import praw
import openai
import os
from getpass import getpass

In [None]:
import google.generativeai as genai
from getpass import getpass

In [None]:
import json
import re

In [None]:
# Ask for each credential with hidden input and keep it in the process
# environment, where the later cells read it from. Prompts appear in the
# order listed here.
_CREDENTIAL_PROMPTS = (
    ('REDDIT_CLIENT_ID', 'Enter your Reddit client ID: '),
    ('REDDIT_CLIENT_SECRET', 'Enter your Reddit client secret: '),
    ('REDDIT_USER_AGENT', 'Enter your Reddit user agent: '),
    ('GOOGLE_API_KEY', 'Enter your Google AI API Key: '),
)
for _env_name, _prompt_text in _CREDENTIAL_PROMPTS:
    os.environ[_env_name] = getpass(_prompt_text)

In [None]:
# Create the module-level `reddit` client used by scrape_redditor_data and
# fetch a single front-page submission as a connectivity check.
try:
    reddit_settings = dict(
        client_id=os.environ['REDDIT_CLIENT_ID'],
        client_secret=os.environ['REDDIT_CLIENT_SECRET'],
        user_agent=os.environ['REDDIT_USER_AGENT'],
    )
    reddit = praw.Reddit(**reddit_settings)
    next(reddit.front.hot(limit=1))
except Exception as err:
    # Any failure (missing variable, rejected credentials, network) is only reported.
    print(f"Failed to connect to Reddit API. Please check your credentials. Error: {err}")
else:
    print("Successfully connected to Reddit API.")

In [None]:
def scrape_redditor_data(username):
    """Collect a Redditor's comments and submissions as formatted text snippets.

    Relies on the module-level ``reddit`` client.

    Args:
        username: Reddit account name to look up.

    Returns:
        A dict with two lists of strings, ``"comments"`` and ``"posts"``
        (newest first, as yielded by the ``.new`` listings), or ``None`` when
        the redditor object exposes no ``id`` attribute or any exception is
        raised while fetching. Problems are printed, never raised.
    """
    try:
        profile = reddit.redditor(username)

        if hasattr(profile, 'id'):
            # Comments are fetched before submissions; each entry ends with a
            # "---" separator line.
            comment_entries = [
                f"Comment in r/{item.subreddit.display_name} (Score: {item.score}): {item.body}\n---\n"
                for item in profile.comments.new(limit=None)
            ]
            post_entries = [
                f"Post in r/{item.subreddit.display_name} (Score: {item.score}): {item.title}\n{item.selftext}\n---\n"
                for item in profile.submissions.new(limit=None)
            ]

            print(f"Found {len(comment_entries)} comments and {len(post_entries)} posts for user '{username}'.")

            return {"comments": comment_entries, "posts": post_entries}

        print(f"User '{username}' not found or is suspended.")
        return None

    except Exception as err:
        print(f"An error occurred while scraping Reddit: {err}")
        return None

In [None]:
def build_user_persona(user_data):
    """
    Ask Google Gemini to write a citation-backed persona from scraped Reddit content.

    Args:
        user_data: Mapping with "comments" and "posts" keys, each holding a
            sequence of strings (they are concatenated with "".join). A falsy
            value, or one where both sequences are empty, is treated as
            "no data".

    Returns:
        Always a string: the model's response text on success, otherwise a
        human-readable message describing what went wrong (API key
        configuration failure, no user data, or an error from the API call).
        Nothing is raised, so callers cannot tell a persona from an error
        message by type.
    """
    # A missing GOOGLE_API_KEY environment variable raises KeyError here, which
    # is reported through the same message as any other configuration failure.
    try:
        genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
    except Exception as e:
        return f"Google API Key configuration failed. Ensure you have entered it correctly. Error: {e}"

    if not user_data or (not user_data["comments"] and not user_data["posts"]):
        return "Could not generate a persona due to lack of user data."

    # Comments come first, then posts. The prompt embeds only the first 25000
    # characters of this string, so with a long comment history the posts may
    # be cut off partly or entirely.
    all_content = "".join(user_data["comments"]) + "".join(user_data["posts"])
    
    # The template's line indentation is part of the text sent to the model.
    prompt = f"""
    You are an expert analyst. Based on the following Reddit comments and posts, create a detailed user persona.
    
    **CRITICAL INSTRUCTION:** For every single point you make in every category, you MUST provide a direct quote from the user's content as a citation. The format is mandatory: (Citation: "The user's original text snippet..."). If you cannot find a direct quote to support a point and must make an inference, you MUST state it as (Citation: Inferred from overall activity). Do not make up citations.

    **EXAMPLE OF THE REQUIRED FORMAT:**
    *   **Interests:**
        *   Baking Sourdough Bread (Citation: "My starter is finally active enough to bake a proper sourdough loaf this weekend.")
        *   Classic Science Fiction (Citation: "Just finished re-reading Dune for the fifth time, it never gets old.")
        *   PC Gaming (Citation: Inferred from frequent posts in r/buildapc and r/gaming.)

    **--- START PERSONA GENERATION ---**

    **Persona Title:** [Create a short, creative title for the persona, like "The Knowledgeable Hobbyist"]

    **Demographics:**
    *   **Age (Estimated):** [Estimate the age range] (Citation: ...)
    *   **Location (Estimated):** [Estimate the location] (Citation: ...)
    *   **Occupation (Estimated):** [Estimate the occupation] (Citation: ...)
    *   **Interests:** [List key interests as bullet points, each with its own citation]

    **Personality (based on user's tone and topics):**
    *   [Provide a brief analysis of the user's personality, with a citation for each observation.]

    **Motivations:**
    *   [List user's key motivations as bullet points, each with its own citation.]

    **Frustrations/Pain Points:**
    *   [List user's key frustrations as bullet points, each with its own citation.]

    **Quote:**
    *   [A representative quote from the user's comments/posts that captures their essence.]

    **Reddit Activity Summary:**
    *   **Most Active Subreddits:** [List the subreddits the user is most active in] (Citation: Inferred from the source of the provided comments and posts.)
    *   **General Tone:** [Describe the overall tone] (Citation: ...)

    **--- END PERSONA GENERATION ---**

    Here is the user's content (use this for your analysis):
    ---
    {all_content[:25000]}
    """

    # Any failure while creating the model, generating, or reading the response
    # text is printed in full and replaced by a generic message for the caller.
    try:
        model = genai.GenerativeModel('gemini-1.5-flash-latest')
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        print(f"Full error from Google AI: {e}")
        return f"An error occurred with the Google AI API. Check the error message above."

In [None]:
def save_persona_to_file(username, persona_text, output_dir='output'):
    """Write the persona as plain text to ``<output_dir>/<username>_persona.txt``.

    Every ``*`` character is removed from the text first, which drops the
    Markdown bold/bullet markers (and any other asterisks) from the output.

    Args:
        username: Name used as the file-name prefix.
        persona_text: Persona text to save.
        output_dir: Directory to write into; created if missing.
            Defaults to ``'output'`` relative to the working directory.

    Returns:
        None. Failures to create the directory or write the file are printed,
        not raised.
    """
    filename = os.path.join(output_dir, f"{username}_persona.txt")
    cleaned_text = persona_text.replace('*', '')
    try:
        # Directory creation sits inside the try so that it is reported the
        # same way as a failed write instead of escaping as a traceback.
        os.makedirs(output_dir, exist_ok=True)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(cleaned_text)
        print(f"Cleaned persona successfully saved to {filename}")
    except Exception as e:
        print(f"Could not save the text file. Error: {e}")

In [None]:


def _parse_persona_text(persona_text):
    """Extract the structured persona fields from the model's Markdown answer.

    Fields that cannot be located keep their defaults ("N/A", empty list or
    empty string), so the returned dict always has the same keys.
    """
    persona_data = {
        "personaTitle": "N/A",
        "demographics": {"Age": "N/A", "Occupation": "N/A", "Location": "N/A"},
        "habits": [], "frustrations": [], "goals": [], "quote": "",
    }

    # The section patterns expect a bare "\n" right after each "**Header:**"
    # and a truly empty line between sections. Normalise Windows line endings
    # and trailing spaces/tabs so such output still parses.
    text = persona_text.replace("\r\n", "\n")
    text = re.sub(r"[ \t]+$", "", text, flags=re.MULTILINE)

    def get_section_content(title):
        # Everything after "**<title>:**" up to the next blank line followed
        # by "**", or the end of the text.
        pattern = re.compile(r"\*\*" + re.escape(title) + r":\*\*\n(.*?)(?=\n\n\*\*|$)", re.DOTALL)
        match = pattern.search(text)
        return match.group(1).strip() if match else ""

    def get_list_items(block_content):
        # Bullet text without its "(Citation: ...)" suffix; bullets that carry
        # no citation are not captured.
        if not block_content:
            return []
        items = re.findall(r"\* (.*?)(?=\s*\(Citation:)", block_content)
        return [item.strip() for item in items]

    title_match = re.search(r"\*\*Persona Title:\*\*\s*(.*)", text)
    if title_match:
        persona_data["personaTitle"] = title_match.group(1).strip()

    demographics_block = get_section_content("Demographics")
    if demographics_block:
        for field in ("Age", "Location", "Occupation"):
            field_match = re.search(
                re.escape(field) + r" \(Estimated\):\*\* (.*?)\s*\(Citation:",
                demographics_block,
            )
            if field_match:
                persona_data["demographics"][field] = field_match.group(1).strip()

    # JSON key <- persona section heading.
    persona_data["habits"] = get_list_items(get_section_content("Interests"))
    persona_data["frustrations"] = get_list_items(get_section_content("Frustrations/Pain Points"))
    persona_data["goals"] = get_list_items(get_section_content("Motivations"))

    # First double-quoted span inside the bullet that follows the Quote header.
    quote_match = re.search(r"\*\*Quote:\*\*\s*\n\s*\*.*?\"(.*?)\"", text, re.DOTALL)
    if quote_match:
        persona_data["quote"] = quote_match.group(1).strip()

    return persona_data


def save_persona_to_json_final(username, persona_text, output_dir='public'):
    """Parse the text persona and save it as ``<output_dir>/persona_data.json``.

    The file name is fixed, so each call overwrites the previous persona.

    Args:
        username: Accepted for symmetry with the text saver; it is not used in
            the file name or the JSON content.
        persona_text: Persona in the Markdown layout requested from the model.
        output_dir: Directory to write into; created if missing.
            Defaults to ``'public'`` relative to the working directory.

    Returns:
        None. Any parsing or I/O failure is printed, not raised.
    """
    try:
        persona_data = _parse_persona_text(persona_text)

        os.makedirs(output_dir, exist_ok=True)
        filename = os.path.join(output_dir, "persona_data.json")

        with open(filename, "w", encoding="utf-8") as f:
            json.dump(persona_data, f, indent=4)

        print(f"Definitive persona data successfully saved to {filename}")

    except Exception as e:
        print(f"An error occurred in the FINAL JSON parser. Error: {e}")

In [None]:

def extract_reddit_username(profile_url):
    """Return the account name from a Reddit profile URL, or None if absent.

    Accepts both ``/user/<name>`` and the short ``/u/<name>`` forms, ignores
    surrounding whitespace, and stops at the first character that is not a
    letter, digit, underscore or hyphen, so trailing path segments and query
    strings are not part of the result.
    """
    match = re.search(r"/(?:user|u)/([A-Za-z0-9_-]+)", profile_url.strip())
    return match.group(1) if match else None


if __name__ == '__main__':
    reddit_url = input("Enter the full Reddit user profile URL: ")

    # Only the URL check decides whether the "invalid URL" message is shown;
    # errors from the later steps are reported by the functions themselves.
    reddit_username = extract_reddit_username(reddit_url)

    if reddit_username is None:
        print("\n[ERROR] Invalid URL format. Please use a full Reddit user profile URL.")
    else:
        print(f"\nExtracted username: '{reddit_username}'. Starting process...")

        print("\nStep 1: Scraping data from Reddit...")
        user_data = scrape_redditor_data(reddit_username)

        if user_data:
            print("\nStep 2: Building user persona with Google Gemini...")
            # build_user_persona always returns a string (an explanatory
            # message on failure), and that string is what gets saved below.
            persona_text = build_user_persona(user_data)

            print("\nStep 3: Saving persona to files...")
            save_persona_to_file(reddit_username, persona_text)
            save_persona_to_json_final(reddit_username, persona_text)

            print("----------------------------------")

        else:
            print("\nProcess stopped because no data could be scraped for the user.")