# Install Dependencies

In [None]:
!pip install httpx loguru parsel pandas requests beautifulsoup4


# Import Dependencies 

In [4]:
import json
import asyncio
from typing import List, Dict, Union
from datetime import datetime, timezone
from httpx import AsyncClient, Response
from loguru import logger as log
from parsel import Selector
import logging
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from httpx import AsyncClient, Response
from parsel import Selector
from typing import List, Dict, Union

# Fetch the post of the User

In [2]:

# Minimal logger
class log:
    @staticmethod
    def success(msg): print("[SUCCESS]", msg)
    @staticmethod
    def error(msg): print("[ERROR]", msg)

client = AsyncClient(
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=10.0,
    follow_redirects=True
)
from typing import List, Dict, Union
from httpx import Response

def parse_user_posts(response: Response) -> Dict:
    selector = Selector(response.text)
    data = []
    for box in selector.xpath("//div[@id='siteTable']/div[contains(@class, 'thing')]"):
        author = box.xpath("./@data-author").get()
        link = box.xpath("./@data-permalink").get()
        publishing_date = box.xpath("./@data-timestamp").get()
        publishing_date = (
            datetime.fromtimestamp(int(publishing_date) / 1000.0, tz=timezone.utc).isoformat()
            if publishing_date else None
        )
        comment_count = box.xpath("./@data-comments-count").get()
        post_score = box.xpath("./@data-score").get()
        data.append({
            "authorId": box.xpath("./@data-author-fullname").get(),
            "author": author,
            "authorProfile": f"https://www.reddit.com/user/{author}" if author else None,
            "postId": box.xpath("./@data-fullname").get(),
            "postLink": f"https://www.reddit.com{link}" if link else None,
            "postTitle": box.xpath(".//p[@class='title']/a/text()").get(),
            "postSubreddit": box.xpath("./@data-subreddit-prefixed").get(),
            "publishingDate": publishing_date,
            "commentCount": int(comment_count) if comment_count else None,
            "postScore": int(post_score) if post_score else None,
            "attachmentType": box.xpath("./@data-type").get(),
            "attachmentLink": box.xpath("./@data-url").get(),
        })
    next_page_url = selector.xpath("//span[@class='next-button']/a/@href").get()
    return {"data": data, "url": next_page_url}

async def scrape_user_posts(username: str, sort: Union["new", "top", "controversial"], max_pages: int = None) -> List[Dict]:
    url = f"https://old.reddit.com/user/{username}/submitted/?sort={sort}"
    response = await client.get(url)
    data = parse_user_posts(response)
    post_data, next_page_url = data["data"], data["url"]

    visited_urls = set()
    while next_page_url and (max_pages is None or max_pages > 0):
        if next_page_url in visited_urls:
            break
        visited_urls.add(next_page_url)

        response = await client.get(next_page_url)
        data = parse_user_posts(response)
        next_page_url = data["url"]
        post_data.extend(data["data"])
        if max_pages is not None:
            max_pages -= 1

    log.success(f"Scraped {len(post_data)} posts from {username}'s Reddit profile")
    return post_data



---

## Provide username

In [14]:
reddit_username = 
posts = await scrape_user_posts(reddit_username, "new", max_pages=None)


[SUCCESS] Scraped 2 posts from Hungry-Move-6603's Reddit profile


In [15]:
# Convert it into a data frame
df = pd.DataFrame(posts)
df.head(5)

import requests
from bs4 import BeautifulSoup
import re

def fetch_reddit_post_text(post_url: str):
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    response = requests.get(post_url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    # Generalized match: find div where ID matches "t3_<something>-post-rtjson-content"
    post_div = soup.find("div", id=re.compile(r"t3_[\w]+-post-rtjson-content"))

    if post_div:
        post_paragraphs = post_div.find_all("p")
        post_text = "\n\n".join(p.get_text(strip=True) for p in post_paragraphs)
        return post_text
    else:
        return "❌ Post content not found."


df['postContent'] = df['postLink'].apply(fetch_reddit_post_text)

In [16]:
# visualize the data frame
df.head(5)

Unnamed: 0,authorId,author,authorProfile,postId,postLink,postTitle,postSubreddit,publishingDate,commentCount,postScore,attachmentType,attachmentLink,postContent
0,t2_bcxve1ah,Hungry-Move-6603,https://www.reddit.com/user/Hungry-Move-6603,t3_1lx50qm,https://www.reddit.com/r/lucknow/comments/1lx5...,Productive weekend activities in LKO?,r/lucknow,2025-07-11T12:00:04+00:00,0,1,link,/r/lucknow/comments/1lx50qm/productive_weekend...,❌ Post content not found.
1,t2_bcxve1ah,Hungry-Move-6603,https://www.reddit.com/user/Hungry-Move-6603,t3_1lwyhny,https://www.reddit.com/r/lucknow/comments/1lwy...,Everyone is something in LKO,r/lucknow,2025-07-11T05:12:13+00:00,94,222,link,/r/lucknow/comments/1lwyhny/everyone_is_someth...,Born and raised in Delhi - I shifted to LKO in...


---

## Save the Posts Data in a CSV_File ( Give a name to it)

In [17]:
df.to_csv('{username}_posts.csv')

---

---

---

---

# Fetch Comments of the Users


In [18]:
client = AsyncClient(
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=10.0,
    follow_redirects=True
)

class log:
    @staticmethod
    def success(msg): print("[SUCCESS]", msg)
    @staticmethod
    def error(msg): print("[ERROR]", msg)


def parse_user_comments(response: Response) -> Dict:
    """Parse user comments from user profile comment page"""
    selector = Selector(response.text)
    data = []

    for box in selector.xpath("//div[@id='siteTable']/div[contains(@class, 'thing')]"):
        author = box.xpath("./@data-author").get()
        link = box.xpath("./@data-permalink").get()

        dislikes = box.xpath(".//span[contains(@class, 'dislikes')]/@title").get()
        upvotes = box.xpath(".//span[contains(@class, 'likes')]/@title").get()
        downvotes = box.xpath(".//span[contains(@class, 'unvoted')]/@title").get()

        comment_body = "".join(
            box.xpath(".//div[contains(@class, 'usertext-body')]/div/p//text()").getall()
        ).replace("\n", "").strip()

        data.append({
            "authorId": box.xpath("./@data-author-fullname").get(),
            "author": author,
            "authorProfile": f"https://www.reddit.com/user/{author}" if author else None,
            "commentId": box.xpath("./@data-fullname").get(),
            "commentLink": f"https://www.reddit.com{link}" if link else None,
            "commentBody": comment_body,
            "attachedCommentLinks": box.xpath(".//div[contains(@class, 'usertext-body')]/div/p/a/@href").getall(),
            "publishingDate": box.xpath(".//time/@datetime").get(),
            "dislikes": int(dislikes) if dislikes else None,
            "upvotes": int(upvotes) if upvotes else None,
            "downvotes": int(downvotes) if downvotes else None,
            "replyTo": {
                "postTitle": box.xpath(".//p[@class='parent']/a[@class='title']/text()").get(),
                "postLink": f"https://www.reddit.com{box.xpath('.//p[@class=\"parent\"]/a[@class=\"title\"]/@href').get()}",
                "postAuthor": box.xpath(".//p[@class='parent']/a[contains(@class, 'author')]/text()").get(),
                "postSubreddit": box.xpath("./@data-subreddit-prefixed").get(),    
            }
        })

    next_page_url = selector.xpath("//span[@class='next-button']/a/@href").get()
    return {"data": data, "url": next_page_url}


async def scrape_user_comments(username: str, sort: Union["new", "top", "controversial"] = "new", max_pages: int = None) -> List[Dict]:
    """Scrape comments from a Reddit user profile"""
    url = f"https://old.reddit.com/user/{username}/comments/?sort={sort}"
    response = await client.get(url)
    parsed = parse_user_comments(response)
    comment_data, next_page_url = parsed["data"], parsed["url"]

    while next_page_url and (max_pages is None or max_pages > 0):
        response = await client.get(next_page_url)
        parsed = parse_user_comments(response)
        comment_data.extend(parsed["data"])
        next_page_url = parsed["url"]
        if max_pages is not None:
            max_pages -= 1

    log.success(f"Scraped {len(comment_data)} comments from {username}")
    return comment_data


## Provide Username

In [19]:
reddit_username = 
comments = await scrape_user_comments(reddit_username, sort="new", max_pages=None)

[SUCCESS] Scraped 10 comments from Hungry-Move-6603


## Creating and Saving the Data Frame as a CSV_File

In [20]:
new_df = pd.DataFrame(comments)
new_df.to_csv('{username}_comments.csv',index = False)

In [21]:
# visualize the data frame
new_df.head(5)

Unnamed: 0,authorId,author,authorProfile,commentId,commentLink,commentBody,attachedCommentLinks,publishingDate,dislikes,upvotes,downvotes,replyTo
0,t2_bcxve1ah,Hungry-Move-6603,https://www.reddit.com/user/Hungry-Move-6603,t1_n2ybup0,https://www.reddit.com/r/nagpur/comments/1lyb0...,I was caught without helmet and license (close...,[],2025-07-13T19:53:17+00:00,0,0,1,"{'postTitle': 'A very odd experience', 'postLi..."
1,t2_bcxve1ah,Hungry-Move-6603,https://www.reddit.com/user/Hungry-Move-6603,t1_n2y7g0s,https://www.reddit.com/r/nagpur/comments/1lyb0...,Cops keep a civ around to discuss bribes.,[],2025-07-13T19:31:23+00:00,0,0,1,"{'postTitle': 'A very odd experience', 'postLi..."
2,t2_bcxve1ah,Hungry-Move-6603,https://www.reddit.com/user/Hungry-Move-6603,t1_n2vkdpb,https://www.reddit.com/r/IndiaUnfilter/comment...,Toh hum Noida or Ghaziabad se pahadio ko bhaga...,[],2025-07-13T10:31:41+00:00,0,0,1,{'postTitle': 'People are smoking hookah in th...
3,t2_bcxve1ah,Hungry-Move-6603,https://www.reddit.com/user/Hungry-Move-6603,t1_n2kh3aq,https://www.reddit.com/r/lucknow/comments/1lwb...,A menu easy to cook/process - healthy and quick.,[],2025-07-11T15:47:19+00:00,0,0,1,{'postTitle': 'Any Tiffin service providing hi...
4,t2_bcxve1ah,Hungry-Move-6603,https://www.reddit.com/user/Hungry-Move-6603,t1_n2ilsqh,https://www.reddit.com/r/lucknow/comments/1lwy...,Haha Delhi is hateable too but mostly those ar...,[],2025-07-11T08:46:21+00:00,0,0,1,"{'postTitle': 'Everyone is something in LKO', ..."
