## Reddit
- Using Pushshift, via PSAW
- Useful for retrieving large amounts of static, historical Reddit submissions/comments
- Example script below runs a query across submissions, and then retrieves all comments within the submission
- This works well when the topic of the submission holds across its comments, but it misses passing references and relevant comments that appear in non-matching submissions
- Would rather not rake the search across both submissions and comments, so only the first (probably stronger) assumption has been implemented: that the initial submission determines the topic of its comments

In [None]:
import math
from datetime import datetime, timedelta

import pandas as pd
from tqdm import tqdm

from psaw import PushshiftAPI

# --- Collection settings ---------------------------------------------------
# Overall cap on submissions across all subreddits; it is split evenly
# between the subreddits further down (see `per_subreddit_limit`).
TOTAL_SUBMISSION_LIMIT = 1000
# Length of the look-back window, in days.
DAY_DELTA = 30

pushshift_client = PushshiftAPI()
# Start of the look-back window as whole epoch seconds; the search cells pass
# this value as `after=`.
last_month_start_epoch = int((datetime.now() - timedelta(days=DAY_DELTA)).timestamp())
# Query string passed as `q=` to the submission search.
# NOTE(review): an empty string presumably means "no keyword filter" -- confirm
# against the Pushshift API behaviour.
reddit_query = ""

# Subreddits to search. Each name must appear exactly once: a repeated entry
# is queried twice (duplicating its submissions downstream) and also shrinks
# the per-subreddit share computed below.
subreddits = [
    "fiaustralia",
    "ASX_Bets",
    "ausstocks",
    "AusProperty",
    "AusFinance",
    "AusEcon",
    "AusPropertyChat",
    "ASX",
    "AustralianAccounting",
]
# Even share of the overall cap, rounded up so the total is never below the cap.
per_subreddit_limit = math.ceil(TOTAL_SUBMISSION_LIMIT / len(subreddits))


In [None]:
# Collect up to `per_subreddit_limit` submissions from every subreddit and
# combine them into a single DataFrame, `all_subreddit_submissions`.
# The per-subreddit frames are accumulated under their own name so that
# `all_subreddit_submissions` only ever refers to the final DataFrame.
submission_frames = []

for subreddit in tqdm(
    subreddits,
    desc=f"Collecting {per_subreddit_limit} submissions for each subreddit..",
):
    # apply search across each subreddit
    submission_results = pushshift_client.search_submissions(
        q=reddit_query,
        after=last_month_start_epoch,
        subreddit=subreddit,
        # NOTE(review): "parent_id" and "link_id" look like comment fields --
        # confirm whether they are needed for submissions.
        filter=[
            "url",
            "author",
            "id",
            "parent_id",
            "link_id",
            "title",
            "subreddit",
        ],
        limit=per_subreddit_limit,
    )
    # One row per result, built from each result's `d_` attribute
    # (presumably PSAW's plain-dict view of the record -- confirm).
    submission_frames.append(pd.DataFrame([hit.d_ for hit in submission_results]))

# Each per-subreddit frame starts its index at 0, so renumber on concat to
# avoid repeated index labels in the combined table.
all_subreddit_submissions = pd.concat(submission_frames, ignore_index=True)

# Keep each submission once, so comments are not fetched twice for the same
# submission later on. The column check covers the case where no search
# returned any rows (the combined frame then has no "id" column).
if "id" in all_subreddit_submissions.columns:
    all_subreddit_submissions = all_subreddit_submissions.drop_duplicates(
        subset="id"
    ).reset_index(drop=True)


In [None]:
# Display the combined submissions DataFrame as this cell's output.
all_subreddit_submissions


In [None]:
# For every collected submission, retrieve all of its comments and stack them
# beneath the submission's own row. The result is one combined DataFrame per
# submission, accumulated in `submissions_and_comments`.
submissions_and_comments = []

# Fields requested for each comment; a fresh list is built from this tuple
# for every API call.
comment_fields = ("url", "author", "id", "parent_id", "title", "body", "subreddit")

submission_rows = tqdm(
    all_subreddit_submissions.iterrows(),
    total=len(all_subreddit_submissions),
    desc="Collecting submission comments..",
)

for _, submission in submission_rows:
    comment_hits = pushshift_client.search_comments(
        after=last_month_start_epoch,
        subreddit=submission.subreddit,
        link_id=submission.id,
        filter=list(comment_fields),
    )
    # One row per comment, built from each result's `d_` attribute.
    comment_frame = pd.DataFrame([hit.d_ for hit in comment_hits])

    # Turn the submission Series into a single-row frame so it can sit on top
    # of its comments; sort=True orders the union of columns alphabetically.
    submission_frame = submission.to_frame().T
    submissions_and_comments.append(
        pd.concat([submission_frame, comment_frame], sort=True)
    )


In [None]:
# Stack every per-submission frame (submission row + its comments) into one
# table, with columns sorted alphabetically.
stacked_records = pd.concat(submissions_and_comments, sort=True)

# date formatting: convert the epoch-seconds `created` value of each row into
# a datetime object.
publish_dates = stacked_records.created.apply(datetime.fromtimestamp)

# Attach the converted dates and drop the raw timestamp columns.
all_submissions_and_comments = stacked_records.assign(
    document_publish_date=publish_dates
).drop(columns=["created", "created_utc"])
