In [1]:
import praw
import time
import os

from psaw import PushshiftAPI # to use PSAW

# ---- Scrape configuration: edit these values before running the notebook ----
basecorpus = './my-dataset/' # root directory the scraped CSV files are written under
query = "(keyword_1)|(keyword_2)|..." # update keywords for query here
start_year, end_year = 2018, 2022 # first and last year to scrape (both included by the scraping loop)
subreddits = ['Switzerland'] # subreddits to scrape, one output folder per subreddit and year

# Pushshift client (PSAW); the scraping loop uses it to look up submission ids.
api = PushshiftAPI()

In [None]:
# Reddit API credentials for PRAW.
# They are read from environment variables so that secrets (password, client
# secret) are never typed into -- and saved with -- the notebook itself.
# Set these before starting the kernel:
#   REDDIT_USER_AGENT, REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET,
#   REDDIT_USERNAME, REDDIT_PASSWORD
# Any variable that is not set falls back to an empty string, which is the
# placeholder value this cell used previously.
reddit = praw.Reddit(
    user_agent=os.environ.get("REDDIT_USER_AGENT", ""),
    client_id=os.environ.get("REDDIT_CLIENT_ID", ""),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", ""),
    username=os.environ.get("REDDIT_USERNAME", ""),
    password=os.environ.get("REDDIT_PASSWORD", ""),
)

In [None]:
import pandas as pd
import datetime as dt

### BLOCK 1 ###
# For every (year, subreddit) pair:
#   1. ask Pushshift (PSAW) for the ids of submissions matching `query`
#      inside that calendar year,
#   2. fetch each submission with PRAW and record its metadata,
#   3. expand the submission's full comment tree and record every comment
#      (all comments are kept, not only those matching the query).
# Output layout:
#   <basecorpus><year>/<subreddit>/<year>-<subreddit>-submissions.csv
#   <basecorpus><year>/<subreddit>/<year>-<subreddit>-submission_<id>-comments.csv
# `total_comments` counts the comments collected during this run only;
# subreddits skipped because they were already scraped are not counted.
total_comments = 0

for year in range(start_year, end_year+1):
    action = "[Year] " + str(year)
    print(action)

    dirpath = basecorpus + str(year)
    os.makedirs(dirpath, exist_ok=True)

    # Epoch timestamps bounding the calendar year. The datetimes are built as
    # timezone-aware UTC values: calling .timestamp() on a naive datetime
    # interprets it in the machine's local timezone, which would shift the
    # whole window by the local UTC offset.
    ts_after = int(dt.datetime(year, 1, 1, tzinfo=dt.timezone.utc).timestamp())
    ts_before = int(dt.datetime(year+1, 1, 1, tzinfo=dt.timezone.utc).timestamp())

    ### BLOCK 2 ###
    for subreddit in subreddits:
        start_time = time.time()

        action = "\t[Subreddit] " + subreddit
        print(action)

        subredditdirpath = dirpath + '/' + subreddit
        submissions_csv_path = str(year) + '-' + subreddit + '-submissions.csv'

        # The submissions CSV is written last (BLOCK 8), so its presence marks
        # a completed scrape. Checking the file rather than the directory
        # means a run that was interrupted half-way is redone instead of
        # being skipped forever with partial data.
        if os.path.exists(subredditdirpath + '/' + submissions_csv_path):
            continue
        os.makedirs(subredditdirpath, exist_ok=True)

        ### BLOCK 3 ###
        # column-oriented accumulator: one list per CSV column
        submissions_dict = {
            "id" : [],
            "url" : [],
            "title" : [],
            "score" : [],
            "num_comments": [],
            "created_utc" : [],
            "selftext" : [],
            "upvoteratio": []
        }

        ### BLOCK 4 ###
        # use PSAW only to get id of submissions in time interval
        # (limit=100 caps the number of submissions requested per year/subreddit)
        gen = api.search_submissions(
            after=ts_after,
            before=ts_before,
            filter=['id'],
            subreddit=subreddit,
            limit=100,
            q = query
        )

        ### BLOCK 5 ###
        # use PRAW to get actual info and traverse comment tree
        for submission_psaw in gen:
            # use psaw here
            submission_id = submission_psaw.d_['id']
            # use praw from now on
            submission_praw = reddit.submission(id=submission_id)

            submissions_dict["id"].append(submission_praw.id)
            submissions_dict["url"].append(submission_praw.url)
            submissions_dict["title"].append(submission_praw.title)
            submissions_dict["score"].append(submission_praw.score)
            submissions_dict["num_comments"].append(submission_praw.num_comments)
            submissions_dict["created_utc"].append(submission_praw.created_utc)
            submissions_dict["selftext"].append(submission_praw.selftext)
            submissions_dict["upvoteratio"].append(submission_praw.upvote_ratio)

            ### BLOCK 6 ###
            submission_comments_csv_path = str(year) + '-' + subreddit + '-submission_' + submission_id + '-comments.csv'
            submission_comments_dict = {
                "comment_id" : [],
                "comment_parent_id" : [],
                "comment_body" : [],
                "comment_link_id" : [],
            }

            ### BLOCK 7 ###
            # extend the comment tree all the way
            # (limit=None resolves every "load more comments" placeholder)
            submission_praw.comments.replace_more(limit=None)
            # for each comment in flattened comment tree
            for comment in submission_praw.comments.list():
                submission_comments_dict["comment_id"].append(comment.id)
                submission_comments_dict["comment_parent_id"].append(comment.parent_id)
                submission_comments_dict["comment_body"].append(comment.body)
                submission_comments_dict["comment_link_id"].append(comment.link_id)
                total_comments += 1

            # for each submission save separate csv comment file
            pd.DataFrame(submission_comments_dict).to_csv(subredditdirpath + '/' + submission_comments_csv_path,
                                                          index=False)

        ### BLOCK 8 ###
        # single csv file with all submissions; written last because it also
        # serves as the "this year/subreddit is complete" marker checked above
        submissions_df = pd.DataFrame(submissions_dict)
        submissions_df.to_csv(subredditdirpath + '/' + submissions_csv_path,
                              index=False)

        action = f"\t\t[Info] Found submissions: {submissions_df.shape[0]}"
        print(action)

        action = f"\t\t[Info] Elapsed time: {time.time() - start_time: .2f}s"
        print(action)

print(total_comments)