In [10]:
import datetime
import os

import praw
import pymongo

def _utc_naive(epoch_seconds):
    """Convert a POSIX timestamp into a naive ``datetime`` expressed in UTC."""
    aware = datetime.datetime.fromtimestamp(epoch_seconds, datetime.timezone.utc)
    # Drop tzinfo so the value compares with naive bounds and serialises
    # without a "+00:00" suffix.
    return aware.replace(tzinfo=None)


def _author_name(author):
    """Return the author's name, or 'N/A' when the author is missing (e.g. deleted)."""
    return author.name if author else 'N/A'


def reddit_subreddit_scraper(sub_reddit, db_name, client_id, client_secret, user_agent, uri, start_date=None, end_date=None):
    """
    Scrape the top posts of a subreddit, and their comments, into MongoDB.

    Posts are written to the "RedditPosts" collection and comments to the
    "RedditComments" collection of the database named ``db_name``. Posts whose
    URL is already stored are skipped (together with their comments), as are
    comments whose id is already stored.

    Parameters:
        sub_reddit(String) = subreddit name
        db_name(String) = name of the MongoDB database to write to
        client_id(String) = the client ID given by the Reddit API
        client_secret(String) = the secret given by the Reddit API
        user_agent(String) = user agent string passed to praw.Reddit
        uri(String) = connection URI of the MongoDB deployment
        start_date(datetime) = earliest creation time to collect, as a naive
            UTC datetime. Defaults to 2005-01-01 (the year Reddit was founded).
        end_date(datetime) = latest creation time to collect, as a naive UTC
            datetime. Defaults to the current time.

    Returns:
        None. The results are only written to the database.
    """
    if start_date is None:
        start_date = datetime.datetime(2005, 1, 1)
    if end_date is None:
        # Post/comment timestamps below are naive UTC, so "now" must be too.
        end_date = _utc_naive(datetime.datetime.now(datetime.timezone.utc).timestamp())

    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent,
    )

    client = pymongo.MongoClient(uri)
    try:
        db = client[db_name]
        posts_collection = db["RedditPosts"]
        comments_collection = db["RedditComments"]

        # limit=None asks PRAW for as many posts as the listing will provide.
        # NOTE(review): Reddit listings are generally capped (around 1000
        # items), so this may not reach every post in the subreddit - confirm.
        for post in reddit.subreddit(sub_reddit).top(limit=None):
            post_url = f"https://www.reddit.com{post.permalink}"

            # Skip posts that were stored by an earlier run.
            if posts_collection.find_one({"post_url": post_url}):
                continue

            post_created = _utc_naive(post.created_utc)
            if post_created > end_date or post_created < start_date:
                continue

            posts_collection.insert_one({
                "title": post.title,
                "post_id": post.id,
                "author": _author_name(post.author),
                "num_comments": post.num_comments,
                "score": post.score,
                "attachment_file": post.url,
                "timestamp": post_created.isoformat(),
                "subreddit_name": sub_reddit,
                "post_url": post_url,
            })

            # Keyed by comment id so that duplicates within this post collapse
            # while the listing order is preserved.
            pending_comments = {}
            for comment in post.comments.list():
                # The flattened list may hold entries that are not Comment
                # objects; they carry none of the fields stored below.
                if not isinstance(comment, praw.models.Comment):
                    continue
                if comment.id in pending_comments:
                    continue

                comment_created = _utc_naive(comment.created_utc)
                if comment_created > end_date or comment_created < start_date:
                    continue

                # Database lookup last: it is the most expensive check.
                if comments_collection.find_one({"id": comment.id}):
                    continue

                pending_comments[comment.id] = {
                    "id": comment.id,
                    "post_id": post.id,
                    "author": _author_name(comment.author),
                    "score": comment.score,
                    "num_replies": len(comment.replies),
                    "text": comment.body,
                    "subreddit_name": sub_reddit,
                    "timestamp": comment_created.isoformat(),
                }

            # insert_many rejects an empty document list, so only call it when
            # there is something to write.
            if pending_comments:
                comments_collection.insert_many(list(pending_comments.values()))
    finally:
        # Release the connection even when scraping fails part-way through.
        client.close()

In [9]:
# Scraper configuration for this run.
sub_reddit = "ChatGPT"
db_name = "RedditData"

# Credentials are read from the environment so that they are never stored in
# the notebook. A missing variable raises KeyError before any network call.
# NOTE(review): the values that were previously hard-coded in this cell should
# be treated as leaked and rotated.
client_id = os.environ["REDDIT_CLIENT_ID"]
client_secret = os.environ["REDDIT_CLIENT_SECRET"]
user_agent = "Bombe_Cerise"
uri = os.environ["MONGODB_URI"]

# Only posts and comments created between these two dates are collected.
start_date = datetime.datetime(2020, 3, 17)
end_date = datetime.datetime(2023, 3, 17)

reddit_subreddit_scraper(sub_reddit, db_name, client_id, client_secret, user_agent, uri, start_date, end_date)

NameError: name 'client_id' is not defined