In [4]:
import datetime
import os

import praw
import pymongo
import requests

# Seconds to wait for the Stack Exchange API before giving up on a request,
# so a stalled connection cannot hang the notebook indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def _to_epoch_seconds(moment):
    """Convert a date-like value to integer Unix epoch seconds (None passes through)."""
    if moment is None:
        return None
    if isinstance(moment, datetime.datetime):
        # Naive datetimes are interpreted as UTC so the result does not depend
        # on the timezone of the machine running the notebook.
        if moment.tzinfo is None:
            moment = moment.replace(tzinfo=datetime.timezone.utc)
        return int(moment.timestamp())
    if isinstance(moment, datetime.date):
        midnight = datetime.datetime(
            moment.year, moment.month, moment.day, tzinfo=datetime.timezone.utc
        )
        return int(midnight.timestamp())
    # Anything else is assumed to already be an epoch value (int, float or numeric string).
    return int(moment)


def _fetch_items(url, params, headers):
    """GET a Stack Exchange API URL and return its ``items`` list, raising on an error payload."""
    response = requests.get(
        url, params=params, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS
    )
    payload = response.json()
    if "error_id" in payload or "items" not in payload:
        raise RuntimeError(
            "Stack Exchange API request to {} failed: {} ({})".format(
                url,
                payload.get("error_message", "no 'items' in response"),
                payload.get("error_name", "unknown_error"),
            )
        )
    return payload["items"]


def stackoverflow_scraper(tag, keyword, db_name, user_agent, uri, fromDate, toDate):
    """
    Search Stack Overflow for questions carrying a tag and store them, together
    with the profile of each question's owner, in a MongoDB database.

    Questions are written to the ``StackOverflowPosts`` collection and users to
    the ``StackOverflowUsers`` collection. Both are written as upserts keyed on
    ``question_id`` / ``user_id``, so running the scraper again updates existing
    documents instead of inserting duplicates.

    Only a single page of results (at most 100 questions) is requested.

    Parameters:
        tag(String) = tag name
        keyword(String or None) = text that must appear in the question title;
            sent as the API's ``intitle`` parameter. None or "" disables the filter.
        db_name(String) = what you want the database called
        user_agent(String) = value sent as the HTTP User-Agent header
        uri(String) = The link to the MongoDB database
        fromDate(Datetime) = What date to start scraping. Naive datetimes are
            treated as UTC; an epoch value or None is also accepted.
        toDate(Datetime) = The date to end scraping (same accepted types as fromDate)

    Returns:
        None

    Raises:
        RuntimeError: if the API answers with an error payload or without ``items``.
        KeyError: if a returned question lacks one of the stored fields.
    """
    # Set the base URL for the API
    base_url = "https://api.stackexchange.com/2.3/"

    # Set the headers for the request
    headers = {"User-Agent": user_agent}

    # Setting the parameters for the request. Dates are converted to epoch
    # seconds; a raw datetime would be sent as "2020-01-01 00:00:00".
    params = {
        "site": "stackoverflow",
        "tagged": tag,
        "filter": "withbody",
        "pagesize": 100,
        "fromdate": _to_epoch_seconds(fromDate),
        "todate": _to_epoch_seconds(toDate),
        # NOTE(review): "answers" does not look like a documented parameter of
        # /search (it belongs to /search/advanced) - kept unchanged, confirm intent.
        "answers": "true",
    }
    if keyword:
        params["intitle"] = keyword
    params = {name: value for name, value in params.items() if value is not None}

    # Fetch before touching MongoDB so an API failure never opens a connection.
    questions = _fetch_items(f"{base_url}search", params, headers)

    client = pymongo.MongoClient(uri)
    try:
        db = client[db_name]
        collection = db["StackOverflowPosts"]
        collection_users = db["StackOverflowUsers"]

        # Owners already handled in this run; avoids one API call per question
        # when the same user asked several of them.
        seen_user_ids = set()

        # Loop through the questions and save them to the database
        for question in questions:
            # Timestamp formatting: naive UTC ISO-8601, e.g. "2020-01-01T00:00:00"
            created = datetime.datetime.fromtimestamp(
                question["creation_date"], tz=datetime.timezone.utc
            )
            formatted_timestamp = created.replace(tzinfo=None).isoformat()

            # Question dictionary to be added to the database
            question_dict = {
                "question_id": question["question_id"],
                "title": question["title"],
                "body": question["body"],
                "score": question["score"],
                "tags": question["tags"],
                "view_count": question["view_count"],
                "answer_count": question["answer_count"],
                "timestamp": formatted_timestamp,
            }
            collection.update_one(
                {"question_id": question_dict["question_id"]},
                {"$set": question_dict},
                upsert=True,
            )

            # The owner may be missing or may have no user_id; there is then
            # no profile to look up.
            user_id = question.get("owner", {}).get("user_id")
            if user_id is None or user_id in seen_user_ids:
                continue
            seen_user_ids.add(user_id)

            user_items = _fetch_items(
                f"{base_url}users/{user_id}", {"site": "stackoverflow"}, headers
            )
            # An empty result still yields a record, filled with placeholders.
            profile = user_items[0] if user_items else {}

            # Fields absent from the response are stored as "N/A" ("privileges"
            # as None, as before).
            # NOTE(review): privileges, question_count and answer_count were
            # observed to always be missing - presumably the default response
            # filter omits them; confirm against the API documentation.
            user_dict = {
                "username": profile.get("display_name", "N/A"),
                "user_id": user_id,
                "location": profile.get("location", "N/A"),
                "reputation": profile.get("reputation", "N/A"),
                "badges": profile.get("badge_counts", "N/A"),
                "experience": profile.get("creation_date", "N/A"),
                "privileges": profile.get("privileges"),
                "question_count": profile.get("question_count", "N/A"),
                "answer_count": profile.get("answer_count", "N/A"),
            }
            collection_users.update_one(
                {"user_id": user_id}, {"$set": user_dict}, upsert=True
            )
    finally:
        # Always release the connection, even if a request or write fails.
        client.close()


In [5]:
# Scrape configuration for this run.
tag = "tabnine"
keyword = None

db_name = "StackOverFlowData"
# The connection string embeds a username and password, so it is read from the
# environment instead of being saved in the notebook. The credentials that were
# previously hardcoded here should be considered leaked and rotated.
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string."
    )
username = "Da16King"  # passed to the scraper as its user_agent argument
fromDate = datetime.datetime(2020, 1, 1)
toDate = datetime.datetime(2023, 3, 23)

# Keyword arguments make it explicit which value feeds which parameter.
stackoverflow_scraper(
    tag=tag,
    keyword=keyword,
    db_name=db_name,
    user_agent=username,
    uri=uri,
    fromDate=fromDate,
    toDate=toDate,
)

KeyboardInterrupt: 