## Dataset Download

First, we download all posts tagged 'nlp' and fetch them in JSON format. During this retrieval, we also pull each post's accepted answer ID so that we can later tell whether a post has been answered or not.


In [None]:
# Mount Google Drive at /content/drive; the dataset files used in this
# notebook are read from and written to paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import time
from datetime import datetime, timezone

import requests
from bs4 import BeautifulSoup

# Helper: turn an HTML fragment into single-line plain text
def clean_html(body):
    """Return the text content of *body* with all markup removed.

    The fragment is parsed with BeautifulSoup's ``html.parser`` backend.
    Every newline in the extracted text is replaced by a single space and
    leading/trailing whitespace is stripped; runs of spaces inside the
    text are left as they are.
    """
    plain = BeautifulSoup(body, "html.parser").get_text()
    return plain.replace("\n", " ").strip()

# Download the newest Stack Overflow questions tagged 'nlp' through the
# Stack Exchange API and store a trimmed-down record per question as JSON.

# Base API URL
BASE_URL = "https://api.stackexchange.com/2.3/questions"

# API Parameters
params = {
    "order": "desc",
    "sort": "creation",
    "tagged": "nlp",
    "site": "stackoverflow",
    "filter": "withbody",
    "pagesize": 100,  # Max per request
    "page": 1,  # Start from page 1
    "key": "#######################" ## API Key from Stack Exchange
}

# Variables to store results
questions = []
total_needed = 20100  # Target number of questions

while len(questions) < total_needed:
    print(f"Fetching page {params['page']}...")

    # Make API Request
    response = requests.get(BASE_URL, params=params)

    # Handle API errors. The status is checked BEFORE decoding the body so
    # that a non-JSON error page is reported instead of raising in .json().
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        break

    data = response.json()

    # Handle backoff if present
    if "backoff" in data:
        backoff_time = data["backoff"]
        print(f"Backoff detected! Sleeping for {backoff_time} seconds...")
        time.sleep(backoff_time)

    # Extract required fields
    items = data.get("items", [])
    for item in items:
        # Timezone-aware UTC conversion; formats exactly like the deprecated
        # datetime.utcfromtimestamp() did.
        created = datetime.fromtimestamp(item.get("creation_date", 0), tz=timezone.utc)
        questions.append({
            "tags": item.get("tags", []),
            "title": item.get("title", ""),
            "body": clean_html(item.get("body", "")),  # Clean HTML tags and remove \n
            "creation_date": created.strftime('%Y-%m-%d %H:%M:%S'),
            "view_count": item.get("view_count", 0),
            "score": item.get("score", 0),
            "accepted_answer_id": item.get("accepted_answer_id", None),  # None when no answer was accepted
        })

    if data.get("has_more", False):
        # Increase page number for next request
        params["page"] += 1
    elif items:
        # Paging is exhausted for the current window: restart from page 1
        # with an upper date bound just below the oldest question seen.
        print("No more pages. Switching to time-based pagination...")
        params["todate"] = items[-1]["creation_date"] - 1
        params["page"] = 1
    else:
        # An empty page with nothing further means there are no older
        # questions left. Stop here instead of shrinking `todate` by one
        # second per request forever.
        print("No more questions available. Stopping.")
        break

    # Respect API rate limits
    time.sleep(1)  # Avoid hitting API quota too fast

# Trim excess questions if needed
questions = questions[:total_needed]

# Print Results Summary
print(f"\nTotal questions fetched: {len(questions)}")

# Save results to a JSON file
with open("/content/drive/MyDrive/ANLP_A2/nlp_questions_only.json", "w", encoding="utf-8") as f:
    json.dump(questions, f, indent=2)

print("Data saved to nlp_questions_only.json ✅")


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Fetching page 20...
Fetching page 21...
Fetching page 22...
Fetching page 23...
Fetching page 24...
Fetching page 25...
Fetching page 26...
Fetching page 27...
Fetching page 28...
Fetching page 29...
Fetching page 30...
Fetching page 31...
Fetching page 32...
Fetching page 33...
Fetching page 34...
Fetching page 35...
Fetching page 36...
Fetching page 37...
Fetching page 38...
Fetching page 39...
Fetching page 40...
Fetching page 41...
Fetching page 42...
Fetching page 43...
Fetching page 44...
Fetching page 45...
Fetching page 46...
Fetching page 47...
Fetching page 48...
Fetching page 49...
Fetching page 50...
Fetching 

A total of 20100 question posts were downloaded, along with their metadata such as tags, title, creation date, view count, etc.

After pulling the posts tagged 'nlp', we keep only the posts that have an accepted answer, because in this study we will use the combination of title, body, and answer for document classification.

In [None]:
# Keep only the questions that carry an accepted answer
def filter_questions_with_accepted_answers(input_file, output_file):
    """Copy the questions that have an accepted answer to a new JSON file.

    Args:
        input_file: Path of a JSON file holding a list of question dicts.
        output_file: Path the filtered list is written to (4-space indent).

    A question is kept when its 'accepted_answer_id' entry exists and is
    not None; the relative order of the kept questions is preserved.
    """
    with open(input_file, 'r') as src:
        all_questions = json.load(src)

    answered = [
        entry for entry in all_questions
        if entry.get('accepted_answer_id') is not None
    ]

    with open(output_file, 'w') as dst:
        json.dump(answered, dst, indent=4)

# Run the filter on the downloaded questions; both files live on the mounted Drive
input_file = '/content/drive/MyDrive/ANLP_A2/nlp_questions_only.json'  # all downloaded questions
output_file = '/content/drive/MyDrive/ANLP_A2/nlp_questions_with_accepted_answers.json'  # filtered result
filter_questions_with_accepted_answers(input_file, output_file)

# Re-read the filtered file and report how many questions were kept
with open(output_file, 'r') as output:
    output_data = json.load(output)
print(f"Length of output_file: {len(output_data)}")


Length of output_file: 8165


Finally, using the file of posts with accepted answers, we request the answer of each post one by one through the Stack Exchange API. A total of 8165 questions with answers were retrieved.

In [None]:
# Helper: turn an HTML fragment into single-line plain text
# (re-declared in this cell; same behaviour as the earlier definition)
def clean_html(body):
    """Return the text content of *body* with all markup removed.

    The fragment is parsed with BeautifulSoup's ``html.parser`` backend.
    Every newline in the extracted text is replaced by a single space and
    leading/trailing whitespace is stripped; runs of spaces inside the
    text are left as they are.
    """
    plain = BeautifulSoup(body, "html.parser").get_text()
    return plain.replace("\n", " ").strip()

# Function to fetch accepted answer for a given answer_id
def fetch_answer(answer_id, api_key):
    """Fetch one Stack Overflow answer and return its cleaned text and score.

    Args:
        answer_id: ID of the answer to request from the Stack Exchange API.
        api_key: Stack Exchange API key sent with the request.

    Returns:
        A ``(text, score)`` tuple, where ``text`` is the answer body passed
        through ``clean_html`` and ``score`` is the answer's score (0 when
        the response carries none). ``(None, None)`` is returned, after
        printing a message, when the request fails, the HTTP status is not
        200, no answer is returned, or the answer has no body.
    """
    url = f"https://api.stackexchange.com/2.3/answers/{answer_id}"
    # Let requests build and URL-encode the query string.
    query = {
        "order": "desc",
        "sort": "activity",
        "site": "stackoverflow",
        "key": api_key,
        "filter": "withbody",
    }

    # A timeout keeps one stalled connection from hanging the whole run, and
    # network failures are reported the same way as every other failure here
    # so a single hiccup does not abort a long-running loop in the caller.
    try:
        response = requests.get(url, params=query, timeout=30)
    except requests.RequestException as exc:
        print(f"Error fetching answer {answer_id}: {exc}")
        return None, None

    if response.status_code != 200:
        print(f"Error fetching answer {answer_id}: {response.status_code}")
        return None, None

    data = response.json()

    # Honour the API's backoff request, as the question download loop does.
    backoff_time = data.get("backoff")
    if backoff_time:
        print(f"Backoff detected! Sleeping for {backoff_time} seconds...")
        time.sleep(backoff_time)

    # Check if the response contains items (answers)
    items = data.get("items")
    if not items:
        print(f"No items found for answer_id {answer_id}")
        return None, None

    answer = items[0]

    # Check if the body exists in the answer
    if 'body' not in answer:
        print(f"Answer body not found for answer_id {answer_id}")
        return None, None

    cleaned_answer = clean_html(answer['body'])
    votes = answer.get('score', 0)  # Get the vote count, default to 0 if not found
    print('Done')
    return cleaned_answer, votes

# Read the questions that have an accepted answer
with open("/content/drive/MyDrive/ANLP_A2/nlp_questions_with_accepted_answers.json", "r") as file:
    data = json.load(file)

# Stack Exchange API key sent with every answer request
# NOTE(review): this key is committed in plain text, while the key in the
# question download cell is masked. Confirm whether it should be masked too.
api_key = "rl_WQFnNLX6hFZoyWkU77T9tFiNg"

# Look up each question's accepted answer, one request per question
for question in data:
    accepted_answer, votes = fetch_answer(question['accepted_answer_id'], api_key)

    # Questions whose answer could not be fetched (or is empty) stay unchanged
    if accepted_answer:
        question.update(
            accepted_answer=accepted_answer,
            accepted_answer_votes=votes,
        )

    # Pause one second between requests to avoid hitting API quota too fast
    time.sleep(1)

# Write the enriched questions to disk
# NOTE(review): this is a relative path, unlike the other files which are
# stored under /content/drive. Confirm this location is intended.
with open("questions_and_answers.json", "w") as file:
    json.dump(data, file, indent=4)

# Print the updated JSON to verify
#print(json.dumps(data, indent=4))
