In [None]:
import requests
import csv
from datetime import datetime

# Base URL of the Discourse site; every request URL in this script is built from it.
BASE_URL = "https://discourse.onlinedegree.iitm.ac.in"

# HTTP headers sent with every request made by this script.
# The values mimic an XHR issued by a logged-in Chrome session
# (see the sec-ch-ua / discourse-logged-in / x-requested-with entries).
# NOTE(review): "cookie" and "x-csrf-token" hold the placeholder string
# "removed"; presumably real session values must be substituted before
# running if the site requires authentication — confirm.
headers = {
    "accept": "application/json, text/javascript, */*; q=0.01",
    "accept-language": "en-IN,en-US;q=0.9,en-GB;q=0.8,en;q=0.7,hi;q=0.6",
    "discourse-logged-in": "true",
    "discourse-present": "true",
    "priority": "u=1, i",
    "sec-ch-ua": "\"Chromium\";v=\"130\", \"Google Chrome\";v=\"130\", \"Not?A_Brand\";v=\"99\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Chrome OS\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "cookie": "removed",  # placeholder: session cookie stripped from the source
    "x-csrf-token": "removed",  # placeholder: CSRF token stripped from the source
    "x-requested-with": "XMLHttpRequest",
}

def fetch_users(user_ids):
    """Look up the username for each user ID in *user_ids*.

    One GET request is made per distinct ID against
    ``{BASE_URL}/u/{user_id}.json`` using the module-level ``headers``.

    Args:
        user_ids: Iterable of hashable user identifiers; duplicates are
            collapsed so each identifier is requested once.

    Returns:
        A dict mapping every distinct identifier to its username, or to the
        string ``"Unknown"`` when the lookup did not yield one (non-200
        status, non-JSON body, or a payload without ``user.username``).

    Raises:
        requests.RequestException: On connection errors or when a request
            exceeds the timeout; these are not swallowed so the caller can
            decide whether to abort the scrape.
    """
    # NOTE(review): Discourse's /u/<x>.json route is normally keyed by
    # username; numeric IDs may not resolve there — verify against the site.
    users = {}
    for user_id in set(user_ids):
        # Without a timeout a single stalled connection blocks forever.
        user_response = requests.get(
            f"{BASE_URL}/u/{user_id}.json", headers=headers, timeout=30
        )

        username = "Unknown"
        if user_response.status_code == 200:
            try:
                user_data = user_response.json()
            except ValueError:
                # A 200 with a non-JSON body (e.g. an HTML login page) is
                # treated like any other failed lookup.
                user_data = None
            if isinstance(user_data, dict):
                user_info = user_data.get('user') or {}
                username = user_info.get('username') or "Unknown"

        users[user_id] = username
    return users

def fetch_topics(tag_name, writer, user_cache):
    """Fetch the topics created in 2024 for one tag and write them to the CSV.

    Pages of ``{BASE_URL}/tag/{tag_name}.json`` are requested one after
    another until a page has no topics or a request returns a non-200
    status (which is reported on stdout).

    Args:
        tag_name: Tag identifier interpolated into the request URL and
            written to the first CSV column.
        writer: Object with a ``writerow(list)`` method (a ``csv.writer``).
        user_cache: Dict mapping user ID -> username. It is read to avoid
            repeated lookups and updated in place with every newly resolved
            user, so it should be shared across calls.

    Each row written is:
    ``[tag, title, topic id, created_at, posts_count, like_count, views,
    reply_count, original poster username, comma-joined tags]``.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    print(f"Fetching topics for tag: {tag_name}")
    # NOTE(review): Discourse topic lists are usually paginated from page=0;
    # starting at 1 may skip the first page — kept as-is, confirm on the site.
    page = 1
    while True:
        tag_topics_url = f"{BASE_URL}/tag/{tag_name}.json?page={page}"
        # Without a timeout a single stalled connection blocks forever.
        tag_response = requests.get(tag_topics_url, headers=headers, timeout=30)

        if tag_response.status_code != 200:
            print(f"Failed to fetch topics for {tag_name}: {tag_response.status_code}")
            break

        topics_data = tag_response.json()
        topics = topics_data['topic_list']['topics']
        if not topics:
            break  # Exit if no more topics on the current page

        # Presumably the payload carries a top-level `users` list of
        # {id, username} entries, as Discourse topic lists normally do —
        # TODO confirm for this endpoint. When present it resolves posters
        # without an extra request each; when absent this loop is a no-op.
        for user in topics_data.get('users') or []:
            if user.get('id') is not None and user.get('username'):
                user_cache[user['id']] = user['username']

        rows = []
        for topic in topics:
            created_at = topic['created_at']
            topic_date = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%S.%fZ")
            if topic_date.year != 2024:
                continue

            # The first entry of `posters` is taken as the original poster.
            posters = topic.get('posters', [])
            original_poster_id = posters[0].get('user_id') if posters else None

            rows.append([
                tag_name,
                topic['title'],
                topic['id'],
                created_at,
                topic['posts_count'],
                topic.get('like_count', 0),
                topic.get('views', 0),
                topic.get('reply_count', 0),
                # Column 8 temporarily holds the poster's ID; it is swapped
                # for the username just before the row is written.
                original_poster_id,
                ', '.join(topic.get('tags', [])),
            ])

        # Only look up IDs the cache cannot answer yet. Failed lookups are
        # cached as "Unknown" by fetch_users and therefore not retried.
        missing_ids = {
            row[8] for row in rows
            if row[8] is not None and row[8] not in user_cache
        }
        if missing_ids:
            user_cache.update(fetch_users(missing_ids))

        for row in rows:
            row[8] = user_cache.get(row[8], "Unknown")
            writer.writerow(row)

        page += 1  # Increment to the next page

# Fetch the tags in JSON format.
tags_url = f"{BASE_URL}/tags.json"
# Without a timeout a stalled connection would block the script forever.
response = requests.get(tags_url, headers=headers, timeout=30)

# Open CSV file for writing. The file and its header row are created even
# when the tag request failed, so downstream readers always find a valid CSV.
with open('Dataset_discourse_tag_topics_2024.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['Tag', 'Topic Title', 'Topic ID', 'Created At', 'Posts Count', 'Like Count', 'Views', 'Reply Count', 'Original Poster', 'Topic Tags'])

    if response.status_code == 200:
        tags_data = response.json()

        user_cache = {}  # user ID -> username, shared by all fetch_topics calls
        seen_tag_ids = set()  # tags already scraped, to avoid duplicate rows

        # Tags listed directly, followed by tags nested in tag groups.
        # `extras.tag_groups` is read defensively: a payload without tag
        # groups simply contributes no grouped tags instead of raising.
        tag_groups = (tags_data.get('extras') or {}).get('tag_groups') or []
        grouped_tags = [tag for tag_group in tag_groups for tag in tag_group['tags']]

        for tag in [*tags_data['tags'], *grouped_tags]:
            tag_id = tag['id']
            if tag_id in seen_tag_ids:
                continue  # same tag listed twice would duplicate its rows
            seen_tag_ids.add(tag_id)
            fetch_topics(tag_id, writer, user_cache)
    else:
        print(f"Failed to fetch tags: {response.status_code}")

# Report success only when the tag list was actually retrieved.
if response.status_code == 200:
    print("Data written to Dataset_discourse_tag_topics_2024.csv")


Fetching topics for tag: about-me
Fetching topics for tag: purpose
Fetching topics for tag: intropython
Fetching topics for tag: compthink
Fetching topics for tag: week4
Fetching topics for tag: communication
Fetching topics for tag: error
Fetching topics for tag: features
Fetching topics for tag: exploration
Fetching topics for tag: introduction
Fetching topics for tag: best-practices
Fetching topics for tag: projects
Fetching topics for tag: challenge-question
Fetching topics for tag: intermediate-level
Fetching topics for tag: evaluation
Fetching topics for tag: annoncement
Fetching topics for tag: rules
Fetching topics for tag: competition
Fetching topics for tag: week-5-feedback
Fetching topics for tag: ppa
Fetching topics for tag: practice-material
Fetching topics for tag: capstoneproject
Fetching topics for tag: database-management-
Fetching topics for tag: feedback-mlf-course
Fetching topics for tag: statistics2-feedback
Fetching topics for tag: course-feedback
Fetching topics 