In [None]:
import requests
import csv
from datetime import datetime

# Root address of the Discourse forum; every request URL below is built from it.
BASE_URL = "https://discourse.onlinedegree.iitm.ac.in"

# Headers sent with every request made through `requests.get`.
# NOTE(review): the values look like they were copied from a logged-in Chrome
# browser session issuing an XHR call -- confirm before relying on any of them.
headers = {
    # Content negotiation
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'accept-language': 'en-IN,en-US;q=0.9,en-GB;q=0.8,en;q=0.7,hi;q=0.6',
    # Discourse-specific session flags
    'discourse-logged-in': 'true',
    'discourse-present': 'true',
    'priority': 'u=1, i',
    # Browser client hints (single quotes avoid escaping the embedded double quotes)
    'sec-ch-ua': '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Chrome OS"',
    # Fetch metadata
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    # Credentials: both values are the literal placeholder "removed".
    # Presumably they were redacted and must be replaced with real session
    # values for authenticated access -- TODO confirm.
    'cookie': 'removed',
    'x-csrf-token': 'removed',
    'x-requested-with': 'XMLHttpRequest',
}

def fetch_topic_details(topic_id, timeout=30):
    """Fetch a topic and return the username of its original poster.

    Requests ``{BASE_URL}/t/{topic_id}.json`` with the module-level
    ``headers`` and reads the username from the first entry of
    ``post_stream.posts`` in the JSON payload.

    Args:
        topic_id: Identifier of the topic; interpolated into the URL.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole export.

    Returns:
        The username of the first post, or the string ``"Unknown"`` when the
        request fails, the server answers with a non-200 status, or the
        payload does not contain a usable first post.
    """
    topic_url = f"{BASE_URL}/t/{topic_id}.json"

    try:
        topic_response = requests.get(topic_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # A network problem on one topic should not abort the whole run.
        print(f"Failed to fetch topic details for {topic_id}: {exc}")
        return "Unknown"

    if topic_response.status_code != 200:
        print(f"Failed to fetch topic details for {topic_id}: {topic_response.status_code}")
        return "Unknown"

    try:
        # The original poster is the author of the first post in the stream.
        first_post = topic_response.json()['post_stream']['posts'][0]
        return first_post['username']
    except (ValueError, KeyError, IndexError, TypeError):
        # ValueError covers a body that is not valid JSON; the others cover a
        # payload with a missing or empty post stream.
        print(f"Failed to fetch topic details for {topic_id}: unexpected response format")
        return "Unknown"

def fetch_topics(tag_name, writer, year=2024, timeout=30):
    """Fetch every topic carrying a tag and write the matching ones to CSV.

    Walks the paginated listing at ``{BASE_URL}/tag/{tag_name}.json`` and,
    for each topic created in ``year``, looks up the original poster via
    ``fetch_topic_details`` and writes one row to ``writer``. Row columns,
    in order: tag, title, topic id, created-at timestamp, posts count, like
    count, views, reply count, original poster, comma-joined topic tags.

    Pagination stops when a page has no topics, when the listing carries no
    ``more_topics_url``, or when a request fails.

    Args:
        tag_name: Tag identifier, interpolated into the listing URL.
        writer: Object exposing ``writerows`` (e.g. a ``csv.writer``).
        year: Only topics whose ``created_at`` falls in this year are kept.
        timeout: Seconds to wait for each listing request before giving up.

    Returns:
        The number of rows written for this tag.
    """
    print(f"Fetching topics for tag: {tag_name}")

    # The first page is requested without a ?page parameter; later pages use
    # ?page=1, ?page=2, ...
    page = 0
    total_records = 0

    while True:
        if page == 0:
            tag_topics_url = f"{BASE_URL}/tag/{tag_name}.json"
        else:
            tag_topics_url = f"{BASE_URL}/tag/{tag_name}.json?page={page}"

        print(f"Fetching page {page}: {tag_topics_url}")

        try:
            tag_response = requests.get(tag_topics_url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Treat a network failure like a bad status: give up on this tag
            # but let the caller continue with the next one.
            print(f"Failed to fetch topics for {tag_name} on page {page}: {exc}")
            break

        if tag_response.status_code != 200:
            print(f"Failed to fetch topics for {tag_name} on page {page}: {tag_response.status_code}")
            break

        topic_list = tag_response.json()['topic_list']
        topics = topic_list['topics']

        if not topics:
            print(f"No topics found on page {page}. Exiting pagination.")
            break

        rows = []
        for topic in topics:
            created_at = topic['created_at']
            # fromisoformat accepts timestamps with or without fractional
            # seconds; the trailing "Z" is rewritten to an explicit UTC
            # offset because older Python versions reject it.
            topic_date = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
            if topic_date.year != year:
                continue

            topic_id = topic['id']
            rows.append([
                tag_name,
                topic['title'],
                topic_id,
                created_at,
                topic['posts_count'],
                topic.get('like_count', 0),
                topic.get('views', 0),
                topic.get('reply_count', 0),
                # One extra request per topic to resolve the original poster.
                fetch_topic_details(topic_id),
                ', '.join(topic.get('tags', [])),
            ])

        writer.writerows(rows)
        total_records += len(rows)

        # The listing advertises further pages through 'more_topics_url'.
        if not topic_list.get('more_topics_url'):
            print(f"No more topics for {tag_name}. Finished fetching.")
            break

        page += 1

    return total_records

# Fetch the site's tag index in JSON format. The timeout keeps a stalled
# connection from hanging the run indefinitely.
tags_url = f"{BASE_URL}/tags.json"
response = requests.get(tags_url, headers=headers, timeout=30)

# Open the CSV file for writing; newline='' lets the csv module control line endings.
with open('discourse_tag_topics_2024.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['Tag', 'Topic Title', 'Topic ID', 'Created At', 'Posts Count', 'Like Count', 'Views', 'Reply Count', 'Original Poster', 'Topic Tags'])

    if response.status_code == 200:
        tags_data = response.json()

        # Collect top-level tags first, then the tags nested in tag groups.
        # 'extras' / 'tag_groups' are read defensively so a response without
        # tag groups does not raise KeyError.
        tag_groups = (tags_data.get('extras') or {}).get('tag_groups') or []
        candidate_tags = list(tags_data.get('tags', []))
        for tag_group in tag_groups:
            candidate_tags.extend(tag_group.get('tags', []))

        # Ids of tags already exported, so a tag listed both at top level and
        # inside a group is only fetched once.
        processed_tags = set()

        for tag in candidate_tags:
            tag_id = tag['id']
            if tag_id in processed_tags:
                print(f"Skipping tag {tag_id} as it has already been processed.")
                continue
            fetch_topics(tag_id, writer)
            processed_tags.add(tag_id)  # Mark this tag as processed
    else:
        print(f"Failed to fetch tags: {response.status_code}")

print("Data written to discourse_tag_topics_2024.csv")

Fetching topics for tag: about-me
Fetching topics for tag: purpose
Fetching topics for tag: intropython
Fetching topics for tag: compthink
Fetching topics for tag: week4
Fetching topics for tag: communication
Fetching topics for tag: error
Fetching topics for tag: features
Fetching topics for tag: exploration
Fetching topics for tag: introduction
Fetching topics for tag: best-practices
Fetching topics for tag: projects
Fetching topics for tag: challenge-question
Fetching topics for tag: intermediate-level
Fetching topics for tag: evaluation
Fetching topics for tag: annoncement
Fetching topics for tag: rules
Fetching topics for tag: competition
Fetching topics for tag: week-5-feedback
Fetching topics for tag: ppa
Fetching topics for tag: practice-material
Fetching topics for tag: capstoneproject
Fetching topics for tag: database-management-
Fetching topics for tag: feedback-mlf-course
Fetching topics for tag: statistics2-feedback
Fetching topics for tag: course-feedback
Fetching topics 