In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import csv
import time
import os
from datetime import datetime, timedelta
import re
import json

def tidy_text(raw_text):
    """Trim *raw_text* and flatten its line breaks, tabs and non-breaking spaces.

    Leading and trailing whitespace is removed first. Every newline, tab or
    non-breaking-space character that remains is then replaced one-for-one
    with a plain space (runs are not collapsed).
    """
    to_space = str.maketrans({'\n': ' ', '\t': ' ', '\xa0': ' '})
    trimmed = raw_text.strip()
    return trimmed.translate(to_space)

def parse_date_string(date_input):
    """Normalise a forum date label to an ISO ``YYYY-MM-DD`` string.

    The input is converted to ``str``, stripped of left-to-right marks
    (U+200E) and surrounding whitespace, then matched against, in order:

    1. Absolute dates: ``DD-MM-YYYY HH:MM AM/PM`` or ``DD-MM-YYYY``.
    2. ISO 8601 dates/datetimes accepted by ``datetime.fromisoformat``
       (a trailing ``Z`` is treated as ``+00:00``).
    3. Relative labels, resolved against the current local time:
       ``today``, ``yesterday``, and ``<N|a|an> <unit>[s] ago`` where unit is
       minute, hour, day, week or month (a month is approximated as 30 days).
    4. A weekday name, read as the most recent *past* occurrence of that day
       (today's own name means seven days ago).

    Args:
        date_input: The raw label; any object is accepted and stringified.

    Returns:
        The date as ``YYYY-MM-DD``, or the literal string ``"Unknown"`` when
        nothing matches.
    """
    iso_fmt = "%Y-%m-%d"
    # The thread-page parser in this notebook strips U+200E from dates before
    # parsing; do the same here so a marked date is not reported as "Unknown".
    cleaned = str(date_input).replace("\u200e", "").strip()
    today = datetime.today()

    for fmt in ("%d-%m-%Y %I:%M %p", "%d-%m-%Y"):
        try:
            return datetime.strptime(cleaned, fmt).strftime(iso_fmt)
        except ValueError:
            # Wrong format for this pattern; try the next one.
            continue

    # ISO 8601 fallback. Older Python versions reject a trailing "Z", so it is
    # rewritten to an explicit UTC offset first.
    iso_candidate = cleaned[:-1] + "+00:00" if cleaned.endswith(("Z", "z")) else cleaned
    try:
        return datetime.fromisoformat(iso_candidate).strftime(iso_fmt)
    except ValueError:
        pass

    label = cleaned.lower()
    if label == "today":
        return today.strftime(iso_fmt)
    if label == "yesterday":
        return (today - timedelta(days=1)).strftime(iso_fmt)

    # Prefix match (no end anchor), so trailing text after "ago" is tolerated.
    relative = re.match(r"(\d+|an?)\s+(minute|hour|day|week|month)s?\s+ago", label)
    if relative:
        quantity, unit = relative.groups()
        amount = 1 if quantity in ("a", "an") else int(quantity)
        if unit == "month":
            # timedelta has no month unit; use the 30-day approximation.
            delta = timedelta(days=30 * amount)
        else:
            delta = timedelta(**{unit + "s": amount})
        return (today - delta).strftime(iso_fmt)

    weekdays = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
    if label in weekdays:
        # Days elapsed since that weekday; a label naming today's weekday is
        # taken to mean a full week back rather than today.
        days_back = (today.weekday() - weekdays.index(label)) % 7 or 7
        return (today - timedelta(days=days_back)).strftime(iso_fmt)

    return "Unknown"

In [None]:
def _extract_post_timestamp(block):
    """Return the date shown on one message block.

    Reads the text of the ``span.local-date`` element when present; otherwise
    the ``title`` attribute of ``span.local-friendly-date``. The value is
    normalised to ``YYYY-MM-DD`` via ``parse_date_string``; if it cannot be
    parsed, the cleaned raw text is returned unchanged. ``"Unknown"`` is
    returned when neither element supplies a value.
    """
    date_tag = block.find('span', class_='local-date')
    if date_tag:
        raw_time = date_tag.text
    else:
        friendly_tag = block.find('span', class_='local-friendly-date')
        if not (friendly_tag and friendly_tag.has_attr("title")):
            return "Unknown"
        raw_time = friendly_tag["title"]

    # Dates on the page may carry left-to-right marks (U+200E); drop them.
    raw_time = raw_time.replace("\u200e", "").strip()
    parsed = parse_date_string(raw_time)
    return raw_time if parsed == "Unknown" else parsed


def fetch_thread_content(browser, thread_url, max_replies=40):
    """Load a thread page by page and collect its opening post and replies.

    Args:
        browser: Selenium-style driver; only ``get(url)`` and ``page_source``
            are used.
        thread_url: URL of the first page of the thread.
        max_replies: Stop once this many replies have been collected.

    Returns:
        ``(main_post, replies)``. ``main_post`` is the cleaned body text of
        the first message block on the first page (``None`` if it was never
        parsed). ``replies`` is a list of dicts with the keys ``comment_id``
        (1-based counter as a string), ``author``, ``timestamp`` and
        ``comment``.

    Paging follows the ``<a rel="next">`` link and stops when it is missing,
    has no ``href``, points at a page already visited, or when a page has no
    message blocks. A block that fails to parse is reported and skipped.
    """
    replies = []
    main_post = None
    page_no = 1
    visited_urls = set()

    # The visited-set guard prevents an endless loop if a "next" link ever
    # points back to a page that was already fetched.
    while len(replies) < max_replies and thread_url not in visited_urls:
        visited_urls.add(thread_url)
        browser.get(thread_url)
        time.sleep(3)  # fixed pause after navigation before reading the page source
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        message_blocks = soup.find_all('div', class_='lia-panel-message')

        if not message_blocks:
            break

        for idx, block in enumerate(message_blocks):
            if len(replies) >= max_replies:
                break
            # On pages after the first, the first block is skipped
            # (presumably the opening post repeated - confirm against the site).
            if page_no > 1 and idx == 0:
                continue

            try:
                text_tag = block.find('div', class_='lia-message-body-content')
                comment_body = tidy_text(text_tag.get_text()) if text_tag else ""

                author_tag = block.find('a', class_='lia-user-name-link')
                author_name = author_tag.get_text(strip=True) if author_tag else "Unknown"

                timestamp = _extract_post_timestamp(block)

                if page_no == 1 and idx == 0:
                    main_post = comment_body
                else:
                    replies.append({
                        "comment_id": str(len(replies) + 1),
                        "author": author_name,
                        "timestamp": timestamp,
                        "comment": comment_body
                    })
            except Exception as e:
                print(f"Comment parse failed: {e}")
                continue

        next_link = soup.find("a", rel="next")
        href = next_link.get("href") if next_link else None
        if not href:
            # No further page (or a "next" anchor without a target).
            break
        thread_url = href if href.startswith("http") else "https://forums.beyondblue.org.au" + href
        page_no += 1

    return main_post, replies

In [None]:
def scrape_forum_to_csv(categories, category_codes, page_limits,
                        min_date="2019-01-01",
                        csv_path="data/beyondblue_updated.csv",
                        max_replies=40):
    """Scrape forum listing pages and their threads, then write one CSV.

    For each category, listing pages ``1..page_limits[i]`` are loaded in a
    headless Chrome session. Every thread tile on a page is parsed, its thread
    is fetched with ``fetch_thread_content``, and one row per thread is
    collected. Rows are written to *csv_path* once all categories are done.

    Args:
        categories: Sequence of board slugs used in the listing URL.
        category_codes: Mapping of board slug -> board code used in the URL.
        page_limits: Sequence of page counts, aligned by position with
            *categories*.
        min_date: ``YYYY-MM-DD`` cutoff; posts dated earlier, or with an
            unparseable date, are skipped.
        csv_path: Output file path; its parent directory is created if needed.
        max_replies: Maximum replies collected per thread.

    Raises:
        KeyError: If a slug in *categories* is missing from *category_codes*.
        IndexError: If *page_limits* is shorter than *categories*.

    Page-level and tile-level failures are printed and skipped so that one
    bad page does not abort the run. The browser is always closed, even when
    an error or interrupt propagates.
    """
    base_url = "https://forums.beyondblue.org.au"

    chrome_opts = Options()
    chrome_opts.add_argument("--disable-gpu")
    chrome_opts.add_argument("--no-sandbox")
    chrome_opts.add_argument("--headless")
    # The options object has to be passed to the driver; otherwise none of
    # the flags above take effect.
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_opts,
    )

    collected_posts = []

    try:
        for idx, category in enumerate(categories):
            code = category_codes[category]
            for page in range(1, page_limits[idx] + 1):
                url = f"{base_url}/t5/{category}/bd-p/{code}/page/{page}"
                try:
                    driver.get(url)
                    time.sleep(3)  # fixed pause after navigation before reading the page source
                    soup = BeautifulSoup(driver.page_source, "html.parser")
                    listing = soup.find('div', class_="custom-message-list all-discussions")
                    if listing is None:
                        print("Failed to load page:", url, "discussion list not found")
                        continue
                    articles = listing.find_all("article")
                except Exception as e:
                    print("Failed to load page:", url, e)
                    continue

                for article in articles:
                    try:
                        title_el = article.select_one("h3 > a[href*='/td-p/']")
                        if title_el is None:
                            print("Error in article block: no thread link found")
                            continue
                        title_text = title_el.text.strip()
                        href = title_el["href"]
                        # Accept both absolute and site-relative links.
                        thread_link = href if href.startswith("http") else base_url + href

                        preview_el = article.select_one("p.body-text")
                        preview_text = preview_el.text.strip() if preview_el else ""

                        author_el = article.select_one("div.custom-tile-author-info a")
                        author_name = author_el.text.strip() if author_el else "Unknown"

                        date_el = article.select_one("div.custom-tile-date time")
                        if date_el:
                            raw_date = date_el.get("datetime") or date_el.text.strip()
                            post_date = parse_date_string(raw_date)
                        else:
                            post_date = "Unknown"
                        # ISO dates compare correctly as plain strings.
                        if post_date == "Unknown" or post_date < min_date:
                            continue

                        cat_el = article.select_one("div.custom-tile-category a")
                        category_name = cat_el.text.strip() if cat_el else category

                        post_body, replies = fetch_thread_content(
                            driver, thread_link, max_replies=max_replies
                        )

                        collected_posts.append({
                            "post_id": len(collected_posts) + 1,
                            "title": title_text,
                            "author": author_name,
                            "date": post_date,
                            "category": category_name,
                            "preview": preview_text,
                            "post_text": post_body or "",
                            "num_comments": len(replies),
                            "comments_combined": json.dumps(replies),
                            "url": thread_link
                        })
                        print(f"{thread_link} - comments: {len(replies)}")

                    except Exception as e:
                        print("Error in article block:", e)
                        continue
    finally:
        # Release the browser process no matter how the scrape ended.
        driver.quit()

    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    fieldnames = [
        "post_id", "title", "author", "date", "category",
        "preview", "post_text", "num_comments", "comments_combined", "url"
    ]
    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(collected_posts)

    print(f"CSV saved to {csv_path}")

In [None]:
# Board slug -> board code; both are interpolated into the listing URL.
category_code_map = {
    "anxiety": "c1-sc2-b1",
    "depression": "c1-sc2-b2",
    "ptsd-and-trauma": "c1-sc2-b3",
    "suicidal-thoughts-and-self-harm": "c1-sc2-b4",
}

# Slugs in scrape order (the dict's insertion order).
category_list = list(category_code_map)

# Listing pages to walk per category, aligned by position with category_list.
page_count_list = [200] * len(category_list)

scrape_forum_to_csv(
    categories=category_list,
    category_codes=category_code_map,
    page_limits=page_count_list,
)