In [None]:
# Silence only urllib3's InsecureRequestWarning; every other warning stays on.
# The archive pages are fetched with verify=False (see scrape_day), so this
# warning would otherwise be repeated for each request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Directory that receives the generated Excel files (adjust for your environment).
# It is created as soon as this code runs; an existing directory is left untouched.
OUTPUT_PATH = "./news_archive_data/"
os.makedirs(OUTPUT_PATH, exist_ok=True)

def compute_starttime(date_obj, reference_date=date(2013, 6, 1), reference_value=41426):
    """
    Translate a calendar date into the running day counter some archive URLs expect.

    The counter is anchored at a known pair (reference_date -> reference_value)
    and moves by one for every day of distance from that anchor; dates before
    the anchor therefore produce smaller values.

    Args:
        date_obj (datetime.date): Date to convert.
        reference_date (datetime.date): Anchor date of the counter.
        reference_value (int): Counter value that belongs to reference_date.

    Returns:
        int: Counter value for date_obj.
    """
    elapsed = date_obj - reference_date
    return reference_value + elapsed.days

def build_archive_url(base_url_template, year, month, day):
    """
    Fill an archive URL template for one calendar day.

    Args:
        base_url_template (str): Template that may use the placeholders
            {year}, {month}, {day} and {starttime}
        year, month, day (int): The day the archive page is requested for

    Returns:
        str: The template with every placeholder substituted
    """
    placeholders = {
        "year": year,
        "month": month,
        "day": day,
        # Always computed, even when the template has no {starttime} field;
        # str.format simply ignores keyword arguments it does not need.
        "starttime": compute_starttime(date(year, month, day)),
    }
    return base_url_template.format(**placeholders)

def parse_article_links(html_content, base_url, filter_func):
    """
    Parse an archive page and collect the article links accepted by a filter.

    Args:
        html_content (str): HTML page content
        base_url (str): Site URL that relative hrefs are resolved against
        filter_func (function): Called with each raw href value; a truthy
            result keeps the link

    Returns:
        list[dict]: One dict per kept link with the keys
            "headline_text" (anchor text, whitespace-stripped),
            "headline_category" (first path segment of the href) and
            "article_link" (absolute URL of the article)
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin, urlparse

    soup = BeautifulSoup(html_content, 'html.parser')

    articles = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if not filter_func(href):
            continue

        try:
            # Use only the path component for the category: splitting the raw
            # href would yield "https:" for absolute links and would drag a
            # query string into the category.
            path = urlparse(href).path
            # urljoin leaves absolute hrefs untouched and resolves relative
            # ones (with or without a leading slash, or protocol-relative)
            # without producing "https://example.comnews/..." style links.
            article_link = urljoin(base_url, href)
        except ValueError:
            # Malformed href (e.g. unbalanced IPv6 brackets): keep the link
            # by falling back to plain string handling instead of failing
            # the whole page.
            path = href
            article_link = href if href.startswith("http") else base_url + href

        category = path.strip("/").split("/")[0]  # Customize based on site's URL pattern
        articles.append({
            "headline_text": link.get_text(strip=True),
            "headline_category": category,
            "article_link": article_link,
        })
    return articles

def scrape_day(base_url_template, base_url, year, month, day, filter_func):
    """
    Download one day's archive page and return the articles listed on it.

    Any failure while fetching or parsing is reported on stdout and turned
    into an empty result, so a single bad day never interrupts a longer run.

    Args:
        base_url_template (str): Archive URL template with placeholders
        base_url (str): Main site URL
        year, month, day (int): Date to scrape
        filter_func (function): Decides which hrefs count as articles

    Returns:
        list[dict]: Article dicts, each tagged with a "publish_date" key
            in YYYYMMDD form; empty list on error
    """
    archive_url = build_archive_url(base_url_template, year, month, day)
    date_stamp = f"{year}{month:02d}{day:02d}"
    print(f"Fetching {date_stamp} ...")

    try:
        # verify=False: certificate verification is skipped on purpose.
        page = requests.get(archive_url, verify=False, timeout=10)
        page.raise_for_status()
        found = parse_article_links(page.text, base_url, filter_func)
        for entry in found:
            entry["publish_date"] = date_stamp
        print(f"{len(found)} articles found.")
    except Exception as err:
        # Deliberately broad: network, HTTP and parsing problems are all
        # treated as "nothing scraped for this day".
        print(f"Error fetching {date_stamp}: {err}")
        return []

    return found

def scrape_year(base_url_template, base_url, year, filter_func):
    """
    Walk through every day of a year, scrape it, and store the combined result.

    Args:
        base_url_template (str): Archive URL template
        base_url (str): Base domain
        year (int): Year to scrape
        filter_func (function): Function to filter article links

    Saves:
        News_Archive_<year>.xlsx inside OUTPUT_PATH when at least one article
        was collected; otherwise only a message is printed.
    """
    first_day = date(year, 1, 1)
    # Inclusive day count, so leap years yield 366 iterations.
    days_in_year = (date(year, 12, 31) - first_day).days + 1

    collected = []
    for offset in range(days_in_year):
        current = first_day + timedelta(days=offset)
        collected += scrape_day(
            base_url_template,
            base_url,
            current.year,
            current.month,
            current.day,
            filter_func,
        )

        # Pause 1-3 whole seconds between requests to go easy on the server.
        time.sleep(random.randint(1, 3))

    if not collected:
        print("No data scraped.")
        return

    # Write everything gathered for the year into a single Excel sheet.
    frame = pd.DataFrame(collected)
    output_file = os.path.join(OUTPUT_PATH, f"News_Archive_{year}.xlsx")
    frame.to_excel(output_file, index=False)
    print(f"Data saved at: {output_file}")

# Example usage: runs only when the file is executed as a script.
if __name__ == "__main__":
    # Year whose archive pages are scraped
    TARGET_YEAR = 2015

    # Site-specific settings: replace with the values of your target site
    BASE_URL = "https://example.com"
    ARCHIVE_URL_TEMPLATE = "https://example.com/archive/{year}/{month}/{day}/start-{starttime}/"

    def article_filter(href):
        """Accept an href only if it contains every marker of an article URL."""
        # Adjust the markers to the URL scheme of the site being scraped.
        return all(marker in href for marker in ("/news/", "/article/"))

    # Kick off the run and report when it has finished
    print(f"Starting scrape for {TARGET_YEAR}...\n")
    scrape_year(ARCHIVE_URL_TEMPLATE, BASE_URL, TARGET_YEAR, article_filter)
    print("\nDone scraping.")
