In [None]:
import requests
from bs4 import BeautifulSoup
import logging
from datetime import datetime

def scrape_cnbc_article(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract title
            title_tag = soup.find('title')
            title = title_tag.get_text(strip=True) if title_tag else 'No title available'

            # Extract publish date
            publish_date = None
            date_tag = soup.find('div', class_='text-cm text-gray')  # Adjust this selector as needed
            if date_tag:
                date_text = date_tag.get_text(strip=True)
                logging.debug(f"Raw date text found: {date_text}")
                try:
                    # Parse date with the correct format
                    publish_date = datetime.strptime(date_text, "%d %B %Y %H:%M").strftime("%Y-%m-%d")
                    logging.debug(f"Parsed publish date: {publish_date}")
                except ValueError as e:
                    logging.error(f"Failed to parse date: {date_text} -> {e}")
                    publish_date = None
            else:
                logging.warning("Date element not found. Setting publish_date to None.")
                publish_date = None

            # Extract content
            content = None
            article_body = soup.find('div', class_='detail-text')  # Adjust this selector as needed
            if article_body:
                paragraphs = article_body.find_all('p')
                content = ' '.join([p.get_text(strip=True) for p in paragraphs])
            else:
                content = "No content available"

            # Final validation
            if not title or not publish_date or not content:
                logging.warning(f"Invalid data extracted: title={title}, publish_date={publish_date}, content={content}")
                return None, None, None

            # **Return values in the order expected by save_raw_to_db: (title, content, publish_date)**
            return title, content, publish_date
        else:
            logging.error(f"Failed to fetch URL {url} with status code {response.status_code}")
            return None, None, None
    except requests.RequestException as e:
        logging.error(f"Request error while scraping {url}: {e}")
        return None, None, None


In [37]:
url = "https://www.cnbcindonesia.com/news/20241126120138-4-591184/makin-ekspansif-mind-id-anggarkan-investasi-sampai-rp-267-triliun"
title, publish_date, content = scrape_cnbc_article(url)
print(f"Title: {title}\nPublish Date: {publish_date}\nContent: {content}")


Title: Makin Ekspansif! MIND ID Anggarkan Investasi Sampai Rp 267 Triliun
Publish Date: 2024-11-26
Content: Jakarta, CNBC Indonesia -Holding BUMN Pertambangan MIND ID akan mengalokasikan anggaran investasi hingga Rp 267,8 triliun sampai 2029 mendatang. Direktur Portofolio dan Pengembangan Usaha MIND ID Dilo Seno Widagdo mengatakan, rencana investasi tersebut untuk meningkatkan nilai tambah bagi proyek-proyek hilirisasi perusahaan, termasuk guna mengejar pertumbuhan pendapatan hingga 5 tahun ke depan. Nilai investasi tersebut menurutnya belum termasuk anggaran investasi untuk PT Freeport Indonesia dan PT Vale Indonesia Tbk (INCO). "Tanpa Freeport dan Vale sampai 2029 Rp 267 triliun ini investasi sudah kita rencanakan. Sudah bisa kita masukan di program-program ini harus mulai dari sekarang, bukan di 2041. Tapi sekarang kita harus investasi di Kucing Liar (tambang bawah tanah PT Freeport Indonesia)," ucapnya dalam acara MIND ID Commodities Outlook 2025 di Jakarta, Selasa (26/11/2024). Di

In [17]:
if date_tag:
    logging.debug(f"Date tag found: {date_tag}")
else:
    logging.warning("Date tag not found.")


NameError: name 'date_tag' is not defined