In [4]:
import requests as req
from bs4 import BeautifulSoup
from pymongo import MongoClient
import datetime
import re
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
Client = MongoClient('mongodb://localhost:27017/')
db = Client.bigdata
collection = db.techCNBC

In [9]:
try:
    db.command('ping')
    print("Koneksi ke database berhasil!")
except Exception as e:
    print(f"Koneksi ke database gagal: {e}")

Koneksi ke database berhasil!


In [None]:
def filter_urls(url):
    rejected_urls = [
        "https://www.cnnindonesia.com/olahraga/indeks/7?page=",
        "https://www.cnnindonesia.com/olahraga/indeks/7",
        "https://www.cnnindonesia.com/olahraga/sepakbola",
        "https://www.cnnindonesia.com/olahraga/moto-gp",
        "https://www.cnnindonesia.com/olahraga/f1",
        "https://www.cnnindonesia.com/olahraga/indeks/kolom/7",
        "https://www.cnnindonesia.com/olahraga/indeks/foto/7",
        "https://www.cnnindonesia.com/olahraga/indeks/video/7",
        "https://www.cnnindonesia.com/olahraga/indeks/infografis/7"
    ]
    for rejected_url in rejected_urls:
        if url == rejected_url or url.startswith(rejected_url):
            return False
    return True

def fetch_urls_from_index(i):
    url = f"https://www.cnbcindonesia.com/tech/indeks/12/{i}"
    res = req.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    parsed_urls = set()

    # Definisikan pola regex yang mengharuskan URL diakhiri oleh nomor
    regex_pattern = r"^https://www\.cnbcindonesia\.com/tech/\d{14}-\d{2}-\d+/.*$"

    for j in soup.find_all("a", href=True):
        href = j["href"]
        # Memeriksa apakah URL sesuai dengan regex dan tidak ada dalam daftar reject
        if re.match(regex_pattern, href) and filter_urls(href):
            parsed_urls.add(href)

    print(f"Fetched {len(parsed_urls)} URLs from {url}")
    return parsed_urls

count = 0
unique_urls = set()

# Hapus file lama jika ada
if os.path.exists('output.txt'):
    os.remove('output.txt')

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(fetch_urls_from_index, i) for i in range(1, 1571)]
    for future in as_completed(futures):
        urls = future.result()
        unique_urls.update(urls)
        count = len(unique_urls)
        print(f"Total unique URLs so far: {count}")

# Tulis URL yang telah difilter ke dalam file
with open('output.txt', 'w') as file:
    for href in unique_urls:
        file.write(href + '\n')

print(f"{count} unique URLs written to output.txt")


Fetched 10 URLs from https://www.cnbcindonesia.com/tech/indeks/12/1Fetched 10 URLs from https://www.cnbcindonesia.com/tech/indeks/12/3
Fetched 10 URLs from https://www.cnbcindonesia.com/tech/indeks/12/4
Total unique URLs so far: 10
Total unique URLs so far: 20

Fetched 10 URLs from https://www.cnbcindonesia.com/tech/indeks/12/2
Total unique URLs so far: 30
Total unique URLs so far: 40
Fetched 10 URLs from https://www.cnbcindonesia.com/tech/indeks/12/8
Total unique URLs so far: 50
Fetched 10 URLs from https://www.cnbcindonesia.com/tech/indeks/12/6
Total unique URLs so far: 60
Fetched 10 URLs from https://www.cnbcindonesia.com/tech/indeks/12/5
Total unique URLs so far: 70
Fetched 10 URLs from https://www.cnbcindonesia.com/tech/indeks/12/9
Total unique URLs so far: 80
Fetched 10 URLs from https://www.cnbcindonesia.com/tech/indeks/12/7
Total unique URLs so far: 90
Fetched 10 URLs from https://www.cnbcindonesia.com/tech/indeks/12/10
Total unique URLs so far: 100
Fetched 10 URLs from https:/

In [None]:
def getHtml(url):
    res = req.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    for script in soup(["script", "style"]):
        script.extract()
    return soup

def process_url(url, index, total_urls, limit=0):
    try:
        if index < limit:
            soup = getHtml(url)
            title = soup.find("title").text
            author = soup.find(class_="mb-1 text-base font-semibold").text
            create_at = soup.find(class_="text-cm text-gray").text

            
            paragraphs = soup.find_all(class_="detail-text", recursive=True)
            html_content = "\n".join(div.text for div in paragraphs)

            data = {
                "url": url,
                "title": title,
                "author": author,
                "create_at": create_at,
                "crawling_at": datetime.datetime.now().strftime("%d/%m/%Y %H:%M") + " WITA",
                "html_content": str(soup),
                "article_content": html_content
            }
            collection.insert_one(data)
            
            progress = ((index + 1) / total_urls) * 100 
            author = data["author"] 
            date = data["create_at"]
            print(f"[{index + 1}/{total_urls}] | Progress: {progress:.2f}% for: {url} | author : {author}, {date}\n")
        
    except Exception as e:
        print(f"Error processing {url}: {e}")


with open('output.txt', 'r') as file:
    urls = [line.strip() for line in file.readlines()]
total_urls = len(urls)


start_line = 4410 
urls_to_process = urls[start_line:]


with ThreadPoolExecutor(max_workers=10) as executor:  
    futures = [
        executor.submit(process_url, url, index, total_urls, total_urls)
        for index, url in enumerate(urls_to_process, start=start_line)
    ]
    for future in as_completed(futures):
        future.result()  


[4411/15700] | Progress: 28.10% for: https://www.cnbcindonesia.com/tech/20240503145841-37-535563/bukti-as-butuh-china-joe-biden-jangan-asal-blokir | author : 
    Redaksi, CNBC Indonesia
, 03 May 2024 21:10

[4415/15700] | Progress: 28.12% for: https://www.cnbcindonesia.com/tech/20240305060935-37-519615/startup-gaji-instan-disuntik-modal-rp-362-miliar-mau-pakai-ai | author : 
    Redaksi, CNBC Indonesia
, 05 March 2024 08:20

[4412/15700] | Progress: 28.10% for: https://www.cnbcindonesia.com/tech/20231123112633-39-491355/strategi-pln-icon-plus-hadirkan-internet-cepat | author : 
    CNBC Indonesia TV, CNBC Indonesia
, 23 November 2023 11:38

[4416/15700] | Progress: 28.13% for: https://www.cnbcindonesia.com/tech/20231006075153-37-478354/cara-baterai-laptop-windows-awet-walau-sudah-mau-habis | author : 
    Intan Rakhmayanti Dewi, CNBC Indonesia
, 06 October 2023 09:00

[4418/15700] | Progress: 28.14% for: https://www.cnbcindonesia.com/tech/20240703095628-37-551341/cara-cek-nik-ktp-terd

In [None]:
# Fungsi untuk membersihkan `article_content` dan `author`
def clean_article_content(doc, total_docs, index):
    try:
        # 1. Bersihkan baris baru berlebihan di article_content
        cleaned_content = re.sub(r'\n+', '\n', doc['article_content'])
        
        # 2. Bersihkan data author dari tambahan seperti ", CNBC Indonesia"
        cleaned_author = re.sub(r',?\s*CNBC Indonesia', '', doc['author']).strip()
        
        # Update jika ada perubahan
        update_data = {}
        if cleaned_content != doc['article_content']:
            update_data['article_content'] = cleaned_content
        if cleaned_author != doc['author']:
            update_data['author'] = cleaned_author
        
        # Lakukan update hanya jika ada perubahan
        if update_data:
            collection.update_one({'_id': doc['_id']}, {'$set': update_data})
        
        # Menampilkan progres
        progress = ((index + 1) / total_docs) * 100
        print(f"[{index + 1}/{total_docs}] Progress: {progress:.2f}% - Updated document with _id: {doc['_id']}")
    
    except Exception as e:
        print(f"Error updating document with _id: {doc['_id']}: {e}")

# Fungsi utama untuk memproses dokumen dengan opsi `limit`
def process_documents(limit=0):
    # Mengambil semua dokumen dalam koleksi dan menerapkan limit jika ditentukan
    if limit > 0:
        documents = list(collection.find().limit(limit))
    else:
        documents = list(collection.find())
    
    total_docs = len(documents)

    # Menggunakan ThreadPoolExecutor untuk memproses dokumen secara paralel
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(clean_article_content, doc, total_docs, index)
            for index, doc in enumerate(documents)
        ]
        for future in as_completed(futures):
            future.result()

    print("Finished cleaning article_content and author in all documents.")

# Jalankan proses dengan limit (atur limit sesuai kebutuhan, default 0 untuk semua data)
process_documents()  # Jika ingin memproses semua data
# Atau tentukan limit, contoh: process_documents(limit=100)


[2/15700] Progress: 0.01% - Updated document with _id: 6727674c984fcfc961075a23[1/15700] Progress: 0.01% - Updated document with _id: 6727674c984fcfc961075a24
[3/15700] Progress: 0.02% - Updated document with _id: 6727674c984fcfc961075a27
[5/15700] Progress: 0.03% - Updated document with _id: 6727674c984fcfc961075a25
[6/15700] Progress: 0.04% - Updated document with _id: 6727674c984fcfc961075a28
[7/15700] Progress: 0.04% - Updated document with _id: 6727674d984fcfc961075a29
[8/15700] Progress: 0.05% - Updated document with _id: 6727674d984fcfc961075a2a
[9/15700] Progress: 0.06% - Updated document with _id: 6727674d984fcfc961075a2b
[4/15700] Progress: 0.03% - Updated document with _id: 6727674c984fcfc961075a26
[10/15700] Progress: 0.06% - Updated document with _id: 672768954c7c82998e4a4779

[11/15700] Progress: 0.07% - Updated document with _id: 672768954c7c82998e4a477c
[12/15700] Progress: 0.08% - Updated document with _id: 672768954c7c82998e4a477a
[13/15700] Progress: 0.08% - Updated 