<a href="https://colab.research.google.com/github/reidzf8/Kuliah/blob/main/Semester_5/Scrapping_Data_Buku.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Root URL of the site being scraped.
base_url = "https://books.toscrape.com/"
# Base URL of the catalogue section, derived from the site root.
catalogue_url = f"{base_url}catalogue/"
# Shared accumulator: one dict per scraped book is appended here.
books_data = list()

In [3]:
# Fetch the details of a single book
def get_book_details(book_url, category, timeout=10):
    """Scrape one book detail page and append the result to ``books_data``.

    Args:
        book_url: Absolute URL of the book's detail page.
        category: Category name to store alongside the book.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The dict that was appended to the module-level ``books_data`` list.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
        AttributeError: If the page has no ``product_main`` block with a title.
    """
    response = requests.get(book_url, timeout=timeout)
    response.raise_for_status()

    # Parse the raw bytes instead of response.text: when the server sends
    # "text/html" without a charset, requests decodes .text as ISO-8859-1 and
    # the pound sign in the price fields turns into "Â£". Given bytes,
    # BeautifulSoup detects the document's own encoding.
    soup = BeautifulSoup(response.content, 'html.parser')

    # --- Extract data from the detail page ---
    title = soup.find('div', class_='product_main').h1.text.strip()

    # Rating: the word next to "star-rating" in the class list (e.g. "Three").
    # Look it up by value rather than by position so class order cannot
    # return "star-rating" itself or raise IndexError.
    rating_tag = soup.find('p', class_='star-rating')
    rating = None
    if rating_tag:
        rating = next(
            (cls for cls in rating_tag.get('class', []) if cls != 'star-rating'),
            None,
        )

    # Product information table -> {header: value}
    table = soup.find('table', class_='table table-striped')
    rows = table.find_all('tr') if table else []
    details = {
        row.th.text.strip(): row.td.text.strip()
        for row in rows
        if row.th and row.td
    }

    # Cover image: resolve the relative src against the page URL.
    carousel = soup.find('div', class_='item active')
    cover_tag = carousel.img if carousel else None
    if cover_tag and cover_tag.get('src'):
        cover = urljoin(book_url, cover_tag['src'])
    else:
        cover = ''

    # Description: the first <p> following the "product_description" header.
    desc_tag = soup.find('div', id='product_description')
    description = desc_tag.find_next('p').text.strip() if desc_tag else ''

    # Stock info, e.g. "In stock (22 available)"
    stock_info = details.get('Availability', '')
    stock_status = "In stock" if "In stock" in stock_info else "Out of stock"
    num_stock = ''.join(c for c in stock_info if c.isdigit()) or '0'

    # Store the record
    record = {
        'category': category,
        'code': details.get('UPC', ''),
        'cover': cover,
        'title': title,
        'rating': rating,
        'price (excl. tax)': details.get('Price (excl. tax)', ''),
        'price (incl. tax)': details.get('Price (incl. tax)', ''),
        'tax': details.get('Tax', ''),
        'stock status': stock_status,
        'number of stock available': num_stock,
        'description': description,
        'number of reviews': details.get('Number of reviews', ''),
        'book url': book_url
    }
    books_data.append(record)
    return record

In [4]:
# Scrape every book in one category
def scrape_category(category_url, category_name, delay=0.5, timeout=10):
    """Walk all listing pages of a category and scrape each book on them.

    Pagination follows the page's own "next" link instead of guessing
    ``page-N.html`` URLs. This avoids one request per category for a page
    that does not exist, and it cannot loop forever when ``category_url``
    does not contain ``index.html``.

    Args:
        category_url: Absolute URL of the category's first listing page.
        category_name: Category name passed through to ``get_book_details``.
        delay: Seconds to sleep after each book request, to go easy on the server.
        timeout: Seconds to wait for each listing-page response.

    Raises:
        requests.HTTPError: If a listing page answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    page_url = category_url
    while page_url:
        response = requests.get(page_url, timeout=timeout)
        # Fail loudly: an error page would otherwise look like "no more books"
        # and silently truncate the category.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for book in soup.find_all('article', class_='product_pod'):
            # Book links are relative to the listing page they appear on.
            book_url = urljoin(page_url, book.h3.a['href'])
            get_book_details(book_url, category_name)
            time.sleep(delay)  # avoid overloading the server

        # Continue only if the pager offers a "next" link.
        next_item = soup.find('li', class_='next')
        next_link = next_item.a if next_item else None
        if next_link and next_link.get('href'):
            page_url = urljoin(page_url, next_link['href'])
        else:
            page_url = None

In [5]:

# --- Scrape every category listed in the homepage sidebar ---
# Reset the shared accumulator first so that re-running this cell does not
# append a second copy of every book to the dataset.
books_data.clear()

home = requests.get(base_url, timeout=10)
home.raise_for_status()
# Parse bytes so BeautifulSoup picks the page's real encoding.
soup = BeautifulSoup(home.content, 'html.parser')
# The outer nav-list holds the single "Books" entry; the nested <ul> holds the categories.
categories = soup.find('ul', class_='nav-list').find('ul').find_all('a')

for cat in categories:
    category_name = cat.text.strip()
    category_url = urljoin(base_url, cat['href'])
    print(f"Scraping kategori: {category_name}")
    scrape_category(category_url, category_name)

# Build the DataFrame
df = pd.DataFrame(books_data)

# Save to CSV
df.to_csv('books_detailed_dataset.csv', index=False)
print(f"\n✅ Total buku dikumpulkan: {len(df)}")
print("Data berhasil disimpan ke 'books_detailed_dataset.csv'")

Scraping kategori: Travel
Scraping kategori: Mystery
Scraping kategori: Historical Fiction
Scraping kategori: Sequential Art
Scraping kategori: Classics
Scraping kategori: Philosophy
Scraping kategori: Romance
Scraping kategori: Womens Fiction
Scraping kategori: Fiction
Scraping kategori: Childrens
Scraping kategori: Religion
Scraping kategori: Nonfiction
Scraping kategori: Music
Scraping kategori: Default
Scraping kategori: Science Fiction
Scraping kategori: Sports and Games
Scraping kategori: Add a comment
Scraping kategori: Fantasy
Scraping kategori: New Adult
Scraping kategori: Young Adult
Scraping kategori: Science
Scraping kategori: Poetry
Scraping kategori: Paranormal
Scraping kategori: Art
Scraping kategori: Psychology
Scraping kategori: Autobiography
Scraping kategori: Parenting
Scraping kategori: Adult Fiction
Scraping kategori: Humor
Scraping kategori: Horror
Scraping kategori: History
Scraping kategori: Food and Drink
Scraping kategori: Christian Fiction
Scraping kategori: 