# Vessel Data Scraping Notebook

This notebook contains code to scrape vessel data from a web interface. It uses Selenium WebDriver with Chrome to:

- Navigate through paginated results
- Extract vessel information from HTML tables 
- Store the data in a structured format

The scraping relies on the following Python libraries:
- Selenium for browser automation
- BeautifulSoup for HTML parsing
- The standard-library `csv` module for data export


In [12]:
import time
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

In [17]:
def scrape_vessel_data(
    base_url: str,
    driver_path: str = "../drivers/chromedriver",
    *,
    headless: bool = False,
) -> dict:
    """
    Scrape vessel records from the paginated table at ``base_url``.

    Opens ``base_url`` in Chrome, waits for ``table.siforpa-table`` to become
    visible, then walks the pagination control page by page, parsing the
    rendered HTML of each page with BeautifulSoup.

    Args:
        base_url: URL of the page (direct iframe URL) that renders the table.
        driver_path: Filesystem path to the chromedriver executable.
        headless: When True, start Chrome with ``--headless``. Defaults to
            False so the browser window can be inspected while debugging.

    Returns:
        A dictionary keyed by the text of the first visible cell of each row
        (the unique vessel id, N°). Each value is a dict with the keys
        ``inscripcion``, ``documento``, ``pescador``, ``matricula``,
        ``embarcacion``, ``matricula_leg`` and ``protocolo``. Rows sharing the
        same id overwrite one another. An empty dict is returned when the
        table never becomes visible.

    The browser is always closed before the function returns or raises.
    """
    # Names for visible columns 1..7; column 0 is used as the dictionary key.
    record_fields = (
        "inscripcion",
        "documento",
        "pescador",
        "matricula",
        "embarcacion",
        "matricula_leg",
        "protocolo",
    )

    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")

    driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
    vessel_data = {}

    # try/finally guarantees the Chrome process is shut down even when
    # navigation, clicking or parsing raises unexpectedly.
    try:
        driver.get(base_url)

        # Wait until the table is visible. Any failure here is reported and
        # treated as "no data", matching the function's best-effort contract.
        try:
            WebDriverWait(driver, 30).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, "table.siforpa-table"))
            )
        except Exception as e:
            print("Timeout waiting for table:", e)
            return {}

        page_num = 1
        while True:
            # Allow extra time for dynamic content to load
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            table = soup.find("table", class_="siforpa-table")
            if not table:
                print(f"Page {page_num}: Table not found")
                break

            # Pick the first tbody holding real data (skip the placeholder
            # that shows "No se encontraron registros").
            tbody = next(
                (
                    tb
                    for tb in table.find_all("tbody")
                    if "No se encontraron registros" not in tb.get_text()
                ),
                None,
            )
            if tbody is None:
                print(f"Page {page_num}: No vessel data found")
                break

            rows = tbody.find_all("tr", class_="ng-scope")
            print(f"Page {page_num}: Found {len(rows)} rows")
            for row in rows:
                # Cells carrying the "ng-hide" class are not displayed and
                # would shift the column positions, so drop them first.
                visible_tds = [
                    td
                    for td in row.find_all("td")
                    if "ng-hide" not in (td.get("class") or [])
                ]
                if len(visible_tds) < 8:
                    continue
                unique_id = visible_tds[0].get_text(strip=True)
                vessel_data[unique_id] = {
                    field: td.get_text(strip=True)
                    for field, td in zip(record_fields, visible_tds[1:8])
                }

            # Attempt to navigate to the next page
            try:
                next_li = driver.find_element(
                    By.XPATH, "//li[contains(@class, 'pagination-next')]"
                )
                # get_attribute may return None; normalise to an empty string.
                if "disabled" in (next_li.get_attribute("class") or ""):
                    print("Reached the last page.")
                    break
                next_li.find_element(By.TAG_NAME, "a").click()
                page_num += 1
            except NoSuchElementException:
                print("Next button not found, ending pagination.")
                break
    finally:
        driver.quit()

    return vessel_data

In [18]:
# Use the direct iframe URL
base_url = "https://siforpa.produce.gob.pe/General/Portal/Index"
data = scrape_vessel_data(base_url)

# NOTE(review): this path has no separator between "data" and
# "vessel_data.csv"; it may have been meant as "../data/vessel_data.csv".
# Left unchanged here -- confirm the intended location.
output_path = "../datavessel_data.csv"
fieldnames = [
    "unique_id",
    "inscripcion",
    "documento",
    "pescador",
    "matricula",
    "embarcacion",
    "matricula_leg",
    "protocolo",
]

# Save the scraped data to a CSV file, one row per vessel, with the
# dictionary key written out as the "unique_id" column.
with open(output_path, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows({"unique_id": uid, **record} for uid, record in data.items())

# Report the path actually written (the previous message named a different file).
print(f"Scraping complete. {len(data)} records saved to {output_path}.")

Page 1: Found 10 rows
Page 2: Found 10 rows
Page 3: Found 10 rows
Page 4: Found 10 rows
Page 5: Found 10 rows
Page 6: Found 10 rows
Page 7: Found 10 rows
Page 8: Found 10 rows
Page 9: Found 10 rows
Page 10: Found 10 rows
Page 11: Found 10 rows
Page 12: Found 10 rows
Page 13: Found 10 rows
Page 14: Found 10 rows
Page 15: Found 10 rows
Page 16: Found 10 rows
Page 17: Found 10 rows
Page 18: Found 10 rows
Page 19: Found 10 rows
Page 20: Found 10 rows
Page 21: Found 10 rows
Page 22: Found 10 rows
Page 23: Found 10 rows
Page 24: Found 10 rows
Page 25: Found 10 rows
Page 26: Found 10 rows
Page 27: Found 10 rows
Page 28: Found 10 rows
Page 29: Found 10 rows
Page 30: Found 10 rows
Page 31: Found 10 rows
Page 32: Found 10 rows
Page 33: Found 10 rows
Page 34: Found 10 rows
Page 35: Found 10 rows
Page 36: Found 10 rows
Page 37: Found 10 rows
Page 38: Found 10 rows
Page 39: Found 10 rows
Page 40: Found 10 rows
Page 41: Found 10 rows
Page 42: Found 10 rows
Page 43: Found 10 rows
Page 44: Found 10 ro