# Ecuador Fishing Industry Data Scraper

Objective: Script to scrape data about fishing vessels and fishmeal plants from Ecuador's government websites

Sources: 
- https://srp.produccion.gob.ec/industrial/web/embarcaciones
- https://bitacora.produccion.gob.ec/industrial/web/empresas

To explore: 
https://datos.produccion.gob.ar/dataset/distribucion-geografica-de-los-establecimientos-productivos
https://datos.produccion.gob.ar/dataset/distribucion-geografica-de-los-establecimientos-productivos/archivo/15d42a00-0d1f-480c-bea8-3257e34b7804




In [1]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Listing endpoints. Each one is paginated by appending
# "/index?page=N&per-page=10" (see scrape_all_pages).
# NOTE(review): the notebook header lists the vessel source under
# srp.produccion.gob.ec and the vessel details pages are fetched from that
# host, while this listing URL points at bitacora.produccion.gob.ec.
# Confirm that both hosts serve the same data.

# Vessel ("embarcaciones") listing
vessel_base_url = (
    "https://bitacora.produccion.gob.ec"
    "/industrial/web/embarcaciones"
)

# Fishmeal plant ("empresas") listing
fishmeal_base_url = (
    "https://bitacora.produccion.gob.ec"
    "/industrial/web/empresas"
)

In [19]:
# Function to scrape main page data
def scrape_main_page(url, item_div_class, item_name_class, item_id_class, timeout=30):
    """Fetch one listing page and return the items found on it.

    Args:
        url: Full URL of the listing page.
        item_div_class: CSS class of the ``div`` that wraps one item.
        item_name_class: CSS class of the ``div`` holding the item's name.
        item_id_class: CSS class of the ``div`` holding the item's identifier.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of dicts with the keys ``name``, ``identifier`` and
        ``details_link`` (the ``href`` of the item's "Ver detalles" link).
        The list is empty when the page contains no items or the server
        answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Do not try to parse an error response as a listing; report it so
        # the caller's "no items" outcome is not mistaken for a clean end.
        print(f"Warning: {url} returned HTTP {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')

    items = []
    # Find all items
    for item in soup.find_all('div', class_=item_div_class):
        name_div = item.find('div', class_=item_name_class)
        id_div = item.find('div', class_=item_id_class)
        link = item.find('a', string='Ver detalles')
        href = link.get('href') if link is not None else None
        # One incomplete entry should not abort the whole page.
        if name_div is None or id_div is None or href is None:
            print(f"Warning: skipping incomplete item on {url}")
            continue
        items.append({
            'name': name_div.get_text(strip=True),
            'identifier': id_div.get_text(strip=True),
            'details_link': href,
        })
    return items

In [4]:
# Function to scrape vessel details page data
def scrape_vessel_details_page(url, timeout=30):
    """Fetch a vessel details page and return its fields as a dict.

    Args:
        url: Details link taken from the listing, resolved against
            ``https://srp.produccion.gob.ec/``.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A dict holding ``image_alt`` (the ``alt`` text of the image in the
        first ``col-md-4`` div, or ``None``) plus the label/value pairs read
        from the second and third ``col-md-4`` divs. An empty dict is
        returned, after printing a warning, when the page has fewer than
        three such divs.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # urljoin gives the same result as plain concatenation for root-relative
    # links and also copes with absolute or slash-less hrefs.
    url = urljoin('https://srp.produccion.gob.ec/', url)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages so the caller's except-branch reports them.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    details = {}

    # Ensure there are at least three divs to avoid IndexError
    divs = soup.find_all('div', class_='col-md-4')
    if len(divs) < 3:
        print(f"Warning: Unexpected structure for details page: {url}")
        return details
    first_div, second_div, third_div = divs[:3]

    # Extract alt text from the image in the first div
    img = first_div.find('img')
    details['image_alt'] = img.get('alt') if img is not None else None

    # Second div: each item-name span labels the next item-res span.
    # Tracking the pending label (instead of stepping by two) keeps later
    # pairs aligned when a label has no value or a stray value appears.
    pending_key = None
    for span in second_div.find_all('span', class_=['item-name', 'item-res']):
        if 'item-name' in span.get('class', []):
            pending_key = span.get_text(strip=True)
        elif pending_key is not None:
            details[pending_key] = span.get_text(strip=True)
            pending_key = None

    # Third div: consecutive item-res spans are read as key, value, key, ...
    # NOTE(review): assumes the labels in this column also carry the
    # item-res class -- confirm against the page markup.
    res_spans = third_div.find_all('span', class_='item-res')
    for key_span, value_span in zip(res_spans[::2], res_spans[1::2]):
        details[key_span.get_text(strip=True)] = value_span.get_text(strip=True)

    return details

In [46]:
def _item_res_texts_until_next_heading(item_name):
    """Return the text of the item-res siblings that follow *item_name*, stopping at the next item-name sibling."""
    texts = []
    for sibling in item_name.find_next_siblings(True):
        classes = sibling.get('class', [])
        if 'item-name' in classes:
            # The next heading starts here; later values belong to it.
            break
        if 'item-res' in classes:
            texts.append(sibling.get_text(strip=True))
    return texts


# Function to scrape fishmeal plant details page data
def scrape_fishmeal_details_page(url, timeout=30):
    """Fetch a fishmeal plant details page and return its fields as a dict.

    Args:
        url: Details link taken from the listing, resolved against
            ``https://bitacora.produccion.gob.ec/``.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A dict with one entry per ``item-name`` element on the page, keyed by
        its text. The value is the text of the first ``item-res`` sibling
        that follows it, or ``'N/A'`` when there is none. Headings starting
        with ``'Acuerdo Ministerial'`` get the list of all their ``item-res``
        siblings instead (or ``'N/A'``). If the image in the first
        ``col-md-4`` div has a file name of the form ``<key>-<value>.<ext>``,
        an extra entry ``Key -> Value`` (both capitalized) is added.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # urljoin gives the same result as plain concatenation for root-relative
    # links and also copes with absolute or slash-less hrefs.
    url = urljoin('https://bitacora.produccion.gob.ec/', url)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages so the caller can record the failure.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    details = {}

    # Extract data from the first div (second part of the image name)
    first_div = soup.find('div', class_='col-md-4')
    img = first_div.find('img') if first_div is not None else None
    src = img.get('src') if img is not None else None
    if src:
        img_name = src.split('/')[-1].split('.')[0]  # File name without extension
        key, dash, value = img_name.partition('-')
        if dash:
            details[key.capitalize()] = value.capitalize()

    for item_name in soup.find_all(class_='item-name'):
        # Get the heading text (column name)
        heading = item_name.get_text(strip=True)

        # Only values located before the next heading belong to this one;
        # searching past it would pick up another field's values.
        values = _item_res_texts_until_next_heading(item_name)

        if heading.startswith('Acuerdo Ministerial'):
            # Special case: this field can list several values.
            details[heading] = values if values else 'N/A'
        else:
            # For other fields, use the text of the first 'item-res' sibling
            details[heading] = values[0] if values else 'N/A'

    return details

In [23]:
def _parse_listing_items(item_divs, item_name_class, item_id_class, page_url):
    """Turn the item divs of one listing page into name/identifier/details_link dicts, skipping incomplete ones."""
    items = []
    for item in item_divs:
        name_div = item.find('div', class_=item_name_class)
        id_div = item.find('div', class_=item_id_class)
        link = item.find('a', string='Ver detalles')
        href = link.get('href') if link is not None else None
        # One incomplete entry should not abort the whole crawl.
        if name_div is None or id_div is None or href is None:
            print(f"Warning: skipping incomplete item on {page_url}")
            continue
        items.append({
            'name': name_div.get_text(strip=True),
            'identifier': id_div.get_text(strip=True),
            'details_link': href,
        })
    return items


# Function to scrape all pages
def scrape_all_pages(base_url, item_div_class, item_name_class, item_id_class, timeout=30):
    """Walk the paginated listing under *base_url* and collect every item.

    Pages are requested as ``{base_url}/index?page=N&per-page=10`` starting
    at ``N = 1``. Each page is downloaded once and the same parsed document
    is used both for the items and for the "next" pagination check.

    Args:
        base_url: Listing URL without the ``/index`` suffix.
        item_div_class: CSS class of the ``div`` that wraps one item.
        item_name_class: CSS class of the ``div`` holding the item's name.
        item_id_class: CSS class of the ``div`` holding the item's identifier.
        timeout: Seconds to wait for each response before ``requests`` gives up.

    Returns:
        A list of dicts with the keys ``name``, ``identifier`` and
        ``details_link``. Crawling stops at the first page that has no item
        divs, has no ``li.next`` element containing an ``a.page-link``, or
        comes back with an HTTP error status (a warning is printed in that
        last case).
    """
    page = 1
    all_items = []

    while True:
        print(f"Scraping page {page}...")
        url = f"{base_url}/index?page={page}&per-page=10"
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=timeout)
        if not response.ok:
            # Keep what was collected, but make the truncation visible.
            print(
                f"Warning: page {page} returned HTTP {response.status_code}; "
                f"stopping with {len(all_items)} items collected"
            )
            break
        soup = BeautifulSoup(response.text, 'html.parser')

        # Scrape items from the current page
        item_divs = soup.find_all('div', class_=item_div_class)
        if not item_divs:
            break  # Stop if no items found (end of pagination)

        all_items.extend(
            _parse_listing_items(item_divs, item_name_class, item_id_class, url)
        )

        # Check if there is a next page
        next_button = soup.find('li', class_='next')
        if not next_button or not next_button.find('a', class_='page-link'):
            break

        page += 1

    return all_items

In [None]:
# Collect every entry of the paginated vessel listing.
print("Scraping vessels...")
all_vessels = scrape_all_pages(
    base_url=vessel_base_url,
    item_div_class='embarcacion-item',
    item_name_class='item-name',
    item_id_class='item-matricula',
)



In [None]:
# Collect every entry of the paginated fishmeal plant listing.
print("Scraping fishmeal plants...")
all_fishmeal = scrape_all_pages(
    base_url=fishmeal_base_url,
    item_div_class='empresa-item',
    item_name_class='item-name',
    item_id_class='item-cedula',
)

In [None]:
# TODO: artesanales -- presumably a placeholder for scraping the artisanal
# fleet; no such scraping exists in this notebook yet. Kept as a comment
# because the bare name is not defined anywhere visible here and would raise
# NameError when the cell runs.

In [30]:
# Enrich each vessel record in place with the fields from its details page.
# A failure on one vessel is reported and the loop moves on to the next.
vessel_total = len(all_vessels)
for position, record in enumerate(all_vessels, start=1):
    print(f"Scraping details for vessel {position} of {vessel_total}: {record['name']}")
    link = record['details_link']
    try:
        record.update(scrape_vessel_details_page(link))
    except Exception as err:
        print(f"Error scraping details for vessel {record['name']}: {err}")


In [None]:
# Scraping details for fishmeal plants
# Each plant record is enriched in place; plants whose details page could not
# be scraped are collected by name so they can be retried later.
failed_plants = []
for i, plant in enumerate(all_fishmeal, start=1):
    print(f"Scraping details for plant {i} of {len(all_fishmeal)}: {plant['name']}")
    details_url = plant['details_link']
    try:
        plant_details = scrape_fishmeal_details_page(details_url)
        plant.update(plant_details)
    except Exception as e:
        print(f"Error scraping details for plant {plant['name']}: {e}")
        failed_plants.append(plant['name'])

# Write failed plants to log file
if failed_plants:
    # Scraped names may contain non-ASCII characters; pin the encoding so the
    # write does not depend on the platform's default.
    with open('plants-log.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(failed_plants))

In [None]:
# Convert data to DataFrame and save to CSV
# Each file name is defined once so the printed message always names the
# file that was actually written.
vessel_csv_path = 'vessel_data.csv'
vessel_df = pd.DataFrame(all_vessels)
vessel_df.to_csv(vessel_csv_path, index=False)
print(f"Scraping complete. Data saved to '{vessel_csv_path}'.")

fishmeal_csv_path = 'fishmeal_data-test.csv'
fishmeal_df = pd.DataFrame(all_fishmeal)
fishmeal_df.to_csv(fishmeal_csv_path, index=False)
print(f"Scraping complete. Data saved to '{fishmeal_csv_path}'.")

In [None]:
# Write one workbook with a sheet per dataset (vessels first, then plants).
with pd.ExcelWriter('ecuador_fishing_data.xlsx') as workbook:
    for sheet_title, frame in (
        ('Vessels', vessel_df),
        ('Fishmeal Plants', fishmeal_df),
    ):
        frame.to_excel(workbook, sheet_name=sheet_title, index=False)
print("Data exported to 'ecuador_fishing_data.xlsx'")

