In [1]:
import os
import pandas as pd
import numpy as np
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from google.cloud import storage
from io import StringIO

In [2]:
# Set up Chrome options
# Every flag below is passed verbatim to Chrome when the driver is started.
chrome_options = Options()
for flag in (
    "--headless",  # Run in headless mode (no GUI)
    "--no-sandbox",
    "--disable-dev-shm-usage",
    '--disable-gpu',  # Required for some environments
    '--remote-debugging-port=9222',
):
    chrome_options.add_argument(flag)

In [3]:
# Set up WebDriver
# Start a Chrome session configured by `chrome_options`. The resulting `driver`
# is a module-level object: fetch_table_data and fetch_company_details below
# call it directly instead of receiving it as an argument, so this cell must
# run before they are invoked.
# NOTE(review): ChromeService and ChromeDriverManager are imported above but
# are not used in the visible cells.
driver = webdriver.Chrome(options=chrome_options)

In [72]:
from google.cloud import storage
from google.oauth2 import service_account

# Function to fetch table data from a given URL
def fetch_table_data(url):
    """Load ``url`` in the shared ``driver`` and return its first HTML table.

    Waits up to 10 seconds for a ``table[role="table"]`` element to be present,
    then parses the page source and converts the first ``<table>`` found into a
    DataFrame.

    Args:
        url: Address of the page to load.

    Returns:
        pandas.DataFrame: The first table on the page. An empty DataFrame is
        returned when the page source contains no ``<table>`` or when any
        exception is raised while waiting for or parsing the table. Errors
        raised by ``driver.get`` itself are not caught here.
    """
    driver.get(url)
    try:
        # Block until the table is in the DOM. The located element is not kept:
        # parsing below works from the full page source.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'table[role="table"]'))
        )
        page_tables = BeautifulSoup(driver.page_source, "html.parser").find_all("table")

        if not page_tables:
            print("No tables found.")
            return pd.DataFrame()

        first_table_html = StringIO(str(page_tables[0]))
        return pd.read_html(first_table_html)[0]
    except Exception as e:
        print("Error waiting for table:", e)
        return pd.DataFrame()

# Function to fetch company details from the links and add scraping URL and timestamp
def fetch_company_details(links):
    """Scrape the ISIN and ticker from each company page in ``links``.

    Each link is opened in the shared module-level ``driver``; the ISIN is read
    from the element with id ``lbl-details-2-2`` and the ticker from the element
    with id ``lbl-details-2-3``, each with a 10-second wait.

    Exactly one row is produced per link, in input order. The caller joins the
    result to the listing table by row position, so a page that fails must still
    yield a row: otherwise every later ISIN/Ticker would shift onto the wrong
    company. Fields that could not be read are left as ``None``.

    Args:
        links: Iterable of company detail page URLs.

    Returns:
        pandas.DataFrame: Columns ``ISIN``, ``Ticker``, ``scraping_url`` and
        ``scraping_timestamp`` (ISO-8601 string taken when the row is recorded),
        with one row per input link. The columns are present even when ``links``
        is empty.
    """
    columns = ['ISIN', 'Ticker', 'scraping_url', 'scraping_timestamp']
    details = []
    for link in links:
        driver.get(link)

        # Defaults survive a failed lookup, keeping the row as a placeholder.
        isin = None
        ticker = None
        try:
            isin = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, 'lbl-details-2-2'))
            ).text
            ticker = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, 'lbl-details-2-3'))
            ).text
        except Exception as e:
            print(f"Error fetching details for {link}: {e}")

        details.append({
            'ISIN': isin,
            'Ticker': ticker,
            'scraping_url': link,  # Page the values were read from
            'scraping_timestamp': datetime.now().isoformat()  # When this row was recorded
        })

        # Pause 1 to 3 seconds between page loads to avoid hammering the site
        random_int = np.random.choice([1, 2, 3])
        time.sleep(random_int)

    return pd.DataFrame(details, columns=columns)

# Function to upload file to GCS
def upload_to_gcs(
    bucket_name,
    source_file_name,
    destination_blob_name,
    credentials_path='/workspaces/financial_data_scraping/jaber-financial-b20f23e23588.json',
):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to write inside the bucket.
        credentials_path: Path to the service-account JSON key used to
            authenticate. Defaults to the key location this notebook has always
            used, so existing three-argument calls behave as before; pass a
            different path to run outside that workspace.

    Returns:
        None. A confirmation line is printed after the upload call returns.
    """
    credentials = service_account.Credentials.from_service_account_file(credentials_path)
    storage_client = storage.Client(credentials=credentials)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print(f"File {source_file_name} uploaded to {destination_blob_name} in bucket {bucket_name}.")

# List of URLs to check (page 1 and page 2)
urls = [
    "https://www.bolsasymercados.es/bme-exchange/en/Prices-and-Markets/Shares/Main-Market/Listed-Companies",
    "https://www.bolsasymercados.es/bme-exchange/en/Prices-and-Markets/Shares/Main-Market/Listed-Companies?page=2"
]

# Output locations: local Parquet file and its destination in Google Cloud Storage
local_file = '/workspaces/financial_data_scraping/data/companies.parquet'
bucket_name = 'companies_details'
destination_blob_name = 'data/companies_details.parquet'

# One combined frame per listing page; concatenated once after the loop rather
# than growing a DataFrame on every iteration.
page_frames = []

try:
    for url in urls:
        df = fetch_table_data(url)
        if df.empty:
            continue

        # Read every href now, while the driver is still on the listing page:
        # fetch_company_details navigates away and the elements would go stale.
        data_links = [
            a_tag.get_attribute('href')
            for a_tag in driver.find_elements(By.CSS_SELECTOR, 'td[role="rowheader"] a')
        ]

        # Create a DataFrame for the links
        links_df = pd.DataFrame({'links': data_links})

        # Fetch company details with scraping_url and scraping_timestamp
        companies_df = fetch_company_details(links_df['links'])

        # The table and the details are joined purely by row position, so a row
        # count mismatch means details no longer line up with their companies.
        if len(companies_df) != len(df):
            print(
                f"Warning: {len(df)} table rows but {len(companies_df)} detail rows "
                f"for {url}; combined columns may be misaligned."
            )

        # Combine the table data with the fetched details
        combined_data = pd.concat([df, companies_df], axis=1)
        page_frames.append(combined_data)
finally:
    # Always release the Chrome process, even if scraping raised part-way.
    driver.quit()

# To store the final merged data
all_companies = (
    pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
)

if all_companies.empty:
    # Do not replace a previously uploaded file with an empty one.
    print("No company data scraped; skipping save and upload.")
else:
    # Save the final combined data to a local Parquet file
    os.makedirs(os.path.dirname(local_file), exist_ok=True)
    all_companies.to_parquet(local_file, index=False)

    # Upload to GCS
    upload_to_gcs(bucket_name, local_file, destination_blob_name)

File /workspaces/financial_data_scraping/data/companies.parquet uploaded to data/companies_details.parquet in bucket companies_details.


In [73]:
all_companies

Unnamed: 0,Name,Sector - Subsector,Market,Indices,ISIN,Ticker,scraping_url,scraping_timestamp
0,"ACCIONA,S.A.","Basic Mat., Industry and Construction - Constr...",Continuous Market,"IBEX 35®, IGBM",ES0125220311,ANA,https://www.bolsasymercados.es/bme-exchange/en...,2024-09-12T11:26:22.129581
1,"ACERINOX, S.A.","Basic Mat., Industry and Construction - Minera...",Continuous Market,"IBEX 35®, IGBM, IBEX TOP Dividendo®",ES0132105018,ACX,https://www.bolsasymercados.es/bme-exchange/en...,2024-09-12T11:26:23.881395
2,"ACS,ACTIVIDADES DE CONST.Y SERVICIOS S.A","Basic Mat., Industry and Construction - Constr...",Continuous Market,"IBEX 35®, IGBM, IBEX TOP Dividendo®",ES0167050915,ACS,https://www.bolsasymercados.es/bme-exchange/en...,2024-09-12T11:26:27.614561
3,"ADOLFO DOMINGUEZ, S.A.","Consumer Goods - Textile, Footwear, Cosmetics,...",Continuous Market,IGBM,ES0106000013,ADZ,https://www.bolsasymercados.es/bme-exchange/en...,2024-09-12T11:26:29.425235
4,"AEDAS HOMES, S.A.",Real Estate Services - Real Estate and Others,Continuous Market,"IGBM, IBEX TOP Dividendo®",ES0105287009,AEDAS,https://www.bolsasymercados.es/bme-exchange/en...,2024-09-12T11:26:32.172026
...,...,...,...,...,...,...,...,...
65,"UNICAJA BANCO, S.A.",Financial Services - Banks,Continuous Market,"IBEX 35®, IGBM, IBEX TOP Dividendo®",ES0180907000,UNI,https://www.bolsasymercados.es/bme-exchange/en...,2024-09-12T11:29:42.298817
66,"URBAS GRUPO FINANCIERO, S.A.",Real Estate Services - Real Estate and Others,Continuous Market,IGBM,ES0182280018,UBS,https://www.bolsasymercados.es/bme-exchange/en...,2024-09-12T11:29:44.219888
67,VIDRALA S.A.,Consumer Goods - Other Consumer Goods,Continuous Market,IGBM,ES0183746314,VID,https://www.bolsasymercados.es/bme-exchange/en...,2024-09-12T11:29:46.998276
68,"VISCOFAN, S.A.",Consumer Goods - Food and Beverage,Continuous Market,IGBM,ES0184262212,VIS,https://www.bolsasymercados.es/bme-exchange/en...,2024-09-12T11:29:50.757857
