Here, we scrape only one part of a specific URL: a single table.

In [15]:
import requests
from bs4 import BeautifulSoup

import os
import pandas as pd
from datetime import datetime

In [None]:
# Send a GET request to the URL.
target_url = 'https://www.fi.se/en/our-registers/net-short-positions/'
# Timestamp taken just before the request; it is reused in the export file name.
access_time = datetime.now().strftime("%Y-%m-%d_%H%M%S")
# requests applies no timeout by default, so without one a stalled server
# would hang this cell indefinitely.
response = requests.get(target_url, timeout=30)
response.raise_for_status()  # raise on an HTTP error status instead of parsing an error page

In [11]:
# parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# find the table on the page
table = soup.find('table', id='aktuella')
# soup.find returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError when the table is used afterwards.
if table is None:
    raise ValueError(f"No <table id='aktuella'> found in the page at {response.url}")
print(table)

<table id="aktuella"><thead><tr><th>Issuer name</th><th>Issuer LEI code</th><th>Latest position date</th><th class="numeric">Sum short %</th></tr><tbody><tr><td><a href="/en/our-registers/net-short-positions/emittent/?id=549300ZQH0FIF1P0MX67">NIBE Industrier AB</a></td><td>549300ZQH0FIF1P0MX67</td><td>2024-11-15</td><td class="numeric">11.21</td></tr><tr><td><a href="/en/our-registers/net-short-positions/emittent/?id=549300OWGJPYKC6JF790">Arjo AB (publ)</a></td><td>549300OWGJPYKC6JF790</td><td>2024-11-15</td><td class="numeric">1.36</td></tr><tr><td><a href="/en/our-registers/net-short-positions/emittent/?id=549300XFXK7DVGDRP410">Moberg Pharma AB (publ)</a></td><td>549300XFXK7DVGDRP410</td><td>2024-11-15</td><td class="numeric">2.44</td></tr><tr><td><a href="/en/our-registers/net-short-positions/emittent/?id=549300YJUYJ9A24RXL71">Billerud Aktiebolag (publ)</a></td><td>549300YJUYJ9A24RXL71</td><td>2024-11-15</td><td class="numeric">1.14</td></tr><tr><td><a href="/en/our-registers/net-sh

In [12]:
# Extract the column names: the stripped text of every <th> (table header cell).
headers = [th.text.strip() for th in table.find_all('th')]

# Extract the data rows.
# Rows are selected by content (at least one <td>) rather than by position, so the
# header row is skipped wherever it sits and a <tr> without <td> cells cannot
# contribute an empty record.
rows = []
for tr in table.find_all('tr'):
    cells = tr.find_all('td')  # <td> = regular data cell in a row
    if not cells:
        continue
    rows.append([cell.text.strip() for cell in cells])

In [None]:
# build a pandas DataFrame from the scraped rows, labelled with the scraped headers
data = pd.DataFrame(data=rows, columns=headers)
data  # bare last expression, so the frame is rendered as the cell output

Unnamed: 0,Issuer name,Issuer LEI code,Latest position date,Sum short %
0,NIBE Industrier AB,549300ZQH0FIF1P0MX67,2024-11-15,11.21
1,Arjo AB (publ),549300OWGJPYKC6JF790,2024-11-15,1.36
2,Moberg Pharma AB (publ),549300XFXK7DVGDRP410,2024-11-15,2.44
3,Billerud Aktiebolag (publ),549300YJUYJ9A24RXL71,2024-11-15,1.14
4,Kebni AB (publ),549300UWE2XN3O7TXX74,2024-11-15,0.2
...,...,...,...,...
370,Hästkällaren Rid Trav & Western AB,894500464CPRG49EGI94,2020-07-10,0.14
371,Spintso International,549300N0YFGUA3NVL475,2020-03-13,0.18
372,"VEONEER, INC.",54930082R4LTC7PERT23,2021-12-16,0.89
373,Hövding Sverige AB (publ),5493002CWEZ4BUCNG484,2022-01-19,0.19


In [19]:
# Save into a "downloads" folder under the current working directory.
working_directory = os.getcwd()
downloads_dir = os.path.join(working_directory, "downloads")
# to_csv does not create missing parent folders, so make sure the target exists first.
os.makedirs(downloads_dir, exist_ok=True)

# The file name carries the access timestamp captured when the page was requested.
export_path = os.path.join(downloads_dir, f'Sweden_shorts_{access_time}.csv')
data.to_csv(export_path, index=False)  # index=False avoids writing the row index to the file