## Data Scraping from Markets Business Insider and Wikipedia

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [28]:
def scrape_index_components_to_csv(url, csv_filename, timeout=30):
    """Scrape an index-components table into a DataFrame and save it as CSV.

    Downloads ``url``, takes the ``<th>`` cells of the first
    ``table.table thead`` as column names and the ``<td>`` cells of every
    ``table.table tbody tr`` as one record each.

    Parameters
    ----------
    url : str
        Page to download.
    csv_filename : str
        File name written under ``../data/raw_data/`` (relative to the
        current working directory).
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot hang
        the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        The scraped table; every value is the cell text as a string.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a 4xx/5xx response.
    ValueError
        If the page contains no ``table.table thead`` or it has no ``<th>``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of parsing an error page and
    # crashing later with an unrelated AttributeError.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the header names for columns
    table_head = soup.select_one('table.table thead')
    if table_head is None:
        raise ValueError(f"No 'table.table thead' element found at {url}")

    # NOTE: header text is joined without a separator, so stacked labels run
    # together (e.g. 'Latest PricePrevious Close'). Kept as is so that column
    # names in existing CSV files do not change.
    columns = [th.get_text(strip=True) for th in table_head.find_all('th')]
    if not columns:
        raise ValueError(f"The table header at {url} contains no <th> cells")

    # One list of cell texts per body row; inner pieces of a cell are joined
    # with a single space.
    companies_data = [
        [cell.get_text(" ", strip=True) for cell in row.find_all('td')]
        for row in soup.select('table.table tbody tr')
    ]

    # Create the DataFrame
    df = pd.DataFrame(companies_data, columns=columns)

    # Drop rows whose first column (assumed to be the company name) is missing
    df = df.dropna(subset=[columns[0]])

    # Save the DataFrame as CSV
    df.to_csv(f'../data/raw_data/{csv_filename}', index=False)

    return df


In [10]:
def scrape_wikipedia_to_csv(url, csv_filename, table_position, timeout=30):
    """Scrape one ``wikitable sortable`` table into a DataFrame and save it as CSV.

    Parameters
    ----------
    url : str
        Page to download.
    csv_filename : str
        File name written under ``../data/raw_data/`` (relative to the
        current working directory).
    table_position : int
        Index into the list of tables found with class
        ``'wikitable sortable'`` (0 is the first such table on the page).
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot hang
        the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        The scraped table; every value is the cell text as a string.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a 4xx/5xx response.
    IndexError
        If ``table_position`` is outside the range of matching tables.
    ValueError
        If the selected table has no header cells.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The tables share a common class, so find them all and pick by position
    tables = soup.find_all('table', {'class': 'wikitable sortable'})
    try:
        table = tables[table_position]
    except IndexError:
        raise IndexError(
            f"table_position={table_position} is out of range: found "
            f"{len(tables)} 'wikitable sortable' table(s) at {url}"
        ) from None

    columns = []
    companies_data = []

    for row in table.find_all('tr'):
        if row.find_all('td'):
            # Data row. Some tables mark the first cell of each row as a
            # <th> row header; keep it so the row lines up with the header
            # instead of silently losing that value.
            cells = row.find_all(['th', 'td'])
            companies_data.append(
                [cell.get_text(" ", strip=True) for cell in cells]
            )
        else:
            # Header row: only rows without any <td> contribute column
            # names, so row-header cells never inflate the column list.
            columns.extend(
                th.get_text(strip=True) for th in row.find_all('th')
            )

    if not columns:
        raise ValueError(
            f"Table {table_position} at {url} has no header (<th>) cells"
        )

    # Create the DataFrame
    df = pd.DataFrame(companies_data, columns=columns)

    # Drop rows whose first column (assumed to be the company name) is missing
    df = df.dropna(subset=[columns[0]])

    # Save the DataFrame as CSV
    df.to_csv(f'../data/raw_data/{csv_filename}', index=False)

    return df

### AEX. Amsterdam Exchange Index

In [12]:
# Scrape the AEX components page on Markets Insider and save the table to
# ../data/raw_data/aex_stock_data.csv.
url = "https://markets.businessinsider.com/index/components/aex"
csv_filename = 'aex_stock_data.csv'
df = scrape_index_components_to_csv(url, csv_filename)
print(df)

                         Name Latest PricePrevious Close    LowHigh  \
0                    ABN Amro                15.22 15.22  0.00 0.00   
1   Adyen B.V. Parts Sociales          1,520.40 1,520.40  0.00 0.00   
2              Ahold Delhaize                27.16 27.16  0.00 0.00   
3                  Akzo Nobel                67.54 67.54  0.00 0.00   
4               ArcelorMittal                25.20 25.20  0.00 0.00   
5                     ASML NV              913.00 913.00  0.00 0.00   
6           ASR Nederland N.V                43.88 43.88  0.00 0.00   
7                    Heineken                86.60 86.60  0.00 0.00   
8                        IMCD              163.50 163.50  0.00 0.00   
9                   ING Group                14.70 14.70  0.00 0.00   
10                        KPN                  3.39 3.39  0.00 0.00   
11                   NN Group                41.95 41.95  0.00 0.00   
12                    Philips                18.42 18.42  0.00 0.00   
13    

In [13]:
# Scrape the AEX article on Wikipedia; table_position=1 picks the second
# 'wikitable sortable' table. Saved to ../data/raw_data/aex_wikipedia_data.csv.
url = "https://en.wikipedia.org/wiki/AEX_index"
csv_filename = 'aex_wikipedia_data.csv'
df = scrape_wikipedia_to_csv(url, csv_filename, 1)
print(df)

                      Company                                   ICBSector  \
0                       Adyen  support services, financial administration   
1                       Aegon                              life insurance   
2              Ahold Delhaize                food retailers & wholesalers   
3                   AkzoNobel                         specialty chemicals   
4               ArcelorMittal                                iron & steel   
5           ASM International                              semiconductors   
6                ASML Holding                              semiconductors   
7       Universal Music Group                               entertainment   
8           BE Semiconductors                              semiconductors   
9            DSM Firmenich AG                         specialty chemicals   
10                   Heineken                                     brewers   
11                       IMCD                         specialty chemicals   

### BEL 20. Brussels Stock Exchange

In [14]:
# Scrape the BEL 20 components page on Markets Insider and save the table to
# ../data/raw_data/bel_20_stock_data.csv.
url = "https://markets.businessinsider.com/index/components/bel_20"
csv_filename = 'bel_20_stock_data.csv'
df = scrape_index_components_to_csv(url, csv_filename)
print(df)

                                     Name Latest PricePrevious Close  \
0   AB InBev SA-NV (Anheuser-Busch InBev)                55.48 55.16   
1             Ackermans & van Haaren S.A.              158.80 158.10   
2                                ageas NV                41.85 41.54   
3                  Ahold Delhaize (Ahold)                27.09 27.15   
4                          Cofinimmo S.A.                58.30 58.50   
5                            D'Ieteren NV              203.80 204.60   
6              Elia System Operator SA-NV              100.70 100.40   
7                     Engie (ex GDF Suez)                15.79 15.60   
8        Etablissementen Franz Colruyt NV                41.80 42.30   
9     Groupe Bruxelles Lambert S.A. (GBL)                69.58 69.20   
10                         KBC Groep N.V.                68.78 68.14   
11                          NV Bekaert SA                47.28 46.38   
12                 Proximus (ex Belgacom)                  7.35 

In [15]:
# Scrape the BEL 20 article on Wikipedia; table_position=1 picks the second
# 'wikitable sortable' table. Saved to ../data/raw_data/bel_20_wikipedia_data.csv.
url = "https://en.wikipedia.org/wiki/BEL_20"
csv_filename = 'bel_20_wikipedia_data.csv'
df = scrape_wikipedia_to_csv(url, csv_filename, 1)
print(df)

                   Company             ICBSector             Ticker symbol  \
0                 AB InBev       Food & Beverage   Euronext Brussels : ABI   
1   Ackermans & van Haaren    Financial Services  Euronext Brussels : ACKB   
2                 Aedifica           Real Estate   Euronext Brussels : AED   
3                    Ageas             Insurance   Euronext Brussels : AGS   
4                   Aperam       Basic Resources  Euronext Brussels : APAM   
5           arGEN-X [ nl ]           Health Care  Euronext Brussels : ARGX   
6                Cofinimmo           Real Estate  Euronext Brussels : COFB   
7                  Colruyt                Retail  Euronext Brussels : COLR   
8                     Elia             Utilities   Euronext Brussels : ELI   
9                Galapagos           Health Care  Euronext Brussels : GLPG   
10                     GBL    Financial Services  Euronext Brussels : GBLB   
11                     KBC                 Banks   Euronext Brus

### CAC 40. Cotation Assistée en Continu
Benchmark French Stock Market Index

In [16]:
# Scrape the CAC 40 components page on Markets Insider and save the table to
# ../data/raw_data/cac_40_stock_data.csv.
url = "https://markets.businessinsider.com/index/components/cac_40"
csv_filename = 'cac_40_stock_data.csv'
df = scrape_index_components_to_csv(url, csv_filename)
print(df)

                                Name Latest PricePrevious Close    LowHigh  \
0                        Air Liquide              191.22 191.22  0.00 0.00   
1                             Airbus              170.12 170.12  0.00 0.00   
2                      ArcelorMittal                25.20 25.20  0.00 0.00   
3                                AXA                34.58 34.58  0.00 0.00   
4                        BNP Paribas                62.49 62.49  0.00 0.00   
5                           Bouygues                37.65 37.65  0.00 0.00   
6                         Cap Gemini              214.00 214.00  0.00 0.00   
7                          Carrefour                16.00 16.00  0.00 0.00   
8                    Crédit Agricole                13.40 13.40  0.00 0.00   
9                             Danone                59.32 59.32  0.00 0.00   
10                             Engie                15.52 15.52  0.00 0.00   
11                  EssilorLuxottica              209.65 209.65 

In [17]:
# Scrape the CAC 40 article on Wikipedia; table_position=1 picks the second
# 'wikitable sortable' table. Saved to ../data/raw_data/cac_40_wikipedia_data.csv.
url = "https://en.wikipedia.org/wiki/CAC_40"
csv_filename = 'cac_40_wikipedia_data.csv'
df = scrape_wikipedia_to_csv(url, csv_filename, 1)
print(df)

                      Company                  Sector  \
0                 Air Liquide         Basic Materials   
1                      Airbus             Industrials   
2                      Alstom             Industrials   
3               ArcelorMittal         Basic Materials   
4                         AXA      Financial Services   
5                 BNP Paribas      Financial Services   
6                    Bouygues             Industrials   
7                   Capgemini              Technology   
8                   Carrefour      Consumer Defensive   
9             Crédit Agricole      Financial Services   
10                     Danone      Consumer Defensive   
11          Dassault Systèmes              Technology   
12                    Edenred             Industrials   
13                      Engie               Utilities   
14           EssilorLuxottica              Healthcare   
15        Eurofins Scientific              Healthcare   
16                     Hermès  

### ISEQ 20. Ireland Overall Stock Exchange Index

In [18]:
# Scrape the ISEQ components page on Markets Insider and save the table to
# ../data/raw_data/iseq_20_stock_data.csv.
url = "https://markets.businessinsider.com/index/components/iseq"
csv_filename = 'iseq_20_stock_data.csv'
df = scrape_index_components_to_csv(url, csv_filename)
print(df)

                 Name Latest PricePrevious Close      LowHigh          +/-%  \
0             CRH plc                80.16 79.74  80.16 80.16    0.42 0.53%   
1     Kerry Group plc                81.58 82.08  81.58 81.58  -0.50 -0.61%   
2  Kingspan Group plc                83.00 83.32  82.06 83.00  -0.32 -0.38%   

                                            TimeDate    3 Mo.+/-%  \
0  03:01 AM 03/26/2024 03:01:53 AM UTC-0400 26.03...   3.66 7.47%   
1  03:05 AM 03/26/2024 03:05:31 AM UTC-0400 26.03...  8.06 10.75%   
2  09:18 AM 03/26/2024 09:18:19 AM UTC-0400 26.03...  8.06 10.47%   

      6 Mo.+/-%    1 Year+/-%  
0  11.63 62.97%  30.86 66.22%  
1    0.00 0.00%  -7.18 -7.96%  
2    0.00 0.00%  23.60 38.41%  


In [19]:
# Scrape the ISEQ 20 article on Wikipedia; table_position=0 picks the first
# 'wikitable sortable' table. Saved to ../data/raw_data/iseq_20_wikipedia_data.csv.
url = "https://en.wikipedia.org/wiki/ISEQ_20"
csv_filename = 'iseq_20_wikipedia_data.csv'
df = scrape_wikipedia_to_csv(url, csv_filename, 0)
print(df)

                 MNEM code                            Company Domicile
0    Euronext Dublin : A5G                          AIB Group  Ireland
1   Euronext Dublin : BIRG                    Bank of Ireland  Ireland
2    Euronext Dublin : C5H                        Cairn Homes  Ireland
3    Euronext Dublin : CRG                                CRH  Ireland
4    Euronext Dublin : DHG                 Dalata Hotel Group  Ireland
5    Euronext Dublin : EG7                       FBD Holdings  Ireland
6   Euronext Dublin : FLTR              Flutter Entertainment  Ireland
7    Euronext Dublin : GL9                            Glanbia  Ireland
8    Euronext Dublin : GVR               Glenveagh Properties  Ireland
9    Euronext Dublin : GRP               Greencoat Renewables  Ireland
10  Euronext Dublin : IR5B            Irish Continental Group  Ireland
11  Euronext Dublin : IRES  Irish Residential Properties REIT  Ireland
12   Euronext Dublin : KRZ                        Kerry Group  Ireland
13   E

### OBX. Oslo Stock Exchange

In [21]:
# Scrape the OBX components page on Markets Insider and save the table to
# ../data/raw_data/obx_stock_data.csv.
url = "https://markets.businessinsider.com/index/components/obx"
csv_filename = 'obx_stock_data.csv'
df = scrape_index_components_to_csv(url, csv_filename)
print(df)

                                    Name Latest PricePrevious Close  \
0                     Aker Solutions ASA                  3.25 3.20   
1                             BW LPG Ltd                10.37 10.28   
2             Det Norske Oljeselskap ASA                23.64 23.66   
3              DNO International ASA (A)                  0.81 0.80   
4                                Equinor                24.72 25.14   
5              Gjensidige Forsikring ASA                13.57 13.49   
6                                   Mowi                16.82 17.10   
7               Nordic Semiconductor ASA                  7.33 7.34   
8                        Norsk Hydro ASA                  5.18 5.10   
9                  Norwegian Air Shuttle                  1.39 1.39   
10                              Orkla AS                  6.54 6.50   
11                Otello Corporation ASA                  0.66 0.68   
12            Petroleum Geo-Services ASA                  0.67 0.63   
13    

In [22]:
# Scrape the OBX Index article on Wikipedia; table_position=0 picks the first
# 'wikitable sortable' table. Saved to ../data/raw_data/obx_wikipedia_data.csv.
url = "https://en.wikipedia.org/wiki/OBX_Index"
csv_filename = 'obx_wikipedia_data.csv'
df = scrape_wikipedia_to_csv(url, csv_filename, 0)
print(df)

                  Company                                ICBsubsector  \
0                 Aker BP                        oil: crude producers   
1                  BW LPG                       marine transportation   
2                DNB Bank                                       banks   
3                 Equinor                      integrated oil and gas   
4               Frontline                       marine transportation   
5      Golden Ocean Group                       marine transportation   
6          Hafnia Limited                       marine transportation   
7    Höegh Autoliners ASA                       marine transportation   
8       Kongsberg Gruppen                     diversified industrials   
9                    Mowi  farming, fishing, ranching and plantations   
10    MPC Container Ships                       marine transportation   
11                    NEL                  renewable energy equipment   
12   Nordic Semiconductor                          

### OSEBX. Oslo Børs Benchmark Index

In [23]:
# Scrape the OSEBX components page on Markets Insider and save the table to
# ../data/raw_data/osebx_stock_data.csv.
url = "https://markets.businessinsider.com/index/components/osebx"
csv_filename = 'osebx_stock_data.csv'
df = scrape_index_components_to_csv(url, csv_filename)
print(df)

                                    Name Latest PricePrevious Close  \
0              ABG Sundal Collier ASAShs                  0.51 0.51   
1                     Af Gruppen Asa (A)                11.94 11.60   
2                        Aker ASAShs -A-                53.20 52.10   
3                     Aker Solutions ASA                  3.25 3.20   
4          American Shipping Company ASA                  2.33 2.29   
5                               Atea ASA                10.94 11.00   
6                 Avance Gas Holding Ltd                10.60 10.40   
7                             Bakkafrost                58.60 58.85   
8                Biotec Pharmacon ASAShs                  2.39 2.43   
9                             BW LPG Ltd                10.37 10.28   
10            Det Norske Oljeselskap ASA                23.64 23.66   
11             DNO International ASA (A)                  0.81 0.80   
12          Entra ASA Unitary 144A-Reg S                  9.42 9.44   
13    

In [24]:
# Scrape a Wikipedia table; table_position=1 picks the second
# 'wikitable sortable' table. Saved to ../data/raw_data/osebx_wikipedia_data.csv.
# NOTE(review): the output file is named osebx_* but the URL is the OSEAX
# article — confirm this is the intended source.
url = "https://en.wikipedia.org/wiki/OSEAX"
csv_filename = 'osebx_wikipedia_data.csv'
df = scrape_wikipedia_to_csv(url, csv_filename, 1)
print(df)

    Ticker                   Name Mkt Cap (Million NOK)
0     ACTA           Acta Holding                659.28
1      ACY                 Acergy              24124.35
2      AFG             AF Gruppen               2840.85
3      AFK  Arendals Fossekompani               3538.59
4      AGR              AGR Group               1758.13
..     ...                    ...                   ...
174    WRL    Wentworth Resources                178.99
175  WWASA       Wilh. Wilhelmsen                  7458
176    WWI   Wilh. Wilhelmsen ...               4727.96
177   WWIB   Wilh. Wilhelmsen ...               1619.81
178    YAR     Yara International              83428.89

[179 rows x 3 columns]


### PSI. Portuguese Stock Index

In [25]:
# Scrape the PSI 20 components page on Markets Insider and save the table to
# ../data/raw_data/psi_20_stock_data.csv.
url = "https://markets.businessinsider.com/index/components/psi20"
csv_filename = 'psi_20_stock_data.csv'
df = scrape_index_components_to_csv(url, csv_filename)
print(df)

                                                 Name  \
0                                    ALTRI SGPS SAShs   
1                                      Cofina SGPS SA   
2                                  EDP Renovaveis, SA   
3                                        GALP Energia   
4     Grupo EDP S.A. (Electricidade de Portugal S.A.)   
5                          Jeronimo Martins SGPS S.A.   
6                                  Mota-Engil SGPS SA   
7                                        NOS SGPS, SA   
8                               Portugal Telecom S.A.   
9       REN - Redes Energeticas Nacionais SGPS, SAShs   
10  Sociedade de Investimento e Gestao SGPS SA SEMAPA   
11                                      Sonae SGPS SA   
12                                   Sonaecom SGPS SA   
13                              The Navigator Company   

   Latest PricePrevious Close      LowHigh          +/-%  \
0                   5.06 4.93    5.06 5.06    0.13 2.64%   
1                   0.41

In [26]:
# Scrape the PSI-20 article on Wikipedia; table_position=1 picks the second
# 'wikitable sortable' table. Saved to ../data/raw_data/psi_20_wikipedia_data.csv.
url = "https://en.wikipedia.org/wiki/PSI-20"
csv_filename = 'psi_20_wikipedia_data.csv'
df = scrape_wikipedia_to_csv(url, csv_filename, 1)
print(df)

                        Company                              Industry Ticker  \
0                         Altri                       Basic Resources   ALTR   
1     Banco Comercial Português                                 Banks    BCP   
2             Corticeira Amorim           Industrial Goods & Services    COR   
3      CTT Correios de Portugal           Industrial Goods & Services    CTT   
4                EDP Renováveis                             Utilities   EDPR   
5          Energias de Portugal                             Utilities    EDP   
6                  Galp Energia                                Energy   GALP   
7                       Ibersol                      Travel & Leisure    IBS   
8              Jerónimo Martins  Personal Care, Drug & Grocery Stores    JMT   
9                    Mota-Engil              Construction & Materials    EGL   
10                          NOS                    Telecommunications    NOS   
11                     Novabase         