## Preparations

In [246]:
# Imports
import requests
import pandas as pd
from bs4 import BeautifulSoup
from bs4 import NavigableString, Tag

In [247]:
# Search URL of the ECG audit database: all filters left empty, 50 entries per page.
# Used for the initial request (total entry count, first page of the table).
url = (
    'https://audit.ecogood.org/firmenauskunft-fvz/'
    '?ser_orga=&ser_land=&ser_branche=&ser_ftevon=&ser_ftebis=&ser_ort='
    '&ser_b2b=&ser_b2c=&ser_fvz_view=_md&ser_fvz_pagesize=50'
    '&ser_rep=&ser_standard=&ser_auditart='
)

# Prefix for the paginated table URLs: the page number is appended after
# the trailing 'frm-page-19795=' parameter.
base_url = (
    'https://audit.ecogood.org/firmenauskunft-fvz/'
    '?ser_orga&ser_land&ser_branche&ser_ftevon&ser_ftebis&ser_ort'
    '&ser_b2b&ser_b2c&ser_fvz_view=_md&ser_fvz_pagesize=50'
    '&ser_rep&ser_standard&ser_auditart&frm-page-19795='
)

In [248]:
# Create list with URLs to scrape through (as max. 50 entries per page are displayed)

# Page size requested through the `ser_fvz_pagesize=50` query parameter of the URLs
entries_per_page = 50

# Creating soup (the timeout keeps a stalled connection from hanging the kernel)
response = requests.get(url, timeout=30)
print(f'Response status: {response.status_code}')
print()
# Stop here on an HTTP error instead of failing later while parsing an error page
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Finding out number of total entries (read from the first <strong> element of the page)
total_entries = int(soup.select('strong')[0].get_text())
print(f'Total entries in ECG database: {total_entries}')

# Finding out number of URLs (pages).
# Ceiling division: a partially filled last page counts as a page, but an exact
# multiple of the page size must not produce an additional, empty page.
number_of_urls = (total_entries + entries_per_page - 1) // entries_per_page
print(f'Number of table pages: {number_of_urls}')
print()

# Page numbers start at 1 and are appended to the pagination base URL
dynamic_url_list = [base_url + str(page) for page in range(1, number_of_urls + 1)]
print(f'URL list created. Length of dynamic_url_list: {len(dynamic_url_list)}')

Response status: 200

Total entries in ECG database: 1240
Number of table pages: 25

URL list created. Length of dynamic_url_list: 25


## Checks

In [203]:
# Checking soup: only a preview of the parsed document is printed, because the
# complete prettified page would flood the cell output and bloat the notebook file
preview_length = 2000
print(soup.prettify()[:preview_length])

<!DOCTYPE html>
<html class="avada-html-layout-wide avada-html-header-position-top" lang="de-DE" prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#">
 <head>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Firmenauskunft (FVZ) – Interactive – AuditAdministration
  </title>
  <meta content="max-image-preview:large" name="robots"/>
  <link href="https://audit.ecogood.org/firmenauskunft-fvz/" hreflang="de" rel="alternate"/>
  <link href="https://audit.ecogood.org/en-firmenauskunft-fvz-2/?lang=en" hreflang="en" rel="alternate"/>
  <link href="https://audit.ecogood.org/it-firmenauskunft-fvz-2/?lang=it" hreflang="it" rel="alternate"/>
  <link href="https://audit.ecogood.org/informaciones-sobre-la-empresa/?lang=es" hreflang="es" rel="alternate"/>
  <link href="//s.w.org" rel="dns-prefetch"/>
  <link href="https://audit.ecogood

In [210]:
# Listing the ids of the first six tables: the main table contains nested sub tables,
# and more "RW_Aufträge" sub tables follow after the ones shown here
table_ids = [table.get('id') for table in soup.select('table')[:6]]
for table_id in table_ids:
    print(table_id)

RW_Firmenliste
RW_Aufträge
RW_Aufträge
RW_Aufträge
RW_Aufträge
RW_Aufträge


In [206]:
# Checking column titles
# The header row is looked up from `soup` directly, so this check does not depend
# on `company_data`, which is only assigned in a later cell of the notebook.
header_row = soup.select('table')[0].select('tr')[0]

# Text of every child of the header row, with surrounding whitespace removed
header_texts = [header.get_text().strip() for header in header_row]

# Children without any text (e.g. whitespace between the cells) are not column titles
column_titles = [title for title in header_texts if title != '']

print(column_titles)

['ID', 'Organisation', 'Land', 'PLZ', 'Ort', 'Größe', 'Branche', 'Bilanzdetails…', 'BilanztypPeer-Audit…', 'gültig bis…', 'Dokumente…']


## Scraping the first page

In [250]:
# The first <table> of the page is the one holding the company list;
# keep it under its own name for the scraping steps
tables_in_page = soup.select('table')
company_data = tables_in_page[0]

In [209]:
# Creating empty lists, one per column of the company table
ids = []
organizations = []
countries = []
postal_codes = []
cities = []
company_sizes = []
economic_sectors = []
# Placeholders for columns that are not scraped by this cell
balance_details = []
balance_types = []
balance_valid_until = []
documents = []

# Scraping main data
for row in company_data.select('tbody tr'):
    # Extract and split the row text once instead of once per column
    row_text = row.get_text().strip()

    # Rows without any text carry no data; in the subtables, the rows begin with '…'
    if not row_text or row_text.startswith('…'):
        continue

    fields = row_text.split('\n')
    # Validate before appending, so that a malformed row cannot leave the
    # column lists with different lengths
    if len(fields) < 7:
        raise ValueError(f'Unexpected row layout, expected at least 7 fields: {fields!r}')

    ids.append(fields[0])
    organizations.append(fields[1])
    countries.append(fields[2])
    postal_codes.append(fields[3])
    cities.append(fields[4])
    company_sizes.append(fields[5])
    economic_sectors.append(fields[6])

# Check
len(ids)

50

In [208]:
# Building the dataframe from the scraped column lists.
# The balance detail / balance type / valid-until / documents columns are
# intentionally left out, as their lists are not filled yet.
first_page_columns = dict(
    id=ids,
    organization=organizations,
    country=countries,
    postal_code=postal_codes,
    city=cities,
    company_size=company_sizes,
    economic_sector=economic_sectors,
)
ecg_df = pd.DataFrame(first_page_columns)
ecg_df

Unnamed: 0,id,organization,country,postal_code,city,company_size,economic_sector
0,14885,4e solutions GmbH,DE,70794,Filderstadt,(2)3-10,Handel / Konsum
1,14886,4plus5,DE,89077,Ulm,(2)3-10,Baugewerbe / Architektur
2,35275,A & K Engemann GbR,DE,34439,Willebadessen,(2)3-10,Nahrungsmittel / Land / Forstwirtschaft
3,44668,A&P Steuerberatungsgesellschaft mbH,DE,14480,Potsdam,(3)11-25,Beratung / Consulting
4,29652,AAP-ARCHITEKTEN ZT-GMBH,AT,1080,Wien,(3)11-25,Baugewerbe / Architektur
5,14887,Abitare,DE,10829,Berlin,(2)3-10,Handwerk
6,46986,"abw – gemeinnützige Gesellschaft für Arbeit, B...",DE,14057,Berlin,(3)11-25,Bildung / Universität / FH / Schulen
7,92361,"Action Waterscape, S.L.",ES,46200,Paiporta,(8) Todas las tallas,Otros
8,14888,ad fontes,DE,27624,Geestland,(2)3-10,Elektro / Elektronik
9,40981,ADAMAH BioHof G. Zoubek Vertriebs KG,AT,2280,Glinzendorf,(2)3-10,Nahrungsmittel / Land / Forstwirtschaft


## Scraping all pages of the table

In [253]:
# Creating empty lists, one per column of the company table
ids = []
organizations = []
countries = []
postal_codes = []
cities = []
company_sizes = []
economic_sectors = []
# Placeholders for columns that are not scraped by this cell
balance_details = []
balance_types = []
balance_valid_until = []
documents = []

# Scraping
# Page-specific names are used inside the loop so that the search `url`, `soup`
# and `company_data` of the earlier cells are not overwritten with the last page.
for page_number, page_url in enumerate(dynamic_url_list, start=1):
    # Preparations (timeout and status check so a failed request stops the run
    # instead of being parsed as if it were a table page)
    page_response = requests.get(page_url, timeout=30)
    page_response.raise_for_status()
    page_soup = BeautifulSoup(page_response.content, "html.parser")
    page_table = page_soup.select('table')[0]

    # Scraping main data
    for row in page_table.select('tbody tr'):
        # Extract and split the row text once instead of once per column
        row_text = row.get_text().strip()

        # Rows without any text carry no data; in the subtables, the rows begin with '…'
        if not row_text or row_text.startswith('…'):
            continue

        fields = row_text.split('\n')
        # Validate before appending, so that a malformed row cannot leave the
        # column lists with different lengths
        if len(fields) < 7:
            raise ValueError(
                f'Unexpected row layout on page {page_number}, '
                f'expected at least 7 fields: {fields!r}'
            )

        ids.append(fields[0])
        organizations.append(fields[1])
        countries.append(fields[2])
        postal_codes.append(fields[3])
        cities.append(fields[4])
        company_sizes.append(fields[5])
        economic_sectors.append(fields[6])

    # Progress message once per page, with the actual (1-based) page number
    print(f'Page {page_number} scraped.')

# Check
len(ids)

1240

In [266]:
# Assembling the complete dataframe from the scraped column lists.
# The balance detail / balance type / valid-until / documents columns are
# intentionally left out, as their lists are not filled yet.
scraped_columns = dict(
    id=ids,
    organization=organizations,
    country=countries,
    postal_code=postal_codes,
    city=cities,
    company_size=company_sizes,
    economic_sector=economic_sectors,
)
ecg_df = pd.DataFrame(scraped_columns)

# Quick overview: dimensions, missing values per column, column dtypes
shape_report = f'Shape: {ecg_df.shape}\n'
missing_report = f'Missing values: \n{ecg_df.isna().sum()}\n'
dtype_report = f'Datatype: \n{ecg_df.dtypes}\n'
print(shape_report, missing_report, dtype_report, sep='\n')
display(ecg_df)

Shape: (1240, 7)

Missing values: 
id                 0
organization       0
country            0
postal_code        0
city               0
company_size       0
economic_sector    0
dtype: int64

Datatype: 
id                 object
organization       object
country            object
postal_code        object
city               object
company_size       object
economic_sector    object
dtype: object



Unnamed: 0,id,organization,country,postal_code,city,company_size,economic_sector
0,14885,4e solutions GmbH,DE,70794,Filderstadt,(2)3-10,Handel / Konsum
1,14886,4plus5,DE,89077,Ulm,(2)3-10,Baugewerbe / Architektur
2,35275,A & K Engemann GbR,DE,34439,Willebadessen,(2)3-10,Nahrungsmittel / Land / Forstwirtschaft
3,44668,A&P Steuerberatungsgesellschaft mbH,DE,14480,Potsdam,(3)11-25,Beratung / Consulting
4,29652,AAP-ARCHITEKTEN ZT-GMBH,AT,1080,Wien,(3)11-25,Baugewerbe / Architektur
...,...,...,...,...,...,...,...
1235,118849,Zentrum Zeitgenössischer Musik – Kunsthaus Nexus,AT,5760,Saalfelden,(2)3-10,Kunst / Kultur / Unterhaltung
1236,92844,Zimmerei Diedrich – Die Gesundhausbauer GmbH,DE,37434,Rüdershausen,(3)11-25,Baugewerbe / Architektur
1237,45156,zimmerwerkstatt Nicola Bannier und Michael Weber,DE,29456,Hitzacker,(1)1-2,Handwerk
1238,14718,Zukunftswerk eG,DE,82319,Starnberg,(2)3-10,Beratung / Consulting


## Saving the dataframe

In [259]:
# Writing the dataframe to a CSV file in the working directory, without the index column
csv_filename = 'ecg_df.csv'
ecg_df.to_csv(csv_filename, index=False)

# Attention

In [270]:
# TODO: One record from Valencia still needs to be corrected