## Imports

In [7]:
# Imports
import re
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from bs4 import NavigableString, Tag

## Scraping organization-oriented view
https://audit.ecogood.org/firmenauskunft-fvz/

#### URL creation

In [3]:
# Search URL of the ECG database: all search parameters empty, 50 entries per page
url = (
    'https://audit.ecogood.org/firmenauskunft-fvz/'
    '?ser_orga=&ser_land=&ser_branche=&ser_ftevon=&ser_ftebis=&ser_ort='
    '&ser_b2b=&ser_b2c=&ser_fvz_view=_md&ser_fvz_pagesize=50'
    '&ser_rep=&ser_standard=&ser_auditart='
)

# Prefix for the individual table pages; the page number is appended after "frm-page-19795="
base_url = (
    'https://audit.ecogood.org/firmenauskunft-fvz/'
    '?ser_orga&ser_land&ser_branche&ser_ftevon&ser_ftebis&ser_ort'
    '&ser_b2b&ser_b2c&ser_fvz_view=_md&ser_fvz_pagesize=50'
    '&ser_rep&ser_standard&ser_auditart&frm-page-19795='
)

In [4]:
# Create list with URLs to scrape through (the table shows at most 50 entries per page)

# Must match the ser_fvz_pagesize value used in url / base_url
entries_per_page = 50

# Creating soup (with a timeout so an unresponsive server cannot hang the notebook)
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.content, "html.parser")
print(f'Response status: {response.status_code}')
print()

# Finding out number of total entries (text of the first <strong> element on the page)
total_entries = int(soup.select('strong')[0].get_text())
print(f'Total entries in ECG database: {total_entries}')

# Finding out number of URLs (pages): ceiling division, so a total that is an exact
# multiple of the page size no longer produces one extra page
number_of_urls = (total_entries + entries_per_page - 1) // entries_per_page
print(f'Number of table pages: {number_of_urls}')
print()

# Page numbers start at 1 and are appended to the end of base_url
dynamic_url_list = [base_url + str(page_number) for page_number in range(1, number_of_urls + 1)]
print(f'URL list created. Length of dynamic_url_list: {len(dynamic_url_list)}')

Response status: 200

Total entries in ECG database: 1266
Number of table pages: 26

URL list created. Length of dynamic_url_list: 26


#### Checks

In [5]:
# Checking soup: print the parsed page as indented HTML
pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html>
<html class="avada-html-layout-wide avada-html-header-position-top" lang="de-DE" prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#">
 <head>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Firmenauskunft (FVZ) – Interactive – AuditAdministration
  </title>
  <meta content="max-image-preview:large" name="robots"/>
  <link href="https://audit.ecogood.org/firmenauskunft-fvz/" hreflang="de" rel="alternate"/>
  <link href="https://audit.ecogood.org/en-firmenauskunft-fvz-2/?lang=en" hreflang="en" rel="alternate"/>
  <link href="https://audit.ecogood.org/it-firmenauskunft-fvz-2/?lang=it" hreflang="it" rel="alternate"/>
  <link href="https://audit.ecogood.org/informaciones-sobre-la-empresa/?lang=es" hreflang="es" rel="alternate"/>
  <link href="//s.w.org" rel="dns-prefetch"/>
  <link href="https://audit.ecogood

In [6]:
# Checking the ids of the first six tables (the main table contains sub tables;
# more "RW_Aufträge" sub tables follow after these)
first_tables = soup.select('table')[:6]
for sub_table in first_tables:
    print(sub_table.get('id'))

RW_Firmenliste
RW_Aufträge
RW_Aufträge
RW_Aufträge
RW_Aufträge
RW_Aufträge


In [7]:
# Checking column titles: non-empty texts of the children of the first table row
company_data = soup.select('table')[0]
header_row = company_data.select('tr')[0]
header_texts = (header_cell.get_text().strip() for header_cell in header_row)
column_titles = [title for title in header_texts if title != '']

print(column_titles)

['ID', 'Organisation', 'Land', 'PLZ', 'Ort', 'Größe', 'Branche', 'Bilanzdetails…', 'BilanztypPeer-Audit…', 'gültig bis…', 'Dokumente…']


#### Scraping the first page (for testing)

In [8]:
# The first table on the page holds the relevant information
page_tables = soup.select('table')
company_data = page_tables[0]

In [9]:
# Creating empty lists, one per dataframe column
ids = []
organizations = []
countries = []
postal_codes = []
cities = []
company_sizes = []
economic_sectors = []
balance_details = []

# Scraping main data
for row in company_data.select('tbody tr'):
    # Extract and clean the row text once instead of once per column
    row_text = row.get_text().strip()

    # Skip rows without any text (indexing [0] on them would raise IndexError)
    # and sub-table rows, which begin with '…'
    if not row_text or row_text.startswith('…'):
        continue

    fields = row_text.split('\n')

    # Unpack before appending: a row with fewer than seven fields raises ValueError here,
    # before any list is touched, so the column lists always keep equal lengths
    (row_id, organization, country, postal_code,
     city, company_size, economic_sector) = fields[:7]

    ids.append(row_id)
    organizations.append(organization)
    countries.append(country)
    postal_codes.append(postal_code)
    cities.append(city)
    company_sizes.append(company_size)
    economic_sectors.append(economic_sector)
    # Everything after the seventh field is kept as a raw list for later parsing
    balance_details.append(fields[7:])

# Check
len(ids)

50

In [10]:
# Assembling the scraped column lists into one dataframe
column_names = ['id', 'organization', 'country', 'postal_code',
                'city', 'company_size', 'economic_sector', 'balance_details']
column_values = [ids, organizations, countries, postal_codes,
                 cities, company_sizes, economic_sectors, balance_details]
ecg_org_df = pd.DataFrame(dict(zip(column_names, column_values)))
ecg_org_df

Unnamed: 0,id,organization,country,postal_code,city,company_size,economic_sector,balance_details
0,14885,4e solutions GmbH,DE,70794,Filderstadt,(2)3-10,Handel / Konsum,"[, , …6451, 4.1 StandardPeerevaluation, 30.03..."
1,14886,4plus5,DE,89077,Ulm,(2)3-10,Baugewerbe / Architektur,"[, , …6452, M5.0 KompaktbilanzPeerevaluation, ..."
2,35275,A & K Engemann GbR,DE,34439,Willebadessen,(2)3-10,Nahrungsmittel / Land / Forstwirtschaft,"[, , …24837, M5.0 KompaktbilanzPeerevaluation]"
3,44668,A&P Steuerberatungsgesellschaft mbH,DE,14480,Potsdam,(3)11-25,Beratung / Consulting,"[, , …44667, 4.0 StandardPeerevaluation, 31.1..."
4,29652,AAP-ARCHITEKTEN ZT-GMBH,AT,1080,Wien,(3)11-25,Baugewerbe / Architektur,"[, , …29651, 4.1 StandardDeskaudit, , Bericht..."
5,14887,Abitare,DE,10829,Berlin,(2)3-10,Handwerk,"[, , …6453, 4.1 StandardDeskaudit, 30.11.2020..."
6,46986,"abw – gemeinnützige Gesellschaft für Arbeit, B...",DE,14057,Berlin,(3)11-25,Bildung / Universität / FH / Schulen,"[, , …46985, M5.0 KompaktbilanzPeerevaluation,..."
7,92361,"Action Waterscape, S.L.",ES,46200,Paiporta,(8) Todas las tallas,Otros,"[, , …92518, M5.0 Balance completoauditoría in..."
8,14888,ad fontes,DE,27624,Geestland,(2)3-10,Elektro / Elektronik,"[, , …6454, M5.0 KompaktbilanzPeerevaluation, ..."
9,40981,ADAMAH BioHof G. Zoubek Vertriebs KG,AT,2280,Glinzendorf,(2)3-10,Nahrungsmittel / Land / Forstwirtschaft,"[, , …40980, 4.1 StandardDeskaudit, 30.06.201..."


In [13]:
# Rows whose stringified balance_details contain 'liegen keine' get a missing value instead
no_balance_mask = ecg_org_df['balance_details'].astype(str).str.contains('liegen keine')
ecg_org_df.loc[no_balance_mask, 'balance_details'] = None

# Check
ecg_org_df['balance_details']

0     [, , …6451, 4.1 StandardPeerevaluation,  30.03...
1     [, , …6452, M5.0 KompaktbilanzPeerevaluation, ...
2        [, , …24837, M5.0 KompaktbilanzPeerevaluation]
3     [, , …44667, 4.0 StandardPeerevaluation,  31.1...
4     [, , …29651, 4.1 StandardDeskaudit,  , Bericht...
5     [, , …6453, 4.1 StandardDeskaudit,  30.11.2020...
6     [, , …46985, M5.0 KompaktbilanzPeerevaluation,...
7     [, , …92518, M5.0 Balance completoauditoría in...
8     [, , …6454, M5.0 KompaktbilanzPeerevaluation, ...
9     [, , …40980, 4.1 StandardDeskaudit,  30.06.201...
10    [, , …138298, M5.0 VollbilanzBesuchsaudit,  31...
11                                                 None
12        [, , …150226, M5.0 KompaktbilanzBesuchsaudit]
13    [, , …30336, M5.0 KompaktbilanzPeerevaluation,...
14      [, , …29634, 4.1 StandardDeskaudit,  , Bericht]
15    [, , …132862, M5.0 VollbilanzBesuchsaudit,  31...
16                                                 None
17                                              

In [15]:
# Check entry 1: balance_details of id '14885' as a plain list
entry_mask = ecg_org_df['id'] == '14885'
ecg_org_df.loc[entry_mask, 'balance_details'].tolist()

[['',
  '',
  '…6451',
  '4.1 StandardPeerevaluation',
  ' 30.03.2020  690 ',
  'Bericht Zertifikat']]

In [16]:
# Check entry 2: balance_details of id '14886' as a plain list
entry_mask = ecg_org_df['id'] == '14886'
ecg_org_df.loc[entry_mask, 'balance_details'].tolist()

[['',
  '',
  '…6452',
  'M5.0 KompaktbilanzPeerevaluation',
  ' 28.02.2021   ',
  ' Zertifikat']]

In [17]:
# Check entry 3: balance_details of id '35275' as a plain list
entry_mask = ecg_org_df['id'] == '35275'
ecg_org_df.loc[entry_mask, 'balance_details'].tolist()

[['', '', '…24837', 'M5.0 KompaktbilanzPeerevaluation']]

In [18]:
# Check for multiple audit entries: the scraper writes all of them into one cell
entry_mask = ecg_org_df['id'] == '78313'
ecg_org_df.loc[entry_mask, 'balance_details'].tolist()

[['',
  '',
  '…154215',
  'M5.0 VollbilanzBesuchsaudit',
  ' ',
  ' ',
  '',
  '',
  '…78312',
  'M5.0 VollbilanzPeerevaluation',
  ' 31.07.2024  266 ',
  'Bericht Zertifikat']]

In [19]:
# Check another multiple audit entry (id '29652')
entry_mask = ecg_org_df['id'] == '29652'
ecg_org_df.loc[entry_mask, 'balance_details'].tolist()

[['',
  '',
  '…29651',
  '4.1 StandardDeskaudit',
  ' ',
  'Bericht ',
  '',
  '',
  '…28492',
  '3.0 StandardDeskaudit',
  ' ',
  'Bericht']]

In [20]:
# Check another multiple audit entry (id '14818')
entry_mask = ecg_org_df['id'] == '14818'
ecg_org_df.loc[entry_mask, 'balance_details'].tolist()

[['',
  '',
  '…32990',
  'M5.0 KompaktbilanzPeerevaluation',
  ' 31.05.2023  437 ',
  ' Zertifikat',
  '',
  '',
  '…6371',
  'M5.0 KompaktbilanzBesuchsaudit',
  ' 31.07.2020 489 ',
  'Bericht Testat',
  '',
  '',
  '…36551',
  '4.1 StandardDeskaudit',
  ' 31.01.2018 405 ',
  'Bericht Testat']]

In [21]:
# Show id and balance_details for rows whose stringified balance_details contain two
# consecutive digits followed by any character.
# Raw string: '\d' in a normal string literal is an invalid escape sequence and triggers a warning.
ecg_org_df[ecg_org_df['balance_details'].astype(str).str.contains(r'\d\d.')][['id','balance_details']]

Unnamed: 0,id,balance_details
0,14885,"[, , …6451, 4.1 StandardPeerevaluation, 30.03..."
1,14886,"[, , …6452, M5.0 KompaktbilanzPeerevaluation, ..."
2,35275,"[, , …24837, M5.0 KompaktbilanzPeerevaluation]"
3,44668,"[, , …44667, 4.0 StandardPeerevaluation, 31.1..."
4,29652,"[, , …29651, 4.1 StandardDeskaudit, , Bericht..."
5,14887,"[, , …6453, 4.1 StandardDeskaudit, 30.11.2020..."
6,46986,"[, , …46985, M5.0 KompaktbilanzPeerevaluation,..."
7,92361,"[, , …92518, M5.0 Balance completoauditoría in..."
8,14888,"[, , …6454, M5.0 KompaktbilanzPeerevaluation, ..."
9,40981,"[, , …40980, 4.1 StandardDeskaudit, 30.06.201..."


In [23]:
# Extracting information from balance_details
#
# The extraction reads one balance per six list items; the combined "valid until" date and
# score text is taken from offset 4 of each group (items 4, 10, 16, ...).
# Up to seven balances per organization are extracted.

items_per_balance = 6
date_score_offset = 4
max_balances = 7  # the unpacking at the end of this cell expects exactly seven slots

number_of_balances_org_table = []
valid_until_dates_by_slot = [[] for _ in range(max_balances)]
scores_by_slot = [[] for _ in range(max_balances)]

for balance_detail in ecg_org_df['balance_details']:

    # number_of_balances_org_table
    try:
        item_count = len(balance_detail)
    except TypeError:
        # Value without a length (e.g. None where balance_details was set to missing)
        number_of_balances_org_table.append(None)
    else:
        # Ceiling division; a list of up to six items (an empty one included) counts as one balance
        balance_count = max(1, -(-item_count // items_per_balance))
        number_of_balances_org_table.append(balance_count if balance_count <= max_balances else '>7')

    for slot in range(max_balances):
        try:
            date_score_text = balance_detail[date_score_offset + slot * items_per_balance]
        except (TypeError, IndexError):
            # No list at all, or the list holds fewer balances than this slot
            valid_until_dates_by_slot[slot].append(None)
            scores_by_slot[slot].append(None)
            continue

        # valid_until_date: characters 1-10 of the item; a blank item yields ''
        valid_until_dates_by_slot[slot].append(date_score_text[1:11])

        # score: integer that follows the date; None when that part is empty or not a number
        try:
            scores_by_slot[slot].append(int(date_score_text[12:]))
        except ValueError:
            scores_by_slot[slot].append(None)

# Expose the per-slot lists under the individual names
(balance1_valid_until_date,
 balance2_valid_until_date,
 balance3_valid_until_date,
 balance4_valid_until_date,
 balance5_valid_until_date,
 balance6_valid_until_date,
 balance7_valid_until_date) = valid_until_dates_by_slot

(balance1_score,
 balance2_score,
 balance3_score,
 balance4_score,
 balance5_score,
 balance6_score,
 balance7_score) = scores_by_slot

#### Scraping all pages of the table

In [24]:
# Fresh, independent empty lists: one per dataframe column
(ids, organizations, countries, postal_codes,
 cities, company_sizes, economic_sectors, balance_details) = ([] for _ in range(8))

In [25]:
# Scraping every table page and appending each main-table row to the column lists

# Ids whose organization name contains a line break; their name is spread over two fields
ids_with_linebreak_in_name = {'92435'}

# The loop variable is deliberately not called `url`, so the search URL defined at the
# top of the notebook is not overwritten by the last page URL
for i, page_url in enumerate(dynamic_url_list, start=1):
    # Preparations (timeout so a stalled request cannot hang the run; HTTP errors stop it)
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    company_data = soup.select('table')[0]

    # Scraping main data
    for row in company_data.select('tbody tr'):
        # Extract and clean the row text once instead of once per column
        row_text = row.get_text().strip()

        # Skip rows without any text (indexing [0] on them would raise IndexError)
        # and sub-table rows, which begin with '…'
        if not row_text or row_text.startswith('…'):
            continue

        fields = row_text.split('\n')

        if fields[0] in ids_with_linebreak_in_name:
            # Re-join the two name parts so the remaining fields line up with the other rows
            fields[1:3] = [fields[1] + ' ' + fields[2]]

        # Unpack before appending: a row with fewer than seven fields raises ValueError here,
        # before any list is touched, so the column lists always keep equal lengths
        (row_id, organization, country, postal_code,
         city, company_size, economic_sector) = fields[:7]

        ids.append(row_id)
        organizations.append(organization)
        countries.append(country)
        postal_codes.append(postal_code)
        cities.append(city)
        company_sizes.append(company_size)
        economic_sectors.append(economic_sector)
        # Everything after the seventh field is kept as a raw list for later parsing
        balance_details.append(fields[7:])

    # Show status
    print(f'Page {i}/{len(dynamic_url_list)} scraped.')

# Check
print(f'\nAll pages scraped. Length: {len(ids)} entries.')

Page 1/26 scraped.
Page 2/26 scraped.
Page 3/26 scraped.
Page 4/26 scraped.
Page 5/26 scraped.
Page 6/26 scraped.
Page 7/26 scraped.
Page 8/26 scraped.
Page 9/26 scraped.
Page 10/26 scraped.
Page 11/26 scraped.
Page 12/26 scraped.
Page 13/26 scraped.
Page 14/26 scraped.
Page 15/26 scraped.
Page 16/26 scraped.
Page 17/26 scraped.
Page 18/26 scraped.
Page 19/26 scraped.
Page 20/26 scraped.
Page 21/26 scraped.
Page 22/26 scraped.
Page 23/26 scraped.
Page 24/26 scraped.
Page 25/26 scraped.
Page 26/26 scraped.

All pages scraped. Length: 1266 entries.


In [27]:
# Assembling the basic dataframe from the scraped column lists
column_names = ['id', 'organization', 'country', 'postal_code',
                'city', 'company_size', 'economic_sector', 'balance_details']
column_values = [ids, organizations, countries, postal_codes,
                 cities, company_sizes, economic_sectors, balance_details]
ecg_org_df = pd.DataFrame(dict(zip(column_names, column_values)))

# Overview: dimensions, column dtypes and the frame itself
frame_shape = ecg_org_df.shape
print(f'Shape: {frame_shape}\n')
frame_dtypes = ecg_org_df.dtypes
print(f'Datatype: \n{frame_dtypes}\n')
display(ecg_org_df)

Shape: (1266, 8)

Datatype: 
id                 object
organization       object
country            object
postal_code        object
city               object
company_size       object
economic_sector    object
balance_details    object
dtype: object



Unnamed: 0,id,organization,country,postal_code,city,company_size,economic_sector,balance_details
0,14885,4e solutions GmbH,DE,70794,Filderstadt,(2)3-10,Handel / Konsum,"[, , …6451, 4.1 StandardPeerevaluation, 30.03..."
1,14886,4plus5,DE,89077,Ulm,(2)3-10,Baugewerbe / Architektur,"[, , …6452, M5.0 KompaktbilanzPeerevaluation, ..."
2,35275,A & K Engemann GbR,DE,34439,Willebadessen,(2)3-10,Nahrungsmittel / Land / Forstwirtschaft,"[, , …24837, M5.0 KompaktbilanzPeerevaluation]"
3,44668,A&P Steuerberatungsgesellschaft mbH,DE,14480,Potsdam,(3)11-25,Beratung / Consulting,"[, , …44667, 4.0 StandardPeerevaluation, 31.1..."
4,29652,AAP-ARCHITEKTEN ZT-GMBH,AT,1080,Wien,(3)11-25,Baugewerbe / Architektur,"[, , …29651, 4.1 StandardDeskaudit, , Bericht..."
...,...,...,...,...,...,...,...,...
1261,92844,Zimmerei Diedrich – Die Gesundhausbauer GmbH,DE,37434,Rüdershausen,(3)11-25,Baugewerbe / Architektur,"[, , …92843, M5.0 KompaktbilanzPeerevaluation,..."
1262,45156,zimmerwerkstatt Nicola Bannier und Michael Weber,DE,29456,Hitzacker,(1)1-2,Handwerk,"[, , …45155, 4.1 StandardPeerevaluation, 28.0..."
1263,165659,ZORA Kinder- und Jugendhilfe gGmbH,DE,17489,Greifswald,(5)51-100,Gesundheitswesen / Soziales / Pflege,"[, , …165658, M5.0 VollbilanzBesuchsaudit]"
1264,14718,Zukunftswerk eG,DE,82319,Starnberg,(2)3-10,Beratung / Consulting,"[, , …3335, M5.0 KompaktbilanzPeerevaluation, ..."


#### Data cleaning and initial transformations

In [29]:
# Checking the entry whose organization name contains a line break (id '92435')
linebreak_case_mask = ecg_org_df['id'] == '92435'
ecg_org_df.loc[linebreak_case_mask]

Unnamed: 0,id,organization,country,postal_code,city,company_size,economic_sector,balance_details
573,92435,IMPRESUM GRAFICAS LITOLEMA S.L.,ES,46006,Valencia,(8) Todas las tallas,Otros,"[, , …92434, M5.0 Balance completoPeerevaluation]"


In [30]:
# Strip leading and trailing whitespace from every string cell
def _strip_if_string(value):
    """Return value.strip() for strings and the value itself for anything else."""
    if isinstance(value, str):
        return value.strip()
    return value

ecg_org_df = ecg_org_df.applymap(_strip_if_string)

In [31]:
# Turn empty strings into missing values so that isna() can count them
def _empty_string_to_none(value):
    """Return None for the empty string and the value itself otherwise."""
    return None if value == '' else value

ecg_org_df = ecg_org_df.applymap(_empty_string_to_none)

missing_per_column = ecg_org_df.isna().sum()
print(f'Missing values: \n{missing_per_column}\n')

Missing values: 
id                  0
organization        0
country             0
postal_code         6
city                0
company_size       20
economic_sector     4
balance_details     0
dtype: int64



In [304]:
### TO FINALIZE!!!
# This cell consists of a single string literal, so none of the code quoted inside it runs;
# the literal is only echoed as the cell output.
# NOTE(review): the quoted draft calls `x.str[...]` on each individual cell value inside
# `.apply`; that presumably fails for list values — confirm and rework before enabling it.
'''
FINALIZE!!


# Turn balance_details into missing value for entries without any balance
#ecg_org_df[ecg_org_df['balance_details'].str[0].str[0:3] == 'Für']
ecg_org_df['balance_details'] = ecg_org_df['balance_details'].apply(lambda x: None if x.str[0].str[0:3] == 'Für' else x)
'''

"\nFINALIZE!!\n\n\n# Turn balance_details into missing value for entries without any balance\n#ecg_org_df[ecg_org_df['balance_details'].str[0].str[0:3] == 'Für']\necg_org_df['balance_details'] = ecg_org_df['balance_details'].apply(lambda x: None if x.str[0].str[0:3] == 'Für' else x)\n"

In [182]:
# First three characters of the first balance_details item for id 164050.
# The id column is cast for the comparison, so the check matches whether or not the
# column has already been converted from str to int (that conversion happens in a later cell).
ecg_org_df[ecg_org_df['id'].astype(int) == 164050]['balance_details'].str[0].str[0:3]

48    Für
Name: balance_details, dtype: object

In [34]:
# id: convert to integer, then confirm that no id occurs twice
ecg_org_df['id'] = ecg_org_df['id'].astype(int)
has_duplicate_ids = ecg_org_df.duplicated(subset='id').any()
if not has_duplicate_ids:
    print('No id duplicates.')

No id duplicates.


In [61]:
# organization: report missing names, then show all rows that share a name
missing_name_mask = ecg_org_df['organization'].isna()
if missing_name_mask.any():
    print('\n')
    print(ecg_org_df[missing_name_mask])
else:
    print('\nNo missing values.')

if ecg_org_df.duplicated(subset='organization').any():
    # keep=False marks every member of a duplicate group, not only the repeats
    shared_name_mask = ecg_org_df.duplicated(subset='organization', keep=False)
    display(ecg_org_df[shared_name_mask])
else:
    print('No organization duplicates.')


No missing values.


Unnamed: 0,id,organization,country,postal_code,city,company_size,economic_sector,balance_details
48,164050,Apple,FR,,Minsk,Wallmart,Jack Nicholson,[Für 164050 liegen keine entsprechenden Anträ...
49,164351,Apple,GI,,Minsk,Nokia,Will smiff D,[Für 164351 liegen keine entsprechenden Anträ...
91,19776,Bean United GmbH,DE,82041.0,Oberhaching,(1)1-2,Handel / Konsum,"[, , …19775, M5.0 VollbilanzDeskaudit, 31.10...."
92,135008,Bean United GmbH,DE,82041.0,Oberhaching,(2)3-10,Nahrungsmittel / Land / Forstwirtschaft,"[, , …135007, M5.0 VollbilanzBesuchsaudit, 30..."
121,30326,Biohof Lecker GbR,DE,83410.0,Laufen,(2)3-10,Nahrungsmittel / Land / Forstwirtschaft,"[, , …30325, M5.0 KompaktbilanzPeerevaluation,..."
122,82924,Biohof Lecker GbR,DE,83410.0,Laufen,(2)3-10,Handel / Konsum,"[, , …82923, M5.0 VollbilanzPeerevaluation, 3..."
136,40988,Bioland Lammertzhof – Fam. Hannen GbR,DE,41564.0,Kaarst,(2)3-10,Nahrungsmittel / Land / Forstwirtschaft,"[, , …40989, 4.1 StandardDeskaudit, 31.12.201..."
137,85214,Bioland Lammertzhof – Fam. Hannen GbR,DE,41564.0,Kaarst,(5)51-100,Nahrungsmittel / Land / Forstwirtschaft,"[, , …85213, M5.0 KompaktbilanzPeerevaluation,..."
187,40892,camera obscura GmbH,DE,10707.0,Berlin,(1)1-2,Medien,"[, , …40891, M5.0 KompaktbilanzPeerevaluation,..."
188,124487,camera obscura GmbH,DE,10707.0,Berlin,(2)3-10,Marketing / Werbung / PR,"[, , …124486, M5.0 KompaktbilanzPeerevaluation..."


In [62]:
# country: rename the code column and add the English country name
ecg_org_df = ecg_org_df.rename(columns={'country':'country_code'})

# Codes as they appear in the scraped table
# NOTE(review): 'UK' and 'UR' are not the ISO 3166 codes for these countries (GB / UY) —
# presumably this is how the source spells them; confirm against the data
country_dictionary = dict([
    ('AT', 'Austria'),
    ('BE', 'Belgium'),
    ('CH', 'Switzerland'),
    ('DE', 'Germany'),
    ('DK', 'Denmark'),
    ('ES', 'Spain'),
    ('HR', 'Croatia'),
    ('IT', 'Italy'),
    ('LU', 'Luxembourg'),
    ('NL', 'Netherlands'),
    ('PL', 'Poland'),
    ('PT', 'Portugal'),
    ('SE', 'Sweden'),
    ('UK', 'United Kingdom'),
    ('US', 'United States'),
    ('UR', 'Uruguay'),
])
ecg_org_df['country_name'] = ecg_org_df['country_code'].map(country_dictionary)

# Rows without a country name carry a code that the dictionary does not know
unmapped_mask = ecg_org_df['country_name'].isna()
if not unmapped_mask.any():
    print(f'Country codes of {len(country_dictionary)} countries mapped successfully.')
else:
    unmapped_rows = ecg_org_df[unmapped_mask]
    new_country_codes_df = unmapped_rows[['id','country_code', 'city']]
    new_country_codes_summary = unmapped_rows['country_code'].value_counts()
    new_country_codes_count = len(new_country_codes_summary)
    print(f'{new_country_codes_count} new country code(s) found. Please review. \nCode(s) & count:')
    print(new_country_codes_summary.to_string())
    print(f'\n{new_country_codes_df}')

3 new country code(s) found. Please review. 
Code(s) & count:
FR    1
GI    1
NO    1

         id country_code   city
48   164050           FR  Minsk
49   164351           GI  Minsk
382  163006           NO  Minsk


In [186]:
# postal_code: list missing codes, apply the known manual fix, then list what is still missing
overview_columns = ['id', 'organization', 'country_code', 'postal_code', 'city']

if ecg_org_df['postal_code'].isna().any():
    print('Missing postal codes before manual inserts:')
    print(ecg_org_df[ecg_org_df['postal_code'].isna()][overview_columns])

    # Known missing replacement
    known_fix_mask = ecg_org_df['id'] == 153690
    ecg_org_df.loc[known_fix_mask, 'postal_code'] = '12400'

    print('\nMissing postal codes after manual inserts:')
    print(ecg_org_df[ecg_org_df['postal_code'].isna()][overview_columns])
else:
    print('No missing postal codes.')

Missing postal codes before manual inserts:
         id organization country_code postal_code   city
22   164090   AliExpress           ES        None  Minsk
48   164050        Apple           FR        None  Minsk
49   164351        Apple           GI        None  Minsk
382  163006          FBI           NO        None  Minsk
462  165419       Google           DE        None  Minsk

Missing postal codes after manual inserts:
         id organization country_code postal_code   city
22   164090   AliExpress           ES        None  Minsk
48   164050        Apple           FR        None  Minsk
49   164351        Apple           GI        None  Minsk
382  163006          FBI           NO        None  Minsk
462  165419       Google           DE        None  Minsk


In [311]:
# city
# Temporarily turn missing values into empty strings so the .str checks below return booleans
ecg_org_df['city'] = ecg_org_df['city'].fillna('')

# Report cities containing commas, digits or underscores.
# The patterns are regular expressions; r'\d' is a raw string because '\d' in a normal
# string literal is an invalid escape sequence and triggers a warning.
suspicious_patterns = (
    (',', 'commas'),
    (r'\d', 'digits'),
    ('_', 'underscores'),
)
for pattern, label in suspicious_patterns:
    pattern_mask = ecg_org_df['city'].str.contains(pattern)
    if pattern_mask.any():
        print(ecg_org_df[pattern_mask][['id', 'country_code', 'postal_code', 'city']])
    else:
        print(f'No cases of {label} in city column.')

# Manual corrections for known problematic city values (None marks the city as missing)
city_dictionary = {
    'Gent, Belgien': 'Gent',
    'Ettingen, Basel-Landschaft (Schweiz)': 'Ettingen (Basel-Landschaft)',
    '52074 Aachen': 'Aachen',
    'NL1 City': None
    }

# Values without a correction are kept unchanged
ecg_org_df['city'] = ecg_org_df['city'].apply(lambda x: city_dictionary.get(x,x))

# Define missings as missing value again
ecg_org_df['city'] = ecg_org_df['city'].apply(lambda x: None if x == '' else x)

No cases of commas in city column.
No cases of digits in city column.
No cases of underscores in city column.


In [190]:
# company_size: derive an employee range and a coarse size category from the raw label
raw_size_counts = ecg_org_df['company_size'].value_counts()
print(raw_size_counts)

# One row per raw label: (raw label, employee range, size category).
# Labels that are not listed here end up as missing values in both derived columns.
company_size_table = [
    ('(1)1-2', '1-2', 'Micro (≤10)'),
    ('(2)3-10', '3-10', 'Micro (≤10)'),
    ('(3)11-25', '11-25', 'Small (≤50)'),
    ('(4)26-50', '26-50', 'Small (≤50)'),
    ('(5)51-100', '51-100', 'Mid (≤250)'),
    ('(6)101-250', '101-250', 'Mid (≤250)'),
    ('(7)>250', '>250', 'Large (>250)'),
    ('(7)250-', '>250', 'Large (>250)'),
    ('(8)501-1000', '501-1000', 'Large (>250)'),
    ('(7)251-500', '251-500', 'Large (>250)'),
    ('(10)2501-5000', '2501-5000', 'Large (>250)'),
    ('(11)5001-10000', '5001-10000', 'Large (>250)'),
    ('(9)1001-2500', '1001-2500', 'Large (>250)'),
    ('(8) alle Größen', None, None),
    ('(8) Todas las tallas', None, None),
    ('', None, None),
]

employees_dictionary = {raw_label: employees for raw_label, employees, _ in company_size_table}
company_size_category_dictionary = {raw_label: category for raw_label, _, category in company_size_table}

ecg_org_df['company_size_category'] = ecg_org_df['company_size'].map(company_size_category_dictionary)
category_counts = ecg_org_df['company_size_category'].value_counts()
print('\nCompany size categories:')
print(category_counts.to_string())

ecg_org_df['employees'] = ecg_org_df['company_size'].map(employees_dictionary)
employee_counts = ecg_org_df['employees'].value_counts()
print('\nEmployees:')
print(employee_counts.to_string())

# Entries without a size category: label missing, unspecific or not in the table
missing_company_sizes = int(ecg_org_df['company_size_category'].isna().sum())
print(f'\n{missing_company_sizes} entries are missing information about company size.')

(1)1-2                  394
(2)3-10                 328
(3)11-25                156
(4)26-50                121
(5)51-100                75
(6)101-250               59
(8) Todas las tallas     57
(7)>250                  25
(7)250-                  12
(8)501-1000               5
(7)251-500                3
(10)2501-5000             2
(11)5001-10000            2
Wallmart                  2
(8) alle Größen           1
(9)1001-2500              1
AliExpress                1
Apple                     1
Nokia                     1
Name: company_size, dtype: int64

Company size categories:
Micro (≤10)     722
Small (≤50)     277
Mid (≤250)      134
Large (>250)     50

Employees:
1-2           394
3-10          328
11-25         156
26-50         121
51-100         75
101-250        59
>250           37
501-1000        5
251-500         3
2501-5000       2
5001-10000      2
1001-2500       1

83 entries are missing information about company size.


In [191]:
# economic_sector: count the distinct sectors and list them in alphabetical order
sector_count = ecg_org_df['economic_sector'].nunique()
print(f"{sector_count} unique sectors.\n")

unique_ordered_sectors = (
    ecg_org_df['economic_sector']
    .sort_values(ascending=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
print(unique_ordered_sectors.head(60))
print(unique_ordered_sectors.tail(5))

72 unique sectors.

0                                      Abfallentsorgung
1                                   Advice / Consulting
2                         Art / Culture / Entertainment
3                                         Assicurazione
4     Assistenza sanitaria / Servizi sociali / Infer...
5                       Automobil / Automobilzulieferer
6                                                Banken
7                              Baugewerbe / Architektur
8                                              Beratung
9                                 Beratung / Consulting
10                                            Bertatung
11                 Bildung / Universität / FH / Schulen
12                                               Chemie
13                    Cibo / Agricoltura / Silvicoltura
14                                       Clint Eastwood
15                                  Commercio / Consumo
16                                Consigli / Consulenza
17                          

In [198]:
# Maps every raw economic_sector label found in the table (German, English, Italian and
# Spanish variants, including misspellings) to one harmonized English sector name.
# The empty string maps to None, i.e. a missing value.
sector_dictionary_EN = {
    'Abfallentsorgung': 'Waste Disposal',
    'Advice / Consulting': 'Consulting',
    'Art / Culture / Entertainment': 'Art / Culture / Entertainment',
    # Italian for "insurance"; was mapped to 'Waste Disposal' by mistake
    'Assicurazione': 'Insurance',
    'Assistenza sanitaria / Servizi sociali / Infermieristica': 'Health / Social Affairs / Nursing',
    'Automobil / Automobilzulieferer': 'Automotive / Automotive supplier',
    'Banken': 'Banking',
    'Baugewerbe / Architektur': 'Construction / Architecture',
    'Beratung': 'Consulting',
    'Beratung / Consulting': 'Consulting',
    'Bertatung': 'Consulting',
    'Bildung / Universität / FH / Schulen': 'Education / University / Polytechnic / Schools',
    'Chemie': 'Chemistry',
    'Cibo / Agricoltura / Silvicoltura': 'Nutrition / Agriculture / Forestry',
    'Commercio / Consumo': 'Trade / Consumption',
    'Consigli / Consulenza': 'Consulting',
    'Construction / Architecture': 'Construction / Architecture',
    'Dienstleistung': 'Services',
    'Druck / Papier / Verpackung': 'Printing / Paper / Packaging',
    'EDP / IT': 'EDP / IT',
    'EDV / IT': 'EDP / IT',
    'Education / University / FH / Schools': 'Education / University / Polytechnic / Schools',
    'Electrical / Electronics': 'Electrical / Electronics',
    'Elektro / Elektronik': 'Electrical / Electronics',
    'Energiewirtschaft': 'Energy',
    'Finanzen': 'Finance',
    'Forschung / Entwicklung / Wissenschaft': 'Research / Development / Science',
    'Gesundheitswesen / Soziales / Pflege': 'Health / Social Affairs / Nursing',
    'Handel / Konsum': 'Trade / Consumption',
    'Handwerk': 'Craft',
    'Immobilien / Facility Management': 'Real Estate / Facility Management',
    'Industrie': 'Industry',
    'Industry': 'Industry',
    'Internet / Multimedia': 'Internet / Multimedia',
    'Kunst / Kultur / Unterhaltung': 'Art / Culture / Entertainment',
    # NOTE(review): 'Avertising' looks like a typo for 'Advertising'; left unchanged here
    # because later cells may match on this exact value — confirm, then fix
    'Marketing / Werbung / PR': 'Marketing / Avertising / PR',
    'Marktforschung': 'Market Research',
    'Maschinen / Anlagenbau': 'Machinery & Plant Engineering',
    'Media': 'Media',
    'Medien': 'Media',
    'Medizin / Pharma': 'Pharma',
    'Medizintechnik': 'Medical Engineering',
    # Variant with a double space before "Forstwirtschaft"
    'Nahrungsmittel / Land /  Forstwirtschaft': 'Nutrition / Agriculture / Forestry',
    'Nahrungsmittel / Land / Forstwirtschaft': 'Nutrition / Agriculture / Forestry',
    'Other branches': 'Other',
    'Otros': 'Other',
    'Personalwesen / Personalbeschaffung': 'HR',
    'Seminar / Messeanbieter': 'Seminar / Fair Provider',
    'Seminario / Fornitori di fiere': 'Seminar / Fair Provider',
    'Software · Branding · SEO · Gemeinwohl': 'EDP / IT',
    'Sonstige Branchen': 'Other',
    'Sport / Fitness / Beauty': 'Sports / Fitness / Beauty',
    'Steuerberatung / Wirtschaftsprüfung': 'Fiscal Advice / Auditing',
    'Telecommunication': 'Telecommunication',
    'Telekommunikation': 'Telecommunication',
    'Textilbranche': 'Textile',
    'Tourism / Hotel / Gastronomy': 'Tourism / Hotel / Catering',
    'Tourismus / Hotel / Gastronomie': 'Tourism / Hotel / Catering',
    'Tourismus / Hotel / Gastronomie/ Lebensmittel': 'Tourism / Hotel / Catering',
    'Turismo / Hotel / Gastronomia': 'Tourism / Hotel / Catering',
    'Vereine': 'Associations / Societies',
    'Verkehr / Transport / Logistik': 'Transport / Logistics',
    'Versicherung': 'Insurance',
    'sonstiges': 'Other',
    'Öffentliche Verwaltung': 'Public Administration',
    'Wirtschaftsförderung': 'Economic Promotion',
    '': None
    }

# Maps every raw `economic_sector` label found in the scraped table (German, English,
# Italian and Spanish spellings, including known typos such as 'Bertatung') to one
# harmonized German sector name. The empty string maps to None (no sector given).
sector_dictionary_DE = {
    'Abfallentsorgung': 'Abfallentsorgung',
    'Advice / Consulting': 'Beratung / Consulting',
    'Art / Culture / Entertainment': 'Kunst / Kultur / Unterhaltung',
    # 'Assicurazione' is Italian for insurance, not waste disposal
    'Assicurazione': 'Versicherung',
    'Assistenza sanitaria / Servizi sociali / Infermieristica': 'Gesundheitswesen / Soziales / Pflege',
    'Automobil / Automobilzulieferer': 'Automobil / Automobilzulieferer',
    'Banken': 'Banken',
    'Baugewerbe / Architektur': 'Baugewerbe / Architektur',
    'Beratung': 'Beratung / Consulting',
    'Beratung / Consulting': 'Beratung / Consulting',
    'Bertatung': 'Beratung / Consulting',
    'Bildung / Universität / FH / Schulen': 'Bildung / Universität / FH / Schulen',
    'Chemie': 'Chemie',
    'Cibo / Agricoltura / Silvicoltura': 'Nahrungsmittel / Land /  Forstwirtschaft',
    'Commercio / Consumo': 'Handel / Konsum',
    'Consigli / Consulenza': 'Beratung / Consulting',
    'Construction / Architecture': 'Baugewerbe / Architektur',
    'Dienstleistung': 'Dienstleistung',
    'Druck / Papier / Verpackung': 'Druck / Papier / Verpackung',
    'EDP / IT': 'EDV / IT',
    'EDV / IT': 'EDV / IT',
    'Education / University / FH / Schools': 'Bildung / Universität / FH / Schulen',
    'Electrical / Electronics': 'Elektro / Elektronik',
    'Elektro / Elektronik': 'Elektro / Elektronik',
    'Energiewirtschaft': 'Energiewirtschaft',
    'Finanzen': 'Finanzen',
    'Forschung / Entwicklung / Wissenschaft': 'Forschung / Entwicklung / Wissenschaft',
    'Gesundheitswesen / Soziales / Pflege': 'Gesundheitswesen / Soziales / Pflege',
    'Handel / Konsum': 'Handel / Konsum',
    'Handwerk': 'Handwerk',
    'Immobilien / Facility Management': 'Immobilien / Facility Management',
    'Industrie': 'Industrie',
    'Industry': 'Industrie',
    'Internet / Multimedia': 'Internet / Multimedia',
    'Kunst / Kultur / Unterhaltung': 'Kunst / Kultur / Unterhaltung',
    'Marketing / Werbung / PR': 'Marketing / Werbung / PR',
    'Marktforschung': 'Marktforschung',
    'Maschinen / Anlagenbau': 'Maschinen- & Anlagenbau',
    'Media': 'Medien',
    'Medien': 'Medien',
    'Medizin / Pharma': 'Medizin / Pharma',
    'Medizintechnik': 'Medizintechnik',
    'Nahrungsmittel / Land /  Forstwirtschaft': 'Nahrungsmittel / Land /  Forstwirtschaft',
    'Nahrungsmittel / Land / Forstwirtschaft': 'Nahrungsmittel / Land /  Forstwirtschaft',
    'Other branches': 'Sonstige',
    'Otros': 'Sonstige',
    'Personalwesen / Personalbeschaffung': 'Personalwesen / Personalbeschaffung',
    'Seminar / Messeanbieter': 'Seminar- / Messeanbieter',
    'Seminario / Fornitori di fiere': 'Seminar- / Messeanbieter',
    'Software · Branding · SEO · Gemeinwohl': 'EDV / IT',
    'Sonstige Branchen': 'Sonstige',
    'Sport / Fitness / Beauty': 'Sport / Fitness / Beauty',
    'Steuerberatung / Wirtschaftsprüfung': 'Steuerberatung / Wirtschaftsprüfung',
    'Telecommunication': 'Telekommunikation',
    'Telekommunikation': 'Telekommunikation',
    'Textilbranche': 'Textilbranche',
    'Tourism / Hotel / Gastronomy': 'Tourismus / Hotel / Gastronomie',
    'Tourismus / Hotel / Gastronomie': 'Tourismus / Hotel / Gastronomie',
    'Tourismus / Hotel / Gastronomie/ Lebensmittel': 'Tourismus / Hotel / Gastronomie',
    'Turismo / Hotel / Gastronomia': 'Tourismus / Hotel / Gastronomie',
    'Vereine': 'Vereine',
    'Verkehr / Transport / Logistik': 'Verkehr / Transport / Logistik',
    'Versicherung': 'Versicherung',
    'sonstiges': 'Sonstige',
    'Öffentliche Verwaltung': 'Öffentliche Verwaltung',
    'Wirtschaftsförderung': 'Wirtschaftsförderung',
    '': None
    }

# Derive the harmonized English and German sector columns by looking up every raw
# `economic_sector` value in the two dictionaries. Series.map yields NaN for values
# that are not dictionary keys, so unknown labels show up as missing values.
ecg_org_df['economic_sector_EN'] = ecg_org_df['economic_sector'].map(sector_dictionary_EN)
ecg_org_df['economic_sector_DE'] = ecg_org_df['economic_sector'].map(sector_dictionary_DE)

In [203]:
# Report rows whose sector could not be mapped, apply a manual fix for one known
# organization, and report again. The German check now displays the German column
# (the original printed `economic_sector_EN` in the German section as well).
columns_EN = ['id', 'organization', 'economic_sector', 'economic_sector_EN']
columns_DE = ['id', 'organization', 'economic_sector', 'economic_sector_DE']

# Check EN
print('English:\n')
print(f"Sum of missing and unassigned values: {ecg_org_df['economic_sector_EN'].isna().sum()}\n")
print(ecg_org_df[ecg_org_df['economic_sector_EN'].isna()][columns_EN])

# Manual assignment for organization id 25050 (NOTE(review): reason not visible here — confirm)
ecg_org_df.loc[ecg_org_df['id'] == 25050, 'economic_sector_EN'] = 'EDP / IT'
print(ecg_org_df[ecg_org_df['economic_sector_EN'].isna()][columns_EN])

# Check DE
print('\nGerman:\n')
print(f"Sum of missing and unassigned values: {ecg_org_df['economic_sector_DE'].isna().sum()}\n")
print(ecg_org_df[ecg_org_df['economic_sector_DE'].isna()][columns_DE])

# Same manual assignment for the German column
ecg_org_df.loc[ecg_org_df['id'] == 25050, 'economic_sector_DE'] = 'EDV / IT'
print(ecg_org_df[ecg_org_df['economic_sector_DE'].isna()][columns_DE])

English:

Sum of missing and unassigned values: 10

         id                                    organization  \
22   164090                                      AliExpress   
48   164050                                           Apple   
49   164351                                           Apple   
309   28513                DROSG-PLÖCKINGER & PLÖCKINGER OG   
371   14805                             Fahnen-Gärtner GmbH   
382  163006                                             FBI   
390   14728  Fischer´s EM-Chiemgau – Christoph Fischer GmbH   
462  165419                                          Google   
726   28515                                Manfred Kofranek   
907  151681                                 pusch & Partner   

          economic_sector economic_sector_EN  
22             Will Smith                NaN  
48         Jack Nicholson                NaN  
49           Will smiff D                NaN  
309                  None                NaN  
371  Druck, Textilb

In [206]:
# balance_details
# Extract, for up to ten balances per organization, the "valid until" date and the score.
# The information of balance n is read from entry 4 + 6 * (n - 1) of `balance_details`
# (i.e. entries 4, 10, 16, ...): characters 1-10 hold the date, everything from
# character 12 on holds the score.

MAX_BALANCES = 10            # balance1 ... balance10 lists are created
FIELDS_PER_BALANCE = 6       # distance between the entries of two consecutive balances
DETAIL_OFFSET = 4            # position of the "date score" entry inside one balance
MAX_COUNTED_BALANCES = 7     # larger counts are reported as the string '>7'

# Errors that simply mean "this piece of information is not available" for a row
# (missing value, too few entries, non-text entry, non-numeric score).
_PARSE_ERRORS = (TypeError, IndexError, KeyError, AttributeError, ValueError)


def _count_balances(balance_detail):
    """Return the number of balances (1-7), '>7' for more, or None if it cannot be determined."""
    try:
        length = len(balance_detail)
    except _PARSE_ERRORS:
        return None
    # Ceiling division; an empty entry still counts as one balance (as before)
    count = max(1, -(-length // FIELDS_PER_BALANCE))
    return count if count <= MAX_COUNTED_BALANCES else '>7'


def _valid_until_date(balance_detail, balance_no):
    """Return the date of balance `balance_no` with '.' replaced by '-', or None if unavailable."""
    try:
        entry = balance_detail[DETAIL_OFFSET + FIELDS_PER_BALANCE * (balance_no - 1)]
        return entry[1:11].replace('.', '-')
    except _PARSE_ERRORS:
        return None


def _score(balance_detail, balance_no):
    """Return the score of balance `balance_no` as int, or None if unavailable."""
    try:
        entry = balance_detail[DETAIL_OFFSET + FIELDS_PER_BALANCE * (balance_no - 1)]
        return int(entry[12:])
    except _PARSE_ERRORS:
        return None


# Turn missing information into missing values
ecg_org_df.loc[ecg_org_df['balance_details'].astype(str).str.contains('liegen keine'), 'balance_details'] = None

# Fill lists (one entry per row of ecg_org_df, in row order)
all_balance_details = list(ecg_org_df['balance_details'])

number_of_balances_org_table = [_count_balances(detail) for detail in all_balance_details]

valid_until_date_lists = [
    [_valid_until_date(detail, balance_no) for detail in all_balance_details]
    for balance_no in range(1, MAX_BALANCES + 1)
]
score_lists = [
    [_score(detail, balance_no) for detail in all_balance_details]
    for balance_no in range(1, MAX_BALANCES + 1)
]

# Keep the individual list names, as the following cells refer to them
(balance1_valid_until_date, balance2_valid_until_date, balance3_valid_until_date,
 balance4_valid_until_date, balance5_valid_until_date, balance6_valid_until_date,
 balance7_valid_until_date, balance8_valid_until_date, balance9_valid_until_date,
 balance10_valid_until_date) = valid_until_date_lists

(balance1_score, balance2_score, balance3_score, balance4_score, balance5_score,
 balance6_score, balance7_score, balance8_score, balance9_score,
 balance10_score) = score_lists

In [210]:
# Keep only the integer counts (None and the '>7' marker are not ints)
valid_number_of_balances = [
    count for count in number_of_balances_org_table if isinstance(count, int)
]

max_number_of_balances = max(valid_number_of_balances)
print(f'Maximum number of balances: {max_number_of_balances}')

# Frequency of each balance count
balance_count_frequencies = pd.Series(valid_number_of_balances).value_counts()
print(balance_count_frequencies.to_string())

# Number of rows without an integer balance count
print(f'MISSG {len(number_of_balances_org_table)-len(valid_number_of_balances)}')

Maximum number of balances: 7
1    975
2    157
3     40
4     13
5      2
6      2
7      1
MISSG 76


In [211]:
# Check correct date format with an example case
# (row 859 is an arbitrary spot check; the expected form is 'DD-MM-YYYY')
balance1_valid_until_date[859]

'31-12-2021'

In [212]:
# Correct scores being placed in valid_until_date lists due to missing values in valid_until_date.
# A "date" of at most four characters (after stripping) cannot be a date; it is treated as the
# score instead: the date entry becomes None and the value is moved to the matching score list.
date_lists = [
    balance1_valid_until_date, balance2_valid_until_date, balance3_valid_until_date,
    balance4_valid_until_date, balance5_valid_until_date, balance6_valid_until_date,
    balance7_valid_until_date, balance8_valid_until_date, balance9_valid_until_date,
    balance10_valid_until_date,
]
score_lists = [
    balance1_score, balance2_score, balance3_score, balance4_score, balance5_score,
    balance6_score, balance7_score, balance8_score, balance9_score, balance10_score,
]

for dates, scores in zip(date_lists, score_lists):
    for i, element in enumerate(dates):
        # Entries without a date (None) need no correction
        if not isinstance(element, str):
            continue
        if len(element.strip()) <= 4:
            dates[i] = None
            try:
                # Store the score as int, consistent with the regularly parsed scores
                scores[i] = int(element)
            except ValueError:
                # Not numeric (e.g. empty string): keep the raw text as before
                scores[i] = element

In [283]:
ecg_org_df['number_of_balances_org_table'] = number_of_balances_org_table

# Lists ordered from balance10 down to balance1: when several balances share a key
# (in particular key=None), the value of the lowest-numbered balance is the one kept.
date_columns = [
    balance10_valid_until_date, balance9_valid_until_date, balance8_valid_until_date,
    balance7_valid_until_date, balance6_valid_until_date, balance5_valid_until_date,
    balance4_valid_until_date, balance3_valid_until_date, balance2_valid_until_date,
    balance1_valid_until_date,
]
score_columns = [
    balance10_score, balance9_score, balance8_score, balance7_score, balance6_score,
    balance5_score, balance4_score, balance3_score, balance2_score, balance1_score,
]

list_of_score_dictionaries = []
for row in range(len(ecg_org_df)):
    # {valid_until_date: score} for this organization
    scores_by_date = {}
    for dates, scores in zip(date_columns, score_columns):
        scores_by_date[dates[row]] = scores[row]

    # Drop entries without a score ...
    scores_by_date = {
        date: score
        for date, score in scores_by_date.items()
        if not (score is None or score == '')
    }
    # ... and the entry without a date
    scores_by_date.pop(None, None)

    # Organizations without any dated score get None instead of an empty dictionary
    list_of_score_dictionaries.append(scores_by_date if scores_by_date else None)

ecg_org_df['balance_scores'] = list_of_score_dictionaries
list_of_score_dictionaries

[{'30-03-2020': 690},
 None,
 None,
 {'31-12-2014': 427},
 None,
 {'31-05-2017': 367, '30-11-2020': 476},
 {'31-10-2023': 358},
 {'30-10-2019': 491},
 None,
 {'30-06-2018': 556},
 {'30-09-2022': 392, '31-12-2025': 475},
 None,
 None,
 None,
 None,
 {'31-10-2025': 270},
 None,
 None,
 {'31-12-2015': 622},
 {'30-11-2023': 324},
 {'31-03-2024': 569},
 {'30-04-2023': 377},
 None,
 {'01-04-2019': 446},
 {'31-07-2024': 266},
 {'31-07-2023': 291},
 {'31-07-2025': 242},
 {'30-12-2018': 627, '31-05-2021': 444},
 {'20-01-2016': 548},
 {'31-12-2020': 304},
 {'30-11-2024': 444},
 {'30-11-2025': 338},
 None,
 None,
 {'31-01-2018': 405, '31-07-2020': 489, '31-05-2023': 437},
 {'31-12-2024': 229},
 {'30-03-2021': 702},
 {'31-12-2021': 290},
 {'30-04-2023': 320},
 {'31-07-2019': 415},
 {'24-05-2018': 840},
 {'20-10-2020': 348},
 {'31-12-2015': 480, '30-11-2018': 568},
 {'31-07-2024': 509},
 {'31-07-2025': 305},
 {'31-07-2024': 257},
 None,
 {'30-10-2020': 577},
 None,
 None,
 {'31-07-2020': 728},
 {'3

In [284]:
# Restrict the organization table to the columns needed downstream (in this order)
relevant_columns = [
    'id',
    'organization',
    'country_code',
    'country_name',
    'postal_code',
    'city',
    'company_size_category',
    'employees',
    'economic_sector_EN',
    'economic_sector_DE',
    'number_of_balances_org_table',
    'balance_scores',
]
ecg_org_df = ecg_org_df[relevant_columns]

In [285]:
# Checks after cleaning and transformation: print the dtype of every column
print(f'Datatype: \n{ecg_org_df.dtypes}\n')

Datatype: 
id                               int64
organization                    object
country_code                    object
country_name                    object
postal_code                     object
city                            object
company_size_category           object
employees                       object
economic_sector_EN              object
economic_sector_DE              object
number_of_balances_org_table    object
balance_scores                  object
dtype: object



In [286]:
# Print the number of missing values per column
print(f'Missing values: \n{ecg_org_df.isna().sum()}\n')

Missing values: 
id                                0
organization                      0
country_code                      0
country_name                      3
postal_code                       5
city                              1
company_size_category            83
employees                        83
economic_sector_EN               10
economic_sector_DE               10
number_of_balances_org_table     75
balance_scores                  205
dtype: int64



In [300]:
# Replace all empty strings with missing values

# Check for empty strings
has_empty_strings = bool(ecg_org_df.eq('').any(axis=None))

if not has_empty_strings:
    print('No empty strings ("") in dataframe.')

else:
    # Show the affected rows with all columns; option_context restores the display
    # setting afterwards, even if printing fails
    with pd.option_context('display.max_columns', None):
        print(ecg_org_df[ecg_org_df.eq('').any(axis=1)])

    ecg_org_df = ecg_org_df.replace('', np.nan)

    # Show the rows still containing empty strings (expected: none).
    # The original referenced the undefined name `ecg_df` here.
    with pd.option_context('display.max_columns', None):
        print(ecg_org_df[ecg_org_df.eq('').any(axis=1)])

No empty strings ("") in dataframe.


In [301]:
# Turn None type values into NaN: fillna replaces every entry pandas treats as
# missing (such as None) with np.nan, giving one uniform missing-value marker.
ecg_org_df = ecg_org_df.fillna(value=np.nan)

In [302]:
# Last check: display the final organization table before saving
ecg_org_df

Unnamed: 0,id,organization,country_code,country_name,postal_code,city,company_size_category,employees,economic_sector_EN,economic_sector_DE,number_of_balances_org_table,balance_scores
0,14885,4e solutions GmbH,DE,Germany,70794,Filderstadt,Micro (≤10),3-10,Trade / Consumption,Handel / Konsum,1,{'30-03-2020': 690}
1,14886,4plus5,DE,Germany,89077,Ulm,Micro (≤10),3-10,Construction / Architecture,Baugewerbe / Architektur,1,
2,35275,A & K Engemann GbR,DE,Germany,34439,Willebadessen,Micro (≤10),3-10,Nutrition / Agriculture / Forestry,Nahrungsmittel / Land / Forstwirtschaft,1,
3,44668,A&P Steuerberatungsgesellschaft mbH,DE,Germany,14480,Potsdam,Small (≤50),11-25,Consulting,Beratung / Consulting,1,{'31-12-2014': 427}
4,29652,AAP-ARCHITEKTEN ZT-GMBH,AT,Austria,1080,Wien,Small (≤50),11-25,Construction / Architecture,Baugewerbe / Architektur,2,
...,...,...,...,...,...,...,...,...,...,...,...,...
1261,92844,Zimmerei Diedrich – Die Gesundhausbauer GmbH,DE,Germany,37434,Rüdershausen,Small (≤50),11-25,Construction / Architecture,Baugewerbe / Architektur,1,{'31-10-2024': 282}
1262,45156,zimmerwerkstatt Nicola Bannier und Michael Weber,DE,Germany,29456,Hitzacker,Micro (≤10),1-2,Craft,Handwerk,1,{'28-02-2016': 534}
1263,165659,ZORA Kinder- und Jugendhilfe gGmbH,DE,Germany,17489,Greifswald,Mid (≤250),51-100,Health / Social Affairs / Nursing,Gesundheitswesen / Soziales / Pflege,1,
1264,14718,Zukunftswerk eG,DE,Germany,82319,Starnberg,Micro (≤10),3-10,Consulting,Beratung / Consulting,1,{'30-04-2021': 432}


#### Saving the dataframe

In [303]:
# Write the organization table to 'ecg_org_df.csv' (relative to the working directory),
# without the dataframe index as a column
ecg_org_df.to_csv('ecg_org_df.csv', index=False)

## Scraping balance-oriented view
https://audit.ecogood.org/firmenauskunft/

#### URL creation

In [243]:
# URLs (first page) & base URLs / indicators (when browsing through the table)
# A page URL is built as: <base_url> + <page number> + <indicator>

# audit
audit_url = 'https://audit.ecogood.org/firmenauskunft/?ser_fvz_pagesize=50#1574317575063-06b4d9e5-e677'
audit_base_url = 'https://audit.ecogood.org/firmenauskunft/?ser_fvz_pagesize=50&frm-page-4770='
audit_indicator = '#1574317575063-06b4d9e5-e677' # attach directly after page no.

# peer
# NOTE(review): unlike audit_base_url, peer_base_url carries no ser_fvz_pagesize=50
# parameter, while the page count below assumes 50 entries per page — confirm the
# default page size of the site.
peer_url = 'https://audit.ecogood.org/firmenauskunft/?ser_fvz_pagesize=50#1574317575155-9e81c480-beb1'
peer_base_url = 'https://audit.ecogood.org/firmenauskunft/?frm-page-4772='
peer_indicator = '#1574317575155-9e81c480-beb1' # attach directly after page no.

In [244]:
# Create list with audit URLs to scrape through (as max. 50 entries per page are displayed)
audit_page_size = 50

# Creating soup
audit_response = requests.get(audit_url)
audit_soup = BeautifulSoup(audit_response.content, "html.parser")
print(f'Response status (audit): {audit_response.status_code}')
print()

# Finding out number of total entries: the first child of the audit table is a text
# node that ends with the entry count. Read the trailing number with a regular
# expression instead of a fixed 4-character slice, so any number of digits works.
audit_table = audit_soup.select('div[style="overflow-x:auto;"] table[id="Liste_Firmen_Audittool"]')[0]
audit_count_text = str(list(audit_table)[0])
audit_count_match = re.search(r'(\d+)\s*$', audit_count_text)
if audit_count_match is None:
    raise ValueError(f'No entry count found in audit table text: {audit_count_text!r}')
total_audit_entries = int(audit_count_match.group(1))
print(f'Total entries in audit database: {total_audit_entries}')

# Finding out number of URLs (pages): ceiling division, so that a total which is an
# exact multiple of the page size does not produce an extra page; at least one page
number_of_audit_urls = max(1, -(-total_audit_entries // audit_page_size))
print(f'Number of audit table pages: {number_of_audit_urls}')
print()

dynamic_audit_url_list = [audit_base_url + str(i) + audit_indicator for i in range(1,number_of_audit_urls+1)]
print(f'Audit URL list created. Length of dynamic_audit_url_list: {len(dynamic_audit_url_list)}')

Response status (audit): 200

Total entries in audit database: 688
Number of audit table pages: 14

Audit URL list created. Length of dynamic_audit_url_list: 14


In [245]:
# Create list with peer URLs to scrape through (as max. 50 entries per page are displayed)
peer_page_size = 50

# Creating soup
peer_response = requests.get(peer_url)
peer_soup = BeautifulSoup(peer_response.content, "html.parser")
print(f'Response status (peer): {peer_response.status_code}')
print()

# Finding out number of total entries: the peer table is the second table matching the
# selector; its first child is a text node that ends with the entry count. Read the
# trailing number with a regular expression instead of a fixed 4-character slice.
peer_table = peer_soup.select('div[style="overflow-x:auto;"] table[id="Liste_Firmen_Audittool"]')[1]
peer_count_text = str(list(peer_table)[0])
peer_count_match = re.search(r'(\d+)\s*$', peer_count_text)
if peer_count_match is None:
    raise ValueError(f'No entry count found in peer table text: {peer_count_text!r}')
total_peer_entries = int(peer_count_match.group(1))
print(f'Total entries in peer database: {total_peer_entries}')

# Finding out number of URLs (pages): ceiling division, so that a total which is an
# exact multiple of the page size does not produce an extra page; at least one page
number_of_peer_urls = max(1, -(-total_peer_entries // peer_page_size))
print(f'Number of peer table pages: {number_of_peer_urls}')
print()

dynamic_peer_url_list = [peer_base_url + str(i) + peer_indicator for i in range(1,number_of_peer_urls+1)]
print(f'Peer URL list created. Length of dynamic_peer_url_list: {len(dynamic_peer_url_list)}')

Response status (peer): 200

Total entries in peer database: 767
Number of peer table pages: 16

Peer URL list created. Length of dynamic_peer_url_list: 16


In [246]:
#### Checks

In [247]:
# Checking tables: print the id attribute of the first two tables on the page.
# Both carry the same id, so they have to be told apart by position.
for table in audit_soup.select('table')[0:2]: # Table 0: audit, table 1: peer
    print(table.get('id'))

Liste_Firmen_Audittool
Liste_Firmen_Audittool


In [248]:
# Checking column titles: collect the non-empty texts of the first table row
audit_data = audit_soup.select('table')[0]
header_row = audit_data.select('tr')[0]

stripped_titles = (cell.get_text().strip() for cell in header_row)
column_titles = [title for title in stripped_titles if title != '']

print(column_titles)

['Bilanz', 'MA', 'Branche', 'Land-PLZ Ort', 'Organisation', 'Einreichung', 'Gültigkeit', 'Dokumente']


#### Scraping the first page (for testing)

In [317]:
# Selecting table with relevant information and assigning variable
balance_data = audit_soup.select('table')[0]

# Creating empty lists
balance_details, organizations = [], []
submission_date, valid_until_date = [], []
documents = []

# Scraping main data: the text of a row is split into its lines once; the lines at
# positions 0, 4, 5, 6 and 7 are stored in the respective lists
for row in balance_data.select('tbody tr'):
    row_lines = row.get_text().strip().split('\n')
    balance_details.append(row_lines[0])
    organizations.append(row_lines[4])
    submission_date.append(row_lines[5])
    valid_until_date.append(row_lines[6])
    documents.append(row_lines[7])

In [411]:
# For ids: collect the links to the organization detail pages from the audit table
balance_data = audit_soup.select('table')[0]

# href=True restricts the search to anchors that have an href attribute; anchors
# without one would otherwise yield None and make the `in` test raise a TypeError
organization_links = [
    a['href']
    for a in balance_data.find_all('a', href=True)
    if 'firmenauskunft' in a['href']
]

len(organization_links)

38

#### Scraping

In [421]:
# Creating empty lists that the main scraping cell fills (one entry per table row)
balance_details = []
organizations = []
organization_links = []
submission_date = []
valid_until_date = []
documents = []

In [422]:
# Main scraping

def _collect_table(table):
    """Append the rows and organization links of one result table to the module-level lists.

    For every row in the table body, the row text is split into lines and the lines at
    positions 0, 4, 5, 6 and 7 are appended to balance_details, organizations,
    submission_date, valid_until_date and documents. Every link in the table whose
    href contains 'firmenauskunft' is appended to organization_links.
    """
    for row in table.select('tbody tr'):
        row_lines = row.get_text().strip().split('\n')  # split once per row
        balance_details.append(row_lines[0])
        organizations.append(row_lines[4])
        submission_date.append(row_lines[5])
        valid_until_date.append(row_lines[6])
        documents.append(row_lines[7])

    # href=True skips anchors without href (a.get('href') would be None for those)
    for a in table.find_all('a', href=True):
        if 'firmenauskunft' in a['href']:
            organization_links.append(a['href'])


total_pages = len(dynamic_audit_url_list) + len(dynamic_peer_url_list)
i = 0

# audit (first table of the page)
for page_url in dynamic_audit_url_list:
    response = requests.get(page_url)
    audit_soup = BeautifulSoup(response.content, "html.parser")
    audit_data = audit_soup.select('table')[0]
    _collect_table(audit_data)

    # Show status
    i += 1
    print(f'Page {i}/{total_pages} scraped.')

# peer (second table of the page)
for page_url in dynamic_peer_url_list:
    response = requests.get(page_url)
    peer_soup = BeautifulSoup(response.content, "html.parser")
    peer_data = peer_soup.select('table')[1]
    _collect_table(peer_data)

    # Show status
    i += 1
    print(f'Page {i}/{total_pages} scraped.')

# Check
print(f'\nAll pages scraped. Length: {len(organizations)} entries.')

Page 1/30 scraped.
Page 2/30 scraped.
Page 3/30 scraped.
Page 4/30 scraped.
Page 5/30 scraped.
Page 6/30 scraped.
Page 7/30 scraped.
Page 8/30 scraped.
Page 9/30 scraped.
Page 10/30 scraped.
Page 11/30 scraped.
Page 12/30 scraped.
Page 13/30 scraped.
Page 14/30 scraped.
Page 15/30 scraped.
Page 16/30 scraped.
Page 17/30 scraped.
Page 18/30 scraped.
Page 19/30 scraped.
Page 20/30 scraped.
Page 21/30 scraped.
Page 22/30 scraped.
Page 23/30 scraped.
Page 24/30 scraped.
Page 25/30 scraped.
Page 26/30 scraped.
Page 27/30 scraped.
Page 28/30 scraped.
Page 29/30 scraped.
Page 30/30 scraped.

All pages scraped. Length: 1455 entries.


In [423]:
# Number of collected organization links (to compare with the number of scraped entries)
len(organization_links)

1455

In [448]:
# Test scrape for one id: open the detail page behind the first organization link
url = organization_links[0]

response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# The id is read from the first cell of the table with id "RW_Firmenliste"
org_id = int(soup.select('table[id="RW_Firmenliste"] tbody tr td')[0].get_text())
org_id

14665

In [444]:
# Scraping for ids: open every organization link and read the id from the first cell
# of the table with id "RW_Firmenliste". None is stored when the page has no such
# cell or the cell text is not a number, so the list stays aligned with the links.
organization_ids = []

for i, link in enumerate(organization_links, start=1):
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")

    try:
        organization_ids.append(int(soup.select('table[id="RW_Firmenliste"] tbody tr td')[0].get_text()))

    except (IndexError, ValueError):
        # IndexError: no matching cell on the page; ValueError: cell text is not an integer
        organization_ids.append(None)

    # Show status
    if i%100 == 0:
        print(f'Scraped {i} links')

print('All links scraped.')

Scraped 100 links
Scraped 200 links
Scraped 300 links
Scraped 400 links
Scraped 500 links
Scraped 600 links
Scraped 700 links
Scraped 800 links
Scraped 900 links
Scraped 1000 links
Scraped 1100 links
Scraped 1200 links
Scraped 1300 links
Scraped 1400 links


In [454]:
# Number of scraped ids; must equal the number of organization links.
# (The former assignment from the misspelled, never-defined name `organzation_ids`
# was removed: it raised a NameError on a fresh kernel.)
len(organization_ids)

1455

In [455]:
# Creating basic dataframe: one column per scraped list, one row per balance entry
balance_columns = {
    'balance_details': balance_details,
    'organization_id': organization_ids,
    'organization': organizations,
    'organization_link': organization_links,
    'submission_date': submission_date,
    'valid_until_date': valid_until_date,
    'documents': documents,
}
ecg_bal_df = pd.DataFrame(balance_columns)

# Overview of the result
print(f'Shape: {ecg_bal_df.shape}\n')
print(f'Datatype: \n{ecg_bal_df.dtypes}\n')
display(ecg_bal_df)

Shape: (1455, 7)

Datatype: 
balance_details       object
organization_id      float64
organization          object
organization_link     object
submission_date       object
valid_until_date      object
documents             object
dtype: object



Unnamed: 0,balance_details,organization_id,organization,organization_link,submission_date,valid_until_date,documents
0,M5.0 Kompaktbilanz,14665.0,ALLMENDA Social Business eG,https://audit.ecogood.org/firmenauskunft-2/?la...,2018.10.28,2021.05.31,Bericht / Testat
1,M5.0 Vollbilanz,14666.0,Backkultur,https://audit.ecogood.org/firmenauskunft-2/?la...,2019.01.31,2021.05.31,Bericht / Testat
2,M5.0 Vollbilanz,14667.0,Bernhard Oberrauch,https://audit.ecogood.org/firmenauskunft-2/?la...,2018.09.01,2023.03.31,Bericht / Testat
3,M5.0 Vollbilanz,14668.0,buch7.de GmbH,https://audit.ecogood.org/firmenauskunft-2/?la...,2018.11.16,2020.12.31,Bericht / Testat
4,M5.0 Kompaktbilanz,14669.0,"Cosyma, EPU",https://audit.ecogood.org/firmenauskunft-2/?la...,2018.07.01,2020.10.31,Bericht / Testat
...,...,...,...,...,...,...,...
1450,M5.0 Kompaktbilanz,164413.0,Thomas Böke,https://audit.ecogood.org/firmenauskunft-2/?la...,2024.02.27,,/
1451,M5.0 Vollbilanz,164461.0,Beigel Steuerberater PartG mbB,https://audit.ecogood.org/firmenauskunft-2/?la...,2024.02.27,,/
1452,M5.0 Vollbilanz,164463.0,dng die neue gesellschaft,https://audit.ecogood.org/firmenauskunft-2/?la...,2024.02.27,,/
1453,M5.0 Vollbilanz,164465.0,stattbau münchen GmbH,https://audit.ecogood.org/firmenauskunft-2/?la...,2024.02.27,,/


In [458]:
# Show the entries for which no organization id could be scraped
ecg_bal_df[ecg_bal_df['organization_id'].isna()]

Unnamed: 0,balance_details,organization_id,organization,organization_link,submission_date,valid_until_date,documents
82,M5.0 Kompaktbilanz,,"Amann, Ulrike, Raum für Gemeinwesenentwicklung",https://audit.ecogood.org/firmenauskunft-2/?la...,2017.05.11,2020.07.31,Bericht / Testat
569,M5.0 Balance completo,,Marketeasing,https://audit.ecogood.org/firmenauskunft-2/?la...,2013.04.01,2015.04.01,/
570,M5.0 Balance completo,,"Veritable, SL",https://audit.ecogood.org/firmenauskunft-2/?la...,2013.04.01,2015.04.01,/
571,M5.0 Balance completo,,USS,https://audit.ecogood.org/firmenauskunft-2/?la...,2017.04.01,2019.04.01,/
572,M5.0 Balance completo,,Mellowmove,https://audit.ecogood.org/firmenauskunft-2/?la...,2017.04.01,2019.04.01,/
...,...,...,...,...,...,...,...
1305,M5.0 Balance completo,,Pou de Beca,https://audit.ecogood.org/firmenauskunft-2/?la...,2019.11.30,,/
1306,M5.0 Balance completo,,Supernaranjas,https://audit.ecogood.org/firmenauskunft-2/?la...,2019.11.30,,/
1307,M5.0 Balance completo,,Aplicaciones Industriales Valencianas S.L.,https://audit.ecogood.org/firmenauskunft-2/?la...,2019.11.30,,/
1377,M5.0 Kompaktbilanz,,Andreas Bachofner GmbH,https://audit.ecogood.org/firmenauskunft-2/?la...,2023.05.10,2025.06.30,Bericht / Zertifikat
