## Preparations

In [1]:
# Imports
import re
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from bs4 import NavigableString, Tag

In [2]:
# Basic URL for information search in ECG database
url = 'https://audit.ecogood.org/firmenauskunft-fvz/?ser_orga=&ser_land=&ser_branche=&ser_ftevon=&ser_ftebis=&ser_ort=&ser_b2b=&ser_b2c=&ser_fvz_view=_md&ser_fvz_pagesize=50&ser_rep=&ser_standard=&ser_auditart='

# Base url for pages when browsing through the table
base_url = 'https://audit.ecogood.org/firmenauskunft-fvz/?ser_orga&ser_land&ser_branche&ser_ftevon&ser_ftebis&ser_ort&ser_b2b&ser_b2c&ser_fvz_view=_md&ser_fvz_pagesize=50&ser_rep&ser_standard&ser_auditart&frm-page-19795='

In [3]:
# Create list with URLs to scrape through (as max. 50 entries per page are displayed)

# Creating soup
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
print(f'Response status: {response.status_code}')
print()

# Finding out number of total entries
total_entries = int(soup.select('strong')[0].get_text())
print(f'Total entries in ECG database: {total_entries}')

# Finding out number of URLs (pages)
number_of_urls = int(total_entries/50)+1
print(f'Number of table pages: {number_of_urls}')
print()

dynamic_url_list = [base_url + str(i) for i in range(1,number_of_urls+1)]
print(f'URL list created. Length of dynamic_url_list: {len(dynamic_url_list)}')

Response status: 200

Total entries in ECG database: 1248
Number of table pages: 25

URL list created. Length of dynamic_url_list: 25


## Checks

In [4]:
# Checking soup
print(soup.prettify())

<!DOCTYPE html>
<html class="avada-html-layout-wide avada-html-header-position-top" lang="de-DE" prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#">
 <head>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Firmenauskunft (FVZ) – Interactive – AuditAdministration
  </title>
  <meta content="max-image-preview:large" name="robots"/>
  <link href="https://audit.ecogood.org/firmenauskunft-fvz/" hreflang="de" rel="alternate"/>
  <link href="https://audit.ecogood.org/en-firmenauskunft-fvz-2/?lang=en" hreflang="en" rel="alternate"/>
  <link href="https://audit.ecogood.org/it-firmenauskunft-fvz-2/?lang=it" hreflang="it" rel="alternate"/>
  <link href="https://audit.ecogood.org/informaciones-sobre-la-empresa/?lang=es" hreflang="es" rel="alternate"/>
  <link href="//s.w.org" rel="dns-prefetch"/>
  <link href="https://audit.ecogood

In [5]:
# Checking tables and their classes (turns out there are sub tables in the main table)
for table in soup.select('table')[0:6]: # More "RW_Aufträge" sub tables coming after
    print(table.get('id'))

RW_Firmenliste
RW_Aufträge
RW_Aufträge
RW_Aufträge
RW_Aufträge
RW_Aufträge


In [6]:
# Checking column titles
company_data = soup.select('table')[0]
column_titles = []
for header in company_data.select('tr')[0]:
    if header.get_text().strip() != '':
        column_titles.append(header.get_text().strip())

#print(len(titles))
print(column_titles)

['ID', 'Organisation', 'Land', 'PLZ', 'Ort', 'Größe', 'Branche', 'Bilanzdetails…', 'BilanztypPeer-Audit…', 'gültig bis…', 'Dokumente…']


## Scraping the first page

In [7]:
# Selecting table with relevant information and assigning variable
company_data = soup.select('table')[0]

In [8]:
# Creating empty lists and dictionary
ids = []
organizations = []
countries = []
postal_codes = []
cities = []
company_sizes = []
economic_sectors = []
balance_details = []
#balance_types = []
#balance_valid_until = []
#documents = []


# Scraping main data
for row in company_data.select('tbody tr'):
    if row.get_text().strip()[0] != '…': # In the subtables, the rows begin with '…'
        ids.append(row.get_text().strip().split('\n')[0])
        organizations.append(row.get_text().strip().split('\n')[1])
        countries.append(row.get_text().strip().split('\n')[2])
        postal_codes.append(row.get_text().strip().split('\n')[3])
        cities.append(row.get_text().strip().split('\n')[4])
        company_sizes.append(row.get_text().strip().split('\n')[5])
        economic_sectors.append(row.get_text().strip().split('\n')[6])
        
        balance_details.append(row.get_text().strip().split('\n')[7:])
        
# Check
len(ids)

50

In [9]:
# Creating dataframe
ecg_df = pd.DataFrame({'id': ids, 
                       'organization': organizations, 
                       'country': countries, 
                       'postal_code': postal_codes, 
                       'city': cities, 
                       'company_size': company_sizes,
                       'economic_sector': economic_sectors,
                       'balance_details': balance_details, 
                       #'balance_type': balance_types,
                       #'balance_valid_until': balance_valid_until,
                       #'documents': documents
                      })
ecg_df

Unnamed: 0,id,organization,country,postal_code,city,company_size,economic_sector,balance_details
0,14885,4e solutions GmbH,DE,70794,Filderstadt,(2)3-10,Handel / Konsum,"[, , …6451, 4.1 StandardPeerevaluation, 30.03..."
1,14886,4plus5,DE,89077,Ulm,(2)3-10,Baugewerbe / Architektur,"[, , …6452, M5.0 KompaktbilanzPeerevaluation, ..."
2,35275,A & K Engemann GbR,DE,34439,Willebadessen,(2)3-10,Nahrungsmittel / Land / Forstwirtschaft,"[, , …24837, M5.0 KompaktbilanzPeerevaluation]"
3,44668,A&P Steuerberatungsgesellschaft mbH,DE,14480,Potsdam,(3)11-25,Beratung / Consulting,"[, , …44667, 4.0 StandardPeerevaluation, 31.1..."
4,29652,AAP-ARCHITEKTEN ZT-GMBH,AT,1080,Wien,(3)11-25,Baugewerbe / Architektur,"[, , …29651, 4.1 StandardDeskaudit, , Bericht..."
5,14887,Abitare,DE,10829,Berlin,(2)3-10,Handwerk,"[, , …6453, 4.1 StandardDeskaudit, 30.11.2020..."
6,46986,"abw – gemeinnützige Gesellschaft für Arbeit, B...",DE,14057,Berlin,(3)11-25,Bildung / Universität / FH / Schulen,"[, , …46985, M5.0 KompaktbilanzPeerevaluation,..."
7,92361,"Action Waterscape, S.L.",ES,46200,Paiporta,(8) Todas las tallas,Otros,"[, , …92518, M5.0 Balance completoauditoría in..."
8,14888,ad fontes,DE,27624,Geestland,(2)3-10,Elektro / Elektronik,"[, , …6454, M5.0 KompaktbilanzPeerevaluation, ..."
9,40981,ADAMAH BioHof G. Zoubek Vertriebs KG,AT,2280,Glinzendorf,(2)3-10,Nahrungsmittel / Land / Forstwirtschaft,"[, , …40980, 4.1 StandardDeskaudit, 30.06.201..."


In [10]:
# Turn missing information in balance_details into missing values
ecg_df.loc[ecg_df['balance_details'].astype(str).str.contains('liegen keine'), 'balance_details'] = None

# Check
ecg_df['balance_details']

0     [, , …6451, 4.1 StandardPeerevaluation,  30.03...
1     [, , …6452, M5.0 KompaktbilanzPeerevaluation, ...
2        [, , …24837, M5.0 KompaktbilanzPeerevaluation]
3     [, , …44667, 4.0 StandardPeerevaluation,  31.1...
4     [, , …29651, 4.1 StandardDeskaudit,  , Bericht...
5     [, , …6453, 4.1 StandardDeskaudit,  30.11.2020...
6     [, , …46985, M5.0 KompaktbilanzPeerevaluation,...
7     [, , …92518, M5.0 Balance completoauditoría in...
8     [, , …6454, M5.0 KompaktbilanzPeerevaluation, ...
9     [, , …40980, 4.1 StandardDeskaudit,  30.06.201...
10    [, , …138298, M5.0 VollbilanzBesuchsaudit,  31...
11                                                 None
12        [, , …150226, M5.0 KompaktbilanzBesuchsaudit]
13    [, , …30336, M5.0 KompaktbilanzPeerevaluation,...
14      [, , …29634, 4.1 StandardDeskaudit,  , Bericht]
15    [, , …132862, M5.0 VollbilanzBesuchsaudit,  31...
16                                                 None
17                                              

In [11]:
# Check entry 1
list(ecg_df[ecg_df['id'] == '14885']['balance_details'])

[['',
  '',
  '…6451',
  '4.1 StandardPeerevaluation',
  ' 30.03.2020  690 ',
  'Bericht Zertifikat']]

In [12]:
# Check entry 2
list(ecg_df[ecg_df['id'] == '14886']['balance_details'])

[['',
  '',
  '…6452',
  'M5.0 KompaktbilanzPeerevaluation',
  ' 28.02.2021   ',
  ' Zertifikat']]

In [13]:
# Check entry 3
list(ecg_df[ecg_df['id'] == '35275']['balance_details'])

[['', '', '…24837', 'M5.0 KompaktbilanzPeerevaluation']]

In [14]:
# Check for multiple audit entries: The algorithm writes them correctly into once cell
list(ecg_df[ecg_df['id'] == '78313']['balance_details'])

[['',
  '',
  '…154215',
  'M5.0 VollbilanzBesuchsaudit',
  ' ',
  ' ',
  '',
  '',
  '…78312',
  'M5.0 VollbilanzPeerevaluation',
  ' 31.07.2024  266 ',
  'Bericht Zertifikat']]

In [15]:
# Check another multiple audit entry
list(ecg_df[ecg_df['id'] == '29652']['balance_details'])

[['',
  '',
  '…29651',
  '4.1 StandardDeskaudit',
  ' ',
  'Bericht ',
  '',
  '',
  '…28492',
  '3.0 StandardDeskaudit',
  ' ',
  'Bericht']]

In [16]:
# Check another multiple audit entry
list(ecg_df[ecg_df['id'] == '14818']['balance_details'])

[['',
  '',
  '…32990',
  'M5.0 KompaktbilanzPeerevaluation',
  ' 31.05.2023  437 ',
  ' Zertifikat',
  '',
  '',
  '…6371',
  'M5.0 KompaktbilanzBesuchsaudit',
  ' 31.07.2020 489 ',
  'Bericht Testat',
  '',
  '',
  '…36551',
  '4.1 StandardDeskaudit',
  ' 31.01.2018 405 ',
  'Bericht Testat']]

In [17]:
# Checking stuff
ecg_df[ecg_df['balance_details'].astype(str).str.contains('\d\d.')][['id','balance_details']]

Unnamed: 0,id,balance_details
0,14885,"[, , …6451, 4.1 StandardPeerevaluation, 30.03..."
1,14886,"[, , …6452, M5.0 KompaktbilanzPeerevaluation, ..."
2,35275,"[, , …24837, M5.0 KompaktbilanzPeerevaluation]"
3,44668,"[, , …44667, 4.0 StandardPeerevaluation, 31.1..."
4,29652,"[, , …29651, 4.1 StandardDeskaudit, , Bericht..."
5,14887,"[, , …6453, 4.1 StandardDeskaudit, 30.11.2020..."
6,46986,"[, , …46985, M5.0 KompaktbilanzPeerevaluation,..."
7,92361,"[, , …92518, M5.0 Balance completoauditoría in..."
8,14888,"[, , …6454, M5.0 KompaktbilanzPeerevaluation, ..."
9,40981,"[, , …40980, 4.1 StandardDeskaudit, 30.06.201..."


In [18]:
# Extracting information from balance_details
'''
To extract information from the balance details: Using pattern
CAN BE AUTOMATED (dependent on max number of balances: for balance1....n...)
'''

number_of_balances = []

balance1_type_code = []
balance1_type_name = []
balance1_valid_until_date = []
balance1_valid_until_year = []
balance1_year = []
balance1_score = []
balance1_documents = []

balance2_type_code = []
balance2_type_name = []
balance2_valid_until_date = []
balance2_valid_until_year = []
balance2_year = []
balance2_score = []
balance2_documents = []

balance3_type_code = []
balance3_type_name = []
balance3_valid_until_date = []
balance3_valid_until_year = []
balance3_year = []
balance3_score = []
balance3_documents = []

balance4_type_code = []
balance4_type_name = []
balance4_valid_until_date = []
balance4_valid_until_year = []
balance4_year = []
balance4_score = []
balance4_documents = []

balance5_type_code = []
balance5_type_name = []
balance5_valid_until_date = []
balance5_valid_until_year = []
balance5_year = []
balance5_score = []
balance5_documents = []

balance6_type_code = []
balance6_type_name = []
balance6_valid_until_date = []
balance6_valid_until_year = []
balance6_year = []
balance6_score = []
balance6_documents = []

balance7_type_code = []
balance7_type_name = []
balance7_valid_until_date = []
balance7_valid_until_year = []
balance7_year = []
balance7_score = []
balance7_documents = []

test = []

for balance_detail in ecg_df['balance_details']:
    
    try:
        if len(balance_detail) <=6:
            number_of_balances.append(1)
        elif len(balance_detail) <=12:
            number_of_balances.append(2)
        elif len(balance_detail) <=18:
            number_of_balances.append(3)
        elif len(balance_detail) <=24:
            number_of_balances.append(4)
        elif len(balance_detail) <=30:
            number_of_balances.append(5)
        elif len(balance_detail) <=36:
            number_of_balances.append(6)
        elif len(balance_detail) <=42:
            number_of_balances.append(7)
        else:
            number_of_balances.append('>7')
    except:
        number_of_balances.append(None)
    
    # balance 1
    try:
        balance1_type_code.append(balance_detail[3].split(' ',1)[0])
    except:
        balance1_type_code.append(None)
    try:
        balance1_type_name.append(balance_detail[3].split(' ',1)[1])
    except:
        balance1_type_name.append(None)
    try:
        balance1_valid_until_date.append(balance_detail[4][1:11])
    except:
        balance1_valid_until_date.append(None)
    try:
        balance1_valid_until_year.append(int(balance_detail[4][7:11]))
    except:
        balance1_valid_until_year.append(None)
    try:
        balance1_year.append(int(balance_detail[4][7:11])-2)
    except:
        balance1_year.append(None)
    try:
        balance1_score.append(int(balance_detail[4][12:]))
    except:
        balance1_score.append(None)    
    try:
        balance1_documents.append(balance_detail[5].strip().replace(' ', '+'))
    except:
        balance1_documents.append(None)
    
    # balance 2
    try:
        balance2_type_code.append(balance_detail[9].split(' ',1)[0])
    except:
        balance2_type_code.append(None)
    try:
        balance2_type_name.append(balance_detail[9].split(' ',1)[1])
    except:
        balance2_type_name.append(None)
    try:
        balance2_valid_until_date.append(balance_detail[10][1:11])
    except:
        balance2_valid_until_date.append(None)
    try:
        balance2_valid_until_year.append(int(balance_detail[10][7:11]))
    except:
        balance2_valid_until_year.append(None)
    try:
        balance2_year.append(int(balance_detail[10][7:11])-2)
    except:
        balance2_year.append(None)
    try:
        balance2_score.append(int(balance_detail[10][12:]))
    except:
        balance2_score.append(None)    
    try:
        balance2_documents.append(balance_detail[11].strip().replace(' ', '+'))
    except:
        balance2_documents.append(None)

    # balance 3
    try:
        balance3_type_code.append(balance_detail[15].split(' ',1)[0])
    except:
        balance3_type_code.append(None)
    try:
        balance3_type_name.append(balance_detail[15].split(' ',1)[1])
    except:
        balance3_type_name.append(None)
    try:
        balance3_valid_until_date.append(balance_detail[16][1:11])
    except:
        balance3_valid_until_date.append(None)
    try:
        balance3_valid_until_year.append(int(balance_detail[16][7:11]))
    except:
        balance3_valid_until_year.append(None)
    try:
        balance3_year.append(int(balance_detail[16][7:11])-2)
    except:
        balance3_year.append(None)
    try:
        balance3_score.append(int(balance_detail[16][12:]))
    except:
        balance3_score.append(None)    
    try:
        balance3_documents.append(balance_detail[17].strip().replace(' ', '+'))
    except:
        balance3_documents.append(None)        

    # balance 4
    try:
        balance4_type_code.append(balance_detail[21].split(' ',1)[0])
    except:
        balance4_type_code.append(None)
    try:
        balance4_type_name.append(balance_detail[21].split(' ',1)[1])
    except:
        balance4_type_name.append(None)
    try:
        balance4_valid_until_date.append(balance_detail[22][1:11])
    except:
        balance4_valid_until_date.append(None)
    try:
        balance4_valid_until_year.append(int(balance_detail[22][7:11]))
    except:
        balance4_valid_until_year.append(None)
    try:
        balance4_year.append(int(balance_detail[22][7:11])-2)
    except:
        balance4_year.append(None)
    try:
        balance4_score.append(int(balance_detail[22][12:]))
    except:
        balance4_score.append(None)    
    try:
        balance4_documents.append(balance_detail[23].strip().replace(' ', '+'))
    except:
        balance4_documents.append(None)        

    # balance 5
    try:
        balance5_type_code.append(balance_detail[27].split(' ',1)[0])
    except:
        balance5_type_code.append(None)
    try:
        balance5_type_name.append(balance_detail[27].split(' ',1)[1])
    except:
        balance5_type_name.append(None)
    try:
        balance5_valid_until_date.append(balance_detail[28][1:11])
    except:
        balance5_valid_until_date.append(None)
    try:
        balance5_valid_until_year.append(int(balance_detail[28][7:11]))
    except:
        balance5_valid_until_year.append(None)
    try:
        balance5_year.append(int(balance_detail[28][7:11])-2)
    except:
        balance5_year.append(None)
    try:
        balance5_score.append(int(balance_detail[28][12:]))
    except:
        balance5_score.append(None)    
    try:
        balance5_documents.append(balance_detail[29].strip().replace(' ', '+'))
    except:
        balance5_documents.append(None)

    # balance 6
    try:
        balance6_type_code.append(balance_detail[33].split(' ',1)[0])
    except:
        balance6_type_code.append(None)
    try:
        balance6_type_name.append(balance_detail[33].split(' ',1)[1])
    except:
        balance6_type_name.append(None)
    try:
        balance6_valid_until_date.append(balance_detail[34][1:11])
    except:
        balance6_valid_until_date.append(None)
    try:
        balance6_valid_until_year.append(int(balance_detail[34][7:11]))
    except:
        balance6_valid_until_year.append(None)
    try:
        balance6_year.append(int(balance_detail[34][7:11])-2)
    except:
        balance6_year.append(None)
    try:
        balance6_score.append(int(balance_detail[34][12:]))
    except:
        balance6_score.append(None)    
    try:
        balance6_documents.append(balance_detail[35].strip().replace(' ', '+'))
    except:
        balance6_documents.append(None)

    # balance 7
    try:
        balance7_type_code.append(balance_detail[39].split(' ',1)[0])
    except:
        balance7_type_code.append(None)
    try:
        balance7_type_name.append(balance_detail[39].split(' ',1)[1])
    except:
        balance7_type_name.append(None)
    try:
        balance7_valid_until_date.append(balance_detail[40][1:11])
    except:
        balance7_valid_until_date.append(None)
    try:
        balance7_valid_until_year.append(int(balance_detail[40][7:11]))
    except:
        balance7_valid_until_year.append(None)
    try:
        balance7_year.append(int(balance_detail[40][7:11])-2)
    except:
        balance7_year.append(None)
    try:
        balance7_score.append(int(balance_detail[40][12:]))
    except:
        balance7_score.append(None)    
    try:
        balance7_documents.append(balance_detail[41].strip().replace(' ', '+'))
    except:
        balance7_documents.append(None)
        
balance1_type_name

['StandardPeerevaluation',
 'KompaktbilanzPeerevaluation',
 'KompaktbilanzPeerevaluation',
 'StandardPeerevaluation',
 'StandardDeskaudit',
 'StandardDeskaudit',
 'KompaktbilanzPeerevaluation',
 'Balance completoauditoría in situ',
 'KompaktbilanzPeerevaluation',
 'StandardDeskaudit',
 'VollbilanzBesuchsaudit',
 None,
 'KompaktbilanzBesuchsaudit',
 'KompaktbilanzPeerevaluation',
 'StandardDeskaudit',
 'VollbilanzBesuchsaudit',
 None,
 None,
 'StandardPeerevaluation',
 'KompaktbilanzPeerevaluation',
 'KompaktbilanzBesuchsaudit',
 'KompaktbilanzPeerevaluation',
 'Balance completoauditoría in situ',
 'VollbilanzBesuchsaudit',
 'VollbilanzPeerevaluation',
 'KompaktbilanzBesuchsaudit',
 'KompaktbilanzBesuchsaudit',
 'Balance completoauditoría in situ',
 'KompaktbilanzDeskaudit',
 'VollbilanzPeerevaluation',
 'VollbilanzPeerevaluation',
 None,
 None,
 'KompaktbilanzPeerevaluation',
 'KompaktbilanzBesuchsaudit',
 'KompaktbilanzPeerevaluation',
 'KompaktbilanzDeskaudit',
 'KompaktbilanzDeskaud

## Scraping all pages of the table

In [19]:
# Creating empty lists and dictionary
ids = []
organizations = []
countries = []
postal_codes = []
cities = []
company_sizes = []
economic_sectors = []
balance_details = []
#balance_types = []
#balance_valid_until = []
#documents = []

# Scraping
i=0
for url in dynamic_url_list:
    # Preparations
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    company_data = soup.select('table')[0]
    
    # Scraping main data
    for row in company_data.select('tbody tr'):
        if row.get_text().strip()[0] != '…': # In the subtables, the rows begin with '…'
            if row.get_text().strip().split('\n')[0] == '92435': # Special treatment of this case
                ids.append(row.get_text().strip().split('\n')[0])
                organizations.append((row.get_text().strip().split('\n')[1] + ' ' + row.get_text().strip().split('\n')[2]))
                countries.append(row.get_text().strip().split('\n')[3])
                postal_codes.append(row.get_text().strip().split('\n')[4])
                cities.append(row.get_text().strip().split('\n')[5])
                company_sizes.append(row.get_text().strip().split('\n')[6])
                economic_sectors.append(row.get_text().strip().split('\n')[7])
                balance_details.append(row.get_text().strip().split('\n')[8:])

            else:
                ids.append(row.get_text().strip().split('\n')[0])
                organizations.append(row.get_text().strip().split('\n')[1])
                countries.append(row.get_text().strip().split('\n')[2])
                postal_codes.append(row.get_text().strip().split('\n')[3])
                cities.append(row.get_text().strip().split('\n')[4])
                company_sizes.append(row.get_text().strip().split('\n')[5])
                economic_sectors.append(row.get_text().strip().split('\n')[6])
                balance_details.append(row.get_text().strip().split('\n')[7:])
                
    # Show status
    i += 1
    print(f'Page {i}/{len(dynamic_url_list)} scraped.')
        
# Check
print(f'\nAll pages scraped. Length: {len(ids)} entries.')

Page 1/25 scraped.
Page 2/25 scraped.
Page 3/25 scraped.
Page 4/25 scraped.
Page 5/25 scraped.
Page 6/25 scraped.
Page 7/25 scraped.
Page 8/25 scraped.
Page 9/25 scraped.
Page 10/25 scraped.
Page 11/25 scraped.
Page 12/25 scraped.
Page 13/25 scraped.
Page 14/25 scraped.
Page 15/25 scraped.
Page 16/25 scraped.
Page 17/25 scraped.
Page 18/25 scraped.
Page 19/25 scraped.
Page 20/25 scraped.
Page 21/25 scraped.
Page 22/25 scraped.
Page 23/25 scraped.
Page 24/25 scraped.
Page 25/25 scraped.

All pages scraped. Length: 1249 entries.


In [20]:
# Creating basic dataframe
ecg_df = pd.DataFrame({'id': ids, 
                       'organization': organizations, 
                       'country': countries, 
                       'postal_code': postal_codes, 
                       'city': cities, 
                       'company_size': company_sizes,
                       'economic_sector': economic_sectors,
                       'balance_details': balance_details, 
                       #'balance_type': balance_types,
                       #'balance_valid_until': balance_valid_until,
                       #'documents': documents
                      })
print(f'Shape: {ecg_df.shape}\n')
print(f'Missing values: \n{ecg_df.isna().sum()}\n')
print(f'Datatype: \n{ecg_df.dtypes}\n')
display(ecg_df)

Shape: (1249, 8)

Missing values: 
id                 0
organization       0
country            0
postal_code        0
city               0
company_size       0
economic_sector    0
balance_details    0
dtype: int64

Datatype: 
id                 object
organization       object
country            object
postal_code        object
city               object
company_size       object
economic_sector    object
balance_details    object
dtype: object



Unnamed: 0,id,organization,country,postal_code,city,company_size,economic_sector,balance_details
0,14885,4e solutions GmbH,DE,70794,Filderstadt,(2)3-10,Handel / Konsum,"[, , …6451, 4.1 StandardPeerevaluation, 30.03..."
1,14886,4plus5,DE,89077,Ulm,(2)3-10,Baugewerbe / Architektur,"[, , …6452, M5.0 KompaktbilanzPeerevaluation, ..."
2,35275,A & K Engemann GbR,DE,34439,Willebadessen,(2)3-10,Nahrungsmittel / Land / Forstwirtschaft,"[, , …24837, M5.0 KompaktbilanzPeerevaluation]"
3,44668,A&P Steuerberatungsgesellschaft mbH,DE,14480,Potsdam,(3)11-25,Beratung / Consulting,"[, , …44667, 4.0 StandardPeerevaluation, 31.1..."
4,29652,AAP-ARCHITEKTEN ZT-GMBH,AT,1080,Wien,(3)11-25,Baugewerbe / Architektur,"[, , …29651, 4.1 StandardDeskaudit, , Bericht..."
...,...,...,...,...,...,...,...,...
1244,92844,Zimmerei Diedrich – Die Gesundhausbauer GmbH,DE,37434,Rüdershausen,(3)11-25,Baugewerbe / Architektur,"[, , …92843, M5.0 KompaktbilanzPeerevaluation,..."
1245,45156,zimmerwerkstatt Nicola Bannier und Michael Weber,DE,29456,Hitzacker,(1)1-2,Handwerk,"[, , …45155, 4.1 StandardPeerevaluation, 28.0..."
1246,14718,Zukunftswerk eG,DE,82319,Starnberg,(2)3-10,Beratung / Consulting,"[, , …3335, M5.0 KompaktbilanzPeerevaluation, ..."
1247,30321,zündstoff. fair organic clothing S. Klemz & M....,DE,79100,Freiburg,(2)3-10,Textilbranche,"[, , …30320, 4.1 StandardPeerevaluation, 30.0..."


## Data cleaning and transformation

In [21]:
# Checking specific case (that has a linebreak within the organization name)
ecg_df[ecg_df['id'] == '92435']

Unnamed: 0,id,organization,country,postal_code,city,company_size,economic_sector,balance_details
560,92435,IMPRESUM GRAFICAS LITOLEMA S.L.,ES,46006,Valencia,(8) Todas las tallas,Otros,"[, , …92434, M5.0 Balance completoPeerevaluation]"


In [22]:
# Remove leading and trailing whitespaces
ecg_df = ecg_df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [23]:
# Make missing values visible
ecg_df = ecg_df.applymap(lambda x: None if x == '' else x)

print(f'Missing values: \n{ecg_df.isna().sum()}\n')

Missing values: 
id                  0
organization        1
country             0
postal_code         2
city                1
company_size       19
economic_sector     4
balance_details     0
dtype: int64



In [24]:
# id
ecg_df['id'] = ecg_df['id'].astype(int) # As type integer
if ecg_df.duplicated(subset='id').any() == False: # Duplicates?
    print('No id duplicates.')

No id duplicates.


In [25]:
# organization
if ecg_df.duplicated(subset='organization').any() == False: # Duplicates?
    print('No id duplicates.')

print(ecg_df[ecg_df['organization'].isna()])

         id organization country postal_code  city company_size  \
1248  14728         None      DE        None  None       (1)1-2   

     economic_sector                                    balance_details  
1248            None  [, , …155202, M5.0 KompaktbilanzAbPE,  ,  , , ...  


In [26]:
# country
ecg_df = ecg_df.rename(columns={'country':'country_code'})

country_dictionary = {
    'AT': 'Austria',
    'BE': 'Belgium',
    'CH': 'Switzerland',
    'DE': 'Germany',
    'DK': 'Denmark',
    'ES': 'Spain',
    'HR': 'Croatia',
    'IT': 'Italy',
    'LU': 'Luxembourg',
    'NL': 'Netherlands',
    'PL': 'Poland',
    'PT': 'Portugal',
    'SE': 'Sweden',
    'UK': 'United Kingdom',
    'US': 'United States',
    'UR': 'Uruguay'
    }
ecg_df['country_name'] = ecg_df['country_code'].map(country_dictionary)

if ecg_df['country_name'].isna().any() == True:
    new_country_codes_df = ecg_df[ecg_df['country_name'].isna()][['id','country_code', 'city']]
    new_country_codes_summary = ecg_df[ecg_df['country_name'].isna()]['country_code'].value_counts()
    new_country_codes_count = len(new_country_codes_summary)
    print(f'{new_country_codes_count} new country code(s) found. Please review. \nCode(s) & count:')
    print(f'{new_country_codes_summary.to_string()}')
    print(f'\n{new_country_codes_df}')

else:
    print(f'Country codes of {len(country_dictionary)} countries mapped successfully.')

Country codes of 16 countries mapped successfully.


In [27]:
# postal_code
print(ecg_df[ecg_df['postal_code'].isna()][['id', 'organization', 'country_code', 'postal_code', 'city']])

ecg_df.loc[ecg_df['id'] == 153690, 'postal_code'] = '12400'
print('\n')
print(ecg_df[ecg_df['postal_code'].isna()][['id', 'organization', 'country_code', 'postal_code', 'city']])

          id                organization country_code postal_code        city
658   153690  LABORATORIO ECOTECH S.R.L.           UR        None  Montevideo
1248   14728                        None           DE        None        None


         id organization country_code postal_code  city
1248  14728         None           DE        None  None


In [28]:
# city
# Temporarily turn missing values into empty strings
ecg_df['city'] = ecg_df['city'].fillna('')

# city (cont.)
print(ecg_df[ecg_df['city'].str.contains(',')][['id', 'country_code', 'postal_code', 'city']])

#print([x for x in ecg_df['city'] if re.match('.*\d.*', x)]) # with regex
print(ecg_df[ecg_df['city'].str.contains('\d')][['id', 'country_code', 'postal_code', 'city']]) # works like this as well

print(ecg_df[ecg_df['city'].str.contains('_')][['id', 'country_code', 'postal_code', 'city']]) # Anything not a letter, number, underscore

city_dictionary = {
    'Gent, Belgien': 'Gent',
    'Ettingen, Basel-Landschaft (Schweiz)': 'Ettingen (Basel-Landschaft)',
    '52074 Aachen': 'Aachen',
    'NL1 City': None
    }

ecg_df['city'] = ecg_df['city'].apply(lambda x: city_dictionary.get(x,x))

# Define missings as missing value again
ecg_df['city'] = ecg_df['city'].apply(lambda x: None if x == '' else x)

         id country_code postal_code                                  city
613   14659           BE        9040                         Gent, Belgien
914  124736           CH        4107  Ettingen, Basel-Landschaft (Schweiz)
         id country_code postal_code          city
15   132863           DE       52074  52074 Aachen
809   58436           NL       12345      NL1 City
Empty DataFrame
Columns: [id, country_code, postal_code, city]
Index: []


In [29]:
# company_size
print(ecg_df['company_size'].value_counts())

employees_dictionary = {
    '(1)1-2': '1-2',
    '(2)3-10': '3-10',
    '(3)11-25': '11-25',
    '(4)26-50': '26-50',
    '(5)51-100': '51-100',
    '(6)101-250': '101-250',
    '(7)>250': '>250',
    '(7)250-': '>250',
    '(8)501-1000': '501-1000',
    '(7)251-500': '251-500',
    '(10)2501-5000': '2501-5000',
    '(11)5001-10000': '5001-10000',
    '(9)1001-2500': '1001-2500',
    '(8) alle Größen': None,
    '(8) Todas las tallas': None,
    '': None
    }

company_size_category_dictionary = {
    '(1)1-2': 'Micro (≤10)',
    '(2)3-10': 'Micro (≤10)',
    '(3)11-25': 'Small (≤50)',
    '(4)26-50': 'Small (≤50)',
    '(5)51-100': 'Mid (≤250)',
    '(6)101-250': 'Mid (≤250)',
    '(7)>250': 'Large (>250)',
    '(7)250-': 'Large (>250)',
    '(8)501-1000': 'Large (>250)',
    '(7)251-500': 'Large (>250)',
    '(10)2501-5000': 'Large (>250)',
    '(11)5001-10000': 'Large (>250)',
    '(9)1001-2500': 'Large (>250)',
    '(8) alle Größen': None,
    '(8) Todas las tallas': None,
    '': None
    }

ecg_df['company_size_category'] = ecg_df['company_size'].map(company_size_category_dictionary)
print('\nCompany size categories:')
print(ecg_df['company_size_category'].value_counts().to_string())

ecg_df['employees'] = ecg_df['company_size'].map(employees_dictionary)
print('\nEmployees:')
print(ecg_df['employees'].value_counts().to_string())

missing_company_sizes = int(ecg_df['company_size_category'].isna().sum())
print(f'\n{missing_company_sizes} entries are missing information about company size.')

(1)1-2                  390
(2)3-10                 326
(3)11-25                155
(4)26-50                118
(5)51-100                73
(6)101-250               60
(8) Todas las tallas     57
(7)>250                  25
(7)250-                  13
(8)501-1000               4
(7)251-500                3
(10)2501-5000             2
(11)5001-10000            2
(9)1001-2500              1
(8) alle Größen           1
Name: company_size, dtype: int64

Company size categories:
Micro (≤10)     716
Small (≤50)     273
Mid (≤250)      133
Large (>250)     50

Employees:
1-2           390
3-10          326
11-25         155
26-50         118
51-100         73
101-250        60
>250           38
501-1000        4
251-500         3
2501-5000       2
5001-10000      2
1001-2500       1

77 entries are missing information about company size.


In [30]:
# economic_sector
print(f"{len(ecg_df['economic_sector'].value_counts())} unique sectors.\n")
unique_ordered_sectors = ecg_df['economic_sector'].sort_values(ascending=True)
unique_ordered_sectors = unique_ordered_sectors.drop_duplicates().reset_index(drop=True)
print(unique_ordered_sectors.head(60))
print(unique_ordered_sectors.tail(5))

65 unique sectors.

0                                      Abfallentsorgung
1                                   Advice / Consulting
2                         Art / Culture / Entertainment
3                                         Assicurazione
4     Assistenza sanitaria / Servizi sociali / Infer...
5                       Automobil / Automobilzulieferer
6                                                Banken
7                              Baugewerbe / Architektur
8                                              Beratung
9                                 Beratung / Consulting
10                 Bildung / Universität / FH / Schulen
11                                               Chemie
12                    Cibo / Agricoltura / Silvicoltura
13                                  Commercio / Consumo
14                                Consigli / Consulenza
15                          Construction / Architecture
16                                       Dienstleistung
17                          

In [31]:
sector_dictionary_EN = {
    'Abfallentsorgung': 'Waste Disposal',
    'Advice / Consulting': 'Consulting',
    'Art / Culture / Entertainment': 'Art / Culture / Entertainment',
    'Assicurazione': 'Waste Disposal',
    'Assistenza sanitaria / Servizi sociali / Infermieristica': 'Health / Social Affairs / Nursing',
    'Automobil / Automobilzulieferer': 'Automotive / Automotive supplier',
    'Banken': 'Banking',
    'Baugewerbe / Architektur': 'Construction / Architecture',
    'Beratung': 'Consulting',
    'Beratung / Consulting': 'Consulting',
    'Bildung / Universität / FH / Schulen': 'Education / University / Polytechnic / Schools',
    'Chemie': 'Chemistry',
    'Cibo / Agricoltura / Silvicoltura': 'Nutrition / Agriculture / Forestry',
    'Commercio / Consumo': 'Trade / Consumption',
    'Consigli / Consulenza': 'Consulting',
    'Construction / Architecture': 'Construction / Architecture',
    'Dienstleistung': 'Services',
    'Druck / Papier / Verpackung': 'Printing / Paper / Packaging',
    'EDP / IT': 'EDP / IT',
    'EDV / IT': 'EDP / IT',
    'Education / University / FH / Schools': 'Education / University / Polytechnic / Schools',
    'Electrical / Electronics': 'Electrical / Electronics',
    'Elektro / Elektronik': 'Electrical / Electronics',
    'Energiewirtschaft': 'Energy',
    'Finanzen': 'Finance',
    'Forschung / Entwicklung / Wissenschaft': 'Research / Development / Science',
    'Gesundheitswesen / Soziales / Pflege': 'Health / Social Affairs / Nursing',
    'Handel / Konsum': 'Trade / Consumption',
    'Handwerk': 'Craft',
    'Immobilien / Facility Management': 'Real Estate / Facility Management',
    'Industrie': 'Industry',
    'Industry': 'Industry',
    'Internet / Multimedia': 'Internet / Multimedia',
    'Kunst / Kultur / Unterhaltung': 'Art / Culture / Entertainment',
    'Marketing / Werbung / PR': 'Marketing / Avertising / PR',
    'Marktforschung': 'Market Research',
    'Maschinen / Anlagenbau': 'Machinery & Plant Engineering',
    'Media': 'Media',
    'Medien': 'Media',
    'Medizin / Pharma': 'Pharma',
    'Medizintechnik': 'Medical Engineering',
    'Nahrungsmittel / Land /  Forstwirtschaft': 'Nutrition / Agriculture / Forestry',
    'Nahrungsmittel / Land / Forstwirtschaft': 'Nutrition / Agriculture / Forestry',
    'Other branches': 'Other',
    'Otros': 'Other',
    'Personalwesen / Personalbeschaffung': 'HR',
    'Seminar / Messeanbieter': 'Seminar / Fair provider',
    'Seminario / Fornitori di fiere': 'Seminar / Fair provider',
    'Software · Branding · SEO · Gemeinwohl': 'EDP / IT',
    'Sonstige Branchen': 'Other',
    'Sport / Fitness / Beauty': 'Sports / Fitness / Beauty',
    'Steuerberatung / Wirtschaftsprüfung': 'Fiscal Advice / Auditing',
    'Telecommunication': 'Telecommunication',
    'Telekommunikation': 'Telecommunication',
    'Textilbranche': 'Textile',
    'Tourism / Hotel / Gastronomy': 'Tourism / Hotel / Catering',
    'Tourismus / Hotel / Gastronomie': 'Tourism / Hotel / Catering',
    'Tourismus / Hotel / Gastronomie/ Lebensmittel': 'Tourism / Hotel / Catering',
    'Turismo / Hotel / Gastronomia': 'Tourism / Hotel / Catering',
    'Vereine': 'Associations / Societies',
    'Verkehr / Transport / Logistik': 'Transport / Logistics',
    'Versicherung': 'Insurance',
    'sonstiges': 'Other',
    'Öffentliche Verwaltung': 'Public Administration',
    '': None
    }

sector_dictionary_DE = {
    'Abfallentsorgung': 'Abfallentsorgung',
    'Advice / Consulting': 'Beratung / Consulting',
    'Art / Culture / Entertainment': 'Kunst / Kultur / Unterhaltung',
    'Assicurazione': 'Abfallentsorgung',
    'Assistenza sanitaria / Servizi sociali / Infermieristica': 'Gesundheitswesen / Soziales / Pflege',
    'Automobil / Automobilzulieferer': 'Automobil / Automobilzulieferer',
    'Banken': 'Banken',
    'Baugewerbe / Architektur': 'Baugewerbe / Architektur',
    'Beratung': 'Beratung / Consulting',
    'Beratung / Consulting': 'Beratung / Consulting',
    'Bildung / Universität / FH / Schulen': 'Bildung / Universität / FH / Schulen',
    'Chemie': 'Chemie',
    'Cibo / Agricoltura / Silvicoltura': 'Nahrungsmittel / Land /  Forstwirtschaft',
    'Commercio / Consumo': 'Handel / Konsum',
    'Consigli / Consulenza': 'Beratung / Consulting',
    'Construction / Architecture': 'Baugewerbe / Architektur',
    'Dienstleistung': 'Dienstleistung',
    'Druck / Papier / Verpackung': 'Druck / Papier / Verpackung',
    'EDP / IT': 'EDV / IT',
    'EDV / IT': 'EDV / IT',
    'Education / University / FH / Schools': 'Bildung / Universität / FH / Schulen',
    'Electrical / Electronics': 'Elektro / Elektronik',
    'Elektro / Elektronik': 'Elektro / Elektronik',
    'Energiewirtschaft': 'Energiewirtschaft',
    'Finanzen': 'Finanzen',
    'Forschung / Entwicklung / Wissenschaft': 'Forschung / Entwicklung / Wissenschaft',
    'Gesundheitswesen / Soziales / Pflege': 'Gesundheitswesen / Soziales / Pflege',
    'Handel / Konsum': 'Handel / Konsum',
    'Handwerk': 'Handwerk',
    'Immobilien / Facility Management': 'Immobilien / Facility Management',
    'Industrie': 'Industrie',
    'Industry': 'Industrie',
    'Internet / Multimedia': 'Internet / Multimedia',
    'Kunst / Kultur / Unterhaltung': 'Kunst / Kultur / Unterhaltung',
    'Marketing / Werbung / PR': 'Marketing / Werbung / PR',
    'Marktforschung': 'Marktforschung',
    'Maschinen / Anlagenbau': 'Maschinen- & Anlagenbau',
    'Media': 'Medien',
    'Medien': 'Medien',
    'Medizin / Pharma': 'Medizin / Pharma',
    'Medizintechnik': 'Medizintechnik',
    'Nahrungsmittel / Land /  Forstwirtschaft': 'Nahrungsmittel / Land /  Forstwirtschaft',
    'Nahrungsmittel / Land / Forstwirtschaft': 'Nahrungsmittel / Land /  Forstwirtschaft',
    'Other branches': 'Sonstige',
    'Otros': 'Sonstige',
    'Personalwesen / Personalbeschaffung': 'Personalwesen / Personalbeschaffung',
    'Seminar / Messeanbieter': 'Seminar- / Messeanbieter',
    'Seminario / Fornitori di fiere': 'Seminar- / Messeanbieter',
    'Software · Branding · SEO · Gemeinwohl': 'EDV / IT',
    'Sonstige Branchen': 'Sonstige',
    'Sport / Fitness / Beauty': 'Sport / Fitness / Beauty',
    'Steuerberatung / Wirtschaftsprüfung': 'Steuerberatung / Wirtschaftsprüfung',
    'Telecommunication': 'Telekommunikation',
    'Telekommunikation': 'Telekommunikation',
    'Textilbranche': 'Textilbranche',
    'Tourism / Hotel / Gastronomy': 'Tourismus / Hotel / Gastronomie',
    'Tourismus / Hotel / Gastronomie': 'Tourismus / Hotel / Gastronomie',
    'Tourismus / Hotel / Gastronomie/ Lebensmittel': 'Tourismus / Hotel / Gastronomie',
    'Turismo / Hotel / Gastronomia': 'Tourismus / Hotel / Gastronomie',
    'Vereine': 'Vereine',
    'Verkehr / Transport / Logistik': 'Verkehr / Transport / Logistik',
    'Versicherung': 'Versicherung',
    'sonstiges': 'Sonstige',
    'Öffentliche Verwaltung': 'Öffentliche Verwaltung',
    '': None
    }

ecg_df['economic_sector_EN'] = ecg_df['economic_sector'].map(sector_dictionary_EN)
ecg_df['economic_sector_DE'] = ecg_df['economic_sector'].map(sector_dictionary_DE)

In [32]:
# Check EN
print(ecg_df['economic_sector_EN'].isna().sum())
print(ecg_df[ecg_df['economic_sector_EN'].isna()][['id', 'organization', 'economic_sector', 'economic_sector_EN']])

ecg_df.loc[ecg_df['id'] == 25050, 'economic_sector_EN'] = 'EDP / IT'
print(ecg_df[ecg_df['economic_sector_EN'].isna()][['id', 'organization', 'economic_sector', 'economic_sector_EN']])

# Check DE
print(ecg_df['economic_sector_DE'].isna().sum())
print(ecg_df[ecg_df['economic_sector_DE'].isna()][['id', 'organization', 'economic_sector', 'economic_sector_EN']])

ecg_df.loc[ecg_df['id'] == 25050, 'economic_sector_DE'] = 'EDV / IT'
print(ecg_df[ecg_df['economic_sector_DE'].isna()][['id', 'organization', 'economic_sector', 'economic_sector_EN']])

6
          id                      organization  \
15    132863                          AGIT mbH   
301    28513  DROSG-PLÖCKINGER & PLÖCKINGER OG   
713    28515                  Manfred Kofranek   
892   151681                   pusch & Partner   
1175   25050             visuellverstehen GmbH   
1248   14728                              None   

                             economic_sector economic_sector_EN  
15                      Wirtschaftsförderung                NaN  
301                                     None                NaN  
713                                     None                NaN  
892                                     None                NaN  
1175  Software · Branding · SEO · Gemeinwohl                NaN  
1248                                    None                NaN  
          id                      organization       economic_sector  \
15    132863                          AGIT mbH  Wirtschaftsförderung   
301    28513  DROSG-PLÖCKINGER & PLÖCKING

In [33]:
# balance_details
'''
To extract information from balance_details: Using pattern
CAN BE AUTOMATED (dependent on max number of balances: for balance1....n...)

# With dictionaries: https://stackoverflow.com/questions/6181935/how-do-you-create-different-variable-names-while-in-a-loop
d = {}
for x in range(1, 10):
    d["string{0}".format(x)] = "Hello"
'''

# Turn missing information into missing values
ecg_df.loc[ecg_df['balance_details'].astype(str).str.contains('liegen keine'), 'balance_details'] = None

# Check
ecg_df['balance_details']

# Define lists
number_of_balances = []

balance1_type_code = []
balance1_type_name = []
balance1_valid_until_date = []
balance1_valid_until_year = []
balance1_year = []
balance1_score = []
balance1_documents = []

balance2_type_code = []
balance2_type_name = []
balance2_valid_until_date = []
balance2_valid_until_year = []
balance2_year = []
balance2_score = []
balance2_documents = []

balance3_type_code = []
balance3_type_name = []
balance3_valid_until_date = []
balance3_valid_until_year = []
balance3_year = []
balance3_score = []
balance3_documents = []

balance4_type_code = []
balance4_type_name = []
balance4_valid_until_date = []
balance4_valid_until_year = []
balance4_year = []
balance4_score = []
balance4_documents = []

balance5_type_code = []
balance5_type_name = []
balance5_valid_until_date = []
balance5_valid_until_year = []
balance5_year = []
balance5_score = []
balance5_documents = []

balance6_type_code = []
balance6_type_name = []
balance6_valid_until_date = []
balance6_valid_until_year = []
balance6_year = []
balance6_score = []
balance6_documents = []

balance7_type_code = []
balance7_type_name = []
balance7_valid_until_date = []
balance7_valid_until_year = []
balance7_year = []
balance7_score = []
balance7_documents = []

balance8_type_code = []
balance8_type_name = []
balance8_valid_until_date = []
balance8_valid_until_year = []
balance8_year = []
balance8_score = []
balance8_documents = []

balance9_type_code = []
balance9_type_name = []
balance9_valid_until_date = []
balance9_valid_until_year = []
balance9_year = []
balance9_score = []
balance9_documents = []

balance10_type_code = []
balance10_type_name = []
balance10_valid_until_date = []
balance10_valid_until_year = []
balance10_year = []
balance10_score = []
balance10_documents = []

# Fill lists
for balance_detail in ecg_df['balance_details']:
    
    try:
        if len(balance_detail) <=6:
            number_of_balances.append(1)
        elif len(balance_detail) <=12:
            number_of_balances.append(2)
        elif len(balance_detail) <=18:
            number_of_balances.append(3)
        elif len(balance_detail) <=24:
            number_of_balances.append(4)
        elif len(balance_detail) <=30:
            number_of_balances.append(5)
        elif len(balance_detail) <=36:
            number_of_balances.append(6)
        elif len(balance_detail) <=42:
            number_of_balances.append(7)
        elif len(balance_detail) <=48:
            number_of_balances.append(8)
        elif len(balance_detail) <=54:
            number_of_balances.append(9)
        elif len(balance_detail) <=60:
            number_of_balances.append(10)
        else:
            number_of_balances.append('>10')
    except:
        number_of_balances.append(None)
    
    # balance 1
    try:
        balance1_type_code.append(balance_detail[3].split(' ',1)[0])
    except:
        balance1_type_code.append(None)
    try:
        balance1_type_name.append(balance_detail[3].split(' ',1)[1])
    except:
        balance1_type_name.append(None)
    try:
        balance1_valid_until_date.append(balance_detail[4][1:11].replace('.', '-'))
    except:
        balance1_valid_until_date.append(None)
    try:
        balance1_valid_until_year.append(int(balance_detail[4][7:11]))
    except:
        balance1_valid_until_year.append(None)
    try:
        balance1_year.append(int(balance_detail[4][7:11])-2)
    except:
        balance1_year.append(None)
    try:
        balance1_score.append(int(balance_detail[4][12:]))
    except:
        balance1_score.append(None)    
    try:
        balance1_documents.append(balance_detail[5].strip().replace(' ', '+'))
    except:
        balance1_documents.append(None)
    
    # balance 2
    try:
        balance2_type_code.append(balance_detail[9].split(' ',1)[0])
    except:
        balance2_type_code.append(None)
    try:
        balance2_type_name.append(balance_detail[9].split(' ',1)[1])
    except:
        balance2_type_name.append(None)
    try:
        balance2_valid_until_date.append(balance_detail[10][1:11].replace('.', '-'))
    except:
        balance2_valid_until_date.append(None)
    try:
        balance2_valid_until_year.append(int(balance_detail[10][7:11]))
    except:
        balance2_valid_until_year.append(None)
    try:
        balance2_year.append(int(balance_detail[10][7:11])-2)
    except:
        balance2_year.append(None)
    try:
        balance2_score.append(int(balance_detail[10][12:]))
    except:
        balance2_score.append(None)    
    try:
        balance2_documents.append(balance_detail[11].strip().replace(' ', '+'))
    except:
        balance2_documents.append(None)

    # balance 3
    try:
        balance3_type_code.append(balance_detail[15].split(' ',1)[0])
    except:
        balance3_type_code.append(None)
    try:
        balance3_type_name.append(balance_detail[15].split(' ',1)[1])
    except:
        balance3_type_name.append(None)
    try:
        balance3_valid_until_date.append(balance_detail[16][1:11].replace('.', '-'))
    except:
        balance3_valid_until_date.append(None)
    try:
        balance3_valid_until_year.append(int(balance_detail[16][7:11]))
    except:
        balance3_valid_until_year.append(None)
    try:
        balance3_year.append(int(balance_detail[16][7:11])-2)
    except:
        balance3_year.append(None)
    try:
        balance3_score.append(int(balance_detail[16][12:]))
    except:
        balance3_score.append(None)    
    try:
        balance3_documents.append(balance_detail[17].strip().replace(' ', '+'))
    except:
        balance3_documents.append(None)        

    # balance 4
    try:
        balance4_type_code.append(balance_detail[21].split(' ',1)[0])
    except:
        balance4_type_code.append(None)
    try:
        balance4_type_name.append(balance_detail[21].split(' ',1)[1])
    except:
        balance4_type_name.append(None)
    try:
        balance4_valid_until_date.append(balance_detail[22][1:11].replace('.', '-'))
    except:
        balance4_valid_until_date.append(None)
    try:
        balance4_valid_until_year.append(int(balance_detail[22][7:11]))
    except:
        balance4_valid_until_year.append(None)
    try:
        balance4_year.append(int(balance_detail[22][7:11])-2)
    except:
        balance4_year.append(None)
    try:
        balance4_score.append(int(balance_detail[22][12:]))
    except:
        balance4_score.append(None)    
    try:
        balance4_documents.append(balance_detail[23].strip().replace(' ', '+'))
    except:
        balance4_documents.append(None)        

    # balance 5
    try:
        balance5_type_code.append(balance_detail[27].split(' ',1)[0])
    except:
        balance5_type_code.append(None)
    try:
        balance5_type_name.append(balance_detail[27].split(' ',1)[1])
    except:
        balance5_type_name.append(None)
    try:
        balance5_valid_until_date.append(balance_detail[28][1:11].replace('.', '-'))
    except:
        balance5_valid_until_date.append(None)
    try:
        balance5_valid_until_year.append(int(balance_detail[28][7:11]))
    except:
        balance5_valid_until_year.append(None)
    try:
        balance5_year.append(int(balance_detail[28][7:11])-2)
    except:
        balance5_year.append(None)
    try:
        balance5_score.append(int(balance_detail[28][12:]))
    except:
        balance5_score.append(None)    
    try:
        balance5_documents.append(balance_detail[29].strip().replace(' ', '+'))
    except:
        balance5_documents.append(None)

    # balance 6
    try:
        balance6_type_code.append(balance_detail[33].split(' ',1)[0])
    except:
        balance6_type_code.append(None)
    try:
        balance6_type_name.append(balance_detail[33].split(' ',1)[1])
    except:
        balance6_type_name.append(None)
    try:
        balance6_valid_until_date.append(balance_detail[34][1:11].replace('.', '-'))
    except:
        balance6_valid_until_date.append(None)
    try:
        balance6_valid_until_year.append(int(balance_detail[34][7:11]))
    except:
        balance6_valid_until_year.append(None)
    try:
        balance6_year.append(int(balance_detail[34][7:11])-2)
    except:
        balance6_year.append(None)
    try:
        balance6_score.append(int(balance_detail[34][12:]))
    except:
        balance6_score.append(None)    
    try:
        balance6_documents.append(balance_detail[35].strip().replace(' ', '+'))
    except:
        balance6_documents.append(None)

    # balance 7
    try:
        balance7_type_code.append(balance_detail[39].split(' ',1)[0])
    except:
        balance7_type_code.append(None)
    try:
        balance7_type_name.append(balance_detail[39].split(' ',1)[1])
    except:
        balance7_type_name.append(None)
    try:
        balance7_valid_until_date.append(balance_detail[40][1:11].replace('.', '-'))
    except:
        balance7_valid_until_date.append(None)
    try:
        balance7_valid_until_year.append(int(balance_detail[40][7:11]))
    except:
        balance7_valid_until_year.append(None)
    try:
        balance7_year.append(int(balance_detail[40][7:11])-2)
    except:
        balance7_year.append(None)
    try:
        balance7_score.append(int(balance_detail[40][12:]))
    except:
        balance7_score.append(None)    
    try:
        balance7_documents.append(balance_detail[41].strip().replace(' ', '+'))
    except:
        balance7_documents.append(None)
        
    # balance 8
    try:
        balance8_type_code.append(balance_detail[45].split(' ',1)[0])
    except:
        balance8_type_code.append(None)
    try:
        balance8_type_name.append(balance_detail[45].split(' ',1)[1])
    except:
        balance8_type_name.append(None)
    try:
        balance8_valid_until_date.append(balance_detail[46][1:11].replace('.', '-'))
    except:
        balance8_valid_until_date.append(None)
    try:
        balance8_valid_until_year.append(int(balance_detail[46][7:11]))
    except:
        balance8_valid_until_year.append(None)
    try:
        balance8_year.append(int(balance_detail[46][7:11])-2)
    except:
        balance8_year.append(None)
    try:
        balance8_score.append(int(balance_detail[46][12:]))
    except:
        balance8_score.append(None)    
    try:
        balance8_documents.append(balance_detail[47].strip().replace(' ', '+'))
    except:
        balance8_documents.append(None)
        
    # balance 9
    try:
        balance9_type_code.append(balance_detail[51].split(' ',1)[0])
    except:
        balance9_type_code.append(None)
    try:
        balance9_type_name.append(balance_detail[51].split(' ',1)[1])
    except:
        balance9_type_name.append(None)
    try:
        balance9_valid_until_date.append(balance_detail[52][1:11].replace('.', '-'))
    except:
        balance9_valid_until_date.append(None)
    try:
        balance9_valid_until_year.append(int(balance_detail[52][7:11]))
    except:
        balance9_valid_until_year.append(None)
    try:
        balance9_year.append(int(balance_detail[52][7:11])-2)
    except:
        balance9_year.append(None)
    try:
        balance9_score.append(int(balance_detail[52][12:]))
    except:
        balance9_score.append(None)    
    try:
        balance9_documents.append(balance_detail[53].strip().replace(' ', '+'))
    except:
        balance9_documents.append(None)
        
    # balance 10
    try:
        balance10_type_code.append(balance_detail[57].split(' ',1)[0])
    except:
        balance10_type_code.append(None)
    try:
        balance10_type_name.append(balance_detail[57].split(' ',1)[1])
    except:
        balance10_type_name.append(None)
    try:
        balance10_valid_until_date.append(balance_detail[58][1:11].replace('.', '-'))
    except:
        balance10_valid_until_date.append(None)
    try:
        balance10_valid_until_year.append(int(balance_detail[58][7:11]))
    except:
        balance10_valid_until_year.append(None)
    try:
        balance10_year.append(int(balance_detail[58][7:11])-2)
    except:
        balance10_year.append(None)
    try:
        balance10_score.append(int(balance_detail[58][12:]))
    except:
        balance10_score.append(None)    
    try:
        balance10_documents.append(balance_detail[59].strip().replace(' ', '+'))
    except:
        balance10_documents.append(None)

In [34]:
valid_number_of_balances = []
for entry in number_of_balances:
    if entry is not None and isinstance(entry, int):
        valid_number_of_balances.append(entry)

max_number_of_balances = max(valid_number_of_balances)

print(f'Maximum number of balances: {max_number_of_balances}')

print(pd.Series(valid_number_of_balances).value_counts().to_string())

print(f'MISSG {len(number_of_balances)-len(valid_number_of_balances)}')

Maximum number of balances: 8
1    967
2    157
3     37
4     13
5      2
6      2
7      1
8      1
MISSG 69


In [35]:
balance1_valid_until_date[859]

In [36]:
# Correct scores being placed in valid_until_date lists due to missing values in valid_until_date 
i = 0
for element in balance1_valid_until_date:
    try:
        if len(element.strip()) <= 4:
            balance1_valid_until_date[i] = None
            balance1_score[i] = element
    except:
        pass
    i += 1

i = 0
for element in balance2_valid_until_date:
    try:
        if len(element.strip()) <= 4:
            balance2_valid_until_date[i] = None
            balance2_score[i] = element
    except:
        pass
    i += 1

i = 0
for element in balance3_valid_until_date:
    try:
        if len(element.strip()) <= 4:
            balance3_valid_until_date[i] = None
            balance3_score[i] = element
    except:
        pass
    i += 1

i = 0
for element in balance4_valid_until_date:
    try:
        if len(element.strip()) <= 4:
            balance4_valid_until_date[i] = None
            balance4_score[i] = element
    except:
        pass
    i += 1
    
i = 0
for element in balance5_valid_until_date:
    try:
        if len(element.strip()) <= 4:
            balance5_valid_until_date[i] = None
            balance5_score[i] = element
    except:
        pass
    i += 1

i = 0
for element in balance6_valid_until_date:
    try:
        if len(element.strip()) <= 4:
            balance6_valid_until_date[i] = None
            balance6_score[i] = element
    except:
        pass
    i += 1

i = 0
for element in balance7_valid_until_date:
    try:
        if len(element.strip()) <= 4:
            balance7_valid_until_date[i] = None
            balance7_score[i] = element
    except:
        pass
    i += 1

i = 0
for element in balance8_valid_until_date:
    try:
        if len(element.strip()) <= 4:
            balance8_valid_until_date[i] = None
            balance8_score[i] = element
    except:
        pass
    i += 1
    
i = 0
for element in balance9_valid_until_date:
    try:
        if len(element.strip()) <= 4:
            balance9_valid_until_date[i] = None
            balance9_score[i] = element
    except:
        pass
    i += 1

i = 0
for element in balance10_valid_until_date:
    try:
        if len(element.strip()) <= 4:
            balance10_valid_until_date[i] = None
            balance10_score[i] = element
    except:
        pass
    i += 1

i = 0
for element in balance10_valid_until_date:
    try:
        if len(element.strip()) <= 4:
            balance10_valid_until_date[i] = None
            balance10_score[i] = element
    except:
        pass
    i += 1

i = 0
for element in balance4_valid_until_date:
    try:
        if len(element.strip()) <= 4:
            balance4_valid_until_date[i] = None
            balance4_score[i] = element
    except:
        pass
    i += 1

In [37]:
'''
MAKE DYNAMIC!
'''

ecg_df['number_of_balances'] = number_of_balances

ecg_df['balance1_type_code'] = balance1_type_code
ecg_df['balance1_type_name'] = balance1_type_name
ecg_df['balance1_valid_until_date'] = balance1_valid_until_date
ecg_df['balance1_valid_until_year'] = balance1_valid_until_year
ecg_df['balance1_year'] = balance1_year
ecg_df['balance1_score'] = balance1_score
ecg_df['balance1_documents'] = balance1_documents

ecg_df['balance2_type_code'] = balance2_type_code
ecg_df['balance2_type_name'] = balance2_type_name
ecg_df['balance2_valid_until_date'] = balance2_valid_until_date
ecg_df['balance2_valid_until_year'] = balance2_valid_until_year
ecg_df['balance2_year'] = balance2_year
ecg_df['balance2_score'] = balance2_score
ecg_df['balance2_documents'] = balance2_documents

ecg_df['balance3_type_code'] = balance3_type_code
ecg_df['balance3_type_name'] = balance3_type_name
ecg_df['balance3_valid_until_date'] = balance3_valid_until_date
ecg_df['balance3_valid_until_year'] = balance3_valid_until_year
ecg_df['balance3_year'] = balance3_year
ecg_df['balance3_score'] = balance3_score
ecg_df['balance3_documents'] = balance3_documents

ecg_df['balance4_type_code'] = balance4_type_code
ecg_df['balance4_type_name'] = balance4_type_name
ecg_df['balance4_valid_until_date'] = balance4_valid_until_date
ecg_df['balance4_valid_until_year'] = balance4_valid_until_year
ecg_df['balance4_year'] = balance4_year
ecg_df['balance4_score'] = balance4_score
ecg_df['balance4_documents'] = balance4_documents

ecg_df['balance5_type_code'] = balance5_type_code
ecg_df['balance5_type_name'] = balance5_type_name
ecg_df['balance5_valid_until_date'] = balance5_valid_until_date
ecg_df['balance5_valid_until_year'] = balance5_valid_until_year
ecg_df['balance5_year'] = balance5_year
ecg_df['balance5_score'] = balance5_score
ecg_df['balance5_documents'] = balance5_documents

ecg_df['balance6_type_code'] = balance6_type_code
ecg_df['balance6_type_name'] = balance6_type_name
ecg_df['balance6_valid_until_date'] = balance6_valid_until_date
ecg_df['balance6_valid_until_year'] = balance6_valid_until_year
ecg_df['balance6_year'] = balance6_year
ecg_df['balance6_score'] = balance6_score
ecg_df['balance6_documents'] = balance6_documents

ecg_df['balance7_type_code'] = balance7_type_code
ecg_df['balance7_type_name'] = balance7_type_name
ecg_df['balance7_valid_until_date'] = balance7_valid_until_date
ecg_df['balance7_valid_until_year'] = balance7_valid_until_year
ecg_df['balance7_year'] = balance7_year
ecg_df['balance7_score'] = balance7_score
ecg_df['balance7_documents'] = balance7_documents

ecg_df['balance8_type_code'] = balance8_type_code
ecg_df['balance8_type_name'] = balance8_type_name
ecg_df['balance8_valid_until_date'] = balance8_valid_until_date
ecg_df['balance8_valid_until_year'] = balance8_valid_until_year
ecg_df['balance8_year'] = balance8_year
ecg_df['balance8_score'] = balance8_score
ecg_df['balance8_documents'] = balance8_documents

ecg_df['balance9_type_code'] = balance9_type_code
ecg_df['balance9_type_name'] = balance9_type_name
ecg_df['balance9_valid_until_date'] = balance9_valid_until_date
ecg_df['balance9_valid_until_year'] = balance9_valid_until_year
ecg_df['balance9_year'] = balance9_year
ecg_df['balance9_score'] = balance9_score
ecg_df['balance9_documents'] = balance9_documents

ecg_df['balance10_type_code'] = balance10_type_code
ecg_df['balance10_type_name'] = balance10_type_name
ecg_df['balance10_valid_until_date'] = balance10_valid_until_date
ecg_df['balance10_valid_until_year'] = balance10_valid_until_year
ecg_df['balance10_year'] = balance10_year
ecg_df['balance10_score'] = balance10_score
ecg_df['balance10_documents'] = balance10_documents

In [38]:
def documents_replace(x):
    x = x.replace('Bericht', 'Report')
    x = x.replace('Zertifikat', 'Certificate')
    x = x.replace('Testat', 'Attestation')
    return x

ecg_df['balance1_documents'] = ecg_df['balance1_documents'].apply(lambda x: documents_replace(x) if isinstance(x, str) else x)
ecg_df['balance2_documents'] = ecg_df['balance2_documents'].apply(lambda x: documents_replace(x) if isinstance(x, str) else x)
ecg_df['balance3_documents'] = ecg_df['balance3_documents'].apply(lambda x: documents_replace(x) if isinstance(x, str) else x)
ecg_df['balance4_documents'] = ecg_df['balance4_documents'].apply(lambda x: documents_replace(x) if isinstance(x, str) else x)
ecg_df['balance5_documents'] = ecg_df['balance5_documents'].apply(lambda x: documents_replace(x) if isinstance(x, str) else x)
ecg_df['balance6_documents'] = ecg_df['balance6_documents'].apply(lambda x: documents_replace(x) if isinstance(x, str) else x)
ecg_df['balance7_documents'] = ecg_df['balance7_documents'].apply(lambda x: documents_replace(x) if isinstance(x, str) else x)
ecg_df['balance8_documents'] = ecg_df['balance8_documents'].apply(lambda x: documents_replace(x) if isinstance(x, str) else x)
ecg_df['balance9_documents'] = ecg_df['balance9_documents'].apply(lambda x: documents_replace(x) if isinstance(x, str) else x)
ecg_df['balance10_documents'] = ecg_df['balance10_documents'].apply(lambda x: documents_replace(x) if isinstance(x, str) else x)

In [39]:
type_name_list = []
type_name_list += balance1_type_name
type_name_list += balance2_type_name
type_name_list += balance3_type_name
type_name_list += balance4_type_name
type_name_list += balance5_type_name
type_name_list += balance6_type_name
type_name_list += balance7_type_name
type_name_list += balance8_type_name
type_name_list += balance9_type_name
type_name_list += balance10_type_name

valid_type_names = []
for type_name in type_name_list:
    if type_name is not None:
        valid_type_names.append(type_name)

list(pd.Series(valid_type_names).unique())

['StandardPeerevaluation',
 'KompaktbilanzPeerevaluation',
 'StandardDeskaudit',
 'Balance completoauditoría in situ',
 'VollbilanzBesuchsaudit',
 'KompaktbilanzBesuchsaudit',
 'VollbilanzPeerevaluation',
 'KompaktbilanzDeskaudit',
 'Balance completoPeerevaluation',
 'KompaktbilanzAbPE',
 'VollbilanzDeskaudit',
 'Bilancio completoValutazione tra pari',
 'Compact BalancePeerevaluation',
 'StandardBesuchsaudit',
 'M1.2Besuchsaudit',
 'M1.2Deskaudit',
 'Compact Balanceonsite audit',
 'Full Balanceonsite audit']

In [40]:
balance_version_dict = {
    'StandardPeerevaluation': 'Standard',
    'KompaktbilanzPeerevaluation': 'Compact',
    'StandardDeskaudit': 'Standard',
    'Balance completoauditoría in situ': 'Full',
    'VollbilanzBesuchsaudit': 'Full',
    'KompaktbilanzBesuchsaudit': 'Compact',
    'VollbilanzPeerevaluation': 'Full',
    'KompaktbilanzDeskaudit': 'Compact',
    'Balance completoPeerevaluation': 'Full',
    'KompaktbilanzAbPE': 'Compact',
    'VollbilanzDeskaudit': 'Full',
    'Bilancio completoValutazione tra pari': 'Full',
    'Compact BalancePeerevaluation': 'Compact',
    'StandardBesuchsaudit': 'Standard',
    'M1.2Besuchsaudit': 'M1.2',
    'M1.2Deskaudit': 'M1.2',
    'Compact Balanceonsite audit': 'Compact',
    'Full Balanceonsite audit': 'Full'
}

balance_audit_type_dict = {
    'StandardPeerevaluation': 'Peer',
    'KompaktbilanzPeerevaluation': 'Peer',
    'StandardDeskaudit': 'Desk',
    'Balance completoauditoría in situ': 'On site',
    'VollbilanzBesuchsaudit': 'On site',
    'KompaktbilanzBesuchsaudit': 'On site',
    'VollbilanzPeerevaluation': 'Peer',
    'KompaktbilanzDeskaudit': 'Desk',
    'Balance completoPeerevaluation': 'Peer',
    'KompaktbilanzAbPE': 'Peer', # I think this could mean "Audit bei Peer Evaluation", not sure though
    'VollbilanzDeskaudit': 'Desk',
    'Bilancio completoValutazione tra pari': 'Peer',
    'Compact BalancePeerevaluation': 'Peer',
    'StandardBesuchsaudit': 'On site',
    'M1.2Besuchsaudit': 'On site',
    'M1.2Deskaudit': 'Desk',
    'Compact Balanceonsite audit': 'On site',
    'Full Balanceonsite audit': 'On site'
}

In [41]:
ecg_df['balance1_version'] = ecg_df['balance1_type_name'].apply(lambda x: balance_version_dict.get(x,x))
ecg_df['balance1_audit_type'] = ecg_df['balance1_type_name'].apply(lambda x: balance_audit_type_dict.get(x,x))

ecg_df['balance2_version'] = ecg_df['balance2_type_name'].apply(lambda x: balance_version_dict.get(x,x))
ecg_df['balance2_audit_type'] = ecg_df['balance2_type_name'].apply(lambda x: balance_audit_type_dict.get(x,x))

ecg_df['balance3_version'] = ecg_df['balance3_type_name'].apply(lambda x: balance_version_dict.get(x,x))
ecg_df['balance3_audit_type'] = ecg_df['balance3_type_name'].apply(lambda x: balance_audit_type_dict.get(x,x))

ecg_df['balance4_version'] = ecg_df['balance4_type_name'].apply(lambda x: balance_version_dict.get(x,x))
ecg_df['balance4_audit_type'] = ecg_df['balance4_type_name'].apply(lambda x: balance_audit_type_dict.get(x,x))

ecg_df['balance5_version'] = ecg_df['balance5_type_name'].apply(lambda x: balance_version_dict.get(x,x))
ecg_df['balance5_audit_type'] = ecg_df['balance5_type_name'].apply(lambda x: balance_audit_type_dict.get(x,x))

ecg_df['balance6_version'] = ecg_df['balance6_type_name'].apply(lambda x: balance_version_dict.get(x,x))
ecg_df['balance6_audit_type'] = ecg_df['balance6_type_name'].apply(lambda x: balance_audit_type_dict.get(x,x))

ecg_df['balance7_version'] = ecg_df['balance7_type_name'].apply(lambda x: balance_version_dict.get(x,x))
ecg_df['balance7_audit_type'] = ecg_df['balance7_type_name'].apply(lambda x: balance_audit_type_dict.get(x,x))

ecg_df['balance8_version'] = ecg_df['balance8_type_name'].apply(lambda x: balance_version_dict.get(x,x))
ecg_df['balance8_audit_type'] = ecg_df['balance8_type_name'].apply(lambda x: balance_audit_type_dict.get(x,x))

ecg_df['balance9_version'] = ecg_df['balance9_type_name'].apply(lambda x: balance_version_dict.get(x,x))
ecg_df['balance9_audit_type'] = ecg_df['balance9_type_name'].apply(lambda x: balance_audit_type_dict.get(x,x))

ecg_df['balance10_version'] = ecg_df['balance10_type_name'].apply(lambda x: balance_version_dict.get(x,x))
ecg_df['balance10_audit_type'] = ecg_df['balance10_type_name'].apply(lambda x: balance_audit_type_dict.get(x,x))

In [42]:
# Reverse-ordering balance_details (so that they are in ascending chronological order)

# Step 1: Split dataframe into chunks dependent upon their number of balances
'''
MAKE DYNAMIC! ALTERNATIVE TO HARD CODING?
'''
balances_0_1_df = ecg_df[(ecg_df['number_of_balances'].isna() == True) | (ecg_df['number_of_balances'] == 1)]
balances_2_df = ecg_df[ecg_df['number_of_balances'] == 2]
balances_3_df = ecg_df[ecg_df['number_of_balances'] == 3]
balances_4_df = ecg_df[ecg_df['number_of_balances'] == 4]
balances_5_df = ecg_df[ecg_df['number_of_balances'] == 5]
balances_6_df = ecg_df[ecg_df['number_of_balances'] == 6]
balances_7_df = ecg_df[ecg_df['number_of_balances'] == 7]
balances_8_df = ecg_df[ecg_df['number_of_balances'] == 8]
balances_9_df = ecg_df[ecg_df['number_of_balances'] == 9]
balances_10_df = ecg_df[ecg_df['number_of_balances'] == 10]

# Step 2: Rename columns (reversing)
balances_2_df.rename(columns= {
    'balance2_type_code': 'balance1_type_code',
    'balance2_type_name': 'balance1_type_name',
    'balance2_version': 'balance1_version',
    'balance2_audit_type': 'balance1_audit_type',
    'balance2_valid_until_date': 'balance1_valid_until_date',
    'balance2_valid_until_year': 'balance1_valid_until_year',
    'balance2_year': 'balance1_year',
    'balance2_score': 'balance1_score',
    'balance2_documents': 'balance1_documents',
    
    'balance1_type_code': 'balance2_type_code',
    'balance1_type_name': 'balance2_type_name',
    'balance1_version': 'balance2_version',
    'balance1_audit_type': 'balance2_audit_type',
    'balance1_valid_until_date': 'balance2_valid_until_date',
    'balance1_valid_until_year': 'balance2_valid_until_year',
    'balance1_year': 'balance2_year',
    'balance1_score': 'balance2_score',
    'balance1_documents': 'balance2_documents'
    }, inplace=True)

balances_3_df.rename(columns= {
    'balance3_type_code': 'balance1_type_code',
    'balance3_type_name': 'balance1_type_name',
    'balance3_version': 'balance1_version',
    'balance3_audit_type': 'balance1_audit_type',
    'balance3_valid_until_date': 'balance1_valid_until_date',
    'balance3_valid_until_year': 'balance1_valid_until_year',
    'balance3_year': 'balance1_year',
    'balance3_score': 'balance1_score',
    'balance3_documents': 'balance1_documents',
    
    'balance1_type_code': 'balance3_type_code',
    'balance1_type_name': 'balance3_type_name',
    'balance1_version': 'balance3_version',
    'balance1_audit_type': 'balance3_audit_type',
    'balance1_valid_until_date': 'balance3_valid_until_date',
    'balance1_valid_until_year': 'balance3_valid_until_year',
    'balance1_year': 'balance3_year',
    'balance1_score': 'balance3_score',
    'balance1_documents': 'balance3_documents'
    }, inplace=True)

balances_4_df.rename(columns= {
    'balance4_type_code': 'balance1_type_code',
    'balance4_type_name': 'balance1_type_name',
    'balance4_version': 'balance1_version',
    'balance4_audit_type': 'balance1_audit_type',
    'balance4_valid_until_date': 'balance1_valid_until_date',
    'balance4_valid_until_year': 'balance1_valid_until_year',
    'balance4_year': 'balance1_year',
    'balance4_score': 'balance1_score',
    'balance4_documents': 'balance1_documents',
    
    'balance1_type_code': 'balance4_type_code',
    'balance1_type_name': 'balance4_type_name',
    'balance1_version': 'balance4_version',
    'balance1_audit_type': 'balance4_audit_type',
    'balance1_valid_until_date': 'balance4_valid_until_date',
    'balance1_valid_until_year': 'balance4_valid_until_year',
    'balance1_year': 'balance4_year',
    'balance1_score': 'balance4_score',
    'balance1_documents': 'balance4_documents',
    
    'balance3_type_code': 'balance2_type_code',
    'balance3_type_name': 'balance2_type_name',
    'balance3_version': 'balance2_version',
    'balance3_audit_type': 'balance2_audit_type',
    'balance3_valid_until_date': 'balance2_valid_until_date',
    'balance3_valid_until_year': 'balance2_valid_until_year',
    'balance3_year': 'balance2_year',
    'balance3_score': 'balance2_score',
    'balance3_documents': 'balance2_documents',
    
    'balance2_type_code': 'balance3_type_code',
    'balance2_type_name': 'balance3_type_name',
    'balance2_version': 'balance3_version',
    'balance2_audit_type': 'balance3_audit_type',
    'balance2_valid_until_date': 'balance3_valid_until_date',
    'balance2_valid_until_year': 'balance3_valid_until_year',
    'balance2_year': 'balance3_year',
    'balance2_score': 'balance3_score',
    'balance2_documents': 'balance3_documents'
    }, inplace=True)

balances_5_df.rename(columns= {
    'balance5_type_code': 'balance1_type_code',
    'balance5_type_name': 'balance1_type_name',
    'balance5_version': 'balance1_version',
    'balance5_audit_type': 'balance1_audit_type',
    'balance5_valid_until_date': 'balance1_valid_until_date',
    'balance5_valid_until_year': 'balance1_valid_until_year',
    'balance5_year': 'balance1_year',
    'balance5_score': 'balance1_score',
    'balance5_documents': 'balance1_documents',
    
    'balance1_type_code': 'balance5_type_code',
    'balance1_type_name': 'balance5_type_name',
    'balance1_version': 'balance5_version',
    'balance1_audit_type': 'balance5_audit_type',
    'balance1_valid_until_date': 'balance5_valid_until_date',
    'balance1_valid_until_year': 'balance5_valid_until_year',
    'balance1_year': 'balance5_year',
    'balance1_score': 'balance5_score',
    'balance1_documents': 'balance5_documents',
    
    'balance4_type_code': 'balance2_type_code',
    'balance4_type_name': 'balance2_type_name',
    'balance4_version': 'balance2_version',
    'balance4_audit_type': 'balance2_audit_type',
    'balance4_valid_until_date': 'balance2_valid_until_date',
    'balance4_valid_until_year': 'balance2_valid_until_year',
    'balance4_year': 'balance2_year',
    'balance4_score': 'balance2_score',
    'balance4_documents': 'balance2_documents',
    
    'balance2_type_code': 'balance4_type_code',
    'balance2_type_name': 'balance4_type_name',
    'balance2_version': 'balance4_version',
    'balance2_audit_type': 'balance4_audit_type',
    'balance2_valid_until_date': 'balance4_valid_until_date',
    'balance2_valid_until_year': 'balance4_valid_until_year',
    'balance2_year': 'balance4_year',
    'balance2_score': 'balance4_score',
    'balance2_documents': 'balance4_documents'
    }, inplace=True)

balances_6_df.rename(columns= {
    'balance6_type_code': 'balance1_type_code',
    'balance6_type_name': 'balance1_type_name',
    'balance6_version': 'balance1_version',
    'balance6_audit_type': 'balance1_audit_type',
    'balance6_valid_until_date': 'balance1_valid_until_date',
    'balance6_valid_until_year': 'balance1_valid_until_year',
    'balance6_year': 'balance1_year',
    'balance6_score': 'balance1_score',
    'balance6_documents': 'balance1_documents',
    
    'balance1_type_code': 'balance6_type_code',
    'balance1_type_name': 'balance6_type_name',
    'balance1_version': 'balance6_version',
    'balance1_audit_type': 'balance6_audit_type',
    'balance1_valid_until_date': 'balance6_valid_until_date',
    'balance1_valid_until_year': 'balance6_valid_until_year',
    'balance1_year': 'balance6_year',
    'balance1_score': 'balance6_score',
    'balance1_documents': 'balance6_documents',
    
    'balance5_type_code': 'balance2_type_code',
    'balance5_type_name': 'balance2_type_name',
    'balance5_version': 'balance2_version',
    'balance5_audit_type': 'balance2_audit_type',
    'balance5_valid_until_date': 'balance2_valid_until_date',
    'balance5_valid_until_year': 'balance2_valid_until_year',
    'balance5_year': 'balance2_year',
    'balance5_score': 'balance2_score',
    'balance5_documents': 'balance2_documents',
    
    'balance2_type_code': 'balance5_type_code',
    'balance2_type_name': 'balance5_type_name',
    'balance2_version': 'balance5_version',
    'balance2_audit_type': 'balance5_audit_type',
    'balance2_valid_until_date': 'balance5_valid_until_date',
    'balance2_valid_until_year': 'balance5_valid_until_year',
    'balance2_year': 'balance5_year',
    'balance2_score': 'balance5_score',
    'balance2_documents': 'balance5_documents',
    
    'balance4_type_code': 'balance3_type_code',
    'balance4_type_name': 'balance3_type_name',
    'balance4_version': 'balance3_version',
    'balance4_audit_type': 'balance3_audit_type',
    'balance4_valid_until_date': 'balance3_valid_until_date',
    'balance4_valid_until_year': 'balance3_valid_until_year',
    'balance4_year': 'balance3_year',
    'balance4_score': 'balance3_score',
    'balance4_documents': 'balance3_documents',
    
    'balance3_type_code': 'balance4_type_code',
    'balance3_type_name': 'balance4_type_name',
    'balance3_version': 'balance4_version',
    'balance3_audit_type': 'balance4_audit_type',
    'balance3_valid_until_date': 'balance4_valid_until_date',
    'balance3_valid_until_year': 'balance4_valid_until_year',
    'balance3_year': 'balance4_year',
    'balance3_score': 'balance4_score',
    'balance3_documents': 'balance4_documents'
    }, inplace=True)

balances_7_df.rename(columns= {
    'balance7_type_code': 'balance1_type_code',
    'balance7_type_name': 'balance1_type_name',
    'balance7_version': 'balance1_version',
    'balance7_audit_type': 'balance1_audit_type',
    'balance7_valid_until_date': 'balance1_valid_until_date',
    'balance7_valid_until_year': 'balance1_valid_until_year',
    'balance7_year': 'balance1_year',
    'balance7_score': 'balance1_score',
    'balance7_documents': 'balance1_documents',
    
    'balance1_type_code': 'balance7_type_code',
    'balance1_type_name': 'balance7_type_name',
    'balance1_version': 'balance7_version',
    'balance1_audit_type': 'balance7_audit_type',
    'balance1_valid_until_date': 'balance7_valid_until_date',
    'balance1_valid_until_year': 'balance7_valid_until_year',
    'balance1_year': 'balance7_year',
    'balance1_score': 'balance7_score',
    'balance1_documents': 'balance7_documents',
    
    'balance6_type_code': 'balance2_type_code',
    'balance6_type_name': 'balance2_type_name',
    'balance6_version': 'balance2_version',
    'balance6_audit_type': 'balance2_audit_type',
    'balance6_valid_until_date': 'balance2_valid_until_date',
    'balance6_valid_until_year': 'balance2_valid_until_year',
    'balance6_year': 'balance2_year',
    'balance6_score': 'balance2_score',
    'balance6_documents': 'balance2_documents',
    
    'balance2_type_code': 'balance6_type_code',
    'balance2_type_name': 'balance6_type_name',
    'balance2_version': 'balance6_version',
    'balance2_audit_type': 'balance6_audit_type',
    'balance2_valid_until_date': 'balance6_valid_until_date',
    'balance2_valid_until_year': 'balance6_valid_until_year',
    'balance2_year': 'balance6_year',
    'balance2_score': 'balance6_score',
    'balance2_documents': 'balance6_documents',
    
    'balance5_type_code': 'balance3_type_code',
    'balance5_type_name': 'balance3_type_name',
    'balance5_version': 'balance3_version',
    'balance5_audit_type': 'balance3_audit_type',
    'balance5_valid_until_date': 'balance3_valid_until_date',
    'balance5_valid_until_year': 'balance3_valid_until_year',
    'balance5_year': 'balance3_year',
    'balance5_score': 'balance3_score',
    'balance5_documents': 'balance3_documents',
    
    'balance3_type_code': 'balance5_type_code',
    'balance3_type_name': 'balance5_type_name',
    'balance3_version': 'balance5_version',
    'balance3_audit_type': 'balance5_audit_type',
    'balance3_valid_until_date': 'balance5_valid_until_date',
    'balance3_valid_until_year': 'balance5_valid_until_year',
    'balance3_year': 'balance5_year',
    'balance3_score': 'balance5_score',
    'balance3_documents': 'balance5_documents'
    }, inplace=True)

balances_8_df.rename(columns= {
    'balance8_type_code': 'balance1_type_code',
    'balance8_type_name': 'balance1_type_name',
    'balance8_version': 'balance1_version',
    'balance8_audit_type': 'balance1_audit_type',
    'balance8_valid_until_date': 'balance1_valid_until_date',
    'balance8_valid_until_year': 'balance1_valid_until_year',
    'balance8_year': 'balance1_year',
    'balance8_score': 'balance1_score',
    'balance8_documents': 'balance1_documents',
    
    'balance1_type_code': 'balance8_type_code',
    'balance1_type_name': 'balance8_type_name',
    'balance1_version': 'balance8_version',
    'balance1_audit_type': 'balance8_audit_type',
    'balance1_valid_until_date': 'balance8_valid_until_date',
    'balance1_valid_until_year': 'balance8_valid_until_year',
    'balance1_year': 'balance8_year',
    'balance1_score': 'balance8_score',
    'balance1_documents': 'balance8_documents',
    
    'balance7_type_code': 'balance2_type_code',
    'balance7_type_name': 'balance2_type_name',
    'balance7_version': 'balance2_version',
    'balance7_audit_type': 'balance2_audit_type',
    'balance7_valid_until_date': 'balance2_valid_until_date',
    'balance7_valid_until_year': 'balance2_valid_until_year',
    'balance7_year': 'balance2_year',
    'balance7_score': 'balance2_score',
    'balance7_documents': 'balance2_documents',
    
    'balance2_type_code': 'balance7_type_code',
    'balance2_type_name': 'balance7_type_name',
    'balance2_version': 'balance7_version',
    'balance2_audit_type': 'balance7_audit_type',
    'balance2_valid_until_date': 'balance7_valid_until_date',
    'balance2_valid_until_year': 'balance7_valid_until_year',
    'balance2_year': 'balance7_year',
    'balance2_score': 'balance7_score',
    'balance2_documents': 'balance7_documents',
    
    'balance6_type_code': 'balance3_type_code',
    'balance6_type_name': 'balance3_type_name',
    'balance6_version': 'balance3_version',
    'balance6_audit_type': 'balance3_audit_type',
    'balance6_valid_until_date': 'balance3_valid_until_date',
    'balance6_valid_until_year': 'balance3_valid_until_year',
    'balance6_year': 'balance3_year',
    'balance6_score': 'balance3_score',
    'balance6_documents': 'balance3_documents',
    
    'balance3_type_code': 'balance6_type_code',
    'balance3_type_name': 'balance6_type_name',
    'balance3_version': 'balance6_version',
    'balance3_audit_type': 'balance6_audit_type',
    'balance3_valid_until_date': 'balance6_valid_until_date',
    'balance3_valid_until_year': 'balance6_valid_until_year',
    'balance3_year': 'balance6_year',
    'balance3_score': 'balance6_score',
    'balance3_documents': 'balance6_documents',
    
    'balance5_type_code': 'balance4_type_code',
    'balance5_type_name': 'balance4_type_name',
    'balance5_version': 'balance4_version',
    'balance5_audit_type': 'balance4_audit_type',
    'balance5_valid_until_date': 'balance4_valid_until_date',
    'balance5_valid_until_year': 'balance4_valid_until_year',
    'balance5_year': 'balance4_year',
    'balance5_score': 'balance4_score',
    'balance5_documents': 'balance4_documents',
    
    'balance4_type_code': 'balance5_type_code',
    'balance4_type_name': 'balance5_type_name',
    'balance4_version': 'balance5_version',
    'balance4_audit_type': 'balance5_audit_type',
    'balance4_valid_until_date': 'balance5_valid_until_date',
    'balance4_valid_until_year': 'balance5_valid_until_year',
    'balance4_year': 'balance5_year',
    'balance4_score': 'balance5_score',
    'balance4_documents': 'balance5_documents'
    }, inplace=True)

balances_9_df.rename(columns= {
    'balance9_type_code': 'balance1_type_code',
    'balance9_type_name': 'balance1_type_name',
    'balance9_version': 'balance1_version',
    'balance9_audit_type': 'balance1_audit_type',
    'balance9_valid_until_date': 'balance1_valid_until_date',
    'balance9_valid_until_year': 'balance1_valid_until_year',
    'balance9_year': 'balance1_year',
    'balance9_score': 'balance1_score',
    'balance9_documents': 'balance1_documents',
    
    'balance1_type_code': 'balance9_type_code',
    'balance1_type_name': 'balance9_type_name',
    'balance1_version': 'balance9_version',
    'balance1_audit_type': 'balance9_audit_type',
    'balance1_valid_until_date': 'balance9_valid_until_date',
    'balance1_valid_until_year': 'balance9_valid_until_year',
    'balance1_year': 'balance9_year',
    'balance1_score': 'balance9_score',
    'balance1_documents': 'balance9_documents',
    
    'balance8_type_code': 'balance2_type_code',
    'balance8_type_name': 'balance2_type_name',
    'balance8_version': 'balance2_version',
    'balance8_audit_type': 'balance2_audit_type',
    'balance8_valid_until_date': 'balance2_valid_until_date',
    'balance8_valid_until_year': 'balance2_valid_until_year',
    'balance8_year': 'balance2_year',
    'balance8_score': 'balance2_score',
    'balance8_documents': 'balance2_documents',
    
    'balance2_type_code': 'balance8_type_code',
    'balance2_type_name': 'balance8_type_name',
    'balance2_version': 'balance8_version',
    'balance2_audit_type': 'balance8_audit_type',
    'balance2_valid_until_date': 'balance8_valid_until_date',
    'balance2_valid_until_year': 'balance8_valid_until_year',
    'balance2_year': 'balance8_year',
    'balance2_score': 'balance8_score',
    'balance2_documents': 'balance8_documents',
    
    'balance7_type_code': 'balance3_type_code',
    'balance7_type_name': 'balance3_type_name',
    'balance7_version': 'balance3_version',
    'balance7_audit_type': 'balance3_audit_type',
    'balance7_valid_until_date': 'balance3_valid_until_date',
    'balance7_valid_until_year': 'balance3_valid_until_year',
    'balance7_year': 'balance3_year',
    'balance7_score': 'balance3_score',
    'balance7_documents': 'balance3_documents',
    
    'balance3_type_code': 'balance7_type_code',
    'balance3_type_name': 'balance7_type_name',
    'balance3_version': 'balance7_version',
    'balance3_audit_type': 'balance7_audit_type',
    'balance3_valid_until_date': 'balance7_valid_until_date',
    'balance3_valid_until_year': 'balance7_valid_until_year',
    'balance3_year': 'balance7_year',
    'balance3_score': 'balance7_score',
    'balance3_documents': 'balance7_documents',
    
    'balance6_type_code': 'balance4_type_code',
    'balance6_type_name': 'balance4_type_name',
    'balance6_version': 'balance4_version',
    'balance6_audit_type': 'balance4_audit_type',
    'balance6_valid_until_date': 'balance4_valid_until_date',
    'balance6_valid_until_year': 'balance4_valid_until_year',
    'balance6_year': 'balance4_year',
    'balance6_score': 'balance4_score',
    'balance6_documents': 'balance4_documents',
    
    'balance4_type_code': 'balance6_type_code',
    'balance4_type_name': 'balance6_type_name',
    'balance4_version': 'balance6_version',
    'balance4_audit_type': 'balance6_audit_type',
    'balance4_valid_until_date': 'balance6_valid_until_date',
    'balance4_valid_until_year': 'balance6_valid_until_year',
    'balance4_year': 'balance6_year',
    'balance4_score': 'balance6_score',
    'balance4_documents': 'balance6_documents'
    }, inplace=True)

balances_10_df.rename(columns= {
    'balance10_type_code': 'balance1_type_code',
    'balance10_type_name': 'balance1_type_name',
    'balance10_version': 'balance1_version',
    'balance10_audit_type': 'balance1_audit_type',
    'balance10_valid_until_date': 'balance1_valid_until_date',
    'balance10_valid_until_year': 'balance1_valid_until_year',
    'balance10_year': 'balance1_year',
    'balance10_score': 'balance1_score',
    'balance10_documents': 'balance1_documents',
    
    'balance1_type_code': 'balance10_type_code',
    'balance1_type_name': 'balance10_type_name',
    'balance1_version': 'balance10_version',
    'balance1_audit_type': 'balance10_audit_type',
    'balance1_valid_until_date': 'balance10_valid_until_date',
    'balance1_valid_until_year': 'balance10_valid_until_year',
    'balance1_year': 'balance10_year',
    'balance1_score': 'balance10_score',
    'balance1_documents': 'balance10_documents',
    
    'balance9_type_code': 'balance2_type_code',
    'balance9_type_name': 'balance2_type_name',
    'balance9_version': 'balance2_version',
    'balance9_audit_type': 'balance2_audit_type',
    'balance9_valid_until_date': 'balance2_valid_until_date',
    'balance9_valid_until_year': 'balance2_valid_until_year',
    'balance9_year': 'balance2_year',
    'balance9_score': 'balance2_score',
    'balance9_documents': 'balance2_documents',
    
    'balance2_type_code': 'balance9_type_code',
    'balance2_type_name': 'balance9_type_name',
    'balance2_version': 'balance9_version',
    'balance2_audit_type': 'balance9_audit_type',
    'balance2_valid_until_date': 'balance9_valid_until_date',
    'balance2_valid_until_year': 'balance9_valid_until_year',
    'balance2_year': 'balance9_year',
    'balance2_score': 'balance9_score',
    'balance2_documents': 'balance9_documents',
    
    'balance8_type_code': 'balance3_type_code',
    'balance8_type_name': 'balance3_type_name',
    'balance8_version': 'balance3_version',
    'balance8_audit_type': 'balance3_audit_type',
    'balance8_valid_until_date': 'balance3_valid_until_date',
    'balance8_valid_until_year': 'balance3_valid_until_year',
    'balance8_year': 'balance3_year',
    'balance8_score': 'balance3_score',
    'balance8_documents': 'balance3_documents',
    
    'balance3_type_code': 'balance8_type_code',
    'balance3_type_name': 'balance8_type_name',
    'balance3_version': 'balance8_version',
    'balance3_audit_type': 'balance8_audit_type',
    'balance3_valid_until_date': 'balance8_valid_until_date',
    'balance3_valid_until_year': 'balance8_valid_until_year',
    'balance3_year': 'balance8_year',
    'balance3_score': 'balance8_score',
    'balance3_documents': 'balance8_documents',
    
    'balance7_type_code': 'balance4_type_code',
    'balance7_type_name': 'balance4_type_name',
    'balance7_version': 'balance4_version',
    'balance7_audit_type': 'balance4_audit_type',
    'balance7_valid_until_date': 'balance4_valid_until_date',
    'balance7_valid_until_year': 'balance4_valid_until_year',
    'balance7_year': 'balance4_year',
    'balance7_score': 'balance4_score',
    'balance7_documents': 'balance4_documents',
    
    'balance4_type_code': 'balance7_type_code',
    'balance4_type_name': 'balance7_type_name',
    'balance4_version': 'balance7_version',
    'balance4_audit_type': 'balance7_audit_type',
    'balance4_valid_until_date': 'balance7_valid_until_date',
    'balance4_valid_until_year': 'balance7_valid_until_year',
    'balance4_year': 'balance7_year',
    'balance4_score': 'balance7_score',
    'balance4_documents': 'balance7_documents',
    
    'balance6_type_code': 'balance5_type_code',
    'balance6_type_name': 'balance5_type_name',
    'balance6_version': 'balance5_version',
    'balance6_audit_type': 'balance5_audit_type',
    'balance6_valid_until_date': 'balance5_valid_until_date',
    'balance6_valid_until_year': 'balance5_valid_until_year',
    'balance6_year': 'balance5_year',
    'balance6_score': 'balance5_score',
    'balance6_documents': 'balance5_documents',
    
    'balance5_type_code': 'balance6_type_code',
    'balance5_type_name': 'balance6_type_name',
    'balance5_version': 'balance6_version',
    'balance5_audit_type': 'balance6_audit_type',
    'balance5_valid_until_date': 'balance6_valid_until_date',
    'balance5_valid_until_year': 'balance6_valid_until_year',
    'balance5_year': 'balance6_year',
    'balance5_score': 'balance6_score',
    'balance5_documents': 'balance6_documents'
    }, inplace=True)

# Step 3: Concat chunks
ecg_df = pd.concat([
    balances_0_1_df,
    balances_2_df,
    balances_3_df,
    balances_4_df,
    balances_5_df,
    balances_6_df,
    balances_7_df,
    balances_8_df,
    balances_9_df,
    balances_10_df
    ])
ecg_df = ecg_df.sort_index()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  balances_2_df.rename(columns= {
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  balances_3_df.rename(columns= {
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  balances_4_df.rename(columns= {
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  balances_5_df.rename(columns= {
A value is trying to be set on a copy of

In [43]:
# Choose relevant columns
ecg_df = ecg_df[[
    'id',
    'organization',
    'country_code',
    'country_name',
    'postal_code',
    'city',
    'company_size_category',
    'employees',
    'economic_sector_EN',
    'economic_sector_DE',
    'number_of_balances',
    
    'balance1_type_code',
    'balance1_version',
    'balance1_audit_type',
    'balance1_valid_until_date',
    'balance1_valid_until_year',
    'balance1_year',
    'balance1_score',
    'balance1_documents',
    
    'balance2_type_code',
    'balance2_version',
    'balance2_audit_type',
    'balance2_valid_until_date',
    'balance2_valid_until_year',
    'balance2_year',
    'balance2_score',
    'balance2_documents',
       
    'balance3_type_code',
    'balance3_version',
    'balance3_audit_type',
    'balance3_valid_until_date',
    'balance3_valid_until_year',
    'balance3_year',
    'balance3_score',
    'balance3_documents',
    
    'balance4_type_code',
    'balance4_version',
    'balance4_audit_type',
    'balance4_valid_until_date',
    'balance4_valid_until_year',
    'balance4_year',
    'balance4_score',
    'balance4_documents',
    
    'balance5_type_code',
    'balance5_version',
    'balance5_audit_type',
    'balance5_valid_until_date',
    'balance5_valid_until_year',
    'balance5_year',
    'balance5_score',
    'balance5_documents',
    
    'balance6_type_code',
    'balance6_version',
    'balance6_audit_type',
    'balance6_valid_until_date',
    'balance6_valid_until_year',
    'balance6_year',
    'balance6_score',
    'balance6_documents',
    
    'balance7_type_code',
    'balance7_version',
    'balance7_audit_type',
    'balance7_valid_until_date',
    'balance7_valid_until_year',
    'balance7_year',
    'balance7_score',
    'balance7_documents',
    
    'balance8_type_code',
    'balance8_version',
    'balance8_audit_type',
    'balance8_valid_until_date',
    'balance8_valid_until_year',
    'balance8_year',
    'balance8_score',
    'balance8_documents',
    
    #'balance9_type_code',
    #'balance9_version',
    #'balance9_audit_type',
    #'balance9_valid_until_date',
    #'balance9_valid_until_year',
    #'balance9_year',
    #'balance9_score',
    #'balance9_documents',
    
    #'balance10_type_code',
    #'balance10_version',
    #'balance10_audit_type',
    #'balance10_valid_until_date',
    #'balance10_valid_until_year',
    #'balance10_year',
    #'balance10_score',
    #'balance10_documents',
    ]]

In [44]:
# Checks after cleaning and transformation
print(f'Datatype: \n{ecg_df.dtypes}\n')

Datatype: 
id                             int64
organization                  object
country_code                  object
country_name                  object
postal_code                   object
                              ...   
balance8_valid_until_date     object
balance8_valid_until_year    float64
balance8_year                float64
balance8_score                object
balance8_documents            object
Length: 75, dtype: object



In [45]:
print(f'Missing values: \n{ecg_df.isna().sum()}\n')

Missing values: 
id                              0
organization                    1
country_code                    0
country_name                    0
postal_code                     1
                             ... 
balance8_valid_until_date    1248
balance8_valid_until_year    1248
balance8_year                1248
balance8_score               1248
balance8_documents           1248
Length: 75, dtype: int64



In [46]:
ecg_df['economic_sector_EN'].value_counts()

Consulting                                        271
Other                                             136
Nutrition / Agriculture / Forestry                 94
Trade / Consumption                                88
Health / Social Affairs / Nursing                  82
Tourism / Hotel / Catering                         80
Services                                           57
Craft                                              56
Construction / Architecture                        55
Education / University / Polytechnic / Schools     36
EDP / IT                                           24
Transport / Logistics                              22
Art / Culture / Entertainment                      21
Energy                                             20
Printing / Paper / Packaging                       19
Real Estate / Facility Management                  19
Marketing / Avertising / PR                        18
Media                                              16
Finance                     

In [47]:
ecg_df['balance1_version'].value_counts()

Compact     556
Full        318
Standard    303
M1.2          3
Name: balance1_version, dtype: int64

In [48]:
pd.set_option('display.max_columns', None) # Showing all columns
display(ecg_df[ecg_df['balance1_version'] == 'M1.2'])
pd.reset_option('max_columns')

Unnamed: 0,id,organization,country_code,country_name,postal_code,city,company_size_category,employees,economic_sector_EN,economic_sector_DE,number_of_balances,balance1_type_code,balance1_version,balance1_audit_type,balance1_valid_until_date,balance1_valid_until_year,balance1_year,balance1_score,balance1_documents,balance2_type_code,balance2_version,balance2_audit_type,balance2_valid_until_date,balance2_valid_until_year,balance2_year,balance2_score,balance2_documents,balance3_type_code,balance3_version,balance3_audit_type,balance3_valid_until_date,balance3_valid_until_year,balance3_year,balance3_score,balance3_documents,balance4_type_code,balance4_version,balance4_audit_type,balance4_valid_until_date,balance4_valid_until_year,balance4_year,balance4_score,balance4_documents,balance5_type_code,balance5_version,balance5_audit_type,balance5_valid_until_date,balance5_valid_until_year,balance5_year,balance5_score,balance5_documents,balance6_type_code,balance6_version,balance6_audit_type,balance6_valid_until_date,balance6_valid_until_year,balance6_year,balance6_score,balance6_documents,balance7_type_code,balance7_version,balance7_audit_type,balance7_valid_until_date,balance7_valid_until_year,balance7_year,balance7_score,balance7_documents,balance8_type_code,balance8_version,balance8_audit_type,balance8_valid_until_date,balance8_valid_until_year,balance8_year,balance8_score,balance8_documents
421,14904,Gemeinde Kirchanschöring,DE,Germany,83417,Kirchanschöring,Micro (≤10),3-10,Public Administration,Öffentliche Verwaltung,1.0,Gemeinde,M1.2,On site,31-10-2020,2020.0,2018.0,588,Report+Attestation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
422,14928,Gemeinde Mäder,AT,Austria,6841,Mäder,Small (≤50),26-50,Public Administration,Öffentliche Verwaltung,2.0,Gemeinde,M1.2,Desk,31-08-2019,2019.0,2017.0,487,Report+Attestation,Gemeinde,M1.2,On site,31-01-2022,2022.0,2020.0,567.0,Report+Attestation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
423,29889,Gemeinde Nenzing,AT,Austria,6710,Nenzing,Mid (≤250),51-100,Public Administration,Öffentliche Verwaltung,1.0,Gemeinde,M1.2,Desk,31-08-2019,2019.0,2017.0,425,Report+Attestation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [49]:
ecg_df['balance1_type_code'].value_counts()

M5.0        874
4.1         231
4.0          64
3.0           8
Gemeinde      3
Name: balance1_type_code, dtype: int64

In [50]:
pd.set_option('display.max_columns', None) # Showing all columns
display(ecg_df[ecg_df['economic_sector_EN'] == 'Public Administration'])
pd.reset_option('max_columns')

Unnamed: 0,id,organization,country_code,country_name,postal_code,city,company_size_category,employees,economic_sector_EN,economic_sector_DE,number_of_balances,balance1_type_code,balance1_version,balance1_audit_type,balance1_valid_until_date,balance1_valid_until_year,balance1_year,balance1_score,balance1_documents,balance2_type_code,balance2_version,balance2_audit_type,balance2_valid_until_date,balance2_valid_until_year,balance2_year,balance2_score,balance2_documents,balance3_type_code,balance3_version,balance3_audit_type,balance3_valid_until_date,balance3_valid_until_year,balance3_year,balance3_score,balance3_documents,balance4_type_code,balance4_version,balance4_audit_type,balance4_valid_until_date,balance4_valid_until_year,balance4_year,balance4_score,balance4_documents,balance5_type_code,balance5_version,balance5_audit_type,balance5_valid_until_date,balance5_valid_until_year,balance5_year,balance5_score,balance5_documents,balance6_type_code,balance6_version,balance6_audit_type,balance6_valid_until_date,balance6_valid_until_year,balance6_year,balance6_score,balance6_documents,balance7_type_code,balance7_version,balance7_audit_type,balance7_valid_until_date,balance7_valid_until_year,balance7_year,balance7_score,balance7_documents,balance8_type_code,balance8_version,balance8_audit_type,balance8_valid_until_date,balance8_valid_until_year,balance8_year,balance8_score,balance8_documents
421,14904,Gemeinde Kirchanschöring,DE,Germany,83417,Kirchanschöring,Micro (≤10),3-10,Public Administration,Öffentliche Verwaltung,1.0,Gemeinde,M1.2,On site,31-10-2020,2020.0,2018.0,588.0,Report+Attestation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
422,14928,Gemeinde Mäder,AT,Austria,6841,Mäder,Small (≤50),26-50,Public Administration,Öffentliche Verwaltung,2.0,Gemeinde,M1.2,Desk,31-08-2019,2019.0,2017.0,487.0,Report+Attestation,Gemeinde,M1.2,On site,31-01-2022,2022.0,2020.0,567.0,Report+Attestation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
423,29889,Gemeinde Nenzing,AT,Austria,6710,Nenzing,Mid (≤250),51-100,Public Administration,Öffentliche Verwaltung,1.0,Gemeinde,M1.2,Desk,31-08-2019,2019.0,2017.0,425.0,Report+Attestation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
424,30096,Gemeindeverband für Abfallwirtschaft und Umwel...,AT,Austria,6850,Dornbirn,Micro (≤10),3-10,Public Administration,Öffentliche Verwaltung,1.0,4.1,Standard,Peer,,,,,Report,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
442,125273,Gesellschaft für Wirtschaftsförderung und Stad...,DE,Germany,37085,Göttingen,Mid (≤250),51-100,Public Administration,Öffentliche Verwaltung,1.0,M5.0,Compact,Peer,28-02-2026,2026.0,2024.0,391.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
665,25485,Landkreis Marburg-Biedenkopf / Eigenbetrieb Ju...,DE,Germany,35043,Marburg,Micro (≤10),3-10,Public Administration,Öffentliche Verwaltung,1.0,M5.0,Full,On site,28-02-2023,2023.0,2021.0,332.0,Report+Attestation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
918,23715,Regensburg Tourismus GmbH (RTG),DE,Germany,93055,Regensburg,Small (≤50),26-50,Public Administration,Öffentliche Verwaltung,2.0,M5.0,Compact,On site,28-02-2023,2023.0,2021.0,526.0,Report+Attestation,M5.0,Compact,Desk,31-01-2025,2025.0,2023.0,458.0,Report+Attestation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
993,14870,SES Stadtentwässerung Stuttgart,DE,Germany,70178,Stuttgart,Large (>250),>250,Public Administration,Öffentliche Verwaltung,2.0,M5.0,Full,On site,30-04-2020,2020.0,2018.0,430.0,Report+Attestation,M5.0,Full,On site,31-08-2024,2024.0,2022.0,504.0,Attestation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1046,15717,Stadtverwaltung Steinheim,DE,Germany,32839,Steinheim,Mid (≤250),51-100,Public Administration,Öffentliche Verwaltung,1.0,M5.0,Full,On site,31-08-2022,2022.0,2020.0,423.0,Report+Attestation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1047,15525,"Stadtwerke Kufstein GmbH, Wasserabteilunge",AT,Austria,6330,Kufstein,Micro (≤10),3-10,Public Administration,Öffentliche Verwaltung,1.0,M5.0,Compact,Desk,31-05-2022,2022.0,2020.0,380.0,Report+Attestation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [51]:
# Check for empty strings
pd.set_option('display.max_columns', None) # Showing all columns
display(ecg_df[ecg_df.eq('').any(axis=1)])
pd.reset_option('max_columns')

Unnamed: 0,id,organization,country_code,country_name,postal_code,city,company_size_category,employees,economic_sector_EN,economic_sector_DE,number_of_balances,balance1_type_code,balance1_version,balance1_audit_type,balance1_valid_until_date,balance1_valid_until_year,balance1_year,balance1_score,balance1_documents,balance2_type_code,balance2_version,balance2_audit_type,balance2_valid_until_date,balance2_valid_until_year,balance2_year,balance2_score,balance2_documents,balance3_type_code,balance3_version,balance3_audit_type,balance3_valid_until_date,balance3_valid_until_year,balance3_year,balance3_score,balance3_documents,balance4_type_code,balance4_version,balance4_audit_type,balance4_valid_until_date,balance4_valid_until_year,balance4_year,balance4_score,balance4_documents,balance5_type_code,balance5_version,balance5_audit_type,balance5_valid_until_date,balance5_valid_until_year,balance5_year,balance5_score,balance5_documents,balance6_type_code,balance6_version,balance6_audit_type,balance6_valid_until_date,balance6_valid_until_year,balance6_year,balance6_score,balance6_documents,balance7_type_code,balance7_version,balance7_audit_type,balance7_valid_until_date,balance7_valid_until_year,balance7_year,balance7_score,balance7_documents,balance8_type_code,balance8_version,balance8_audit_type,balance8_valid_until_date,balance8_valid_until_year,balance8_year,balance8_score,balance8_documents
4,29652,AAP-ARCHITEKTEN ZT-GMBH,AT,Austria,1080.0,Wien,Small (≤50),11-25,Construction / Architecture,Baugewerbe / Architektur,2.0,3.0,Standard,Desk,,,,,Report,4.1,Standard,Desk,,,,,Report,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,92361,"Action Waterscape, S.L.",ES,Spain,46200.0,Paiporta,,,Other,Sonstige,2.0,M5.0,Full,Peer,,,,,,M5.0,Full,On site,30-10-2019,2019.0,2017.0,491.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
13,30337,Africa Green Tec AG,DE,Germany,63512.0,Hainburg,Small (≤50),11-25,Energy,Energiewirtschaft,1.0,M5.0,Compact,Peer,,,,,Report,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
14,29635,AGENTUR FÜR EVALUATION & QUALITÄTSENTWICKLUNG,CH,Switzerland,9100.0,Herisau,Micro (≤10),1-2,Consulting,Beratung / Consulting,1.0,4.1,Standard,Desk,,,,,Report,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
23,78313,Allgäu Batterie GmbH & Co. KG,DE,Germany,87490.0,Haldenwang,Small (≤50),26-50,Electrical / Electronics,Elektro / Elektronik,2.0,M5.0,Full,Peer,31-07-2024,2024.0,2022.0,266.0,Report+Certificate,M5.0,Full,On site,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
36,14772,ananjo informatiker,DE,Germany,63165.0,mühlheim am main,Micro (≤10),3-10,EDP / IT,EDV / IT,2.0,4.1,Standard,Peer,,,,,Report,M5.0,Compact,Desk,31-12-2021,2021.0,2019.0,290.0,Report+Attestation,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
40,92457,Animación y logística cultural S.L.U.,ES,Spain,46017.0,Valencia,,,Other,Sonstige,2.0,M5.0,Full,On site,20-10-2020,2020.0,2018.0,348.0,,M5.0,Full,Peer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
58,92489,ArtMarketing SL,ES,Spain,28224.0,Pozuelo del Alarcón,Micro (≤10),1-2,Other,Sonstige,2.0,M5.0,Full,On site,16-12-2015,2015.0,2013.0,625.0,,M5.0,Full,On site,25-05-2018,2018.0,2016.0,688.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
141,14938,BKK ProVita,DE,Germany,85232.0,Bergkirchen,Mid (≤250),101-250,Health / Social Affairs / Nursing,Gesundheitswesen / Soziales / Pflege,4.0,4.1,Standard,Desk,31-12-2017,2017.0,2015.0,374.0,Attestation,M5.0,Full,On site,30-01-2020,2020.0,2018.0,604.0,Report+Attestation,M5.0,Full,On site,31-01-2022,2022.0,2020.0,790.0,Report+Attestation,M5.0,Full,On site,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
169,30106,BürgerEnergieGenossenschaft eG,DE,Germany,58300.0,Wetter (Ruhr),Micro (≤10),3-10,Energy,Energiewirtschaft,1.0,4.1,Standard,Peer,,,,,Report+Certificate,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [52]:
# Replace all empty strings with missing values
ecg_df = ecg_df.replace('', np.nan)

In [53]:
# Check for empty strings again
pd.set_option('display.max_columns', None) # Showing all columns
display(ecg_df[ecg_df.eq('').any(axis=1)])
pd.reset_option('max_columns')

Unnamed: 0,id,organization,country_code,country_name,postal_code,city,company_size_category,employees,economic_sector_EN,economic_sector_DE,number_of_balances,balance1_type_code,balance1_version,balance1_audit_type,balance1_valid_until_date,balance1_valid_until_year,balance1_year,balance1_score,balance1_documents,balance2_type_code,balance2_version,balance2_audit_type,balance2_valid_until_date,balance2_valid_until_year,balance2_year,balance2_score,balance2_documents,balance3_type_code,balance3_version,balance3_audit_type,balance3_valid_until_date,balance3_valid_until_year,balance3_year,balance3_score,balance3_documents,balance4_type_code,balance4_version,balance4_audit_type,balance4_valid_until_date,balance4_valid_until_year,balance4_year,balance4_score,balance4_documents,balance5_type_code,balance5_version,balance5_audit_type,balance5_valid_until_date,balance5_valid_until_year,balance5_year,balance5_score,balance5_documents,balance6_type_code,balance6_version,balance6_audit_type,balance6_valid_until_date,balance6_valid_until_year,balance6_year,balance6_score,balance6_documents,balance7_type_code,balance7_version,balance7_audit_type,balance7_valid_until_date,balance7_valid_until_year,balance7_year,balance7_score,balance7_documents,balance8_type_code,balance8_version,balance8_audit_type,balance8_valid_until_date,balance8_valid_until_year,balance8_year,balance8_score,balance8_documents


In [54]:
# Turn None type values into NaN
ecg_df = ecg_df.fillna(value=np.nan)

In [55]:
# Last check
ecg_df

Unnamed: 0,id,organization,country_code,country_name,postal_code,city,company_size_category,employees,economic_sector_EN,economic_sector_DE,...,balance7_score,balance7_documents,balance8_type_code,balance8_version,balance8_audit_type,balance8_valid_until_date,balance8_valid_until_year,balance8_year,balance8_score,balance8_documents
0,14885,4e solutions GmbH,DE,Germany,70794,Filderstadt,Micro (≤10),3-10,Trade / Consumption,Handel / Konsum,...,,,,,,,,,,
1,14886,4plus5,DE,Germany,89077,Ulm,Micro (≤10),3-10,Construction / Architecture,Baugewerbe / Architektur,...,,,,,,,,,,
2,35275,A & K Engemann GbR,DE,Germany,34439,Willebadessen,Micro (≤10),3-10,Nutrition / Agriculture / Forestry,Nahrungsmittel / Land / Forstwirtschaft,...,,,,,,,,,,
3,44668,A&P Steuerberatungsgesellschaft mbH,DE,Germany,14480,Potsdam,Small (≤50),11-25,Consulting,Beratung / Consulting,...,,,,,,,,,,
4,29652,AAP-ARCHITEKTEN ZT-GMBH,AT,Austria,1080,Wien,Small (≤50),11-25,Construction / Architecture,Baugewerbe / Architektur,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1244,92844,Zimmerei Diedrich – Die Gesundhausbauer GmbH,DE,Germany,37434,Rüdershausen,Small (≤50),11-25,Construction / Architecture,Baugewerbe / Architektur,...,,,,,,,,,,
1245,45156,zimmerwerkstatt Nicola Bannier und Michael Weber,DE,Germany,29456,Hitzacker,Micro (≤10),1-2,Craft,Handwerk,...,,,,,,,,,,
1246,14718,Zukunftswerk eG,DE,Germany,82319,Starnberg,Micro (≤10),3-10,Consulting,Beratung / Consulting,...,,,,,,,,,,
1247,30321,zündstoff. fair organic clothing S. Klemz & M....,DE,Germany,79100,Freiburg,Micro (≤10),3-10,Textile,Textilbranche,...,,,,,,,,,,


## Saving the dataframe

In [56]:
ecg_df.to_csv('ecg_df.csv', index=False)