## Preparations

In [1]:
# Import modules
import os
import pandas as pd
import numpy as np

In [2]:
# Creating directories
def create_directories():
    if not os.path.exists('tables'):
        os.makedirs('tables')
    if not os.path.exists('logs'):
        os.makedirs('logs')

create_directories()

In [3]:
# Import dataframe
def import_df():
    df = pd.read_csv('/home/seblap/Files/Arbeit/Data Analytics/GitHub repos/economy-for-the-common-good/data/scraping/ecg_df.csv')
    print(df.shape)
    display(df.head())
    
    return df

df = import_df()

(1248, 75)


Unnamed: 0,id,organization,country_code,country_name,postal_code,city,company_size_category,employees,economic_sector_EN,economic_sector_DE,...,balance7_score,balance7_documents,balance8_type_code,balance8_version,balance8_audit_type,balance8_valid_until_date,balance8_valid_until_year,balance8_year,balance8_score,balance8_documents
0,14885,4e solutions GmbH,DE,Germany,70794,Filderstadt,Micro (≤10),3-10,Trade / Consumption,Handel / Konsum,...,,,,,,,,,,
1,14886,4plus5,DE,Germany,89077,Ulm,Micro (≤10),3-10,Construction / Architecture,Baugewerbe / Architektur,...,,,,,,,,,,
2,35275,A & K Engemann GbR,DE,Germany,34439,Willebadessen,Micro (≤10),3-10,Nutrition / Agriculture / Forestry,Nahrungsmittel / Land / Forstwirtschaft,...,,,,,,,,,,
3,44668,A&P Steuerberatungsgesellschaft mbH,DE,Germany,14480,Potsdam,Small (≤50),11-25,Consulting,Beratung / Consulting,...,,,,,,,,,,
4,29652,AAP-ARCHITEKTEN ZT-GMBH,AT,Austria,1080,Wien,Small (≤50),11-25,Construction / Architecture,Baugewerbe / Architektur,...,,,,,,,,,,


## Creating tables

In [4]:
# sizes
def create_sizes(df):
    sizes_list = sorted(list(df['company_size_category'].dropna().unique()))

    if len(sizes_list) > 4:
        print('Warning. Unexpected company size category/categories.')

    sizes = pd.DataFrame({
        'id': [x for x in range (1,len(sizes_list)+1)],
        'company_size_category': [sizes_list[1], sizes_list[3], sizes_list[2], sizes_list[0]]
        })

    sizes.to_csv('tables/sizes.csv', index=False)
    display(sizes)
    return sizes

sizes = create_sizes(df)

#sizes_mapping = dict(zip(sizes['company_size_category'], sizes['id']))

Unnamed: 0,id,company_size_category
0,1,Micro (≤10)
1,2,Small (≤50)
2,3,Mid (≤250)
3,4,Large (>250)


In [5]:
# employees
def create_employees(df):
    employees_list = sorted(list(df['employees'].dropna().unique()))

    if len(employees_list) > 12:
        print('Warning. Unexpected employees value(s).')

    employees = pd.DataFrame({
        'id': [x for x in range (1,len(employees_list)+1)],
        'employees': [
            employees_list[0], 
            employees_list[7],
            employees_list[3],
            employees_list[6], 
            employees_list[10],
            employees_list[2], 
            employees_list[5],
            employees_list[9], 
            employees_list[1],
            employees_list[4],
            employees_list[8],
            employees_list[11]
            ]
        })

    sizes_mapping = {
        1: 1,
        2: 1,
        3: 2,
        4: 2,
        5: 3,
        6: 3,
        7: 4,
        8: 4,
        9: 4,
        10: 4,
        11: 4,
        12: 4
        }

    employees['sizes_id'] = employees['id'].map(sizes_mapping)

    employees.to_csv('tables/employees.csv', index=False)
    display(employees)
    return employees
    
employees = create_employees(df)

employees_mapping = dict(zip(employees['employees'], employees['id']))

Unnamed: 0,id,employees,sizes_id
0,1,1-2,1
1,2,3-10,1
2,3,11-25,2
3,4,26-50,2
4,5,51-100,3
5,6,101-250,3
6,7,251-500,4
7,8,501-1000,4
8,9,1001-2500,4
9,10,2501-5000,4


In [6]:
# sectors
def create_sectors(df):
    sectors_df = df[['economic_sector_EN', 'economic_sector_DE']].dropna()
    sectors_df = sectors_df.drop_duplicates(subset=['economic_sector_EN']).sort_values(by='economic_sector_EN').reset_index(drop=True)

    if len(sectors_df) > 40:
        print('Warning. Unexpected economic sector(s).')

    sectors = pd.DataFrame({
        'id': [x for x in range (1,len(sectors_df)+1)],
        'economic_sector_EN': sectors_df['economic_sector_EN'],
        'economic_sector_DE': sectors_df['economic_sector_DE']
        })

    sectors.to_csv('tables/sectors.csv', index=False)
    display(sectors)
    return sectors

sectors = create_sectors(df)

sectors_mapping_EN = dict(zip(sectors['economic_sector_EN'], sectors['id']))
sectors_mapping_DE = dict(zip(sectors['economic_sector_DE'], sectors['id']))

Unnamed: 0,id,economic_sector_EN,economic_sector_DE
0,1,Art / Culture / Entertainment,Kunst / Kultur / Unterhaltung
1,2,Associations / Societies,Vereine
2,3,Automotive / Automotive supplier,Automobil / Automobilzulieferer
3,4,Banking,Banken
4,5,Chemistry,Chemie
5,6,Construction / Architecture,Baugewerbe / Architektur
6,7,Consulting,Beratung / Consulting
7,8,Craft,Handwerk
8,9,EDP / IT,EDV / IT
9,10,Education / University / Polytechnic / Schools,Bildung / Universität / FH / Schulen


In [7]:
# countries
def create_countries(df):
    countries_df = df[['country_code', 'country_name']].dropna()
    countries_df = countries_df.drop_duplicates(subset=['country_code']).sort_values(by='country_code').reset_index(drop=True)

    if len(countries_df) > 16:
        print('Warning. Unexpected company size category/categories.')

    countries = pd.DataFrame({
        'id': [x for x in range (1,len(countries_df)+1)],
        'country_code': countries_df['country_code'],
        'country_name': countries_df['country_name']
        })

    countries.to_csv('tables/countries.csv', index=False)
    display(countries)
    return countries

countries = create_countries(df)

countries_mapping_code = dict(zip(countries['country_code'], countries['id']))
#countries_mapping_name = dict(zip(countries['country_name'], countries['id']))

Unnamed: 0,id,country_code,country_name
0,1,AT,Austria
1,2,BE,Belgium
2,3,CH,Switzerland
3,4,DE,Germany
4,5,DK,Denmark
5,6,ES,Spain
6,7,HR,Croatia
7,8,IT,Italy
8,9,LU,Luxembourg
9,10,NL,Netherlands


In [8]:
# locations
def create_locations(df, countries_mapping_code):
    locations_df = df[['postal_code', 'city', 'country_code']].dropna()
    locations_df = locations_df.drop_duplicates(subset=['postal_code', 'country_code']).sort_values(by=['country_code', 'postal_code']).reset_index(drop=True)

    if len(locations_df) > 927:
        print('Warning. New location(s).')

    locations = pd.DataFrame({
        'id': [x for x in range (1,len(locations_df)+1)],
        'postal_code': locations_df['postal_code'],
        'city': locations_df['city'],
        'country_code': locations_df['country_code']
        })

    locations['country_id'] = locations['country_code'].map(countries_mapping_code)

    locations.to_csv('tables/locations.csv', index=False)
    display(locations)
    return locations

locations = create_locations(df, countries_mapping_code)

locations_mapping = locations[['id', 'country_id', 'postal_code']].astype(str)
locations_mapping['loc_mapping'] = locations_mapping['country_id'] + ' ' + locations_mapping['postal_code'] 

Unnamed: 0,id,postal_code,city,country_code,country_id
0,1,1010,Wien,AT,1
1,2,1020,Wien,AT,1
2,3,1030,Wien,AT,1
3,4,1040,Wien,AT,1
4,5,1050,Wien,AT,1
...,...,...,...,...,...
922,923,27392,Tomelilla,SE,13
923,924,41458,Göteborg,SE,13
924,925,BT56 8FQ,Portrush,UK,14
925,926,12400,Montevideo,UR,15


In [9]:
# balance_audit_types
def create_balance_audit_types(df):
    balance_audit_types_list = []
    for i in range(1,9):
        balance_audit_types_list.extend(list(df[f'balance{i}_audit_type']))
    
    balance_audit_types_list = sorted(list(pd.Series(balance_audit_types_list).dropna().unique()))

    if len(balance_audit_types_list) > 3:
        print('Warning. Unexpected balance audit type(s).')

    balance_audit_types = pd.DataFrame({
        'id': [x for x in range (1,len(balance_audit_types_list)+1)],
        'audit_type': balance_audit_types_list
        })

    balance_audit_types.to_csv('tables/balance_audit_types.csv', index=False)
    display(balance_audit_types)
    return balance_audit_types

balance_audit_types = create_balance_audit_types(df)

balance_audit_types_mapping = dict(zip(balance_audit_types['audit_type'], balance_audit_types['id']))

Unnamed: 0,id,audit_type
0,1,Desk
1,2,On site
2,3,Peer


In [10]:
# balance_types
def create_balance_types(df):
    balance_types_list = []
    for i in range(1,9):
        balance_types_list.extend(list(df[f'balance{i}_type_code']))
    
    balance_types_list = sorted(list(pd.Series(balance_types_list).dropna().astype(str).unique()))

    if len(balance_types_list) > 5:
        print('Warning. Unexpected balance type code(s).')

    balance_types = pd.DataFrame({
        'id': [x for x in range (1,len(balance_types_list)+1)],
        'type_code': balance_types_list
        })

    balance_types.to_csv('tables/balance_types.csv', index=False)
    display(balance_types)
    return balance_types

balance_types = create_balance_types(df)

balance_types_mapping = dict(zip(balance_types['type_code'], balance_types['id']))

Unnamed: 0,id,type_code
0,1,3.0
1,2,4.0
2,3,4.1
3,4,Gemeinde
4,5,M5.0


In [11]:
# balance_versions
def create_balance_versions(df):
    balance_versions_list = []
    for i in range(1,9):
        balance_versions_list.extend(list(df[f'balance{i}_version']))
    
    balance_versions_list = sorted(list(pd.Series(balance_versions_list).dropna().astype(str).unique()))

    if len(balance_versions_list) > 3:
        print('Warning. Unexpected balance version(s).')

    balance_versions = pd.DataFrame({
        'id': [x for x in range (1,len(balance_versions_list)+1)],
        'type_code': balance_versions_list
        })

    balance_versions.to_csv('tables/balance_versions.csv', index=False)
    display(balance_versions)
    return balance_versions

balance_versions = create_balance_versions(df)

balance_versions_mapping = dict(zip(balance_versions['type_code'], balance_versions['id']))



Unnamed: 0,id,type_code
0,1,Compact
1,2,Full
2,3,M1.2
3,4,Standard


In [12]:
# balance_versions
def create_balance_documents(df):
    balance_documents_list = []
    for i in range(1,9):
        balance_documents_list.extend(list(df[f'balance{i}_documents']))

    balance_documents_list = pd.Series(balance_documents_list).dropna().astype(str).unique()
    balance_documents_list = [x.split('+') for x in balance_documents_list]
    balance_documents_list = list(pd.Series(balance_documents_list).explode('+'))
    balance_documents_list = sorted(list(pd.Series(balance_documents_list).dropna().unique()))

    if len(balance_documents_list) > 3:
        print('Warning. Unexpected balance document(s).')

    balance_documents = pd.DataFrame({
        'id': [x for x in range (1,len(balance_documents_list)+1)],
        'document': balance_documents_list
        })

    balance_documents.to_csv('tables/balance_documents.csv', index=False)
    display(balance_documents)
    return balance_documents

balance_documents = create_balance_documents(df)

balance_documents_mapping = dict(zip(balance_documents['document'], balance_documents['id']))

Unnamed: 0,id,document
0,1,Bericht
1,2,Testat
2,3,Zertifikat


In [13]:
# balances
def create_balances(df):
    balances_df = []
    for i in range(1,9):
        balance_versions_list.extend(list(df[f'balance{i}_version']))
    
    balance_versions_list = sorted(list(pd.Series(balance_versions_list).dropna().astype(str).unique()))

    if len(balance_versions_list) > 3:
        print('Warning. Unexpected balance version(s).')

    balance_versions = pd.DataFrame({
        'id': [x for x in range (1,len(balance_versions_list)+1)],
        'type_code': balance_versions_list
        })

    balance_versions.to_csv('tables/balance_versions.csv', index=False)
    display(balance_versions)
    return balance_versions

balance_versions = create_balance_versions(df)

balance_versions_mapping = dict(zip(balance_versions['type_code'], balance_versions['id']))



Unnamed: 0,id,type_code
0,1,Compact
1,2,Full
2,3,M1.2
3,4,Standard


In [25]:
balances_df = df[['id', 'balance1_type_code', 'balance1_version', 'balance1_audit_type', 'balance1_valid_until_date', 'balance1_valid_until_year', 'balance1_year', 'balance1_score', 'balance1_documents']].dropna(subset=['balance1_type_code', 'balance1_version', 'balance1_audit_type', 'balance1_valid_until_date', 'balance1_valid_until_year', 'balance1_year', 'balance1_score', 'balance1_documents'], how='all')
balances_df.columns = ['organizations_id', 'type_code', 'version', 'audit_type', 'valid_until_date', 'valid_until_year', 'year', 'score', 'balance_documents']
balances_df['entry_number'] = 1

for i in range(2,9):
    additional_df = df[['id', f'balance{i}_type_code', f'balance{i}_version', f'balance{i}_audit_type', f'balance{i}_valid_until_date', f'balance{i}_valid_until_year', f'balance{i}_year', f'balance{i}_score', f'balance{i}_documents']].dropna(subset=[f'balance{i}_type_code', f'balance{i}_version', f'balance{i}_audit_type', f'balance{i}_valid_until_date', f'balance{i}_valid_until_year', f'balance{i}_year', f'balance{i}_score', f'balance{i}_documents'], how='all')
    additional_df.columns = ['organizations_id', 'type_code', 'version', 'audit_type', 'valid_until_date', 'valid_until_year', 'year', 'score', 'balance_documents']
    additional_df['entry_number'] = i
    balances_df = pd.concat([balances_df, additional_df], axis=0)

balances_df = balances_df.sort_values(by=['organizations_id', 'entry_number']).reset_index(drop=True)
balances_df['id'] = [x for x in range(1, len(balances_df)+1)]
balances_df = balances_df[['id', 'organizations_id', 'entry_number', 'type_code', 'version', 'audit_type', 'year', 'valid_until_date', 'valid_until_year', 'score', 'balance_documents']]

balances_df['type_code'] = balances_df['type_code'].apply(lambda x: str(x) if isinstance(x, int) or isinstance(x, float) else x)
balances_df['types_id'] = balances_df['type_code'].apply(lambda x: balance_types_mapping.get(x,x))
                                                     
balances_df['version'] = balances_df['version'].apply(lambda x: str(x) if isinstance(x, int) or isinstance(x, float) else x)
balances_df['versions_id'] = balances_df['version'].apply(lambda x: balance_versions_mapping.get(x,x))    

balances_df['audit_types_id'] = balances_df['audit_type'].apply(lambda x: balance_audit_types_mapping.get(x,x))

#balances_df['year'] = balances_df['year'].fillna('None')
#balances_df['year'] = balances_df['year'].apply(lambda x: int(x) if isinstance(x, float) else x)
#balances_df['year'] = balances_df['year'].apply(lambda x: None if x='None' else x)

balances_df['valid_until_date'] = balances_df['valid_until_date'].apply(lambda x: {'day': x.split('-')[0], 'month': x.split('-')[1], 'year': x.split('-')[2]} if isinstance(x, str) else x)
###### CHECK WHY THERE ARE WEIRD VALUES!
# OK Pineta Hotels, La Perla, Brauerei Gutmann have missing data values -> Solve in scraping!

balances_df = balances_df[['id', 'organizations_id', 'entry_number', 'types_id', 'versions_id', 'audit_types_id', 'year', 'valid_until_date', 'valid_until_year', 'score', 'balance_documents']]
balances_df

IndexError: list index out of range

In [15]:
balances_df

Unnamed: 0,id,organizations_id,entry_number,types_id,versions_id,audit_types_id,year,valid_until_date,valid_until_year,score,balance_documents
0,1,14654,1,5,1,1,,,,,
1,2,14654,2,3,4,1,2016.0,31-03-2018,2018.0,520.0,Bericht+Testat
2,3,14655,1,5,1,1,2019.0,31-05-2021,2021.0,443.0,Bericht+Testat
3,4,14655,2,3,4,1,,,,,Bericht
4,5,14655,3,3,4,1,,,,,Bericht
...,...,...,...,...,...,...,...,...,...,...,...
1475,1476,159686,1,5,1,2,,,,,
1476,1477,159686,2,5,1,2,,,,,
1477,1478,159686,3,5,2,2,,,,,
1478,1479,160753,1,5,1,3,,,,,
