## Preparations

In [None]:
# Import modules
import os
import pandas as pd
import numpy as np

In [None]:
# Creating directories
def create_directories():
    """Create the ``tables`` and ``logs`` output directories if they are missing.

    Both paths are relative to the current working directory. Calling the
    function again when the directories already exist is a no-op.

    Raises:
        FileExistsError: If ``tables`` or ``logs`` exists but is not a directory.
    """
    for directory in ('tables', 'logs'):
        # exist_ok=True replaces the separate os.path.exists() check, which
        # could race with another process creating the same directory.
        os.makedirs(directory, exist_ok=True)

create_directories()

In [None]:
# Import dataframe
def import_df(path='/home/seblap/Files/Arbeit/Data Analytics/GitHub repos/economy-for-the-common-good/data/scraping/ecg_df.csv'):
    """Load the source CSV into a DataFrame and show a short preview.

    Args:
        path: Location of the CSV file to read. Defaults to the path that was
            previously hard-coded, so calling ``import_df()`` without
            arguments behaves as before.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    df = pd.read_csv(path)

    # Quick sanity check of what was loaded: dimensions and the first rows.
    print(df.shape)
    display(df.head())

    return df

df = import_df()

## Creating tables

In [None]:
# sizes
def create_sizes(df):
    """Build the ``sizes`` lookup table from ``df['company_size_category']``.

    The distinct non-missing categories are sorted alphabetically and then
    rearranged into a fixed, hand-picked order before ids 1..4 are assigned.
    The table is written to ``tables/sizes.csv`` and displayed.

    Args:
        df: Source DataFrame containing a ``company_size_category`` column.

    Returns:
        A tuple ``(sizes, sizes_mapping)`` where ``sizes`` is the lookup table
        and ``sizes_mapping`` maps each category to its id.

    Raises:
        ValueError: If the data does not contain exactly four distinct
            categories. The hand-picked order below is only defined for four
            values; previously this surfaced as an opaque ``IndexError`` or a
            pandas length-mismatch error.
    """
    sizes_list = sorted(df['company_size_category'].dropna().unique())

    if len(sizes_list) > 4:
        print('Warning: Unexpected company size category/categories.')
    if len(sizes_list) != 4:
        raise ValueError(
            'Expected exactly 4 distinct company size categories, '
            f'found {len(sizes_list)}: {sizes_list}'
        )

    # Positions within the alphabetically sorted list, in the order in which
    # ids are handed out.
    # NOTE(review): presumably this orders the categories from smallest to
    # largest company size - confirm against the actual category labels.
    id_order = (1, 3, 2, 0)

    sizes = pd.DataFrame({
        'id': list(range(1, len(sizes_list) + 1)),
        'company_size_category': [sizes_list[position] for position in id_order],
    })

    sizes.to_csv('tables/sizes.csv', index=False)
    display(sizes)

    sizes_mapping = dict(zip(sizes['company_size_category'], sizes['id']))

    return sizes, sizes_mapping

sizes, sizes_mapping = create_sizes(df)

In [None]:
# employees
def create_employees(df):
    """Build the ``employees`` lookup table from ``df['employees']``.

    The distinct non-missing values are sorted alphabetically and rearranged
    into a fixed, hand-picked order before ids 1..12 are assigned. Each id is
    then linked to a ``sizes_id`` through a fixed table. The result is written
    to ``tables/employees.csv`` and displayed.

    Args:
        df: Source DataFrame containing an ``employees`` column.

    Returns:
        A tuple ``(employees, employees_mapping)`` where ``employees`` is the
        lookup table (``id``, ``employees``, ``sizes_id``) and
        ``employees_mapping`` maps each employees value to its id.

    Raises:
        ValueError: If the data does not contain exactly twelve distinct
            values. The hand-picked order and the id-to-size table are only
            defined for twelve values; previously this surfaced as an opaque
            ``IndexError`` or a pandas length-mismatch error.
    """
    employees_list = sorted(df['employees'].dropna().unique())

    if len(employees_list) > 12:
        print('Warning: Unexpected employees value(s).')
    if len(employees_list) != 12:
        raise ValueError(
            'Expected exactly 12 distinct employees values, '
            f'found {len(employees_list)}: {employees_list}'
        )

    # Positions within the alphabetically sorted list, in the order in which
    # ids are handed out.
    # NOTE(review): presumably this orders the employee ranges numerically
    # (string sorting does not) - confirm against the actual labels.
    id_order = (0, 7, 3, 6, 10, 2, 5, 9, 1, 4, 8, 11)

    employees = pd.DataFrame({
        'id': list(range(1, len(employees_list) + 1)),
        'employees': [employees_list[position] for position in id_order],
    })

    # employees id -> sizes id. Named differently from the notebook-level
    # ``sizes_mapping`` (category -> id) to avoid shadowing it.
    size_id_by_employees_id = {
        1: 1,
        2: 1,
        3: 2,
        4: 2,
        5: 3,
        6: 3,
        7: 4,
        8: 4,
        9: 4,
        10: 4,
        11: 4,
        12: 4,
    }

    employees['sizes_id'] = employees['id'].map(size_id_by_employees_id)

    employees.to_csv('tables/employees.csv', index=False)
    display(employees)

    employees_mapping = dict(zip(employees['employees'], employees['id']))

    return employees, employees_mapping
    
employees, employees_mapping = create_employees(df)

In [None]:
# sectors
def create_sectors(df):
    """Build the ``sectors`` lookup table (English and German sector names).

    Rows missing either name are dropped, duplicates are removed based on the
    English name (first occurrence wins), and the remainder is sorted by the
    English name before ids starting at 1 are assigned. The table is written
    to ``tables/sectors.csv`` and displayed.

    Args:
        df: Source DataFrame with ``economic_sector_EN`` and
            ``economic_sector_DE`` columns.

    Returns:
        A tuple ``(sectors, sectors_mapping_EN, sectors_mapping_DE)``; the two
        dicts map the English / German name to the sector id.
    """
    unique_sectors = (
        df[['economic_sector_EN', 'economic_sector_DE']]
        .dropna()
        .drop_duplicates(subset=['economic_sector_EN'])
        .sort_values(by='economic_sector_EN')
        .reset_index(drop=True)
    )
    sector_count = len(unique_sectors)

    if sector_count > 40:
        print('Warning: Unexpected economic sector(s).')

    sectors = pd.DataFrame({
        'id': list(range(1, sector_count + 1)),
        'economic_sector_EN': unique_sectors['economic_sector_EN'],
        'economic_sector_DE': unique_sectors['economic_sector_DE'],
    })

    sectors.to_csv('tables/sectors.csv', index=False)
    display(sectors)

    sector_ids = sectors['id']
    sectors_mapping_EN = dict(zip(sectors['economic_sector_EN'], sector_ids))
    sectors_mapping_DE = dict(zip(sectors['economic_sector_DE'], sector_ids))

    return sectors, sectors_mapping_EN, sectors_mapping_DE

sectors, sectors_mapping_EN, sectors_mapping_DE = create_sectors(df)

In [None]:
# countries
def create_countries(df):
    """Build the ``countries`` lookup table (country code and name).

    Rows missing either value are dropped, duplicates are removed based on
    the country code (first occurrence wins), and the remainder is sorted by
    country code before ids starting at 1 are assigned. The table is written
    to ``tables/countries.csv`` and displayed.

    Args:
        df: Source DataFrame with ``country_code`` and ``country_name`` columns.

    Returns:
        A tuple ``(countries, countries_mapping_code, countries_mapping_name)``;
        the two dicts map the country code / name to the country id.
    """
    countries_df = df[['country_code', 'country_name']].dropna()
    countries_df = (
        countries_df
        .drop_duplicates(subset=['country_code'])
        .sort_values(by='country_code')
        .reset_index(drop=True)
    )

    if len(countries_df) > 16:
        # The message previously referred to company size categories
        # (copied from create_sizes), which was misleading here.
        print('Warning: Unexpected country/countries.')

    countries = pd.DataFrame({
        'id': list(range(1, len(countries_df) + 1)),
        'country_code': countries_df['country_code'],
        'country_name': countries_df['country_name'],
    })

    countries.to_csv('tables/countries.csv', index=False)
    display(countries)

    countries_mapping_code = dict(zip(countries['country_code'], countries['id']))
    countries_mapping_name = dict(zip(countries['country_name'], countries['id']))

    return countries, countries_mapping_code, countries_mapping_name
    
countries, countries_mapping_code, countries_mapping_name = create_countries(df)

In [None]:
# locations
def create_locations(df, countries_mapping_code):
    """Build the ``locations`` table (postal code, city, country id).

    Locations are de-duplicated on (postal code, country code), keeping the
    first occurrence, and sorted by country code then postal code before ids
    starting at 1 are assigned. Rows with missing values are not dropped
    here; if the final table contains missing values a warning is printed and
    they are replaced with the literal ``\\N``. The table is written to
    ``tables/locations.csv`` and displayed.

    Args:
        df: Source DataFrame with ``postal_code``, ``city`` and
            ``country_code`` columns.
        countries_mapping_code: Dict mapping country code to country id.

    Returns:
        A tuple ``(locations, locations_mapping)``. ``locations_mapping`` maps
        the string ``'<country_code> <postal_code>'`` to the location id; both
        key parts and the id are converted with ``astype(str)``, so the ids in
        this mapping are strings.
    """
    unique_locations = (
        df[['postal_code', 'city', 'country_code']]
        .drop_duplicates(subset=['postal_code', 'country_code'])
        .sort_values(by=['country_code', 'postal_code'])
        .reset_index(drop=True)
    )
    location_count = len(unique_locations)

    if location_count > 927:
        print('Warning: New location(s).')

    locations = pd.DataFrame({
        'id': list(range(1, location_count + 1)),
        'postal_code': unique_locations['postal_code'],
        'city': unique_locations['city'],
        'country_code': unique_locations['country_code'],
    })
    locations['country_id'] = locations['country_code'].map(countries_mapping_code)

    # Build the lookup from text versions of the key columns and the id.
    as_text = locations[['id', 'country_code', 'postal_code']].astype(str)
    lookup_keys = as_text['country_code'] + ' ' + as_text['postal_code']
    locations_mapping = dict(zip(lookup_keys, as_text['id']))

    # The country code itself is not part of the exported table.
    locations = locations[['id', 'postal_code', 'city', 'country_id']]

    if locations.isna().values.any():
        print('Warning: Missing values detected:')
        print(locations.isna().sum())
        locations = locations.fillna(r'\N')

    locations.to_csv('tables/locations.csv', index=False)
    display(locations)

    return locations, locations_mapping

locations, locations_mapping = create_locations(df, countries_mapping_code)

In [None]:
# organizations
def create_organizations(df, locations_mapping, sectors_mapping_EN, employees_mapping):
    """Build the ``organizations`` table with foreign keys to the lookup tables.

    String values are translated to ids through the given mappings; strings
    without a mapping entry and non-string values (e.g. missing values) are
    passed through unchanged. If the final table contains missing values a
    warning is printed and they are replaced with the literal ``\\N``. The
    table is written to ``tables/organizations.csv`` and displayed.

    Args:
        df: Source DataFrame with ``id``, ``organization``, ``country_code``,
            ``postal_code``, ``employees`` and ``economic_sector_EN`` columns.
            It is not modified.
        locations_mapping: Dict mapping ``'<country_code> <postal_code>'`` to
            the location id.
        sectors_mapping_EN: Dict mapping the English sector name to its id.
        employees_mapping: Dict mapping the employees value to its id.

    Returns:
        The organizations DataFrame with columns ``id``, ``organization``,
        ``locations_id``, ``employees_id`` and ``sectors_id``.
    """
    def to_id(mapping):
        # Only strings are looked up; anything else (NaN) is kept as-is so the
        # missing-value check below can still see it.
        return lambda value: mapping.get(value, value) if isinstance(value, str) else value

    # Work on an explicit copy: adding columns to a bare ``df[[...]]`` slice
    # triggers pandas' SettingWithCopyWarning.
    organizations_df = df[['id', 'organization', 'country_code', 'postal_code', 'employees', 'economic_sector_EN']].copy()

    organizations_df['employees_id'] = organizations_df['employees'].apply(to_id(employees_mapping))

    # Same key format as the one used to build locations_mapping.
    location_keys = organizations_df['country_code'] + ' ' + organizations_df['postal_code']
    organizations_df['locations_id'] = location_keys.apply(to_id(locations_mapping))

    organizations_df['sectors_id'] = organizations_df['economic_sector_EN'].apply(to_id(sectors_mapping_EN))

    organizations = organizations_df[['id', 'organization', 'locations_id', 'employees_id', 'sectors_id']]

    if organizations.isna().values.any():
        print('Warning: Missing values detected:')
        print(organizations.isna().sum())
        organizations = organizations.fillna(r'\N')

    organizations.to_csv('tables/organizations.csv', index=False)
    display(organizations)

    return organizations

organizations = create_organizations(df, locations_mapping, sectors_mapping_EN, employees_mapping)

In [None]:
# balance_audit_types
def create_balance_audit_types(df):
    """Build the ``balance_audit_types`` lookup table.

    Collects the values of ``balance1_audit_type`` .. ``balance8_audit_type``,
    drops missing values, and assigns ids starting at 1 to the sorted distinct
    values. The table is written to ``tables/balance_audit_types.csv`` and
    displayed.

    Args:
        df: Source DataFrame containing the eight ``balance{i}_audit_type``
            columns.

    Returns:
        A tuple ``(balance_audit_types, balance_audit_types_mapping)``; the
        dict maps each audit type to its id.
    """
    collected = [
        value
        for slot in range(1, 9)
        for value in df[f'balance{slot}_audit_type']
    ]
    audit_type_names = sorted(pd.Series(collected).dropna().unique())

    if len(audit_type_names) > 3:
        print('Warning: Unexpected balance audit type(s).')

    balance_audit_types = pd.DataFrame({
        'id': list(range(1, len(audit_type_names) + 1)),
        'audit_type': audit_type_names,
    })

    balance_audit_types.to_csv('tables/balance_audit_types.csv', index=False)
    display(balance_audit_types)

    balance_audit_types_mapping = dict(
        zip(balance_audit_types['audit_type'], balance_audit_types['id'])
    )

    return balance_audit_types, balance_audit_types_mapping

balance_audit_types, balance_audit_types_mapping = create_balance_audit_types(df)

In [None]:
# balance_types
def create_balance_types(df):
    """Build the ``balance_types`` lookup table.

    Collects the values of ``balance1_type_code`` .. ``balance8_type_code``,
    drops missing values, converts the rest to strings, and assigns ids
    starting at 1 to the sorted distinct values. The table is written to
    ``tables/balance_types.csv`` and displayed.

    Args:
        df: Source DataFrame containing the eight ``balance{i}_type_code``
            columns.

    Returns:
        A tuple ``(balance_types, balance_types_mapping)``; the dict maps each
        type code (as a string) to its id.
    """
    collected = [
        value
        for slot in range(1, 9)
        for value in df[f'balance{slot}_type_code']
    ]
    # Codes are compared as text, so numeric codes end up as e.g. '5.0'.
    type_codes = sorted(pd.Series(collected).dropna().astype(str).unique())

    if len(type_codes) > 5:
        print('Warning: Unexpected balance type code(s).')

    balance_types = pd.DataFrame({
        'id': list(range(1, len(type_codes) + 1)),
        'type_code': type_codes,
    })

    balance_types.to_csv('tables/balance_types.csv', index=False)
    display(balance_types)

    balance_types_mapping = dict(zip(balance_types['type_code'], balance_types['id']))

    return balance_types, balance_types_mapping

balance_types, balance_types_mapping = create_balance_types(df)

In [None]:
# balance_versions
def create_balance_versions(df):
    """Build the ``balance_versions`` lookup table.

    Collects the values of ``balance1_version`` .. ``balance8_version``, drops
    missing values, converts the rest to strings, and assigns ids starting at
    1 to the sorted distinct values. The table is written to
    ``tables/balance_versions.csv`` and displayed.

    Args:
        df: Source DataFrame containing the eight ``balance{i}_version``
            columns.

    Returns:
        A tuple ``(balance_versions, balance_versions_mapping)``; the dict
        maps each version (as a string) to its id.
    """
    collected = [
        value
        for slot in range(1, 9)
        for value in df[f'balance{slot}_version']
    ]
    version_labels = sorted(pd.Series(collected).dropna().astype(str).unique())

    if len(version_labels) > 3:
        print('Warning: Unexpected balance version(s).')

    # NOTE(review): the column holding the version strings is named
    # 'type_code', possibly carried over from create_balance_types. It is
    # part of the exported CSV header, so confirm with consumers before
    # renaming it.
    balance_versions = pd.DataFrame({
        'id': list(range(1, len(version_labels) + 1)),
        'type_code': version_labels,
    })

    balance_versions.to_csv('tables/balance_versions.csv', index=False)
    display(balance_versions)

    balance_versions_mapping = dict(zip(balance_versions['type_code'], balance_versions['id']))

    return balance_versions, balance_versions_mapping

balance_versions, balance_versions_mapping = create_balance_versions(df)

In [None]:
# balance_documents
def create_balance_documents(df):
    """Build the ``balance_documents`` lookup table.

    Collects the values of ``balance1_documents`` .. ``balance8_documents``,
    drops missing values, converts the rest to strings and splits each on
    ``'+'``. Ids starting at 1 are assigned to the sorted distinct parts. The
    table is written to ``tables/balance_documents.csv`` and displayed.

    Args:
        df: Source DataFrame containing the eight ``balance{i}_documents``
            columns.

    Returns:
        A tuple ``(balance_documents, balance_documents_mapping)``; the dict
        maps each document name to its id.
    """
    collected = pd.Series([
        value
        for slot in range(1, 9)
        for value in df[f'balance{slot}_documents']
    ])

    # Flatten the '+'-separated entries directly. The previous version called
    # Series.explode('+'), which passed the separator as ``ignore_index`` and
    # only worked because a non-empty string is truthy.
    document_names = sorted({
        part
        for entry in collected.dropna().astype(str).unique()
        for part in entry.split('+')
    })

    if len(document_names) > 3:
        print('Warning: Unexpected balance document(s).')

    balance_documents = pd.DataFrame({
        'id': list(range(1, len(document_names) + 1)),
        'document': document_names,
    })

    balance_documents.to_csv('tables/balance_documents.csv', index=False)
    display(balance_documents)

    balance_documents_mapping = dict(zip(balance_documents['document'], balance_documents['id']))

    return balance_documents, balance_documents_mapping

balance_documents, balance_documents_mapping = create_balance_documents(df)

In [None]:
# balance_versions
def create_balance_versions(df):
    """Build the ``balance_versions`` lookup table.

    NOTE(review): this cell repeats the ``create_balance_versions`` definition
    that appears earlier in the notebook; one of the two cells is presumably
    redundant.

    Gathers ``balance1_version`` .. ``balance8_version``, drops missing
    values, converts the rest to strings, and numbers the sorted distinct
    values from 1. The table is written to ``tables/balance_versions.csv``
    and displayed.

    Args:
        df: Source DataFrame containing the eight ``balance{i}_version``
            columns.

    Returns:
        A tuple ``(balance_versions, balance_versions_mapping)``; the dict
        maps each version (as a string) to its id.
    """
    gathered = []
    for slot in range(1, 9):
        gathered += list(df[f'balance{slot}_version'])

    distinct_versions = pd.Series(gathered).dropna().astype(str).unique()
    ordered_versions = sorted(distinct_versions)

    if len(ordered_versions) > 3:
        print('Warning: Unexpected balance version(s).')

    # The version strings are stored under the column name 'type_code',
    # exactly as in the exported CSV header.
    balance_versions = pd.DataFrame({
        'id': list(range(1, len(ordered_versions) + 1)),
        'type_code': ordered_versions,
    })

    balance_versions.to_csv('tables/balance_versions.csv', index=False)
    display(balance_versions)

    balance_versions_mapping = dict(zip(balance_versions['type_code'], balance_versions['id']))

    return balance_versions, balance_versions_mapping

balance_versions, balance_versions_mapping = create_balance_versions(df)

In [None]:
# balances
def _stringify_number(value):
    """Return ``str(value)`` for non-missing ints/floats; pass anything else through."""
    if isinstance(value, (int, float)) and not pd.isna(value):
        return str(value)
    return value


def _lookup_ids(values, mapping):
    """Translate a Series through *mapping*, keeping unknown and missing values as-is.

    The result is an object-dtype Series so that integer ids stay integers
    even when the column also contains missing values.
    """
    return pd.Series(
        [mapping.get(value, value) for value in values],
        index=values.index,
        dtype=object,
    )


def _reverse_date_parts(value):
    """Swap the first and third '-'-separated parts of a date string.

    NOTE(review): presumably converts DD-MM-YYYY to YYYY-MM-DD - confirm
    against the source data. Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    parts = value.split('-')
    return f'{parts[2]}-{parts[1]}-{parts[0]}'


def create_balances(df, balance_types_mapping, balance_versions_mapping, balance_audit_types_mapping):
    """Reshape the eight ``balance{i}_*`` column groups into one row per balance.

    For every organization and every column group ``i`` (1..8) that has at
    least one non-missing value, one row is produced with ``entry_number = i``.
    Rows are sorted by organization id and entry number and numbered from 1.
    Type code, version and audit type are replaced by their ids; values
    without a mapping entry are kept unchanged. If the final table contains
    missing values a warning is printed and they are replaced with the
    literal ``\\N``. The table is written to ``tables/balances.csv`` and
    displayed.

    Missing type codes and versions stay missing (and therefore become
    ``\\N``). Previously they were converted to the string ``'nan'`` because
    NaN is a float, which hid them from the missing-value handling.

    Args:
        df: Source DataFrame with an ``id`` column and the eight
            ``balance{i}_*`` column groups.
        balance_types_mapping: Dict mapping type code (string) to id.
        balance_versions_mapping: Dict mapping version (string) to id.
        balance_audit_types_mapping: Dict mapping audit type to id.

    Returns:
        A tuple ``(balances, balances_x_documents_temp_df)``. The second
        element has the columns ``id`` and ``balance_documents`` (the raw,
        still '+'-joined document string per balance).
    """
    fields = ['type_code', 'version', 'audit_type', 'valid_until_date',
              'valid_until_year', 'year', 'score', 'documents']
    long_columns = ['organizations_id', 'type_code', 'version', 'audit_type',
                    'valid_until_date', 'valid_until_year', 'year', 'score',
                    'balance_documents']

    # One frame per column group, all renamed to the same long-format columns.
    frames = []
    for entry_number in range(1, 9):
        source_columns = [f'balance{entry_number}_{field}' for field in fields]
        frame = df[['id'] + source_columns].dropna(subset=source_columns, how='all').copy()
        frame.columns = long_columns
        frame['entry_number'] = entry_number
        frames.append(frame)

    balances_df = pd.concat(frames, axis=0)
    balances_df = balances_df.sort_values(by=['organizations_id', 'entry_number']).reset_index(drop=True)
    balances_df['id'] = list(range(1, len(balances_df) + 1))

    # The lookup tables key type codes and versions by their string form, so
    # numeric values are converted the same way before the lookup.
    balances_df['type_code'] = balances_df['type_code'].apply(_stringify_number)
    balances_df['types_id'] = _lookup_ids(balances_df['type_code'], balance_types_mapping)

    balances_df['version'] = balances_df['version'].apply(_stringify_number)
    balances_df['versions_id'] = _lookup_ids(balances_df['version'], balance_versions_mapping)

    balances_df['audit_types_id'] = _lookup_ids(balances_df['audit_type'], balance_audit_types_mapping)

    balances_df['valid_until_date'] = balances_df['valid_until_date'].apply(_reverse_date_parts)

    # Copies, so later changes by callers cannot affect each other.
    balances_x_documents_temp_df = balances_df[['id', 'balance_documents']].copy()
    balances = balances_df[['id', 'organizations_id', 'entry_number', 'types_id', 'versions_id', 'audit_types_id', 'year', 'valid_until_date', 'valid_until_year', 'score']]

    if balances.isna().values.any():
        print('Warning: Missing values detected:')
        print(balances.isna().sum())
        balances = balances.fillna(r'\N')

    balances.to_csv('tables/balances.csv', index=False)
    display(balances)
    return balances, balances_x_documents_temp_df

balances, balances_x_documents_temp_df = create_balances(df, balance_types_mapping, balance_versions_mapping, balance_audit_types_mapping)

In [None]:
# balances_x_documents
def create_balances_x_documents(df, balances_x_documents_temp_df, balance_documents_mapping):
    """Build the link table between balances and balance documents.

    Each '+'-joined document string is split into one row per document, rows
    without documents are dropped, and document names are replaced by their
    ids (names without a mapping entry are kept unchanged). Rows are numbered
    from 1. The table is written to ``tables/balances_x_documents.csv`` and
    displayed.

    Args:
        df: Unused; kept so existing calls keep working.
        balances_x_documents_temp_df: Two-column DataFrame holding the
            balance id and the raw document string, in that order. It is not
            modified (previously its columns were renamed and overwritten in
            place).
        balance_documents_mapping: Dict mapping document name to id.

    Returns:
        DataFrame with columns ``id``, ``balances_id`` and
        ``balance_documents_id``.
    """
    # Copy first so the caller's frame keeps its column names and contents.
    links = balances_x_documents_temp_df.copy()
    links.columns = ['balances_id', 'balance_documents_id']

    links['balance_documents_id'] = links['balance_documents_id'].apply(
        lambda entry: entry.split('+') if isinstance(entry, str) else entry
    )
    links = links.explode('balance_documents_id')
    links = links.dropna().reset_index(drop=True)

    links['balance_documents_id'] = links['balance_documents_id'].apply(
        lambda name: balance_documents_mapping.get(name, name)
    )
    links['id'] = list(range(1, len(links) + 1))
    balances_x_documents = links[['id', 'balances_id', 'balance_documents_id']]

    balances_x_documents.to_csv('tables/balances_x_documents.csv', index=False)
    display(balances_x_documents)
    return balances_x_documents

balances_x_documents = create_balances_x_documents(df, balances_x_documents_temp_df, balance_documents_mapping)