In [1]:
# Import libraries
import os
import pandas as pd
from pathlib import Path
import configparser
import json
from tabulate import tabulate

In [2]:
# Initialize
# Accumulators first: `report` collects one dict per reporting step and
# `ListCompSub_df` starts as an empty placeholder frame.
report = []
ListCompSub_df = pd.DataFrame()

# Root folder of the data set; config.ini is read from here.
base_path = Path(
    r'U:\WP 765 Energy RIC\Private data & analysis\Alternative Approach_Private R&D\Orbis_Data\Data_2020'
)

In [3]:
# Import config parameters
# The 'list' converter adds config.getlist(), which splits a comma-separated
# option value into a list of whitespace-stripped strings.
config = configparser.ConfigParser(
    converters={'list': lambda x: [i.strip() for i in x.split(',')]}
)

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with an explicit message instead of
# hitting a NoOptionError/NoSectionError on the first lookup below.
config_file = base_path.joinpath(r'config.ini')
if not config.read(config_file):
    raise FileNotFoundError(
        'Configuration file not found or unreadable: {}'.format(config_file)
    )

# Name of the config section holding the case-specific options.
CASE = 'EU_28'

# Options shared by all cases (DEFAULT section).
MAPPING = Path(config.get('DEFAULT','MAPPING_PATH'))
SCREENING_KEYS = config.getlist('DEFAULT','SCREENING_KEYS')

# Case-specific options (section named by CASE).
REGION = config.getlist(CASE,'ORBIS_REGION')
CASE_ROOT = base_path.joinpath(config.get(CASE,'CASE_ROOT_PATH'))
YEAR_LASTAV = config.getint(CASE,'YEAR_LASTAV')
SUBS_ID_FILE_N = config.getint(CASE,'SUBS_ID_FILE_N')
SUBS_FIN_FILE_N = config.getint(CASE,'SUBS_FIN_FILE_N')
GROUPS_FIN_FILE_N = config.getint(CASE,'GROUPS_FIN_FILE_N')
METHOD = config.get(CASE,'SUBS_METHOD')

In [4]:
# Load initial listed company set
# 'n.a.' cells are read as missing and the BvD identifier columns as strings.
ListComp_df = pd.read_csv(
    CASE_ROOT / 'Listed companies.csv',
    na_values='n.a.',
    dtype={'BvD9': str, 'BvD_id': str},
)

In [5]:
# Load selected listed company subsidiaries set
# 'n.a.' cells are read as missing; both the parent and the subsidiary
# BvD identifier columns are read as strings.
ListCompSub_df = pd.read_csv(
    CASE_ROOT / 'Listed companies subsidiaries.csv',
    na_values='n.a.',
    dtype=dict.fromkeys(['BvD9', 'BvD_id', 'Sub_BvD9', 'Sub_BvD_id'], str),
)

In [6]:
# Flag performers whose BvD9 also appears as a subsidiary BvD9
ListComp_df['Is a Sub'] = ListComp_df['BvD9'].isin(ListCompSub_df['Sub_BvD9'])

# Flag subsidiaries whose BvD9 also appears in the performer list
ListCompSub_df['Is a Perf'] = ListCompSub_df['Sub_BvD9'].isin(ListComp_df['BvD9'])

# Flag repeated subsidiary BvD9 values:
#   keep='first' marks every occurrence except the first one,
#   keep=False   marks every occurrence of a repeated value.
for flag_name, keep_rule in (('Is a First Sub Duplicate', 'first'),
                             ('Is a Sub Duplicate', False)):
    ListCompSub_df[flag_name] = ListCompSub_df.duplicated(subset='Sub_BvD9', keep=keep_rule)

In [7]:
# Report row for the initial sets.
# Each boolean flag is applied to the frame that owns it: 'Is a Sub' belongs to
# ListComp_df, while 'Is a Perf' and 'Is a Sub Duplicate' belong to
# ListCompSub_df. A mask taken from the other frame is either misaligned or
# rejected by pandas. Series.count() already returns a scalar count of
# non-missing values, so no further .sum() is needed.
report.append({'Set': 'Initial',
               '#Performers': ListComp_df['BvD9'].count(),
               '#Performers_beingSubs': ListComp_df.loc[ListComp_df['Is a Sub'], 'BvD9'].count(),
               '#Subs': ListCompSub_df['Sub_BvD9'].count(),
               '#Subs_beingPerfs': ListCompSub_df.loc[ListCompSub_df['Is a Perf'], 'Sub_BvD9'].count(),
               '#Subs_Dupl': ListCompSub_df.loc[ListCompSub_df['Is a Sub Duplicate'], 'Sub_BvD9'].count()
              })

In [8]:
# Drop subsidiaries that are not BvD identified, then (group, subsidiary) duplicates.
# NOTE(review): dropna() is called without `subset`, so a row is removed when ANY
# of its columns is missing. An earlier draft restricted the check to
# ['BvD9', 'BvD_id', 'Sub_BvD9', 'Sub_BvD_id'] - confirm which behaviour is intended.
ListCompSub_df = ListCompSub_df.dropna(axis='index', how='any')

# Keep the first row of each (BvD9, Sub_BvD9) pair.
ListCompSub_df = ListCompSub_df.drop_duplicates(subset=['BvD9', 'Sub_BvD9'])

In [9]:
# Report row for the cleaned subsidiaries set.
# Each boolean flag is applied to the frame that owns it ('Is a Sub' on
# ListComp_df; 'Is a Perf' and 'Is a Sub Duplicate' on ListCompSub_df). A mask
# taken from the other frame is either misaligned or rejected by pandas.
# NOTE(review): the flag columns were computed before the cleaning step and are
# not recomputed here; only the rows removed by cleaning are excluded from the
# counts - confirm this is the intended meaning of the 'Cleaned' row.
report.append({'Set': 'Cleaned',
               '#Performers': ListComp_df['BvD9'].count(),
               '#Performers_beingSubs': ListComp_df.loc[ListComp_df['Is a Sub'], 'BvD9'].count(),
               '#Subs': ListCompSub_df['Sub_BvD9'].count(),
               '#Subs_beingPerfs': ListCompSub_df.loc[ListCompSub_df['Is a Perf'], 'Sub_BvD9'].count(),
               '#Subs_Dupl': ListCompSub_df.loc[ListCompSub_df['Is a Sub Duplicate'], 'Sub_BvD9'].count()
              })

In [10]:
# Performers that are not flagged as being a subsidiary as well
ListComp_XPerfIsSub_df = ListComp_df.loc[~ListComp_df['Is a Sub']]

# Subsidiaries whose parent is one of the retained performers. The duplicate
# flags inherited from the full set are dropped and recomputed on this
# reduced set; the 'Is a Perf' flag is carried over unchanged.
ListCompSub_XPerfIsSub_df = (
    ListCompSub_df
    .loc[ListCompSub_df['BvD9'].isin(ListComp_XPerfIsSub_df['BvD9'])]
    .drop(columns=['Is a First Sub Duplicate', 'Is a Sub Duplicate'])
    .assign(**{
        'Is a First Sub Duplicate': lambda subs: subs.duplicated(subset='Sub_BvD9', keep='first'),
        'Is a Sub Duplicate': lambda subs: subs.duplicated(subset='Sub_BvD9', keep=False),
    })
)

In [11]:
# Report row for the sets that exclude performers which are also subsidiaries.
# Each boolean flag is applied to the frame that owns it: 'Is a Sub' on
# ListComp_XPerfIsSub_df; 'Is a Perf' and 'Is a Sub Duplicate' on
# ListCompSub_XPerfIsSub_df. A mask taken from the other frame is either
# misaligned or rejected by pandas.
report.append({'Set': 'Excluding performers that are subsidiaries',
               '#Performers': ListComp_XPerfIsSub_df['BvD9'].count(),
               '#Performers_beingSubs': ListComp_XPerfIsSub_df.loc[ListComp_XPerfIsSub_df['Is a Sub'], 'BvD9'].count(),
               '#Subs': ListCompSub_XPerfIsSub_df['Sub_BvD9'].count(),
               '#Subs_beingPerfs': ListCompSub_XPerfIsSub_df.loc[ListCompSub_XPerfIsSub_df['Is a Perf'], 'Sub_BvD9'].count(),
               '#Subs_Dupl': ListCompSub_XPerfIsSub_df.loc[ListCompSub_XPerfIsSub_df['Is a Sub Duplicate'], 'Sub_BvD9'].count()
              })

In [12]:
# Turn the collected report rows into a table with the columns in display order.
report_columns = [
    'Set',
    '#Performers',
    '#Performers_beingSubs',
    '#Subs',
    '#Subs_beingPerfs',
    '#Subs_Dupl',
]
report_df = pd.DataFrame(report).loc[:, report_columns]

In [None]:
# Append report
# The handle is named `report_file` so it does not rebind `report`, the list of
# report rows that earlier cells append to. Reusing that name would leave
# `report` pointing at a closed file and break any re-run of those cells.
with open(CASE_ROOT.joinpath(r'Report.txt'), 'a') as report_file:
    report_file.write('#2 - Clean list of top R&D performers and subsidiaries\n\n')
    report_file.write(
        tabulate(report_df, tablefmt='simple', headers=report_df.columns, showindex=False)
    )
    report_file.write('\n\n')
    
# Save output tables
# Options shared by every export: no index column, fixed 10-decimal floats,
# and missing values written back as 'n.a.'.
csv_options = {'index': False, 'float_format': '%.10f', 'na_rep': 'n.a.'}

# These two writes go to the same file names the frames were loaded from.
ListComp_df.to_csv(CASE_ROOT / 'Listed companies.csv', **csv_options)
ListCompSub_df.to_csv(CASE_ROOT / 'Listed companies subsidiaries.csv', **csv_options)

# The XPerfIsSub exports keep only an explicit subset of columns.
ListComp_XPerfIsSub_df.to_csv(
    CASE_ROOT / 'Listed companies - XPerfIsSub.csv',
    columns=['BvD9', 'BvD_id','Company_name', 'Country_3DID_ISO', 'World_Player','RnD_mean','Y_LastAv','RnD_Y_LastAv'],
    **csv_options
)

ListCompSub_XPerfIsSub_df.to_csv(
    CASE_ROOT / 'Listed companies subsidiaries - XPerfIsSub.csv',
    columns=['BvD9', 'BvD_id', 'Company_name',  'Sub_BvD9', 'Sub_BvD_id'],
    **csv_options
)