In [1]:
# Import libraries
import os
import pandas as pd
from pathlib import Path
import configparser

In [2]:
AltApp_path = Path(r"U:\WP 765 Energy RIC\Private data & analysis\Alternative Approach_Private R&D")

In [None]:
# Import config parameters
file = AltApp_path.joinpath(r'Orbis_Data\Data_2020\Inputs tables\config.ini')

config = configparser.ConfigParser(
    converters={'list': lambda x: [i.strip() for i in x.split(',')]}
)
     
config.read(file)

CASE = 'EU_28'

REGION = config.getlist(CASE,'ORBIS_REGION')
PATH = config.get(CASE,'TEST_PATH')
YEAR_LASTAV = config.getint(CASE,'YEAR_LASTAV')
FILE_N = config.getint(CASE,'SUBS_ID_FILE_N')


In [3]:
report = []
report_DupSubs_bylvl = []

In [5]:
# Read # 1 - Inititial listed company set
file = AltApp_path.joinpath(r'Orbis_Data\Data_2020\Output tables\Listed companies - Main R&D performers.csv')

ListComp_df = pd.read_csv(file)

In [6]:
ListCompSub_df = pd.DataFrame()

# Read ORBIS input list for subsidiaries of publically listed companies by world region region
for number in [range(1,FILE_N)]:
    
    file = AltApp_path.joinpath(r'Orbis_Data\Data_2020\Inputs tables' + PATH + 'Listed R&D performers - Subsidiaries #' + str(number) + '.xlsx'                           )
    
    df = pd.read_excel(file,
                       sheet_name = 'Results',
                       na_values = 'No data fulfill your filter criteria',
                       names = ['Rank', 'Company_name', 'BvD_id', 'Group_Subs_Count', 'Sub_BvD_id', 'Subs_lvl'],
                       dtype = {'BvD_id': str,'Group_Subs_Count': pd.Int64Dtype(), 'Sub_BvD_id': str, 'Subs_lvl': pd.Int8Dtype()}
                      ).drop(columns = ['Rank','Subs_lvl','Group_Subs_Count'])

    # Consolidate list of subsidiaries
    ListCompSub_df = ListCompSub_df.append(df)
    
# Drops not BVd identified subsidiaries and (group,subs) duplicates
ListCompSub_df = ListCompSub_df[~pd.isnull(ListCompSub_df['BvD_id'])].drop_duplicates(['BvD_id','Sub_BvD_id'], inplace = True)

In [9]:
# Check if main performers are in the list of subsidiaries
ListComp_df['Is a Sub'] = ListComp_df['BvD_id'].isin(ListCompSub_df['Sub_BvD_id'])

# Check if main performers are identified as MNCs
ListComp_df['Is in MNC'] = ListComp_df['BvD_id'].isin(MNC_df['BvD_id'])

# Check if duplicate subsidiaries    
ListCompSub_df['Is a First Sub Duplicate'] = ListCompSub_df.duplicated(subset = 'Sub_BvD_id', keep = 'first')
ListCompSub_df['Is a Sub Duplicate'] = ListCompSub_df.duplicated(subset = 'Sub_BvD_id', keep = False)

# Check if subs are identified as MNCs
ListCompSub_df['Is in MNC'] = ListCompSub_df['Sub_BvD_id'].isin(MNC_df['BvD_id'])

TypeError: 'NoneType' object is not subscriptable

In [None]:
report.append({'Set': 'Initial',
               '#Performers': ListComp_df['BvD_id'].count().sum(),
               '#Performers_beingSubs': ListComp_df['BvD_id'][ListComp_df['Is a Sub'] == True].count().sum(),
               '#Subs': ListCompSub_df['Sub_BvD_id'].count().sum(),
               '#Subs_Dupl': ListCompSub_df['Sub_BvD_id'][ListCompSub_df['Is a Sub Duplicate'] == True].count().sum(),
               '#MNC': MNC_df['BvD_id'].count().sum(),
               '#Performers_inMNC': ListComp_df['BvD_id'][ListComp_df['Is in MNC'] == True].count().sum(),
               '#Subs_inMNC':ListCompSub_df['Sub_BvD_id'][ListCompSub_df['Is in MNC'] == True].count().sum()
              })

#df1 = ListCompSub_df[ListCompSub_df['Is a First Sub Duplicate'] == True].groupby('Subs_lvl').count()
#df1 = df1[['BvD_id']].rename(columns={'BvD_id': 'Initial_#FirstDuplicate'})

#df2 = ListCompSub_df[ListCompSub_df['Is a Sub Duplicate'] == True].groupby('Subs_lvl').count()
#df2 = df2[['BvD_id']].rename(columns={'BvD_id': 'Initial_#AllDuplicate'})

#DupSub_bylvl_df = pd.concat([df1,df2], axis = 1, sort = True)

ListComp_df.to_csv(AltApp_path.joinpath(r'Orbis_Data\Data_2020\Output tables\#2 - Listed companies - Main R&D performers.csv'),
                   index = False,
                   float_format = '%.10f'
                  )

ListCompSub_df.to_csv(AltApp_path.joinpath(r'Orbis_Data\Data_2020\Output tables\#2 - Listed companies - Subsidiaries.csv'),
                      index = False,
                      float_format = '%.10f'
                     )

In [None]:
# Set of main performers that are not also a subsidiary
CleanListComp_df = ListComp_df[ListComp_df['Is a Sub'] == False]

# Updated set of subsidiaries
CleanListCompSub_df = ListCompSub_df[ListCompSub_df['BvD_id'].isin(CleanListComp_df['BvD_id'])].drop(
    columns = ['Is a First Sub Duplicate', 'Is a Sub Duplicate']
)

# Check if duplicate subsidiaries    
CleanListCompSub_df['Is a First Sub Duplicate'] = CleanListCompSub_df.duplicated(subset = 'Sub_BvD_id', keep = 'first')
CleanListCompSub_df['Is a Sub Duplicate'] = CleanListCompSub_df.duplicated(subset = 'Sub_BvD_id', keep = False)

In [None]:
report.append({'Set': 'Excluding performers that are subsidiaries',
               '#Performers': CleanListComp_df['BvD_id'].count().sum(),
               '#Performers_beingSubs': CleanListComp_df['BvD_id'][CleanListComp_df['Is a Sub'] == True].count().sum(),
               '#Subs': CleanListCompSub_df['Sub_BvD_id'].count().sum(),
               '#Subs_Dupl': CleanListCompSub_df['Sub_BvD_id'][CleanListCompSub_df['Is a Sub Duplicate'] == True].count().sum(),
               '#MNC': MNC_df['BvD_id'].count().sum(),
               '#Performers_inMNC': CleanListComp_df['BvD_id'][CleanListComp_df['Is in MNC'] == True].count().sum(),
               '#Subs_inMNC':CleanListCompSub_df['Sub_BvD_id'][CleanListCompSub_df['Is in MNC'] == True].count().sum()
              })

#df1 = CleanListCompSub_df[CleanListCompSub_df['Is a First Sub Duplicate'] == True].groupby('Subs_lvl').count()
#df1 = df1[['BvD_id']].rename(columns={'BvD_id': 'Clean_#FirstDuplicate'})

#df2 = CleanListCompSub_df[CleanListCompSub_df['Is a Sub Duplicate'] == True].groupby('Subs_lvl').count()
#df2 = df2[['BvD_id']].rename(columns={'BvD_id': 'Clean_#AllDuplicate'})

#CleanDupSub_bylvl_df = pd.concat([df1,df2], axis = 1, sort = True)

In [None]:
CleanListComp_df.to_csv(AltApp_path.joinpath(r'Orbis_Data\Data_2020\Output tables\#2 - Listed companies - Main R&D performers - Excl Perf_beingSubs.csv'),
                        index = False,
                        columns = ['BvD_id', 'Company_name', 'RnD_Y_LastAv', 'Year'],
                        float_format = '%.10f'                        
                       )

CleanListCompSub_df.to_csv(AltApp_path.joinpath(r'Orbis_Data\Data_2020\Output tables\#2 - Listed companies - Subsidiaries - Excl Perf_beingSubs.csv'),
                           index = False,
                           columns = ['BvD_id', 'Company_name', 'Sub_BvD_id'],
                           float_format = '%.10f'
                          )

#pd.concat([DupSub_bylvl_df, CleanDupSub_bylvl_df], axis = 1).to_csv(
#    AltApp_path.joinpath(r'Orbis_Data\Data_2020\Output tables\#2 - Duplicate Subsidiaries_by lvl - Excl Perf_beingSubs.csv')
#)

pd.DataFrame(report).to_csv(AltApp_path.joinpath(r'Orbis_Data\Data_2020\Output tables\#2 - report.csv'),
                            index = False,
                            columns = ['Set', '#Performers', '#Performers_beingSubs', '#Subs', '#Subs_Dupl', '#MNC',
                                       '#Performers_inMNC', '#Subs_inMNC']
                       )