In [1]:
# Import libraries
import os
import pandas as pd
from pathlib import Path
import configparser
import json
from tabulate import tabulate

In [2]:
# Root folder of the ORBIS data: config.ini is read from here and the case folder
# is resolved relative to it.
# Set the ORBIS_BASE_PATH environment variable to run the notebook from another
# location; the original shared-drive path remains the default.
base_path = Path(os.environ.get(
    'ORBIS_BASE_PATH',
    r'U:\WP 765 Energy RIC\Private data & analysis\Alternative Approach_Private R&D\Orbis_Data\Data_2020'
))

In [3]:
# Import config parameters
# The extra 'list' converter adds config.getlist(): a comma-separated option is
# split on ',' and each item is stripped of surrounding whitespace.
config = configparser.ConfigParser(
    converters={'list': lambda x: [i.strip() for i in x.split(',')]}
)

# config.read() silently skips files it cannot open, which would only surface
# later as a missing section/option error. Opening the file explicitly makes a
# wrong base_path fail right here with a clear FileNotFoundError.
with open(base_path.joinpath(r'config.ini')) as config_file:
    config.read_file(config_file)

# Name of the config section holding the case-specific settings
CASE = 'EU_28'

# Settings shared by all cases
MAPPING = Path(config.get('DEFAULT','MAPPING_PATH'))
SCREENING_KEYS = config.getlist('DEFAULT','SCREENING_KEYS')

# Case-specific settings
REGION = config.getlist(CASE,'ORBIS_REGION')                       # one input workbook is read per entry
CASE_ROOT = base_path.joinpath(config.get(CASE,'CASE_ROOT_PATH'))  # case folder, relative to base_path
YEAR_LASTAV = config.getint(CASE,'YEAR_LASTAV')
SUBS_ID_FILE_N = config.getint(CASE,'SUBS_ID_FILE_N')
SUBS_FIN_FILE_N = config.getint(CASE,'SUBS_FIN_FILE_N')
GROUPS_FIN_FILE_N = config.getint(CASE,'GROUPS_FIN_FILE_N')
METHOD = config.get(CASE,'SUBS_METHOD')

In [4]:
# Build, for every ORBIS region, the list of listed companies that together
# account for 99% of the mean R&D expenses, plus one summary row per region.
#
# Per-region results are collected in lists and concatenated once after the
# loop: DataFrame.append was removed in pandas 2.0.
region_frames = []
region_reports = []

# Column layout of the 'Results' sheet
id_columns = ['Company_name', 'BvD9', 'BvD_id', 'Country_2DID_ISO']
rnd_columns = ['RnD_Y' + str(YY) for YY in range(10,19)]   # RnD_Y10 ... RnD_Y18

for region in REGION:

    print(region)

    # Read ORBIS input list of publicly listed companies for this world region.
    # The sheet lists the R&D columns from the most recent year backwards,
    # hence the reversed order in `names`.
    df = pd.read_excel(CASE_ROOT.joinpath(r'Input\Listed companies - ' + region + '.xlsx'),
                       sheet_name = 'Results',
                       names = ['Rank'] + id_columns + rnd_columns[::-1],
                       na_values = 'n.a.',
                       dtype = {
                           **{col: str for col in id_columns},
                           **{col: float for col in rnd_columns}
                       }
                      ).drop(columns = 'Rank')

    df['Y_LastAv'] = YEAR_LASTAV

    # Mean R&D over the available years (missing years are skipped)
    df['RnD_mean'] = df[rnd_columns].mean(axis = 1, skipna = True)

    # R&D of the last available year, looked up by its two-digit suffix.
    # NOTE(review): raises KeyError when YEAR_LASTAV falls outside the years
    # listed in rnd_columns - confirm the configured value.
    df['RnD_Y_LastAv'] = df['RnD_Y' + str(abs(YEAR_LASTAV) % 100)]

    # Identify the top companies that constitute 99% of the R&D expenses:
    # `count` is the smallest number of top-ranked companies whose cumulated
    # RnD_mean reaches the threshold. One sorted cumulative sum replaces the
    # previous loop that re-ran nlargest() for every candidate count.
    threshold = 0.99 * df['RnD_mean'].sum()
    if threshold > 0:
        cumulated = df['RnD_mean'].sort_values(ascending = False).cumsum()   # NaN rows sort last
        reached = (cumulated >= threshold).to_numpy()
        count = int(reached.argmax()) + 1 if reached.any() else len(df)
    else:
        # Nothing to cover (e.g. no R&D reported at all): select no company
        count = 0

    ListComp_df_region = df.nlargest(count, ['RnD_mean'])

    # Calculates main regional statistics
    region_report_df = pd.DataFrame({'Year': YEAR_LASTAV,
                                     'Total_BvD9_count': df['BvD9'].count(),
                                     'Total_BvD_id_count': df['BvD_id'].count(),
                                     'Total_RnD_mean': df['RnD_mean'].sum(),
                                     'Selected_BvD9_count': ListComp_df_region['BvD9'].count(),
                                     'Selected_BvD_id_count': ListComp_df_region['BvD_id'].count(),
                                     'Sum_of_selected_RnD_mean': ListComp_df_region['RnD_mean'].sum()
                                    }, index = [region])

    region_frames.append(ListComp_df_region)
    region_reports.append(region_report_df)

# Consolidate statistics and list of top R&D performers over different regions
# (pd.concat rejects an empty list, so fall back to empty frames when REGION is empty)
ListComp_df = pd.concat(region_frames) if region_frames else pd.DataFrame()
report_df = pd.concat(region_reports) if region_reports else pd.DataFrame()

if region_reports:
    # As before, the report index ends up named after the last processed region
    report_df.index.name = region

EU_28


In [5]:
# Drop duplicates: keep the first occurrence of every BvD_id across regions
ListComp_df_clean = ListComp_df.drop_duplicates(subset = 'BvD_id', keep = 'first')

# Update statistics: totals refer to the consolidated list, "selected" figures
# to the list after de-duplication on BvD_id
region_report_df = pd.DataFrame({'Year': YEAR_LASTAV,
                                 'Total_BvD9_count': ListComp_df['BvD9'].count(),
                                 'Total_BvD_id_count': ListComp_df['BvD_id'].count(),
                                 'Total_RnD_mean': ListComp_df['RnD_mean'].sum(),
                                 'Selected_BvD9_count': ListComp_df_clean['BvD9'].count(),
                                 'Selected_BvD_id_count': ListComp_df_clean['BvD_id'].count(),
                                 'Sum_of_selected_RnD_mean': ListComp_df_clean['RnD_mean'].sum()
                                }, index = ['Clean_Bvd_ID'])

# DataFrame.append was removed in pandas 2.0, so use pd.concat.
# Dropping a previous 'Clean_Bvd_ID' row first keeps the cell idempotent:
# re-running it replaces the row instead of adding it a second time.
report_df = pd.concat([report_df.drop(index = 'Clean_Bvd_ID', errors = 'ignore'), region_report_df])
report_df.index.name = 'Clean_Bvd_ID'

In [6]:
# Drop duplicates: keep the first occurrence of every BvD9 across regions.
# NOTE(review): this starts again from ListComp_df and overwrites the
# BvD_id-de-duplicated ListComp_df_clean, so the list used downstream is
# de-duplicated on BvD9 only - confirm this is intended.
ListComp_df_clean = ListComp_df.drop_duplicates(subset = 'BvD9', keep = 'first')

# Update statistics: totals refer to the consolidated list, "selected" figures
# to the list after de-duplication on BvD9
region_report_df = pd.DataFrame({'Year': YEAR_LASTAV,
                                 'Total_BvD9_count': ListComp_df['BvD9'].count(),
                                 'Total_BvD_id_count': ListComp_df['BvD_id'].count(),
                                 'Total_RnD_mean': ListComp_df['RnD_mean'].sum(),
                                 'Selected_BvD9_count': ListComp_df_clean['BvD9'].count(),
                                 'Selected_BvD_id_count': ListComp_df_clean['BvD_id'].count(),
                                 'Sum_of_selected_RnD_mean': ListComp_df_clean['RnD_mean'].sum()
                                }, index = ['Clean_Bvd9'])

# DataFrame.append was removed in pandas 2.0, so use pd.concat.
# Dropping a previous 'Clean_Bvd9' row first keeps the cell idempotent:
# re-running it replaces the row instead of adding it a second time.
report_df = pd.concat([report_df.drop(index = 'Clean_Bvd9', errors = 'ignore'), region_report_df])
report_df.index.name = 'Clean_Bvd9'

In [7]:
# Load the country mapping table stored in the case folder.
# NOTE(review): with pandas' default NA parsing a literal 'NA' cell (e.g. the
# ISO-2 code of Namibia) is read as missing - confirm whether the table contains it.
country_table_file = CASE_ROOT / r'Mapping\Country_table.csv'
country_df = pd.read_csv(country_table_file)

In [8]:
# Attach the 3-letter ISO code and the World_Player label to every selected
# company by looking up its 2-letter country code in the country table.
# Left join: companies without a matching country row are kept.
country_lookup = country_df[['Country_2DID_ISO','Country_3DID_ISO','World_Player']]

ListComp_df_clean_merge = ListComp_df_clean.merge(
    country_lookup,
    on='Country_2DID_ISO',
    how='left',
    suffixes=(False, False)   # no suffixes: overlapping non-key columns raise instead of being renamed
)

In [9]:
# Write the Step #1 section of the report.
# Mode 'w' starts the file from scratch: an existing Report.txt is overwritten.
with CASE_ROOT.joinpath(r'Report.txt').open('w') as report:
    print('Step #1 - Inititial listed company set\n\n' + 'RnD in EUR million\n\n', end = '', file = report)
    print(tabulate(report_df, tablefmt = 'simple', headers = report_df.columns), end = '\n\n', file = report)

# Save the selected companies, restricted to the columns below
output_columns = ['BvD9', 'BvD_id','Company_name', 'Country_3DID_ISO', 'World_Player','RnD_mean','Y_LastAv','RnD_Y_LastAv']
output_file = CASE_ROOT.joinpath(r'Listed companies.csv')

ListComp_df_clean_merge.to_csv(
    output_file,
    columns = output_columns,
    index = False,
    float_format = '%.10f',   # fixed notation with 10 decimals
    na_rep = 'n.a.'           # same missing-value marker as the ORBIS input
)