In [1]:
# Import libraries
import os
import pandas as pd
import numpy as np
from pathlib import Path
import configparser
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Root folder of the "Alternative Approach" workspace; every input/output path
# in this notebook is built from it. It defaults to the shared network drive,
# but the ALTAPP_PATH environment variable can point the notebook at another
# copy of the data without editing the code.
AltApp_path = Path(
    os.environ.get(
        'ALTAPP_PATH',
        r"U:\WP 765 Energy RIC\Private data & analysis\Alternative Approach_Private R&D"
    )
)

In [3]:
# Parser for the run configuration. The custom 'list' converter adds a
# `getlist()` accessor that splits a comma-separated option into stripped items.
config = configparser.ConfigParser(
    converters={'list': lambda x: [i.strip() for i in x.split(',')]}
)

file = AltApp_path.joinpath(r'Orbis_Data\Data_2020\Inputs tables\config.ini')

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with an explicit message rather than
# with a confusing NoOptionError on the first lookup below.
if not config.read(file):
    raise FileNotFoundError('Configuration file not found or unreadable: {}'.format(file))

# Settings read from the DEFAULT section and exposed as notebook-level constants
ORBIS_WORLD_GEO_REGION = config.getlist('DEFAULT','ORBIS_WORLD_GEO_REGION')
YEAR_LASTAV = config.getint('DEFAULT','YEAR_LASTAV')
TEST_PATH = config.get('DEFAULT','TEST_PATH')

In [4]:
# Load output table #1: the initial set of listed companies (main R&D performers)
file = AltApp_path / r'Orbis_Data\Data_2020\Output tables\#1 - Listed companies - Main R&D performers.csv'

ListComp_df = pd.read_csv(filepath_or_buffer = file)

In [5]:
# Load the list of MNC companies that was used as input for the R&D estimation.
# The first eleven sheet columns are parsed under explicit names ('n.a.' is
# treated as missing); only the company name, BvD id, last available year and
# R&D figure are kept afterwards.
file = AltApp_path / r'Orbis_Data\Data_2019\Massive_Download\00_MNCs_2.xlsx'

MNC_df = pd.read_excel(
    file,
    header = 0,
    usecols = list(range(11)),
    names = [
        'JRC_id', 'Company_name', 'BvD_id',
        'GUO_BvD_id', 'GUO_Name', 'GUO_Country_ISO',
        'GUO_NACE_Code', 'GUO_NACE_Desc',
        'Y_LastAv', 'Emp_Y_LastAv', 'RnD_Y_LastAv'
    ],
    na_values = 'n.a.',
    dtype = {
        'JRC_id': int,
        'GUO_NACE_Code': str,
        'Y_LastAv': pd.Int16Dtype(),
        'RnD_Y_LastAv': float
    }
)[['Company_name', 'BvD_id', 'Y_LastAv', 'RnD_Y_LastAv']]

In [6]:
# Build the extended benchmark set used to compare ListComp with MNC:
# stack both id/name lists, keep the first row per BvD id, then flag each
# company by the source list(s) its id appears in.
GroupBench_df = (
    pd.concat([
        ListComp_df[['BvD_id','Company_name']],
        MNC_df[['BvD_id','Company_name']]
    ])
    .drop_duplicates(subset = 'BvD_id')
    .assign(**{
        'Is LC': lambda frame: frame['BvD_id'].isin(ListComp_df['BvD_id']),
        'Is MNC': lambda frame: frame['BvD_id'].isin(MNC_df['BvD_id'])
    })
)

# Persist the extended id list as output table #2
GroupBench_df.to_csv(
    AltApp_path / r'Orbis_Data\Data_2020\Output tables\#2 - Group Benchmark - Extended IDs.csv',
    index = False
)

GroupBench_df.describe()

Unnamed: 0,BvD_id,Company_name,Is LC,Is MNC
count,10029,10029,10029,10029
unique,10029,10027,2,2
top,JP1100001020696,TOA CORPORATION,True,False
freq,1,2,9729,7811


In [7]:
# Import R&D expenses and location for the benchmark companies.
# TEST_PATH (from config.ini) is spliced between the input folder and the
# workbook name, so it selects which copy of the workbook is read.
file = AltApp_path.joinpath(r'Orbis_Data\Data_2020\Inputs tables' + TEST_PATH + 'Group Benchmark.xlsx')

# Column names replace the sheet header; 'n.a.' cells are read as missing.
# The dtype map only lists columns that exist in `names`.
df = pd.read_excel(file,
                   sheet_name = 'Results',
                   names = ['Rank', 'Company_name', 'BvD_id','Entity_type','Country_ISO', 'OpRev_Y_LastAv', 'RnD_Y_2018','RnD_Y_2015','RnD_Y_2010'],
                   na_values = 'n.a.',
                   dtype = {
                       **{col: str for col in ['Company_name','BvD_id','Country_ISO','Entity_type']},
                       **{col: float for col in ['OpRev_Y_LastAv','RnD_Y_2018','RnD_Y_2015','RnD_Y_2010']}
                   }
                  ).drop(columns = ['Rank','Company_name'])

# Inner join: only companies whose BvD id appears in both the extended id list
# and the R&D workbook are kept.
Merged_GroupBench_df = pd.merge(GroupBench_df, df, on='BvD_id', how='inner')

Merged_GroupBench_df.to_csv(AltApp_path.joinpath(r'Orbis_Data\Data_2020\Output tables\#2 - Group Benchmark - RnD expenses.csv'),
                            index = False
                           )

# Summary statistics of the numeric columns, shown as the cell output
Merged_GroupBench_df.describe()

Unnamed: 0,OpRev_Y_LastAv,RnD_Y_2018,RnD_Y_2015,RnD_Y_2010
count,9992.0,9553.0,9892.0,6324.0
mean,2542.714542,85.061069,68.1661,66.436325
std,12521.330842,586.263467,446.802551,377.91315
min,-0.058776,-2.491702,0.0,-5.87219
25%,44.298686,2.029809,1.668955,0.795541
50%,187.099819,6.331588,4.473589,3.499978
75%,886.461359,26.090368,18.20051,17.536111
max,369419.267468,25185.140147,11518.325275,7978.135813


In [18]:
# Distribution calculation: total 2015 R&D of listed companies per R&D size class

# Size classes are left-closed intervals [0, 1000), [1000, 2000), ... ending at
# RND_UPPER_BOUND. The row filter below reuses the same bound so that every
# retained value falls inside one of the bins.
RND_BIN_WIDTH = 1000
RND_UPPER_BOUND = 5000

bins = pd.interval_range(start=0, freq=RND_BIN_WIDTH, end=RND_UPPER_BOUND, closed='left')

# Keep listed companies with a strictly positive 2015 R&D value below the bound
lc_in_range = (
    Merged_GroupBench_df['Is LC']
    & (Merged_GroupBench_df['RnD_Y_2015'] > 0)
    & (Merged_GroupBench_df['RnD_Y_2015'] < RND_UPPER_BOUND)
)

LC_dist_df = Merged_GroupBench_df[lc_in_range].drop(
    columns = ['Is LC', 'OpRev_Y_LastAv', 'RnD_Y_2018','RnD_Y_2010']
)

LC_dist_df['RnD_range'] = pd.cut(LC_dist_df['RnD_Y_2015'], bins)

# observed=False keeps empty size classes in the result (the historical default)
# and avoids the FutureWarning newer pandas emits when it is left implicit.
LC_dist = LC_dist_df[['RnD_range','RnD_Y_2015']].groupby('RnD_range', observed=False).sum()

LC_dist_df.head()

Unnamed: 0,BvD_id,Company_name,Is MNC,Entity_type,Country_ISO,RnD_Y_2015,RnD_range
0,ZA194602245206,ANGLO AMERICAN PLATINUM LIMITED,False,Corporate,ZA,19.499119,"[0, 1000)"
1,ZA193600896306,SAPPI LIMITED,True,Corporate,ZA,18.744978,"[0, 1000)"
2,ZA198900216406,ARCELORMITTAL SOUTH AFRICA LIMITED,False,Corporate,ZA,8.981413,"[0, 1000)"
3,ZA200701623606,ADCOCK INGRAM HOLDINGS LIMITED,False,Corporate,ZA,8.739894,"[0, 1000)"
4,ZW30002KZ,SEED CO LIMITED,False,Corporate,ZW,5.883182,"[0, 1000)"


In [9]:
# NOTE(review): disabled MNC counterpart of the listed-company distribution.
# As written it would reuse `bins` from the distribution cell but, unlike that
# cell, it applies no 0 < RnD_Y_2015 < 5000 filter and keeps 'OpRev_Y_LastAv'.
# pd.cut would therefore label out-of-range values NaN — confirm that is
# intended before re-enabling.
#MNC_dist_df = Merged_GroupBench_df[Merged_GroupBench_df['Is MNC'] == True].drop(columns = ['Is MNC','RnD_Y_2018','RnD_Y_2010'])

#MNC_dist_df['RnD_range'] = pd.cut(MNC_dist_df['RnD_Y_2015'], bins)

#MNC_dist = MNC_dist_df[['RnD_range','RnD_Y_2015']].groupby('RnD_range').sum()

In [19]:
# Distribution display

# NOTE(review): all plotting code below is disabled. It reads `MNC_dist`, which
# is only assigned in commented-out code, so that computation must be restored
# before any of these lines can run. `sns.distplot` is deprecated in newer
# seaborn releases — TODO confirm the installed version or switch to
# histplot/displot when re-enabling.

#sns.set(style="white", palette="muted", color_codes=True)

#sns.catplot(x= 'RnD_range', y = 'RnD_Y_2015', data = MNC_dist)

#sns.distplot(MNC_dist)