In [1]:
# Import libraries
import os
import pandas as pd
from pathlib import Path
import configparser
import json
from tabulate import tabulate

In [2]:
# Initialize
# Root folder holding config.ini, Keywords.json and the case sub-folders.
# The ORBIS_BASE_PATH environment variable, when set, overrides the default
# network-drive location so the notebook can run on another machine.
base_path = Path(
    os.environ.get(
        'ORBIS_BASE_PATH',
        r'U:\WP 765 Energy RIC\Private data & analysis\Alternative Approach_Private R&D\Orbis_Data\Data_2020'
    )
)

# Accumulates one dict per processing stage; turned into the report table later
report = []

In [3]:
# Import config parameters
# The extra 'list' converter adds config.getlist(), which splits a
# comma-separated option value into a list of whitespace-stripped strings.
config = configparser.ConfigParser(
    converters={'list': lambda x: [i.strip() for i in x.split(',')]}
)

config_path = base_path.joinpath(r'config.ini')

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files actually parsed, so an empty result means the config was not loaded.
# Fail here rather than with a confusing NoSectionError/NoOptionError below.
if not config.read(config_path):
    raise FileNotFoundError(
        'Configuration file not found or unreadable: {}'.format(config_path)
    )

# Name of the config section holding the case-specific options
CASE = 'EU_28'

# Options shared by all cases
MAPPING = Path(config.get('DEFAULT','MAPPING_PATH'))
SCREENING_KEYS = config.getlist('DEFAULT','SCREENING_KEYS')

# Options read from the [CASE] section
REGION = config.getlist(CASE,'ORBIS_REGION')
CASE_ROOT = base_path.joinpath(config.get(CASE,'CASE_ROOT_PATH'))
YEAR_LASTAV = config.getint(CASE,'YEAR_LASTAV')
SUBS_ID_FILE_N = config.getint(CASE,'SUBS_ID_FILE_N')
SUBS_FIN_FILE_N = config.getint(CASE,'SUBS_FIN_FILE_N')
GROUPS_FIN_FILE_N = config.getint(CASE,'GROUPS_FIN_FILE_N')
METHOD = config.get(CASE,'SUBS_METHOD')

In [4]:
# Load the subsidiaries financials table for the current case.
# 'n.a.' cells are parsed as missing values; both BvD identifier columns are
# forced to text so they are not parsed as numbers.
subs_fin_path = CASE_ROOT.joinpath(r'Listed companies subsidiaries - Financials.csv')

Subs_fin_df = pd.read_csv(
    subs_fin_path,
    na_values='n.a.',
    dtype={'BvD9': str, 'BvD_id': str},
)

In [5]:
# Log the number of subsidiaries with a BvD9 identifier in the raw input.
# The keyword column must use the same key as the report table built later
# ('#Subs matching keywords'); otherwise the 'n.a.' placeholder is discarded
# and shows up as NaN. Series.count() already returns a scalar, so no .sum().
report.append({'Set': 'Initial',
               '#Subs': Subs_fin_df['BvD9'].count(),
               '#Subs matching keywords': 'n.a.'
              })

In [6]:
# Drop subsidiaries without a BvD9 identifier, then duplicated identifiers.
# drop_duplicates() alone would keep one row whose BvD9 is missing, so the
# missing identifiers are removed explicitly first.
Subs_fin_df = Subs_fin_df.dropna(subset=['BvD9']).drop_duplicates('BvD9')

In [7]:
# Log the number of subsidiaries left after cleaning.
# The keyword column must use the same key as the report table built later
# ('#Subs matching keywords'); otherwise the 'n.a.' placeholder is discarded
# and shows up as NaN. Series.count() already returns a scalar, so no .sum().
report.append({'Set': 'Clean',
               '#Subs': Subs_fin_df['BvD9'].count(),
               '#Subs matching keywords': 'n.a.'
              })

In [8]:
# Read keywords
# The loop below expects a JSON object whose keys are category names and whose
# values are iterables of keyword strings -- TODO confirm against Keywords.json.
with open(base_path.joinpath(r'Keywords.json'), 'r') as file:
    keywords = json.load(file)

categories = list(keywords.keys())

# Free-text description columns searched for every keyword
desc_columns = ['Trade_desc', 'Prod&Serv_desc', 'FullOverview_desc']

# Calculate exposure mask: one boolean column per category, True when any of
# the category's keywords occurs (case-insensitive, literal match) in any of
# the description columns.
for category in categories:

    Subs_fin_df[category] = False

    for keyword in keywords[category]:
        for desc_column in desc_columns:
            # na=False: a missing description counts as "no match" instead of
            # injecting NaN into the OR, which could hide a genuine match
            # found in another description column.
            Subs_fin_df[category] |= Subs_fin_df[desc_column].str.contains(
                keyword, case = False, regex = False, na = False
            )

In [9]:
# Calculate exposure at subsidiary level
# .copy() makes Sub_exp_df an independent frame, so the column assignments
# below neither touch Subs_fin_df nor raise SettingWithCopyWarning.
Sub_exp_df = Subs_fin_df.loc[:,['Company_name', 'BvD9'] + categories].copy()

# Sum of the OpRevY10 ... OpRevY19 columns per subsidiary (missing values are
# skipped by sum()).
turnover_columns = ['OpRevY' + str(YY) for YY in range(10,20)]
Sub_exp_df['Sub_Turnover'] = Subs_fin_df.loc[:,turnover_columns].sum(axis = 1)

# A subsidiary passes the screening when at least one category flag is True,
# ignoring the 'Generation' and 'RnD' categories.
screening_categories = [cat for cat in categories if cat not in ['Generation','RnD']]
Sub_exp_df['Keyword_mask'] = Subs_fin_df[screening_categories].any(axis = 1)

In [10]:
# Log the counts after keyword screening: all subsidiaries with a BvD9
# identifier, and those whose Keyword_mask is True.
n_subs_total = Subs_fin_df['BvD9'].count().sum()
matched_rows = Sub_exp_df['Keyword_mask'].eq(True)
n_subs_matched = Sub_exp_df.loc[matched_rows, 'BvD9'].count().sum()

report.append({
    'Set': 'Screened',
    '#Subs': n_subs_total,
    '#Subs matching keywords': n_subs_matched,
})

In [11]:
# Turn the collected stage entries into a table and keep only the reported
# columns, in display order.
report_columns = ['Set', '#Subs', '#Subs matching keywords']

report_df = pd.DataFrame(report).loc[:, report_columns]

In [13]:
# Append report
# The handle is named report_file so it does not shadow the `report` list;
# rebinding `report` to a (closed) file object would break any re-run of the
# report.append(...) cells.
with open(CASE_ROOT.joinpath(r'Report.txt'),'a') as report_file:
    report_file.write('#3 - Keyword screening of subsidiaries\n\n')
    report_file.write(tabulate(report_df, tablefmt = 'simple', headers = report_df.columns, showindex = False
                              )
                     )
    report_file.write('\n\n')

# Save output tables
# NOTE: this overwrites the input financials file read earlier in the notebook,
# now including the per-category keyword flag columns.
Subs_fin_df.to_csv(CASE_ROOT.joinpath(r'Listed companies subsidiaries - Financials.csv'),
                   index = False,
                   float_format = '%.10f',
                   na_rep = 'n.a.'
                  )

# Screening table: BvD9 plus the columns listed in SCREENING_KEYS (from config)
Sub_exp_df.to_csv(CASE_ROOT.joinpath(r'Listed companies subsidiaries - Screening.csv'),
                  index = False,
                  columns = ['BvD9'] + SCREENING_KEYS,
                  float_format = '%.10f',
                  na_rep = 'n.a.'
                 )