In [None]:
# Import libraries
import pandas as pd
import method as mtd
import config as cfg
import report as rpt
import datetime
import json
from pathlib import Path

In [None]:
# Unpack the five values returned by the project-local cfg.init();
# cases and files are the ones used by the cells visible below.
cases, cases_as_strings, files, use_case, place = cfg.init()

In [None]:
 # Initialize DFs
# Independent, empty accumulators: all companies read vs. the selected top companies.
all_comp = pd.DataFrame()
main_comp = pd.DataFrame()

# step_report collects the statistics of this step; report is the container it is
# later stored in. They must be two distinct dicts: a chained `a = b = {}` binds
# both names to the same object, so storing step_report inside report would make
# the dict contain itself (and leak the per-region keys into the top level).
step_report = {}
report = {}

# Used as a file-name prefix when reading and as a key into files['OUTPUT'] when saving.
company_type = ''

# Running number of the input file being read.
i = 0

In [None]:
print('Read input table ...')

# Share of the summed mean R&D that the selected top companies must reach.
rnd_share_target = 0.99

# Column names shared by every regional sheet.
id_columns = ['company_name', 'bvd9', 'bvd_id', 'country_2DID_iso']
rnd_columns = ['rnd_y' + str(yy) for yy in range(10, 19)]

# Two-digit suffix of the last available year and the matching R&D column.
year_suffix = str(cases['YEAR_LASTAV'])[-2:]
rnd_lastav_column = 'rnd_y' + year_suffix

regions = cases['REGIONS']

# Per-region frames are gathered in lists and concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0, and rebuilding the results from
# local lists keeps this cell idempotent when it is re-run.
region_frames = []
region_selections = []

for i, region in enumerate(regions, start=1):
    print(region + ' (File #' + str(i) + '/' + str(len(regions)) + ')')

    # Read input list of companies by world region. Column names are assigned
    # positionally: rank, the identifier columns, then rnd_y18 down to rnd_y10.
    df = pd.read_excel(
        cases['CASE_ROOT'].joinpath(
            r'input/main_comps_id/' + str(company_type) + 'listed companies - ' + region + '.xlsx'
        ),
        sheet_name='Results',
        names=['rank'] + id_columns + rnd_columns[::-1],
        na_values='n.a.',
        dtype={
            **{col: str for col in id_columns},
            **{col: float for col in rnd_columns}
        }
    ).drop(columns='rank')

    df['y_lastav'] = cases['YEAR_LASTAV']

    # Row-wise mean over the yearly R&D columns, ignoring missing years
    df['rnd_mean'] = df[rnd_columns].mean(axis=1, skipna=True)

    df['rnd_y_lastav'] = df[rnd_lastav_column]

    # Identify the top companies that constitute 99% of the R&D expenses:
    # the smallest number of largest rnd_mean values whose running total reaches
    # the threshold. One sort + cumulative sum replaces one nlargest() call per
    # candidate count.
    threshold = rnd_share_target * df['rnd_mean'].sum()
    count = 0
    if threshold > 0:
        running_total = df['rnd_mean'].dropna().sort_values(ascending=False).cumsum()
        reached = (running_total >= threshold).to_numpy()
        # argmax on a boolean array gives the position of the first True
        count = int(reached.argmax()) + 1 if reached.any() else len(running_total)

    main_comp_region = df.nlargest(count, ['rnd_mean'])

    # Calculates main regional statistics
    step_report[region.capitalize()] = {
        'total_bvd9': df['bvd9'].count(),
        'total_rnd_y' + year_suffix: df[rnd_lastav_column].sum(),
        'selected_bvd9': main_comp_region['bvd9'].count(),
        'selected_rnd_y' + year_suffix: main_comp_region[rnd_lastav_column].sum()
    }

    region_frames.append(df)
    region_selections.append(main_comp_region)

# Consolidate the full list and the list of top R&D performers over all regions.
# Original row labels are kept (no ignore_index), as before.
if region_frames:
    all_comp = pd.concat(region_frames)
    main_comp = pd.concat(region_selections)

In [None]:
print('Clean output table ...')

# One row per bvd9 identifier: the first occurrence is retained (the default
# `keep` policy of drop_duplicates), later rows with the same bvd9 are removed.
main_comp_clean = main_comp.drop_duplicates(subset=['bvd9'])

In [None]:
# Add the overall figures to the step report: non-null bvd9 counts and summed
# R&D of the last available year, for all companies and for the de-duplicated
# selection.
year_suffix = str(cases['YEAR_LASTAV'])[-2:]
rnd_lastav_column = 'rnd_y' + year_suffix

step_report['Total'] = {
    'total_bvd9': all_comp['bvd9'].count(),
    'total_rnd_y' + year_suffix: all_comp[rnd_lastav_column].sum(),
    'selected_bvd9': main_comp_clean['bvd9'].count(),
    'selected_rnd_y' + year_suffix: main_comp_clean[rnd_lastav_column].sum(),
}

In [None]:
print('Merging with country_map ...')

# NOTE(review): `country_map` is not assigned in any cell visible here, so this
# cell raises NameError on a fresh kernel unless it is loaded elsewhere - confirm
# where it is expected to come from and load it explicitly.

# Merging group country_map for allocation to world player categories.
# Left join on the 2-letter country code: every selected company is kept, and the
# mapping columns are NaN where the code has no match in country_map.
# suffixes=(None, None) is the documented way to request "no suffix"; an
# overlapping non-key column name raises instead of being silently renamed.
merged = pd.merge(
    main_comp_clean,
    country_map[['country_2DID_iso', 'country_3DID_iso', 'world_player']],
    on='country_2DID_iso',
    how='left',
    suffixes=(None, None)
)

In [None]:
print('Saving main companies output file ...')

# Columns written to the output file, in this order
output_columns = [
    'bvd9', 'bvd_id', 'company_name', 'country_3DID_iso', 'world_player',
    'rnd_mean', 'y_lastav', 'rnd_y_lastav',
]

# Destination is looked up in the files configuration under the current company_type
main_comps_path = files['OUTPUT'][company_type]['ID_EXT']['MAIN_COMPS']

# Write the selected main companies without the index; floats are written with
# 10 decimals and missing values as 'n.a.'
merged.to_csv(
    main_comps_path,
    columns=output_columns,
    index=False,
    float_format='%.10f',
    na_rep='n.a.',
)

In [None]:
# File the statistics gathered in this notebook under the step's key in the report
report.update({'load_subsidiary_financials_by_regions': step_report})

In [None]:
# Pass the assembled report and the case configuration to the project-local
# report module (what rpt.update does with them is not visible in this notebook).
rpt.update(report, cases)