# 1. Libraries and File Paths

In [17]:
import pandas as pd
import numpy as np
import os
from functools import reduce

In [None]:
import_file_path = rf"..\\Input"
export_file_path = rf"..\\Output"

# 2. Functions and frequent lists

In [19]:
# tornqvist/quantity index function

def tornqvist_index(df, q_vars, v_vars, industry_column=None):
    df = df.copy()
    total_v = df[v_vars].sum(axis=1)
    for col in v_vars:
        df[f'w_{col}'] = df[col] / total_v
        
    log_index_sum = 0
    for q_var, v_var in zip(q_vars, v_vars):
        w_var = f'w_{v_var}'
        
        if industry_column is not None:
            log_change = np.log(df[q_var] / df.groupby(industry_column)[q_var].shift(1))
            avg_weight = 0.5 * (df[w_var] + df.groupby(industry_column)[w_var].shift(1))
        else:
            log_change = np.log(df[q_var] / df[q_var].shift(1))
            avg_weight = 0.5 * (df[w_var] + df[w_var].shift(1))

        log_index_sum += avg_weight * log_change

    q_growth_rate = np.exp(log_index_sum)

    if industry_column is not None:
        q_growth_rate.loc[df.groupby(industry_column).head(1).index] = 100
        qi = q_growth_rate.groupby(df[industry_column]).cumprod()
    else:
        q_growth_rate.iloc[0] = 100
        qi = q_growth_rate.cumprod()

    return qi

In [20]:
# normalise to a base year

def normalise(df, variable, year, industry_column):
    base_year_values = {} 
    normaliser = {}
    df_year = df[df['year'] == year].set_index(industry_column)
    base_year_values = df_year[variable].to_dict()
    normaliser = df[industry_column].map(base_year_values)
    df[variable] = ((df[variable] / normaliser) * 100)
    return df[variable]

In [21]:
# clean list or df by removing spaces and converting to lowercase

def clean(obj):
    if isinstance(obj, pd.DataFrame):
        obj.columns = [col.replace(' ', '_').lower() for col in obj.columns]
        return obj
    
    elif isinstance(obj, list):
        return [s.replace(' ', '_').lower() for s in obj]
    
    elif isinstance(obj, str):
        return obj.replace(' ', '_').lower()

In [22]:
# lists of variable types

core_variables = ['GO', 'CAP', 'LAB', 'II']

constant_variables = ['REAL_GO', 'REAL_CAP', 'REAL_LAB', 'REAL_II']

qi_variables = ['GO_QI', 'CAP_QI', 'LAB_QI', 'II_QI']

productivity_index_variables = ['TFP_GO', 'TFP_VA', 'LP_GO', 'LP_VA']

In [23]:
# aggregate industries function

aggregate_groups = {
    2936: list(range(29, 37)),
    3740: list(range(37, 41)),
    4144: list(range(41, 45)),
    4749: list(range(47, 50)),
    5152: list(range(51, 53)),
    5456: list(range(54, 57)),
    5758: list(range(57, 59))}

def aggregate_industries(df):
    aggregate_dict = {}
    df = df.reset_index()
    for qi in qi_variables:
        for agg_code, industries in aggregate_groups.items():
            data_slice = df[df['industry_id'].isin(industries)].copy()

            q_vars = []
            v_vars = []

            for industry in industries:
                q = f'{industry}_{qi}'
                v = f'{industry}_va'

                services_index = df[df['industry_id'] == industry][['year', qi]].rename(columns={qi: q})
                va = df[df['industry_id'] == industry][['year', 'VA']].rename(columns={'VA': v})
            
                data_slice = data_slice.merge(services_index, on='year', how='left')
                data_slice = data_slice.merge(va, on='year', how='left')
            
                q_vars.append(q)
                v_vars.append(v)

            sum_cols = ['VA']
            data_slice[sum_cols] = data_slice.groupby('year')[sum_cols].transform('sum')
            data_slice = data_slice.drop_duplicates(subset='year')

            data_slice[qi] = tornqvist_index(data_slice, q_vars=q_vars, v_vars=v_vars, industry_column='industry_id')
            data_slice['industry_id'] = agg_code

            if agg_code not in aggregate_dict:
                aggregate_dict[agg_code] = data_slice[['year', 'industry_id'] + [qi] + sum_cols]
            else:
                aggregate_dict[agg_code] = aggregate_dict[agg_code].merge(data_slice[['year', 'industry_id', qi]], on=['year', 'industry_id'], how='left')

    aggregate_df = pd.concat(aggregate_dict.values(), ignore_index=True)

    df = pd.concat([df, aggregate_df], ignore_index=True).set_index(['year', 'industry_id'])
    return df

In [24]:
# chain together two dataframes function 

def chain(df_1, df_2, year):
    df_1 = df_1.reset_index()
    df_2 = df_2.reset_index()

    df_2_scaled = df_2.copy()

    for qi in qi_variables:
        scalers = f'{qi}_scalers'

        df_1_year = df_1[df_1['year'] == year][['industry_id', qi]].rename(columns={qi: scalers})

        df_2_scaled = df_2_scaled.merge(df_1_year, on='industry_id', how='left')
        df_2_scaled[scalers] = df_2_scaled[scalers].fillna(100)
        df_2_scaled[qi] *= df_2_scaled[scalers]
        df_2_scaled[qi] = df_2_scaled[qi] / 100
        df_2_scaled = df_2_scaled.drop(columns=scalers)

    df_1 = df_1[df_1['year'] != year]

    df_new = pd.concat([df_1, df_2_scaled], ignore_index=True)
    df_new = df_new.set_index(['year', 'industry_id']).sort_values(by=['industry_id', 'year'])

    return df_new

In [25]:
# constant values function 

def constant_values(df, variable, year):
    df = df.copy()
    values_year = {}
    df_year = df[df['year'] == year].set_index('industry_id')
    values_year = df_year[variable].to_dict()
    df[f'{variable}_{year}'] = df['industry_id'].map(values_year)
    df[f'{variable}_value_index'] = df[variable] / df[f'{variable}_{year}']
    df[f'{variable}_value_index'] *= 100
    df[f'{variable}_price_index'] = df[f'{variable}_value_index'] / df[f'{variable}_QI']
    df[f'REAL_{variable}'] = df[variable] / df[f'{variable}_price_index']
    df[f'REAL_{variable}'] = df[f'REAL_{variable}'].round(4)
    return df[f'REAL_{variable}']

In [26]:
# recover index function 

def recover_index(df, index_name, variable):
    df['ln_' + variable] = df.groupby('industry_id')['delta_ln_' + variable].cumsum().fillna(0)
    df[index_name] = np.exp(df['ln_' + variable])
    df[index_name] *= 100
    return df[index_name]

In [27]:
# lag 

def lag(df, variable):
    df[f'{variable}_lag'] = df.groupby('industry_id')[variable].transform(lambda x: x.shift(1))
    return df[f'{variable}_lag']

In [28]:
# delta

def delta(df, variable):
    df[f'delta_{variable}'] = df.groupby('industry_id')[variable].transform(lambda x: x - x.shift(1))
    return df[f'delta_{variable}']

# 3. 1947 to 2016

In [29]:
# importing data

required_columns = ['yr', 'indnum', 'go.', 'goqi.', 'ii.', 'iiqi.', 'vlcol.', 'vln.', 'vkit.', 'vksoft.', 'vkRD.', 'vkart.', 'vkoth.', 'qlindexcol_merge.', 'qlindexn_merge.', 'qkit.', 'qks.', 'qkrd.', 'qka.', 'qko.', 'hrs']

data_1 = pd.read_excel(os.path.join(import_file_path,'industry_production_account_experimental.xlsx'), sheet_name='1947-1963', skiprows=1, usecols=required_columns)
data_2 = pd.read_excel(os.path.join(import_file_path,'industry_production_account_experimental.xlsx'), sheet_name='1963-2016', skiprows=1, usecols=required_columns)

In [30]:
# quantity indices

data_1 = data_1.reset_index()
data_2 = data_2.reset_index()

q_vars_dict_1 = {
    'GO': ['goqi.'],
    'CAP': ['qkit.', 'qks.', 'qkrd.', 'qka.', 'qko.'],
    'LAB': ['qlindexcol_merge.', 'qlindexn_merge.'],
    'II': ['iiqi.']}

v_vars_dict_1 = {
    'GO': ['go.'],
    'CAP':  ['vkit.', 'vksoft.', 'vkRD.', 'vkart.', 'vkoth.'],
    'LAB': ['vlcol.', 'vln.'],
    'II': ['ii.']}

for df in [data_1, data_2]:
    for variable in core_variables:
        q_vars = q_vars_dict_1[variable]
        v_vars = v_vars_dict_1[variable]
        df[f'{variable}_QI'] = tornqvist_index(df, q_vars, v_vars, industry_column='indnum')

data_1['VA'] = data_1['go.'] - data_1['ii.']
data_2['VA'] = data_2['go.'] - data_2['ii.']

data_1.reset_index(inplace=True)
data_2.reset_index(inplace=True)
data_1 = data_1[qi_variables + ['hrs', 'VA', 'indnum', 'yr']]
data_2 = data_2[qi_variables + ['hrs', 'VA', 'indnum', 'yr']]
data_1 = data_1.rename(columns={'hrs': 'hours', 'indnum': 'industry_id', 'yr': 'year'})
data_2 = data_2.rename(columns={'hrs': 'hours', 'indnum': 'industry_id', 'yr': 'year'})
data_1 = data_1.set_index(['year', 'industry_id'])
data_2 = data_2.set_index(['year', 'industry_id'])

In [31]:
# aggregate industries
qi_variables = ['GO_QI', 'CAP_QI', 'LAB_QI', 'II_QI', 'hours']

data_2 = aggregate_industries(data_2)
data_2 = data_2.drop(columns=['VA'])
data_1 = data_1.drop(columns=['VA'])

In [32]:
data_1.reset_index(inplace=True)
data_2.reset_index(inplace=True)

data_1['hours'] = normalise(data_1, 'hours', year=1947, industry_column='industry_id')
data_2['hours'] = normalise(data_2, 'hours', year=1963, industry_column='industry_id')

In [33]:
# chain together data_1 (early period) and data_2 (late period)

df_qi_47_to_16 = chain(data_1, data_2, 1963)

df_47_to_16_hours = df_qi_47_to_16['hours'].copy()

qi_variables = ['GO_QI', 'CAP_QI', 'LAB_QI', 'II_QI']

# 4. BEA (1997 to 2023) and KLEMS2017 (1947 to 2014)

In [34]:
# importing and processing BEA data

cap_qty_list = ['Capital_Art_Quantity', 'Capital_R&D_Quantity', 'Capital_IT_Quantity', 'Capital_Other_Quantity', 'Capital_Software_Quantity']
cap_comp_list = ['Capital_Art Compensation', 'Capital_R&D Compensation', 'Capital_IT Compensation', 'Capital_Other Compensation', 'Capital_Software Compensation']

lab_qty_list = ['Labor_Col_Quantity', 'Labor_NoCol_Quantity']
lab_comp_list = ['Labor_Col Compensation', 'Labor_NoCol Compensation']

ii_qty_list = ['Energy_Quantity', 'Materials_Quantity', 'Services_Quantity']
ii_comp_list = ['Energy Compensation', 'Materials Compensation', 'Service Compensation']

relevant_sheets = cap_qty_list + cap_comp_list + lab_qty_list + lab_comp_list + ii_qty_list + ii_comp_list + ['Value Added'] + ['VA_Quantity'] + ['Gross Output'] + ['Gross Output_Quantity'] + ['Labor Hours_Quantity']

long_data = []

for sheet in relevant_sheets:
    df = pd.read_excel(os.path.join(import_file_path, 'industry_production_account_capital.xlsx'), sheet_name=sheet, header=[1])
    df = df.dropna(how='all')
    df.rename(columns={df.columns[0]: 'industry'}, inplace=True) 
    df_long = df.melt(id_vars='industry', var_name='Year', value_name=sheet)
    long_data.append(df_long)

df = reduce(lambda left, right: pd.merge(left, right, on=['industry', 'Year'], how='outer'), long_data)

industry_order = long_data[0]['industry'].drop_duplicates().tolist()
industry_id_map = {industry: i+1 for i, industry in enumerate(industry_order)}
df['industry_id'] = df['industry'].map(industry_id_map)

df = clean(df)
cap_qty_list = clean(cap_qty_list)
cap_comp_list = clean(cap_comp_list)
lab_qty_list = clean(lab_qty_list)
lab_comp_list = clean(lab_comp_list)
ii_qty_list = clean(ii_qty_list)
ii_comp_list = clean(ii_comp_list)

df['year'] = df['year'].astype(int)
df = df.set_index(['industry_id', 'year']).sort_index(level=['industry_id'])

In [35]:
# importing and processing KLEMS data

df_klems = pd.read_excel(os.path.join(import_file_path, 'usa_wk_mar_2017.xlsx'), sheet_name='KLEMdata', header=[1])
df_klems = df_klems[['year', 'industry', 'gross output', 'capital', 'labor', 'intermediate']]
df_klems = df_klems.rename(columns={'gross output': 'GO', 'capital': 'CAP', 'labor': 'LAB', 'intermediate': 'II', 'industry': 'industry_id'})

In [36]:
# nominal values

df = df.rename(columns={'gross_output': 'GO', 'value_added': 'VA', 'labor_hours_quantity': 'hours'})
df['LAB'] = df[lab_comp_list].sum(axis=1)
df['CAP'] = df[cap_comp_list].sum(axis=1)
df['II'] = df[ii_comp_list].sum(axis=1)

df_post_2014 = df.copy()
df_post_2014 = df_post_2014.reset_index()
df_post_2014 = df_post_2014[core_variables + ['industry_id', 'year', 'hours']]
df_post_2014 = df_post_2014[df_post_2014['year'] > 2014]

df_klems_62_63 = df_klems['industry_id'].isin([62, 63])
df_62 = df_klems[df_klems_62_63].groupby('year').sum()
df_62['industry_id'] = 62
df_62 = df_62.reset_index()

df_klems_64_65 = df_klems['industry_id'].isin([64, 65])
df_63 = df_klems[df_klems_64_65].groupby('year').sum()
df_63['industry_id'] = 63
df_63 = df_63.reset_index()

df_klems = df_klems[~df_klems['industry_id'].isin([62, 63, 64, 65])]
df_klems = pd.concat([df_klems, df_62, df_63], ignore_index=True)

df_nominal_47_to_23 = pd.concat([df_post_2014, df_klems], ignore_index=True)

# nominal values in aggregate industries

aggregate_dict = {}

for agg_code, industries in aggregate_groups.items():
    data_slice = df_nominal_47_to_23[df_nominal_47_to_23['industry_id'].isin(industries)].copy()
    data_slice[core_variables] = data_slice.groupby('year')[core_variables].transform('sum')
    data_slice = data_slice.drop_duplicates(subset='year')
    data_slice['industry_id'] = agg_code
    aggregate_dict[agg_code] = data_slice[['year', 'industry_id'] + core_variables]

df_nominal_47_to_23 = pd.concat([df_nominal_47_to_23] + list(aggregate_dict.values()), ignore_index=True).set_index(['year', 'industry_id']).sort_index(level=['industry_id'])

In [37]:
# quantity indices

df = df.reset_index()

q_vars_dict_2 = {
    'GO': ['gross_output_quantity'],
    'CAP': cap_qty_list,
    'LAB': lab_qty_list,
    'II': ii_qty_list}

v_vars_dict_2 = {
    'GO': ['GO'],
    'CAP': cap_comp_list,
    'LAB': lab_comp_list,
    'II': ii_comp_list}

for variable in core_variables:
    q_vars = q_vars_dict_2[variable]
    v_vars = v_vars_dict_2[variable]
    df[f'{variable}_QI'] = tornqvist_index(df, q_vars, v_vars, industry_column='industry_id')

df['year'] = df['year'].astype(int)

df_qi_97_to_23 = df[qi_variables + ['year', 'industry_id', 'VA', 'hours']] 
df_qi_97_to_23['hours'] = normalise(df, 'hours', year=1997, industry_column='industry_id')
df_qi_97_to_23 = df_qi_97_to_23.set_index(['year', 'industry_id']).sort_index(level=['industry_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_qi_97_to_23['hours'] = normalise(df, 'hours', year=1997, industry_column='industry_id')


In [38]:
# aggregate industries

qi_variables = ['GO_QI', 'CAP_QI', 'LAB_QI', 'II_QI', 'hours']

df_qi_97_to_23 = aggregate_industries(df_qi_97_to_23)

qi_variables = ['GO_QI', 'CAP_QI', 'LAB_QI', 'II_QI']

df_qi_97_to_23 = df_qi_97_to_23.drop(columns=['VA'])

df_97_to_23_hours = df_qi_97_to_23['hours'].copy()

# 5. Chaining 1947-2016 with 1997-2023

In [39]:
df_47_to_96_hours = df_47_to_16_hours[df_47_to_16_hours.index.get_level_values('year').astype(int) <= 1997]
qi_variables = ['hours']
df_47_to_23_hours = chain(df_47_to_96_hours, df_97_to_23_hours, 1997)
qi_variables = ['GO_QI', 'CAP_QI', 'LAB_QI', 'II_QI']

In [40]:
df_qi_47_to_96 = df_qi_47_to_16[df_qi_47_to_16.index.get_level_values('year').astype(int) <= 1997]

df_qi_47_to_23 = chain(df_qi_47_to_96, df_qi_97_to_23, 1997)

df_qi_47_to_23 = df_qi_47_to_23.reset_index()

df_qi_47_to_23['year'] = df_qi_47_to_23['year'].astype(int)

for qi in qi_variables:
   df_qi_47_to_23[qi] = normalise(df_qi_47_to_23, qi, year=2009, industry_column='industry_id')

df_qi_47_to_23 = df_qi_47_to_23.set_index(['year', 'industry_id']).sort_index(level=['industry_id'])

df_47_to_23 = pd.merge(df_nominal_47_to_23[core_variables], df_qi_47_to_23[qi_variables],  left_index=True, right_index=True, how='inner').round(4)

In [41]:
df_47_to_23 = pd.merge(df_47_to_23, df_47_to_23_hours, left_index=True, right_index=True, how='inner')

# 6. Calculating constant values

In [42]:
# constant values

df_47_to_23 = df_47_to_23.reset_index()

for variable in core_variables:
    df_47_to_23[f'REAL_{variable}'] = constant_values(df_47_to_23, variable, 2009)

df_47_to_23 = df_47_to_23.set_index(['year', 'industry_id']).sort_index(level=['industry_id'])

df_47_to_23 = df_47_to_23[core_variables + constant_variables + qi_variables + ['hours']]

df = df_47_to_23.copy()

# 7. Building productivity indexes

### 7.1 Nominal Value Added
$$
P_{V A}(t) Q_{V A}(t) = P_Y (t) Q_Y (t) - P_{II} (t) Q_{II} (t)
$$

In [43]:
df['VA'] = df['GO'] - df['II']

Nominal Value Added share of Gross Output:
$$
\nu_{V A}(t) = \frac{P_{V A}(t) Q_{V A}(t)}{P_Y (t) Q_Y (t)}
$$

In [44]:
df['VA/GO'] = df['VA'] / df['GO']

### 7.2 Value Added index

To compute a value added quantity index, we start from the definition of a Tornqvist Quantity Index for total output $Y$

$$
\Delta \ln Q_Y (t) = \bar{\nu}_{V A} (t) \Delta \ln Q_{VA}(t)   + \bar{\nu}_{II} (t) \Delta \ln Q_{II} (t)
$$

where

$$
\Delta \ln X(t) = \ln X(t) - \ln X(t - 1)
$$

and

$$
\bar{\nu}_X(t) = 0.5 \times \left( \frac{P_X(t) Q_X(t)}{P_Y (t) Q_Y (t)} + \frac{P_X(t - 1) Q_X(t - 1)}{P_Y (t - 1) Q_Y (t - 1)} \right).
$$

In the formula above $\bar{\nu}_{VA}(t)$ and $\bar{\nu}_{II}(t)$ represent the Tornqvist weights for $(VA)$ and intermediate inputs $(II)$, respectively.

Re arranging terms, we get:

$$
\Delta \ln Q_{VA}(t) = \frac{\Delta \ln Q_Y (t) - \bar{\nu}_{II} (t) \Delta \ln Q_{II} (t)}{\bar{\nu}_{V A} (t)}
$$

#### 7.2.1 Compute log differences

In [45]:
df['ln_REAL_GO'] = np.log(df['REAL_GO'])
df['ln_REAL_II'] = np.log(df['REAL_II'])

df['delta_ln_REAL_GO'] = delta(df, 'ln_REAL_GO')
df['delta_ln_REAL_II'] = delta(df, 'ln_REAL_II')

#### 7.2.2 Compute the Tornqvist Output Weights

$$
\bar{\nu}_X(t) = 0.5 \times \left( \frac{P_X(t) Q_X(t)}{P_Y (t) Q_Y (t)} + \frac{P_X(t - 1) Q_X(t - 1)}{P_Y (t - 1) Q_Y (t - 1)} \right)
$$

where X is either nominal VA or nominal II

In [46]:
df['II/GO'] = df['II'] / df['GO']

df['VA/GO_lag'] = lag(df, 'VA/GO')
df['II/GO_lag'] = lag(df, 'II/GO')

df['VA_tornqvist_GO_share'] = 0.5 * (df['VA/GO'] + df['VA/GO_lag'])
df['II_tornqvist_GO_share'] = 0.5 * (df['II/GO'] + df['II/GO_lag'])

#### 7.2.3 Compute the log change of the Value Added quantity index

$$
\Delta \ln Q_{VA}(t) = \frac{\Delta \ln Q_Y (t) - \bar{\nu}_{II} (t) \Delta \ln Q_{II} (t)}{\bar{\nu}_{V A} (t)}
$$

In [47]:
df['delta_ln_VA_QI'] = ((df['delta_ln_REAL_GO'] - (df['II_tornqvist_GO_share']*df['delta_ln_REAL_II']))/df['VA_tornqvist_GO_share'])

### 7.3 Labour Productivity

For now, our measure of labor quantity is real compensation of employees. It follows that:
LP_VA:
$$
\Delta \ln LP(t) = \Delta \ln Q_{VA}(t) - \Delta \ln Q_L(t)
$$
LP_GO:
$$
\Delta \ln LP(t) = \Delta \ln Q_{GO}(t) - \Delta \ln Q_L(t)
$$

In [48]:
df['ln_REAL_LAB'] = np.log(df['REAL_LAB'])
df['delta_ln_REAL_LAB'] = delta(df, 'ln_REAL_LAB')

In [49]:
df['ln_hours'] = np.log(df['hours'])
df['delta_ln_hours'] = delta(df, 'ln_hours')

In [50]:
df['delta_ln_LP_VA'] = (df['delta_ln_VA_QI'] - df['delta_ln_hours'])
df['delta_ln_LP_GO'] = (df['delta_ln_REAL_GO'] - df['delta_ln_hours'])

### 7.4 Total Factor Productivity

Assuming that VA is produced by combining capital and labor services and TFP via a Tornqvist Index, we can back out log change in TFP using
$$
\Delta \ln TFP(t) = \Delta \ln Q_{VA}(t) - \bar{\psi}_L(t) \Delta \ln Q_L(t) - \bar{\psi}_K(t) \Delta \ln Q_K(t),
$$

where

$$
\bar{\psi}_X(t) = 0.5 \times \left( \frac{P_X(t) Q_X(t)}{P_{VA}(t) Q_{VA}(t)} + \frac{P_X(t-1) Q_X(t-1)}{P_{VA}(t-1) Q_{VA}(t-1)} \right)
$$

and X is either nominal LAB (L) or nominal CAP (K)

#### 7.4.1 Tornqvist VA and GO share

$$
\bar{\psi}_X(t) = 0.5 \times \left( \frac{P_X(t) Q_X(t)}{P_{VA}(t) Q_{VA}(t)} + \frac{P_X(t-1) Q_X(t-1)}{P_{VA}(t-1) Q_{VA}(t-1)} \right)
$$

where X is either nominal $LAB (L)$ or nominal $CAP (K)$

In [51]:
df['LAB/VA'] = df['LAB'] / df['VA']
df['CAP/VA'] = df['CAP'] / df['VA']
df['LAB/VA_lag'] = lag(df, 'LAB/VA')
df['CAP/VA_lag'] = lag(df, 'CAP/VA')
df['L_tornqvist_VA_share'] = 0.5 * (df['LAB/VA'] + df['LAB/VA_lag'])
df['CAP_tornqvist_VA_share'] = 0.5 * (df['CAP/VA'] + df['CAP/VA_lag'])

df['LAB/GO'] = df['LAB'] / df['GO']
df['CAP/GO'] = df['CAP'] / df['GO']
df['LAB/GO_lag'] = lag(df, 'LAB/GO')
df['CAP/GO_lag'] = lag(df, 'CAP/GO')
df['L_tornqvist_GO_share'] = 0.5 * (df['LAB/GO'] + df['LAB/GO_lag'])
df['CAP_tornqvist_GO_share'] = 0.5 * (df['CAP/GO'] + df['CAP/GO_lag'])

#### 7.4.2 Real capital; logged and log difference

In [52]:
df['ln_REAL_CAP'] = np.log(df['REAL_CAP'])
df['delta_ln_REAL_CAP'] = delta(df, 'ln_REAL_CAP')

#### 7.4.3 TFP growth rate
TFP_VA:
$$
\Delta \ln TFP(t) = \Delta \ln Q_{VA}(t) - \bar{\psi}_L(t) \Delta \ln Q_L(t) - \bar{\psi}_K(t) \Delta \ln Q_K(t)
$$

TFP_GO:
$$
\Delta \ln TFP(t) = \Delta \ln Q_{GO}(t) - \bar{\psi}_L(t) \Delta \ln Q_L(t) - \bar{\psi}_K(t) \Delta \ln Q_K(t) - \bar{\psi}_{II}(t) \Delta \ln Q_{II}(t)
$$

In [53]:
df['delta_ln_TFP_VA'] = (df['delta_ln_VA_QI'] - (df['L_tornqvist_VA_share']*df['delta_ln_REAL_LAB']) - (df['CAP_tornqvist_VA_share']*df['delta_ln_REAL_CAP']))

In [54]:
df['delta_ln_TFP_GO'] = (df['delta_ln_REAL_GO'] - (df)['L_tornqvist_GO_share']*df['delta_ln_REAL_LAB'] - (df['CAP_tornqvist_GO_share']*df['delta_ln_REAL_CAP']) - (df['II_tornqvist_GO_share']*df['delta_ln_REAL_II']))

### 7.5 Recover the TFP, LP and VA indexes, then normalise to 2009

In [55]:
# recover index function 

def recover_index(df, index_name):
    df['ln_' + index_name] = df.groupby('industry_id')['delta_ln_' + index_name].cumsum().fillna(0)
    df[index_name] = np.exp(df['ln_' + index_name])
    df[index_name] *= 100
    return df[index_name]

In [56]:
df = df.reset_index()

for variable in productivity_index_variables:
    df[variable] = recover_index(df, variable)
    df[variable] = normalise(df, variable, year=2009, industry_column='industry_id')
df['VA_QI'] = recover_index(df, 'VA_QI')
df['VA_QI'] = normalise(df, 'VA_QI', year=2009, industry_column='industry_id')

df = df.set_index(['year', 'industry_id']).sort_index(level=['industry_id'])

In [57]:
df = df[core_variables + ['VA'] + constant_variables + qi_variables + ['VA_QI'] + productivity_index_variables].round(4)
df_complete = df.copy()

In [58]:
df_complete.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,GO,CAP,LAB,II,VA,REAL_GO,REAL_CAP,REAL_LAB,REAL_II,GO_QI,CAP_QI,LAB_QI,II_QI,VA_QI,TFP_GO,TFP_VA,LP_GO,LP_VA
year,industry_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1947,1,31299.23,9744.228,10039.002,11516.0,19783.23,85420.7366,58072.4257,175614.4448,71655.418,28.0331,91.8022,350.6423,37.443,18.6973,34.2724,9.9743,4.7605,3.1752
1948,1,35560.695,11602.553,11443.142,12515.0,23045.695,93240.3033,62451.0323,169739.8335,71305.9727,30.5993,98.724,338.9127,37.2604,21.4984,37.0196,11.2514,5.2976,3.722
1949,1,29746.566,6988.223,11310.343,11448.0,18298.566,90564.3063,62630.1163,170840.071,70636.9362,29.7211,99.0071,341.1095,36.9108,20.6429,35.9716,10.7512,5.2764,3.6647
1950,1,31972.906,9193.447,10284.459,12495.0,19477.906,95692.0309,63096.6455,158602.6833,74804.0615,31.4039,99.7446,316.6756,39.0883,21.7807,38.0812,11.7999,5.8552,4.061
1951,1,37255.438,11954.302,10551.134,14750.0,22505.438,94865.6469,67334.6291,150626.2118,79642.3284,31.1327,106.4441,300.7493,41.6165,20.6164,36.6797,11.0925,6.0988,4.0387


# 8. Aggregated by economy, services and goods

In [59]:
def tqvist_byfactor_bycat(df, weighted_variables):
    df = df.copy()
    by_factor_df = pd.DataFrame()

    for variable in weighted_variables:  
        industries = df['industry_id'].unique()

        va_series = []
        ti_series = []

        for industry in industries:
            va_col_name = f'{industry}_VA'
            va_df = (df.loc[df['industry_id'] == industry][['year', 'VA']].rename(columns={'VA': va_col_name}).set_index('year'))

            ti_col_name = f'{industry}_{variable}'
            ti_df = (df.loc[df['industry_id'] == industry][['year', variable]].rename(columns={variable: ti_col_name}).set_index('year'))

            va_series.append(va_df)
            ti_series.append(ti_df)

        va_ti_df = pd.DataFrame()
        va_ti_df['year'] = df['year'].unique()
        va_ti_df = va_ti_df.sort_values('year').reset_index(drop=True)
        va_ti_df = pd.concat([va_ti_df.set_index('year')] + va_series + ti_series, axis=1).reset_index()

        q_vars = [col for col in va_ti_df.columns if col.endswith(variable)]
        v_vars = [col for col in va_ti_df.columns if col.endswith('VA')]
        va_ti_df[variable] = tornqvist_index(va_ti_df, q_vars, v_vars)

        if by_factor_df.empty:
            by_factor_df = va_ti_df[['year', variable]]
        else:
            by_factor_df = by_factor_df.merge(va_ti_df[['year', variable]], on='year', how='outer')

    return by_factor_df.set_index('year')

In [60]:
df_ew = df.copy().reset_index()
df_ew = df_ew[df_ew['year'].astype(int) >= 1963]

df_ew = df_ew[df_ew['industry_id'].between(1, 63)].copy()
df_goods = df_ew[df_ew['industry_id'].between(1, 26)].copy()
df_services = df_ew[df_ew['industry_id'].between(27, 63)].copy()
df_research_services = df_ew[df_ew['industry_id'].isin([39, 42, 43, 44, 47, 48, 49])].copy()
df_non_research_services = df_ew[df_ew['industry_id'].isin([27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 45, 46, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63])].copy()

weighted_variables = qi_variables + ['VA_QI'] + productivity_index_variables
sum_cols = core_variables + ['VA'] + constant_variables 

df_dict = {
    'total': df_ew,
    'goods': df_goods,
    'services': df_services,
    'research_services': df_research_services,
    'non_research_services': df_non_research_services
}

summed_data = {}
for key, df in df_dict.items():
    df_sum = df.copy()
    df_sum[sum_cols] = df_sum.groupby('year')[sum_cols].transform('sum')
    df_sum = df_sum[['year'] + sum_cols].drop_duplicates(subset='year').set_index('year')
    summed_data[key] = df_sum

for key in ['total', 'goods', 'services', 'research_services', 'non_research_services']:
    df_dict[key] = tqvist_byfactor_bycat(df_dict[key], weighted_variables)
    df_dict[key] = pd.merge(summed_data[key], df_dict[key], how='inner', left_index=True, right_index=True)

ew_df = df_dict['total']
goods_df = df_dict['goods']
services_df = df_dict['services']
research_services_df = df_dict['research_services']
non_research_services_df = df_dict['non_research_services']

for df in [ew_df, goods_df, services_df, research_services_df, non_research_services_df]:
    df.reset_index(inplace=True)
    for variable in weighted_variables:
        base_value = df.loc[df['year'] == 2009, variable].iloc[0]
        df[variable] = (df[variable] / base_value) * 100
    df.set_index(['year'], inplace=True)
    df[:] = df.round(4)

  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
  df[f'w_{col}'] = df[col] / total_v
 

# 9. Output

In [61]:
industries_info = (
    pd.DataFrame(list(industry_id_map.items()), columns=['industry_name', 'industry_id'])
    .sort_values('industry_id')
    .reset_index(drop=True))

variables_dictionary = {
    '*': 'Note: variables with * are normalised to 2009 = 100',
    'GO': 'Nominal Gross Output',
    'CAP': 'Nominal Capital',
    'LAB': 'Nominal Labor',
    'II': 'Nominal Intermediate Inputs',
    'VA': 'Nominal Value Added',
    'REAL_GO*': 'Real Gross Output',
    'REAL_CAP*': 'Real Capital',
    'REAL_LAB*': 'Real Labor',
    'REAL_II*': 'Real Intermediate Inputs',
    'GO_QI*': 'Gross Output Quantity Index',
    'CAP_QI*': 'Capital Quantity Index',
    'LAB_QI*': 'Labor Quantity Index',
    'II_QI*': 'Intermediate Inputs Quantity Index',
    'VA_QI*': 'Value Added Quantity Index',
    'TFP_GO*': 'Total Factor Productivity Index (GO)',
    'TFP_VA*': 'Total Factor Productivity Index (VA)',
    'LP_GO*': 'Labor Productivity Index (GO)',
    'LP_VA*': 'Labor Productivity Index (VA)',
}

variables_info = pd.DataFrame(
    list(variables_dictionary.items()), 
    columns=['variable_name', 'variable_description'])

aggregate_industries_dictionary = {
    'Period': '1947-2023',
    '2936': 'Industries 29-36',
    '3740': 'Industries 37-40',
    '4144': 'Industries 41-44',
    '4749': 'Industries 47-49',
    '5152': 'Industries 51-52',
    '5456': 'Industries 54-56',
    '5758': 'Industries 57-58'
}

aggregate_industries_info = pd.DataFrame(
    list(aggregate_industries_dictionary.items()),
    columns=['industry_group', 'industry_ids']
)

file_path = os.path.join(export_file_path, "df_47_to_23.xlsx")

with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
    industries_info.to_excel(writer, sheet_name='Info', index=False, startrow=1, startcol=1)
    aggregate_industries_info.to_excel(writer, sheet_name='Info', index=False, startrow=1, startcol=4)
    variables_info.to_excel(writer, sheet_name='Info', index=False, startrow=1, startcol=7)
    df_complete.to_excel(writer, sheet_name='Data', index=True)
    ew_df.to_excel(writer, sheet_name='Aggregate', index=True)
    goods_df.to_excel(writer, sheet_name='Goods_Aggregate', index=True)
    services_df.to_excel(writer, sheet_name='Services_Aggregate', index=True)
    research_services_df.to_excel(writer, sheet_name='Research_Services_Aggregate', index=True)
    non_research_services_df.to_excel(writer, sheet_name='Non_Research_Services_Aggregate', index=True)