In [337]:
# ONS FIRM LEVEL PRODUCTIVITY AND DYNAMISM BULLETIN - GRAPHS
# Will Shepherd, 8th Dec 2025

import pandas as pd
import geopandas as gpd
import numpy as np
import altair as alt
from pandas.api.types import CategoricalDtype
import os
import eco_style 
import requests
import json
import io
alt.themes.enable("light")

ThemeRegistry.enable('light')

In [338]:
# Function to import and clean productivity data
def process_firmlevel_prod_data(
        file_path: str,
        sheet_name: str,
        id_vars: list[str]) -> pd.DataFrame:
    """
    Reads an Excel sheet, cleans the header, renames statistic colmns, and pivots the data from wide to long for plotting

    Args:
    file_path (str): Path to the Excel file.
    sheet_name (str): Name of the sheet to read.
    id_vars (list[str]): A list of column names that should remain as identifier variable in long format.

    Returns:
    pd.DataFrame: the processed and pivoted DataFrame
    """
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=3)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return pd.DataFrame()
    except Exception as e:
        print(f"An error occurred while reading the Excel file: {e}")
        return pd.DataFrame()
    
    # Drop the first row
    df = df.iloc[1:]

    new_columns = []
    for col in df.columns:
        if ' KP EW' in str(col).upper():
            new_name = str(col).split(' ')[0]
            new_columns.append(new_name.capitalize())
        else:
            new_columns.append(col)
            
    df.columns = new_columns

    value_vars = ['P10','P25','P50','P75','P90','Mean']

    df_long = pd.melt(
        df,
        id_vars=id_vars,
        value_vars=value_vars,
        var_name='Decile',
        value_name='aGVA per worker (£) KP EW'
    )
    return df_long


In [371]:
# Function to import and clean productivity data
def process_dynamism_data(
        file_path: str,
        sheet_name: str,
        id_vars: list[str],
        skiprows: int) -> pd.DataFrame:
    """
    Reads an Excel sheet, cleans the header, renames statistic colmns, and pivots the data from wide to long for plotting

    Args:
    file_path (str): Path to the Excel file.
    sheet_name (str): Name of the sheet to read.
    id_vars (list[str]): A list of column names that should remain as identifier variable in long format.

    Returns:
    pd.DataFrame: the processed and pivoted DataFrame
    """
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=skiprows)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return pd.DataFrame()
    except Exception as e:
        print(f"An error occurred while reading the Excel file: {e}")
        return pd.DataFrame()
    
    # Drop the first row
    df = df.iloc[1:]

    new_columns = []
    for col in df.columns:
        if ' KP EW' in str(col).upper():
            new_name = str(col).split(' ')[0]
            new_columns.append(new_name.capitalize())
        else:
            new_columns.append(col)
            
    df.columns = new_columns

    value_vars = ['P10','P25','P50','P75','P90','Mean']

    df_long = pd.melt(
        df,
        id_vars=id_vars,
        value_vars=value_vars,
        var_name='Decile',
        value_name='aGVA per worker (£) KP EW'
    )
    return df_long


In [379]:
df = pd.read_excel('annualbusinessdynamism20012024.xlsx', sheet_name='Table 5', skiprows=7)

In [380]:
df

Unnamed: 0,Year,Total,Entering businesses,Incumbent growing,Total.1,Exiting businesses,Incumbent shrinking,Net flow,Entering and Exiting,Incumbents,Total .1,Entering and Exiting.1,Incumbents.1
0,2001,4462501,1283478,3179023,3983110,1307909,2675201,479391,-24431,503822,8445611,2591387,5854224
1,2002,3640846,862110,2778736,3562814,1074036,2488778,78032,-211926,289958,7203660,1936146,5267514
2,2003,3874304,1660925,2213379,3385336,1236683,2148653,488968,424242,64726,7259640,2897608,4362032
3,2004,4062993,1192540,2870453,3616979,1147915,2469064,446014,44625,401389,7679972,2340455,5339517
4,2005,3758062,1163405,2594657,3565430,1454295,2111135,192632,-290890,483522,7323492,2617700,4705792
5,2006,3404912,1096556,2308356,3384798,1078622,2306176,20114,17934,2180,6789710,2175178,4614532
6,2007,3459106,1153418,2305688,3028644,936699,2091945,430462,216719,213743,6487750,2090117,4397633
7,2008,3186470,999152,2187318,3187989,1062340,2125649,-1519,-63188,61669,6374459,2061492,4312967
8,2009,3044832,859217,2185615,3069737,1050298,2019439,-24905,-191081,166176,6114569,1909515,4205054
9,2010,2999891,1042287,1957604,3478298,1088308,2389990,-478407,-46021,-432386,6478189,2130595,4347594


In [387]:
# Function to import and process dynamism data
def process_dynamism_data(
        file_path: str,
        sheet_name: str,
        id_vars: list[str],
        skiprows: int) -> pd.DataFrame:
    
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=skiprows)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return pd.DataFrame()
    except Exception as e:
        print(f"An error occurred while reading the Excel file: {e}")
        return pd.DataFrame()
    
    # Rename columns
    df = df.rename(columns={'Total':'Total job creation',
                            'Entering businesses':'Job creation (entrants)',
                            'Incumbent growing':'Job creation (incumbents)',
                            'Total ':'Total job destruction',
                            'Exiting businesses':'Job destruction (exiters)',
                            'Incumbent shrinking':'Job destruction (incumbents)',
                            'Net flow':'Net job creation',
                            'Entering and Exiting':'Entry and exit jobs',
                            'Total .1':'Total job reallocation',
                            'Entering and Exiting.1':'Entry and exit reallocation',
                            'Incumbents.1':'Incumbent reallocation'
                            })

    df_long = pd.melt(
        df,
        id_vars=id_vars,
        var_name='Measure',
        value_name='Jobs'
    )

    return df_long  

In [340]:
firm_level_prod_we = process_firmlevel_prod_data(
    'abslabourproductivitydatapack19972023.xlsx',
    'Table 1',
    id_vars='Year')

firm_level_prod_firmsize = process_firmlevel_prod_data(
    'abslabourproductivitydatapack19972023.xlsx',
    'Table 4',
    id_vars=['Year','Employment Band'])

firm_level_prod_industry = process_firmlevel_prod_data(
    'abslabourproductivitydatapack19972023.xlsx',
    'Table 6',
    id_vars=['Year','Industry'])

firm_level_prod_2dig_industry = process_firmlevel_prod_data(
    'abslabourproductivitydatapack19972023.xlsx',
    'Table 8',
    id_vars=['Year','Industry'])

In [343]:
# Function to import and process dynamism data
def process_dynamism_rates_data(
        file_path: str,
        sheet_name: str,
        id_vars: list[str],
        skiprows: int) -> pd.DataFrame:
    
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=skiprows)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return pd.DataFrame()
    except Exception as e:
        print(f"An error occurred while reading the Excel file: {e}")
        return pd.DataFrame()
    
    # Rename columns
    df = df.rename(columns={'Total':'Total_JC_Rate',
                            'Entering businesses':'Entrants_JC_Rate',
                            'Incumbent growing':'Incumbent_JC_Rate',
                            'Total ':'Total_JD_Rate',
                            'Exiting businesses':'Exiters_JD_Rate',
                            'Incumbent shrinking':'Incumbent_JD_Rate',
                            'Net flow':'Net_JC_Rate',
                            'Entering and Exiting':'Enter_Exit_JC_Rate',
                            'Total .1':'Total_Reallocation_Rate',
                            'Entering and Exiting.1':'Enter_Exit_Reallocation_Rate',
                            'Incumbents.1':'Incumbent_Reallocation_Rate'
                            })

    df_long = pd.melt(
        df,
        id_vars=id_vars,
        var_name='Measure',
        value_name='Rate'
    )

    return df_long  

In [389]:
dynamism_we

Unnamed: 0,Year,Measure,Jobs
0,2001,Total job creation,4462501
1,2002,Total job creation,3640846
2,2003,Total job creation,3874304
3,2004,Total job creation,4062993
4,2005,Total job creation,3758062
...,...,...,...
283,2020,Incumbent reallocation,4067749
284,2021,Incumbent reallocation,3971469
285,2022,Incumbent reallocation,4603534
286,2023,Incumbent reallocation,4734315


In [388]:
dynamism_we = process_dynamism_data(
    'annualbusinessdynamism20012024.xlsx',
    'Table 5',
    id_vars='Year',
    skiprows=7)

firmsize_dynamism = process_dynamism_data(
    'annualbusinessdynamism20012024.xlsx',
    'Table 10',
    id_vars=['Year','Firm size'],
    skiprows=8)

firmage_dynamism = process_dynamism_data(
    'annualbusinessdynamism20012024.xlsx',
    'Table 13',
    id_vars=['Year','Firm age'],
    skiprows=9)

firmagesize_dynamism = process_dynamism_data(
    'annualbusinessdynamism20012024.xlsx',
    'Table 16',
    id_vars=['Year','Firm size','Firm age'],
    skiprows=9)

industry_dynamism = process_dynamism_data(
    'annualbusinessdynamism20012024.xlsx',
    'Table 19',
    id_vars=['Year','Industry classification'],
    skiprows=8)


In [391]:
we_dynamism_rates = process_dynamism_rates_data(
    'annualbusinessdynamism20012024.xlsx',
    'Table 6',
    id_vars='Year',
    skiprows=8
)

In [346]:
# Plot whole economy distribution of firm level productivity

chart = alt.Chart(firm_level_prod_we).mark_line().encode(
    x=alt.X('Year:O', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('aGVA per worker (£) KP EW:Q'),
    color=alt.Color('Decile:N', legend=None),
    tooltip=['Year:O','aGVA per worker (£) KP EW:Q','Decile:N']
)

# Add end labels
end_point = firm_level_prod_we.groupby('Decile')['Year'].idxmax()

end_point_data = firm_level_prod_we.loc[end_point]

text_labels = alt.Chart(end_point_data).mark_text(
    align='left',     
    dx=10,           
    baseline='middle'
).encode(
    x=alt.X('Year:O'),         
    y=alt.Y('aGVA per worker (£) KP EW:Q'),   
    text='Decile:O',
    color=alt.Color('Decile:O')
)

chart = chart + text_labels
chart
chart.save('we_gva_per_worker.png', scale_factor=2)
chart.save('we_gva_per_worker.json', scale_factor=2)


In [224]:
# Plot 90th-10th percentile ratio

ratio_90_10 = firm_level_prod_we[firm_level_prod_we['Decile'].isin(['P90','P10'])].pivot(
    index='Year',
    columns='Decile',
    values='aGVA per worker (£) KP EW'
)
ratio_90_10['P90_P10_ratio'] = ratio_90_10['P90'] / ratio_90_10['P10']

ratio_90_10

chart = alt.Chart(ratio_90_10.reset_index()).mark_line().encode(
    x=alt.X('Year:O', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('P90_P10_ratio:Q', title='90th to 10th Percentile Ratio'),)

chart

In [None]:
# Plot 90th-10th percentile ratio 

ratio_90_10 = firm_level_prod_we[firm_level_prod_we['Decile'].isin(['P90','P10'])].pivot(
    index='Year',
    columns='Decile',
    values='aGVA per worker (£) KP EW'
)
ratio_90_10['P90_P10_ratio'] = ratio_90_10['P90'] / ratio_90_10['P10']

ratio_90_10

chart = alt.Chart(ratio_90_10.reset_index()).mark_line().encode(
    x=alt.X('Year:O', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('P90_P10_ratio:Q', title='90th to 10th Percentile Ratio'),)

chart

In [None]:
# Plot the 90th and 10th percentile

ratio_90_10 = firm_level_prod_we[firm_level_prod_we['Decile'].isin(['P90','P10'])].pivot(
    index='Year',
    columns='Decile',
    values='aGVA per worker (£) KP EW'
)
ratio_90_10['P90_P10_ratio'] = ratio_90_10['P90'] / ratio_90_10['P10']

ratio_90_10

chart = alt.Chart(ratio_90_10.reset_index()).mark_line().encode(
    x=alt.X('Year:O', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('P90_P10_ratio:Q', title='90th to 10th Percentile Ratio'),)

chart

In [225]:
# Plot 90th-50TH percentile ratio

ratio_90_50 = firm_level_prod_we[firm_level_prod_we['Decile'].isin(['P90','P50'])].pivot(
    index='Year',
    columns='Decile',
    values='aGVA per worker (£) KP EW'
)
ratio_90_50['P90_P50_ratio'] = ratio_90_50['P90'] / ratio_90_50['P50']

ratio_90_50

chart = alt.Chart(ratio_90_50.reset_index()).mark_line().encode(
    x=alt.X('Year:O', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('P90_P50_ratio:Q', title='90th to 50th Percentile Ratio'),)

chart

In [None]:
#Plot the change 90th percentile across industries 2022-2023
industry_p90 = firm_level_prod_industry[firm_level_prod_industry['Decile'].isin(['P90'])]
industry_p90['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p90['aGVA per worker (£) KP EW'], errors='coerce')
industry_p90['ln_aGVA_per_worker'] = industry_p90['aGVA per worker (£) KP EW'].apply(np.log)
industry_p90['percent change in aGVA_per_worker'] = industry_p90.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
industry_p90['Difference in aGVA_per_worker'] = industry_p90.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
industry_p90_2023 = industry_p90[industry_p90['Year'] == 2023]

chart = alt.Chart(industry_p90_2023).mark_bar().encode(
    x=alt.X('Industry:N', sort='-y'),
    y=alt.Y('Difference in aGVA_per_worker:Q', title='Difference in 90th percentile aGVA per worker 2022-2023'),
    tooltip=['Year:O','Industry:N','Difference in aGVA_per_worker:Q']
)

chart
#chart.save('P90_change_industries_2022_2023.png', scale_factor=2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p90['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p90['aGVA per worker (£) KP EW'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p90['ln_aGVA_per_worker'] = industry_p90['aGVA per worker (£) KP EW'].apply(np.log)
  industry_p90['percent change in aGVA_per_worker'] = industry_p90.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [256]:
# Try an average of P90 change over 2020-2023, the period in which we have seen increase in P90
industry_p90 = firm_level_prod_industry[firm_level_prod_industry['Decile'].isin(['P90'])]
industry_p90['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p90['aGVA per worker (£) KP EW'], errors='coerce')
industry_p90['ln_aGVA_per_worker'] = industry_p90['aGVA per worker (£) KP EW'].apply(np.log)
industry_p90['percent change in aGVA_per_worker'] = industry_p90.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
industry_p90['Difference in aGVA_per_worker'] = industry_p90.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100

industry_p90_2020_2023 = industry_p90[
    (industry_p90['Year'] >= 2020) &
    (industry_p90['Year'] <= 2023)
]
industry_p90_2020_2023['avg_percent_change_by_industry_2020_2023'] = industry_p90_2020_2023.groupby('Industry')['percent change in aGVA_per_worker'].transform('mean')

industry_p90_2020_2023 = industry_p90_2020_2023[industry_p90_2020_2023['Year']==2023]

chart = alt.Chart(industry_p90_2020_2023).mark_bar().encode(
    x=alt.X('Industry:N', sort='-y'),
    y=alt.Y('avg_percent_change_by_industry_2020_2023:Q', title='Average % change in 90th percentile aGVA per worker 2020-2023'),
    tooltip=['Year:O','Industry:N','avg_percent_change_by_industry_2020_2023:Q']
)

chart

chart.save('P90_change_industries_2020_2023.png', scale_factor=2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p90['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p90['aGVA per worker (£) KP EW'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p90['ln_aGVA_per_worker'] = industry_p90['aGVA per worker (£) KP EW'].apply(np.log)
  industry_p90['percent change in aGVA_per_worker'] = industry_p90.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [255]:
# Try an average of p50 change over 2020-2023, the period in which we have seen increase in P90
industry_p50 = firm_level_prod_industry[firm_level_prod_industry['Decile'].isin(['P50'])]
industry_p50['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p50['aGVA per worker (£) KP EW'], errors='coerce')
industry_p50['ln_aGVA_per_worker'] = industry_p50['aGVA per worker (£) KP EW'].apply(np.log)
industry_p50['percent change in aGVA_per_worker'] = industry_p50.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
industry_p50['Difference in aGVA_per_worker'] = industry_p50.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100

industry_p50_2020_2023 = industry_p50[
    (industry_p50['Year'] >= 2020) &
    (industry_p50['Year'] <= 2023)
]
industry_p50_2020_2023['avg_percent_change_by_industry_2020_2023'] = industry_p50_2020_2023.groupby('Industry')['percent change in aGVA_per_worker'].transform('mean')

industry_p50_2020_2023 = industry_p50_2020_2023[industry_p50_2020_2023['Year']==2023]

chart = alt.Chart(industry_p50_2020_2023).mark_bar().encode(
    x=alt.X('Industry:N', sort='-y'),
    y=alt.Y('avg_percent_change_by_industry_2020_2023:Q', title='Average % change in 50th percentile aGVA per worker 2020-2023'),
    tooltip=['Year:O','Industry:N','avg_percent_change_by_industry_2020_2023:Q']
)

chart

#chart.save('P90_change_industries_2022_2023.png', scale_factor=2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p50['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p50['aGVA per worker (£) KP EW'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p50['ln_aGVA_per_worker'] = industry_p50['aGVA per worker (£) KP EW'].apply(np.log)
  industry_p50['percent change in aGVA_per_worker'] = industry_p50.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [None]:
# Try an average of P10 change over 2020-2023, the period in which we have seen increase in P90
industry_p10 = firm_level_prod_industry[firm_level_prod_industry['Decile'].isin(['P10'])]
industry_p10['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p10['aGVA per worker (£) KP EW'], errors='coerce')
industry_p10['ln_aGVA_per_worker'] = industry_p10['aGVA per worker (£) KP EW'].apply(np.log)
industry_p10['percent change in aGVA_per_worker'] = industry_p10.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
industry_p10['Difference in aGVA_per_worker'] = industry_p10.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100

industry_p10_2020_2023 = industry_p10[
    (industry_p10['Year'] >= 2020) &
    (industry_p10['Year'] <= 2023)
]
industry_p10_2020_2023['avg_difference_by_industry_2020_2023'] = industry_p10_2020_2023.groupby('Industry')['percent change in aGVA_per_worker'].transform('mean')

industry_p10_2020_2023 = industry_p10_2020_2023[industry_p10_2020_2023['Year']==2023]

chart = alt.Chart(industry_p10_2020_2023).mark_bar().encode(
    x=alt.X('Industry:N', sort='-y'),
    y=alt.Y('avg_percent_change_by_industry_2020_2023:Q', title='Average % change in 10th percentile aGVA per worker 2020-2023'),
    tooltip=['Year:O','Industry:N','avg_percent_change_by_industry_2020_2023:Q']
)

chart

#chart.save('P10_change_industries_2020_2023.png', scale_factor=2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p10['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p10['aGVA per worker (£) KP EW'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p10['ln_aGVA_per_worker'] = industry_p10['aGVA per worker (£) KP EW'].apply(np.log)
  industry_p10['percent change in aGVA_per_worker'] = industry_p10.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [259]:
# Plot historical 90th percentile across industries
#Plot the change 90th percentile across industries 2022-2023
industry_p90 = firm_level_prod_industry[firm_level_prod_industry['Decile'].isin(['P90'])]
industry_p90['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p90['aGVA per worker (£) KP EW'], errors='coerce')
industry_p90['ln_aGVA_per_worker'] = industry_p90['aGVA per worker (£) KP EW'].apply(np.log)
industry_p90['percent change in aGVA_per_worker'] = industry_p90.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
industry_p90['Difference in aGVA_per_worker'] = industry_p90.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100

# A (agriculture, forestry and fishing) and B(mining and quarrying) and D (electricity, gas, steam and air conditioning supply)
# are volatile and have very high aGVA per worker - exclude
industry_p90 = industry_p90[~industry_p90['Industry'].isin(['B','D'])]

# This is a noisy graph, not for document. Here select industry to focus
industry_p90_industry_select = industry_p90[industry_p90['Industry'].isin(['R','S','J','N','F','P','I'])]


chart = alt.Chart(industry_p90_industry_select).mark_line().encode(
    x=alt.X('Year:N', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('aGVA per worker (£) KP EW:Q', title='90th percentile aGVA per worker 1997-2023'),
    color=alt.Color('Industry:N'),
    tooltip=['Year:O','Industry:N','aGVA per worker (£) KP EW:Q']
)

chart
chart.save('frontier_p90_1997_2023.png', scale_factor=2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p90['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p90['aGVA per worker (£) KP EW'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p90['ln_aGVA_per_worker'] = industry_p90['aGVA per worker (£) KP EW'].apply(np.log)
  industry_p90['percent change in aGVA_per_worker'] = industry_p90.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [260]:
# Plot historical 50TH percentile across industries who saw big increase in P90
industry_p50 = firm_level_prod_industry[firm_level_prod_industry['Decile'].isin(['P50'])]
industry_p50['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p50['aGVA per worker (£) KP EW'], errors='coerce')
industry_p50['ln_aGVA_per_worker'] = industry_p50['aGVA per worker (£) KP EW'].apply(np.log)
industry_p50['percent change in aGVA_per_worker'] = industry_p50.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
industry_p50['Difference in aGVA_per_worker'] = industry_p50.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100

# A (agriculture, forestry and fishing) and B(mining and quarrying) and D (electricity, gas, steam and air conditioning supply)
# are volatile and have very high aGVA per worker - exclude
industry_p50 = industry_p50[~industry_p50['Industry'].isin(['B','D'])]

# This is a noisy graph, not for document. Here select industry to focus
industry_p50_industry_select = industry_p50[industry_p50['Industry'].isin(['R','S','J','N','F','P','I'])]


chart = alt.Chart(industry_p50_industry_select).mark_line().encode(
    x=alt.X('Year:N', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('aGVA per worker (£) KP EW:Q', title='50th percentile aGVA per worker 1997-2023'),
    color=alt.Color('Industry:N'),
    tooltip=['Year:O','Industry:N','aGVA per worker (£) KP EW:Q']
)

chart
chart.save('frontier_p50_1997_2023.png', scale_factor=2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p50['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p50['aGVA per worker (£) KP EW'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p50['ln_aGVA_per_worker'] = industry_p50['aGVA per worker (£) KP EW'].apply(np.log)
  industry_p50['percent change in aGVA_per_worker'] = industry_p50.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [263]:
# Dispersion of P90-P50 over industries over time
industry_90_50 = firm_level_prod_industry[firm_level_prod_industry['Decile'].isin(['P90','P50'])].pivot(
    index=['Year','Industry'],
    columns='Decile',
    values='aGVA per worker (£) KP EW'
)

# Convert [c] to NA and columns to numeric
industry_90_50 = industry_90_50.replace('[c]', np.nan)
industry_90_50['P90_P50_ratio'] = industry_90_50['P90'] / industry_90_50['P50']

chart = alt.Chart(industry_90_50.reset_index()).mark_line().encode(
    x=alt.X('Year:O', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('P90_P50_ratio:Q', title='90th to 50th Percentile Ratio'),
    facet=alt.Facet('Industry:N', columns=2)).resolve_scale(
    y='independent'
)

chart

  industry_90_50 = industry_90_50.replace('[c]', np.nan)


In [None]:
# Calculate averages of dispersion over 2020-2023, the period in which we have seen increase in P90


In [226]:
industry_p90[industry_p90['Industry']=='R']

Unnamed: 0,Year,Industry,Decile,aGVA per worker (£) KP EW,ln_aGVA_per_worker,percent change in aGVA_per_worker,Difference in aGVA_per_worker
2241,1997,R,P90,102500.0,11.537618,,
2242,1998,R,P90,96500.0,11.477298,-5.853659,-5.853659
2243,1999,R,P90,98000.0,11.492723,1.554404,1.554404
2244,2000,R,P90,91500.0,11.424094,-6.632653,-6.632653
2245,2001,R,P90,83000.0,11.326596,-9.289617,-9.289617
2246,2002,R,P90,95000.0,11.461632,14.457831,14.457831
2247,2003,R,P90,108000.0,11.589887,13.684211,13.684211
2248,2004,R,P90,96000.0,11.472103,-11.111111,-11.111111
2249,2005,R,P90,101500.0,11.527814,5.729167,5.729167
2250,2006,R,P90,97000.0,11.482466,-4.433498,-4.433498


In [None]:
#Plot the change in 90th percentile across 2 digit industries 2022-2023
industry_p90 = firm_level_prod_2dig_industry[firm_level_prod_2dig_industry['Decile'].isin(['P90'])]
industry_p90['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p90['aGVA per worker (£) KP EW'], errors='coerce')
industry_p90['ln_aGVA_per_worker'] = industry_p90['aGVA per worker (£) KP EW'].apply(np.log)
industry_p90['percent change in aGVA_per_worker'] = industry_p90.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
industry_p90_2023 = industry_p90[industry_p90['Year'] == 2023]

chart = alt.Chart(industry_p90_2023).mark_bar().encode(
    x=alt.X('percent change in aGVA_per_worker:Q'),
    y=alt.Y('Industry:N', sort='-x'),
    tooltip=['Year:O','Industry:N','percent change in aGVA_per_worker:Q']
)

chart

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p90['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p90['aGVA per worker (£) KP EW'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p90['ln_aGVA_per_worker'] = industry_p90['aGVA per worker (£) KP EW'].apply(np.log)
  industry_p90['percent change in aGVA_per_worker'] = industry_p90.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [191]:
#Plot the 10th percentile across industries
industry_p10 = firm_level_prod_industry[firm_level_prod_industry['Decile'].isin(['P10'])]
industry_p10['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p10['aGVA per worker (£) KP EW'], errors='coerce')
industry_p10['ln_aGVA_per_worker'] = industry_p10['aGVA per worker (£) KP EW'].apply(np.log)
industry_p10['percent change in aGVA_per_worker'] = industry_p10.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
industry_p10['Difference in aGVA_per_worker'] = industry_p10.groupby('Industry')['aGVA per worker (£) KP EW'].diff()
industry_p10_2023 = industry_p10[industry_p10['Year'] == 2023]

# Section D massive outlier so remove from chart
industry_p10_2023 = industry_p10_2023[industry_p10_2023['Industry'] != 'D']

chart = alt.Chart(industry_p10_2023).mark_bar().encode(
    x=alt.X('Industry:N', sort='-y'),
    y=alt.Y('percent change in aGVA_per_worker:Q', title='Percent change in 10th percentile aGVA per worker 2022-2023'),
    tooltip=['Year:O','Industry:N','percent change in aGVA_per_worker:Q']
)
chart = alt.Chart(industry_p10_2023).mark_bar().encode(
    x=alt.X('Industry:N', sort='-y'),
    y=alt.Y('Difference in aGVA_per_worker:Q', title='Difference in 10th percentile aGVA per worker 2022-2023'),
    tooltip=['Year:O','Industry:N','Difference in aGVA_per_worker:Q']
)
chart

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p10['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_p10['aGVA per worker (£) KP EW'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_p10['ln_aGVA_per_worker'] = industry_p10['aGVA per worker (£) KP EW'].apply(np.log)
  industry_p10['percent change in aGVA_per_worker'] = industry_p10.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [194]:
# Just print section A
industry_p10[industry_p10['Industry']=='C']

Unnamed: 0,Year,Industry,Decile,aGVA per worker (£) KP EW,ln_aGVA_per_worker,percent change in aGVA_per_worker,Difference in aGVA_per_worker
54,1997,C,P10,6000.0,8.699515,,
55,1998,C,P10,6500.0,8.779557,8.333333,500.0
56,1999,C,P10,6500.0,8.779557,0.0,0.0
57,2000,C,P10,8000.0,8.987197,23.076923,1500.0
58,2001,C,P10,9500.0,9.159047,18.75,1500.0
59,2002,C,P10,11000.0,9.305651,15.789474,1500.0
60,2003,C,P10,13000.0,9.472705,18.181818,2000.0
61,2004,C,P10,15500.0,9.648595,19.230769,2500.0
62,2005,C,P10,15000.0,9.615805,-3.225806,-500.0
63,2006,C,P10,17000.0,9.740969,13.333333,2000.0


In [205]:
# MEAN LABOUR PRODUCTIVITY ACROSS INDUSTRIES IN 2023
industry_mean = firm_level_prod_industry[firm_level_prod_industry['Decile'].isin(['Mean'])]
industry_mean['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_mean['aGVA per worker (£) KP EW'], errors='coerce')
industry_mean['ln_aGVA_per_worker'] = industry_mean['aGVA per worker (£) KP EW'].apply(np.log)
industry_mean['percent change in aGVA_per_worker'] = industry_mean.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
industry_mean['Difference in aGVA_per_worker'] = industry_mean.groupby('Industry')['aGVA per worker (£) KP EW'].diff()
industry_mean_2023 = industry_mean[industry_mean['Year'] == 2023]

chart = alt.Chart(industry_mean_2023).mark_bar().encode(
    x=alt.X('aGVA per worker (£) KP EW:Q'),
    y=alt.Y('Industry:N', sort='-x'))

chart

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_mean['aGVA per worker (£) KP EW'] = pd.to_numeric(industry_mean['aGVA per worker (£) KP EW'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  industry_mean['ln_aGVA_per_worker'] = industry_mean['aGVA per worker (£) KP EW'].apply(np.log)
  industry_mean['percent change in aGVA_per_worker'] = industry_mean.groupby('Industry')['aGVA per worker (£) KP EW'].pct_change() * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [390]:
dynamism_we

Unnamed: 0,Year,Measure,Jobs
0,2001,Total job creation,4462501
1,2002,Total job creation,3640846
2,2003,Total job creation,3874304
3,2004,Total job creation,4062993
4,2005,Total job creation,3758062
...,...,...,...
283,2020,Incumbent reallocation,4067749
284,2021,Incumbent reallocation,3971469
285,2022,Incumbent reallocation,4603534
286,2023,Incumbent reallocation,4734315


In [396]:
we_dynamism_rates.groupby('Measure').sum()

Unnamed: 0_level_0,Year,Rate
Measure,Unnamed: 1_level_1,Unnamed: 2_level_1
Enter_Exit_JC_Rate,48300,0.0981
Enter_Exit_Reallocation_Rate,48300,1.7757
Incumbent_Reallocation_Rate,48300,3.5144
Incumbents,48300,0.1364
Job creation rate (Entering firms),48300,0.9369
Job creation rate (Incumbent firms),48300,1.8254
Job destruction rate (Exiting firms),48300,0.8388
Job destruction rate (Incumbent firms),48300,1.689
Net_JC_Rate,48300,0.2345
Total_JC_Rate,48300,2.7623


In [397]:
we_jc_jd

Unnamed: 0,Year,Measure,Jobs,sort_order


In [412]:
# Whole economy dynamism

# Rename rates for plotting
replacement_map = {
    'Entrants_JC_Rate': 'Job creation rate (Entering firms)',
    'Incumbent_JC_Rate': 'Job creation rate (Incumbent firms)',
    'Exiters_JD_Rate': 'Job destruction rate (Exiting firms)',
    'Incumbent_JD_Rate':'Job destruction rate (Incumbent firms)'
}

# Apply the replacement to the 'measures' column
we_dynamism_rates['Measure'] = we_dynamism_rates['Measure'].replace(replacement_map)

# Define colours for variables and order for plotting
domain_vars = [
    'Job creation rate (Entering firms)',
    'Job creation rate (Incumbent firms)',
    'Job destruction rate (Exiting firms)',
    'Job destruction rate (Incumbent firms)'
]

variable_order_map = {
    'Job creation rate (Entering firms)': 4,
    'Job creation rate (Incumbent firms)': 3,
    'Job destruction rate (Exiting firms)': 2,
    'Job destruction rate (Incumbent firms)': 1
}

we_dynamism_rates['sort_order'] = we_dynamism_rates['Measure'].map(variable_order_map)

range_colors = [
    "#179FDB", 
    "#0063AF",  
    '#E54753',
    '#ff7f0e',  
]


we_jc_jd = we_dynamism_rates[we_dynamism_rates['Measure'].isin([
    'Job creation rate (Entering firms)',
    'Job creation rate (Incumbent firms)',
    'Job destruction rate (Incumbent firms)',
    'Job destruction rate (Exiting firms)'
])]

chart = alt.Chart(we_jc_jd).mark_bar().encode(
    x=alt.X('Year:O', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('Rate:Q', axis=alt.Axis(format='%'), title=None),
    color=alt.Color('Measure', scale=alt.Scale(domain=domain_vars, range=range_colors)),
        order=alt.Order('sort_order:Q'),
tooltip=['Year:O','Rate:Q','Measure:N'],
).properties(
    title={
        "text":"Job creation and destruction by UK firms 2001-2024",  # Main Title
        "subtitle": "ONS, Longitudinal Business Database",
        "subtitleFontStyle": "italic", # Optional styling
        "subtitleFontSize": 12 # Subtitle
    }
)


chart
chart.save('Charts/aggregate_jc_jd.png', scale_factor=2)
chart.save('Charts/aggregate_jc_jd.json')

In [136]:
# FIRM SIZE
firmsize_jc_jd = firmsize_dynamism[firmsize_dynamism['Measure'].isin([
    'Entrants_JC_Rate',
    'Incumbent_JC_Rate',
    'Exiters_JD_Rate',
    'Incumbent_JD_Rate'
])]

chart = alt.Chart(firmsize_jc_jd).mark_bar().encode(
    x=alt.X('Year:O', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('Rate:Q', axis=alt.Axis(format='%')),
    color=alt.Color('Measure', scale=alt.Scale(domain=domain_vars, range=range_colors)),
    facet=alt.Facet('Firm size:N', columns=2),
tooltip=['Year:O','Rate:Q','Measure:N']
)

chart

In [137]:
# FIRM AGE
firmage_jc_jd = firmage_dynamism[firmage_dynamism['Measure'].isin([
    'Entrants_JC_Rate',
    'Incumbent_JC_Rate',
    'Exiters_JD_Rate',
    'Incumbent_JD_Rate'
])]

chart = alt.Chart(firmage_jc_jd).mark_bar().encode(
    x=alt.X('Year:O', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('Rate:Q', axis=alt.Axis(format='%')),
    color=alt.Color('Measure', scale=alt.Scale(domain=domain_vars, range=range_colors)),
    facet=alt.Facet('Firm age:N', columns=2),
tooltip=['Year:O','Rate:Q','Measure:N']
)

chart

In [138]:
# FIRM AGE
firmagesize_jc_jd = firmagesize_dynamism[firmagesize_dynamism['Measure'].isin([
    'Entrants_JC_Rate',
    'Incumbent_JC_Rate',
    'Exiters_JD_Rate',
    'Incumbent_JD_Rate'
])]

firmagesize_jc_jd['AgeSize'] = firmagesize_jc_jd['Firm age'].astype(str) + firmagesize_jc_jd['Firm size']

chart = alt.Chart(firmagesize_jc_jd).mark_bar().encode(
    x=alt.X('Year:O', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('Rate:Q', axis=alt.Axis(format='%')),
    color=alt.Color('Measure', scale=alt.Scale(domain=domain_vars, range=range_colors)),
    facet=alt.Facet('AgeSize:N', columns=2),
tooltip=['Year:O','Rate:Q','Measure:N']
)

chart

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  firmagesize_jc_jd['AgeSize'] = firmagesize_jc_jd['Firm age'].astype(str) + firmagesize_jc_jd['Firm size']


In [139]:
# INDUSTRY

industry_dynamism = industry_dynamism[industry_dynamism['Measure'].isin([
    'Entrants_JC_Rate',
    'Incumbent_JC_Rate',
    'Exiters_JD_Rate',
    'Incumbent_JD_Rate'
])]


chart = alt.Chart(industry_dynamism).mark_bar().encode(
    x=alt.X('Year:O', axis=alt.Axis(
            labelExpr="datum.value % 2 == 0 ? datum.label : ''",
            labelAngle=0)),
    y=alt.Y('Rate:Q', axis=alt.Axis(format='%')),
    color=alt.Color('Measure', scale=alt.Scale(domain=domain_vars, range=range_colors)),
    facet=alt.Facet('Industry classification:N', columns=2),
tooltip=['Year:O','Rate:Q','Measure:N']
)

chart

# CONTRIBUTION ANALYSIS TO AGGREGATE JOB CREATION AND DESTRUCTION

In 2024, there was a total reallocation rate of 19.75%. This meant 19.75% of jobs from 2023 were either created or destroyed in 2024.

## Industry contribution analysis

In [351]:
# Beyond visualisations, can I calculate the contribution of different firm characteristics to aggregate job creation and destruction?
# For this we need absolute values of JC and JD, not rates.
industry_abs_dynamism = pd.read_excel('annualbusinessdynamism20012024.xlsx', sheet_name='Table 18', skiprows=7)

industry_abs_dynamism = industry_abs_dynamism.rename(columns={'Total':'Total_JC',
                            'Entering businesses':'Entrants_JC',
                            'Incumbent growing':'Incumbent_JC',
                            'Total ':'Total_JD',
                            'Exiting businesses':'Exiters_JD',
                            'Incumbent shrinking':'Incumbent_JD',
                            'Net flow':'Net_JC',
                            'Entering and Exiting':'Enter_Exit_JC',
                            'Total .1':'Total_Reallocation',
                            'Entering and Exiting.1':'Enter_Exit_Reallocation',
                            'Incumbents.1':'Incumbent_Reallocation'
                            })

# Create lag employment column
industry_abs_dynamism['EmploymentLag'] = industry_abs_dynamism.groupby('Industry classification')['Employment'].shift(1)

JOB_FLOW_COLS = [
    'Entrants_JC',
    'Incumbent_JC',
    'Exiters_JD',
    'Incumbent_JD',
    'Total_Reallocation'
]

# Calculate the total economy employment (sum of EmploymentLag) for each year
yearly_total_employment = industry_abs_dynamism.groupby('Year')['EmploymentLag'].sum().reset_index()
yearly_total_employment.rename(columns={'EmploymentLag': 'Total_Economy_Employment_Lag'}, inplace=True)

# 3. Merge the total denominator back into the main DataFrame
industry_contributions = pd.merge(industry_abs_dynamism, yearly_total_employment, on='Year', how='left')

# Create new columns for the contribution rates
for col in JOB_FLOW_COLS:
    new_col_name = f'Contribution_{col}_Pct'
    # Contribution = (Industry Job Flow / Total Economy Employment T-1) * 100
    industry_contributions[new_col_name] = (industry_contributions[col] / industry_contributions['Total_Economy_Employment_Lag']) * 100

# Calculate the overall Industry JCR and JDR contributions
industry_contributions['Contribution_Total_JCR_Pct'] = (
    industry_contributions['Contribution_Entrants_JC_Pct'] + industry_contributions['Contribution_Incumbent_JC_Pct']
)
industry_contributions['Contribution_Total_JDR_Pct'] = (
    industry_contributions['Contribution_Exiters_JD_Pct'] + industry_contributions['Contribution_Incumbent_JD_Pct']
)

In [301]:
JOB_FLOW_COLS = [
    'Entrants_JC',
    'Incumbent_JC',
    'Exiters_JD',
    'Incumbent_JD',
    'Total_Reallocation'
]

# Calculate the total economy employment (sum of EmploymentLag) for each year
yearly_total_employment = industry_abs_dynamism.groupby('Year')['EmploymentLag'].sum().reset_index()
yearly_total_employment.rename(columns={'EmploymentLag': 'Total_Economy_Employment_Lag'}, inplace=True)

# 3. Merge the total denominator back into the main DataFrame
industry_contributions = pd.merge(industry_abs_dynamism, yearly_total_employment, on='Year', how='left')

# Create new columns for the contribution rates
for col in JOB_FLOW_COLS:
    new_col_name = f'Contribution_{col}_Pct'
    # Contribution = (Industry Job Flow / Total Economy Employment T-1) * 100
    industry_contributions[new_col_name] = (industry_contributions[col] / industry_contributions['Total_Economy_Employment_Lag']) * 100

# Calculate the overall Industry JCR and JDR contributions
industry_contributions['Contribution_Total_JCR_Pct'] = (
    industry_contributions['Contribution_Entrants_JC_Pct'] + industry_contributions['Contribution_Incumbent_JC_Pct']
)
industry_contributions['Contribution_Total_JDR_Pct'] = (
    industry_contributions['Contribution_Exiters_JD_Pct'] + industry_contributions['Contribution_Incumbent_JD_Pct']
)

In [307]:
# Select a year for verification (e.g., 2024)
YEAR_TO_EXAMINE = 2024
df_2024 = industry_contributions[industry_contributions['Year'] == YEAR_TO_EXAMINE].copy()

# Print the contributions for 2024
contribution_cols = [col for col in df_2024.columns if 'Contribution_' in col]
print(f"--- Industry Contributions for {YEAR_TO_EXAMINE} (as % of Total Employment) ---")
print(df_2024[['Industry classification'] + contribution_cols].round(3))

print("\n--- Verification ---")
# Sum the contributions to get the Aggregate Total Rates for that year
Total_JCR_Rate = df_2024['Contribution_Total_JCR_Pct'].sum()
Total_JDR_Rate = df_2024['Contribution_Total_JDR_Pct'].sum()
Total_Reallocation_Rate = df_2024['Contribution_Total_Reallocation_Pct'].sum()


print(f"Aggregate Total Job Creation Rate (2024): {Total_JCR_Rate:.3f}%")
print(f"Aggregate Total Job Destruction Rate (2024): {Total_JDR_Rate:.3f}%")
print(f"Aggregate Total Reallocation Rate (2024): {Total_Reallocation_Rate:.3f}%")

--- Industry Contributions for 2024 (as % of Total Employment) ---
                               Industry classification  \
391                      Accommodation & food services   
392                    Agriculture, forestry & fishing   
393  Arts, entertainment, recreation and other acti...   
394                Administration and support services   
395                                       Construction   
396                                          Education   
397                                Finance & insurance   
398                       Human health and social work   
399                        Information & communication   
400                                     Other services   
401                                      Manufacturing   
402                      Mining, quarrying & utilities   
403               Professional, scientific & technical   
404                             Real estate activities   
405  Public administration & defence; compulsory so...   
406  

Let's view the tabular format to see if our contributions seem reasonable. What is the interpretation? Remind us that 19.75% is the total reallocation rate.

- 2.86% of this reallocation took place in Administration and support services
- 

In [420]:
# CREATE INTERACTIVE ALTAIR CHART FOR INDUSTRY CONTRIBUTIONS

FLOW_COMPONENT_RENAMES = {
    'Contribution_Entrants_JC_Pct': 'Entrant Job Creation',
    'Contribution_Incumbent_JC_Pct': 'Incumbent Job Creation',
    'Contribution_Exiters_JD_Pct': 'Exiter Job Destruction',
    'Contribution_Incumbent_JD_Pct': 'Incumbent Job Destruction'
}

def create_interactive_contribution_chart(df_data, category_col, initial_year=None, show_legend=True):
    """
    Generates an Altair mirrored stacked bar chart with a dropdown menu to select the year.

    Args:
        df_data (pd.DataFrame): DataFrame containing 'Year', category_col, and contribution columns.
        category_col (str): The column name to use for the Y-axis (e.g., 'Firm size' or 'Industry').
        initial_year (int): The default year to show when the chart loads.
        show_legend (bool): Whether to display the legend.

    Returns:
        alt.Chart: The interactive Altair chart object.
    """
    contribution_components = list(FLOW_COMPONENT_RENAMES.keys())

    # --- 1. DATA PREPARATION ---
    
    # Select and copy necessary columns, including 'Year'
    df_chart_base = df_data[['Year', category_col] + contribution_components + ['Contribution_Total_Reallocation_Pct']].copy()

    # Apply the negative sign for Job Destruction flows (must be done on the full dataset)
    df_chart_base['Contribution_Exiters_JD_Pct'] *= -1
    df_chart_base['Contribution_Incumbent_JD_Pct'] *= -1

    # Melt the DataFrame to long format for Altair stacking
    df_melted = df_chart_base.melt(
        id_vars=['Year', category_col],
        value_vars=contribution_components,
        var_name='Flow_Component',
        value_name='Contribution_Rate'
    )

    df_melted['Job flow component'] = df_melted['Flow_Component'].map(FLOW_COMPONENT_RENAMES)

    # --- 2. INTERACTIVE SELECTION (The Dropdown) ---
    
    # Get the min, max, and step for the slider range
    year_min = df_data['Year'].min()
    year_max = df_data['Year'].max()
    year_step = 1 # Assuming years are sequential and change by 1

    # Set the initial year if provided
    if initial_year is None:
        initial_year = year_min

    # Create the selection object for the SLIDER
    year_selection = alt.selection_point(
        name='SelectYear',
        fields=['Year'],
        bind=alt.binding_range(
            min=year_min,
            max=year_max,
            step=year_step,
            name='Year: '
        ),
        value=[{'Year': initial_year}] # Set the initial selected value
    )

    # --- 3. SORTING LOGIC ---

    # Sort the category column based on the total reallocation for the initial year
    # This sorting is applied *before* the transform_filter, but Altair handles it.
    df_initial_sort = df_chart_base[df_chart_base['Year'] == initial_year].sort_values(
        'Contribution_Total_Reallocation_Pct', ascending=False
    )[category_col].tolist()


    # --- 4. ALTAIR ENCODINGS AND CHART DEFINITION ---
    
    color_domain_renamed = list(FLOW_COMPONENT_RENAMES.values())

    color_encoding = alt.Color('Job flow component:N', 
                               scale=alt.Scale(domain=color_domain_renamed, range=["#179FDB", 
    "#0063AF",  
    '#E54753',
    '#ff7f0e']))
    
    
    # Base chart with filtering transformation
    base = alt.Chart(df_melted).add_params(
        year_selection # Add the selection parameter
    ).transform_filter(
        year_selection # Filter the data based on the selection
    ).properties(
        title=alt.Title("Industry Contribution to Job Reallocation in selected year")
    )

    # Create the Mirrored Stacked Bar Chart
    chart = base.mark_bar().encode(
        # Category on the Y-axis, sorted by the total flow magnitude of the filtered data
        y=alt.Y(category_col, sort=df_initial_sort, title=""),

        # Contribution Rate on the X-axis (mirrored)
        x=alt.X('Contribution_Rate',
                title="Contribution Rate (%)", 
                axis=alt.Axis(titleY=30, titlePadding=10, titleAlign='center')),

        # Stacking by the specific component and controlling legend
        color=color_encoding,

        # Tooltip for detailed info
        tooltip=[
            category_col,
            alt.Tooltip('Job flow component', title="Job flow component"),
            alt.Tooltip('Contribution_Rate', format='.2f', title="Contribution (%)")
        ]
    ).interactive() # Allows zooming and panning

    return chart

# 1. Create the interactive chart
interactive_chart = create_interactive_contribution_chart(
    df_data=industry_contributions, 
    category_col='Industry classification', 
    initial_year=2024, 
    show_legend=True
)

# 2. Save the interactive chart as a JSON file
output_file = "interactive_industry_contribution_chart.json"

# To save, you will need a suitable environment/library. 
# We'll print the save instruction for a successful local run:
print(f"\n--- Altair Chart JSON Output ---")
interactive_chart.save(output_file) 




interactive_chart


--- Altair Chart JSON Output ---


In [None]:
# 1. Create the interactive chart
interactive_chart = create_interactive_contribution_chart(
    df_data=size_contributions, 
    category_col='Firm size', 
    initial_year=2024, 
    show_legend=True
)

# 2. Save the interactive chart as a JSON file
output_file = "interactive_firmsize_contribution_chart.json"

# To save, you will need a suitable environment/library. 
# We'll print the save instruction for a successful local run:
print(f"\n--- Altair Chart JSON Output ---")
interactive_chart.save(output_file) 

In [400]:
contribution_components = [
    'Contribution_Entrants_JC_Pct',
    'Contribution_Incumbent_JC_Pct',
    'Contribution_Exiters_JD_Pct',
    'Contribution_Incumbent_JD_Pct'
]

# Create a copy for the chart and add a 'Job_Flow_Type' column for grouping
df_chart = df_2024[['Industry classification'] + contribution_components].copy()

# Job Destruction flows must be negative for the mirrored stacked bar chart
df_chart['Contribution_Exiters_JD_Pct'] *= -1
df_chart['Contribution_Incumbent_JD_Pct'] *= -1

# Melt the DataFrame to long format for Altair stacking
df_melted = df_chart.melt(
    id_vars=['Industry classification'],
    value_vars=contribution_components,
    var_name='Flow_Component',
    value_name='Contribution_Rate'
)

# Add a marker column to separate JC (Positive) and JD (Negative)
df_melted['Flow_Direction'] = df_melted['Flow_Component'].apply(
    lambda x: 'Job Creation' if 'JC' in x else 'Job Destruction'
)

# Define color scale and order
color_scale = alt.Scale(
    domain=['Contribution_Entrants_JC_Pct', 'Contribution_Incumbent_JC_Pct',
            'Contribution_Exiters_JD_Pct', 'Contribution_Incumbent_JD_Pct'],
    range=["#179FDB", 
    "#0063AF",  
    '#E54753',
    '#ff7f0e']
)

# Sort industries by total absolute contribution for better readability
df_sort_order = df_2024.sort_values(
    'Contribution_Total_Reallocation_Pct', ascending=False
)['Industry classification'].tolist()


# Create the Mirrored Stacked Bar Chart
chart = alt.Chart(df_melted).mark_bar().encode(
    # Industry on the Y-axis, sorted by total flow magnitude
    y=alt.Y('Industry classification', sort=df_sort_order, title="Industry"),

    # Contribution Rate on the X-axis (mirrored)
    x=alt.X('Contribution_Rate',
            title="Contribution to Total Job Flow Rate (%)"),

    # Stacking by the specific component
    color=alt.Color('Flow_Component', scale=color_scale, legend=alt.Legend(title="Flow Component")),

    # Tooltip for detailed info
    tooltip=[
        'Industry classification',
        'Flow_Component',
        alt.Tooltip('Contribution_Rate', format='.2f')
    ]
).properties(
    title=f'Industry Contribution to Job Creation and Destruction Rates (2024)'
).interactive() # Allows zooming and panning

chart

KeyError: "['Industry classification'] not in index"

In [311]:
# CONTRIBUTIONS OF FIRM SIZES
# Beyond visualisations, can I calculate the contribution of different firm characteristics to aggregate job creation and destruction?
# For this we need absolute values of JC and JD, not rates.
size_abs_dynamism = pd.read_excel('annualbusinessdynamism20012024.xlsx', sheet_name='Table 9', skiprows=7)

size_abs_dynamism = size_abs_dynamism.rename(columns={'Total':'Total_JC',
                            'Entering businesses':'Entrants_JC',
                            'Incumbent growing':'Incumbent_JC',
                            'Total ':'Total_JD',
                            'Exiting businesses':'Exiters_JD',
                            'Incumbent shrinking':'Incumbent_JD',
                            'Net flow':'Net_JC',
                            'Entering and Exiting':'Enter_Exit_JC',
                            'Total .1':'Total_Reallocation',
                            'Entering and Exiting.1':'Enter_Exit_Reallocation',
                            'Incumbents.1':'Incumbent_Reallocation'
                            })

# The firm size sheet doesn't have a total employment column, so I obtain this from the industry sheet
industry_abs_dynamism_for_employment = pd.read_excel('annualbusinessdynamism20012024.xlsx', sheet_name='Table 18', skiprows=7)
industry_abs_dynamism_for_employment = industry_abs_dynamism_for_employment[['Year','Employment']]
yearly_total_employment = industry_abs_dynamism_for_employment.groupby('Year')['Employment'].sum().reset_index()
yearly_total_employment.rename(columns={'Employment': 'Total_Economy_Employment'}, inplace=True)

JOB_FLOW_COLS = [
    'Entrants_JC',
    'Incumbent_JC',
    'Exiters_JD',
    'Incumbent_JD',
    'Total_Reallocation'
]

# 3. Merge the total employment onto the main dataframe and create a new lag
size_contributions = pd.merge(size_abs_dynamism, yearly_total_employment, on='Year', how='left')

size_contributions['Total_Economy_Employment_Lag'] = size_contributions['Total_Economy_Employment'].shift(1)

# Create new columns for the contribution rates
for col in JOB_FLOW_COLS:
    new_col_name = f'Contribution_{col}_Pct'
    # Contribution = (Industry Job Flow / Total Economy Employment T-1) * 100
    size_contributions[new_col_name] = (size_contributions[col] / size_contributions['Total_Economy_Employment_Lag']) * 100

# Calculate the overall Industry JCR and JDR contributions
size_contributions['Contribution_Total_JCR_Pct'] = (
    size_contributions['Contribution_Entrants_JC_Pct'] + size_contributions['Contribution_Incumbent_JC_Pct']
)
size_contributions['Contribution_Total_JDR_Pct'] = (
    size_contributions['Contribution_Exiters_JD_Pct'] + size_contributions['Contribution_Incumbent_JD_Pct']
)


In [403]:
import altair as alt
import pandas as pd
import os

def create_interactive_contribution_chart(df_data, category_col, initial_year=None, show_legend=True):
    """
    Generates an Altair mirrored stacked bar chart with a dropdown menu to select the year.

    Args:
        df_data (pd.DataFrame): DataFrame containing 'Year', category_col, and contribution columns.
        category_col (str): The column name to use for the Y-axis (e.g., 'Firm size' or 'Industry').
        initial_year (int): The default year to show when the chart loads.
        show_legend (bool): Whether to display the legend.

    Returns:
        alt.Chart: The interactive Altair chart object.
    """
    contribution_components = [
        'Contribution_Entrants_JC_Pct',
        'Contribution_Incumbent_JC_Pct',
        'Contribution_Exiters_JD_Pct',
        'Contribution_Incumbent_JD_Pct'
    ]

    # --- 1. DATA PREPARATION ---
    
    # Select and copy necessary columns, including 'Year'
    df_chart_base = df_data[['Year', category_col] + contribution_components + ['Contribution_Total_Reallocation_Pct']].copy()

    # Apply the negative sign for Job Destruction flows (must be done on the full dataset)
    df_chart_base['Contribution_Exiters_JD_Pct'] *= -1
    df_chart_base['Contribution_Incumbent_JD_Pct'] *= -1

    # Melt the DataFrame to long format for Altair stacking
    df_melted = df_chart_base.melt(
        id_vars=['Year', category_col],
        value_vars=contribution_components,
        var_name='Flow_Component',
        value_name='Contribution_Rate'
    )

    # --- 2. INTERACTIVE SELECTION (The Dropdown) ---
    
    # Get the min, max, and step for the slider range
    year_min = df_data['Year'].min()
    year_max = df_data['Year'].max()
    year_step = 1 # Assuming years are sequential and change by 1

    # Set the initial year if provided
    if initial_year is None:
        initial_year = year_min

    # Create the selection object for the SLIDER
    year_selection = alt.selection_point(
        name='SelectYear',
        fields=['Year'],
        bind=alt.binding_range(
            min=year_min,
            max=year_max,
            step=year_step,
            name='Year: '
        ),
        value=[{'Year': initial_year}] # Set the initial selected value
    )

    # --- 3. SORTING LOGIC ---

    # Sort the category column based on the total reallocation for the initial year
    # This sorting is applied *before* the transform_filter, but Altair handles it.
    df_initial_sort = df_chart_base[df_chart_base['Year'] == initial_year].sort_values(
        'Contribution_Total_Reallocation_Pct', ascending=False
    )[category_col].tolist()


    # --- 4. ALTAIR ENCODINGS AND CHART DEFINITION ---
    
    # Define color scale and order
    color_scale = alt.Scale(
        domain=['Contribution_Entrants_JC_Pct', 'Contribution_Incumbent_JC_Pct',
                'Contribution_Exiters_JD_Pct', 'Contribution_Incumbent_JD_Pct'],
        range=["#179FDB", 
    "#0063AF",  
    '#E54753',
    '#ff7f0e'] # Green tones for JC, Red/Orange for JD
    )

    # Conditional color encoding
    color_encoding = alt.Color('Flow_Component', scale=color_scale)
    if show_legend:
        color_encoding = alt.Color('Flow_Component', scale=color_scale, legend=alt.Legend(title="Flow Component"))
    else:
        color_encoding = alt.Color('Flow_Component', scale=color_scale, legend=None)
        
    # Base chart with filtering transformation
    base = alt.Chart(df_melted).add_params(
        year_selection # Add the selection parameter
    ).transform_filter(
        year_selection # Filter the data based on the selection
    ).properties(
        title="Firm Size Contribution to Job Reallocation by Year"
    )

    # Create the Mirrored Stacked Bar Chart
    chart = base.mark_bar().encode(
        # Category on the Y-axis, sorted by the total flow magnitude of the filtered data
        y=alt.Y(category_col, sort=df_initial_sort, title=""),

        # Contribution Rate on the X-axis (mirrored)
        x=alt.X('Contribution_Rate',
                title="Contribution Rate (%)"),

        # Stacking by the specific component and controlling legend
        color=color_encoding,

        # Tooltip for detailed info
        tooltip=[
            category_col,
            alt.Tooltip('Flow_Component', title="Flow Component"),
            alt.Tooltip('Contribution_Rate', format='.2f', title="Contribution (%)")
        ]
    ).interactive() # Allows zooming and panning

    return chart

# 1. Create the interactive chart
interactive_chart = create_interactive_contribution_chart(
    df_data=size_contributions, 
    category_col='Firm size', 
    initial_year=2023, 
    show_legend=True
)

# 2. Save the interactive chart as a JSON file
output_file = "interactive_firmsize_contribution_chart.json"

# To save, you will need a suitable environment/library. 
# We'll print the save instruction for a successful local run:
print(f"\n--- Altair Chart JSON Output ---")
interactive_chart.save(output_file) 

# 1. Create the interactive chart
interactive_chart = create_interactive_contribution_chart(
    df_data=industry_contributions, 
    category_col='Industry classification', 
    initial_year=2023, 
    show_legend=True
)

# 2. Save the interactive chart as a JSON file
output_file = "interactive_industry_contribution_chart.json"

# To save, you will need a suitable environment/library. 
# We'll print the save instruction for a successful local run:
print(f"\n--- Altair Chart JSON Output ---")
interactive_chart.save(output_file) 

interactive_chart


--- Altair Chart JSON Output ---

--- Altair Chart JSON Output ---


In [323]:
# Loop to create industry contribution charts for each year
OUTPUT_DIR = 'Charts'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

df = pd.read_excel('annualbusinessdynamism20012024.xlsx', sheet_name='Table 18', skiprows=7)

df = df.rename(columns={'Total':'Total_JC',
                            'Entering businesses':'Entrants_JC',
                            'Incumbent growing':'Incumbent_JC',
                            'Total ':'Total_JD',
                            'Exiting businesses':'Exiters_JD',
                            'Incumbent shrinking':'Incumbent_JD',
                            'Net flow':'Net_JC',
                            'Entering and Exiting':'Enter_Exit_JC',
                            'Total .1':'Total_Reallocation',
                            'Entering and Exiting.1':'Enter_Exit_Reallocation',
                            'Incumbents.1':'Incumbent_Reallocation'
                            })

# Create lag employment column
df['EmploymentLag'] = df.groupby('Industry classification')['Employment'].shift(1)

JOB_FLOW_COLS = ['Entrants_JC', 'Incumbent_JC', 'Exiters_JD', 'Incumbent_JD']

# Calculate the yearly total denominator (Total Economy Employment T-1)
yearly_total_employment = df.groupby('Year')['EmploymentLag'].sum().reset_index()
yearly_total_employment.rename(columns={'EmploymentLag': 'Total_Economy_Employment_Lag'}, inplace=True)
df = pd.merge(df, yearly_total_employment, on='Year', how='left')

# Calculate all contribution rate columns
for col in JOB_FLOW_COLS:
    new_col_name = f'Contribution_{col}_Pct'
    df[new_col_name] = (df[col] / df['Total_Economy_Employment_Lag']) * 100

df['Contribution_Total_JCR_Pct'] = df['Contribution_Entrants_JC_Pct'] + df['Contribution_Incumbent_JC_Pct']
df['Contribution_Total_JDR_Pct'] = df['Contribution_Exiters_JD_Pct'] + df['Contribution_Incumbent_JD_Pct']
df['Contribution_Net_JC_Pct'] = df['Contribution_Total_JCR_Pct'] - df['Contribution_Total_JDR_Pct']
df['Contribution_Reallocation_Pct'] = df['Contribution_Total_JCR_Pct'] + df['Contribution_Total_JDR_Pct']
df['Employment_Share_Pct'] = (df['EmploymentLag'] / df['Total_Economy_Employment_Lag']) * 100

def generate_industry_report(df_year, year):
    """Generates the tabular report and Altair charts for a specific year."""
    print(f"\n=======================================================")
    print(f"## 📊 Industry Job Flow Analysis for Year: {year}")
    print(f"=======================================================")

    # --- 1. Tabular Representation ---
    table_cols = [
        'Industry classification',
        'Contribution_Entrants_JC_Pct', 'Contribution_Incumbent_JC_Pct', 'Contribution_Total_JCR_Pct',
        'Contribution_Exiters_JD_Pct', 'Contribution_Incumbent_JD_Pct', 'Contribution_Total_JDR_Pct',
        'Contribution_Net_JC_Pct'
    ]
    aggregate_row = df_year[table_cols[1:]].sum().rename('AGGREGATE TOTAL')
    df_table = pd.concat([df_year[table_cols], aggregate_row.to_frame().T], ignore_index=True)

    formatted_df_table = df_table.style.format({
        col: "{:.2f}%" for col in table_cols[1:]
    }).set_properties(**{'font-size': '10pt', 'text-align': 'center'})

    print(f"### Tabular Report ({year})")
    # Outputting HTML for display in environments that support it (like Jupyter)
    # If running in a standard console, you might just print(df_table)
    print(formatted_df_table.to_html())


    # --- 2. Stacked Bar Chart (JC vs JD) ---
    print(f"\n### Stacked Bar Chart ({year})")
    
    contribution_components = [
        'Contribution_Entrants_JC_Pct', 'Contribution_Incumbent_JC_Pct',
        'Contribution_Exiters_JD_Pct', 'Contribution_Incumbent_JD_Pct'
    ]
    df_chart = df_year[['Industry classification'] + contribution_components].copy()

    # Make JD flows negative for mirrored chart
    df_chart.loc[:, 'Contribution_Exiters_JD_Pct'] *= -1
    df_chart.loc[:, 'Contribution_Incumbent_JD_Pct'] *= -1

    df_melted = df_chart.melt(
        id_vars=['Industry classification'],
        value_vars=contribution_components,
        var_name='Flow_Component',
        value_name='Contribution_Rate'
    )
    df_melted['Flow_Direction'] = df_melted['Flow_Component'].apply(
        lambda x: 'Job Creation' if 'JC' in x else 'Job Destruction'
    )

    color_scale = alt.Scale(
        domain=['Contribution_Entrants_JC_Pct', 'Contribution_Incumbent_JC_Pct',
                'Contribution_Exiters_JD_Pct', 'Contribution_Incumbent_JD_Pct'],
        range=['#31a354', '#a1d99b', '#de2d26', '#fc9272']
    )
    
    # Sort industries by total absolute contribution (Reallocation)
    df_sort_order = df_year.sort_values('Contribution_Reallocation_Pct', ascending=False)['Industry classification'].tolist()

    chart = alt.Chart(df_melted).mark_bar().encode(
        y=alt.Y('Industry classification', sort=df_sort_order, title="Industry"),
        x=alt.X('Contribution_Rate', title="Contribution to Total Job Flow Rate (%)"),
        color=alt.Color('Flow_Component', scale=color_scale, legend=alt.Legend(title="Flow Component")),
        tooltip=['Industry classification', 'Flow_Component', alt.Tooltip('Contribution_Rate', format='.2f')]
    ).properties(
        title=f'Industry Contribution to Job Flows ({year})'
    ).interactive()

    png_filepath_bar = os.path.join(OUTPUT_DIR, f'{year}_industry_dynamism_contributions.png')
    chart.save(png_filepath_bar, format='png')    
    print(json.dumps(chart.to_dict())) # Output the chart spec as JSON


    # --- 3. Scatter Plot (Dynamism vs. Net Contribution) ---
    print(f"\n### Scatter Plot (Dynamism vs. Growth) ({year})")

    scatter = alt.Chart(df_year).mark_circle().encode(
        x=alt.X('Contribution_Net_JC_Pct', title='Net Job Creation Contribution (Growth) (%)'),
        y=alt.Y('Contribution_Reallocation_Pct', title='Job Reallocation Contribution (Churn) (%)'),
        size=alt.Size('Employment_Share_Pct', title='Industry Employment Share (%)'),
        color=alt.condition(
            alt.datum.Contribution_Net_JC_Pct > 0,
            alt.value('green'),
            alt.value('red')
        ),
        tooltip=['Industry classification', alt.Tooltip('Contribution_Net_JC_Pct', title='Net Contr.', format='.2f'),
                 alt.Tooltip('Contribution_Reallocation_Pct', title='Reallocation Contr.', format='.2f'),
                 alt.Tooltip('Employment_Share_Pct', title='Employment Share', format='.2f')]
    )

    v_line = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray', strokeDash=[5, 5]).encode(x='x')
    h_line = alt.Chart(pd.DataFrame({'y': [df_year['Contribution_Reallocation_Pct'].mean()]})).mark_rule(color='gray', strokeDash=[5, 5]).encode(y='y')

    final_chart = (scatter + v_line + h_line).properties(
        title=f'Industry Dynamism vs. Growth Matrix ({year})'
    ).interactive()

    print(json.dumps(final_chart.to_dict())) # Output the chart spec as JSON

# --- Main Loop Execution ---
unique_years = sorted(df['Year'].unique())

# Loop through each year and call the function
for year in unique_years:
    df_year = df[df['Year'] == year].copy()
    generate_industry_report(df_year, year)


## 📊 Industry Job Flow Analysis for Year: 2001
### Tabular Report (2001)
<style type="text/css">
#T_5a9d4_row0_col0, #T_5a9d4_row0_col1, #T_5a9d4_row0_col2, #T_5a9d4_row0_col3, #T_5a9d4_row0_col4, #T_5a9d4_row0_col5, #T_5a9d4_row0_col6, #T_5a9d4_row0_col7, #T_5a9d4_row1_col0, #T_5a9d4_row1_col1, #T_5a9d4_row1_col2, #T_5a9d4_row1_col3, #T_5a9d4_row1_col4, #T_5a9d4_row1_col5, #T_5a9d4_row1_col6, #T_5a9d4_row1_col7, #T_5a9d4_row2_col0, #T_5a9d4_row2_col1, #T_5a9d4_row2_col2, #T_5a9d4_row2_col3, #T_5a9d4_row2_col4, #T_5a9d4_row2_col5, #T_5a9d4_row2_col6, #T_5a9d4_row2_col7, #T_5a9d4_row3_col0, #T_5a9d4_row3_col1, #T_5a9d4_row3_col2, #T_5a9d4_row3_col3, #T_5a9d4_row3_col4, #T_5a9d4_row3_col5, #T_5a9d4_row3_col6, #T_5a9d4_row3_col7, #T_5a9d4_row4_col0, #T_5a9d4_row4_col1, #T_5a9d4_row4_col2, #T_5a9d4_row4_col3, #T_5a9d4_row4_col4, #T_5a9d4_row4_col5, #T_5a9d4_row4_col6, #T_5a9d4_row4_col7, #T_5a9d4_row5_col0, #T_5a9d4_row5_col1, #T_5a9d4_row5_col2, #T_5a9d4_row5_col3, #T_5a9d4_row5_col4, #T