In [1]:
# CEP OCCASIONAL PAPER CHARTS
# Will Shepherd, Nov 2025

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
import altair as alt
from pandas.api.types import CategoricalDtype
import os
import eco_style 
# Switch Altair to the "report" theme for every chart below
# (presumably registered by the eco_style import above - confirm).
alt.themes.enable("report")

ThemeRegistry.enable('report')

In [3]:
# Import data
# The workbook is parsed once: passing a list of sheet names to read_excel
# returns a dict of DataFrames keyed by sheet name, instead of re-opening the
# same file for every sheet.
DATA_FILE = 'business_dynamism_BSD_1997_2023.xlsx'
SHEET_NAMES = ['whole_economy', 'firm_size', 'firm_age', 'industry', 'region']

sheets = pd.read_excel(DATA_FILE, sheet_name=SHEET_NAMES)

whole_economy_df = sheets['whole_economy']
firm_size_df = sheets['firm_size']
firm_age_df = sheets['firm_age']
industry_df = sheets['industry']
region_df = sheets['region']

In [4]:
# Define order for categorical variables

# 1. Employment sizeband (largest first)
# Each label needs its own comma: adjacent string literals are silently
# concatenated by Python, which would collapse the list into one long string.
size_order = [
    'Large (250+)',
    'Medium (50-249)',
    'Small (10-49)',
    'Micro (0-9)',
]

# 2. Age group (youngest first)
# NOTE(review): the bands overlap at 5 and 10 years as labelled; the strings are
# kept as-is because they presumably have to match the values in the data - confirm.
age_order = [
    'New (0-2 years)',
    'Young (3-5 years)',
    'Old (5-10 years)',
    'Mature (10+ years)',
]

In [5]:
# Write function to calculate rates for dynamism measures, apply this across dataframes
def calculate_dynamism_rates(df, group_by_cols=None):
    """Add entry/exit and job creation/destruction rate columns to a yearly table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``year``, ``employment``, ``n_firms``,
        ``n_entrants``, ``n_exiters``, ``n_entry_and_exit``, ``jc_incumbents``,
        ``jc_entrants``, ``jd_incumbents`` and ``jd_exiters``. The input frame
        is not modified.
    group_by_cols : str, list of str or None, optional
        Column(s) identifying separate series (e.g. a size band or industry).
        Employment is lagged within each group. ``None`` or an empty list
        treats the whole frame as a single series; a bare string is treated
        as a one-element list.

    Returns
    -------
    pandas.DataFrame
        The rows sorted by group then year, with ``total_employment_lagged``
        and the eight rate columns added, and with the earliest and latest
        year present in the data removed.

    Notes
    -----
    The lag is the previous *row* within the series (``shift(1)``), so it
    equals the previous year only when there is one row per consecutive year.
    """
    # Normalise the grouping spec so None, [] and a bare string all behave sensibly
    if group_by_cols is None:
        group_cols = []
    elif isinstance(group_by_cols, str):
        group_cols = [group_by_cols]
    else:
        group_cols = list(group_by_cols)

    # sort_values returns a new frame, so the caller's frame is left untouched
    rates = df.sort_values(group_cols + ['year'])

    # Lagged employment is the denominator for the job-flow rates;
    # it must not leak across group boundaries
    employment = rates.groupby(group_cols)['employment'] if group_cols else rates['employment']
    rates['total_employment_lagged'] = employment.shift(1)

    # Firm-count rates: firms that both enter and exit count towards both
    rates['Entry rate'] = (rates['n_entrants'] + rates['n_entry_and_exit']) / rates['n_firms']
    rates['Exit rate'] = (rates['n_exiters'] + rates['n_entry_and_exit']) / rates['n_firms']

    # Job-flow rates, relative to the previous period's employment
    lagged = rates['total_employment_lagged']
    rates['Job creation rate'] = (rates['jc_incumbents'] + rates['jc_entrants']) / lagged
    rates['Job destruction rate'] = (rates['jd_incumbents'] + rates['jd_exiters']) / lagged
    rates['Entry job creation rate'] = rates['jc_entrants'] / lagged
    rates['Incumbent job creation rate'] = rates['jc_incumbents'] / lagged
    rates['Exit job destruction rate'] = rates['jd_exiters'] / lagged
    rates['Incumbent job destruction rate'] = rates['jd_incumbents'] / lagged

    # We can't use the first/last year for dynamic variables because there are
    # no backward/forward-looking observations for them
    years = rates['year'].unique()
    if len(years) == 0:
        # Nothing to trim; min()/max() would raise on an empty array
        return rates
    return rates[~rates['year'].isin([years.min(), years.max()])]

# Apply function to dataframes
# The whole-economy table is a single series; each breakdown is lagged within
# its own category column.
whole_economy_dynamism = calculate_dynamism_rates(whole_economy_df)

firm_size_dynamism, firm_age_dynamism, industry_dynamism, region_dynamism = (
    calculate_dynamism_rates(breakdown_df, group_by_cols=[category_col])
    for breakdown_df, category_col in (
        (firm_size_df, 'emp_sizeband'),
        (firm_age_df, 'age_group'),
        (industry_df, 'industry_name'),
        (region_df, 'region'),
    )
)

In [7]:
# FIGURE 1 - ENTRY RATES
# Single line: whole-economy firm entry rate by year.

# Long format holding only the entry-rate series
entry_df = pd.melt(whole_economy_dynamism, id_vars=['year'], value_vars=['Entry rate'])

# Label every second year, horizontally; show rates as percentages
year_axis = alt.Axis(labelExpr="datum.value % 2 == 0 ? datum.label : ''", labelAngle=0)
percent_axis = alt.Axis(format='%')

chart = (
    alt.Chart(entry_df)
    .mark_line()
    .encode(
        alt.X('year:O', axis=year_axis),
        alt.Y('value:Q', axis=percent_axis, title='Market Sector Firm Entry Rate'),
        alt.Color('variable:N', title=None),
    )
)

chart
# Export (disabled):
#chart.save('Charts/entry_we.png', scale_factor=2)
#chart.save('Charts/entry_we.json')

In [14]:
# Inspect the industry-level table returned by calculate_dynamism_rates
industry_dynamism

Unnamed: 0,year,industry_name,n_firms,employment,n_entrants,n_exiters,n_entry_and_exit,n_incumbents,jc_entrants,jc_incumbents,...,site_closure_incumbents,total_employment_lagged,Entry rate,Exit rate,Job creation rate,Job destruction rate,Entry job creation rate,Incumbent job creation rate,Exit job destruction rate,Incumbent job destruction rate
12,1998,Automotives,79929,600582,7677,6250,1648,64354,28745,53493,...,767,621861.0,0.116666,0.098813,0.132245,0.101185,0.046224,0.086021,0.065298,0.035887
24,1999,Automotives,79415,601788,5718,8792,1647,63258,27378,39655,...,277,600582.0,0.092741,0.131449,0.111613,0.105408,0.045586,0.066028,0.076491,0.028917
36,2000,Automotives,76664,588527,6067,7046,1580,61971,21904,43127,...,797,601788.0,0.099747,0.112517,0.108063,0.114869,0.036398,0.071665,0.067206,0.047663
48,2001,Automotives,75088,590836,5808,6767,1280,61233,18366,43389,...,507,588527.0,0.094396,0.107168,0.104931,0.111191,0.031207,0.073725,0.071115,0.040076
60,2002,Automotives,74424,599606,6059,6666,1255,60444,25596,61739,...,570,590836.0,0.098275,0.106431,0.147816,0.135244,0.043322,0.104494,0.072612,0.062632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,2018,z. Other industries,294192,2623155,24375,20380,6527,242910,90428,208072,...,2183,2471190.0,0.105040,0.091461,0.120792,0.089783,0.036593,0.084199,0.034120,0.055663
275,2019,z. Other industries,295444,2628454,20670,20525,7426,246823,78677,171962,...,2750,2623155.0,0.095098,0.094607,0.095549,0.091666,0.029993,0.065555,0.029356,0.062310
287,2020,z. Other industries,290467,2617884,22457,25111,8862,234037,89540,143442,...,1722,2628454.0,0.107823,0.116960,0.088638,0.070943,0.034066,0.054573,0.026282,0.044660
299,2021,z. Other industries,294848,2563952,24278,19904,14345,236321,61403,126796,...,2973,2617884.0,0.130993,0.116158,0.071890,0.074064,0.023455,0.048435,0.019613,0.054452


In [51]:
# Entry rates by industry
# Every industry is drawn as a line; the selected industries get their own
# colour at full opacity, all others are drawn as faint grey lines.

# Plot just entry rate
industry_entry_df = industry_dynamism.melt(id_vars=['year','industry_name'],
                                                 value_vars=['Entry rate'])

# Just display selected industries
# - IT
# - Recreation and culture
# - Other business
# - Retail
# - Transport
highlight_industries = ['IT', 'Recreation & Culture','Other business','Retail','Transport']

# Colour assigned to each highlighted industry
custom_color_map = {'IT':'#0063AF',
                    'Recreation & Culture':'#F4C245',
                    'Other business':'#E6224B',
                    'Retail':"#00A767",
                    'Transport':'#179FDB'}

custom_color_dict = dict(custom_color_map)

# Highlighted industries use the custom colour scale; everything else is grey
color_condition = alt.condition(
    alt.FieldOneOfPredicate(field='industry_name', oneOf=highlight_industries),
    alt.Color(
        'industry_name:N',
        scale=alt.Scale(domain=list(custom_color_dict.keys()), range=list(custom_color_dict.values())),
        title=None),
    alt.value('grey')
)

# Same test as above, used to fade the non-highlighted lines
opacity_condition = alt.condition(
    alt.FieldOneOfPredicate(field='industry_name', oneOf=highlight_industries),
    alt.value(1.0),   # highlighted: full opacity
    alt.value(0.1)    # not highlighted: faint (0.1)
)

chart = alt.Chart(industry_entry_df).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%'), title='Firm Entry Rate'),
    color=color_condition,
    opacity=opacity_condition,
    tooltip=['year:O','industry_name:N', 'value:Q']
)

chart

Unnamed: 0,year,industry_name,variable,value
250,1998,Wholesale,Entry rate,0.113142
251,1999,Wholesale,Entry rate,0.098957
252,2000,Wholesale,Entry rate,0.110732
253,2001,Wholesale,Entry rate,0.097407
254,2002,Wholesale,Entry rate,0.087776
255,2003,Wholesale,Entry rate,0.094497
256,2004,Wholesale,Entry rate,0.10681
257,2005,Wholesale,Entry rate,0.103013
258,2006,Wholesale,Entry rate,0.093989
259,2007,Wholesale,Entry rate,0.094304


In [23]:
# Size of entrants: inspect the rates table broken down by emp_sizeband
firm_size_dynamism

Unnamed: 0,year,emp_sizeband,n_firms,employment,n_entrants,n_exiters,n_entry_and_exit,n_incumbents,jc_entrants,jc_incumbents,...,site_closure_incumbents,total_employment_lagged,Entry rate,Exit rate,Job creation rate,Job destruction rate,Entry job creation rate,Incumbent job creation rate,Exit job destruction rate,Incumbent job destruction rate
7,1998,Large (250+),6333,7856086,200,394,105,5634,169739,1006981,...,8342,7778506.0,0.048160,0.078794,0.151278,0.099271,0.021822,0.129457,0.046533,0.052738
11,1999,Large (250+),6387,8050818,179,365,95,5748,189649,677315,...,1572,7856086.0,0.042900,0.072021,0.110356,0.078179,0.024140,0.086215,0.047379,0.030800
15,2000,Large (250+),6493,8229747,176,409,105,5803,135081,818776,...,9958,8050818.0,0.043277,0.079162,0.118480,0.092864,0.016779,0.101701,0.045616,0.047248
19,2001,Large (250+),6539,8502798,229,360,112,5838,295872,871683,...,8489,8229747.0,0.052149,0.072182,0.141870,0.092675,0.035952,0.105919,0.037058,0.055616
23,2002,Large (250+),6575,8630062,197,418,106,5854,204762,888258,...,14709,8502798.0,0.046084,0.079696,0.128548,0.110500,0.024082,0.104467,0.046539,0.063961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,2018,Small (10-49),192840,3698307,7805,9015,1908,174112,136999,286718,...,1962,3665594.0,0.050368,0.056643,0.115593,0.092843,0.037374,0.078219,0.045051,0.047792
89,2019,Small (10-49),193984,3724997,7876,9189,1923,174996,138112,282546,...,1912,3698307.0,0.050514,0.057283,0.113743,0.088257,0.037345,0.076399,0.044943,0.043313
93,2020,Small (10-49),194610,3754228,8315,6546,1327,178422,145704,285998,...,1769,3724997.0,0.049545,0.040455,0.115893,0.076131,0.039115,0.076778,0.032605,0.043526
97,2021,Small (10-49),194451,3752213,6140,6295,1415,180601,109884,250086,...,1997,3754228.0,0.038853,0.039650,0.095884,0.083990,0.029269,0.066614,0.030318,0.053672


In [10]:
# FIGURE 4 - INDUSTRY ENTRY RATES 2018 TO 2022

# Note we could include 2023 here, currently blanked in the SecureLab export but theoretically we have the info we need.
# ENTRY = NOT PRESENT IN T-1 BUT PRESENT IN T (we don't need t+1)
# Check the figures in SecureLab

# NOTE(review): the heading says 2018 to 2022, but no year filter is applied
# below, so every year in industry_dynamism is plotted - confirm the intended range.

# Long format holding only the entry-rate series, one row per year and industry
industry_entry = industry_dynamism.melt(id_vars=['year','industry_name'], value_vars=['Entry rate'])

# One line per industry
chart = alt.Chart(industry_entry).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('value:Q',  axis=alt.Axis(format='%')),
    color=alt.Color('industry_name:N')
)

chart

In [43]:
# List the columns available on the whole-economy table
whole_economy_dynamism.columns

Index(['year', 'n_firms', 'employment', 'n_entrants', 'n_exiters',
       'n_entry_and_exit', 'n_incumbents', 'jc_entrants', 'jc_incumbents',
       'jd_exiters', 'jd_incumbents', 'site_exp_entrants',
       'site_exp_incumbents', 'site_closure_exit', 'site_closure_incumbents',
       'total_employment_lagged', 'Entry rate', 'Exit rate',
       'Job creation rate', 'Job destruction rate', 'Entry job creation rate',
       'Incumbent job creation rate', 'Exit job destruction rate',
       'Incumbent job destruction rate'],
      dtype='object')

In [None]:
# FIGURE X - JOB CREATION
# Two views of job creation by firm size. Each gets its own name so the first
# is no longer overwritten (and lost) by the second before it can be shown.

# View 1: entrant vs incumbent job creation, one panel per size band
job_creation_split = firm_size_dynamism.melt(id_vars=['year','emp_sizeband'],value_vars=['Entry job creation rate','Incumbent job creation rate'])

job_creation_split_chart = alt.Chart(job_creation_split).mark_bar().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('value:Q'),
    color=alt.Color('variable:N'),
    facet=alt.Facet('emp_sizeband:O', columns=2, sort=size_order, header=alt.Header(
            title='Job creation by firm size (employment)',
            titleFontSize=16,
            labelFontSize=12
        ))
)

# View 2: incumbent job creation only, coloured by size band
job_creation = firm_size_dynamism.melt(id_vars=['year','emp_sizeband'],value_vars=['Incumbent job creation rate'])

chart = alt.Chart(job_creation).mark_bar().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('value:Q'),
    color=alt.Color('emp_sizeband:N')
)

# Only the last expression of a cell is rendered automatically, so the first
# view is shown explicitly with IPython's display()
display(job_creation_split_chart)
chart

In [41]:
# FIGURE X - site expansion
# Didn't export total number of sites so can't do rates only absolute values
# How many sites are present each year?

# Give the two site-expansion counts readable series names, then go long
site_expansion = whole_economy_dynamism.rename(columns={'site_exp_entrants':'New firms opening sites',
                                                  'site_exp_incumbents':'Existing firms opening new sites'})
site_expansion = site_expansion.melt(id_vars='year',value_vars=['New firms opening sites','Existing firms opening new sites'])

# Plot bar chart
chart = alt.Chart(site_expansion).mark_bar().encode(
    x = alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y = alt.Y('value:Q', title='Site expansion'),
    color=alt.Color('variable:N', legend=None)
)

# Add end labels: one row per series, taken at that series' latest year
end_point = site_expansion.groupby(
    ['variable']
)['year'].idxmax()

end_point_data = site_expansion.loc[end_point]

# NOTE(review): each label is placed at its own series' value; if the bars are
# stacked, the upper series' label will not sit at the top of its segment - confirm.
text_labels = alt.Chart(end_point_data).mark_text(
    align='left',
    dx=10,
    baseline='middle'
).encode(
    x=alt.X('year:O'),
    y=alt.Y('value:Q'),
    text='variable:N',
    color=alt.Color('variable:N')
)

chart = chart + text_labels

# Make sure the output folder exists before saving; save() does not create it
os.makedirs('Charts', exist_ok=True)
chart.save('Charts/site_expansion.png', scale_factor=2)
chart.save('Charts/site_expansion.json')

# Must be the last expression in the cell for the chart to be rendered
chart

In [54]:
# Site expansion by industry, one panel per industry.
# Built from industry_dynamism (not the raw industry_df) so it covers the same
# years as the whole-economy and regional site-expansion charts, which exclude
# the first and last year of the data.
site_expansion_industry = industry_dynamism.rename(columns={'site_exp_entrants':'New firms opening sites',
                                                  'site_exp_incumbents':'Existing firms opening new sites'})
site_expansion_industry = site_expansion_industry.melt(id_vars=['year','industry_name'],value_vars=['New firms opening sites','Existing firms opening new sites'])


chart = alt.Chart(site_expansion_industry).mark_bar().encode(
    x = alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y = alt.Y('value:Q', title='Site expansion'),
    color=alt.Color('variable:N'),
    facet=alt.Facet('industry_name:N', columns=3)
)

# Give every panel its own x axis rather than sharing one across the grid
chart = chart.resolve_axis(
    x='independent',
)

chart

In [56]:
# Site expansion by region, one panel per region.

# Readable series names for the two site-expansion count columns
site_series_labels = {
    'site_exp_entrants': 'New firms opening sites',
    'site_exp_incumbents': 'Existing firms opening new sites',
}

# Rename, then reshape to one row per year, region and series
site_expansion_region = pd.melt(
    region_dynamism.rename(columns=site_series_labels),
    id_vars=['year', 'region'],
    value_vars=list(site_series_labels.values()),
)

# Label every second year, horizontally
year_axis = alt.Axis(labelExpr="datum.value % 2 == 0 ? datum.label : ''", labelAngle=0)

chart = (
    alt.Chart(site_expansion_region)
    .mark_bar()
    .encode(
        alt.X('year:O', axis=year_axis),
        alt.Y('value:Q', title='Site expansion'),
        alt.Color('variable:N'),
        alt.Facet('region:N', columns=3),
    )
    # Each panel draws its own x axis
    .resolve_axis(x='independent')
)

chart

In [11]:
# FIGURE 3 - EXIT RATES
# Single line: whole-economy firm exit rate by year.

# Long format holding only the exit-rate series
exit_df = pd.melt(whole_economy_dynamism, id_vars=['year'], value_vars=['Exit rate'])

# Label every second year, horizontally; show rates as percentages
year_axis = alt.Axis(labelExpr="datum.value % 2 == 0 ? datum.label : ''", labelAngle=0)
percent_axis = alt.Axis(format='%')

chart = (
    alt.Chart(exit_df)
    .mark_line()
    .encode(
        alt.X('year:O', axis=year_axis),
        alt.Y('value:Q', axis=percent_axis, title='Market Sector Firm Exit Rate'),
        alt.Color('variable:N', title=None),
    )
)

chart

In [30]:
# FIGURE 3 - EXIT RATES ACROSS FIRM SIZES
# One line per employment size band, labelled at the right-hand end instead of
# using a legend.

firmsize_exit_df = firm_size_dynamism.melt(id_vars=['year','emp_sizeband'],
                                                 value_vars=['Exit rate'])

chart = alt.Chart(firmsize_exit_df).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%'), title='Firm Exit Rate'),
    color=alt.Color('emp_sizeband:O', title=None, legend=None)

)

# Add end labels: one row per size band, taken at that band's latest year
end_point = firmsize_exit_df.groupby(
    ['emp_sizeband']
)['year'].idxmax()

# Explicit copy: the label positions are adjusted below and those adjustments
# must stay out of the frame that feeds the lines
end_point_data = firmsize_exit_df.loc[end_point].copy()

# Vertical nudges (in rate units) applied to two of the labels
# (presumably to stop them overlapping - confirm against the rendered chart)
label_nudges = {'Large (250+)': -0.003, 'Medium (50-249)': 0.005}
for sizeband, offset in label_nudges.items():
    end_point_data.loc[end_point_data['emp_sizeband'] == sizeband, 'value'] += offset

text_labels = alt.Chart(end_point_data).mark_text(
    align='left',
    dx=10,
    baseline='middle'
).encode(
    x=alt.X('year:O'),
    y=alt.Y('value:Q'),
    text='emp_sizeband:O',
    color=alt.Color('emp_sizeband:O')
)

chart = chart + text_labels

# Make sure the output folder exists before saving; save() does not create it
os.makedirs('Charts', exist_ok=True)
chart.save('Charts/exit_rate_firmsize.png', scale_factor=2)
chart.save('Charts/exit_rate_firmsize.json')

# Render the chart in the notebook as well as saving it
chart

In [6]:
# Inspect the whole-economy table returned by calculate_dynamism_rates
whole_economy_dynamism

Unnamed: 0,year,n_firms,employment,n_entrants,n_exiters,n_entry_and_exit,n_incumbents,jc_entrants,jc_incumbents,jd_exiters,...,site_closure_incumbents,total_employment_lagged,Entry rate,Exit rate,Job creation rate,Job destruction rate,Entry job creation rate,Incumbent job creation rate,Exit job destruction rate,Incumbent job destruction rate
1,1998,1847011,17709729,234733,160782,48630,1402866,848394,1620884,1074088,...,21672,17612823.0,0.153417,0.113379,0.140198,0.118257,0.048169,0.092029,0.060983,0.057273
2,1999,1873060,17936146,187154,212458,48307,1425141,718589,1291775,1287866,...,6184,17709729.0,0.125709,0.139219,0.113517,0.111397,0.040576,0.072942,0.072721,0.038677
3,2000,1857200,18029246,199710,176270,45195,1436025,739561,1501027,1117071,...,21806,17936146.0,0.131868,0.119247,0.12492,0.111356,0.041233,0.083687,0.06228,0.049075
4,2001,1877929,18416010,196912,197356,45282,1438379,908243,1599402,1090562,...,17641,18029246.0,0.128969,0.129205,0.139088,0.11858,0.050376,0.088712,0.060488,0.058091
5,2002,1879029,18872566,200563,208480,43175,1426811,829886,2097098,1241772,...,23526,18416010.0,0.129715,0.133928,0.158937,0.141893,0.045063,0.113874,0.067429,0.074464
6,2003,1879213,18804479,203886,214205,47953,1413169,812769,1508126,1253052,...,25063,18872566.0,0.134013,0.139504,0.122977,0.128192,0.043066,0.079911,0.066395,0.061796
7,2004,1922606,18661685,247150,202611,58401,1414444,798028,1283703,1280367,...,16919,18804479.0,0.158925,0.135759,0.110704,0.1207,0.042438,0.068266,0.068088,0.052611
8,2005,1955539,18761138,237378,194044,56567,1467550,792623,1673070,1027365,...,17555,18661685.0,0.150314,0.128154,0.132126,0.109688,0.042473,0.089653,0.055052,0.054636
9,2006,1991156,18906423,235783,187612,50445,1517316,718538,1506004,1047028,...,18572,18761138.0,0.14375,0.119557,0.118572,0.111161,0.038299,0.080273,0.055808,0.055353
10,2007,2062766,19006496,234317,249264,75350,1503835,702730,1553796,1126645,...,15834,18906423.0,0.150122,0.157368,0.119352,0.121199,0.037169,0.082183,0.059591,0.061608


In [13]:
# JOB DESTRUCTION RATES
# Bars of job destruction from exiting vs incumbent firms, one panel per size band.

# Long format: one row per year, size band and destruction component
job_destruction_df = pd.melt(
    firm_size_dynamism,
    id_vars=['year', 'emp_sizeband'],
    value_vars=['Exit job destruction rate', 'Incumbent job destruction rate'],
)

# Label every second year, horizontally; show rates as percentages
year_axis = alt.Axis(labelExpr="datum.value % 2 == 0 ? datum.label : ''", labelAngle=0)
percent_axis = alt.Axis(format='%')

# Panels follow size_order and share one header title
sizeband_facet = alt.Facet(
    'emp_sizeband:O',
    columns=2,
    sort=size_order,
    header=alt.Header(title='Job destruction by firm size (employment)'),
)

chart = (
    alt.Chart(job_destruction_df)
    .mark_bar()
    .encode(
        alt.X('year:O', axis=year_axis),
        alt.Y('value:Q', axis=percent_axis, title='Job Destruction Rate'),
        alt.Color('variable:N', title=None),
        sizeband_facet,
    )
)

chart