# FULL UK DYNAMISM INVESTIGATION
# Will Shepherd, Nov 2025

RQ1: How has the composition of UK firms evolved over the past decade according to the BSD?
RQ2: To what extent has the rate of creative destruction in the UK declined between 1997 and 2023? 
RQ3: How have gaps between the most productive ‘frontier’ firms and ‘laggard’ firms evolved? 
RQ4: How are changes in business dynamism and productivity dispersion related?

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
import altair as alt
from pandas.api.types import CategoricalDtype
import os
import eco_style 
alt.themes.enable("report")

ThemeRegistry.enable('report')

In [3]:
# Import data
whole_economy_df = pd.read_excel('business_dynamism_BSD_1997_2023.xlsx', sheet_name='whole_economy')
firm_size_df = pd.read_excel('business_dynamism_BSD_1997_2023.xlsx', sheet_name='firm_size')
firm_age_df = pd.read_excel('business_dynamism_BSD_1997_2023.xlsx', sheet_name='firm_age')
industry_df = pd.read_excel('business_dynamism_BSD_1997_2023.xlsx', sheet_name='industry')
region_df = pd.read_excel('business_dynamism_BSD_1997_2023.xlsx', sheet_name='region')

In [4]:
# Define order for categorical variables

# 1. Employment sizeband
size_order = [
    'Large (250+)'
    'Medium (50-249)'
    'Small (10-49)'
    'Micro (0-9)'
]

size_order = [
    'Micro (0-9)'
    'Small (10-49)'
    'Medium (50-249)'
    'Large (250+)'
]

# 2. Age group
age_order = [
    'New (0-2 years)',
    'Young (3-5 years)',
    'Old (5-10 years)',
    'Mature (10+ years)'
]

In [63]:
# Write function to calculate rates for dynamism measures, apply this across dataframes
def calculate_dynamism_rates(df, group_by_cols=None):
    # Make a copy to avoid modifying the original
    df = df.copy()
    
    # Sort data
    sort_cols = group_by_cols + ['year'] if group_by_cols else ['year']
    df = df.sort_values(sort_cols)
    
    # Create lagged employment (with or without grouping)
    if group_by_cols is None:
        df['total_employment_lagged'] = df['employment'].shift(1)
        df['n_firms_lagged'] = df['n_firms'].shift(1)

    else:
        df['total_employment_lagged'] = df.groupby(group_by_cols)['employment'].shift(1)
        df['n_firms_lagged'] = df['n_firms'].shift(1)
    
    # Calculate rates (same regardless of grouping)
    df['Entry rate'] = (df['n_entrants'] + df['n_entry_and_exit']) / df['n_firms_lagged']
    df['Exit rate'] = (df['n_exiters'] + df['n_entry_and_exit']) / df['n_firms_lagged']
    df['Firm churn rate'] = (df['n_entrants'] + df['n_entry_and_exit'] + df['n_exiters']) / df['n_firms_lagged']
    df['Job creation rate'] = (df['jc_incumbents'] + df['jc_entrants']) / df['total_employment_lagged']
    df['Job destruction rate'] = (df['jd_incumbents'] + df['jd_exiters']) / df['total_employment_lagged']
    df['Entry job creation rate'] = (df['jc_entrants']) / df['total_employment_lagged']
    df['Incumbent job creation rate'] = (df['jc_incumbents']) / df['total_employment_lagged']
    df['Exit job destruction rate'] = (df['jd_exiters']) / df['total_employment_lagged']
    df['Incumbent job destruction rate'] = (df['jd_incumbents']) / df['total_employment_lagged']
    df['Job reallocation'] = df['jc_incumbents'] + df['jc_entrants'] + df['jd_incumbents'] + df['jd_exiters']
    df['Job reallocation rate'] = df['Job reallocation'] / df['total_employment_lagged']
    df['Incumbent job reallocation'] = df['jc_incumbents'] + df['jd_incumbents']
    df['Incumbent job reallocation rate'] = df['Incumbent job reallocation'] / df['total_employment_lagged']
    df['Entry and exit job reallocation'] = df['jc_entrants'] + df['jd_exiters']
    df['Entry and exit job reallocation rate'] = df['Entry and exit job reallocation'] / df['total_employment_lagged']

    # We can't use the first/last year for dynamic variables due to no backward/forward looking observatinons
    years = df['year'].unique()
    df = df[~df['year'].isin([years.min(), years.max()])]

    return df

# Apply function to dataframes
whole_economy_dynamism = calculate_dynamism_rates(whole_economy_df)
firm_size_dynamism = calculate_dynamism_rates(firm_size_df, group_by_cols=['emp_sizeband'])
firm_age_dynamism = calculate_dynamism_rates(firm_age_df, group_by_cols=['age_group'])
industry_dynamism = calculate_dynamism_rates(industry_df, group_by_cols=['industry_name'])
region_dynamism = calculate_dynamism_rates(region_df, group_by_cols=['region'])

## The productivity slowdown - decline in output per hour worked

In [133]:
import pandas as pd
import altair as alt
import numpy as np

# Load and prepare data
df = pd.read_csv('output_per_hour_worked_ons_dec25.csv', skiprows=6)
df = df.rename(columns={df.columns[0]: 'year', df.columns[1]: 'output_per_hour_worked'})

# Convert "1971 Q1" format to datetime
def quarter_to_date(quarter_str):
    """Convert '1971 Q1' to '1971-01-01' (start of quarter)"""
    year, quarter = quarter_str.split()
    quarter_num = int(quarter[1])  # Extract number from 'Q1', 'Q2', etc.
    
    # Map quarter to month (start of each quarter)
    month_map = {1: 1, 2: 4, 3: 7, 4: 10}
    month = month_map[quarter_num]
    
    return pd.Timestamp(year=int(year), month=month, day=1)

df['date'] = df['year'].apply(quarter_to_date)

# Rebase to 2008 = 100
value_2008 = df[df['date'].dt.year == 2008]['output_per_hour_worked'].mean()
print(f"2008 base value: {value_2008:.2f}")

df['value'] = (df['output_per_hour_worked'] / value_2008) * 100

print(f"Data range: {df['date'].min().date()} to {df['date'].max().date()}")

# Sort data by date to ensure smooth lines
df = df.sort_values('date')

# Calculate pre-recession trend
recession_start = pd.to_datetime('2008-01-01')
pre_recession = df[df['date'] < recession_start].copy()

# Use the average value at 2008 as the baseline
value_2008_actual = df[df['date'].dt.year == 2008]['value'].mean()

# Fit trend to pre-2008 data
pre_recession['time'] = (pre_recession['date'] - pre_recession['date'].min()).dt.days
z = np.polyfit(pre_recession['time'], pre_recession['value'], 1)

# Calculate what the trend value would be at 2008
time_at_2008 = (recession_start - pre_recession['date'].min()).days
trend_value_at_2008 = np.polyval(z, time_at_2008)

# Adjust the trend to pass through the actual 2008 value
# Shift the intercept so trend matches actual at 2008
adjustment = value_2008_actual - trend_value_at_2008
z_adjusted = [z[0], z[1] + adjustment]

# Extend adjusted trend to all dates
df['time'] = (df['date'] - pre_recession['date'].min()).dt.days
df['trend'] = np.polyval(z_adjusted, df['time'])

# Create base chart
base = alt.Chart(df).encode(
    x=alt.X('year(date):T', 
            title='', 
            axis=alt.Axis(
                grid=True, 
                gridDash=[2, 2], 
                gridOpacity=0.4,
                tickCount=10
            ))
)

# Actual productivity line (blue)
actual_line = alt.Chart(df).mark_line(
    color='#5B9BD5',
    strokeWidth=2.5
).encode(
    x=alt.X('date:T', 
            title='', 
            axis=alt.Axis(
                grid=True, 
                gridDash=[2, 2], 
                gridOpacity=0.4,
                tickCount=10,
                format='%Y'
            )),
    y=alt.Y('value:Q', 
            title='', 
            scale=alt.Scale(domain=[0, 160]),
            axis=alt.Axis(
                grid=True, 
                gridDash=[2, 2], 
                gridOpacity=0.4
            ))
)

# Pre-recession trend line (red dashed) - show for entire period
trend_line = alt.Chart(df).mark_line(
    color='#C94545',
    strokeDash=[8, 4],
    strokeWidth=2.5,
    interpolate='natural'
).encode(
    x=alt.X('date:T', title='', axis=alt.Axis(format='%Y')),
    y='trend:Q'
)

# Vertical line at recession start
recession_line = alt.Chart(pd.DataFrame({
    'x': [recession_start]
})).mark_rule(
    strokeDash=[6, 4],
    strokeWidth=1.5,
    color='#808080'
).encode(
    x=alt.X('x:T', title='')
)

# Label: "Pre-recession trend"
trend_label_date = pd.to_datetime('2016-01-01')
trend_value_at_label = df[df['date'] == trend_label_date]['trend'].values[0] if len(df[df['date'] == trend_label_date]) > 0 else 115

trend_label = alt.Chart(pd.DataFrame({
    'x': [trend_label_date],
    'y': [trend_value_at_label + 14],  # Position above the trend line
    'text': ['Pre-recession trend']
})).mark_text(
    align='center',
    color='#C94545',
    fontSize=12
).encode(
    x=alt.X('x:T', title=''),
    y='y:Q',
    text='text:N'
)

# Label: "Productivity gap" 
latest_date = df['date'].max()
latest_actual = df[df['date'] == latest_date]['value'].values[0]
latest_trend = df[df['date'] == latest_date]['trend'].values[0]

gap_label = alt.Chart(pd.DataFrame({
    'x': [latest_date + pd.DateOffset(years=1)],
    'y': [latest_trend - 8],
    'text': ['Productivity gap']
})).mark_text(
    align='left',
    color='#5B9BD5',
    fontSize=12
).encode(
    x=alt.X('x:T', title=''),
    y='y:Q',
    text='text:N'
)

# Line showing the gap with arrows
gap_line = alt.Chart(pd.DataFrame({
    'x': [latest_date],
    'y': [latest_actual],
    'y2': [latest_trend]
})).mark_rule(
    color='#5B9BD5',
    strokeWidth=2,
    strokeDash=[3, 3]
).encode(
    x=alt.X('x:T', title=''),
    y='y:Q',
    y2='y2:Q'
)

# Arrow markers using triangle shapes
# Top arrow (pointing down at trend line)
gap_arrow_top = alt.Chart(pd.DataFrame({
    'x': [latest_date],
    'y': [latest_trend - 1]  # Slightly below the trend line
})).mark_point(
    shape='triangle-up',
    filled=True,
    color='#5B9BD5',
    size=100
).encode(
    x=alt.X('x:T', title='', axis=alt.Axis(format='%Y')),
    y='y:Q'
)

# Bottom arrow (pointing up at actual line)  
gap_arrow_bottom = alt.Chart(pd.DataFrame({
    'x': [latest_date],
    'y': [latest_actual + 1]  # Slightly above the actual line
})).mark_point(
    shape='triangle-down',
    filled=True,
    color='#5B9BD5',
    size=100
).encode(
    x=alt.X('x:T', title='', axis=alt.Axis(format='%Y')),
    y='y:Q'
)

# Combine all layers
chart = (
    recession_line + 
    actual_line + 
    trend_line + 
    trend_label + 
    gap_label +
    gap_line +
    gap_arrow_top +
    gap_arrow_bottom
).properties(
    width=600,
    height=400

).configure_view(
    strokeWidth=0,
    stroke=None
).configure_axis(
    gridColor='#D3D3D3',
    domainColor='#000000',
    tickColor='#000000'
)

# Save and display
chart.save('Paper charts/productivity_chart.png', scale_factor=2)
chart.save('Paper charts/productivity_chart.json')
print("✓ Chart saved as 'productivity_chart.html'")
print(f"✓ Latest productivity (2008=100): {latest_actual:.1f}")
print(f"✓ Trend projection: {latest_trend:.1f}")
print(f"✓ Productivity gap: {latest_trend - latest_actual:.1f} points")

chart

2008 base value: 92.68
Data range: 1971-01-01 to 2025-04-01
✓ Chart saved as 'productivity_chart.html'
✓ Latest productivity (2008=100): 106.8
✓ Trend projection: 126.7
✓ Productivity gap: 19.8 points


## RQ1: How has the composition of UK firms evolved over the past decade according to the BSD?

In [170]:
# Descriptive statistics on the whole economy business population
#   - number of firms
#   - total employees
#   - total turnover
#   - average turnover per employee (plus other deciles?)

# UPDATE WITH TURNOVER AND TURNOVER PER EMPLOYEE AFTER SECURELAB EXPORT

chart1 = alt.Chart(whole_economy_df).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('n_firms:Q',title='Total number of firms in BSD', scale=alt.Scale(domainMin=1500000, domainMax=2500000),axis=alt.Axis(format=".2s"))
)
chart1

chart2 = alt.Chart(whole_economy_df).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('employment:Q',title='Total employment in BSD', scale=alt.Scale(domainMin=15000000, domainMax=22000000), axis=alt.Axis(format=".2s"))
)
chart2

final_chart = chart1 | chart2
final_chart
#final_chart.save('Paper charts/fig1_bsd_basic_facts.png', scale_factor=2)
#final_chart.save('Paper charts/fig1_bsd_basic_facts.json')

In [172]:
# FIGURE 2 - ECONOMIC CONTRIBUTION BY FIRM SIZE 2023
firmsize_2023 = firm_size_df[firm_size_df['year']==2023]

firmsize_2023['Total_employment'] = firmsize_2023['employment'].sum()
firmsize_2023['Total_firms'] = firmsize_2023['n_firms'].sum()

firmsize_2023['Share_of_employment'] = firmsize_2023['employment']/firmsize_2023['Total_employment']
firmsize_2023['Share_of_firms'] = firmsize_2023['n_firms']/firmsize_2023['Total_firms']

firmsize_share_of_activity = firmsize_2023.melt(id_vars='emp_sizeband',
                                                value_vars=['Share_of_employment','Share_of_firms'],
                                                value_name='Share of activity')

label_map = {
    'Share_of_employment': 'Employment',
    'Share_of_firms': 'Firms'
}

firmsize_share_of_activity['variable'] = firmsize_share_of_activity['variable'].map(label_map)

sizeband_order = ['Micro (0-9)', 'Small (10-49)', 'Medium (50-249)', 'Large (250+)']

chart = alt.Chart(firmsize_share_of_activity).mark_bar().encode(
    x=alt.X('emp_sizeband:O', sort=sizeband_order),
    y=alt.Y('Share of activity:Q', axis=alt.Axis(format='%')),
    color=alt.Color('variable:N').legend(title=None, orient='bottom', 
        direction='horizontal'),
    xOffset=alt.XOffset('variable:N')
)

firmsize_2023
#chart.save('Paper charts/fig2_firmsize_share_of_activity.json')
#chart.save('Paper charts/fig2_firmsize_share_of_activity.png', scale_factor=2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  firmsize_2023['Total_employment'] = firmsize_2023['employment'].sum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  firmsize_2023['Total_firms'] = firmsize_2023['n_firms'].sum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  firmsize_2023['Share_of_employment'] = firmsize_2023['employment']/firmsi

Unnamed: 0,year,emp_sizeband,n_firms,employment,n_entrants,n_exiters,n_entry_and_exit,n_incumbents,jc_entrants,jc_incumbents,jd_exiters,jd_incumbents,site_exp_entrants,site_exp_incumbents,site_closure_exit,site_closure_incumbents,Total_employment,Total_firms,Share_of_employment,Share_of_firms
104,2023,Micro (0-9),2158169,5002259,0,0,0,0,0,0,0,0,0,0,0,0,21982815,2405540,0.227553,0.897166
105,2023,Small (10-49),205944,3989788,0,0,0,0,0,0,0,0,0,0,0,0,21982815,2405540,0.181496,0.085612
106,2023,Medium (50-249),34100,3321052,0,0,0,0,0,0,0,0,0,0,0,0,21982815,2405540,0.151075,0.014176
107,2023,Large (250+),7327,9669716,0,0,0,0,0,0,0,0,0,0,0,0,21982815,2405540,0.439876,0.003046


In [None]:
# FIRM COUNTS BY FIRM SIZE BAND
chart = (
    alt.Chart(firm_size_df)
    .mark_line()
    .encode(
        x=alt.X(
            'year:O',
            axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",
                labelAngle=0
            )
        ),
        y=alt.Y(
            'n_firms:Q',
            axis=alt.Axis(format=".2s")
        ),
        facet=alt.Facet('emp_sizeband:N', columns=2)
    )
    .resolve_scale(y='independent')
    .properties(title='Total number of firms in BSD')x
)

#chart.save('Exploration charts/firm_count_by_size.png', scale_factor=2)
#chart.save('Exploration charts/firm_count_by_size.json')

# EMPLOYMENT BY FIRM SIZE BAND
chart = (
    alt.Chart(firm_size_df)
    .mark_line()
    .encode(
        x=alt.X(
            'year:O',
            axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",
                labelAngle=0
            )
        ),
        y=alt.Y(
            'employment:Q',
            axis=alt.Axis(format=".2s")
        ),
        facet=alt.Facet('emp_sizeband:N', columns=2)
    )
    .resolve_scale(y='independent')
    .properties(title='Total employment in BSD firms')
)

chart
#chart.save('Exploration charts/employment_by_size.png', scale_factor=2)
#chart.save('Exploration charts/employment_by_size.json')

In [32]:
firm_size_df['total_firms_year'] = firm_size_df.groupby('year')['n_firms'].transform('sum')
firm_size_df['share'] = firm_size_df['n_firms'] / firm_size_df['total_firms_year']

chart = (
    alt.Chart(firm_size_df)
    .mark_line()
    .encode(
        x=alt.X(
            'year:O',
            axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",
                labelAngle=0
            )
        ),
        y=alt.Y(
            'share:Q',
            axis=alt.Axis(format="%", title="Share of total firms")
        ),
        facet=alt.Facet('emp_sizeband:N', columns=2)
    )
    .resolve_scale(y='independent')
    .properties(title='Share of total firms in BSD')
)

chart

In [10]:
chart1 = alt.Chart(firm_age_df).mark_bar().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('employment:Q',title='Total number of firms in BSD',axis=alt.Axis(format=".2s"), stack='normalize'),
    color='age_group:N'
)
chart1

In [142]:
# How much has the total number of firms changed between 2000 and 2023?

# Filter for years 2000 and 2023
df_2000 = whole_economy_df[whole_economy_df['year'] == 2000]
df_2023 = whole_economy_df[whole_economy_df['year'] == 2023]

# Sum total firms across all regions for each year
total_firms_2000 = df_2000['n_firms'].sum()
total_firms_2023 = df_2023['n_firms'].sum()

# Calculate change
absolute_change = total_firms_2023 - total_firms_2000
percent_change = (absolute_change / total_firms_2000) * 100

# Display results
print(f"Total firms in 2000: {total_firms_2000:,}")
print(f"Total firms in 2023: {total_firms_2023:,}")
print(f"Absolute change: {absolute_change:,}")
print(f"Percentage change: {percent_change:.2f}%")

Total firms in 2000: 1,857,200
Total firms in 2023: 2,405,540
Absolute change: 548,340
Percentage change: 29.53%


In [None]:
# Which regions saw the biggest increase in number of firms?

# Filter for years 2000 and 2023
df_filtered = region_df[region_df['year'].isin([2000, 2023])].copy()

# Pivot to get 2000 and 2023 values side by side
pivot = df_filtered.pivot(index='region', columns='year', values='n_firms')

# Calculate % change
pivot['% Change 2000-2023'] = ((pivot[2023] - pivot[2000]) / pivot[2000]) * 100

# Create final table with clean column names
result = pivot.rename(columns={
    2000: 'Firms 2000',
    2023: 'Firms 2023'
})

# Round the percentage to 2 decimal places
result['% Change 2000-2023'] = result['% Change 2000-2023'].round(2)

# Reset index to make region a column
result = result.reset_index()
result_sorted = result.sort_values('% Change 2000-2023', ascending=False)

# Display the table
print(result_sorted)

year                    region  Firms 2000  Firms 2023  % Change 2000-2023
2                       London      284771      458225               60.91
3                   North East       49462       64853               31.12
1              East Of England      186611      237639               27.34
11    Yorkshire and The Humber      134082      170380               27.07
4                   North West      183120      232379               26.90
0                East Midlands      126873      160548               26.54
10               West Midlands      153463      193298               25.96
7                   South East      287378      358263               24.67
5             Northern Ireland       57369       71038               23.83
8                   South West      175204      208452               18.98
6                     Scotland      134012      154390               15.21
9                        Wales       84855       96075               13.22


In [149]:
# Which industries saw the biggest increase in number of firms?

# Filter for years 2000 and 2023
df_filtered = industry_df[industry_df['year'].isin([2000, 2023])].copy()

# Pivot to get 2000 and 2023 values side by side
pivot = df_filtered.pivot(index='industry_name', columns='year', values='n_firms')

# Calculate % change
pivot['% Change 2000-2023'] = ((pivot[2023] - pivot[2000]) / pivot[2000]) * 100

# Create final table with clean column names
result = pivot.rename(columns={
    2000: 'Firms 2000',
    2023: 'Firms 2023'
})

# Round the percentage to 2 decimal places
result['% Change 2000-2023'] = result['% Change 2000-2023'].round(2)

# Reset index to make region a column
result = result.reset_index()
result_sorted = result.sort_values('% Change 2000-2023', ascending=False)

# Display the table
print(result_sorted)

year         industry_name  Firms 2000  Firms 2023  % Change 2000-2023
5           Other business      317544      604003               90.21
1             Construction      207289      336821               62.49
7     Recreation & Culture       72213      106429               47.38
9                Transport       50014       71649               43.26
2             Hospitality       129542      178196               37.56
11     z. Other industries      257673      299330               16.17
0              Automotives       76664       88855               15.90
3                       IT      129229      140934                9.06
8                   Retail      232281      225401               -2.96
4           Manufacturing       175984      164511               -6.52
6            Other service       87835       82099               -6.53
10               Wholesale      120932      107312              -11.26


## RQ2: To what extent has the rate of creative destruction in the UK declined between 1997 and 2023? 

### MEASURE: ENTRY AND EXIT (EXTENSIVE MARGIN)

In [75]:
# CHURN RATE TOTAL (ENTRY + EXIT)
entry_exit_churn = whole_economy_dynamism.melt(id_vars=['year'],
                                               value_vars=['Firm churn rate'])

chart = alt.Chart(entry_exit_churn).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('value:Q', title='Firm churn rate (entry + exit)', axis=alt.Axis(format='%'))
).properties(
    width=600, height=400
)


chart.save('Exploration charts/firm_churn_rate.png', scale_factor=2)
chart.save('Exploration charts/firm_churn_rate.json')
entry_exit_churn['value'].mean()


np.float64(0.24593199259765314)

In [80]:
# ENTRY AND EXIT PLUS EMPLOYMENT WEIGHTED
entry_exit_rates = whole_economy_dynamism.melt(id_vars='year',value_vars=['Entry rate','Exit rate'])
emp_weighted_entry_exit_rates = whole_economy_dynamism.melt(id_vars='year',value_vars=['Entry job creation rate','Exit job destruction rate'])

entry_exit_rates['variable'] = entry_exit_rates['variable'].replace({
    'Entry rate': 'Entry', 
    'Exit rate': 'Exit'
})

emp_weighted_entry_exit_rates['variable'] = emp_weighted_entry_exit_rates['variable'].replace({
    'Entry job creation rate': 'Entry', 
    'Exit job destruction rate': 'Exit'
})

counts_based = alt.Chart(entry_exit_rates).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%'), title='Entry and exit rate (number of firms)'),
    color=alt.Color('variable:N', legend=alt.Legend(title=None, orient='bottom', direction='horizontal', labelFontSize=12,))
)

emp_weighted = alt.Chart(emp_weighted_entry_exit_rates).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%'), title='Entry and exit rate (employment weighted)'),
    color=alt.Color('variable:N', legend=alt.Legend(title=None, orient='bottom', direction='horizontal', labelFontSize=12,))
)

final_chart = counts_based | emp_weighted
final_chart

counts_based = counts_based.properties(height=400, width=600)

counts_based.save('Exploration charts/entry_exit.png', scale_factor=2)
counts_based.save('Exploration charts/entry_exit.json')

In [156]:
# FIRM ENTRY AND EXIT RATES BY SIZE

entry_firmsize_chart = alt.Chart(firm_size_dynamism).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('Entry rate:Q', axis=alt.Axis(format='%')),
    color=alt.Color('emp_sizeband:O', sort=size_order, legend=alt.Legend(orient="top"))
).properties(height=400, width = 600)

exit_firmsize_chart = alt.Chart(firm_size_dynamism).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  # Show every 2nd year
            labelAngle=0)),
    y=alt.Y('Exit rate:Q', axis=alt.Axis(format='%')),
    color=alt.Color('emp_sizeband:O', title='Firm size (employment)', sort=size_order, legend=alt.Legend(orient="top"))
).properties(height=400, width = 600)

combined_chart = alt.hconcat(entry_firmsize_chart, exit_firmsize_chart)

combined_chart
#exit_firmsize_chart.save('Exploration Charts/exit_rate_firmsize.png', scale_factor=2)
#exit_firmsize_chart.save('Exploration Charts/exit_rate_firmsize.json')

In [155]:
# AVERAGE FIRM ENTRY AND EXIT RATES
average_entry_rate = whole_economy_dynamism['Entry rate'].mean()
print(f"Average entry rate: {average_entry_rate:.2f}")

average_entry_rate = whole_economy_dynamism['Exit rate'].mean()
print(f"Average entry rate: {average_entry_rate:.2f}")


Average entry rate: 0.14
Average entry rate: 0.13


In [168]:
# AVERAGE FIRM ENTRY AND EXIT RATES BY INDUSTRY
# Calculate average entry and exit rates by industry
industry_averages = industry_dynamism.groupby('industry_name')[['Entry rate', 'Exit rate']].mean()

print(industry_averages.to_string(float_format=lambda x: f'{x:.2%}'))

                      Entry rate  Exit rate
industry_name                              
Automotives               10.50%     10.36%
Construction              14.89%     12.95%
Hospitality               18.59%     17.58%
IT                        17.92%     15.60%
Manufacturing             10.46%     11.13%
Other business            18.55%     15.78%
Other service             12.81%     11.86%
Recreation & Culture      13.14%     11.71%
Retail                    13.31%     13.45%
Transport                 16.09%     14.91%
Wholesale                 10.25%     11.02%
z. Other industries        9.25%      8.95%


In [162]:
# Entry and exit rates by industry

industry_entry_exit = industry_dynamism.melt(id_vars=['year','industry_name'],value_vars=['Entry rate','Exit rate'])

chart = alt.Chart(industry_entry_exit).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",    
                labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%')),
    color=alt.Color('variable:N'),
    facet=alt.Facet('industry_name:N', columns=3)).resolve_scale(
    y='independent'
)

chart

In [161]:
# Entry and exit rates by region

region_entry_exit = region_dynamism.melt(id_vars=['year','region'],value_vars=['Entry rate','Exit rate'])

chart = alt.Chart(region_entry_exit).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",    
                labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%')),
    color=alt.Color('variable:N'),
    facet=alt.Facet('region:N', columns=3)).resolve_scale(
    y='independent'
)

chart

### MEASURE: JOB REALLOCATION

In [55]:
# JOB REALLOCATION RATES BY INCUMBENTS VS ENTRY AND EXIT OVER TIME
extensive_intensive_reallocation = whole_economy_df.copy()
extensive_intensive_reallocation['extensive_reallocation'] = extensive_intensive_reallocation['jc_entrants'] + extensive_intensive_reallocation['jd_exiters']
extensive_intensive_reallocation['intensive_reallocation'] = extensive_intensive_reallocation['jc_incumbents'] + extensive_intensive_reallocation['jd_incumbents']
extensive_intensive_reallocation['total_employment_lagged'] = extensive_intensive_reallocation['employment'].shift(1)
extensive_intensive_reallocation['extensive_reallocation_rate'] = extensive_intensive_reallocation['extensive_reallocation'] / extensive_intensive_reallocation['total_employment_lagged']
extensive_intensive_reallocation['intensive_reallocation_rate'] = extensive_intensive_reallocation['intensive_reallocation'] / extensive_intensive_reallocation['total_employment_lagged']
extensive_intensive_reallocation['total_reallocation'] = extensive_intensive_reallocation['jc_entrants'] + extensive_intensive_reallocation['jd_exiters'] + extensive_intensive_reallocation['jc_incumbents'] + extensive_intensive_reallocation['jd_incumbents']
extensive_intensive_reallocation['total_reallocation_rate'] = extensive_intensive_reallocation['total_reallocation'] / extensive_intensive_reallocation['total_employment_lagged']
extensive_intensive_reallocation = extensive_intensive_reallocation[extensive_intensive_reallocation['year'].isin(range(1998,2023))]

# Store the last year for the end labels
max_year = extensive_intensive_reallocation['year'].max()

extensive_intensive_reallocation = extensive_intensive_reallocation.melt(
    id_vars=['year'],
    value_vars=['extensive_reallocation_rate', 'intensive_reallocation_rate','total_reallocation_rate'])

# Define neater labels for the chart
label_map = {
    'extensive_reallocation_rate': 'Extensive (Entry/Exit)',
    'intensive_reallocation_rate': 'Intensive (Incumbents)',
    'total_reallocation_rate': 'Total'
}

extensive_intensive_reallocation['display_label'] = extensive_intensive_reallocation['variable'].map(label_map)

# Change colour of lines
color_scale = alt.Scale(
    domain=['extensive_reallocation_rate', 'intensive_reallocation_rate', 'total_reallocation_rate'],
    range=['#1f77b4', '#ff7f0e', 'grey']  # Blue, Orange, Black
)

dash_scale = alt.Scale(
    domain=['extensive_reallocation_rate', 'intensive_reallocation_rate', 'total_reallocation_rate'],
    range=[[0, 0], [0, 0], [6, 4]]  # [0,0] is solid, [6,4] is 6px dash and 4px gap
)

chart = alt.Chart(extensive_intensive_reallocation).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  
            labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%'), title='Job reallocation rate'),
    color=alt.Color('variable:N', legend=None, scale=color_scale),
    strokeDash=alt.StrokeDash('variable:N', scale=dash_scale, legend=None))

labels = chart.mark_text(
    align='left',
    dx=5,
    baseline='middle'
).encode(
    text='display_label:N'  # Use the new descriptive column here
).transform_filter(
    alt.datum.year == int(max_year)
)

chart = (chart + labels).properties(
    width=600, height=400
).configure_view(
    clip=False
)

chart
chart.save('Exploration charts/job_reallocation_rate.png', scale_factor=2)
chart.save('Exploration charts/job_reallocation_rate.json')

In [56]:
# On average, what is the contribution of existing firms to job reallocation?
df_pivot = extensive_intensive_reallocation.pivot(
    index='year', 
    columns='variable', 
    values='value'
)

# 2. Calculate the share for each year
# Share = Intensive / (Intensive + Extensive) 
# Note: Since intensive + extensive = total, you can also use Intensive / Total
df_pivot['extensive_share'] = (
    df_pivot['extensive_reallocation_rate'] / 
    (df_pivot['intensive_reallocation_rate'] + df_pivot['extensive_reallocation_rate'])
)

# 3. Calculate the average over the whole period
average_intensive_share = df_pivot['extensive_share'].mean()
print(average_intensive_share)

df_pivot = df_pivot.reset_index()

chart = alt.Chart(df_pivot).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  
            labelAngle=0)),
    y=alt.Y('extensive_share:Q'))

chart

0.40554533289254735


In [57]:
# JOB REALLOCATION RATES ACROSS THE FIRM SIZE DISTRIBUTION

reallocation_firmsize = firm_size_dynamism.copy()

reallocation_firmsize = reallocation_firmsize.melt(id_vars=['year','emp_sizeband'],
                                                   value_vars=['Job reallocation rate'])

chart = alt.Chart(reallocation_firmsize).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  
            labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%')),
    color=alt.Color('emp_sizeband:N')
)

chart

In [58]:
# JOB REALLOCATION RATES ACROSS THE FIRM SIZE DISTRIBUTION - BY INCUMBENT VS ENTRY/EXIT

reallocation_firmsize = firm_size_dynamism.copy()

incumbent_reallocation_firmsize = reallocation_firmsize.melt(id_vars=['year','emp_sizeband'],
                                                   value_vars=['Incumbent job reallocation rate'])

entryexit_reallocation_firmsize = reallocation_firmsize.melt(id_vars=['year','emp_sizeband'],
                                                   value_vars=['Entry and exit job reallocation rate'])

chart1 = alt.Chart(incumbent_reallocation_firmsize).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  
            labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%'), title='Job reallocation from incumbents'),
    color=alt.Color('emp_sizeband:N')
)

chart2 = alt.Chart(entryexit_reallocation_firmsize).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  
            labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%'), title='Job reallocation from entry and exit'),
    color=alt.Color('emp_sizeband:N')
)

chart = chart1 | chart2
chart

chart.save('Exploration charts/reallocation_firmsize_intensive_extensive.png', scale_factor=2)
chart.save('Exploration charts/reallocation_firmsize_intensive_extensive.json')

In [59]:
# JOB REALLOCATION RATES ACROSS INDUSTRIES

reallocation_industry = industry_dynamism.copy()

reallocation_industry = reallocation_industry.melt(id_vars=['year','industry_name'],
                                                   value_vars=['Job reallocation rate'])

chart = alt.Chart(reallocation_industry).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  
            labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%')),
    color=alt.Color('industry_name:N')
)

chart

In [60]:
# JOB REALLOCATION RATES ACROSS INDUSTRIES - BY INCUMBENT VS ENTRY/EXIT

reallocation_industry = industry_dynamism.copy()

incumbent_reallocation_industry = reallocation_industry.melt(id_vars=['year','industry_name'],
                                                   value_vars=['Incumbent job reallocation rate'])

entryexit_reallocation_industry = reallocation_industry.melt(id_vars=['year','industry_name'],
                                                   value_vars=['Entry and exit job reallocation rate'])

chart1 = alt.Chart(incumbent_reallocation_industry).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  
            labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%'), title='Job reallocation from incumbents'),
    color=alt.Color('industry_name:N')
)

chart2 = alt.Chart(entryexit_reallocation_industry).mark_line().encode(
    x=alt.X('year:O', axis=alt.Axis(
                labelExpr="datum.value % 2 == 0 ? datum.label : ''",  
            labelAngle=0)),
    y=alt.Y('value:Q', axis=alt.Axis(format='%'), title='Job reallocation from entry and exit'),
    color=alt.Color('industry_name:N')
)

chart = chart1 | chart2
chart

In [61]:
# CONTRIBUTION OF THE INTENSIVE VS EXTENSIVE MARGIN TO JOB REALLOCATION OVER TIME

whole_economy_dynamism['Entry/exit reallocation contribution'] = whole_economy_dynamism['Entry and exit job reallocation'] / whole_economy_dynamism['Job reallocation']

whole_economy_dynamism

Unnamed: 0,year,n_firms,employment,n_entrants,n_exiters,n_entry_and_exit,n_incumbents,jc_entrants,jc_incumbents,jd_exiters,...,Incumbent job creation rate,Exit job destruction rate,Incumbent job destruction rate,Job reallocation,Job reallocation rate,Incumbent job reallocation,Incumbent job reallocation rate,Entry and exit job reallocation,Entry and exit job reallocation rate,Entry/exit reallocation contribution
1,1998,1847011,17709729,234733,160782,48630,1402866,848394,1620884,1074088,...,0.092029,0.060983,0.057273,4552109,0.258454,2629627,0.149302,1922482,0.109152,0.422328
2,1999,1873060,17936146,187154,212458,48307,1425141,718589,1291775,1287866,...,0.072942,0.072721,0.038677,3983183,0.224915,1976728,0.111618,2006455,0.113297,0.503732
3,2000,1857200,18029246,199710,176270,45195,1436025,739561,1501027,1117071,...,0.083687,0.06228,0.049075,4237882,0.236276,2381250,0.132763,1856632,0.103513,0.438104
4,2001,1877929,18416010,196912,197356,45282,1438379,908243,1599402,1090562,...,0.088712,0.060488,0.058091,4645544,0.257667,2646739,0.146803,1998805,0.110865,0.430263
5,2002,1879029,18872566,200563,208480,43175,1426811,829886,2097098,1241772,...,0.113874,0.067429,0.074464,5540087,0.30083,3468429,0.188338,2071658,0.112492,0.37394
6,2003,1879213,18804479,203886,214205,47953,1413169,812769,1508126,1253052,...,0.079911,0.066395,0.061796,4740205,0.251169,2674384,0.141707,2065821,0.109462,0.435808
7,2004,1922606,18661685,247150,202611,58401,1414444,798028,1283703,1280367,...,0.068266,0.068088,0.052611,4351424,0.231404,2273029,0.120877,2078395,0.110527,0.477636
8,2005,1955539,18761138,237378,194044,56567,1467550,792623,1673070,1027365,...,0.089653,0.055052,0.054636,4512660,0.241814,2692672,0.144289,1819988,0.097525,0.403307
9,2006,1991156,18906423,235783,187612,50445,1517316,718538,1506004,1047028,...,0.080273,0.055808,0.055353,4310050,0.229733,2544484,0.135625,1765566,0.094108,0.409639
10,2007,2062766,19006496,234317,249264,75350,1503835,702730,1553796,1126645,...,0.082183,0.059591,0.061608,4547957,0.240551,2718582,0.143791,1829375,0.096759,0.402241
