# Link projected demographic data to smaller areas

The age projection data is by local authority. Work out how to split the data across the MSOAs making up each local authority.

Split by current proportions of MSOA sizes?

## Code setup

In [114]:
import os
import polars as pl

## Load data

LSOA to local authority lookup:

In [151]:
# Read the lookup file that links each LSOA to its local authority district.
path_to_lookup = os.path.join('..', 'data', 'LSOA11_WD19_LAD19_EW_LU.csv')
df_lsoa_auth_lookup = pl.read_csv(path_to_lookup)

# Retain only the columns whose names mention LSOA or LAD:
cols_to_keep = [
    name for name in df_lsoa_auth_lookup.columns
    if any(tag in name for tag in ('LSOA', 'LAD'))
]
df_lsoa_auth_lookup = df_lsoa_auth_lookup.select(cols_to_keep)

In [152]:
df_lsoa_auth_lookup.head()

LSOA11CD,LSOA11NM,LAD19CD,LAD19NM
str,str,str,str
"""E01012057""","""Middlesbrough 009E""","""E06000002""","""Middlesbrough"""
"""E01012054""","""Middlesbrough 009B""","""E06000002""","""Middlesbrough"""
"""E01012053""","""Middlesbrough 009A""","""E06000002""","""Middlesbrough"""
"""E01033469""","""Middlesbrough 006E""","""E06000002""","""Middlesbrough"""
"""E01012026""","""Middlesbrough 006A""","""E06000002""","""Middlesbrough"""


LSOA to MSOA lookup:

In [153]:
# Relative path to the LSOA -> MSOA lookup file.
# (Reuses the name `path_to_lookup` from the previous lookup cell.)
path_to_lookup = os.path.join('..', 'data', 'lsoa_to_msoa.csv')

df_lsoa_msoa_lookup = pl.read_csv(path_to_lookup)

In [154]:
df_lsoa_msoa_lookup.head()

lsoa11cd,msoa11cd,ladcd,lsoa11nm,msoa11nm,country
str,str,str,str,str,str
"""E01000001""","""E02000001""","""E09000001""","""City of London 001A""","""City of London 001""","""E"""
"""E01000002""","""E02000001""","""E09000001""","""City of London 001B""","""City of London 001""","""E"""
"""E01000003""","""E02000001""","""E09000001""","""City of London 001C""","""City of London 001""","""E"""
"""E01000005""","""E02000001""","""E09000001""","""City of London 001E""","""City of London 001""","""E"""
"""E01000006""","""E02000017""","""E09000002""","""Barking and Dagenham 016A""","""Barking and Dagenham 016""","""E"""


Join the data:

In [155]:
# Left join: keep every LSOA from the local authority lookup and attach
# the MSOA details of the row with the matching LSOA code.
df_lookup = df_lsoa_auth_lookup.join(
    df_lsoa_msoa_lookup,
    how='left',
    left_on='LSOA11CD',
    right_on='lsoa11cd',
)

In [156]:
df_lookup.head()

LSOA11CD,LSOA11NM,LAD19CD,LAD19NM,msoa11cd,ladcd,lsoa11nm,msoa11nm,country
str,str,str,str,str,str,str,str,str
"""E01012057""","""Middlesbrough 009E""","""E06000002""","""Middlesbrough""","""E02002504""","""E06000002""","""Middlesbrough 009E""","""Middlesbrough 009""","""E"""
"""E01012054""","""Middlesbrough 009B""","""E06000002""","""Middlesbrough""","""E02002504""","""E06000002""","""Middlesbrough 009B""","""Middlesbrough 009""","""E"""
"""E01012053""","""Middlesbrough 009A""","""E06000002""","""Middlesbrough""","""E02002504""","""E06000002""","""Middlesbrough 009A""","""Middlesbrough 009""","""E"""
"""E01033469""","""Middlesbrough 006E""","""E06000002""","""Middlesbrough""","""E02002501""","""E06000002""","""Middlesbrough 006E""","""Middlesbrough 006""","""E"""
"""E01012026""","""Middlesbrough 006A""","""E06000002""","""Middlesbrough""","""E02002501""","""E06000002""","""Middlesbrough 006A""","""Middlesbrough 006""","""E"""


Drop LSOA columns and remove repeats:

In [157]:
# Remove the LSOA-level columns so that rows differ only by MSOA and district.
# 'ladcd' equals 'LAD19CD' in the rows previewed above -
# TODO confirm it is redundant for every row.
cols_to_drop = ['LSOA11CD', 'LSOA11NM', 'lsoa11nm', 'ladcd']

df_lookup = df_lookup.drop(cols_to_drop)

In [158]:
# Remove repeated rows:
# With the LSOA columns gone, each MSOA is repeated once per LSOA it
# contained. Note that unique() does not keep the original row order
# (see the reshuffled preview below).
df_lookup = df_lookup.unique()

In [159]:
df_lookup.head()

LAD19CD,LAD19NM,msoa11cd,msoa11nm,country
str,str,str,str,str
"""E07000240""","""St Albans""","""E02004929""","""St Albans 006""","""E"""
"""E07000011""","""Huntingdonshire""","""E02003758""","""Huntingdonshire 006""","""E"""
"""E06000056""","""Central Bedfordshire""","""E02003651""","""Central Bedfordshire 033""","""E"""
"""W06000009""","""Pembrokeshire""","""W02000129""","""Pembrokeshire 004""","""W"""
"""E08000030""","""Walsall""","""E02002143""","""Walsall 034""","""E"""


## Decide how to split district data between MSOAs

Use ratio of population of MSOA to population of district.

Load MSOA population numbers from the health statistics:

In [160]:
path_to_msoa_stats = os.path.join('..', 'data', 'msoa_cleaned.csv')

# Total patients per MSOA is the sum of the three health categories.
# (Column names are as they appear in the file: one uses an underscore,
# the other two use a space.)
total_health_expr = (
    pl.col('good_health') + pl.col('fair health') + pl.col('bad health')
)

# Read the stats, (re)calculate the total, and keep only the MSOA
# identifiers plus that total:
cols_to_keep = ['MSOA', 'MSOA11CD', 'total_health']
df_stats = (
    pl.read_csv(path_to_msoa_stats)
    .with_columns(total_health_expr.alias('total_health'))
    .select(cols_to_keep)
)

In [161]:
df_stats.head()

MSOA,MSOA11CD,total_health
str,str,i64
"""Adur 001""","""E02006534""",8524
"""Adur 002""","""E02006535""",6634
"""Adur 003""","""E02006536""",7100
"""Adur 004""","""E02006537""",10127
"""Adur 005""","""E02006538""",8526


Merge in population numbers:

In [162]:
# Right join: keep every MSOA that appears in df_stats and discard lookup
# rows that have no statistics. df_stats is missing the Isles of Scilly
# while df_lookup includes it, so this removes the Isles of Scilly here.
# NOTE(review): an MSOA in df_stats with no match in df_lookup would get
# null district columns (LAD19CD, LAD19NM) - confirm there are none.

df_lookup = df_lookup.join(df_stats, left_on='msoa11cd', right_on='MSOA11CD', how='right')

In [163]:
df_lookup.head()

LAD19CD,LAD19NM,msoa11nm,country,MSOA,MSOA11CD,total_health
str,str,str,str,str,str,i64
"""E07000223""","""Adur""","""Adur 001""","""E""","""Adur 001""","""E02006534""",8524
"""E07000223""","""Adur""","""Adur 002""","""E""","""Adur 002""","""E02006535""",6634
"""E07000223""","""Adur""","""Adur 003""","""E""","""Adur 003""","""E02006536""",7100
"""E07000223""","""Adur""","""Adur 004""","""E""","""Adur 004""","""E02006537""",10127
"""E07000223""","""Adur""","""Adur 005""","""E""","""Adur 005""","""E02006538""",8526


Calculate total patients per district:

In [166]:
# Unique district codes present in the lookup.
# Nulls are dropped: the right join with the MSOA stats can leave rows
# without a district code, and a null entry here matches no rows when used
# as a filter, so it would have no population total to look up later.
all_districts = df_lookup['LAD19CD'].drop_nulls().unique().to_numpy()

In [168]:
# Total patients in each district, keyed by district code.
# A single grouped aggregation replaces filtering the whole table twice
# for every district. Rows without a district code are excluded, so the
# dictionary only contains districts that have at least one MSOA.
df_district_pops = (
    df_lookup
    .filter(pl.col('LAD19CD').is_not_null())
    .group_by('LAD19CD')
    .agg(pl.col('total_health').sum())
)

# Each row is a (district code, total) pair.
dict_district_pops = dict(df_district_pops.iter_rows())

In [169]:
df_lookup.head()

LAD19CD,LAD19NM,msoa11nm,country,MSOA,MSOA11CD,total_health
str,str,str,str,str,str,i64
"""E07000223""","""Adur""","""Adur 001""","""E""","""Adur 001""","""E02006534""",8524
"""E07000223""","""Adur""","""Adur 002""","""E""","""Adur 002""","""E02006535""",6634
"""E07000223""","""Adur""","""Adur 003""","""E""","""Adur 003""","""E02006536""",7100
"""E07000223""","""Adur""","""Adur 004""","""E""","""Adur 004""","""E02006537""",10127
"""E07000223""","""Adur""","""Adur 005""","""E""","""Adur 005""","""E02006538""",8526


Calculate each MSOA's share of the patients in its district:

In [170]:
# Share of the district's patients that live in each MSOA.
# The window sum puts the district total on every row of that district,
# so the ratio is calculated in one pass instead of rebuilding the column
# once per district.
district_total = pl.col('total_health').sum().over('LAD19CD')

df_lookup = df_lookup.with_columns(
    pl.when(pl.col('LAD19CD').is_not_null())
    .then(pl.col('total_health') / district_total)
    # Rows with no district code keep a placeholder ratio of zero:
    .otherwise(pl.lit(0.0))
    .alias('district_pop_ratio')
)

In [171]:
df_lookup.head()

LAD19CD,LAD19NM,msoa11nm,country,MSOA,MSOA11CD,total_health,district_pop_ratio
str,str,str,str,str,str,i64,f64
"""E07000223""","""Adur""","""Adur 001""","""E""","""Adur 001""","""E02006534""",8524,0.139322
"""E07000223""","""Adur""","""Adur 002""","""E""","""Adur 002""","""E02006535""",6634,0.108431
"""E07000223""","""Adur""","""Adur 003""","""E""","""Adur 003""","""E02006536""",7100,0.116047
"""E07000223""","""Adur""","""Adur 004""","""E""","""Adur 004""","""E02006537""",10127,0.165523
"""E07000223""","""Adur""","""Adur 005""","""E""","""Adur 005""","""E02006538""",8526,0.139355


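Sanity check: the ratios of the MSOAs in each district should sum to 1 (to three decimal places). Any district printed below fails this check.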

In [173]:
# Print the code of any district whose MSOA ratios do not sum to 1
# after rounding to three decimal places.
for district in all_districts:
    ratios_here = df_lookup.filter(df_lookup['LAD19CD'] == district)['district_pop_ratio']
    if round(ratios_here.sum(), 3) != 1.000:
        print(district)

## Link projected demographic data

In [184]:
# Projection years to process. Each label is used to build the name of
# the input file for that year and the name of the matching output file.
proj_years_str = ['2025', '2030', '2035', '2040']

In [207]:
# Name of the column holding the local authority names:
col_district = 'local authority: district / unitary (as of April 2019)'
# True for rows whose district name mentions Scilly:
is_scilly = pl.col(col_district).str.contains('Scilly')

# One dataframe of projections per year label.
dict_dfs_proj = {}

for label in proj_years_str:
    path_to_proj = os.path.join('..', 'data', f'nomis_age_predictions_{label}.csv')
    # Lines starting with '#' in the file are skipped as comments.
    # Drop the Isles of Scilly.
    dict_dfs_proj[label] = (
        pl.read_csv(path_to_proj, comment_prefix='#')
        .filter(~is_scilly)
    )

Check one:

In [228]:
dict_dfs_proj[proj_years_str[0]]

local authority: district / unitary (as of April 2019),mnemonic,Age 0 - 4,Aged 5-9,Aged 10-14,Aged 15-19,Aged 20-24,Aged 25-29,Aged 30-34,Aged 35-39,Aged 40-44,Aged 45-49,Aged 50-54,Aged 55-59,Aged 60-64,Aged 65-69,Aged 70-74,Aged 75-79,Aged 80-84,Aged 85+
str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""Darlington""","""E06000005""",5241,5739,6660,6263,4744,5847,6454,6678,6699,6302,6842,7605,7583,6532,5523,5405,3514,3475
"""County Durham""","""E06000047""",25167,27584,31229,33431,35331,30585,32949,31373,31522,29206,33259,38789,39263,33958,29160,27405,16918,14349
"""Hartlepool""","""E06000001""",4636,5321,6015,5797,4511,5337,6077,6082,5745,5140,5711,6469,6798,5839,4826,4320,2474,2597
"""Middlesbrough""","""E06000002""",8754,9036,9370,9283,10458,9696,9778,8633,7931,7004,7536,8131,8700,7562,6090,5374,3386,3038
"""Northumberland""","""E06000057""",13735,15957,17447,16703,12613,15188,17023,18670,18953,18359,20978,24732,27325,25205,21489,20413,12574,11055
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Tewkesbury""","""E07000083""",5779,6274,6425,5519,3693,5223,6181,7118,6793,6129,6571,7356,7115,6239,5549,5308,3632,3177
"""Mendip""","""E07000187""",5606,6411,7495,7302,4213,5379,5983,6849,6961,6817,8115,9432,9343,7971,7230,7000,4566,4193
"""Sedgemoor""","""E07000188""",6123,6833,7789,7131,5153,6185,6827,7268,7187,6907,8225,9678,9761,8452,7694,7445,4988,4687
"""South Somerset""","""E07000189""",7879,8975,10024,9468,6597,8085,8805,9522,9124,8924,10801,12777,13438,12000,11271,11170,7205,6438


The following function combines the lookup for MSOA to local authority area with the age projections and scales the number of people down to the ratio for that MSOA.

In [225]:
def convert_proj_to_msoa(df_proj, df_lookup):
    """
    Split local authority population projections across MSOAs.

    Each local authority row in the projections is matched to the MSOAs
    listed for it in the lookup, and every population column is scaled
    by that MSOA's share of the district.

    Inputs
    ------
    df_proj   - pl.DataFrame. Projections by local authority. Must
                contain the column 'mnemonic' (district code) and the
                population columns, whose names all start with 'Age'.
    df_lookup - pl.DataFrame. One row per MSOA. Must contain the
                columns 'LAD19CD' (district code) and
                'district_pop_ratio' (the MSOA's share of its district).

    Returns
    -------
    df - pl.DataFrame. All columns of df_proj and the remaining columns
         of df_lookup, with one row per matched MSOA. The population
         columns hold the scaled values rounded to integers. A district
         with no MSOAs in the lookup is kept as a single row with null
         populations.
    """
    # Left join so that every projected district is kept:
    df = df_proj.join(df_lookup, left_on='mnemonic', right_on='LAD19CD', how='left')
    # Pick out population columns:
    cols_pop = [c for c in df_proj.columns if c.startswith('Age')]
    # Scale all population columns by the ratio column and round to the
    # nearest integer in one expression. Staying within polars expressions
    # (rather than assigning a converted array back into the dataframe)
    # keeps missing ratios as nulls, which survive the integer cast.
    df = df.with_columns(
        (pl.col(cols_pop) * pl.col('district_pop_ratio')).round(0).cast(pl.Int64)
    )
    return df

Run this function for all year projections:

In [233]:
# Columns that are not needed in the saved MSOA-level projections:
cols_to_drop = ['local authority: district / unitary (as of April 2019)', 'mnemonic', 'LAD19NM', 'msoa11nm', 'total_health', 'district_pop_ratio']

# One dataframe of MSOA-level projections per year label.
dict_df_proj_scaled = {}

for label in proj_years_str:
    df_scaled = convert_proj_to_msoa(dict_dfs_proj[label], df_lookup).drop(cols_to_drop)
    # Put the MSOA identifier columns first, then everything else
    # in its existing order:
    cols_msoa = [name for name in df_scaled.columns if 'MSOA' in name]
    cols_other = [name for name in df_scaled.columns if 'MSOA' not in name]
    dict_df_proj_scaled[label] = df_scaled.select(cols_msoa + cols_other)

Check the results for one:

In [234]:
dict_df_proj_scaled[proj_years_str[0]]

MSOA,MSOA11CD,Age 0 - 4,Aged 5-9,Aged 10-14,Aged 15-19,Aged 20-24,Aged 25-29,Aged 30-34,Aged 35-39,Aged 40-44,Aged 45-49,Aged 50-54,Aged 55-59,Aged 60-64,Aged 65-69,Aged 70-74,Aged 75-79,Aged 80-84,Aged 85+,country
str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str
"""Darlington 001""","""E02002559""",439,481,558,525,397,490,541,559,561,528,573,637,635,547,463,453,294,291,"""E"""
"""Darlington 002""","""E02002560""",298,326,378,356,270,332,367,379,381,358,389,432,431,371,314,307,200,197,"""E"""
"""Darlington 003""","""E02002561""",285,312,362,341,258,318,351,363,364,343,372,414,412,355,300,294,191,189,"""E"""
"""Darlington 004""","""E02002562""",301,330,383,360,273,336,371,384,385,362,393,437,436,375,317,311,202,200,"""E"""
"""Darlington 005""","""E02002563""",271,297,345,324,246,303,334,346,347,326,354,394,392,338,286,280,182,180,"""E"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""West Somerset 001""","""E02006113""",370,421,482,444,315,395,430,455,467,446,519,619,650,592,537,512,350,341,"""E"""
"""West Somerset 002""","""E02006114""",340,387,442,407,289,363,395,418,429,410,476,568,597,544,493,470,321,313,"""E"""
"""West Somerset 003""","""E02006115""",364,415,474,437,310,389,424,448,460,440,511,610,640,583,529,504,345,336,"""E"""
"""West Somerset 004""","""E02006116""",397,452,516,476,338,424,461,488,501,479,556,664,697,635,575,549,375,366,"""E"""


Save results:

In [236]:
# Make sure the output directory exists before writing; write_csv does
# not create missing directories.
dir_output = os.path.join('..', 'data', 'projections')
os.makedirs(dir_output, exist_ok=True)

# Write one file per projection year:
for label in proj_years_str:
    df = dict_df_proj_scaled[label]
    df.write_csv(os.path.join(dir_output, f'msoa_projections_{label}.csv'))