# Combine age bands

In the projected data, combine the many separate age bands into a smaller number of broader bands.

Also make a total patients column.

## Code setup

In [1]:
import os
import polars as pl

## Load data

The projected data file names contain these labels:

In [2]:
# Year labels used in the projection file names: every five years from 2025 to 2040.
proj_years_str = [str(year) for year in range(2025, 2041, 5)]

Load in the data and store in a dictionary:

In [3]:
# Read one projections file per year label and key each frame by its label.
dir_projections = os.path.join('..', 'data', 'projections')

dict_df_proj_scaled = {
    label: pl.read_csv(
        os.path.join(dir_projections, f'msoa_projections_{label}.csv')
    )
    for label in proj_years_str
}

Check one of them:

In [4]:
# Display the frame stored under the last year label in proj_years_str.
dict_df_proj_scaled[proj_years_str[-1]]

MSOA,MSOA11CD,Age 0 - 4,Aged 5-9,Aged 10-14,Aged 15-19,Aged 20-24,Aged 25-29,Aged 30-34,Aged 35-39,Aged 40-44,Aged 45-49,Aged 50-54,Aged 55-59,Aged 60-64,Aged 65-69,Aged 70-74,Aged 75-79,Aged 80-84,Aged 85+,country
str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str
"""Darlington 001""","""E02002559""",454,451,460,444,417,526,523,479,522,567,563,547,506,546,584,546,415,446,"""E"""
"""Darlington 002""","""E02002560""",308,306,312,301,283,357,355,325,354,384,382,371,343,370,396,370,281,303,"""E"""
"""Darlington 003""","""E02002561""",295,293,299,288,271,342,339,311,339,368,365,355,329,354,379,354,269,290,"""E"""
"""Darlington 004""","""E02002562""",311,309,316,304,286,361,359,328,358,389,386,376,347,375,401,374,284,306,"""E"""
"""Darlington 005""","""E02002563""",280,278,284,274,258,325,323,296,322,350,348,338,313,337,361,337,256,276,"""E"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""West Somerset 001""","""E02006113""",397,416,438,433,362,442,436,408,471,530,569,593,589,669,719,657,495,586,"""E"""
"""West Somerset 002""","""E02006114""",365,382,403,398,332,406,401,375,433,486,522,545,540,614,661,604,454,538,"""E"""
"""West Somerset 003""","""E02006115""",391,410,432,427,356,436,430,402,464,522,560,584,580,659,709,648,487,577,"""E"""
"""West Somerset 004""","""E02006116""",426,446,470,464,388,474,468,437,506,568,610,636,631,717,771,705,530,628,"""E"""


Check the column names, as they are not all formatted consistently (for example 'Age 0 - 4' compared with 'Aged 5-9'):

In [5]:
# List the column names of the frame stored under the last year label.
dict_df_proj_scaled[proj_years_str[-1]].columns

['MSOA',
 'MSOA11CD',
 'Age 0 - 4',
 'Aged 5-9',
 'Aged 10-14',
 'Aged 15-19',
 'Aged 20-24',
 'Aged 25-29',
 'Aged 30-34',
 'Aged 35-39',
 'Aged 40-44',
 'Aged 45-49',
 'Aged 50-54',
 'Aged 55-59',
 'Aged 60-64',
 'Aged 65-69',
 'Aged 70-74',
 'Aged 75-79',
 'Aged 80-84',
 'Aged 85+',
 'country']

## Combine data

Bands to combine:
+ Age less than 65 years
+ Age at least 80 years

In [6]:
def combine_ages(df):
    """
    Append combined age-band columns and a total column.

    Three columns are added to the end of the frame, in this order:
    + 'Aged 0-65': row-wise sum of the bands 'Age 0 - 4' to 'Aged 60-64'.
    + 'Aged 80+': row-wise sum of 'Aged 80-84' and 'Aged 85+'.
    + 'total_patients': row-wise sum of all eighteen age bands.

    The separate age-band columns are not removed here.

    Inputs
    ------
    df - DataFrame containing all of the age-band columns named below.

    Returns
    -------
    A DataFrame with the three extra columns.
    """
    # Note: the 'Aged 0-65' label stops at the 60-64 band. The label is
    # kept as it is because the later renaming step looks for it.
    bands_under_65 = [
        'Age 0 - 4', 'Aged 5-9', 'Aged 10-14', 'Aged 15-19', 'Aged 20-24',
        'Aged 25-29', 'Aged 30-34', 'Aged 35-39', 'Aged 40-44', 'Aged 45-49',
        'Aged 50-54', 'Aged 55-59', 'Aged 60-64',
    ]
    bands_65_to_79 = ['Aged 65-69', 'Aged 70-74', 'Aged 75-79']
    bands_80_plus = ['Aged 80-84', 'Aged 85+']

    def _row_sum(bands, new_name):
        # Sum across the chosen columns for each row and name the result.
        return df[bands].sum_horizontal().alias(new_name)

    return df.with_columns([
        _row_sum(bands_under_65, 'Aged 0-65'),
        _row_sum(bands_80_plus, 'Aged 80+'),
        _row_sum(
            bands_under_65 + bands_65_to_79 + bands_80_plus,
            'total_patients'
        ),
    ])

In [7]:
# Age bands that are now covered by the combined 'Aged 0-65' and 'Aged 80+'
# columns. The 65-69, 70-74 and 75-79 bands are not in this list.
cols_to_drop = [
    'Age 0 - 4', 'Aged 5-9', 'Aged 10-14', 'Aged 15-19', 'Aged 20-24',
    'Aged 25-29', 'Aged 30-34', 'Aged 35-39', 'Aged 40-44', 'Aged 45-49',
    'Aged 50-54', 'Aged 55-59', 'Aged 60-64', 'Aged 80-84', 'Aged 85+',
]

# Add the combined columns to every year's frame, then drop the old bands.
dict_df_proj_scaled = {
    label: combine_ages(df_bands).drop(cols_to_drop)
    for label, df_bands in dict_df_proj_scaled.items()
}

Check one of them:

In [8]:
# Display the frame stored under the last year label after combining bands.
dict_df_proj_scaled[proj_years_str[-1]]

MSOA,MSOA11CD,Aged 65-69,Aged 70-74,Aged 75-79,country,Aged 0-65,Aged 80+,total_patients
str,str,i64,i64,i64,str,i64,i64,i64
"""Darlington 001""","""E02002559""",546,584,546,"""E""",6459,861,8996
"""Darlington 002""","""E02002560""",370,396,370,"""E""",4381,584,6101
"""Darlington 003""","""E02002561""",354,379,354,"""E""",4194,559,5840
"""Darlington 004""","""E02002562""",375,401,374,"""E""",4430,590,6170
"""Darlington 005""","""E02002563""",337,361,337,"""E""",3989,532,5556
…,…,…,…,…,…,…,…,…
"""West Somerset 001""","""E02006113""",669,719,657,"""E""",6084,1081,9210
"""West Somerset 002""","""E02006114""",614,661,604,"""E""",5588,992,8459
"""West Somerset 003""","""E02006115""",659,709,648,"""E""",5994,1064,9074
"""West Somerset 004""","""E02006116""",717,771,705,"""E""",6524,1158,9875


## Calculate proportions

In [9]:
def calculate_proportions(df):
    """
    Add a rounded proportion column for every age column.

    Each column whose name starts with 'Age' is divided by the
    'total_patients' column and rounded to 4 decimal places. The new
    column name is 'prop_age' followed by everything after the first
    space of the age column name with the remaining spaces removed,
    e.g. 'Aged 65-69' becomes 'prop_age65-69'.

    The input frame is not modified; the new columns are only present
    in the returned frame.

    Inputs
    ------
    df - DataFrame with a 'total_patients' column and any number of
         columns whose names start with 'Age'.

    Returns
    -------
    A DataFrame with one extra 'prop_age...' column per age column,
    appended in the same order as the age columns.
    """
    cols = [c for c in df.columns if c.startswith('Age')]
    totals = df['total_patients']
    # Build every proportion as its own named series and attach them in a
    # single call. Assigning with df[new_cols] = ... would change the
    # caller's frame in place.
    prop_cols = [
        (df[c] / totals).round(4).alias(
            'prop_age' + ''.join(c.split(' ')[1:])
        )
        for c in cols
    ]
    return df.with_columns(prop_cols)

In [10]:
# Final names for the proportion columns made by calculate_proportions.
rename_dict = dict([
    ('prop_age0-65', 'age_less65_proportion'),
    ('prop_age65-69', 'age_65_proportion'),
    ('prop_age70-74', 'age_70_proportion'),
    ('prop_age75-79', 'age_75_proportion'),
    ('prop_age80+', 'age_over80_proportion'),
])

# Calculate the proportions for every year's frame, rename the new
# columns, and store the result under the same label.
dict_df_proj_scaled = {
    label: calculate_proportions(df_counts).rename(rename_dict)
    for label, df_counts in dict_df_proj_scaled.items()
}

Check one of them:

In [11]:
# Display the frame stored under the last year label after adding proportions.
dict_df_proj_scaled[proj_years_str[-1]]

MSOA,MSOA11CD,Aged 65-69,Aged 70-74,Aged 75-79,country,Aged 0-65,Aged 80+,total_patients,age_65_proportion,age_70_proportion,age_75_proportion,age_less65_proportion,age_over80_proportion
str,str,i64,i64,i64,str,i64,i64,i64,f64,f64,f64,f64,f64
"""Darlington 001""","""E02002559""",546,584,546,"""E""",6459,861,8996,0.0607,0.0649,0.0607,0.718,0.0957
"""Darlington 002""","""E02002560""",370,396,370,"""E""",4381,584,6101,0.0606,0.0649,0.0606,0.7181,0.0957
"""Darlington 003""","""E02002561""",354,379,354,"""E""",4194,559,5840,0.0606,0.0649,0.0606,0.7182,0.0957
"""Darlington 004""","""E02002562""",375,401,374,"""E""",4430,590,6170,0.0608,0.065,0.0606,0.718,0.0956
"""Darlington 005""","""E02002563""",337,361,337,"""E""",3989,532,5556,0.0607,0.065,0.0607,0.718,0.0958
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""West Somerset 001""","""E02006113""",669,719,657,"""E""",6084,1081,9210,0.0726,0.0781,0.0713,0.6606,0.1174
"""West Somerset 002""","""E02006114""",614,661,604,"""E""",5588,992,8459,0.0726,0.0781,0.0714,0.6606,0.1173
"""West Somerset 003""","""E02006115""",659,709,648,"""E""",5994,1064,9074,0.0726,0.0781,0.0714,0.6606,0.1173
"""West Somerset 004""","""E02006116""",717,771,705,"""E""",6524,1158,9875,0.0726,0.0781,0.0714,0.6607,0.1173


Remove the age band count columns (those whose names start with 'Age'), keeping the total patients and proportion columns:

In [12]:
# Keep every column that is not an age count column. The proportion
# columns start with a lower-case 'age_' so they are not matched here.
dict_df_proj_scaled = {
    label: df_all.select(
        [c for c in df_all.columns if not c.startswith('Age')]
    )
    for label, df_all in dict_df_proj_scaled.items()
}

## Save results

In [13]:
# Write one proportions file per year label, in the projections directory.
dir_output = os.path.join('..', 'data', 'projections')

for year_label, df_props in dict_df_proj_scaled.items():
    path_to_file = os.path.join(
        dir_output, f'msoa_projections_props_{year_label}.csv')
    df_props.write_csv(path_to_file)