# Convert Nomis prediction tables into multiple files

On download, the .csv contains one long sheet with multiple tables each with their own header and footer.

In this notebook, load in the downloaded file, split it into separate tables, and then save the separate tables.

## Code setup

In [2]:
import os
import polars as pl
import numpy as np

## Load data

Load downloaded projection data as line strings:

In [48]:
path_to_file = os.path.join('..', 'data', 'nomis_age_predictions.csv')

with open(path_to_file, 'r') as f:
    all_lines = f.readlines()

In [49]:
all_lines[:6]

['\n',
 '"Population projections - local authority based by single year of age"\n',
 '"ONS Crown Copyright Reserved [from Nomis on 13 March 2025]"\n',
 '"Projected Year:","2025"\n',
 '"Gender     :","Total"\n',
 '\n']

Remove any empty lines:

In [50]:
all_lines = [a.replace('\n', '').replace('""', '') for a in all_lines]
all_lines = [a for a in all_lines if a != '']

In [51]:
all_lines[:6]

['"Population projections - local authority based by single year of age"',
 '"ONS Crown Copyright Reserved [from Nomis on 13 March 2025]"',
 '"Projected Year:","2025"',
 '"Gender     :","Total"',
 '"local authority: district / unitary (as of April 2019)","mnemonic","Age 0 - 4","Aged 5-9","Aged 10-14","Aged 15-19","Aged 20-24","Aged 25-29","Aged 30-34","Aged 35-39","Aged 40-44","Aged 45-49","Aged 50-54","Aged 55-59","Aged 60-64","Aged 65-69","Aged 70-74","Aged 75-79","Aged 80-84","Aged 85+"',
 '"Darlington","E06000005",5241,5739,6660,6263,4744,5847,6454,6678,6699,6302,6842,7605,7583,6532,5523,5405,3514,3475']

## Split apart tables

Assume that each new table starts with the string '"Population projections' and split the lines:

In [52]:
split_table_lines = []

ind_start = 0
ind_end = 0
for i, line in enumerate(all_lines):
    if line.startswith('"Population projections') | (i == len(all_lines) - 1):
        # Start of a new table or end of the last table.
        ind_end = i
        if ind_end > ind_start:
            # If this is the first table, don't store anything above it:
            split_table_lines.append(all_lines[ind_start:ind_end])
            ind_start = ind_end
    else:
        pass

How many tables are there now?

In [53]:
len(split_table_lines)

4

Check the first few lines of each:

In [54]:
for i, lines in enumerate(split_table_lines):
    print(lines[:8])

['"Population projections - local authority based by single year of age"', '"ONS Crown Copyright Reserved [from Nomis on 13 March 2025]"', '"Projected Year:","2025"', '"Gender     :","Total"', '"local authority: district / unitary (as of April 2019)","mnemonic","Age 0 - 4","Aged 5-9","Aged 10-14","Aged 15-19","Aged 20-24","Aged 25-29","Aged 30-34","Aged 35-39","Aged 40-44","Aged 45-49","Aged 50-54","Aged 55-59","Aged 60-64","Aged 65-69","Aged 70-74","Aged 75-79","Aged 80-84","Aged 85+"', '"Darlington","E06000005",5241,5739,6660,6263,4744,5847,6454,6678,6699,6302,6842,7605,7583,6532,5523,5405,3514,3475', '"County Durham","E06000047",25167,27584,31229,33431,35331,30585,32949,31373,31522,29206,33259,38789,39263,33958,29160,27405,16918,14349', '"Hartlepool","E06000001",4636,5321,6015,5797,4511,5337,6077,6082,5745,5140,5711,6469,6798,5839,4826,4320,2474,2597']
['"Population projections - local authority based by single year of age"', '"ONS Crown Copyright Reserved [from Nomis on 13 March 20

And the final few lines:

In [55]:
for i, lines in enumerate(split_table_lines):
    print(lines[-10:])

['"Forest of Dean","E07000080",4268,4843,5171,5545,3885,4095,4512,4858,5027,4795,5943,7307,7432,6395,5745,5524,3558,2816', '"Gloucester","E07000081",7688,7732,8367,8189,7604,8276,8839,9059,8816,7944,8252,8947,8460,6892,5545,5366,3707,3482', '"Stroud","E07000082",5667,6584,7720,7099,4720,5631,6190,7132,7612,7660,9128,9815,9507,8073,7099,6873,4449,3779', '"Tewkesbury","E07000083",5779,6274,6425,5519,3693,5223,6181,7118,6793,6129,6571,7356,7115,6239,5549,5308,3632,3177', '"Mendip","E07000187",5606,6411,7495,7302,4213,5379,5983,6849,6961,6817,8115,9432,9343,7971,7230,7000,4566,4193', '"Sedgemoor","E07000188",6123,6833,7789,7131,5153,6185,6827,7268,7187,6907,8225,9678,9761,8452,7694,7445,4988,4687', '"South Somerset","E07000189",7879,8975,10024,9468,6597,8085,8805,9522,9124,8924,10801,12777,13438,12000,11271,11170,7205,6438', '"Somerset West and Taunton","E07000246",7314,8329,9520,8770,6228,7812,8508,8995,9235,8826,10259,12235,12844,11703,10610,10119,6920,6742', ',"Figures may not sum becau

## Mark headers and footers as comments

In [56]:
footer_start_str = ',"Figures may not sum because of rounding."'
header_end_str = '"Gender     :","Total"'

In [57]:
for i, lines in enumerate(split_table_lines):
    for ip, p in enumerate(lines):
        if p.startswith(header_end_str):
            ind_header_end = ip
        elif p.startswith(footer_start_str):
            ind_footer_start = ip
    lines[:ind_header_end+1] = [f'# {p}' for p in lines[:ind_header_end+1]]
    lines[ind_footer_start:] = [f'# {p}' for p in lines[ind_footer_start:]]

Check the first few lines of each:

In [61]:
for i, lines in enumerate(split_table_lines):
    print(lines[:6])

['# "Population projections - local authority based by single year of age"', '# "ONS Crown Copyright Reserved [from Nomis on 13 March 2025]"', '# "Projected Year:","2025"', '# "Gender     :","Total"', '"local authority: district / unitary (as of April 2019)","mnemonic","Age 0 - 4","Aged 5-9","Aged 10-14","Aged 15-19","Aged 20-24","Aged 25-29","Aged 30-34","Aged 35-39","Aged 40-44","Aged 45-49","Aged 50-54","Aged 55-59","Aged 60-64","Aged 65-69","Aged 70-74","Aged 75-79","Aged 80-84","Aged 85+"', '"Darlington","E06000005",5241,5739,6660,6263,4744,5847,6454,6678,6699,6302,6842,7605,7583,6532,5523,5405,3514,3475']
['# "Population projections - local authority based by single year of age"', '# "ONS Crown Copyright Reserved [from Nomis on 13 March 2025]"', '# "Projected Year:","2030"', '# "Gender     :","Total"', '"local authority: district / unitary (as of April 2019)","mnemonic","Age 0 - 4","Aged 5-9","Aged 10-14","Aged 15-19","Aged 20-24","Aged 25-29","Aged 30-34","Aged 35-39","Aged 40-4

And the final few lines:

In [62]:
for i, lines in enumerate(split_table_lines):
    print(lines[-3:])

['"Somerset West and Taunton","E07000246",7314,8329,9520,8770,6228,7812,8508,8995,9235,8826,10259,12235,12844,11703,10610,10119,6920,6742', '# ,"Figures may not sum because of rounding."', '# ,']
['"Somerset West and Taunton","E07000246",7345,8031,8965,9406,7093,7316,8118,8975,9776,9914,9592,11309,13301,13689,11711,9952,8832,8154', '# ,"Figures may not sum because of rounding."', '# ,']
['"Somerset West and Taunton","E07000246",7509,8059,8646,8837,7576,8284,7634,8559,9774,10465,10687,10607,12357,14171,13725,11045,8721,10400', '# ,"Figures may not sum because of rounding."', '# ,']
['"South Somerset","E07000189",8061,8382,8782,8599,7231,8689,8712,8044,8945,9823,10693,10952,11196,13036,14230,13230,9565,10743', '"Somerset West and Taunton","E07000246",7853,8229,8669,8560,7149,8747,8625,8064,9322,10471,11248,11724,11637,13224,14222,12998,9777,11583', '# ,"Figures may not sum because of rounding."']


## Label the tables

Pick out the projected year from the table lines and store the year and lines in a dict:

In [65]:
label_str = 'Projected Year'

dict_split_table_lines = {}

for i, lines in enumerate(split_table_lines):
    for p in lines:
        if label_str in p:
            label = p.split('Year:","')[1].replace('"', '')
            dict_split_table_lines[label] = lines

In [66]:
dict_split_table_lines.keys()

dict_keys(['2025', '2030', '2035', '2040'])

## Save results

In [67]:
for label, lines in dict_split_table_lines.items():
    path_to_out = os.path.join('..', 'data', f'nomis_age_predictions_{label}.csv')
    with open(path_to_out, 'w') as f:
        # Place linebreaks back in with '\n':
        f.writelines([f'{p}\n' for p in lines])

## Check results by loading in new data:

In [69]:
dict_dfs_check = {}

for label in list(dict_split_table_lines.keys()):
    df = pl.read_csv(os.path.join('..', 'data', f'nomis_age_predictions_{label}.csv'), comment_prefix='#')
    dict_dfs_check[label] = df

In [70]:
dict_dfs_check['2025']

local authority: district / unitary (as of April 2019),mnemonic,Age 0 - 4,Aged 5-9,Aged 10-14,Aged 15-19,Aged 20-24,Aged 25-29,Aged 30-34,Aged 35-39,Aged 40-44,Aged 45-49,Aged 50-54,Aged 55-59,Aged 60-64,Aged 65-69,Aged 70-74,Aged 75-79,Aged 80-84,Aged 85+
str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""Darlington""","""E06000005""",5241,5739,6660,6263,4744,5847,6454,6678,6699,6302,6842,7605,7583,6532,5523,5405,3514,3475
"""County Durham""","""E06000047""",25167,27584,31229,33431,35331,30585,32949,31373,31522,29206,33259,38789,39263,33958,29160,27405,16918,14349
"""Hartlepool""","""E06000001""",4636,5321,6015,5797,4511,5337,6077,6082,5745,5140,5711,6469,6798,5839,4826,4320,2474,2597
"""Middlesbrough""","""E06000002""",8754,9036,9370,9283,10458,9696,9778,8633,7931,7004,7536,8131,8700,7562,6090,5374,3386,3038
"""Northumberland""","""E06000057""",13735,15957,17447,16703,12613,15188,17023,18670,18953,18359,20978,24732,27325,25205,21489,20413,12574,11055
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Tewkesbury""","""E07000083""",5779,6274,6425,5519,3693,5223,6181,7118,6793,6129,6571,7356,7115,6239,5549,5308,3632,3177
"""Mendip""","""E07000187""",5606,6411,7495,7302,4213,5379,5983,6849,6961,6817,8115,9432,9343,7971,7230,7000,4566,4193
"""Sedgemoor""","""E07000188""",6123,6833,7789,7131,5153,6185,6827,7268,7187,6907,8225,9678,9761,8452,7694,7445,4988,4687
"""South Somerset""","""E07000189""",7879,8975,10024,9468,6597,8085,8805,9522,9124,8924,10801,12777,13438,12000,11271,11170,7205,6438


In [71]:
dict_dfs_check['2040']

local authority: district / unitary (as of April 2019),mnemonic,Age 0 - 4,Aged 5-9,Aged 10-14,Aged 15-19,Aged 20-24,Aged 25-29,Aged 30-34,Aged 35-39,Aged 40-44,Aged 45-49,Aged 50-54,Aged 55-59,Aged 60-64,Aged 65-69,Aged 70-74,Aged 75-79,Aged 80-84,Aged 85+
str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""Darlington""","""E06000005""",5418,5379,5494,5299,4980,6281,6241,5716,6229,6765,6717,6536,6041,6518,6971,6515,4950,5328
"""County Durham""","""E06000047""",27094,27349,27690,30951,38613,34449,32561,29260,31110,34507,33829,33578,30845,33340,36007,32506,23048,22249
"""Hartlepool""","""E06000001""",4696,4824,4932,4970,4856,5655,5604,5049,5570,6246,6218,5769,5100,5488,5881,5443,3813,3675
"""Middlesbrough""","""E06000002""",8976,8443,8005,8605,11277,10481,9160,7443,7582,8037,7733,7194,6394,6592,6808,6619,4989,4429
"""Northumberland""","""E06000057""",13944,14871,15607,15067,13319,15738,16345,16333,18632,20892,21731,22217,21783,24155,25801,24591,18416,18631
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Tewkesbury""","""E07000083""",6447,6645,6584,6152,4562,6083,6571,6708,7317,7713,7774,7449,6731,7255,7648,6877,5032,5253
"""Mendip""","""E07000187""",5874,6285,6962,7024,4752,5784,6008,6124,7216,8039,8513,8392,8029,8940,9576,8689,6339,7258
"""Sedgemoor""","""E07000188""",6634,6844,7089,6858,5760,6738,6870,6579,7552,8328,8754,8645,8236,9172,9846,8982,6572,7867
"""South Somerset""","""E07000189""",8061,8382,8782,8599,7231,8689,8712,8044,8945,9823,10693,10952,11196,13036,14230,13230,9565,10743
