In [1]:
import duckdb
con = duckdb.connect(database=':memory:')

In [2]:
addr_root = "../data/raw/ppd_addresses"
con.execute(f"""
    CREATE OR REPLACE VIEW ppd_src AS
    SELECT *
    FROM parquet_scan('{addr_root}/*.parquet', hive_partitioning=1)
""")

<duckdb.duckdb.DuckDBPyConnection at 0x1078badf0>

In [3]:
# Quick sanity check: expect only GL / 2025 rows below
df = con.execute("""
    SELECT *
    FROM ppd_src
    LIMIT 1
""").fetchdf()
df

Unnamed: 0,property_type,postcode,paon,saon,street,locality,town_city,district,county
0,S,AL35NA,41,,GRANGE STREET,ST. ALBANS,ST. ALBANS,ST ALBANS,HERTFORDSHIRE


In [19]:
df.columns

Index(['property_type', 'postcode', 'paon', 'saon', 'street', 'locality',
       'town_city', 'district', 'county'],
      dtype='object')

Checking patterns of addresses in PAON or SAON

In [20]:
tdf = con.execute("""
    SELECT DISTINCT property_type, paon, saon
    FROM ppd_src
    WHERE paon IS NOT NULL
      AND TRIM(paon) <> ''
      AND LOWER(paon) LIKE '%flat%'
      AND property_type != 'F'
    LIMIT 10
""").fetchdf()
tdf

Unnamed: 0,property_type,paon,saon
0,O,"FLAT 3, 46",
1,O,"FLAT 2, 30",
2,O,EBENEZER CHURCH FLATS,
3,O,HATCHETT FLATS,
4,O,"FLAT, 43",
5,O,ROOF AND AIRSPACE ABOVE FLATS 173 AND 183,
6,O,OSMUND FLATTS FARM,
7,O,"FLATS 1-8, 1 - 2",
8,S,GARDEN COTTAGE FLATS,4.0
9,O,"FLAT 1, 12",


Flat addresses also occur in PAON, but when they do SAON is usually empty but sometime has values. PAON has comma seperated values sometimes.

In [23]:
tdf = con.execute("""
    SELECT DISTINCT property_type, paon, saon
    FROM ppd_src
    WHERE saon IS NOT NULL
      AND property_type != 'F'

""").fetchdf()
tdf

Unnamed: 0,property_type,paon,saon
0,S,DUNORLAN FARM COTTAGE,2
1,S,IVY VILLAS,1
2,D,LONGSPRING WOOD,19
3,T,HILLSIDE,12
4,D,FARTHINGFIELD,3
...,...,...,...
253713,O,"MALTON ENTERPRISE PARK, 13",UNIT 1
253714,O,"MALTON ENTERPRISE PARK, 21",UNIT 1B
253715,O,FRIARS OVEN FARM,LOT 3
253716,O,GREATE BENTLEY ESTATE,"PLOTS 61 TO 65, 79, 80"


paon has (building name, building number) or (building number - can be alphanumeric) or (building name)

saon has (sub building name - like FLAT 2 or FLAT C) or (building number) or (building name - can be alphanumeric) or (building name)

## Filter data on more strict conditions

In [4]:
sql = """
WITH cleaned AS (
  SELECT
    TRIM(property_type)               AS property_type,
    UPPER(TRIM(postcode))             AS postcode,
    TRIM(paon)                        AS paon_raw,
    TRIM(saon)                        AS saon_raw,
    TRIM(street)                      AS street,
    TRIM(locality)                    AS locality,
    TRIM(town_city)                   AS town_city,
    TRIM(district)                    AS district,
    TRIM(county)                      AS county
  FROM ppd_src
),
patterns AS (
  SELECT
    *,
    -- PAON presence flags
    REGEXP_MATCHES(paon_raw, '^\s*\d+[A-Za-z]?(?:\s*-\s*\d+[A-Za-z]?)?\s*$')                                        AS paon_is_num_only,
    REGEXP_MATCHES(paon_raw, '^([A-Za-z][A-Za-z\s''&\.-]*?)\s*(?:,?\s*)(\d+[A-Za-z]?)\s*$')                         AS paon_is_name_num,
    REGEXP_MATCHES(paon_raw, '^\s*[A-Za-z][A-Za-z\s''&\.-]*\s*$')                                                   AS paon_is_name_only,

    -- PAON extracted parts (explicit capture groups!)
    REGEXP_EXTRACT(paon_raw, '^\s*(\d+[A-Za-z]?(?:\s*-\s*\d+[A-Za-z]?)?)\s*$', 1)                                   AS paon_num_only_val,
    REGEXP_EXTRACT(paon_raw, '^([A-Za-z][A-Za-z\s''&\.-]*?)\s*(?:,?\s*)(\d+[A-Za-z]?)\s*$', 1)                      AS paon_name_part,
    REGEXP_EXTRACT(paon_raw, '^([A-Za-z][A-Za-z\s''&\.-]*?)\s*(?:,?\s*)(\d+[A-Za-z]?)\s*$', 2)                      AS paon_num_from_name,
    REGEXP_EXTRACT(paon_raw, '^\s*([A-Za-z][A-Za-z\s''&\.-]*)\s*$', 1)                                              AS paon_name_only,

    -- SAON patterns (we use them only as validity checks)
    REGEXP_MATCHES(saon_raw, '(?i)^\s*(flat|apartment|apt|appts|room|unit|annexe?|block|blk|studio)\s+[0-9]+[A-Za-z]?\s*$') AS saon_is_flat_pat,
    REGEXP_MATCHES(saon_raw, '^\s*\d+[A-Za-z]?\s*$')                                                                  AS saon_is_num_only,
    REGEXP_MATCHES(saon_raw, '^[A-Za-z0-9][A-Za-z0-9\s''&\.-]*$')                                                     AS saon_is_name_like,

    -- Postcode split (validated)
    REGEXP_EXTRACT(
      postcode,
      '(?i)^\s*((?:GIR\s?0AA)|(?:[A-PR-UWYZ][0-9][0-9A-HJKMNPR-Y]?)|(?:[A-PR-UWYZ][A-HK-Y][0-9][0-9A-HJKMNPR-Y]?))\s*([0-9][ABD-HJLNP-UW-Z]{2})\s*$',
      1
    ) AS outcode,
    REGEXP_EXTRACT(
      postcode,
      '(?i)^\s*((?:GIR\s?0AA)|(?:[A-PR-UWYZ][0-9][0-9A-HJKMNPR-Y]?)|(?:[A-PR-UWYZ][A-HK-Y][0-9][0-9A-HJKMNPR-Y]?))\s*([0-9][ABD-HJLNP-UW-Z]{2})\s*$',
      2
    ) AS incode
  FROM cleaned
),
cohort AS (
  SELECT
    property_type,
    street, locality, town_city, district, county,

    -- Normalised PAON parts
    CASE
      WHEN paon_is_name_num THEN paon_name_part
      WHEN paon_is_name_only THEN paon_name_only
      ELSE NULL
    END AS paon_building_name,
    CASE
      WHEN paon_is_name_num THEN paon_num_from_name
      WHEN paon_is_num_only THEN paon_num_only_val
      ELSE NULL
    END AS paon_building_number,

    -- SAON accepted shapes (keep original text)
    CASE
      WHEN saon_is_flat_pat OR saon_is_num_only OR saon_is_name_like THEN saon_raw
      ELSE NULL
    END AS saon_norm,

    outcode, incode
  FROM patterns
  WHERE
    (paon_is_num_only OR paon_is_name_num OR paon_is_name_only)
    AND outcode IS NOT NULL AND incode IS NOT NULL
    AND (
      (property_type = 'F' AND (saon_is_flat_pat OR saon_is_num_only OR saon_is_name_like))
      OR
      (property_type <> 'F' AND (saon_raw IS NULL OR saon_raw = ''))
    )
)
SELECT
  property_type,
  paon_building_name                    AS BuildingName,
  paon_building_number                  AS BuildingNumber,
  saon_norm                             AS SubBuildingName,
  street                                AS StreetName,
  locality                              AS Locality,
  town_city                             AS TownName,
  outcode || ' ' || incode              AS Postcode,
  outcode                               AS Outcode,
  incode                                AS Incode
FROM cohort;
"""

In [5]:
df = con.execute(sql).fetchdf()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,property_type,BuildingName,BuildingNumber,SubBuildingName,StreetName,Locality,TownName,Postcode,Outcode,Incode
0,S,,41,,GRANGE STREET,ST. ALBANS,ST. ALBANS,AL3 5NA,AL3,5NA
1,T,,5,,WOOD CLOSE,HATFIELD,HATFIELD,AL10 8TY,AL10,8TY
2,T,,1,,PUTTOCKS CLOSE,WELHAM GREEN,HATFIELD,AL9 7LN,AL9,7LN
3,D,WATLING GARTH,,,OLD WATLING STREET,FLAMSTEAD,ST. ALBANS,AL3 8HJ,AL3,8HJ
4,T,,32,,NEW GREENS AVENUE,ST. ALBANS,ST. ALBANS,AL3 6NZ,AL3,6NZ
...,...,...,...,...,...,...,...,...,...,...
27482936,F,,15,FLAT 1,VICTORIA ROAD,,BRIDLINGTON,YO15 2BW,YO15,2BW
27482937,T,WAYSIDE,,,CLIFF ROAD,ATWICK,DRIFFIELD,YO25 8DF,YO25,8DF
27482938,D,,3,,RIBBLESDALE CLOSE,,BRIDLINGTON,YO16 6FH,YO16,6FH
27482939,S,,24,,BAILEYWOOD LANE,HOLME ON SPALDING MOOR,YORK,YO43 4ER,YO43,4ER


In [7]:
df.shape

(27482941, 10)

In [8]:
df[(df['BuildingName'].notna()) & (df['BuildingNumber'].notna())]

Unnamed: 0,property_type,BuildingName,BuildingNumber,SubBuildingName,StreetName,Locality,TownName,Postcode,Outcode,Incode
158,F,ALBANIAN COURT,85,FLAT 9,CAMP ROAD,ST. ALBANS,ST. ALBANS,AL1 5EA,AL1,5EA
161,F,ALBANIAN COURT,85,FLAT 3,CAMP ROAD,ST. ALBANS,ST. ALBANS,AL1 5EA,AL1,5EA
197,F,LINGFIELD COURT,60,FLAT 1,HIGH STREET,HARBORNE,BIRMINGHAM,B17 9NE,B17,9NE
214,F,SUMMERFIELD COURT,1A,FLAT 6,HERMITAGE ROAD,EDGBASTON,BIRMINGHAM,B15 3UP,B15,3UP
238,F,ELMHURST,5A,FLAT 8,NORFOLK ROAD,EDGBASTON,BIRMINGHAM,B15 3PR,B15,3PR
...,...,...,...,...,...,...,...,...,...,...
27482622,F,KILHENDRE COURT,43,FLAT 19,BROADWAY NORTH,,WALSALL,WS1 2QJ,WS1,2QJ
27482624,F,MELLISH PARK,84,APARTMENT 23,MELLISH ROAD,,WALSALL,WS4 2EB,WS4,2EB
27482733,F,GATE HOUSE,103,FLAT 5,BOROUGHBRIDGE ROAD,,YORK,YO26 6AA,YO26,6AA
27482756,T,SWALLOW BARN,7,,HIGH STREET,CAWOOD,SELBY,YO8 3TH,YO8,3TH


In [9]:
df.to_csv("../data/interim/ppd_address_sql_v01.csv", index=None)

## Getting Distinct Counties

In [14]:
c_sql = """
SELECT DISTINCT
  UPPER(REGEXP_REPLACE(TRIM(county), '\s+', ' ')) AS county
FROM ppd_src
WHERE county IS NOT NULL
  AND TRIM(county) <> ''
ORDER BY county;
"""

In [15]:
dfc = con.execute(c_sql).fetchdf()

In [16]:
dfc

Unnamed: 0,county
0,AVON
1,BATH AND NORTH EAST SOMERSET
2,BEDFORD
3,BEDFORDSHIRE
4,BERKSHIRE
...,...
127,WOKINGHAM
128,WORCESTERSHIRE
129,WREKIN
130,WREXHAM


In [17]:
dfc.shape

(132, 1)

## Getting Localities

In [10]:
l_sql = """
SELECT DISTINCT
  UPPER(REGEXP_REPLACE(TRIM(locality), '\s+', ' ')) AS locality
FROM ppd_src
WHERE locality IS NOT NULL
  AND TRIM(locality) <> ''
ORDER BY locality;
"""

In [11]:
dfl = con.execute(l_sql).fetchdf()

In [12]:
dfl

Unnamed: 0,locality
0,AB KETTLEBY
1,ABBERD
2,ABBERLEY
3,ABBERTON
4,ABBESS RODING
...,...
23993,ZENNOR
23994,ZONE 1
23995,ZONE 2
23996,ZONE 3


In [13]:
dfl.to_csv("../data/lookups/localities.csv", index=None)

## Getting Postcode_distict_to-town

## Evaluating/Fixing Existing Code

In [18]:
from machine_learning.address_parser.src.tokens import tokenize, tokens2features

In [19]:
tokenize("flat 25 willowbrook drive GL510pu cheltenham gloucestershire")

['FLAT', '25', 'WILLOWBROOK', 'DRIVE', 'GL510PU', 'CHELTENHAM']

In [20]:
tokens2features(['FLAT', '25', 'WILLOWBROOK', 'DRIVE', 'GL510PU', 'CHELTENHAM'])

[{'digits': 'no_digits',
  'word': 'FLAT',
  'length': 'w:4',
  'endsinpunc': False,
  'directional': False,
  'outcode': False,
  'posttown': False,
  'has.vowels': True,
  'flat': True,
  'company': False,
  'road': False,
  'residential': False,
  'business': False,
  'locational': False,
  'ordinal': False,
  'hyphenations': 0,
  'next': {'digits': 'all_digits',
   'word': False,
   'length': 'd:2',
   'endsinpunc': False,
   'directional': False,
   'outcode': False,
   'posttown': False,
   'has.vowels': False,
   'flat': False,
   'company': False,
   'road': False,
   'residential': False,
   'business': False,
   'locational': False,
   'ordinal': False,
   'hyphenations': 0},
  'rawstring.start': True},
 {'digits': 'all_digits',
  'word': False,
  'length': 'd:2',
  'endsinpunc': False,
  'directional': False,
  'outcode': False,
  'posttown': False,
  'has.vowels': False,
  'flat': False,
  'company': False,
  'road': False,
  'residential': False,
  'business': False,
  'lo

## Pre-processing Data

In [21]:
import pandas as pd

In [22]:
df = pd.read_csv("../data/interim/ppd_address_sql_v01.csv")

In [23]:
df.head()

Unnamed: 0,property_type,BuildingName,BuildingNumber,SubBuildingName,StreetName,Locality,TownName,Postcode,Outcode,Incode
0,S,,41.0,,GRANGE STREET,ST. ALBANS,ST. ALBANS,AL3 5NA,AL3,5NA
1,T,,5.0,,WOOD CLOSE,HATFIELD,HATFIELD,AL10 8TY,AL10,8TY
2,T,,1.0,,PUTTOCKS CLOSE,WELHAM GREEN,HATFIELD,AL9 7LN,AL9,7LN
3,D,WATLING GARTH,,,OLD WATLING STREET,FLAMSTEAD,ST. ALBANS,AL3 8HJ,AL3,8HJ
4,T,,32.0,,NEW GREENS AVENUE,ST. ALBANS,ST. ALBANS,AL3 6NZ,AL3,6NZ


In [24]:
df.shape

(27482941, 10)

In [26]:
df.drop_duplicates(inplace=True)

In [32]:
df.shape

(17974221, 10)

In [34]:
df.to_csv("../data/interim/ppd_address_sql_v01.csv", index=None)

In [35]:
df.columns

Index(['property_type', 'BuildingName', 'BuildingNumber', 'SubBuildingName',
       'StreetName', 'Locality', 'TownName', 'Postcode', 'Outcode', 'Incode'],
      dtype='object')

## Data to XML

In [13]:
import pandas as pd
from machine_learning.address_parser.src.train_test_stratify_split import write_train_test_streaming

In [14]:
df = pd.read_csv("../data/interim/ppd_address_sql_v01.csv")

In [5]:
out = write_train_test_streaming(
    df,
    output_dir="../data/processed/ppd_xml/2025-08-22",
    n_total=1_000_000,                # pick this many rows in a balanced way; None = use all available
    test_ratio=0.20,                  # 20% to holdout
    rows_per_shard=1_000_000,           # file size control , # >= n_total → no rollover
    seed=2025,
    swap_locality_prob=0.25,          # keep your randomisation
    include_property_type_in_strata=True,  # stratify by F vs non-F too
    train_prefix="train",
    holdout_prefix="holdout",
)

out["counts"], out["files"]["train"][:2], out["files"]["test"][:2]

({'train': 1599694, 'test': 400306},
 ['../data/processed/ppd_xml/2025-08-23_2M/train_0000.xml'],
 ['../data/processed/ppd_xml/2025-08-23_2M/holdout_0000.xml'])

In [16]:
out = write_train_test_streaming(
    df,
    output_dir="../data/processed/ppd_xml/2025-08-24_4M",
    n_total=4_000_000,                # pick this many rows in a balanced way; None = use all available
    test_ratio=0.20,                  # 20% to holdout
    rows_per_shard=4_000_000,           # file size control , # >= n_total → no rollover
    seed=2025,
    swap_locality_prob=0.25,          # keep your randomisation
    include_property_type_in_strata=True,  # stratify by F vs non-F too
    train_prefix="train",
    holdout_prefix="holdout",
)

out["counts"], out["files"]["train"][:2], out["files"]["test"][:2]

({'train': 3199782, 'test': 800218},
 ['../data/processed/ppd_xml/2025-08-24_4M/train_0000.xml'],
 ['../data/processed/ppd_xml/2025-08-24_4M/holdout_0000.xml'])

## Model Training

In [1]:
from machine_learning.address_parser.src.train_and_tune import train, tune

Train

In [2]:
scores = train(
    training_xml="../data/processed/ppd_xml/2025-08-22/train_0000.xml",
    holdout_xml="../data/processed/ppd_xml/2025-08-22/holdout_0000.xml",
    algorithm="lbfgs",
    c1=0.3,
    c2=0.001,
    min_freq=0.001,
    random_state=42,
    train_subset=None,   # e.g. 10000 to speed up
    eval_subset=None,
    model_name="address_crf_1M_v1.crfsuite",
    model_dir="../configs/model/training"
)
scores  # {'f1': ..., 'sequence_accuracy': ...}


Reading training XML: ../data/processed/ppd_xml/2025-08-23/train_0000.xml
Training sequences: 800039
Reading holdout XML:  ../data/processed/ppd_xml/2025-08-23/holdout_0000.xml
Holdout sequences:   199961

Start training...


loading training data to CRFsuite: 100%|██████████| 800039/800039 [02:22<00:00, 5625.17it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.001000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 472334
Seconds required: 33.284

L-BFGS optimization
c1: 0.300000
c2: 0.001000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=10.43 loss=9019705.93 active=447853 feature_norm=1.00
Iter 2   time=5.08  loss=5638637.89 active=445027 feature_norm=3.04
Iter 3   time=5.04  loss=4059576.40 active=446523 feature_norm=4.37
Iter 4   time=5.08  loss=3313221.12 active=452346 feature_norm=5.42
Iter 5   time=5.10  loss=2823547.02 active=455819 feature_norm=6.43
Iter 6   time=5.08  loss=2416002.55 active=449388 feature_norm=7.80
Iter 7   time=5.11  loss=2128140.53 active=428116 feature_norm=9.18
Iter 8   time=5.08  loss=1938849.36 active=414960 feature_norm=10.64
Iter 9   time=5.12  loss=1815433.55 active=39

{'f1': 0.9625740513394881, 'sequence_accuracy': 0.8137336780672231}

Train with 2M

In [2]:
scores = train(
    training_xml="../data/processed/ppd_xml/2025-08-23_2M/train_0000.xml",
    holdout_xml="../data/processed/ppd_xml/2025-08-23_2M/holdout_0000.xml",
    algorithm="lbfgs",
    c1=0.3,
    c2=0.001,
    min_freq=0.001,
    random_state=42,
    train_subset=None,   # e.g. 10000 to speed up
    eval_subset=None,
    model_name="address_crf_2M_v1.crfsuite",
    model_dir="../configs/model/training"
)
scores  # {'f1': ..., 'sequence_accuracy': ...}

Reading training XML: ../data/processed/ppd_xml/2025-08-23_2M/train_0000.xml
Training sequences: 1599694
Reading holdout XML:  ../data/processed/ppd_xml/2025-08-23_2M/holdout_0000.xml
Holdout sequences:   400306

Start training...


loading training data to CRFsuite: 100%|██████████| 1599694/1599694 [04:49<00:00, 5520.01it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.001000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 595548
Seconds required: 69.218

L-BFGS optimization
c1: 0.300000
c2: 0.001000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=19.30 loss=17554902.24 active=568102 feature_norm=1.00
Iter 2   time=9.28  loss=10843815.71 active=562118 feature_norm=3.03
Iter 3   time=9.28  loss=7730000.88 active=566170 feature_norm=4.30
Iter 4   time=9.32  loss=6216600.11 active=572031 feature_norm=5.41
Iter 5   time=9.30  loss=5182593.43 active=574421 feature_norm=6.66
Iter 6   time=9.28  loss=4569497.74 active=568005 feature_norm=7.97
Iter 7   time=10.54 loss=4204927.95 active=557518 feature_norm=8.83
Iter 8   time=9.39  loss=3741474.99 active=536957 feature_norm=10.50
Iter 9   time=9.43  loss=3355970.35 active=

{'f1': 0.96639982720771, 'sequence_accuracy': 0.83622029147702}

Train with 4M

In [None]:
scores = train(
    training_xml="../data/processed/ppd_xml/2025-08-24_4M/train_0000.xml",
    holdout_xml="../data/processed/ppd_xml/2025-08-24_4M/holdout_0000.xml",
    algorithm="lbfgs",
    c1=0.3,
    c2=0.001,
    min_freq=0.001,
    random_state=42,
    train_subset=None,   # e.g. 10000 to speed up
    eval_subset=None,
    model_name="address_crf_4M_v1.crfsuite",
    model_dir="../configs/model/training"
)
scores  # {'f1': ..., 'sequence_accuracy': ...}

Reading training XML: ../data/processed/ppd_xml/2025-08-24_4M/train_0000.xml


Hyperparameter search (Not Implemented)

In [None]:
scores, best_params = tune(
    training_xml="../data/processed/ppd_xml/2025-08-22/train_0000.xml",
    holdout_xml="../data/processed/ppd_xml/2025-08-22/holdout_0000.xml",
    n_iter=50,
    cv=3,
    random_state=42,
    sequence_optimisation=True,   # optimise for sequence accuracy
    plot_path="hyperparams.png",  # optional
    pickle_path="optimisation.pickle"
)
best_params, scores


## Model Inference

In [4]:
from machine_learning.address_parser.src.parser import tag, parse

In [12]:
tag(
    raw="10 QUEEN STREET FLAT 2 BURY BL8 1JG",
    model_path="../configs/model/training/address_crf_2M_v1.crfsuite"
)

{'BuildingNumber': '10',
 'StreetName': 'QUEEN STREET',
 'SubBuildingName': 'FLAT 2',
 'TownName': 'BURY',
 'Postcode': 'BL8 1JG'}

In [6]:
parse(
    raw="FLAT 2 10 QUEEN STREET BURY BL8 1JG",
    model_path="../configs/model/training/address_crf_2M_v1.crfsuite"
)

[('FLAT', 'SubBuildingName'),
 ('2', 'SubBuildingName'),
 ('10', 'BuildingNumber'),
 ('QUEEN', 'StreetName'),
 ('STREET', 'StreetName'),
 ('BURY', 'TownName'),
 ('BL8', 'Postcode'),
 ('1JG', 'Postcode')]