In [1]:
import duckdb
con = duckdb.connect(database=':memory:')

In [2]:
addr_root = "../data/raw/ppd_addresses"
con.execute(f"""
    CREATE OR REPLACE VIEW ppd_src AS
    SELECT *
    FROM parquet_scan('{addr_root}/*.parquet', hive_partitioning=1)
""")

<duckdb.duckdb.DuckDBPyConnection at 0x104003cf0>

In [18]:
# Quick sanity check: expect only GL / 2025 rows below
df = con.execute("""
    SELECT *
    FROM ppd_src
    LIMIT 1
""").fetchdf()
df

Unnamed: 0,property_type,postcode,paon,saon,street,locality,town_city,district,county
0,S,AL35NA,41,,GRANGE STREET,ST. ALBANS,ST. ALBANS,ST ALBANS,HERTFORDSHIRE


In [19]:
df.columns

Index(['property_type', 'postcode', 'paon', 'saon', 'street', 'locality',
       'town_city', 'district', 'county'],
      dtype='object')

In [None]:
LABELS = [
    'OrganisationName', 'DepartmentName', 'SubBuildingName', 'BuildingName',
    'BuildingNumber', 'StreetName', 'Locality', 'TownName', 'Postcode'
]


{
    'BuildingNumber': '', # It comes from paon (it can only be a number, or number + buildingname (seperated by comma), or it has flat number)
    'BuildingName': '',
    'SubBuildingName': 'saon',
    'StreetName': 'street',
    'Locality': 'locality',
    'TownName': 'town_city',
    'Postcode': 'Postcode'
}

Checking patterns of addresses in PAON or SAON

In [20]:
tdf = con.execute("""
    SELECT DISTINCT property_type, paon, saon
    FROM ppd_src
    WHERE paon IS NOT NULL
      AND TRIM(paon) <> ''
      AND LOWER(paon) LIKE '%flat%'
      AND property_type != 'F'
    LIMIT 10
""").fetchdf()
tdf

Unnamed: 0,property_type,paon,saon
0,O,"FLAT 3, 46",
1,O,"FLAT 2, 30",
2,O,EBENEZER CHURCH FLATS,
3,O,HATCHETT FLATS,
4,O,"FLAT, 43",
5,O,ROOF AND AIRSPACE ABOVE FLATS 173 AND 183,
6,O,OSMUND FLATTS FARM,
7,O,"FLATS 1-8, 1 - 2",
8,S,GARDEN COTTAGE FLATS,4.0
9,O,"FLAT 1, 12",


Flat addresses also occur in PAON, but when they do SAON is usually empty but sometime has values. PAON has comma seperated values sometimes.

In [23]:
tdf = con.execute("""
    SELECT DISTINCT property_type, paon, saon
    FROM ppd_src
    WHERE saon IS NOT NULL
      AND property_type != 'F'

""").fetchdf()
tdf

Unnamed: 0,property_type,paon,saon
0,S,DUNORLAN FARM COTTAGE,2
1,S,IVY VILLAS,1
2,D,LONGSPRING WOOD,19
3,T,HILLSIDE,12
4,D,FARTHINGFIELD,3
...,...,...,...
253713,O,"MALTON ENTERPRISE PARK, 13",UNIT 1
253714,O,"MALTON ENTERPRISE PARK, 21",UNIT 1B
253715,O,FRIARS OVEN FARM,LOT 3
253716,O,GREATE BENTLEY ESTATE,"PLOTS 61 TO 65, 79, 80"


paon has (building name, building number) or (building number - can be alphanumeric) or (building name)

saon has (sub building name - like FLAT 2 or FLAT C) or (building number) or (building name - can be alphanumeric) or (building name)

## Filter data on more strict conditions

In [35]:
sql = """
WITH cleaned AS (
  SELECT
    TRIM(property_type)               AS property_type,
    UPPER(TRIM(postcode))             AS postcode,
    TRIM(paon)                        AS paon_raw,
    TRIM(saon)                        AS saon_raw,
    TRIM(street)                      AS street,
    TRIM(locality)                    AS locality,
    TRIM(town_city)                   AS town_city,
    TRIM(district)                    AS district,
    TRIM(county)                      AS county
  FROM ppd_src
),
patterns AS (
  SELECT
    *,
    -- PAON presence flags
    REGEXP_MATCHES(paon_raw, '^\s*\d+[A-Za-z]?(?:\s*-\s*\d+[A-Za-z]?)?\s*$')                                        AS paon_is_num_only,
    REGEXP_MATCHES(paon_raw, '^([A-Za-z][A-Za-z\s''&\.-]*?)\s*(?:,?\s*)(\d+[A-Za-z]?)\s*$')                         AS paon_is_name_num,
    REGEXP_MATCHES(paon_raw, '^\s*[A-Za-z][A-Za-z\s''&\.-]*\s*$')                                                   AS paon_is_name_only,

    -- PAON extracted parts (explicit capture groups!)
    REGEXP_EXTRACT(paon_raw, '^\s*(\d+[A-Za-z]?(?:\s*-\s*\d+[A-Za-z]?)?)\s*$', 1)                                   AS paon_num_only_val,
    REGEXP_EXTRACT(paon_raw, '^([A-Za-z][A-Za-z\s''&\.-]*?)\s*(?:,?\s*)(\d+[A-Za-z]?)\s*$', 1)                      AS paon_name_part,
    REGEXP_EXTRACT(paon_raw, '^([A-Za-z][A-Za-z\s''&\.-]*?)\s*(?:,?\s*)(\d+[A-Za-z]?)\s*$', 2)                      AS paon_num_from_name,
    REGEXP_EXTRACT(paon_raw, '^\s*([A-Za-z][A-Za-z\s''&\.-]*)\s*$', 1)                                              AS paon_name_only,

    -- SAON patterns (we use them only as validity checks)
    REGEXP_MATCHES(saon_raw, '(?i)^\s*(flat|apartment|apt|appts|room|unit|annexe?|block|blk|studio)\s+[0-9]+[A-Za-z]?\s*$') AS saon_is_flat_pat,
    REGEXP_MATCHES(saon_raw, '^\s*\d+[A-Za-z]?\s*$')                                                                  AS saon_is_num_only,
    REGEXP_MATCHES(saon_raw, '^[A-Za-z0-9][A-Za-z0-9\s''&\.-]*$')                                                     AS saon_is_name_like,

    -- Postcode split (validated)
    REGEXP_EXTRACT(
      postcode,
      '(?i)^\s*((?:GIR\s?0AA)|(?:[A-PR-UWYZ][0-9][0-9A-HJKMNPR-Y]?)|(?:[A-PR-UWYZ][A-HK-Y][0-9][0-9A-HJKMNPR-Y]?))\s*([0-9][ABD-HJLNP-UW-Z]{2})\s*$',
      1
    ) AS outcode,
    REGEXP_EXTRACT(
      postcode,
      '(?i)^\s*((?:GIR\s?0AA)|(?:[A-PR-UWYZ][0-9][0-9A-HJKMNPR-Y]?)|(?:[A-PR-UWYZ][A-HK-Y][0-9][0-9A-HJKMNPR-Y]?))\s*([0-9][ABD-HJLNP-UW-Z]{2})\s*$',
      2
    ) AS incode
  FROM cleaned
),
cohort AS (
  SELECT
    property_type,
    street, locality, town_city, district, county,

    -- Normalised PAON parts
    CASE
      WHEN paon_is_name_num THEN paon_name_part
      WHEN paon_is_name_only THEN paon_name_only
      ELSE NULL
    END AS paon_building_name,
    CASE
      WHEN paon_is_name_num THEN paon_num_from_name
      WHEN paon_is_num_only THEN paon_num_only_val
      ELSE NULL
    END AS paon_building_number,

    -- SAON accepted shapes (keep original text)
    CASE
      WHEN saon_is_flat_pat OR saon_is_num_only OR saon_is_name_like THEN saon_raw
      ELSE NULL
    END AS saon_norm,

    outcode, incode
  FROM patterns
  WHERE
    (paon_is_num_only OR paon_is_name_num OR paon_is_name_only)
    AND outcode IS NOT NULL AND incode IS NOT NULL
    AND (
      (property_type = 'F' AND (saon_is_flat_pat OR saon_is_num_only OR saon_is_name_like))
      OR
      (property_type <> 'F' AND (saon_raw IS NULL OR saon_raw = ''))
    )
)
SELECT
  property_type,
  paon_building_name                    AS BuildingName,
  paon_building_number                  AS BuildingNumber,
  saon_norm                             AS SubBuildingName,
  street                                AS StreetName,
  locality                              AS Locality,
  town_city                             AS TownName,
  outcode || ' ' || incode              AS postcode,
  outcode                               AS Outcode,
  incode                                AS Incode
FROM cohort;
"""

In [36]:

{
    'BuildingNumber': '', # It comes from paon (it can only be a number, or number + buildingname (seperated by comma), or it has flat number)
    'BuildingName': '',
    'SubBuildingName': 'saon',
    'StreetName': 'street',
    'Locality': 'locality',
    'TownName': 'town_city',
    'Postcode': 'Postcode'
}

{'BuildingNumber': '',
 'BuildingName': '',
 'SubBuildingName': 'saon',
 'StreetName': 'street',
 'Locality': 'locality',
 'TownName': 'town_city',
 'Postcode': 'Postcode'}

In [None]:
df = con.execute(sql).fetchdf()
df

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [30]:
df[df['property_type'] == 'F'].head()

Unnamed: 0,property_type,paon,saon,street,locality,town_city,postcode
8,F,JORDANS,14,HILLY FIELDS,WELWYN GARDEN CITY,WELWYN GARDEN CITY,AL7 2HD
31,F,11,FLAT 6,AVENUE ROAD,ST. ALBANS,ST. ALBANS,AL1 3QG
40,F,ST JOHNS COURT,3,ST JOHNS ROAD,HARPENDEN,HARPENDEN,AL5 1DL
55,F,HOLWELL COURT,FLAT 11,HOLWELL,ESSENDON,HATFIELD,AL9 5RL
75,F,18,FLAT 2,LEMSFORD ROAD,ST. ALBANS,ST. ALBANS,AL1 3PB


## Evaluating/Fixing Existing Code

In [15]:
from machine_learning.address_parser.src.tokens import tokenize, tokens2features

In [16]:
tokenize("flat 25 willowbrook drive GL510pu cheltenham gloucestershire")

['FLAT', '25', 'WILLOWBROOK', 'DRIVE', 'GL510PU', 'CHELTENHAM']

In [17]:
tokens2features(['FLAT', '25', 'WILLOWBROOK', 'DRIVE', 'GL510PU', 'CHELTENHAM'])

[{'digits': 'no_digits',
  'word': 'FLAT',
  'length': 'w:4',
  'endsinpunc': False,
  'directional': False,
  'outcode': False,
  'posttown': False,
  'has.vowels': True,
  'flat': True,
  'company': False,
  'road': False,
  'residential': False,
  'business': False,
  'locational': False,
  'ordinal': False,
  'hyphenations': 0,
  'next': {'digits': 'all_digits',
   'word': False,
   'length': 'd:2',
   'endsinpunc': False,
   'directional': False,
   'outcode': False,
   'posttown': False,
   'has.vowels': False,
   'flat': False,
   'company': False,
   'road': False,
   'residential': False,
   'business': False,
   'locational': False,
   'ordinal': False,
   'hyphenations': 0},
  'rawstring.start': True},
 {'digits': 'all_digits',
  'word': False,
  'length': 'd:2',
  'endsinpunc': False,
  'directional': False,
  'outcode': False,
  'posttown': False,
  'has.vowels': False,
  'flat': False,
  'company': False,
  'road': False,
  'residential': False,
  'business': False,
  'lo