In [1]:
import json
import os

import pandas as pd
from postal.parser import parse_address

from utils import to_dict

In [2]:
# Lift the column display cap so wide tables show every column (with scrolling)
pd.options.display.max_columns = None

# Libpostal, Reborn!

This notebook reproduces the code examples from the blog post Libpostal, Reborn!

## Structured Address Matching

Let's parse two similar addresses to see how Libpostal is useful for address matching.

In [3]:
# Two differently formatted address strings that are parsed and compared below
address_a = "#03-28, 400 Orchard Rd, 238875 SG"
address_b = "400 Orchard Tower #03-28 Orchard Road, Singapore, 238875 Singapore"

In [4]:
# Parse address a; the result displays as a list of (value, label) tuples
parsed_a = parse_address(address_a)
parsed_a

[('#03-28', 'unit'),
 ('400', 'house_number'),
 ('orchard rd', 'road'),
 ('238875', 'postcode'),
 ('sg', 'country')]

In [5]:
# Show the parsed components of address a as key-sorted, indented JSON
print(json.dumps(to_dict(parsed_a), indent=4, sort_keys=True))

{
    "country": "sg",
    "house_number": "400",
    "postcode": "238875",
    "road": "orchard rd",
    "unit": "#03-28"
}


In [6]:
# Parse address b; the result displays as a list of (value, label) tuples
parsed_b = parse_address(address_b)
parsed_b

[('400', 'house_number'),
 ('orchard tower', 'road'),
 ('#03-28', 'unit'),
 ('orchard road', 'road'),
 ('singapore', 'city'),
 ('238875', 'postcode'),
 ('singapore', 'country')]

In [7]:
# Show the parsed components of address b as key-sorted, indented JSON
print(json.dumps(to_dict(parsed_b), indent=4, sort_keys=True))

{
    "city": "singapore",
    "country": "singapore",
    "house_number": "400",
    "postcode": "238875",
    "road": [
        "orchard tower",
        "orchard road"
    ],
    "unit": "#03-28"
}


### How similar are the two parsed addresses?

How similar are the parsed results in literal terms? Three of the parsed components are exactly the same in both addresses.

In [8]:
# (value, label) pairs that appear identically in both parses
intersect = set(parsed_a) & set(parsed_b)
intersect

{('#03-28', 'unit'), ('238875', 'postcode'), ('400', 'house_number')}

### How similar are the parsed addresses semantically?

When we take into account commonalities like abbreviations and missing values, how similar are they?

In [9]:
# Components of address a that have no exact counterpart in address b
set(parsed_a).difference(intersect)

{('orchard rd', 'road'), ('sg', 'country')}

In [10]:
# Components of address b that have no exact counterpart in address a
set(parsed_b).difference(intersect)

{('orchard road', 'road'),
 ('orchard tower', 'road'),
 ('singapore', 'city'),
 ('singapore', 'country')}

### Logical Results

There are two road names because `orchard tower` is parsed as a `road`, and `singapore` appears as both a `city` and a `country`. `sg` and `singapore` match semantically. Libpostal doesn't do the matching for you—that part is up to you—but it gives you a huge head start!

# Exploring Senzing's New Validation Dataset

Senzing created a dataset of 12K validation records, which we pull from GitHub below. Let's check it out!

In [11]:
TEST_DATA_PATH = "data/test_data.csv"
TEST_DATA_URL = "https://github.com/Senzing/libpostal-data/raw/main/files/tests/v1.1.0/test_data.csv"

# Download the validation set once and cache it locally, so re-running the
# notebook reads the local copy instead of hitting the network again.
if os.path.exists(TEST_DATA_PATH):
    test_df = pd.read_csv(TEST_DATA_PATH)
else:
    test_df = pd.read_csv(TEST_DATA_URL)

    # to_csv does not create missing parent directories, so make sure the
    # cache directory exists on a fresh checkout.
    cache_dir = os.path.dirname(TEST_DATA_PATH)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # index=False keeps the row index out of the file, so the cached copy
    # reads back with the same columns as the freshly downloaded frame.
    test_df.to_csv(TEST_DATA_PATH, index=False)

# A cache written with the index included reads back with an extra
# "Unnamed: 0" column. Drop it when present; errors="ignore" avoids a
# KeyError when the frame has no such column.
test_df = test_df.drop(columns=["Unnamed: 0"], errors="ignore")

In [12]:
# Preview the first rows of the validation data
test_df.head(20)

Unnamed: 0,record_id,full_address,house,house_number,road,po_box,unit,level,staircase,suburb,city_district,city,state_district,state,postcode,country,world_region,category,near,country_code,source
0,1,Black Alliance for Just Immigration 660 Nostra...,black alliance for just immigration,660,nostrand ave,,,,,,brooklyn,,,n.y.,11216.0,,,,,us,libpostal
1,2,"Planned Parenthood, 44 Court St, 6th Floor, Br...",planned parenthood,44,court st,,,6th floor,,,brooklyn,,,,11201.0,,,,,us,libpostal
2,3,"Congresswoman Yvette Clarke 222 Lenox Road, St...",congresswoman yvette clarke,222,lenox road,,ste 1,,,,brooklyn,,,new york,11226.0,,,,,us,libpostal
3,4,"ACLU DC P.O. Box 11637 Washington, DC 20008 Un...",aclu dc,,,p.o. box 11637,,,,,,washington,,dc,20008.0,united states,,,,us,libpostal
4,5,Make the Road New York 92-10 Roosevelt Avenue ...,make the road new york,92-10,roosevelt avenue,,,,,jackson heights,queens,,,,11372.0,,,,,us,libpostal
5,6,"Do the Right Thing Way, Bed-Stuy, BK",,,do the right thing way,,,,,bed-stuy,bk,,,,,,,,,us,libpostal
6,7,book stores near me,,,,,,,,,,,,,,,,book stores,near me,us,libpostal
7,8,theatres in Fort Greene Brooklyn,,,,,,,,fort greene,brooklyn,,,,,,,theatres,in,us,libpostal
8,9,Barboncino 781 Franklin Ave Crown Heights Broo...,barboncino,781,franklin ave,,,,,crown heights,brooklyn,nyc,,ny,11216.0,usa,,,,us,libpostal
9,10,"103 BEAL PKWY SE, FT WALTON BEACH, FL",,103,beal pkwy se,,,,,,,ft walton beach,,fl,,,,,,us,libpostal


# Senzing Model Performance Improvements by Country

The Senzing model performance improvements are published in Markdown format, which can be read as CSV by Pandas.

In [13]:
PERF_STATS_URL = "https://raw.githubusercontent.com/Senzing/libpostal-data/main/files/stats/v1/Parsing_comparison.md"

# The Markdown table is pipe-delimited, so it can be read as a "|"-separated CSV
perf_df = pd.read_csv(PERF_STATS_URL, delimiter="|")

# Drop unnamed columns from Markdown pipes
perf_df = perf_df.loc[:, ~perf_df.columns.str.contains("^Unnamed")]

# Replace the --- placeholders with None. The dict form is used because a
# scalar to_replace with value=None was treated as a pad-fill on older pandas.
perf_df = perf_df.replace({"---": None})

# Make all columns after the country code floats. DataFrame.astype returns a
# frame with the new dtypes; assigning through .iloc[:, 1:] sets values in
# place on pandas 2.x and leaves the columns as object dtype.
numeric_columns = perf_df.columns[1:]
perf_df = perf_df.astype({column: float for column in numeric_columns})

# Sort the table to show largest improvement per country
perf_df = perf_df.sort_values(by="% improvement", ascending=False)

# 27 countries have a 10% or greater accuracy improvement
perf_df.head(27)

Unnamed: 0,Country,% improvement,Total records,Failures - default libpostal,Percent failed - default libposta,Failures - Senzing,Percent_failed - Senzing
45,kr,86.52,51.0,48.0,94.12,37.0,72.55
42,jm,82.4,50.0,45.0,90.0,33.0,66.0
24,eg,82.0,50.0,50.0,100.0,9.0,18.0
1,ae,74.0,50.0,47.0,94.0,10.0,20.0
43,jp,56.4,50.0,32.0,64.0,33.0,66.0
6,az,46.0,50.0,30.0,60.0,7.0,14.0
79,tw,45.76,177.0,175.0,98.87,94.0,53.11
60,pe,38.0,50.0,24.0,48.0,5.0,10.0
64,pr,26.0,50.0,50.0,100.0,37.0,74.0
68,rs,26.0,50.0,15.0,30.0,2.0,4.0
