In [1]:
# Import the standard and non-standard libraries we will use.
# re, glob and math are from the standard library
# pandas allows the construction of a dataframe for the results we generate
# pdftotext is self-explanatory
# folium handles the map generation
# pgeocode handles turning postcodes into latitude and longitude
import re
import glob
import math
import pandas as pd
import pdftotext
import folium
from folium.plugins import FastMarkerCluster
import pgeocode

# Define the constants that we need in one place

# Postcodes that extract_postcode ignores when falling back to the relaxed
# (class 2) match. NOTE(review): presumably these are addresses pre-printed on
# the request form itself rather than the one being extracted - confirm.
REQUEST_FORM_POSTCODES = ['CV34 6DA', 'B15 2TH', 'WV10 0QP']
# Directory prefix that main_processing globs for '*.pdf' files.
REQUESTS_PATH = './requests/'
# Passed to folium.Map as its `location` argument.
MAP_LOCATION = (52.5236, -2.0)
# Country code handed to pgeocode.Nominatim for the postcode lookups.
COUNTRY = 'gb'
# File that the results dataframe is written to with DataFrame.to_csv.
OUTPUT_SHEET = 'PET-CT_postcode_data.csv'
# File that the folium map is saved to.
OUTPUT_MAP = 'PET-CT_map.html'

def extract_postcode(page):
    """Pull the postcode out of the text of a request's first page.

    Two strategies are tried in order:

    1. Class 1 (definite): the page contains exactly one postcode directly
       after a ``Postcode:`` label. The label is matched case-insensitively
       and may be followed by one or more spaces.
    2. Class 2 (relaxed, less reliable): the first bare postcode anywhere on
       the page that is not one of ``REQUEST_FORM_POSTCODES``. This is also
       used when the label appears more than once.

    Args:
        page: Text of the page to search.

    Returns:
        A ``(postcode, match_class)`` tuple. ``postcode`` is the code tidied
        by ``reformat_postcode``, or ``'No match'``; ``match_class`` is 1 or 2
        as described above, or 0 when nothing suitable was found.
    """
    # Raw strings stop ``\d`` being read as an (invalid) string escape.
    bare_pattern = r"[A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}"
    # The capture group hands back only the code, so the result does not
    # depend on how the label was capitalised or how many spaces followed it.
    labelled_pattern = rf"Postcode: +({bare_pattern})"

    # First look for a definite match
    labelled = re.findall(labelled_pattern, page, re.IGNORECASE)
    if len(labelled) == 1:
        return (reformat_postcode(labelled[0]), 1)

    candidates = re.findall(bare_pattern, page, re.IGNORECASE)

    # No match
    if not candidates:
        return ('No match', 0)

    # More relaxed match, but less reliable. Compare in tidied form so that a
    # form postcode written as e.g. 'cv34 6da' or 'CV346DA' is still skipped.
    excluded = {reformat_postcode(known) for known in REQUEST_FORM_POSTCODES}
    for code in candidates:
        cleaned_postcode = reformat_postcode(code)
        if cleaned_postcode not in excluded:
            return (cleaned_postcode, 2)
    return ('No match', 0)

def reformat_postcode(code):
    """Tidy a postcode into upper case with one space before its last three characters.

    All whitespace in ``code`` is discarded before the split, so leading,
    trailing or repeated spaces do not affect the result.

    Args:
        code: The postcode text, with or without a separating space.

    Returns:
        The upper-cased code in the form ``'<area> <last three characters>'``.
    """
    # str.split() with no argument drops every run of whitespace, including
    # any at the ends, leaving just the postcode characters.
    compact = ''.join(code.split()).upper()
    return f'{compact[:-3]} {compact[-3:]}'



def main_processing():
    """Extract a postcode from every request PDF, then write a CSV and a map.

    Each ``*.pdf`` under ``REQUESTS_PATH`` is opened, the text of its first
    page is passed to ``extract_postcode``, and one row per file (filename,
    postcode, area code, match class) is collected. Postcodes that were
    matched are looked up with pgeocode, and those that resolve to
    coordinates are added to the map.

    Side effects:
        Prints one progress line per file, writes the rows (sorted by
        postcode) to ``OUTPUT_SHEET`` and saves the map to ``OUTPUT_MAP``.
    """
    # glob returns paths in arbitrary order; sorting keeps the row index that
    # is printed and written to the CSV the same from run to run.
    files = sorted(glob.glob(f'{REQUESTS_PATH}*.pdf'))
    nomi = pgeocode.Nominatim(COUNTRY)

    columns = ['Filename', 'Postcode', 'Area code', 'Match class']
    rows = []
    map_points = []
    for idx, filename in enumerate(files):
        with open(filename, "rb") as f:
            pdf = pdftotext.PDF(f)
        # A PDF with no pages has no first page to index; search empty text
        # instead so the file is recorded as 'No match' rather than crashing.
        first_page = pdf[0] if len(pdf) > 0 else ''
        postcode, match_class = extract_postcode(first_page)
        print(f'Processing form with index {idx}:  Postcode = {postcode}')
        rows.append({
            'Filename': filename,
            'Postcode': postcode,
            # Dropping the last four characters (the space and the final
            # three characters) leaves the leading part of the code.
            'Area code': 'No match' if match_class == 0 else postcode[:-4],
            'Match class': match_class,
        })

        # There is nothing to geocode when no postcode was found.
        if match_class == 0:
            continue
        location_info = nomi.query_postal_code(postcode)
        lat, long = location_info.latitude, location_info.longitude
        # Only plot the point when both coordinates are real numbers.
        if not (math.isnan(lat) or math.isnan(long)):
            map_points.append([lat, long])

    # Passing `columns` keeps the header intact even when no PDFs were found.
    df = pd.DataFrame(rows, columns=columns)
    df = df.sort_values(['Postcode'])
    df.to_csv(OUTPUT_SHEET)

    pet_map = folium.Map(location=MAP_LOCATION)
    pet_map.add_child(FastMarkerCluster(map_points))
    pet_map.save(OUTPUT_MAP)
    
def unit_tests():
    """Check extract_postcode against one sample page for each match class."""
    expectations = (
        # Labelled postcode: definite match, returned tidied and upper-cased.
        ('Postcode: wr3 8Pz', ('WR3 8PZ', 1)),
        # Label is malformed, so the relaxed match is used; 'CV34 6DA' is one
        # of REQUEST_FORM_POSTCODES and is passed over for the next code.
        ('CV34 6DA Postco wr3 8Pz', ('WR3 8PZ', 2)),
        # The only postcode on the page is a form postcode, so nothing is left.
        ('CV34 6DA Postcode not present', ('No match', 0)),
    )
    for page_text, expected in expectations:
        assert extract_postcode(page_text) == expected

            
# Run the self-checks first: a failing assert stops execution here, before any
# request files are processed.
unit_tests()
main_processing()

# After running, a .csv and an .html file will be present in the working directory:
# the .csv holds the extracted postcode data and the .html shows the locations
# plotted on a map.



Processing form with index 0:  Postcode = WV11 1UP
Processing form with index 1:  Postcode = WV16 4BB
