In [None]:
import re
import glob
import math
import pandas as pd
import pdftotext
import folium
from folium.plugins import FastMarkerCluster
import pgeocode

# Gather every PDF in the 'requests' folder; each file becomes one row.
files = glob.glob('requests/*.pdf')

# One row per file, with the result columns created up front as empty
# strings so they can be filled in row by row later on.
df = pd.DataFrame({'Filename': files}).assign(
    **{'Postcode': '', 'Area code': '', 'Match class': ''}
)

# Map object that the markers are added to, and the postcode lookup
# (created with the 'gb' country code).
pet_map = folium.Map(location=(52.5236, -2.0))
nomi = pgeocode.Nominatim('gb')


def extract_postcode(page):
    """Find a UK-style postcode in the text of a page.

    Two strategies are tried in order:

    1. Class 1: a postcode introduced by the label ``Postcode:``. It is
       accepted only when the page contains exactly one such labelled
       postcode.
    2. Class 2: the first postcode-shaped string anywhere on the page
       that is not one of the hard-coded excluded postcodes.

    Matching is case-insensitive and the space inside the postcode is
    optional.

    Args:
        page: Text to search.

    Returns:
        A ``(postcode, match_class)`` tuple. ``postcode`` is normalised by
        ``reformat_postcode`` and ``match_class`` is 1 or 2 as described
        above; ``('No match', 0)`` is returned when nothing usable is found.
    """
    # Raw string so that \d reaches the regex engine unchanged.
    postcode_pattern_class2 = r"[A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}"
    # The capture group makes findall return just the postcode, whatever
    # the case of the label or the number of spaces that follow it.
    postcode_pattern_class1 = f"Postcode: +({postcode_pattern_class2})"
    # Postcodes that are never reported as a class 2 match.
    # NOTE(review): presumably addresses printed on every form - confirm.
    excluded_postcodes = {'CV34 6DA', 'B15 2TH', 'WV10 0QP'}

    labelled = re.findall(postcode_pattern_class1, page, re.IGNORECASE)
    if len(labelled) == 1:
        return (reformat_postcode(labelled[0]), 1)

    for code in re.findall(postcode_pattern_class2, page, re.IGNORECASE):
        cleaned_postcode = reformat_postcode(code)
        # Compare the normalised form so that case and the optional space
        # cannot let an excluded postcode through.
        if cleaned_postcode not in excluded_postcodes:
            return (cleaned_postcode, 2)
    return ('No match', 0)

def reformat_postcode(code):
    """Normalise a postcode to upper case with a single separating space.

    All whitespace in ``code`` is discarded first, so leading, trailing or
    missing spaces do not affect the result. The last three remaining
    characters form the second part and everything before them forms the
    first part.

    Args:
        code: Postcode text, e.g. ``'wr3 8Pz'`` or ``'WR38PZ'``.

    Returns:
        The postcode as ``'<first part> <last three characters>'`` in upper
        case, e.g. ``'WR3 8PZ'``.
    """
    # split() with no argument drops every run of whitespace, including
    # any at the start or end of the string.
    compact = ''.join(code.split()).upper()
    return f'{compact[:-3]} {compact[-3:]}'

# Inline sanity checks for extract_postcode: each sample text is paired
# with the (postcode, match class) result it must produce.
for _sample_text, _expected_result in (
    ('Postcode: wr3 8Pz', ('WR3 8PZ', 1)),              # labelled postcode
    ('CV34 6DA Postco wr3 8Pz', ('WR3 8PZ', 2)),        # unlabelled, first one skipped
    ('CV34 6DA Postcode not present', ('No match', 0)),  # only a skipped postcode
):
    assert extract_postcode(_sample_text) == _expected_result
        
    
# Process every request form: pull the postcode from the first page of the
# PDF, record it in the DataFrame, and collect coordinates for the map.
map_points = []
for idx, row in df.iterrows():
    print(f'Processing form index {idx}')
    with open(row['Filename'], "rb") as f:
        pdf = pdftotext.PDF(f)
    # Only the first page is searched. A PDF without pages is treated as
    # empty text so it is recorded as 'No match' instead of raising.
    first_page = pdf[0] if len(pdf) > 0 else ''
    postcode, match_class = extract_postcode(first_page)
    df.loc[idx, 'Postcode'] = postcode
    # Dropping the last four characters (space + three characters) leaves
    # the first part of the normalised postcode.
    df.loc[idx, 'Area code'] = 'No match' if match_class == 0 else postcode[:-4]
    df.loc[idx, 'Match class'] = match_class
    if match_class == 0:
        # The 'No match' placeholder is not a postcode; skip the lookup.
        continue
    # Query once and read both coordinates from the same result.
    location = nomi.query_postal_code(postcode)
    lat = location.latitude
    long = location.longitude
    # Postcodes the lookup cannot resolve come back as NaN; leave them
    # off the map.
    if not (math.isnan(lat) or math.isnan(long)):
        map_points.append([lat, long])

df = df.sort_values(['Postcode'])
df.to_csv('PET-CT_postcode_data.csv')
pet_map.add_child(FastMarkerCluster(map_points))
pet_map.save('PET-CT_map.html')
