In [10]:
#importing necessary libraries
import csv
import json
import re
from collections import OrderedDict
from os import path

import pandas as pd
import usaddress
from address_parser import Parser
from IPython.display import clear_output

#creating functions to parse addresses


# this function creates a new CSV file containing only the header row; it won't overwrite an existing file
def create_dataframe(database_name, header):
    """Create a CSV file holding only a header row, unless the file already exists.

    Args:
        database_name: Path of the CSV file to create.
        header: Sequence of column names written as the first (and only) row.

    Returns:
        None. A short status message is printed in either case.
    """
    # Guard clause: an existing file is left completely untouched.
    if path.exists(database_name):
        print('CSV already exists.')
        return

    # newline='' lets the csv module control line endings itself.
    with open(database_name, 'w', newline='') as csv_file:
        csv.writer(csv_file).writerow(header)
    print('New CSV created.')
        
# this function converts the output of usaddress.tag (a sequence whose first item is an OrderedDict) into a one-row Pandas dataframe
def convert_ord_dict(ordered_dict):
    """Turn a tagged address into a one-row DataFrame.

    Args:
        ordered_dict: A sequence whose first item is a mapping of
            component label -> component value (the caller passes the
            result of ``usaddress.tag`` here). Only item 0 is used.

    Returns:
        A DataFrame with a single row (index label 0) whose columns are the
        mapping's keys, in order, and whose cells are the matching values.

    Raises:
        ValueError: If the mapping is empty, so that the caller's fallback /
            failure handling is triggered instead of an empty row being
            written.
    """
    tagged = ordered_dict[0]
    labels = list(tagged.keys())
    values = list(tagged.values())

    if not labels:
        raise ValueError('no tagged address components to convert')

    # Build the row directly. The previous implementation stacked two frames
    # with DataFrame.append, which no longer exists in pandas 2.x.
    return pd.DataFrame([values], columns=labels)

# this function converts the output of usaddress.parse (a list of (token, label) pairs) into a one-row Pandas dataframe
def convert_dict(dict):
    """Turn a list of (token, label) pairs into a one-row DataFrame.

    Despite the parameter name, the argument is not a mapping: the caller
    passes the result of ``usaddress.parse``, i.e. an iterable of
    ``(token, label)`` pairs. The parameter name is kept for compatibility
    (note that it shadows the built-in ``dict`` inside this function).

    Tokens that share a label are joined with a single space, in order of
    appearance, so every label becomes exactly one column. Without this,
    repeated labels produced duplicate column names and attribute access
    such as ``frame.StreetName`` returned a DataFrame instead of a Series.

    Args:
        dict: Iterable of ``(token, label)`` pairs.

    Returns:
        A DataFrame with a single row (index label 0); columns are the
        distinct labels in first-seen order, cells are the joined tokens.

    Raises:
        ValueError: If there are no pairs to convert.
    """
    merged = {}
    for token, label in dict:
        if label in merged:
            merged[label] = merged[label] + ' ' + token
        else:
            merged[label] = token

    if not merged:
        raise ValueError('no parsed address components to convert')

    return pd.DataFrame([list(merged.values())], columns=list(merged.keys()))

# this function strips the leading index label and the trailing 'Name: ...' footer from the printed form of a one-row pandas Series, leaving only the value
def clear_pandas_format(variable):
    """Extract the bare value from the printed form of a one-row Series.

    The text is expected to look like ``'0    value\\nName: col, dtype: ...'``:
    everything up to and including the ``'0    '`` index marker is dropped,
    as is everything from ``'Name:'`` onwards.

    Args:
        variable: The string to clean.

    Returns:
        The text between the index marker and the footer, whitespace-stripped.

    Raises:
        IndexError: If the ``'0    '`` marker does not occur in the string.
    """
    after_index_marker = variable.split('0    ')[1]
    before_footer = after_index_marker.split('Name:')[0]
    return before_footer.strip()

def _clean_address(raw_address):
    """Normalise a raw address string before it is handed to usaddress."""
    text = str(raw_address)

    # "st"/"st." in front of "petersburg" causes parse errors, so spell it out.
    # The word boundary keeps strings such as "west petersburg" intact.
    text = re.sub(r'\bst\.? petersburg', 'saint petersburg', text)
    text = text.replace('-', ' ')

    # Remove every parenthesised remark, leaving a space behind so the words
    # on either side do not get glued together. Repeating until nothing
    # changes also handles nested parentheses; unbalanced ones are left alone.
    stripped = re.sub(r'\([^()]*\)', ' ', text)
    while stripped != text:
        text = stripped
        stripped = re.sub(r'\([^()]*\)', ' ', text)

    # Collapse the runs of whitespace created above.
    return ' '.join(text.split())


def _ordinal_suffix(digits):
    """Return the English ordinal suffix ('st', 'nd', 'rd' or 'th') for a digit string."""
    # 11, 12 and 13 (also 111, 212, ...) are irregular and always take 'th'.
    if digits[-2:] in ('11', '12', '13'):
        return 'th'
    return {'1': 'st', '2': 'nd', '3': 'rd'}.get(digits[-1], 'th')


def _component(parsed_row, label):
    """Return the stripped value stored under `label` in a one-row frame, or None if absent."""
    if label not in parsed_row.columns:
        return None
    cell = parsed_row[label]
    if isinstance(cell, pd.DataFrame):
        # The label occurs more than once as a column; join the pieces in order.
        return ' '.join(str(piece) for piece in cell.iloc[0]).strip()
    return str(cell.iloc[0]).strip()


# this is the entrypoint function: input is the path of a CSV file with required columns
# 'defendant_address' and 'case_number', output is the path of the CSV file that receives the parsed addresses
def howard_address_parse(input, output):
    """Parse every address in a CSV file and append the parts to an output CSV.

    For each row of the input file the address is cleaned, parsed with
    ``usaddress.tag`` (falling back to ``usaddress.parse`` if tagging fails),
    and one row is appended to the output file. If the output file does not
    exist yet it is created with a header row first; an existing file is kept
    and appended to.

    Side effects:
        * Rebinds the module-level global ``failures`` to a fresh list and
          appends the zero-based row position of every address that could
          not be parsed.
        * Prints a progress line per row and clears the cell output afterwards.

    Args:
        input: Path of the CSV file to read. It must contain the columns
            'defendant_address' and 'case_number'.
        output: Path of the CSV file the parsed rows are appended to.

    Returns:
        None.
    """
    # failures is a list of the row positions that could not be parsed
    global failures
    failures = list()

    # reading input table
    input_table = pd.read_csv(input)

    # creating output file (kept as-is if it already exists)
    create_dataframe(output, ['case_number', 'address_number', 'street', 'city', 'state', 'zipcode', 'occupancy_type', 'occupancy_identifier', 'street_name', 'street_type', 'street_directional', 'unparsed_address'])

    case_numbers = input_table.case_number
    addresses = input_table.defendant_address
    total = len(addresses)

    # The output file is opened once for the whole run instead of once per row.
    with open(output, 'a', newline='') as outfile:
        writer = csv.writer(outfile, delimiter=',')

        for enum, (case_number, unparsed_address) in enumerate(zip(case_numbers, addresses)):

            # report position in loop, number of errors
            print("Starting " + str(enum + 1) + ". " + str(total - (enum + 1)) + " remain. There have been " + str(len(failures)) + " errors.")

            # fix formatting problems that hurt parsing and geocoding accuracy
            parse_ready = _clean_address(unparsed_address)

            try:
                # try the more complete tag parsing first; if that fails,
                # fall back to the less-featured parse
                try:
                    temp = convert_ord_dict(usaddress.tag(parse_ready))
                except Exception:
                    temp = convert_dict(usaddress.parse(parse_ready))

                address_number = _component(temp, 'AddressNumber')

                street_name = _component(temp, 'StreetName')
                # a street name that is only a number gets its ordinal suffix,
                # which improves geocoding accuracy
                if street_name is not None and street_name.isdigit():
                    street_name = street_name + _ordinal_suffix(street_name)

                occupancy_type = _component(temp, 'OccupancyType')

                occupancy_identifier = _component(temp, 'OccupancyIdentifier')
                if occupancy_identifier is not None:
                    occupancy_identifier = occupancy_identifier.replace(' ', '')

                city = _component(temp, 'PlaceName')
                state = _component(temp, 'StateName')
                zipcode = _component(temp, 'ZipCode')

                # street type and directional can appear before or after the
                # street name. Every value is recomputed for each row, so
                # nothing can leak over from the previous address.
                street_type_post = _component(temp, 'StreetNamePostType')
                street_type_pre = _component(temp, 'StreetNamePreType')
                # when both are present the pre-type wins (as before)
                street_type = street_type_pre if street_type_pre is not None else street_type_post

                street_directional_pre = _component(temp, 'StreetNamePreDirectional')
                street_directional_post = _component(temp, 'StreetNamePostDirectional')
                # when both are present the post-directional wins (as before)
                street_directional = street_directional_post if street_directional_post is not None else street_directional_pre

                # build a street address from the individual parsed parts;
                # this removes superfluous info and generally improves geocoding.
                # Missing parts are skipped rather than rendered as 'None' and
                # removed afterwards, which damaged names containing "None".
                street_parts = [address_number, street_directional_pre, street_type_pre, street_name, street_type_post, street_directional_post]
                street = ' '.join(part for part in street_parts if part).replace(',', '')
                street = ' '.join(street.split())

            except Exception:
                # Neither parser produced a usable result: write an empty row
                # and remember the position. `except Exception` (not a bare
                # except) keeps Ctrl-C / kernel interrupt working.
                address_number = None
                street_name = None
                occupancy_type = None
                occupancy_identifier = None
                street = None
                city = None
                state = None
                zipcode = None
                street_type = None
                street_directional = None
                failures.append(enum)

            # write data to output file
            data_out = [case_number, address_number, street, city, state, zipcode, occupancy_type, occupancy_identifier, street_name, street_type, street_directional, unparsed_address]
            writer.writerow(data_out)
            # flush so finished rows are on disk even if the run is interrupted
            outfile.flush()

            # this clears the output in the notebook after every loop
            clear_output(wait=True)

In [None]:
### CHANGE LOG ####
#remove commas before placename
    # add code to replace commas with nothing
    
#remove anything between parentheses
    # added code that splits before parens and after parens into two halves and puts them together
    # would break if there are multiple parentheses
    
#fix pre/post positional street direction
    # created pre and post for the purposes of creating the street address, still not assigning pre or post into dataframe

#add a th if a street name is just digits and does not have it
    # creating function that tests if int, then appends suffix


In [11]:
# Parse ./input/fl_pinellas_clean.csv into ./output/fl_pinellas_parsed.csv.
# NOTE: an existing output file is kept and rows are appended, so re-running this cell duplicates rows.
howard_address_parse('./input/fl_pinellas_clean.csv', './output/fl_pinellas_parsed.csv')

Starting 8167. 0 remain. There have been 0 errors.


In [5]:
# Parse ./input/fl_hills_clean.csv into ./output/fl_hills_parsed.csv.
# NOTE: an existing output file is kept and rows are appended, so re-running this cell duplicates rows.
howard_address_parse('./input/fl_hills_clean.csv', './output/fl_hills_parsed.csv')

Starting 15195. 0 remain. There have been 0 errors.


In [6]:
# Parse ./input/ga_dekalb_clean.csv into ./output/ga_dekalb_parsed.csv.
# NOTE: an existing output file is kept and rows are appended, so re-running this cell duplicates rows.
howard_address_parse('./input/ga_dekalb_clean.csv', './output/ga_dekalb_parsed.csv')

Starting 45070. 0 remain. There have been 0 errors.


In [7]:
# Parse ./input/ga_fulton_clean.csv into ./output/ga_fulton_parsed.csv.
# NOTE: an existing output file is kept and rows are appended, so re-running this cell duplicates rows.
howard_address_parse('./input/ga_fulton_clean.csv', './output/ga_fulton_parsed.csv')

Starting 57956. 0 remain. There have been 0 errors.


In [8]:
# Parse ./input/tn_shelby_clean.csv into ./output/tn_shelby_parsed.csv.
# NOTE: an existing output file is kept and rows are appended, so re-running this cell duplicates rows.
howard_address_parse('./input/tn_shelby_clean.csv', './output/tn_shelby_parsed.csv')

Starting 36410. 0 remain. There have been 0 errors.


In [9]:
# Parse ./input/wi_milw_clean.csv into ./output/wi_milw_parsed.csv.
# NOTE: an existing output file is kept and rows are appended, so re-running this cell duplicates rows.
howard_address_parse('./input/wi_milw_clean.csv', './output/wi_milw_parsed.csv')

Starting 19423. 0 remain. There have been 0 errors.
