In [1]:
import pandas as pd
import numpy as np
import glob
import re 
from tqdm import tqdm

In [2]:
from scourgify import normalize_address_record
import usaddress

In [3]:
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

In [4]:
#['address_number','street_name','street_type','unit_number'']
#['address_number','street_name','street_type','unit_number']

def tag_street(address):
    try:
        out = usaddress.tag(address, tag_mapping={
       'Recipient': 'unit_number',
       'AddressNumber': 'address_number',
       'AddressNumberPrefix': 'address_number',
       'AddressNumberSuffix': 'address_number',
       'StreetName': 'street_name',
       'StreetNamePreDirectional': 'street_name',
       'StreetNamePreModifier': 'street_name',
       'StreetNamePreType': 'street_name',
       'StreetNamePostDirectional': 'street_name',
       'StreetNamePostModifier': 'street_name',
       'SecondStreetName': 'street_name',
       'StreetNamePostType': 'street_type',
       'SecondStreetNamePostType': 'street_type',
       'CornerOf': 'street_name',
       'IntersectionSeparator': 'street_name',
       'LandmarkName': 'street_name',
       'USPSBoxGroupID': 'street_name',
       'USPSBoxGroupType': 'street_name',
       'USPSBoxID': 'street_name',
       'USPSBoxType': 'street_name',
       'BuildingName': 'unit_number',
       'OccupancyType': 'unit_number',
       'OccupancyIdentifier': 'unit_number',
       'SubaddressIdentifier': 'unit_number',
       'SubaddressType': 'unit_number',
       'PlaceName': 'city',
       'StateName': 'state',
       'ZipCode': 'postal_code'
        })
        return dict(out[0])
    except Exception as e_tag:
        return {'e_tag': e_tag}

In [5]:
def catch_norm_tag(full_address, func=normalize_address_record):
    try:
        out = func(full_address)
        out.update(tag_street(out['address_line_1']))
        return out
    
    except Exception as e_norm:
        out = tag_street(full_address)
        e_dict = {'e_norm': e_norm}
        out.update(e_dict)
        return out

In [6]:
# Deal with D+\sd+, d+\sD+, D+\s&\sD+ unit numbers
# Deal with "Buena Vista Avenue" which get confused:
    
def unit_extract(addy_series):
    
    df = pd.DataFrame(index=addy_series.index)

    # Split the last element of 'address' on the right - "suffix"
    df[['street_name','suffix']] = addy_series.str.strip().str.upper().str.rsplit(n=1, expand=True)

    ## Drop leading spaces & special characters in 'suffix'
    df['street_name'] = df['street_name'].str.strip()
    df['suffix'] = df['suffix'].str.strip()

    ## Find 'unit' numbers, such as: 1, 123, 12345D, 15CD, A, A1, A123:
    pat = '(\d+\D*|^[\D]\d+$|^[ABCD]$|^AB$|^CD$|^EF$|^FE$)'
    df['unit'] = df['suffix'].str.extract(pat)

    ## Streets such as 23rd, 1st, 102nd also match 'pat', remove them:
    df['unit']=df['unit'].str.replace('(\d*[RSTN][DTHD])','')

    ## Whatever is NOT unit should be added back to addess:
    m = df['unit'].isna()
    df.loc[m,'street_name'] = df.loc[m,'street_name'] + ' ' + df.loc[m,'suffix']
    df.loc[~m,'street_name'] = df.loc[~m,'street_name'] + ', UNIT ' + df.loc[~m,'suffix']

    #df.drop('suffix', axis=1, inplace=True)
    
    return df['street_name']

In [7]:
# Use fuzzywuzzy to match zip codes:

def fuzzy_correct(check_value, correct_set, thresh=80, fscorer=fuzz.WRatio, imp='#####'):
    
    # From great tuttorial here: https://michelleful.github.io/
    
    if check_value in correct_set:  # might want to make this a dict for O(1) lookups
        return check_value, 100

    new_value, score = process.extractOne(check_value, correct_set, scorer=fscorer)
    if score < thresh:
        return imp, score
    else:
        return new_value, score
    
def fuzzy_series(check_series, correct_areas, thresh=90, fscorer=fuzz.WRatio):
    
    # Series holders for return:
    check_new_areas = check_series.copy()
    check_count_areas = check_new_areas.copy()
    
    # List of unique values to check:
    check_areas = list(check_new_areas.unique())    

    # Dictionaries for mapping:
    count_dict = {}
    area_dict = {}

    # Build dictionaries for mapping with FuzzyWuzzy:
    for area in tqdm(check_areas):
        correct_area , count = fuzzy_correct(area, correct_areas, thresh, fscorer)
        count_dict[area] = count
        area_dict[area] = correct_area
    
    # Map dictionaries
    check_new_areas = check_new_areas.map(area_dict, na_action='ignore')
    check_count_areas = check_count_areas.map(count_dict, na_action='ignore')

    return check_new_areas, check_count_areas

# 1. Canonical addresses

In [8]:
# Column sthat we need for deduping:
addy_columns = ['Address', 'Address Number', 'Address Number Suffix', 'Street Name', 'Street Type', 
           'Zipcode', 'Longitude', 'Latitude']
column_names = ['address', 'address_number', 'address_number_suffix', 'street_name', 'street_type', 
           'zip', 'long', 'lat']

# Import Assessor's data:
addy_path = r'C:\SFSU\Spring 2020\Project\Data\sf-addresses-enterprise-addressing-system\addresses-enterprise-addressing-system.csv' # use your path
addy_frame = pd.read_csv(addy_path, header=0, index_col=False, 
                        dtype={'Zipcode':str, 'Address Number':str},
                        usecols= addy_columns).drop_duplicates()
addy_frame.columns = column_names
addy_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211956 entries, 0 to 212553
Data columns (total 8 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   address                211956 non-null  object 
 1   address_number         211956 non-null  object 
 2   address_number_suffix  3700 non-null    object 
 3   street_name            211956 non-null  object 
 4   street_type            209573 non-null  object 
 5   zip                    211956 non-null  object 
 6   long                   211956 non-null  float64
 7   lat                    211956 non-null  float64
dtypes: float64(2), object(6)
memory usage: 14.6+ MB


In [9]:
addy_frame['street_name'] = addy_frame.street_name.str.lstrip('0')

In [10]:
m = ~addy_frame.address_number_suffix.isna()
addy_frame[m].head()

Unnamed: 0,address,address_number,address_number_suffix,street_name,street_type,zip,long,lat
1,2433A 23RD AVE,2433,A,23RD,AVE,94116,-122.480264,37.742226
2,312A UNION ST,312,A,UNION,ST,94133,-122.404777,37.801114
37,369A DUNCAN ST,369,A,DUNCAN,ST,94131,-122.42829,37.745765
69,1726A CABRILLO ST,1726,A,CABRILLO,ST,94121,-122.477383,37.775
74,3233A 16TH ST,3233,A,16TH,ST,94110,-122.424803,37.764482


# 2. Sales

In [11]:
# Columns that we need for deduping:
sales_cols = ['Address','Zip Code']

# Import sales data:
sales_path = r'C:\SFSU\Spring 2020\Project\Data\salesframe.csv' # use your path
sales_frame = pd.read_csv(sales_path, header=0, index_col=False, usecols=sales_cols, 
                         dtype={'Zip Code':str})
sales_frame.columns = ['address', 'zip']

# Drop records without address & replace nan zips
sales_frame['zip'].fillna('99999', inplace=True)
sales_frame.drop(sales_frame[sales_frame['address'].isna()].index, inplace=True)

sales_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112970 entries, 0 to 112969
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   address  112970 non-null  object
 1   zip      112970 non-null  object
dtypes: object(2)
memory usage: 2.6+ MB


### 2.1. Sales frame preparation

In [12]:
# Split Address convenient columns:
sales_frame[['no1','street_name']] = sales_frame['address'].copy().str.split(n=1, expand=True)
sales_frame[['street_name','street_type']] = sales_frame['street_name'].copy().str.rsplit(n=1, expand=True)

# Process columns:
sales_frame['unit_number'] = sales_frame['street_type'].str.replace(r"[a-zA-Z]",'')
sales_frame['street_type'] = sales_frame['street_type'].str.replace(r"[0-9]",'')

# Build comparison column:
sales_frame['street_suffix'] = (sales_frame['street_name'].fillna('')
                                 +' ' + sales_frame['street_type'].fillna('')).str.lower()

sales_frame['address_number'] = sales_frame.no1

### 2.2. Full address

In [13]:
sales_frame['full_address'] = sales_frame.address

# 3. Pipeline

Although, addresses at the head of dataset look clean, there are in fact misformatted addresses. Such as addresses where street_name includes unit_number which will prevent us from mathcing the addresses as units also come in a vaiety of forms such as UNIT123, # 123, APP123, etc

### 3.1. Extract Unit numbers:

In [14]:
# Extract 'Unit' as parsers confuse them with street numbers:
sales_frame['full_address'] = unit_extract(sales_frame['full_address'])

# Construct full address for parser:
sales_frame['full_address'] = sales_frame['full_address'] + ', SAN-FRANCISCO, CA ' + sales_frame['zip']

### 3.2. Normalize full address:

In [15]:
# Parse, concatenate, drop duplicate columns:
df_out=pd.DataFrame(list(sales_frame.full_address.apply(lambda x: catch_norm_tag(x))), index=sales_frame.index)
df_out.index = sales_frame.index

In [16]:
m = df_out.e_norm.isna() | df_out.e_tag.isna()

# Keep exceptions:
sales_frame[['e_tag', 'e_norm']]= df_out[['e_tag', 'e_norm']]

# Replace with normalized columns:
for c in ['address_number','street_name','street_type','unit_number']:
    sales_frame.loc[m,c] = df_out.loc[m,c]

sales_frame.head(5)

Unnamed: 0,address,zip,no1,street_name,street_type,unit_number,street_suffix,address_number,full_address,e_tag,e_norm
0,1 NORTHWOOD DRIVE,94112,1,NORTHWOOD,DR,,northwood drive,1,"1 NORTHWOOD DRIVE, SAN-FRANCISCO, CA 94112",,
1,2909 Jennings Street,94124,2909,JENNINGS,ST,,jennings street,2909,"2909 JENNINGS STREET, SAN-FRANCISCO, CA 94124",,
2,631 Ofarrell 503,94109,631,OFARRELL,,,ofarrell,631,"631 OFARRELL, UNIT 503, SAN-FRANCISCO, CA 94109",,
3,250 Minerva,94112,250,MINERVA,,,minerva,250,"250 MINERVA, SAN-FRANCISCO, CA 94112",,
4,4021 Folsom,94110,4021,FOLSOM,,,folsom,4021,"4021 FOLSOM, SAN-FRANCISCO, CA 94110",,


### 3.3 Match streets with FuzzyWuzzy

#### FuzzyWuzzy application

In [17]:
# Set of streets:
sales_frame['street_name_type'] = (sales_frame.street_name.fillna('') +\
                                    ' '+sales_frame.street_type.fillna('')).str.strip() 

In [18]:
# Set of streets without suffix:
street_name_set = set(addy_frame['street_name'].dropna().unique())
                      
sales_frame['addy_street_name'], sales_frame['addy_street_name_score'] =\
fuzzy_series(sales_frame['street_name_type'], street_name_set, thresh=10)

100%|██████████████████████████████████████████████████████████████████████████████| 3591/3591 [02:45<00:00, 21.75it/s]


In [19]:
# Set of streets with suffix:
street_name_type_set = set((addy_frame.street_name.fillna('')+' '+addy_frame.street_type.fillna('')).unique())

sales_frame['addy_street_name_type'], sales_frame['addy_street_name_type_score'] =\
fuzzy_series(sales_frame['street_name_type'], street_name_type_set, thresh=10)

100%|██████████████████████████████████████████████████████████████████████████████| 3591/3591 [03:07<00:00, 19.12it/s]


In [20]:
m = (sales_frame['addy_street_name_score']>=91)|(sales_frame['addy_street_name_type_score']>=91)
sum(m)/sales_frame.shape[0]

0.9823315924581747

In [21]:
sum(sales_frame.street_name_type.str.strip() == '')/sales_frame.shape[0]

0.0026113127378950165

### 3.4 Match street and numbers:
#### 3.4.1 Replace canonical street_name_type
Depending on FuzzyWuzzy score, replace original street name with canonical street name with or without street type.
Note: we cannot autoatically add street_type at this point if it is missing.

In [22]:
sales_frame['number_street_type'] = sales_frame['street_name_type']

In [23]:
m91 = (sales_frame['addy_street_name_score']>=91)|(sales_frame['addy_street_name_type_score']>=91)
msfx = sales_frame['addy_street_name_type_score'] >= sales_frame['addy_street_name_score']

In [24]:
sales_frame.loc[~m91,'number_street_type'] = '#####'
sales_frame.loc[m91 & ~msfx,'number_street_type'] = sales_frame.loc[~msfx,'addy_street_name']
sales_frame.loc[m91 & msfx,'number_street_type'] = sales_frame.loc[msfx,'addy_street_name_type']

In [25]:
sales_frame['number_street_type'] = sales_frame['address_number']+' '+ sales_frame['number_street_type']#+' '+sales_frame['unit'].fillna('')

#### 3.4.2 Find mathcing addresses
After street_type is adjusted add street_number and look for mathcing addresses

In [26]:
# Addy without suffix:
addy_frame['number_street'] = addy_frame.address_number.astype(str)+\
' '+addy_frame.street_name.str.strip() 
addy_frame['number_street_type'] = addy_frame.address_number.astype(str)+\
' '+addy_frame.street_name.str.strip() +' '+addy_frame.street_type.str.strip() 

# Set of streets with suffix:
addy_number_street_set = set(addy_frame['number_street'].dropna().unique())
addy_number_street_type_set = set(addy_frame['number_street_type'].dropna().unique())

In [27]:
# Add type to streets that match without street_type:
m = sales_frame['number_street_type'].isin(addy_number_street_set)
ser = sales_frame['number_street_type'][m]
 
for idx in tqdm(ser.index):
    jdx = addy_frame[addy_frame['number_street'].isin([ser.loc[idx]])].index[0]
    sales_frame.loc[idx, 'street_type'] = addy_frame.loc[jdx, 'street_type']
    sales_frame.loc[idx, 'number_street_type'] = addy_frame.loc[jdx, 'number_street_type']

 32%|████████████████████████▍                                                    | 8987/28296 [03:14<06:46, 47.51it/s]

KeyboardInterrupt: 

In [None]:
sales_frame['found'] = sales_frame['number_street_type'].isin(addy_number_street_type_set)

In [None]:
sales_frame['found'].sum()/sales_frame.shape[0]

### Merge

In [None]:
addy_frame.columns

In [None]:
addy_frame.head()

In [None]:
sales_frame.head()

In [None]:
sales_out = pd.merge(sales_frame.loc[sales_frame.found, ['address','number_street_type']],
         addy_frame[['number_street_type','long','lat']],
         how='inner', 
         on =['number_street_type'])

In [None]:
sales_out.head()

In [None]:
# Import sales data:
sales_path = r'C:\SFSU\Spring 2020\Project\Data\salesframe.csv' # use your path
sales_frame = pd.read_csv(sales_path, header=0, index_col=False, 
                         dtype={'Zip Code':str})

#### 3.4.3 Match and geotag

In [None]:
sales_out = pd.merge(sales_out, sales_frame,
                      how='left', 
                      left_on = ['address'],
                      right_on =['Address'])

In [None]:
sales_out.info()

In [None]:
sales_path = r'C:\SFSU\Spring 2020\Project\Data\salesframe_tagged.csv'
sales_out.to_csv(sales_path)

In [None]:
sales_path = r'C:\SFSU\Spring 2020\Project\Data\salesframe_tagged.csv'
sales_out = pd.read_csv(sales_path)

In [None]:
assess_path = r'C:\SFSU\Spring 2020\Project\Data\assessor\Historic_Secured_Property_Tax_Rolls_Processed.csv'
assess_out = pd.read_csv(assess_path)

In [None]:
for c in ['Number of Bedrooms','Property Area in Square Feet','Lot Area']:
    assess_out[c] = pd.to_numeric(assess_out[c], errors='coerce', downcast='integer')

In [None]:
for c in ['Beds','Property Area in Square Feet','Lot Area']:
    sales_out[c] = pd.to_numeric(sales_out[c], errors='coerce', downcast='integer')

In [None]:
merged_df = pd.merge(sales_out, assess_out,
                      how='inner', 
                      left_on = ['number_street_type','Beds'],
                      right_on =['number_street_type','Number of Bedrooms'])

In [None]:
sales_out.info()