In [2]:
import pandas as pd
import numpy as np
import glob
import re 
from tqdm import tqdm

In [3]:
from scourgify import normalize_address_record
import usaddress

In [4]:
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

In [4]:
#['address_number','street_name','street_type','unit_number'']
#['address_number','street_name','street_type','unit_number']

def tag_street(address):
    try:
        out = usaddress.tag(address, tag_mapping={
       'Recipient': 'unit_number',
       'AddressNumber': 'address_number',
       'AddressNumberPrefix': 'address_number',
       'AddressNumberSuffix': 'address_number',
       'StreetName': 'street_name',
       'StreetNamePreDirectional': 'street_name',
       'StreetNamePreModifier': 'street_name',
       'StreetNamePreType': 'street_name',
       'StreetNamePostDirectional': 'street_name',
       'StreetNamePostModifier': 'street_name',
       'SecondStreetName': 'street_name',
       'StreetNamePostType': 'street_type',
       'SecondStreetNamePostType': 'street_type',
       'CornerOf': 'street_name',
       'IntersectionSeparator': 'street_name',
       'LandmarkName': 'street_name',
       'USPSBoxGroupID': 'street_name',
       'USPSBoxGroupType': 'street_name',
       'USPSBoxID': 'street_name',
       'USPSBoxType': 'street_name',
       'BuildingName': 'unit_number',
       'OccupancyType': 'unit_number',
       'OccupancyIdentifier': 'unit_number',
       'SubaddressIdentifier': 'unit_number',
       'SubaddressType': 'unit_number',
       'PlaceName': 'city',
       'StateName': 'state',
       'ZipCode': 'postal_code'
        })
        return dict(out[0])
    except Exception as e_tag:
        return {'e_tag': e_tag}

In [5]:
def catch_norm_tag(full_address, func=normalize_address_record):
    try:
        out = func(full_address)
        out.update(tag_street(out['address_line_1']))
        return out
    
    except Exception as e_norm:
        out = tag_street(full_address)
        e_dict = {'e_norm': e_norm}
        out.update(e_dict)
        return out

In [6]:
# Deal with D+\sd+, d+\sD+, D+\s&\sD+ unit numbers
# Deal with "Buena Vista Avenue" which get confused:
    
def unit_extract(addy_series):
    
    df = pd.DataFrame(index=addy_series.index)

    # Split the last element of 'address' on the right - "suffix"
    df[['street_name','suffix']] = addy_series.str.strip().str.upper().str.rsplit(n=1, expand=True)

    ## Drop leading spaces & special characters in 'suffix'
    df['street_name'] = df['street_name'].str.strip()
    df['suffix'] = df['suffix'].str.strip()

    ## Find 'unit' numbers, such as: 1, 123, 12345D, 15CD, A, A1, A123:
    pat = '(\d+\D*|^[\D]\d+$|^[ABCD]$|^AB$|^CD$|^EF$|^FE$)'
    df['unit'] = df['suffix'].str.extract(pat)

    ## Streets such as 23rd, 1st, 102nd also match 'pat', remove them:
    df['unit']=df['unit'].str.replace('(\d*[RSTN][DTHD])','')

    ## Whatever is NOT unit should be added back to addess:
    m = df['unit'].isna()
    df.loc[m,'street_name'] = df.loc[m,'street_name'] + ' ' + df.loc[m,'suffix']
    df.loc[~m,'street_name'] = df.loc[~m,'street_name'] + ', UNIT ' + df.loc[~m,'suffix']

    #df.drop('suffix', axis=1, inplace=True)
    
    return df['street_name']

In [7]:
# Use fuzzywuzzy to match zip codes:

def fuzzy_correct(check_value, correct_set, thresh=80, fscorer=fuzz.WRatio, imp='#####'):
    
    # From great tuttorial here: https://michelleful.github.io/
    
    if check_value in correct_set:  # might want to make this a dict for O(1) lookups
        return check_value, 100

    new_value, score = process.extractOne(check_value, correct_set, scorer=fscorer)
    if score < thresh:
        return imp, score
    else:
        return new_value, score
    
def fuzzy_series(check_series, correct_areas, thresh=90, fscorer=fuzz.WRatio):
    
    # Series holders for return:
    check_new_areas = check_series.copy()
    check_count_areas = check_new_areas.copy()
    
    # List of unique values to check:
    check_areas = list(check_new_areas.unique())    

    # Dictionaries for mapping:
    count_dict = {}
    area_dict = {}

    # Build dictionaries for mapping with FuzzyWuzzy:
    for area in tqdm(check_areas):
        correct_area , count = fuzzy_correct(area, correct_areas, thresh, fscorer)
        count_dict[area] = count
        area_dict[area] = correct_area
    
    # Map dictionaries
    check_new_areas = check_new_areas.map(area_dict, na_action='ignore')
    check_count_areas = check_count_areas.map(count_dict, na_action='ignore')

    return check_new_areas, check_count_areas

# 1. Canonical addresses

In [8]:
# Column sthat we need for deduping:
addy_columns = ['Address', 'Address Number', 'Address Number Suffix', 'Street Name', 'Street Type', 
           'Zipcode', 'Longitude', 'Latitude']
column_names = ['address', 'address_number', 'address_number_suffix', 'street_name', 'street_type', 
           'zip', 'long', 'lat']

# Import Assessor's data:
addy_path = r'C:\SFSU\Spring 2020\Project\Data\sf-addresses-enterprise-addressing-system\addresses-enterprise-addressing-system.csv' # use your path
addy_frame = pd.read_csv(addy_path, header=0, index_col=False, 
                        dtype={'Zipcode':str, 'Address Number':str},
                        usecols= addy_columns).drop_duplicates()
addy_frame.columns = column_names
addy_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211956 entries, 0 to 212553
Data columns (total 8 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   address                211956 non-null  object 
 1   address_number         211956 non-null  object 
 2   address_number_suffix  3700 non-null    object 
 3   street_name            211956 non-null  object 
 4   street_type            209573 non-null  object 
 5   zip                    211956 non-null  object 
 6   long                   211956 non-null  float64
 7   lat                    211956 non-null  float64
dtypes: float64(2), object(6)
memory usage: 14.6+ MB


In [9]:
addy_frame['street_name'] = addy_frame.street_name.str.lstrip('0')

In [10]:
m = ~addy_frame.address_number_suffix.isna()
addy_frame[m].head()

Unnamed: 0,address,address_number,address_number_suffix,street_name,street_type,zip,long,lat
1,2433A 23RD AVE,2433,A,23RD,AVE,94116,-122.480264,37.742226
2,312A UNION ST,312,A,UNION,ST,94133,-122.404777,37.801114
37,369A DUNCAN ST,369,A,DUNCAN,ST,94131,-122.42829,37.745765
69,1726A CABRILLO ST,1726,A,CABRILLO,ST,94121,-122.477383,37.775
74,3233A 16TH ST,3233,A,16TH,ST,94110,-122.424803,37.764482


# 2. Assessor Data

In [11]:
# Column sthat we need for deduping:
assess_cols = ['Property Location', 'Zipcode of Parcel']

# Import Assessor's data:
assess_path = r'C:\SFSU\Spring 2020\Project\Data\assessor\Historic_Secured_Property_Tax_Rolls.csv' # use your path
assess_frame = pd.read_csv(assess_path, header=0, index_col=False, usecols=assess_cols, 
                          dtype={'Zipcode of Parcel':str}).drop_duplicates()
assess_frame.columns = ['address', 'zip']
assess_frame.fillna('99999', inplace=True)
assess_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200297 entries, 0 to 1611451
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   address  200297 non-null  object
 1   zip      200297 non-null  object
dtypes: object(2)
memory usage: 4.6+ MB


### 2.1. Assess frame preparation

In [12]:
# Split Address convenient columns:
assess_frame[['no1','address_number','street_name']] = assess_frame['address'].copy().str.split(n=2, expand=True)
assess_frame[['street_name','street_type']] = assess_frame['street_name'].copy().str.rsplit(n=1, expand=True)

# Process columns:
assess_frame['no1'] = assess_frame['no1'].str.lstrip('0')
assess_frame['address_number'] = assess_frame['address_number'].str.lstrip('0')

# Strip zeros on left and right:
assess_frame['street_name'] = assess_frame['street_name'].str.strip().str.lstrip('0')

# Extract 'unit number':
pat = '([0]\D*\d*\D*$|\d+\D*$)'
assess_frame['unit_number'] = assess_frame['street_type'].str.extract(pat)[0].str.lstrip('0')

# Extract street type:
pat = '(^\D+)'
assess_frame['street_type'] = assess_frame['street_type'].str.extract(pat)[0]

# Replace nans with empty strings (None+str=None):
assess_frame.fillna('', inplace=True)

#Drop rows without Number or street name:
m = (assess_frame.address_number == '') | (assess_frame.street_name == '')
assess_frame.drop(assess_frame[m].index, inplace=True)

assess_frame.head()

Unnamed: 0,address,zip,no1,address_number,street_name,street_type,unit_number
0,0000 2530 BROADWAY 0000,94123,,2530,BROADWAY,,
1,1114 1108 CHURCH ST0000,94114,1114.0,1108,CHURCH,ST,
3,0000 2625 24TH ST0000,99999,,2625,24TH,ST,
5,0000 3636 WEBSTER ST0000,94123,,3636,WEBSTER,ST,
6,0000 1453 07TH AV0000,94122,,1453,7TH,AV,


### 2.2. Full address

In [13]:
## Whatever is NOT unit should be added back to addess:
assess_frame['full_address'] = assess_frame.address_number +' '+ assess_frame.street_name + ' '+assess_frame.street_type

# Construct full address for parser:
m = assess_frame['unit_number']==''
assess_frame.loc[~m,'full_address'] = assess_frame.loc[~m,'full_address'] + ' '+assess_frame.loc[~m,'unit_number']
assess_frame['full_address']=assess_frame['full_address'].str.strip()

assess_frame.head(5)

Unnamed: 0,address,zip,no1,address_number,street_name,street_type,unit_number,full_address
0,0000 2530 BROADWAY 0000,94123,,2530,BROADWAY,,,2530 BROADWAY
1,1114 1108 CHURCH ST0000,94114,1114.0,1108,CHURCH,ST,,1108 CHURCH ST
3,0000 2625 24TH ST0000,99999,,2625,24TH,ST,,2625 24TH ST
5,0000 3636 WEBSTER ST0000,94123,,3636,WEBSTER,ST,,3636 WEBSTER ST
6,0000 1453 07TH AV0000,94122,,1453,7TH,AV,,1453 7TH AV


# 3. Pipeline

Although, addresses at the head of dataset look clean, there are in fact misformatted addresses. Such as addresses where street_name includes unit_number which will prevent us from mathcing the addresses as units also come in a vaiety of forms such as UNIT123, # 123, APP123, etc

### 3.1. Extract Unit numbers:

In [14]:
# Extract 'Unit' as parsers confuse them with street numbers:
assess_frame['full_address'] = unit_extract(assess_frame['full_address'])

# Construct full address for parser:
assess_frame['full_address'] = assess_frame['full_address'] + ', SAN-FRANCISCO, CA ' + assess_frame['zip']

### 3.2. Normalize full address:

In [15]:
# Parse, concatenate, drop duplicate columns:
df_out=pd.DataFrame(list(assess_frame.full_address.apply(lambda x: catch_norm_tag(x))), index=assess_frame.index)
df_out.index = assess_frame.index

In [16]:
m = df_out.e_norm.isna() | df_out.e_tag.isna()

# Keep exceptions:
assess_frame[['e_tag', 'e_norm']]= df_out[['e_tag', 'e_norm']]

# Replace with normalized columns:
for c in ['address_number','street_name','street_type','unit_number']:
    assess_frame.loc[m,c] = df_out.loc[m,c]

assess_frame.head(5)

Unnamed: 0,address,zip,no1,address_number,street_name,street_type,unit_number,full_address,e_tag,e_norm
0,0000 2530 BROADWAY 0000,94123,,2530,BROADWAY,,,"2530 BROADWAY, SAN-FRANCISCO, CA 94123",,
1,1114 1108 CHURCH ST0000,94114,1114.0,1108,CHURCH,ST,,"1108 CHURCH ST, SAN-FRANCISCO, CA 94114",,
3,0000 2625 24TH ST0000,99999,,2625,24TH,ST,,"2625 24TH ST, SAN-FRANCISCO, CA 99999",,
5,0000 3636 WEBSTER ST0000,94123,,3636,WEBSTER,ST,,"3636 WEBSTER ST, SAN-FRANCISCO, CA 94123",,
6,0000 1453 07TH AV0000,94122,,1453,7TH,AVE,,"1453 7TH AV, SAN-FRANCISCO, CA 94122",,


### 3.3 Match streets with FuzzyWuzzy

#### FuzzyWuzzy application

In [17]:
# Set of streets:
assess_frame['street_name_type'] = (assess_frame.street_name.fillna('') +\
                                    ' '+assess_frame.street_type.fillna('')).str.strip() 

In [18]:
# Set of streets without suffix:
street_name_set = set(addy_frame['street_name'].dropna().unique())
                      
assess_frame['addy_street_name'], assess_frame['addy_street_name_score'] =\
fuzzy_series(assess_frame['street_name_type'], street_name_set, thresh=10)

100%|██████████████████████████████████████████████████████████████████████████████| 2943/2943 [03:12<00:00, 15.29it/s]


In [19]:
# Set of streets with suffix:
street_name_type_set = set((addy_frame.street_name.fillna('')+' '+addy_frame.street_type.fillna('')).unique())

assess_frame['addy_street_name_type'], assess_frame['addy_street_name_type_score'] =\
fuzzy_series(assess_frame['street_name_type'], street_name_type_set, thresh=10)

100%|██████████████████████████████████████████████████████████████████████████████| 2943/2943 [01:34<00:00, 31.19it/s]


In [20]:
m = (assess_frame['addy_street_name_score']>=91)|(assess_frame['addy_street_name_type_score']>=91)
sum(m)/assess_frame.shape[0]

0.9580949900321576

In [21]:
sum(assess_frame.street_name_type.str.strip() == '')/assess_frame.shape[0]

0.018332815740174913

### 3.4 Match street and numbers:
#### 3.4.1 Replace canonical street_name_type
Depending on FuzzyWuzzy score, replace original street name with canonical street name with or without street type.
Note: we cannot autoatically add street_type at this point if it is missing.

In [22]:
assess_frame['number_street_type'] = assess_frame['street_name_type']

In [23]:
m91 = (assess_frame['addy_street_name_score']>=91)|(assess_frame['addy_street_name_type_score']>=91)
msfx = assess_frame['addy_street_name_type_score'] >= assess_frame['addy_street_name_score']

In [24]:
assess_frame.loc[~m91,'number_street_type'] = '#####'
assess_frame.loc[m91 & ~msfx,'number_street_type'] = assess_frame.loc[~msfx,'addy_street_name']
assess_frame.loc[m91 & msfx,'number_street_type'] = assess_frame.loc[msfx,'addy_street_name_type']

In [25]:
assess_frame['number_street_type'] = assess_frame['address_number']+' '+ assess_frame['number_street_type']#+' '+sales_frame['unit'].fillna('')

#### 3.4.2 Find mathcing addresses
After street_type is adjusted add street_number and look for mathcing addresses

In [26]:
# Addy without suffix:
addy_frame['number_street'] = addy_frame.address_number.astype(str)+\
' '+addy_frame.street_name.str.strip() 
addy_frame['number_street_type'] = addy_frame.address_number.astype(str)+\
' '+addy_frame.street_name.str.strip() +' '+addy_frame.street_type.str.strip() 

# Set of streets with suffix:
addy_number_street_set = set(addy_frame['number_street'].dropna().unique())
addy_number_street_type_set = set(addy_frame['number_street_type'].dropna().unique())

In [27]:
# Add type to streets that match without street_type:
m = assess_frame['number_street_type'].isin(addy_number_street_set)
ser = assess_frame['number_street_type'][m]
 
for idx in tqdm(ser.index):
    jdx = addy_frame[addy_frame['number_street'].isin([ser.loc[idx]])].index[0]
    assess_frame.loc[idx, 'street_type'] = addy_frame.loc[jdx, 'street_type']
    assess_frame.loc[idx, 'number_street_type'] = addy_frame.loc[jdx, 'number_street_type']

100%|██████████████████████████████████████████████████████████████████████████████| 3118/3118 [01:53<00:00, 27.42it/s]


In [28]:
assess_frame['found'] = assess_frame['number_street_type'].isin(addy_number_street_type_set)

In [29]:
assess_frame['found'].sum()/assess_frame.shape[0]

0.9250859037677442

### Merge

In [30]:
addy_frame.columns

Index(['address', 'address_number', 'address_number_suffix', 'street_name',
       'street_type', 'zip', 'long', 'lat', 'number_street',
       'number_street_type'],
      dtype='object')

In [33]:
addy_frame.head()

Unnamed: 0,address,address_number,address_number_suffix,street_name,street_type,zip,long,lat,number_street,number_street_type
0,1411 16TH AVE,1411,,16TH,AVE,94122,-122.473947,37.761593,1411 16TH,1411 16TH AVE
1,2433A 23RD AVE,2433,A,23RD,AVE,94116,-122.480264,37.742226,2433 23RD,2433 23RD AVE
2,312A UNION ST,312,A,UNION,ST,94133,-122.404777,37.801114,312 UNION,312 UNION ST
3,2293 POWELL ST,2293,,POWELL,ST,94133,-122.412084,37.805643,2293 POWELL,2293 POWELL ST
4,309 BAY ST,309,,BAY,ST,94133,-122.412411,37.805648,309 BAY,309 BAY ST


In [35]:
assess_frame.head()

Unnamed: 0,address,zip,no1,address_number,street_name,street_type,unit_number,full_address,e_tag,e_norm,street_name_type,addy_street_name,addy_street_name_score,addy_street_name_type,addy_street_name_type_score,number_street_type,found
0,0000 2530 BROADWAY 0000,94123,,2530,BROADWAY,,,"2530 BROADWAY, SAN-FRANCISCO, CA 94123",,,BROADWAY,BROADWAY,100,BROADWAY,100,2530 BROADWAY,False
1,1114 1108 CHURCH ST0000,94114,1114.0,1108,CHURCH,ST,,"1108 CHURCH ST, SAN-FRANCISCO, CA 94114",,,CHURCH ST,CHURCH,90,CHURCH ST,100,1108 CHURCH ST,True
3,0000 2625 24TH ST0000,99999,,2625,24TH,ST,,"2625 24TH ST, SAN-FRANCISCO, CA 99999",,,24TH ST,4TH,90,24TH ST,100,2625 24TH ST,True
5,0000 3636 WEBSTER ST0000,94123,,3636,WEBSTER,ST,,"3636 WEBSTER ST, SAN-FRANCISCO, CA 94123",,,WEBSTER ST,WEBSTER,95,WEBSTER ST,100,3636 WEBSTER ST,True
6,0000 1453 07TH AV0000,94122,,1453,7TH,AVE,,"1453 7TH AV, SAN-FRANCISCO, CA 94122",,,7TH AVE,7TH,90,7TH AVE,100,1453 7TH AVE,True


In [37]:
assess_out = pd.merge(assess_frame.loc[assess_frame.found, ['address','number_street_type']],
         addy_frame[['number_street_type','long','lat']],
         how='inner', 
         on =['number_street_type'])

#### 3.4.3 Match and geotag

In [43]:
cols = ['Closed Roll Fiscal Year', 
'Property Location', 
'Neighborhood Code Definition', 
'Block and Lot Number', 
'Property Class Code Definition',
'Year Property Built', 
'Number of Bathrooms', 
'Number of Bedrooms',
'Number of Rooms', 
'Number of Stories', 
'Number of Units',
'Characteristics Change Date',
'Zoning Code',
'Construction Type',
'Lot Depth', 
'Lot Frontage',
'Property Area in Square Feet',
'Basement Area',
'Lot Area',
'Prior Sales Date',
'Recordation Date',
'Current Sales Date',
'Closed Roll Assessed Fixtures Value',
'Closed Roll Assessed Improvement Value',
'Closed Roll Assessed Land Value',
'Closed Roll Assessed Personal Prop Value', 
'Zipcode of Parcel',
'Location']

In [44]:
# Import Assessor's data:
assess_path = r'C:\SFSU\Spring 2020\Project\Data\assessor\Historic_Secured_Property_Tax_Rolls.csv' # use your path
assess_frame = pd.read_csv(assess_path, header=0, index_col=False,
                           usecols= cols,
                           dtype={'Zipcode of Parcel':str}).drop_duplicates()

In [46]:
assess_out = pd.merge(assess_out, assess_frame,
                      how='left', 
                      left_on = ['address'],
                      right_on =['Property Location'])

In [49]:
assess_path = r'C:\SFSU\Spring 2020\Project\Data\assessor\Historic_Secured_Property_Tax_Rolls_Processed.csv'
assess_out.to_csv(assess_path)

In [7]:
assess_path = r'C:\SFSU\Spring 2020\Project\Data\assessor\Historic_Secured_Property_Tax_Rolls_Processed.csv'
assess_out = pd.read_csv(assess_path)

In [18]:
assess_out.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1497941 entries, 0 to 1497940
Data columns (total 33 columns):
 #   Column                                    Non-Null Count    Dtype  
---  ------                                    --------------    -----  
 0   Unnamed: 0                                1497941 non-null  int64  
 1   address                                   1497941 non-null  object 
 2   number_street_type                        1497941 non-null  object 
 3   long                                      1497941 non-null  float64
 4   lat                                       1497941 non-null  float64
 5   Closed Roll Fiscal Year                   1497940 non-null  float64
 6   Property Location                         1497941 non-null  object 
 7   Neighborhood Code Definition              1454501 non-null  object 
 8   Block and Lot Number                      1497941 non-null  object 
 9   Property Class Code Definition            1485261 non-null  object 
 10  Year P