In [1]:
import itertools
import os
from timeit import default_timer as timer

import more_itertools as mit
import numpy as np
import pandas as pd
import reverse_geocoder as rg
from fuzzywuzzy import fuzz
from geopy.geocoders import GoogleV3
from iso3166 import countries

%matplotlib inline

In [2]:
# Directory prefix (relative path) that is prepended to the pickle file names read below.
path_to_data = '../data/decahose/parsed/locations/'

In [3]:
print('Import Location Data...')
start = timer()

# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.

# Account Location, Average User Geolocation, Number of Users
account_locations = pd.read_pickle(path_to_data + 'selected-locations.pkl')
print('# Selected Account locations (After String Cleaning):', len(account_locations))

# Account Locations Geocoded Using the Google API
geocoded_locations = pd.read_pickle(path_to_data + 'geocoded-locations.pkl')
print('# Geocoded locations:', len(geocoded_locations))

print("Done in", round(timer() - start), "sec")

Import Location Data...
# Selected Account locations (After String Cleaning): 3362049
# Geocoded locations: 428400
Done in 16 sec


# Geocoded Account Locations

In [4]:
# Share of sampled users whose account location appears among the geocoder inputs.
is_geocoded = account_locations['LOCATION'].isin(list(geocoded_locations['input_string']))
users_at_geocoded = account_locations.loc[is_geocoded, 'N'].sum()
users_total = account_locations['N'].sum()
print('% Users at geocoded locations among selected locations:',
      round(users_at_geocoded / users_total, 2))

% Users at geocoded locations among selected locations: 0.79


In [5]:
# Largest user count among selected locations that are absent from the geocoder inputs.
# `~` is the element-wise boolean negation; unary `-` on a boolean mask raises a
# TypeError in current NumPy/pandas.
not_geocoded = ~account_locations['LOCATION'].isin(list(geocoded_locations['input_string']))
print('Max # sampled users at selected but non-geocoded location:',
      account_locations.loc[not_geocoded, 'N'].max())

Max # sampled users at selected but non-geocoded location: 6


In [6]:
# Keep only the account locations that appear among the geocoder inputs.
has_geocode = account_locations['LOCATION'].isin(list(geocoded_locations['input_string']))
account_locations = account_locations.loc[has_geocode].reset_index(drop=True).copy()
print('# Geocoded locations:', len(account_locations))

# Geocoded locations: 428399


In [7]:
# Preview the first rows of the filtered account locations.
account_locations.head()

Unnamed: 0,LOCATION,POINT,N
0,Indonesia,"(-5.53034, 106.54255)",98240
1,London,"(49.01730709, 1.43906842)",88896
2,Brasil,"(-17.99120486, -45.60141543)",70885
3,Jakarta,"(-5.19392835, 103.92920358)",55842
4,Philippines,"(14.604133, 120.017236)",51989


# Reversed Geocode For Validation

In [8]:
print("Reverse geocode representative point...")
start = timer()

# Reverse geocode every representative point and keep only the country code column
# (upper-cased to 'CC'). `columns=` replaces the positional axis argument, which
# pandas 2.0 no longer accepts.
reversed_geocodes = pd.DataFrame(rg.search(list(account_locations['POINT'].values))).drop(
    columns=['lat','lon','name','admin1','admin2']).rename(columns=lambda x:x.upper())
print('# Geocodes:',reversed_geocodes.shape[0])

# reversed_geocodes['COUNTRY'] = reversed_geocodes['CC'].apply(lambda x:countries.get(x).name)

# Column-wise concat aligns on the index: account_locations was re-indexed 0..n-1 when it
# was filtered, and the fresh reversed_geocodes frame carries the same default index.
account_locations = pd.concat([account_locations,reversed_geocodes],axis=1).drop(columns='POINT')
del reversed_geocodes

print("Done in", round(timer()-start), "sec")

Reverse geocode representative point...
Loading formatted geocoded file...
# Geocodes: 428399
Done in 6 sec


In [9]:
# Show the first ten rows now that the reverse-geocoded country code (CC) is attached.
account_locations.head(10)

Unnamed: 0,LOCATION,N,CC
0,Indonesia,98240,ID
1,London,88896,FR
2,Brasil,70885,BR
3,Jakarta,55842,ID
4,Philippines,51989,PH
5,İstanbul,50755,TR
6,istanbul,39241,TR
7,indonesia,39057,ID
8,Argentina,36977,AR
9,Bandung,33295,ID


# Clean Geocodes

In [10]:
print('Drop duplicated geocodes:')
# One row per geocoder input string, with a fresh 0..n-1 index.
geocoded_locations = (
    geocoded_locations
    .drop_duplicates('input_string')
    .reset_index(drop=True)
)
print('# Geocoded locations:', len(geocoded_locations))

Drop duplicated geocodes:
# Geocoded locations: 428399


In [11]:
# Split geocoder rows by status and measure how many users sit at failed lookups.
status_ok = geocoded_locations['status']=='OK'
print('# Geocoded locations with results:', geocoded_locations[status_ok].shape[0])

missing_geocodes = geocoded_locations.loc[~status_ok, 'input_string']
print('# Locations with missing geocode:', missing_geocodes.shape[0])

users_at_missing = account_locations.loc[
    account_locations['LOCATION'].isin(missing_geocodes), 'N']

print('% Users at locations with missing geocode:',
      round(users_at_missing.sum()/account_locations['N'].sum(),3))

print('Max # sampled users at locations with missing geocode:',
      users_at_missing.max())

# Geocoded locations with results: 402832
# Locations with missing geocode: 25567
% Users at locations with missing geocode: 0.016
Max # sampled users at locations with missing geocode: 1459


In [13]:
# Address component types that are extracted from a geocoder result.
types_of_interest = frozenset([
    'country',
    'locality',
    'sublocality',
    'neighborhood',
    'administrative_area_level_1',
    'administrative_area_level_2',
    'administrative_area_level_3',
    'administrative_area_level_4',
])


def nth_result(response, n):
    """Flatten the n-th result of a geocoder response into a Series.

    Parameters
    ----------
    response : mapping
        Must contain a 'results' iterable; each result has an
        'address_components' list whose items carry 'types', 'long_name'
        and 'short_name'.
    n : int
        Zero-based index of the result to format.

    Returns
    -------
    pd.Series
        '<type>_long' and '<type>_short' entries for every type in
        `types_of_interest` found among the address components. Empty
        (object dtype) when the response has no n-th result.
    """
    # Same semantics as more_itertools.nth: the n-th item, or None when out of range.
    result = next(itertools.islice(response['results'], n, None), None)

    if not result:
        # Explicit dtype: a dtype-less empty Series is deprecated in pandas 1.x.
        return pd.Series(dtype=object)

    formatted_result = {}
    for component in result['address_components']:
        for type_of_interest in types_of_interest:
            if type_of_interest in component['types']:
                # A later component with the same type overwrites an earlier one.
                formatted_result[type_of_interest+'_long'] = component['long_name']
                formatted_result[type_of_interest+'_short'] = component['short_name']

    return pd.Series(formatted_result, dtype=object)

In [14]:
# Discarded 173 locations with 2238 users matched with secondary results
print('Format First Result:')
start = timer()

# Expand the first result of every response into address-component columns.
first_results = geocoded_locations['response'].apply(lambda x: nth_result(x, 0))

# `axis` must be passed by keyword: pandas 2.0 removed the positional form.
# NOTE: re-running this cell appends the component columns a second time.
geocoded_locations = pd.concat([geocoded_locations, first_results], axis=1)

print("Done in", round(timer()-start), "sec")

Format First Result:
Done in 211 sec


In [15]:
# Non-null count per column, largest first.
geocoded_locations.count().sort_values(ascending=False)

input_string                         428399
number_of_results                    428399
response                             428399
status                               428399
accuracy                             402832
formatted_address                    402832
google_place_id                      402832
latitude                             402832
longitude                            402832
postcode                             402832
type                                 402832
country_long                         401462
country_short                        401462
administrative_area_level_1_short    386605
administrative_area_level_1_long     386605
locality_long                        321513
locality_short                       321513
administrative_area_level_2_long     305600
administrative_area_level_2_short    305600
administrative_area_level_3_long      60747
administrative_area_level_3_short     60747
neighborhood_long                     21713
neighborhood_short              

# Merge Results

In [16]:
print('Merge Location Data...')
start = timer()

# Geocoder columns left out of the merged table.
geocode_columns_to_skip = [
    'accuracy',
    'google_place_id',
    'postcode',
    'latitude',
    'longitude',
    'number_of_results',
    'response',
]

# Join account locations to their geocode on the raw location string.
# `columns=` replaces the positional axis argument removed in pandas 2.0.
locations = pd.merge(
    account_locations,
    geocoded_locations.drop(columns=geocode_columns_to_skip, errors='ignore'),
    left_on='LOCATION',
    right_on='input_string',
).drop(columns='input_string')
print('# Tested Locations:',locations.shape[0])

print("Done in", round(timer()-start), "sec")

Merge Location Data...
# Tested Locations: 428399
Done in 3 sec


In [17]:
# Non-null count per column of the merged table, largest first.
locations.count().sort_values(ascending=False)

LOCATION                             428399
N                                    428399
CC                                   428399
status                               428399
formatted_address                    402832
type                                 402832
country_long                         401462
country_short                        401462
administrative_area_level_1_short    386605
administrative_area_level_1_long     386605
locality_short                       321513
locality_long                        321513
administrative_area_level_2_long     305600
administrative_area_level_2_short    305600
administrative_area_level_3_long      60747
administrative_area_level_3_short     60747
neighborhood_long                     21713
neighborhood_short                    21713
administrative_area_level_4_long      16422
administrative_area_level_4_short     16422
sublocality_long                      16291
sublocality_short                     16291
dtype: int64

# Evaluate Results

In [18]:
print('Compute String Match...')
start = timer()

def match(x):
    """Fuzzy token-set similarity between a row's LOCATION and formatted_address.

    Both strings are lower-cased before comparison. Returns None when either
    value is not a string (formatted_address has fewer non-null rows than
    LOCATION in the merged table).
    """
    try:
        return fuzz.token_set_ratio(x['LOCATION'].lower(),x['formatted_address'].lower())
    except AttributeError:
        # Non-string values (e.g. NaN) have no .lower(); other errors now propagate
        # instead of being silently swallowed by a bare except.
        return None

locations['MATCH'] = locations.apply(match, axis=1)

print("Done in", round(timer()-start), "sec")

Compute String Match...
Done in 25 sec


In [19]:
print('Check if a string contains special characters...')
start = timer()

def is_ascii(x):
    """Return 1 when x is a string made only of ASCII characters, else 0.

    Values that cannot be encoded at all (non-strings such as NaN) count as 0.
    """
    try:
        x.encode('ascii')
    except (UnicodeEncodeError, AttributeError):
        # UnicodeEncodeError: a non-ASCII character; AttributeError: x has no .encode.
        return 0
    return 1

locations['ASCII'] = locations['LOCATION'].apply(is_ascii)
print("Done in", round(timer()-start), "sec")

Check if a string contains special characters...
Done in 0 sec


In [100]:
print('Check if a location contains types of interest...')
start = timer()

# Built once instead of on every call: the extracted component types plus two extra tags.
INTEREST_TYPES = frozenset(types_of_interest) | {'airport', 'political'}

def of_interest(x):
    """Return 1 if the comma-separated type string x names any type of interest, else 0.

    Returns None when x is not a string (e.g. a missing `type` value).
    """
    try:
        result_types = x.split(',')
    except AttributeError:
        # Non-string values have no .split(); other errors now propagate.
        return None
    return int(not INTEREST_TYPES.isdisjoint(result_types))

locations['INTEREST'] = locations['type'].apply(of_interest)

print("Done in", round(timer()-start), "sec")

Check if a location contains types of interest...
Done in 1 sec


In [659]:
# Non-null count per column, including the derived columns added above.
locations.count()

LOCATION                             428399
N                                    428399
CC                                   428399
formatted_address                    402832
status                               428399
type                                 402832
country_long                         401462
country_short                        401462
locality_long                        321513
locality_short                       321513
administrative_area_level_2_long     305600
administrative_area_level_2_short    305600
administrative_area_level_1_long     386605
administrative_area_level_1_short    386605
administrative_area_level_3_long      60747
administrative_area_level_3_short     60747
sublocality_long                      16291
sublocality_short                     16291
neighborhood_long                     21713
neighborhood_short                    21713
administrative_area_level_4_long      16422
administrative_area_level_4_short     16422
MATCH                           

In [660]:
# Accumulator for LOCATION strings that pass the verification filters applied below.
verified_locations = []

In [662]:
# Checked bottom 100 N and Match
# NOTE: this appends to verified_locations, so re-running the cell adds the same rows again.
passes_checks = (
    (locations['N']>=25)                              # Remove 18%
    & (locations['status']=='OK')                     # Removes 1%
    & (locations['INTEREST']==1)                      # Remove 1%
    # & (locations['ASCII']==1)                       # To check string matching
    & (locations['CC']==locations['country_short'])
)
verified_locations.extend(locations.loc[passes_checks, 'LOCATION'])

In [1062]:
# Rows with N >= 25, an OK geocode, no type of interest, and matching country codes;
# display the 50 with the most users.
same_country_no_interest = (
    (locations['N']>=25)
    & (locations['status']=='OK')
    & (locations['INTEREST']==0)
    # & (locations['ASCII']==1)  # To check string matching
    & (locations['CC']==locations['country_short'])
)
locations[same_country_no_interest].sort_values(by='N')[
    ['LOCATION','CC','N','country_short','country_long','formatted_address','type','MATCH']
].tail(50)

Unnamed: 0,LOCATION,CC,N,country_short,country_long,formatted_address,type,MATCH
6430,DM(V),US,190,US,United States,"404 N 14th St, Independence, KS 67301, USA","establishment,local_government_office,point_of...",9.0
6414,"Puerto Rico, USA",PR,190,PR,Puerto Rico,"9227 Marina, Ponce, 00730, Puerto Rico","establishment,point_of_interest,store",85.0
6336,Paris Van Java,ID,193,ID,Indonesia,"Jl. Sukajadi No.131-139, Cipedes, Kec. Sukajad...","establishment,point_of_interest,shopping_mall",20.0
6331,Moscow city,RU,193,RU,Russia,"Presnenskaya nab., 12, Moskva, Russia, 123317","establishment,point_of_interest",16.0
6315,The South,US,193,US,United States,"627 E Silas Brown St, Jackson, MS 39201, USA","establishment,point_of_interest",16.0
6166,Michigan State University,US,198,US,United States,"220 Trowbridge Rd, East Lansing, MI 48824, USA","establishment,point_of_interest,school,university",26.0
6023,Jakarta - Bandung,ID,203,ID,Indonesia,"Jl. Sandang No.2, RT.11/RW.17, Klender, sandan...","establishment,point_of_interest",64.0
5960,Sweet Home Alabama,US,206,US,United States,"4400 Watercrest Rd Suite 101, Killeen, TX 7654...","establishment,food,point_of_interest,restaurant",27.0
5852,My House,US,209,US,United States,"3036 S Fremont Ave, Springfield, MO 65804, USA","clothing_store,establishment,point_of_interest...",16.0
5792,NOVA,US,211,US,United States,"21306 Signal Hill Plaza, Sterling, VA 20164, USA","establishment,point_of_interest,university",12.0


When N reaches ~50 users and their average geolocation differs from their account location, it sometimes indicates that users manually entered a location that can be identified by geocoding but does not correspond to their actual location. Their account location might be the city they are from or one that they identify with.

In [1053]:
# Manual overrides: raw account-location string -> country name.
# Keys are kept verbatim, so case variants and trailing spaces are separate entries.
# NOTE(review): presumably looked up by exact LOCATION string — confirm against the caller.
fix2country={
'AUS':'Australia',
'Almagro':'Argentina',
'Bayonne':'France',
'Carolina':'Spain',
'carolina':'Spain',
'CORDOBA':'Argentina',
'Cordoba':'Argentina', 
'Cordoba ':'Argentina',
'Córdoba':'Argentina',
'Córdoba ':'Argentina',
'córdoba':'Argentina',
'córdoba ':'Argentina',
'cordoba':'Argentina',
'cordoba ':'Argentina',
'Dunedin':'New Zealand',
'Halifax ':'United Kingdom',
'halifax':'United Kingdom',
'KOREA':'South Korea',
'Korea':'South Korea',
'Korea ':'South Korea',
'korea':'South Korea',
'korea ':'South Korea',
'Laguna':'Philippines',
'laguna':'Philippines',
'Lucena':'Spain',
'Merida':'Venezuela',
'Mérida ':'Venezuela',
'Naples':'Italy',
'Toledo ':'Portugal',
'tripoli':'Lebanon',
'Miramar':'Argentina',
'PHL':'Philippines',
'SC':'Brazil',
'SC ':'Brazil',
'sc':'Brazil',
'San isidro':'Argentina',
'St Petersburg':'Russia',
'Woodlands':'Singapore',
'cali':'Mexico',
'cali ':'Mexico',
}

In [1064]:
# Rows with N >= 50, an OK geocode, no type of interest, and a geocoded country that
# differs from the reverse-geocoded one; display the 50 with the most users.
country_mismatch_no_interest = (
    (locations['N']>=50)
    & (locations['status']=='OK')
    & (locations['INTEREST']==0)
    # & (locations['ASCII']==1)  # To check string matching
    & (locations['CC']!=locations['country_short'])
)
locations[country_mismatch_no_interest].sort_values(by='N')[
    ['LOCATION','CC','N','country_short','country_long','formatted_address','type','MATCH']
].tail(50)

Unnamed: 0,LOCATION,CC,N,country_short,country_long,formatted_address,type,MATCH
1744,All around the world,NG,730,US,United States,"541 W McDermott Dr, Allen, TX 75013, USA","establishment,point_of_interest,travel_agency",35.0
1710,Sampa,BR,748,US,United States,"2321 W Evans Ave, Denver, CO 80223, USA","bar,establishment,point_of_interest",10.0
1669,INA,ID,762,US,United States,"110 E 13th St, New York, NY 10003, USA","clothing_store,establishment,point_of_interest...",11.0
1655,D.F,MX,767,US,United States,"2978 State St, South Salt Lake, UT 84115, USA","establishment,point_of_interest,school",4.0
1618,LDN,GB,786,US,United States,"8600 Commodity Cir No 164, Orlando, FL 32819, USA","establishment,finance,point_of_interest",8.0
1609,MG,BR,788,US,United States,"4747 S 102nd E Ave, Tulsa, OK 74146, USA","car_dealer,establishment,point_of_interest,store",0.0
1583,Rj,BR,800,US,United States,"5835 Lamar Ave, Mission, KS 66202, USA","establishment,food,point_of_interest,restaurant",5.0
1529,Somerset,GB,828,US,United States,"2800 W Big Beaver Rd, Troy, MI 48084, USA","establishment,point_of_interest,shopping_mall",13.0
1500,df,MX,842,US,United States,"4119 Culebra Rd, San Antonio, TX 78228, USA","establishment,food,point_of_interest,restaurant",5.0
1469,napoli,IT,859,US,United States,"1301 N Broadway, Pittsburg, KS 66762, USA","establishment,food,point_of_interest,restaurant",14.0


In [1057]:
# Rows with N >= 50, an OK geocode, a type of interest, and a geocoded country that
# differs from the reverse-geocoded one, sorted by location string.
# The original expression ended with a dangling `&` before `]` (all remaining operands
# were commented out), which is a SyntaxError. Each optional filter below now carries
# its own leading `&` so it can be toggled without breaking the expression.
locations[
    (locations['N']>=50)                              # Remove 18%
    & (locations['status']=='OK')                     # Removes 1%
    & (locations['INTEREST']==1)                      # Remove 1%
    # & (locations['ASCII']==1)                       # To check string matching
    & (locations['CC']!=locations['country_short'])
    # & (~locations['LOCATION'].apply(lambda x:x.strip().lower() in homonyms))
    # & (locations['LOCATION'].apply(lambda x:x in fix2country))
    # & (~locations['LOCATION'].apply(lambda x:x.strip().lower() in drop))
].sort_values(by='LOCATION')[
    ['LOCATION','CC','N','country_short','country_long','formatted_address','type','MATCH']]

Unnamed: 0,LOCATION,CC,N,country_short,country_long,formatted_address,type,MATCH
18165,AUS,ID,63,US,United States,"Austin-Bergstrom International Airport (AUS), ...","airport,establishment,point_of_interest",100.0
7414,Almagro,BR,162,US,United States,"Almagro, Danville, VA 24541, USA","neighborhood,political",100.0
12712,Bayonne,ES,91,US,United States,"Bayonne, NJ, USA","locality,political",100.0
6661,CORDOBA,BR,182,ES,Spain,"Córdoba, Spain","locality,political",63.0
7301,Carolina,PR,165,US,United States,"North Carolina, USA","administrative_area_level_1,political",100.0
486,Cordoba,BR,2562,ES,Spain,"Córdoba, Spain","locality,political",63.0
3990,Cordoba,PY,316,ES,Spain,"Córdoba, Spain","locality,political",63.0
272,Córdoba,BR,4229,ES,Spain,"Córdoba, Spain","locality,political",100.0
2105,Córdoba,BR,598,ES,Spain,"Córdoba, Spain","locality,political",100.0
13253,Dunedin,AU,88,US,United States,"Dunedin, FL, USA","locality,political",100.0


In [967]:
# Read the Google Geocoding API key from the environment rather than committing it.
# NOTE(review): a key was previously hard-coded in this cell; treat it as compromised
# and revoke it. Set GOOGLE_API_KEY before running (raises KeyError when unset).
google_key = os.environ['GOOGLE_API_KEY']
geocoder_google = GoogleV3(api_key=google_key)

In [None]:
# NOTE(review): geocode() is called here without any query and the cell has no execution
# count (In [None]) — looks like an unfinished scratch cell; supply a query or remove it.
geocoder_google.geocode()

In [959]:
# Location strings to discard, stored normalised: surrounding whitespace and dots
# stripped, then lower-cased (so 'i' and 'I' collapse into one entry).
drop = {
    raw.strip().strip('.').lower()
    for raw in (
        'ALL AROUND THE WORLD',
        'Disneyland',
        'i',
        'Latina',
        'I',
        'Jüpiter',
        'hell',
        'o',
        'Pluto',
        'South',
        'North West',
        'Forks',
        'Stratford',
        'Transylvania',
        'Júpiter',
        'na sua',
        'Newtown',
    )
}

In [960]:
# Ambiguous place names (the same string names places in more than one country),
# stored normalised: surrounding whitespace and dots stripped, then lower-cased.
homonyms = {
    raw.strip().strip('.').lower()
    for raw in (
        'abingdon',
        'acre',
        'airdrie',
        'alberton',
        'alex',
        'alexandria',
        'almonte',
        'alton',
        'amesbury',
        'andover',
        'ascot',
        'ashford',
        'atherton',
        'aus',
        'ayacucho',
        'bangor',
        'barneveld',
        'barrow',
        'batavia',
        'bedford',
        'belem',
        'bella vista',
        'berga',
        'berkshire',
        'berwick',
        'bexley',
        'birmingham',
        'boa vista',
        'bolivar',
        'bolton',
        'boulogne',
        'bourne',
        'braintree',
        'brentford',
        'brentwood',
        'buckley',
        'canton',
        'carlisle',
        'cartagena',
        'chacabuco',
        'chelmsford',
        'cheltenham',
        'chester',
        'chesterfield',
        'city bell',
        'claypole',
        'coalville',
        'colon',
        'darlington',
        'deal',
        'denbigh',
        'derby',
        'dolores',
        'dover',
        'dudley',
        'dundalk',
        'dundee',
        'durango',
        'durham',
        'elgin',
        'ely',
        'emsworth',
        'everton',
        'evesham',
        'exeter',
        'falmouth',
        'formosa',
        'freeport',
        'gloucester',
        'grantham',
        'guernica',
        'haddington',
        'hampshire',
        'harlingen',
        'harwich',
        'hastings',
        'haverhill',
        'hawarden',
        'hayes',
        'hertford',
        'hillsborough',
        'hinckley',
        'horsham',
        'huntingdon',
        'id',
        'ind',
        'ireland',
        'isleworth',
        'jersey',
        'junin',
        'junín',
        'k-town',
        'kenilworth',
        'kent',
        'kettering',
        'kilmarnock',
        'la calera',
        'la florida',
        'ladysmith',
        'lancaster',
        'las flores',
        'las piedras',
        'leominster',
        'letterkenny',
        'lewes',
        'lincoln',
        'liverpool',
        'livingston',
        'los angles',
        'ludlow',
        'madriz',
        'maipu',
        'maipú',
        'malden',
        'malvern',
        'mansfield',
        'margate',
        'markham',
        'martinez',
        'matlock',
        'medway',
        'mexique',
        'middlesex',
        'middleton',
        'midlands',
        'montrose',
        'moreno',
        'mt',
        'my',
        'natal',
        'nederland',
        'new brunswick',
        'newbury',
        'newmarket',
        'newport',
        'norfolk',
        'northampton',
        'norwich',
        'oakville',
        'ocaña',
        'oil city',
        'oviedo',
        'paisley',
        'panama city',
        'panamá city',
        'parana',
        'paraná',
        'perthshire',
        'pickering',
        'plymouth',
        'portsmouth',
        'potsdam',
        'preston',
        'prince george',
        'providencia',
        'queen city of the south',
        'ranelagh',
        'renfrew',
        'richmond hill',
        'ringwood',
        'rio claro',
        'ripon',
        'royston',
        'rugby',
        'rutland',
        'salem',
        'salinas',
        'salisbury',
        'san bernardo',
        'san carlos',
        'san cristobal',
        'san cristóbal',
        'san felipe',
        'san fernando',
        'san juan',
        'san lorenzo',
        'san luis',
        'san martin',
        'san martín',
        'san miguel',
        'san nicolas',
        'san nicolás',
        'san pablo city',
        'san pedro',
        'san rafael',
        'sandy',
        'santa ana',
        'santa cruz',
        'santa fe',
        'santa maria',
        'santa rosa',
        'santarém',
        'sarandi',
        'sby',
        'seaford',
        'shrewsbury',
        'sjc',
        'smyrna',
        'sonora',
        'southport',
        'sps',
        'st-petersburg',
        'st. petersburg',
        'st.-petersburg',
        'st.petersburg',
        'stafford',
        'stamford',
        'suffolk',
        'sussex',
        'sutton',
        'são vicente',
        'tambun',
        'tangamandapio',
        'tanjung',
        'taunton',
        'tipton',
        'tiverton',
        'truro',
        'uxbridge',
        'valparaiso',
        'van',
        'villa elisa',
        'villarrica',
        'vitoria',
        'wakefield',
        'wallingford',
        'wallington',
        'wantage',
        'ware',
        'warminster',
        'warwick',
        'waterford',
        'welling',
        'westbury',
        'westland',
        'weymouth',
        'whitehaven',
        'wickford',
        'worcester',
        'york',
    )
}

In [961]:
# Column names kept for a later selection: the location string, its user count and the
# geocoded address components. Commented-out entries are excluded from the list.
# NOTE(review): the consumer of `cols` is not visible here — confirm where it is used.
cols = [
'LOCATION',
'N',
# 'CC',
# 'formatted_address',
# 'status',
# 'type',
'country_long',
'country_short',
'locality_long',
'locality_short',
'sublocality_long',
'sublocality_short',
'neighborhood_long',
'neighborhood_short',
'administrative_area_level_1_long',
'administrative_area_level_1_short',
'administrative_area_level_2_long',
'administrative_area_level_2_short',
'administrative_area_level_3_long',
'administrative_area_level_3_short',
'administrative_area_level_4_long',
'administrative_area_level_4_short',
# 'MATCH',
# 'ASCII',
# 'INTEREST',
]