In [3]:
import pandas as pd
from timeit import default_timer as timer
import numpy as np
import more_itertools as mit
from fuzzywuzzy import fuzz
import itertools
%matplotlib inline

In [12]:
# Folder (relative to the working directory) that holds every pickle / text file read or written below.
path_to_data = '../../data/locations/profiles/geocoding/'

In [3]:
# Load the account locations selected for geocoding and report how many there are.
print('Import Acount Location Data...')
start = timer()

# Account Location, Average User Geolocation, Number of Users
# NOTE(review): unpickling can execute arbitrary code - only load pickles produced by this project.
account_locations = pd.read_pickle(path_to_data+'account-locations-to-geocode.pkl')
print('# Selected Account locations (After String Cleaning):', account_locations.shape[0])

print("Done in", round(timer()-start), "sec")

Import Acount Location Data...
# Selected Account locations (After String Cleaning): 41115
Done in 0 sec


In [4]:
# Preview the first rows of the account-location table.
account_locations.head()

Unnamed: 0,LOCATION,POINT,N,CC,COUNTRY
0,Indonesia,"(-5.53034, 106.54255)",98240,ID,Indonesia
1,London,"(49.01730709, 1.43906842)",88896,FR,France
2,Brasil,"(-17.99120486, -45.60141543)",70885,BR,Brazil
3,Jakarta,"(-5.19392835, 103.92920358)",55842,ID,Indonesia
4,Philippines,"(14.604133, 120.017236)",51989,PH,Philippines


# Import, Merge, and Format Geocodes

In [5]:
# Address-component types (as reported by the geocoder) that are kept when a
# result is flattened into columns.
types_of_interest = frozenset([
    'country',
    'locality',
    'sublocality',
    'neighborhood',
    'administrative_area_level_1',
    'administrative_area_level_2',
    'administrative_area_level_3',
    'administrative_area_level_4',
])

def nth_result(response, n):
    """Flatten the address components of the n-th geocoding result.

    Parameters
    ----------
    response : dict
        Geocoder response with a 'results' iterable; every result holds an
        'address_components' list whose items expose 'types', 'long_name'
        and 'short_name'.
    n : int
        Zero-based position of the result to format.

    Returns
    -------
    pandas.Series
        A '<type>_long' and a '<type>_short' entry for each component type
        found in ``types_of_interest``. Empty when there is no n-th result
        (or when that result is falsy).
    """
    # Same semantics as more_itertools.nth, using only the standard library.
    result = next(itertools.islice(response['results'], n, None), None)

    if not result:
        # Explicit dtype: building an empty Series without one is deprecated.
        return pd.Series(dtype=object)

    formatted_result = {}

    for component in result['address_components']:
        # Walk the component's own type list (not the frozenset) so that the
        # key order does not depend on string-hash randomisation.
        for component_type in component['types']:
            if component_type in types_of_interest:
                formatted_result[component_type + '_long'] = component['long_name']
                formatted_result[component_type + '_short'] = component['short_name']

    return pd.Series(formatted_result, dtype=object)

def match(x):
    """Fuzzy similarity between an account location and its geocoded address.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must expose 'LOCATION' and 'formatted_address'.

    Returns
    -------
    The ``fuzz.token_set_ratio`` score of the two lower-cased strings, or
    None when either field is missing or is not a string (e.g. NaN for
    locations without a geocode).
    """
    try:
        location = x['LOCATION'].lower()
        address = x['formatted_address'].lower()
    except (AttributeError, KeyError, TypeError):
        # Only "no usable text" is treated as "no score"; anything else
        # (interrupts, a missing import, ...) is no longer silently swallowed.
        return None
    return fuzz.token_set_ratio(location, address)

def is_ascii(x):
    """Return 1 when ``x`` is a string made only of ASCII characters, else 0.

    Values without an ``encode`` method (NaN, None, bytes, ...) yield 0.
    """
    try:
        x.encode('ascii')
    except (AttributeError, UnicodeError):
        # AttributeError: not a string; UnicodeError: a non-ASCII character.
        return 0
    return 1

def of_interest(x):
    """Tell whether a comma-separated type string names a type of interest.

    Returns 1 when at least one of the comma-separated entries of ``x`` is in
    ``types_of_interest``, 0 when none is, and None when ``x`` is not a
    string (e.g. NaN for locations without a geocode).
    """
    try:
        found_types = x.split(',')
    except AttributeError:
        # Missing 'type' values cannot be classified.
        return None
    return int(not types_of_interest.isdisjoint(found_types))

In [6]:
# Load the geocoder output for the account locations and report its size.
print('Import Geocoded Data...')
start = timer()

# Account Locations Geocoded Using the Google API
# NOTE(review): unpickling can execute arbitrary code - only load pickles produced by this project.
geocoded_locations = pd.read_pickle(path_to_data+'account-locations-geocoded.pkl')
print('# Geocoded locations:', geocoded_locations.shape[0])

print("Done in", round(timer()-start), "sec")

Import Geocoded Data...
# Geocoded locations: 428399
Done in 15 sec


In [7]:
# Preview the first rows of the raw geocoder output.
geocoded_locations.head()

Unnamed: 0,accuracy,formatted_address,google_place_id,input_string,latitude,longitude,number_of_results,postcode,response,status,type
0,APPROXIMATE,Indonesia,ChIJtwRkSdcHTCwRhfStG-dNe-M,Indonesia,-0.789275,113.921327,1,,{'results': [{'address_components': [{'long_na...,OK,"country,political"
1,APPROXIMATE,"London, UK",ChIJdd4hrwug2EcRmSrV3Vo6llI,London,51.507351,-0.127758,1,,{'results': [{'address_components': [{'long_na...,OK,"locality,political"
2,APPROXIMATE,Brazil,ChIJzyjM68dZnAARYz4p8gYVWik,Brasil,-14.235004,-51.92528,1,,{'results': [{'address_components': [{'long_na...,OK,"country,political"
3,APPROXIMATE,"Jakarta, Indonesia",ChIJnUvjRenzaS4RoobX2g-_cVM,Jakarta,-6.208763,106.845599,1,,{'results': [{'address_components': [{'long_na...,OK,"colloquial_area,locality,political"
4,APPROXIMATE,Philippines,ChIJY96HXyFTQDIRV9opeu-QR3g,Philippines,12.879721,121.774017,1,,{'results': [{'address_components': [{'long_na...,OK,"country,political"


In [8]:
print('Drop extra geocodes:')
# Keep only the geocodes whose input string is one of the selected account locations.
geocoded_locations = geocoded_locations.loc[
    geocoded_locations['input_string'].isin(account_locations['LOCATION'])
].copy()
print('# Geocoded locations:', len(geocoded_locations.index))

Drop extra geocodes:
# Geocoded locations: 40994


In [9]:
# Discarded 173 locations with 2238 users matched with secondary results
print('Format First Geocoded Result:')
start = timer()

# Expand the first result of every response into '<type>_long' / '<type>_short'
# columns and append them to the geocodes.
# `axis` is passed by keyword: pandas >= 2.0 only accepts `objs` positionally in concat.
geocoded_locations = pd.concat(
    [geocoded_locations,
     geocoded_locations['response'].apply(lambda x: nth_result(x, 0))],
    axis=1)

print("Done in", round(timer()-start), "sec")

Format First Geocoded Result:
Done in 19 sec


In [10]:
print('Merge Location Data...')
start = timer()

# Could measure distance with geocoordinates to identify homonyms and missclassified reversed geocodes
# Left merge: every account location is kept, with empty geocode columns when it was never geocoded.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects in DataFrame.drop.
locations = pd.merge(
    account_locations,
    geocoded_locations.drop(
        columns=[
            'accuracy',
            'google_place_id',
            'postcode',
            'number_of_results',
            'response',
        ],
        errors='ignore'),
    left_on='LOCATION',
    right_on='input_string',
    how='left',
).drop(columns='input_string').sort_values(by='N', ascending=False)

# Free the (large) raw geocodes; note that this cell cannot be re-run without reloading them.
del geocoded_locations

locations['MATCH']    = locations.apply(match, axis=1)
locations['ASCII']    = locations['LOCATION'].apply(is_ascii)
locations['INTEREST'] = locations['type'].apply(of_interest)

print('# Locations:',locations.shape[0])
print('#Users At Selected Locations:',locations['N'].sum())

print("Done in", round(timer()-start), "sec")

Merge Location Data...
# Locations: 41115
#Users At Selected Locations: 9050984
Done in 3 sec


# Reprocess With Bias

In [11]:
# Update Locations To Be Reprocessed With Bias Iteratively, Discarding The Ones Already Included (Gives Priority To Manually Included Bias)
def update_reprocess(locations_and_bias,reprocess_with_bias):
    """Add new location -> bias pairs to ``reprocess_with_bias`` in place.

    Locations that are already present keep their current bias, so whatever
    was registered first wins. Returns None.
    """
    for location, bias in locations_and_bias.items():
        if location not in reprocess_with_bias:
            reprocess_with_bias[location] = bias

In [12]:
# Location -> bias mapping, filled step by step by the update_reprocess calls below.
reprocess_with_bias = {}

In [13]:
# Account Locations With Manually Included Bias (Some Belong To Homonyms)
# Each line of the file is '<location>\t<bias>'; keys are looked up with the
# stripped, lower-cased account location further down.
location2region = {}
with open(path_to_data+'account-locations-to-region.txt', 'r') as f:
    for line in f:
        line = line.strip('\n')
        if not line:
            # A blank (e.g. trailing) line is skipped instead of breaking the parsing.
            continue
        # Split on the first tab only, so a tab inside the bias text is preserved.
        key, value = line.split('\t', 1)
        location2region[key] = value
print('# Locations With Manually Included Bias:',len(location2region))
list(location2region.items())[:10]

# Locations With Manually Included Bias: 39


[('(d)mv', 'Delaware'),
 ('+65', 'Singapore'),
 ('ad', 'UAE'),
 ('almagro', 'Argentina'),
 ('amman_jordan', 'Jordan'),
 ('aus', 'Australia'),
 ('bayonne', 'France'),
 ('brighton', 'GB'),
 ('cali', 'Mexico'),
 ('canterbury', 'GB')]

In [14]:
# Find Locations And Include Bias:
# every account location whose stripped, lower-cased text is a key of location2region
# is paired with the bias registered for that key.
locations_with_fixed_region = {
    location: location2region[location.lower().strip()]
    for location in locations['LOCATION']
    if location.strip().lower() in location2region
}

# Update list
update_reprocess(locations_with_fixed_region, reprocess_with_bias)

print('Locations To Reprocess With Manually Added Regional Bias:', len(locations_with_fixed_region))
print('Total Reprocessed With Bias:', len(reprocess_with_bias))
list(locations_with_fixed_region.items())[:10]

Locations To Reprocess With Manually Added Regional Bias: 117
Total Reprocessed With Bias: 117


[('Ireland', 'Ireland'),
 ('Valencia', 'Spain'),
 ('Cali', 'Mexico'),
 ('Córdoba', 'Argentina'),
 ('Brighton', 'GB'),
 ('Korea', 'South Korea'),
 ('Cordoba', 'Argentina'),
 ('korea', 'South Korea'),
 ('valencia', 'Spain'),
 ('DMV', 'Delaware, Maryland, Virginia')]

In [15]:
# Locations without any geocode (no status after the left merge), paired with their COUNTRY value as bias.
locations_forgotten = dict(
    locations.loc[locations['status'].isnull(), ['LOCATION', 'COUNTRY']]
    .itertuples(index=False, name=None)
)

# Update list
update_reprocess(locations_forgotten, reprocess_with_bias)

print('Locations To Reprocess With Geolocated Bias:', len(locations_forgotten))
print('Total Reprocessed With Bias:', len(reprocess_with_bias))
list(locations_forgotten.items())[:10]

Locations To Reprocess With Geolocated Bias: 121
Total Reprocessed With Bias: 233


[('Trinidad & Tobago', 'Trinidad and Tobago'),
 ('Texas!', 'United States of America'),
 ('Jakarta | Indonesia', 'Indonesia'),
 ('Jakarta_Indonesia', 'Indonesia'),
 ('Singapore!', 'Malaysia'),
 ('Republic of the Philippines :)', 'Philippines'),
 ('Philippines :)', 'Philippines'),
 ('Colombia!', 'Colombia'),
 ('Venezuela!', 'Venezuela, Bolivarian Republic of'),
 ('California!', 'United States of America')]

In [16]:
# Locations for which the geocoder answered ZERO_RESULTS, paired with their COUNTRY value as bias.
locations_missed = dict(
    locations.loc[locations['status'] == 'ZERO_RESULTS', ['LOCATION', 'COUNTRY']]
    .itertuples(index=False, name=None)
)

# Update list
update_reprocess(locations_missed, reprocess_with_bias)

print('Missed Locations To Reprocess With Geolocated Bias:', len(locations_missed))
print('Total Reprocessed With Bias:', len(reprocess_with_bias))
list(locations_missed.items())[:10]

Missed Locations To Reprocess With Geolocated Bias: 48
Total Reprocessed With Bias: 278


[('関西', 'Japan'),
 ('関東', 'Japan'),
 ('North East England',
  'United Kingdom of Great Britain and Northern Ireland'),
 ('North West, England',
  'United Kingdom of Great Britain and Northern Ireland'),
 ('SG', 'Malaysia'),
 ('North East, England',
  'United Kingdom of Great Britain and Northern Ireland'),
 ('Venezuela♥', 'Venezuela, Bolivarian Republic of'),
 ('North West England',
  'United Kingdom of Great Britain and Northern Ireland'),
 ('Argentina♥', 'Argentina'),
 ('sg', 'Malaysia')]

In [17]:
# Account Locations With Homonyms (Geocoded Using Geolocated Country Bias)
# One homonym per line of the file; only the trailing newline is removed.
with open(path_to_data+'account-locations-homonyms.txt', 'r') as f:
    homonyms = [line.strip('\n') for line in f]
print('# Homonyms:',len(homonyms))
homonyms[:10]

# Homonyms: 252


['abingdon',
 'acre',
 'airdrie',
 'alberton',
 'alex',
 'alexandria',
 'almagro',
 'almonte',
 'alton',
 'amesbury']

In [18]:
# Locations whose stripped, lower-cased text is a known homonym, paired with their COUNTRY value as bias.
# The vectorised string methods plus `isin` replace a per-row Python lambda that
# scanned the whole homonym list for every location.
locations_with_homonyms = dict(locations.loc[
    locations['LOCATION'].str.strip().str.lower().isin(homonyms),
    ['LOCATION','COUNTRY']].values.tolist())

# Update list
update_reprocess(locations_with_homonyms, reprocess_with_bias)

print('Locations With Homonyms To Reprocess With Geolocated Bias:', len(locations_with_homonyms))
print('Total Reprocessed With Bias:', len(reprocess_with_bias))
list(locations_with_homonyms.items())[:10]

Locations With Homonyms To Reprocess With Geolocated Bias: 570
Total Reprocessed With Bias: 795


[('Liverpool', 'United Kingdom of Great Britain and Northern Ireland'),
 ('Birmingham', 'United Kingdom of Great Britain and Northern Ireland'),
 ('Nederland', 'Belgium'),
 ('Cali', 'Mexico'),
 ('Córdoba', 'Brazil'),
 ('liverpool', 'United Kingdom of Great Britain and Northern Ireland'),
 ('Kent', 'United Kingdom of Great Britain and Northern Ireland'),
 ('Norwich', 'United Kingdom of Great Britain and Northern Ireland'),
 ('Cordoba', 'Brazil'),
 ('Jersey', 'United States of America')]

In [19]:
# Locations whose geocode has none of the types of interest (INTEREST == 0), paired with their COUNTRY value as bias.
locations_uninteresting = dict(
    locations.loc[locations['INTEREST'] == 0, ['LOCATION', 'COUNTRY']]
    .itertuples(index=False, name=None)
)

# Update list
update_reprocess(locations_uninteresting, reprocess_with_bias)

print('Uninteresting Location To Reprocess With Geolocated Bias:', len(locations_uninteresting))
print('Total Reprocessed With Bias:', len(reprocess_with_bias))
list(locations_uninteresting.items())[:10]

Uninteresting Location To Reprocess With Geolocated Bias: 3312
Total Reprocessed With Bias: 4021


[('Barcelona', 'Spain'),
 ('Ireland', 'Ireland'),
 ('Valencia', 'Portugal'),
 ('Roma', 'Italy'),
 ('Milano', 'Italy'),
 ('Italia', 'Italy'),
 ('MNL', 'Philippines'),
 ('Bali', 'Indonesia'),
 ('Adana', 'Turkey'),
 ('New York, New York', 'United States of America')]

In [20]:
print('Save Locations To Be Reprocessed With Bias:')
# Turn the location -> bias mapping into a two-column table (LOCATION, BIAS) and store it.
# From here on `reprocess_with_bias` is a DataFrame, no longer a dict.
reprocess_with_bias = (
    pd.Series(reprocess_with_bias, name='BIAS')
    .rename_axis('LOCATION')
    .reset_index()
)
reprocess_with_bias.to_pickle(path_to_data+'account-locations-to-geocode-with-bias.pkl')
print('Done')

Save Locations To Be Reprocessed With Bias:
Done


In [21]:
# Preview the first rows of the table of locations to geocode again with a bias.
reprocess_with_bias.head()

Unnamed: 0,LOCATION,BIAS
0,Ireland,Ireland
1,Valencia,Spain
2,Cali,Mexico
3,Córdoba,Argentina
4,Brighton,GB


##### Verified Locations

In [22]:
# Remove Locations To be Reprocessed
# (`~` is the boolean negation of a mask; unary `-` is not defined for boolean arrays in numpy)
locations = locations[~locations['LOCATION'].isin(reprocess_with_bias['LOCATION'])].copy()

# Accumulators for the locations kept / discarded from the first geocoding pass.
verified = []
dropped  = []

When N falls below ~50 and the users' average geolocation differs from their account location, it sometimes indicates that users manually entered a location that can be identified by geocoding but does not correspond to their actual location. Their account location might be the city they are from or one that they identify with. We decided to drop these locations.

In [23]:
# Both lists are rebuilt from scratch (instead of being extended with `+=`),
# so re-running this cell does not append the same locations twice.
# CC comes from the account-location table, country_short from the geocode.
country_differs = locations['CC'] != locations['country_short']

# Fewer than 50 users and a geocoded country that differs from CC: dropped
dropped = list(locations.loc[(locations['N']<50) & country_differs, 'LOCATION'])
print('# dropped locations:',len(dropped),'of',locations.shape[0])

# Checked 
verified = list(locations.loc[(locations['N']>=50) & country_differs, 'LOCATION'])
print('# verified locations:',len(verified),'of',locations.shape[0]-len(dropped))

# Checked bottom 500 N and Match
verified += list(locations.loc[(locations['CC']==locations['country_short']),'LOCATION'])
print('# verified locations:',len(verified),'of',locations.shape[0]-len(dropped))

# dropped locations: 775 of 37094
# verified locations: 1280 of 36319
# verified locations: 36319 of 36319


# Import, Merge, and Format Geocodes Reprocessed With Bias

In [24]:
# Accumulators for the locations kept / discarded from the geocoding pass with bias.
verified_with_bias = []
dropped_with_bias  = []

In [25]:
# Account Locations Geocoded Using the Google API
# (second pass - presumably the locations saved to 'account-locations-to-geocode-with-bias.pkl'; TODO confirm)
# NOTE(review): unpickling can execute arbitrary code - only load pickles produced by this project.
geocoded_locations_with_bias = pd.read_pickle(path_to_data+'account-locations-geocoded-with-bias.pkl')
print('# Geocoded locations:', geocoded_locations_with_bias.shape[0])

# Geocoded locations: 5004


In [26]:
print('Merge and Format:')
start = timer()

# Merge Geocoded Locations With Reprocessed Locations (The Latter Keeps Being Updated So Use Left Merge)
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects in DataFrame.drop.
locations_with_bias = pd.merge(
    reprocess_with_bias[['LOCATION']],
    geocoded_locations_with_bias,
    left_on='LOCATION',
    right_on='raw_string',
    how='left').drop(columns='raw_string')

# Bring back the user count and country code of each location.
# No `on=` is given, so the join uses the columns both frames share
# (presumably only 'LOCATION' - TODO confirm against the geocoded file).
locations_with_bias = pd.merge(
    locations_with_bias,
    account_locations[['LOCATION','N','CC']],
    how='left')

# Free the raw geocodes; this cell cannot be re-run without reloading them.
del geocoded_locations_with_bias

print('# Locations Geocoded With Bias:', locations_with_bias.shape[0])

locations_with_bias = locations_with_bias.sort_values(by='N', ascending=False)

# Anything without an 'OK' status (including locations with no geocode at all) is dropped.
dropped_with_bias+=list(locations_with_bias.loc[locations_with_bias['status']!='OK','LOCATION'])

locations_with_bias = locations_with_bias[locations_with_bias['status']=='OK'].copy()

print('# Locations Geocoded With Bias With Result:', locations_with_bias.shape[0])

locations_with_bias = locations_with_bias[[
    'LOCATION',
    'input_string',
    'N',
    'CC',
    'formatted_address',
    'latitude',
    'longitude',
    'response',
    'type',
]].reset_index(drop=True).copy()

# Expand the first result of every response into columns.
# `axis` is passed by keyword: pandas >= 2.0 only accepts `objs` positionally in concat.
locations_with_bias = pd.concat(
    [locations_with_bias,
     locations_with_bias['response'].apply(lambda x: nth_result(x, 0))],
    axis=1)
del locations_with_bias['response']

locations_with_bias['MATCH'] = locations_with_bias.apply(match, axis=1)
locations_with_bias['ASCII'] = locations_with_bias['LOCATION'].apply(is_ascii)

print("Done in", round(timer()-start), "sec")

Merge and Format:
# Locations Geocoded With Bias: 4021
# Locations Geocoded With Bias With Result: 3469
Done in 2 sec


In [27]:
# Locations That Were Not Identified With Country Bias --> Reprocessed With Manually Included Bias
# Each line of the file is '<location>\t<fixed text>'; only the first tab separates the two.
location2manual = {}
with open(path_to_data+'account-locations-to-manual.txt', 'r') as f:
    for line in f:
        key, value = line.strip('\n').split('\t', 1)
        location2manual[key] = value
list(location2manual.items())[:10]

[('cali', 'Baja California, Mexico'),
 ('dmv', 'USA'),
 ('toledo', 'Toledo, Spain'),
 ('mia', 'Miami, USA'),
 ('qc', 'Qatar City'),
 ('(d)mv', 'Delaware, USA'),
 ('d(m)v', 'Maryland, USA'),
 ('dm(v)', 'Virginia, USA'),
 ('brighton & hove', 'Brighton, UK'),
 ('brighton & hove, uk', 'Brighton, UK')]

In [28]:
print('Save Locations To Be Reprocessed Manually:')

# Find Locations And Map Them Into Fixed Locations:
# every location whose stripped, lower-cased text is a key of location2manual
# is paired with the replacement text registered for that key.
reprocess_manually = {
    location: location2manual[location.lower().strip()]
    for location in locations_with_bias['LOCATION']
    if location.strip().lower() in location2manual
}

# Two-column table (LOCATION, FIXED), stored for the manual geocoding pass.
reprocess_manually = (
    pd.Series(reprocess_manually, name='FIXED')
    .rename_axis('LOCATION')
    .reset_index()
)
reprocess_manually.to_pickle(path_to_data+'account-locations-to-geocode-manually.pkl')

print('Done')

Save Locations To Be Reprocessed Manually:
Done


In [29]:
# Preview the first rows of the table of locations to geocode again with manually fixed text.
reprocess_manually.head()

Unnamed: 0,LOCATION,FIXED
0,Liverpool,"Liverpool, UK"
1,Nederland,Netherland
2,Cali,"Baja California, Mexico"
3,PH,Philippines
4,liverpool,"Liverpool, UK"


In [30]:
# Locations handed over to the manual pass are no longer part of this one.
# (`~` is the boolean negation of a mask.)
locations_with_bias = \
locations_with_bias[~locations_with_bias['LOCATION'].isin(reprocess_manually['LOCATION'])].copy()

# Verify the remaining locations group by group, in the order in which the
# groups were added to the reprocess list; the running total is printed after
# each group. One loop replaces five copy-pasted stanzas.
for group in (
    locations_with_fixed_region,
    locations_forgotten,
    locations_missed,
    locations_with_homonyms,
    locations_uninteresting,
):
    in_group = locations_with_bias['LOCATION'].isin(list(group))
    # The set removes locations that belong to more than one group.
    verified_with_bias = list(
        set(verified_with_bias).union(locations_with_bias.loc[in_group, 'LOCATION']))
    print('# verified locations:',len(verified_with_bias),'of',locations_with_bias.shape[0])

# verified locations: 92 of 2810
# verified locations: 197 of 2810
# verified locations: 204 of 2810
# verified locations: 667 of 2810
# verified locations: 2810 of 2810


# Import, Merge, and format manually processed geocodes

In [31]:
# Account Locations Geocoded Using the Google API
# (third pass - presumably the locations saved to 'account-locations-to-geocode-manually.pkl'; TODO confirm)
# NOTE(review): unpickling can execute arbitrary code - only load pickles produced by this project.
geocoded_locations_manually = pd.read_pickle(path_to_data+'account-locations-geocoded-manually.pkl')
print('# Geocoded locations:', geocoded_locations_manually.shape[0])

# Geocoded locations: 659


In [32]:
print('Merge and Format:')
start = timer()

# Merge Geocoded Locations With Reprocessed Locations (The Latter Keeps Being Updated So Use Left Merge)
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects in DataFrame.drop.
locations_manually = pd.merge(
    reprocess_manually[['LOCATION']],
    geocoded_locations_manually,
    left_on='LOCATION',
    right_on='raw_string',
    how='left').drop(columns='raw_string')

# Bring back the user count and country code of each location.
# No `on=` is given, so the join uses the columns both frames share
# (presumably only 'LOCATION' - TODO confirm against the geocoded file).
locations_manually = pd.merge(
    locations_manually,
    account_locations[['LOCATION','N','CC']],
    how='left')

# Free the raw geocodes; this cell cannot be re-run without reloading them.
del geocoded_locations_manually

# NOTE(review): the two labels below still say "With Bias" although this is the manual pass.
print('# Locations Geocoded With Bias:', locations_manually.shape[0])

locations_manually = locations_manually.sort_values(by='N', ascending=False)

# Anything without an 'OK' status (including locations with no geocode at all) is dropped.
dropped_manually=list(locations_manually.loc[locations_manually['status']!='OK','LOCATION'])

locations_manually = locations_manually[locations_manually['status']=='OK'].copy()

print('# Locations Geocoded With Bias With Result:', locations_manually.shape[0])

locations_manually = locations_manually[[
    'LOCATION',
    'input_string',
    'N',
    'CC',
    'formatted_address',
    'latitude',
    'longitude',
    'response',
    'type',
]].reset_index(drop=True).copy()

# Expand the first result of every response into columns.
# `axis` is passed by keyword: pandas >= 2.0 only accepts `objs` positionally in concat.
locations_manually = pd.concat(
    [locations_manually,
     locations_manually['response'].apply(lambda x: nth_result(x, 0))],
    axis=1)
del locations_manually['response']

# Every manually fixed location that produced a result is accepted as is.
verified_manually = list(locations_manually['LOCATION'])

print("Done in", round(timer()-start), "sec")

Merge and Format:
# Locations Geocoded With Bias: 659
# Locations Geocoded With Bias With Result: 650
Done in 1 sec


# Final List of Locations

In [54]:
# Columns kept in the final dataset: the location and its user count, the
# geocoded coordinates, then the long/short names of each administrative level.
cols = [
    'LOCATION', 'N',
    'latitude', 'longitude',
    'country_long', 'country_short',
    'locality_long', 'locality_short',
    'administrative_area_level_1_long', 'administrative_area_level_1_short',
    'administrative_area_level_2_long', 'administrative_area_level_2_short',
]

In [80]:
# Stack the verified locations of the three geocoding passes, order them by
# number of users and give the two upper-case columns their final names.
locations_final = (
    pd.concat([
        locations.loc[locations['LOCATION'].isin(verified), cols],
        locations_with_bias.loc[locations_with_bias['LOCATION'].isin(verified_with_bias), cols],
        locations_manually.loc[locations_manually['LOCATION'].isin(verified_manually), cols],
    ])
    .sort_values(by='N', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'LOCATION': 'user_location', 'N': 'n_users'}, errors='ignore')
)

In [32]:
# Store the final table one level above the geocoding folder, as a pickle and as a CSV
# (floats are written with 10 decimals in the CSV).
locations_final.to_pickle(path_to_data+'../account-locations.pkl')
locations_final.to_csv(path_to_data+'../account-locations.csv',float_format='%.10f')

# Check Final Dataset

In [29]:
# Reload the stored dataset (presumably so that the checks below can run without the cells above).
locations_final=pd.read_pickle(path_to_data+'../account-locations.pkl')

In [33]:
# Number of distinct account locations in the final dataset.
# `user_location` only exists on `locations_final`; `locations` still calls this column 'LOCATION'.
locations_final.user_location.unique().shape[0]

39779

In [34]:
# Number of distinct country codes (a missing value, if any, counts as one).
locations_final.country_short.unique().shape[0]

202

In [35]:
# Number of distinct level-1 administrative areas (a missing value, if any, counts as one).
locations_final.administrative_area_level_1_short.unique().shape[0]

1246

In [36]:
# Number of distinct level-2 administrative areas (a missing value, if any, counts as one).
locations_final.administrative_area_level_2_short.unique().shape[0]

3651

In [37]:
# Number of distinct localities (a missing value, if any, counts as one).
locations_final.locality_short.unique().shape[0]

7700