In [1]:
from timeit import default_timer as timer
import sys
import os
import numpy as np
import pandas as pd
import re
import string
import reverse_geocoder as rg
from iso3166 import countries

In [2]:
# Directory holding the geocoding inputs and outputs. File paths are built by
# plain string concatenation, so the trailing slash is required.
path_to_output_files = '../../data/locations/profiles/geocoding/'

# Locations reported by fewer sampled users than this are dropped later on.
cutoff = 25
print("# Min number of sampled users:", cutoff)

# Min number of sampled users: 25


In [3]:
print("Load...")
start = timer()

# NOTE(review): unpickling can execute arbitrary code; only load this file when
# it was produced by a trusted step of the pipeline.
geocoded_pickle = path_to_output_files + 'account-locations-with-geocoordinates.pkl'
locations = pd.read_pickle(geocoded_pickle)

# One row per distinct location string; column N is summed as the user count.
print('# Locations:', len(locations))
print('#Users At Sampled Locations:', locations['N'].sum())

print("Done in", round(timer() - start), "sec")

Load...
# Locations: 4416858
#Users At Sampled Locations: 27081990
Done in 2 sec


In [4]:
print('Remove locations with not enough sampled users...')
start = timer()

# Many otherwise valid entries name several places at once.
# Entries with a low N are dropped for now and could be parsed later on.
has_enough_users = locations['N'] >= cutoff
locations = locations.loc[has_enough_users].copy()

print('# Locations:', len(locations))
print('# Users At Sampled Locations:', locations['N'].sum())

print("Done in", round(timer() - start), "sec")

Remove locations with not enough sampled users...
# Locations: 44027
# Users At Sampled Locations: 20691832
Done in 1 sec


In [5]:
# `string` is already imported in the first cell with the other modules, so it
# is not imported again here.

# Translation table that deletes every ASCII punctuation character. It is built
# once instead of once per line of the file.
punctuation_stripper = str.maketrans('', '', string.punctuation)

with open(path_to_output_files+'account-locations-bad.txt', 'r', encoding='utf-8') as f:
    # Normalise each line: lowercase, trim surrounding whitespace (this already
    # removes the trailing newline), then delete punctuation. Collecting into a
    # set drops duplicate entries as the file is read.
    unique_bad_locations = {
        line.lower().strip().translate(punctuation_stripper) for line in f
    }

# Sorted list of the distinct normalised blacklist entries. Lines that are
# blank or made only of punctuation normalise to the empty string.
bad_locations = sorted(unique_bad_locations)
print('# Some Bad Locations:',len(bad_locations))
print(' \n '.join(bad_locations))

# Some Bad Locations: 1367
 
    
  ͡° ͜ʖ ͡° 
 a 
 a place 
 ab 
 abbey road 
 abc 
 above you 
 ac 
 acampamento meiosangue 
 achter je 
 ad ♥ 
 ad♥ 
 africa 
 afrika 
 aincrad 
 aki 
 al fondo a la derecha 
 alberti 
 albion 
 all around 
 all around the world 
 all over 
 all over the place 
 all over the world 
 allah swt 
 allahs 
 allianz arena 
 almost heaven 
 ambon manise 
 america latina 
 américa latina 
 andalasia 
 andromeda 
 andromeda galaxy 
 anfield 
 anfield liverpool 
 anfield road 
 anfieldliverpool 
 another world 
 antartica 
 anytown usa 
 anywhere 
 anywhere and everywhere 
 anywhere but here 
 anywhere in the world 
 anywhere you want 
 aor 
 aqui 
 aqui y alla 
 aquí 
 aquí y ahora 
 aquí y allá 
 araf 
 arendelle 
 arg obvio 
 arkham asylum 
 arkham city 
 around 
 around the world 
 around you 
 arsenal 
 asgard 
 ashburton grove 
 asia 
 asia pacific 
 asia tenggara 
 asian 
 at home 
 at my house 
 atlantic ocean 
 atlantis 
 atlien 
 aussie 
 avalon 
 aze

In [6]:
print('Remove bad locations...')
start = timer()

# Normalise each location the same way the blacklist entries were normalised:
# lowercase, trim surrounding whitespace, delete ASCII punctuation. The
# translation table is built once rather than once per row.
punctuation_stripper = str.maketrans('', '', string.punctuation)
normalised_location = locations['LOCATION'].apply(
    lambda x: x.lower().strip().translate(punctuation_stripper))

# isin() against a set avoids scanning the sorted blacklist list for every row.
# `~` is the element-wise NOT for boolean masks (replaces unary minus).
is_blacklisted = normalised_location.isin(set(bad_locations))
locations = locations[~is_blacklisted].copy()

# Drop every location whose text mentions "twitter", regardless of case.
mentions_twitter = locations['LOCATION'].apply(lambda x: 'twitter' in x.lower())
locations = locations[~mentions_twitter].copy()

print('# Locations:',locations.shape[0])
print('#Users At Sampled Locations:',locations['N'].sum())

print("Done in", round(timer()-start), "sec")

Remove bad locations...
# Locations: 41526
#Users At Sampled Locations: 9085577
Done in 1 sec


In [7]:
print('Keep Phone Codes:')

# Set aside a copy of the rows whose location text contains a literal '+'
# (presumably international dialling prefixes; verify against the data).
has_plus_sign = locations['LOCATION'].apply(lambda x: '+' in x)
phone_codes = locations.loc[has_plus_sign].copy()

print('# Phone Codes:', phone_codes.shape[0])

Keep Phone Codes:
# Phone Codes: 20


In [8]:
# Preview the first five rows that were set aside as phone codes.
phone_codes.head(n=5)

Unnamed: 0,LOCATION,POINT,N
688,62,"(-5.86454, 106.62741)",1883
2582,65,"(1.4607, 103.763443)",508
5901,965,"(29.09105374, 46.85423523)",211
8491,254,"(-1.0941961, 36.2774081)",143
12956,6221,"(-6.25490778, 107.06305685)",91


In [9]:
def has_digit(x):
    """Tell whether the string ``x`` contains at least one digit character.

    Returns True when the regular expression ``\\d`` matches anywhere in ``x``
    and False otherwise (including for the empty string).
    """
    # Observations recorded by the author about locations containing digits:
    #   above N = 25: a few phone codes, mostly noise;
    #   below N = 5: some phone codes, some geocoordinates, some zipcodes.
    first_digit = re.search(r'\d', x)
    return first_digit is not None

print('Remove locations with any digit...')
start = timer()

# Boolean mask: True for every location string that contains a digit.
contains_digit = locations['LOCATION'].apply(has_digit)

# `~` is the element-wise NOT for boolean masks (replaces unary minus).
locations = locations.loc[~contains_digit].copy()
print('# Locations:',locations.shape[0])
print('#Users At Sampled Locations:',locations['N'].sum())

print("Done in", round(timer()-start), "sec")

Remove locations with any digit...
# Locations: 41178
#Users At Sampled Locations: 9054796
Done in 0 sec


In [10]:
def is_punctuation(x):
    """Tell whether every character of ``x`` is an ASCII punctuation mark.

    Punctuation means the characters listed in ``string.punctuation``. The
    empty string has no offending character, so it also yields True.
    """
    punctuation_chars = frozenset(string.punctuation)
    return punctuation_chars.issuperset(x)
    
print('Remove locations with ONLY punctuation...')
start = timer()

# Boolean mask: True for location strings made up solely of punctuation.
only_punctuation = locations['LOCATION'].apply(is_punctuation)

# `~` is the element-wise NOT for boolean masks (replaces unary minus).
locations = locations.loc[~only_punctuation].copy()
print('# Locations:',locations.shape[0])
print('#Users At Sampled Locations:',locations['N'].sum())

print("Done in", round(timer()-start), "sec")

Remove locations with ONLY punctuation...
# Locations: 41178
#Users At Sampled Locations: 9054796
Done in 0 sec


In [11]:
def is_space(x):
    """Return the result of ``x.isspace()``.

    For a string this is True when it is non-empty and consists only of
    whitespace characters, and False otherwise.
    """
    only_whitespace = x.isspace()
    return only_whitespace

print('Remove locations with only white space...')
start = timer()

# Boolean mask: True for location strings made up solely of whitespace.
only_whitespace = locations['LOCATION'].apply(is_space)

# `~` is the element-wise NOT for boolean masks (replaces unary minus).
locations = locations.loc[~only_whitespace].copy()
print('# Locations:',locations.shape[0])
print('#Users At Sampled Locations:',locations['N'].sum())

print("Done in", round(timer()-start), "sec")

Remove locations with only white space...
# Locations: 41178
#Users At Sampled Locations: 9054796
Done in 0 sec


In [12]:
# Case-sensitive rewrites applied, in this order, before looking for rejected
# punctuation. They whitelist specific known-good spellings. The original
# chain listed some of these pairs several times; the repeats could never
# match after the first pass, so each pair appears once here.
_LOCATION_REWRITES = (
    (':)', ''),
    ('Trinidad & Tobago', 'Trinidad and Tobago'),
    ('Trinidad&Tobago', 'Trinidad and Tobago'),
    ('Brighton & Hove', 'Brighton and Hove'),
    ('#Puebla', 'Puebla'),
    ('@istanbul', 'istanbul'),
    ('@jakarta', 'jakarta'),
    ('@Thailand', 'Thailand'),
    ('Buenos Aires; Argentina', 'Buenos Aires, Argentina'),
)

# Punctuation that is allowed to appear in a location string.
_TOLERATED_PUNCTUATION = frozenset("_,.-/'()!|")

# Every other ASCII punctuation character marks the location as bad.
_REJECTED_PUNCTUATION = frozenset(string.punctuation) - _TOLERATED_PUNCTUATION


def has_punctuation(x):
    """Tell whether ``x`` contains punctuation that marks it as a bad location.

    The string is first passed through a fixed list of case-sensitive rewrites
    that whitelist a few known-good spellings (for example
    'Trinidad & Tobago' or '#Puebla') and remove the ':)' smiley. The result is
    then scanned for any ASCII punctuation character outside the tolerated set
    ``_ , . - / ' ( ) ! |``.

    Parameters
    ----------
    x : str
        Raw location string.

    Returns
    -------
    bool
        True if a rejected punctuation character remains after the rewrites,
        False otherwise (including for the empty string).
    """
    # Weird punctuation is often a good predictor of a bad location. Some marks
    # are tolerated all the same:
    #   '.'  used e.g. in Washington, D.C., Mexico, D.F., etc.
    #   '-'  often used in <city> - <country>
    #   '()' often used in <city> (<country>)
    #   '/'  used to indicate states in Brazil
    #   '!', '|', '_', ':)' appear in valid locations, and in bad ones that
    #        were removed up to N=25
    # Author's note: this filter removes 3% of locations.
    for old_text, new_text in _LOCATION_REWRITES:
        x = x.replace(old_text, new_text)

    return any(char in _REJECTED_PUNCTUATION for char in x)

print('Remove locations with some punctuation...')
start = timer()

# Boolean mask: True for location strings flagged by has_punctuation.
flagged_punctuation = locations['LOCATION'].apply(has_punctuation)

# `~` is the element-wise NOT for boolean masks (replaces unary minus).
locations = locations.loc[~flagged_punctuation].copy()
print('# Locations:',locations.shape[0])
print('#Users At Sampled Locations:',locations['N'].sum())

print("Done in", round(timer()-start), "sec")

Remove locations with some punctuation...
# Locations: 41115
#Users At Sampled Locations: 9050984
Done in 0 sec


In [13]:
def is_empty(x):
    """Return True when ``x`` is falsy (e.g. the empty string), else False."""
    return not x
    
print('Remove Empty Locations...')
start = timer()

# Boolean mask: True for empty location strings.
empty_location = locations['LOCATION'].apply(is_empty)

# `~` is the element-wise NOT for boolean masks (replaces unary minus).
locations = locations.loc[~empty_location].copy()
print('# Locations:',locations.shape[0])
print('#Users At Sampled Locations:',locations['N'].sum())

print("Done in", round(timer()-start), "sec")

Remove Empty Locations...
# Locations: 41115
#Users At Sampled Locations: 9050984
Done in 0 sec


In [14]:
print('Selected Locations to be Geocoded...')
start = timer()

# Frames that make up the final selection. phone_codes is currently left out
# (it was commented out of this list); add it here to include those rows.
frames_to_geocode = [locations]
combined = pd.concat(frames_to_geocode)

# Largest user counts first, then renumber the rows from zero.
locations = combined.sort_values(by='N', ascending=False).reset_index(drop=True)
print('# Selected Locations:', len(locations))
print('#Users At Selected Locations:', locations['N'].sum())

locations.head()

Selected Locations to be Geocoded...
# Selected Locations: 41115
#Users At Selected Locations: 9050984


Unnamed: 0,LOCATION,POINT,N
0,Indonesia,"(-5.53034, 106.54255)",98240
1,London,"(49.01730709, 1.43906842)",88896
2,Brasil,"(-17.99120486, -45.60141543)",70885
3,Jakarta,"(-5.19392835, 103.92920358)",55842
4,Philippines,"(14.604133, 120.017236)",51989


# Reverse Geocode Representative Point

In [15]:
print("Reverse geocode representative point...")
start = timer()

# Look up every POINT value with reverse_geocoder and load the results into a
# frame (presumably one record per input point, in input order; confirm
# against the reverse_geocoder documentation).
geocoder_hits = pd.DataFrame(rg.search(list(locations['POINT'].values)))

# Drop the listed columns and upper-case the remaining column names (the code
# below reads 'CC'). `columns=` is passed by keyword: a positional axis was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
reversed_geocodes = geocoder_hits.drop(
    columns=['lat','lon','name','admin1','admin2']).rename(columns=lambda x:x.upper())
print('# Geocodes:',reversed_geocodes.shape[0])

# Translate the two-letter country code into the iso3166 country name.
reversed_geocodes['COUNTRY'] = reversed_geocodes['CC'].apply(lambda x:countries.get(x).name)

# Column-wise join; rows are aligned on the index. `axis=` is passed by
# keyword for the same pandas 2.0 reason as above.
# NOTE(review): in the stored output of this notebook 'London' is assigned
# CC 'FR', so the representative POINT looks unreliable for some rows.
locations = pd.concat([locations,reversed_geocodes],axis=1)

del reversed_geocodes

print("Done in", round(timer()-start), "sec")

Reverse geocode representative point...
Loading formatted geocoded file...
# Geocodes: 41115
Done in 2 sec


# Save

In [16]:
print('Save selected locations to be geocoded:')
start = timer()

# Pickle the selected locations into the geocoding output directory.
output_pickle = path_to_output_files + 'account-locations-to-geocode.pkl'
locations.to_pickle(output_pickle)

print("Done in", round(timer() - start), "sec")

Save selected locations to be geocoded:
Done in 0 sec


In [17]:
# Preview the ten locations with the most sampled users.
locations.head(n=10)

Unnamed: 0,LOCATION,POINT,N,CC,COUNTRY
0,Indonesia,"(-5.53034, 106.54255)",98240,ID,Indonesia
1,London,"(49.01730709, 1.43906842)",88896,FR,France
2,Brasil,"(-17.99120486, -45.60141543)",70885,BR,Brazil
3,Jakarta,"(-5.19392835, 103.92920358)",55842,ID,Indonesia
4,Philippines,"(14.604133, 120.017236)",51989,PH,Philippines
5,İstanbul,"(40.78565557, 28.87995098)",50755,TR,Turkey
6,istanbul,"(40.76090685, 28.8750945)",39241,TR,Turkey
7,indonesia,"(-5.3072557, 106.6885)",39057,ID,Indonesia
8,Argentina,"(-31.9880371, -59.2789376)",36977,AR,Argentina
9,Bandung,"(-6.74666667, 105.33361111)",33295,ID,Indonesia
