In [222]:
from timeit import default_timer as timer
import sys
import os
import numpy as np
import pandas as pd
from shapely.geometry import MultiPoint

# Params

In [7]:
# Directory read for the pickled tweet batches, and directory the result is written to
path_to_input_files = '/scratch/spf248/twitter/data/tweets/tweets-with-geocoordinates/'
path_to_output_files = '/scratch/spf248/twitter/data/locations/profiles/geocoding/'

# Keep the directory entries whose name contains both '.pkl' and 'tweets', in sorted order
input_files = sorted(
    filename
    for filename in os.listdir(path_to_input_files)
    if '.pkl' in filename and 'tweets' in filename
)
print('# Input Files:', len(input_files))

Tweets from Decahose: v4
# Input Files: 94


# Import Geolocated Tweets

In [3]:
def import_data(input_file):
    """Load one xz-compressed pickle of tweets and keep the complete geolocation rows.

    Parameters
    ----------
    input_file : str
        File name appended to the module-level ``path_to_input_files``.

    Returns
    -------
    pandas.DataFrame
        Columns 'USER ID', 'USER LOCATION', 'LAT' and 'LON', with every row
        containing a missing value in any of them removed.
    """
    columns = ['USER ID','USER LOCATION','LAT','LON']
    batch = pd.read_pickle(path_to_input_files+input_file, compression='xz')
    return batch[columns].dropna()

In [4]:
print("Import...")
start = timer()

# Collect each file's frame and concatenate once at the end: calling pd.concat
# inside the loop re-copies every previously loaded row on each iteration.
frames = []
n_tweets = 0

for i,input_file in enumerate(input_files):

    frames.append(import_data(input_file))
    n_tweets += frames[-1].shape[0]

    # Progress report on every 10th file (i = 0, 10, 20, ...)
    if not i%10:
        print('# Tweets:', n_tweets)

# pd.concat raises on an empty list, so fall back to an empty frame when no file was listed
tweets = pd.concat(frames) if frames else pd.DataFrame()
del frames

print("Done in", round(timer()-start), "sec")

Import...
# Tweets: 0
Done in 4 sec


# Get Users Geolocation

In [5]:
print("Prepare data to be aggregated...")
start = timer()

print('# Tweets:', tweets.shape[0])

# Pack each tweet's coordinates into a single (LAT, LON) tuple
tweets['POINT'] = list(zip(tweets['LAT'], tweets['LON']))
# Name the axis through `columns=`: passing it positionally (`drop(labels, 1)`)
# was deprecated in pandas 1.3 and is rejected by pandas >= 2.0.
tweets.drop(columns=['LAT','LON'],inplace=True)

# Sort Once To Prepare Aggregation
tweets.sort_values(by='USER ID',inplace=True)

print("Done in", round(timer()-start), "sec")

Prepare data to be aggregated...
# Tweets: 436444
Done in 1 sec


In [6]:
# Preview the first five rows of the prepared tweets
tweets.head(n=5)

Unnamed: 0,USER ID,USER LOCATION,POINT
70310,100001920,"puebla, mexico","(17.54886728, -98.57749795)"
23630,100002734,Taubaté - SP - Brazil,"(-23.0418578, -45.55798539)"
21569,100002734,Taubaté - SP - Brazil,"(-23.0418578, -45.55798539)"
9895,100002734,Taubaté - SP - Brazil,"(-23.0418578, -45.55798539)"
19360,100002734,Taubaté - SP - Brazil,"(-23.0418578, -45.55798539)"


In [7]:
def groupby_apply(df, func, sort=True):
    """Group a two-column frame by its first column and apply ``func`` to each group.

    Written as a lightweight replacement for ``df.groupby(key)[value].apply(func)``:
    the value array is split at the positions where the key changes. The original
    author reports roughly a factor of 10 speed-up over pandas groupby (not re-measured).

    Parameters
    ----------
    df : pandas.DataFrame
        Exactly two columns, in the order [name_key, name_value].
    func : callable
        Called once per distinct key with a 1-D numpy array of that key's values.
    sort : bool, default True
        Sort the rows by key before splitting. Pass False only when the rows are
        already ordered by key: the split positions come from ``np.unique``
        (first occurrence of each sorted key), so unsorted keys yield wrong groups.

    Returns
    -------
    pandas.Series
        One entry per distinct key, named ``name_value`` and indexed by the
        sorted unique keys (index named ``name_key``). Empty for an empty frame.
    """
    # Input a dataframe with columns [name_key, name_value]
    name_key, name_value = df.columns

    if sort:
        keys, values = df.sort_values(by=name_key).values.T
    else:
        keys, values = df.values.T

    # Extract Unique Keys And Index Of Transitions
    unique_keys, index = np.unique(keys, return_index=True)
    key_index = pd.Index(unique_keys, name=name_key)

    # np.split(values, []) still returns one (empty) chunk, which would not match
    # the empty key index, so an empty input is answered directly.
    if len(unique_keys) == 0:
        return pd.Series([], dtype=object, name=name_value, index=key_index)

    # Split Values According to Transitions
    arrays = np.split(values, index[1:])

    # Apply func to each array of values corresponding to a given key
    return pd.Series([func(a) for a in arrays], name=name_value, index=key_index)

In [8]:
print("Groupby users...")
start = timer()

# sort=False relies on tweets already being ordered by 'USER ID'.
# `axis` is passed by keyword: pd.concat only accepts `objs` positionally in pandas >= 2.0.
users = pd.concat([
groupby_apply(tweets[['USER ID','USER LOCATION']],set,sort=False).rename('LOCATION'),
groupby_apply(tweets[['USER ID','POINT']],lambda x:x.tolist(),sort=False),
],axis=1)

# Release the tweet-level frame; only the per-user aggregate is used from here on
del tweets

print('# Users:',users.shape[0])

print("Done in", round(timer()-start), "sec")

Groupby users...
# Users: 167730
Done in 2 sec


In [9]:
# Preview the per-user aggregate: set of declared locations and list of tweet points
users.head(n=5)

Unnamed: 0_level_0,LOCATION,POINT
USER ID,Unnamed: 1_level_1,Unnamed: 2_level_1
100001920,"{puebla, mexico}","[(17.54886728, -98.57749795)]"
100002734,{Taubaté - SP - Brazil},"[(-23.0418578, -45.55798539), (-23.0418578, -4..."
100002918,{(;},"[(47.59587574, -52.68829447)]"
100003297,"{Daytona Beach, FL}","[(29.195698, -81.063029), (29.1987957, -81.042..."
100006175,{},"[(-7.11710053, -34.83502584)]"


In [10]:
print("Keep users with only one location...")
start = timer()

# Presumably users with multiple locations are harder to geolocate
has_single_location = users['LOCATION'].map(len) == 1
users = users.loc[has_single_location].copy()

# Unwrap each one-element set into its sole member
users['LOCATION'] = users['LOCATION'].map(lambda locations_set: locations_set.pop())
print('# Users:',users.shape[0])

print("Done in", round(timer()-start), "sec")

Keep users with only one location...
# Users: 161634
Done in 0 sec


In [11]:
# Preview the users kept, with LOCATION unwrapped to a single value
users.head(n=5)

Unnamed: 0_level_0,LOCATION,POINT
USER ID,Unnamed: 1_level_1,Unnamed: 2_level_1
100001920,"puebla, mexico","[(17.54886728, -98.57749795)]"
100002734,Taubaté - SP - Brazil,"[(-23.0418578, -45.55798539), (-23.0418578, -4..."
100002918,(;,"[(47.59587574, -52.68829447)]"
100003297,"Daytona Beach, FL","[(29.195698, -81.063029), (29.1987957, -81.042..."
100006175,,"[(-7.11710053, -34.83502584)]"


In [12]:
print("Compute representative point per user...")
start = timer()

def _user_representative_point(points):
    """Collapse one user's list of (LAT, LON) tuples into a single shapely point."""
    # representative_point() is shapely's "some point within the geometry";
    # it is not documented as a centroid or median -- see the shapely manual.
    return MultiPoint(points).representative_point()

users['POINT'] = users['POINT'].apply(_user_representative_point)

print("Done in", round(timer()-start), "sec")

Compute representative point per user...
Done in 9 sec


# Get Account Locations' Geocoordinates

In [13]:
print("Sort by location...")
start = timer()

# Order users by declared location, in place, ahead of the per-location grouping
users.sort_values('LOCATION', inplace=True)

elapsed = round(timer()-start)
print("Done in", elapsed, "sec")

Sort by location...
Done in 0 sec


In [14]:
print("Groupby location...")
start = timer()

# sort=False relies on users already being ordered by 'LOCATION'.
# `axis` is passed by keyword: pd.concat only accepts `objs` positionally in pandas >= 2.0.
locations = pd.concat([
# All user points sharing the same declared location
groupby_apply(users[['LOCATION','POINT']],lambda x:x.tolist(),sort=False),
# Number of users declaring that location
groupby_apply(users[['LOCATION','POINT']],len,sort=False).rename('N'),
],axis=1)

# Release the user-level frame; only the per-location aggregate is used from here on
del users

print('# Locations:', locations.shape[0])

# Most frequent locations first, with LOCATION moved from the index to a column
locations.sort_values(by='N',ascending=False,inplace=True)
locations.reset_index(inplace=True)

print("Done in", round(timer()-start), "sec")

Groupby location...
# Locations: 82093
Done in 1 sec


In [15]:
print("Compute representative point per location...")
start = timer()

def _location_coordinates(points):
    """Reduce the points of one location to the (x, y) tuple of their representative point."""
    representative = MultiPoint(points).representative_point()
    return (representative.x, representative.y)

locations['POINT'] = locations['POINT'].apply(_location_coordinates)

print("Done in", round(timer()-start), "sec")

Compute representative point per location...
Done in 6 sec


In [11]:
# Preview the ten locations declared by the most users
locations.head(n=10)

Unnamed: 0,LOCATION,POINT,N
0,,"(25.299, -4.918)",11304513
1,Indonesia,"(-5.53034, 106.54255)",98240
2,London,"(49.01730709, 1.43906842)",88896
3,Brasil,"(-17.99120486, -45.60141543)",70885
4,Jakarta,"(-5.19392835, 103.92920358)",55842
5,Philippines,"(14.604133, 120.017236)",51989
6,İstanbul,"(40.78565557, 28.87995098)",50755
7,istanbul,"(40.76090685, 28.8750945)",39241
8,indonesia,"(-5.3072557, 106.6885)",39057
9,Argentina,"(-31.9880371, -59.2789376)",36977


# Save

In [12]:
print("Save...")
start = timer()

# Pickle the per-location table into the output directory
output_file = path_to_output_files+'account-locations-with-geocoordinates.pkl'
locations.to_pickle(output_file)

elapsed = round(timer()-start)
print("Done in", elapsed, "sec")

Save...
Done in 5 sec
