In [1]:
from glob import glob
import pandas as pd
import numpy as np
from timeit import default_timer as timer
from itertools import chain

In [2]:
def groupby_apply(df, func, sort=True):
    """Group a two-column frame by its first column and reduce the second.

    Hand-rolled alternative to ``df.groupby(key)[value].apply(func)``: the
    rows are ordered by key, the value column is cut at every position where
    the key changes, and ``func`` is called once per resulting chunk.
    (The original author's note claims roughly a 10x speed-up over pandas
    groupby; that figure is not verified here.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with exactly two columns, ``[name_key, name_value]``.
    func : callable
        Called with the 1-D array of values that share one key.
    sort : bool, default True
        Sort the rows by key before splitting. Pass ``False`` only when
        ``df`` is already ordered by key; with unsorted keys the split
        points do not correspond to key boundaries.

    Returns
    -------
    pandas.Series
        One entry per unique key, named after the value column and indexed
        by the unique keys (index named after the key column). An empty
        input yields an empty Series.
    """
    # Input a dataframe with columns [name_key, name_value]
    name_key, name_value = df.columns

    if sort:
        keys, values = df.sort_values(by=name_key).values.T
    else:
        keys, values = df.values.T

    # Nothing to group: np.split would still return one (empty) chunk, which
    # cannot be aligned with an empty index and used to raise a ValueError.
    if len(keys) == 0:
        return pd.Series([], name=name_value, index=pd.Index([], name=name_key), dtype=object)

    # Unique keys and, for each, the position of its first occurrence;
    # on key-sorted data these positions are exactly the group boundaries.
    unique_keys, first_positions = np.unique(keys, return_index=True)

    # Split values at every boundary except the leading one (always 0)
    arrays = np.split(values, first_positions[1:])

    # Apply func to each array of values corresponding to a given key
    return pd.Series([func(a) for a in arrays], name=name_value, index=pd.Index(unique_keys, name=name_key))

In [3]:
# Directory that is globbed for the per-block user files (relative to the notebook)
path_to_input_files  = '../data/decahose/parsed/users/'
# Directory where the merged users/locations pickles are written and read back
path_to_output_files = '../data/decahose/parsed/'

In [4]:
# Collect every file in the input directory whose name starts with
# 'users-from-decahose'; glob returns them in arbitrary order (main() sorts them).
input_files = glob(path_to_input_files+'users-from-decahose*')
print('# Input Files:', len(input_files))

# Input Files: 2


In [5]:
def get_users_and_locations(input_file):
    """Build the user->locations and location->users tables for one file.

    Parameters
    ----------
    input_file : str
        Path to an xz-compressed pickle holding a DataFrame with at least
        the columns 'USER ID' and 'USER LOCATION'.

    Returns
    -------
    (pandas.Series, pandas.Series)
        ``users``: indexed by 'USER ID', each value the set of locations
        seen for that user. ``locations``: indexed by 'USER LOCATION', each
        value the set of user ids seen with that location.
    """
    # NOTE(review): read_pickle executes arbitrary code from the file;
    # presumably these inputs are produced by this project -- confirm.
    records = pd.read_pickle(input_file, compression='xz')

    # Order by user so each user's rows are contiguous, then collect
    # the locations of every user into a set.
    by_user = records.sort_values(by='USER ID')
    del records
    users = groupby_apply(by_user[['USER ID', 'USER LOCATION']], set, sort=False)

    # Re-order the user-sorted rows by location and collect the user ids
    # attached to every location.
    by_location = by_user.sort_values(by='USER LOCATION')
    del by_user
    locations = groupby_apply(by_location[['USER LOCATION', 'USER ID']], set, sort=False)

    return users, locations

In [6]:
def main():
    """Merge the per-file user/location tables and save the combined result.

    Processes the module-level ``input_files`` in sorted order. The first
    file seeds the accumulated tables; every later file is concatenated
    onto them and then reduced so that each key appears once with the union
    of its sets. The two final tables are written as xz-compressed pickles
    under ``path_to_output_files``. Progress and timings are printed.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    FileNotFoundError
        If ``input_files`` is empty. Previously this case surfaced as an
        UnboundLocalError at the save step.
    """
    files = sorted(input_files)

    # Fail early with a clear message instead of crashing when saving
    # tables that were never created.
    if not files:
        raise FileNotFoundError('No input files to concatenate: input_files is empty')

    def union_of_sets(sets):
        # Flatten the sets that share one key into a single set
        return set(chain.from_iterable(sets))

    for i, input_file in enumerate(files):

        start = timer()

        print()
        print('File:', i, input_file)

        if i == 0:

            # First file: its tables become the accumulators
            all_users, all_locations = get_users_and_locations(input_file)

        else:

            users, locations = get_users_and_locations(input_file)

            print('# New Users:', users.shape[0])
            print('# News Locations:', locations.shape[0])

            # Stack old and new rows, order by key so duplicates are adjacent
            # (required by groupby_apply with sort=False), and turn the index
            # back into a column to get the two-column layout it expects.
            all_users = pd.concat([all_users, users]).sort_index().reset_index()
            del users

            print('# All Users after Concatenation:', all_users.shape[0])

            all_users = groupby_apply(all_users, union_of_sets, sort=False)

            print('# All Users after Reduction:', all_users.shape[0])

            # Same merge for the location table
            all_locations = pd.concat([all_locations, locations]).sort_index().reset_index()
            del locations

            print('# All Locations after Concatenation:', all_locations.shape[0])

            all_locations = groupby_apply(all_locations, union_of_sets, sort=False)

            print('# All Locations after Reduction:', all_locations.shape[0])

        print('# All Users:', all_users.shape[0])
        print('# All Locations:', all_locations.shape[0])

        print("Done in", round(timer()-start), "sec")

    print('Save:')
    start = timer()

    all_users.to_pickle(path_to_output_files+'all-users-from-decahose.pkl.xz',compression='xz')
    all_locations.to_pickle(path_to_output_files+'all-locations-from-decahose.pkl.xz',compression='xz')

    print("Done in", round(timer()-start), "sec")

    return 0

In [7]:
# Run the whole concatenation and report the wall-clock time it took.
print("Concatenate Files...")
start = timer()

# Guard is true when the cell runs in a notebook kernel or as a script
if __name__ == "__main__":
    main()
    
end = timer()
print()
print('Total Computing Time:', round(end - start), 'Sec')

Concatenate Files...

File: 0 ../data/decahose/parsed/users/users-from-decahose-partition-0-block-0.pkl.xz
# All Users: 202917
# All Locations: 110887
Done in 3 sec

File: 1 ../data/decahose/parsed/users/users-from-decahose-partition-0-block-1.pkl.xz
# New Users: 202917
# News Locations: 110887
# All Users after Concatenation: 405834
# All Users after Reduction: 202917
# All Locations after Concatenation: 221774
# All Locations after Reduction: 110887
# All Users: 202917
# All Locations: 110887
Done in 9 sec
Save:
Done in 9 sec

Total Computing Time: 21 Sec


In [9]:
# Read back the merged users table written by main() to inspect the result
pd.read_pickle(path_to_output_files+'all-users-from-decahose.pkl.xz',compression='xz')

USER ID
100004737                       {Ucayali, Peru}
1000062301             {En tus sueños húmedos.}
100007440       {Under the Contrails, Michigan}
100007602                    {Obson, Sonora,Mx}
100008258                               {あぁーっと}
100008734                              {Brazil}
100010191                            {海のみえない横浜}
1000114350                              {Metz }
1000170540                        {Puerto Rico}
100017257                    {Salvador - Bahia}
100021049     {Abu Dhabi, United Arab Emirates}
1000219934                          {Argentina}
1000223816                   {Spiro, Oklahoma }
100028313                   {on the road again}
1000306848                             {Earth }
1000309268          {Vicente Lopez,  Argentina}
1000310990                              {vic!¡}
1000328197              {somewhere performing }
1000334659     {#LiveLongChris,Rest Easy Meeka}
100038386                     {Granada, España}
1000385274     {Where my feet r 