In [1]:
import os
import sys
import numpy as np
import pandas as pd
import pickle
import multiprocessing as mp
from timeit import default_timer as timer
import glob
from functools import partial

In [2]:
version_mentions  = 'v3'

path_to_timelines = './data/timelines/'
path_to_locations = './data/decahose/parsed/locations/'
path_to_keywords  = './data/keywords/'
path_to_mentions  = './data/mentions/'

n_processes = mp.cpu_count()
print('# Processes:', n_processes)
    
if not os.path.exists(path_to_mentions):
    os.mkdir(path_to_mentions)
       
countries = sorted(set([x.split('-')[0].title() for x in os.listdir(path_to_timelines) if '-' in x]))

print('Countries:\n')
print('\n'.join(countries))

# Processes: 4
Countries:

Indonesia
Malaysia
Myanmar
Pakistan
Saudi Arabia
Thailand


In [7]:
all_keywords = {
'Argentina':['perdí mi trabajo','perdi mi trabajo','acabo de perder mi trabajo','me despidieron','acabo de ser despedido'],
'Brazil':['eu perdi meu emprego','acabei de perder meu emprego','fui demitido','acabei de ser demitido'], 
'Chile':['perdí mi trabajo','perdi mi trabajo','acabo de perder mi trabajo','me despidieron','acabo de ser despedido'],
'Colombia':['perdí mi trabajo','perdi mi trabajo','acabo de perder mi trabajo','me despidieron','acabo de ser despedido'],
'United Arab Emirates':['لقد فقدت وظيفتي','تم طردي','أنا فقط حصلت على النار'],
'Kuwait':['لقد فقدت وظيفتي','تم طردي','أنا فقط حصلت على النار'],
'Indonesia':['saya kehilangan pekerjaan','saya baru saja kehilangan pekerjaan','saya dipecat','saya baru saja dipecat'],
'Malaysia':['saya kehilangan pekerjaan saya','saya baru sahaja kehilangan pekerjaan saya','saya dipecat','saya telah dipecat','saya baru sahaja dipecat'],
'Philippines':['nawalan ako ng trabaho','nawala ko lang ang trabaho ko'],
'Qatar':['لقد فقدت وظيفتي','تم طردي','أنا فقط حصلت على النار'],
'Saudi Arabia':['لقد فقدت وظيفتي','تم طردي','أنا فقط حصلت على النار'],
'Thailand':['ฉันตกงาน','ฉันเพิ่งตกงาน','ฉันโดนไล่ออก','ฉันเพิ่งถูกไล่ออก'],
}

# pd.Series(all_keywords).to_csv('./data/keywords/keywords-v3.csv',encoding='utf8')

In [4]:
def get_locations(country):
    
    return pd.concat([
    pd.read_pickle(file) for file in glob.glob(path_to_locations+'locations*')]).loc[country].index

def get_paths_to_timelines(country):
    
    return glob.glob(path_to_timelines+country.lower()+'*'+'/*'+country.lower()+'*.pkl')

def get_mentions(locations,keywords,paths_to_timelines,n_blocks,index_block):
    
    cols = ['TIME','USER LOCATION','TEXT']
    
    timelines = pd.DataFrame()
    
    for path_to_timeline in paths_to_timelines[n_blocks*index_block:n_blocks*(index_block+1)]:
        
        timeline = pd.read_pickle(path_to_timeline)[cols]

        dates = timeline['TIME'].apply(lambda x:x.split())
        timeline['YEAR']  = dates.apply(lambda x:x[-1])
        timeline['MONTH'] = dates.apply(lambda x:x[1])

        # Select Timeline With Identified Locations
        timeline = timeline.loc[timeline['USER LOCATION'].isin(locations)].copy()

        if len(timeline):
            
            timeline['TEXT'] = timeline['TEXT'].apply(lambda x:x.lower().replace('#',''))

            for keyword in keywords:

                timeline[keyword.lower()] = \
                timeline['TEXT'].apply(lambda x:int(keyword.lower() in x))

            timeline['COUNT'] = 1

            timelines = pd.concat([timelines,
            timeline.drop(['TEXT','TIME'],1)],sort=True).groupby(
            ['YEAR','MONTH','USER LOCATION'],as_index=False).sum()
        
    return timelines

In [5]:
def main(country):
   
    print('Country:', country)
    
    locations = get_locations(country)
    print('# Identified Locations:',len(locations))

    keywords = all_keywords[country]
    print('# Keywords:',len(keywords))

    paths_to_timelines = get_paths_to_timelines(country)
    print('# Timelines:',len(paths_to_timelines))
    
    users = [x.split('/')[-1].split('-')[0] for x in paths_to_timelines]
    if len(set(users)) != len(users):
        print('Check Repeated Users...Exit')
        sys.exit(0)
    del users
    
    start = timer()
    
    # Split Timelines By Block
    n_blocks = len(paths_to_timelines)//n_processes + len(paths_to_timelines)%n_processes
    print('# Users by Block:', n_blocks)  

    print('Collect Mentions...')
    with mp.Pool(processes = n_processes) as pool:
        
        partial_mentions = partial(get_mentions, locations, keywords, paths_to_timelines, n_blocks) 
        
        mentions = pd.concat(pool.map(partial_mentions, range(n_processes))).groupby(
        ['YEAR','MONTH','USER LOCATION'],as_index=False).sum()
    
    end = timer()
    print('Computing Time:', round(end - start,2), 'sec')
    
    print('# Obs:', int(mentions.drop(['YEAR','MONTH','USER LOCATION','COUNT'],1).values.sum()))
    print('# Mentions:', mentions['COUNT'].sum())

    print('Save Timelines...')
    start = timer()

    mentions.to_pickle(path_to_mentions+'mentions-'+country.lower()+'-'+version_mentions+'.pkl',compression='xz')
    del mentions
    
    end = timer()   
    print('Computing Time:', round(end - start,2), 'sec')
    print()

In [6]:
print()
try:
    get_ipython().__class__.__name__
    print('Interactive Mode')
    main(countries[1])
except:
    main(sys.argv[1])


Interactive Mode
Country: Malaysia
# Identified Locations: 504
# Keywords: 5
# Timelines: 14
# Users by Block: 5
Collect Mentions...
Computing Time: 0.66 sec
# Obs: 0
# Mentions: 9162
Save Timelines...
Computing Time: 0.02 sec

