In [1]:
import os
import socket
import numpy as np
import pandas as pd
from glob import glob
from timeit import default_timer as timer
from shapely.geometry import Polygon, Point
import folium 

In [2]:
# Pick the data directories from the machine the notebook is running on:
# a host whose name contains 'samuel' reads the repository-relative tree,
# any other host reads the /scratch tree.
hostname = socket.gethostname()
print('Hostname:', hostname)

local_paths = (
    '../data/decahose/parsed/users',
    '../data/decahose/parsed/locations',
)
cluster_paths = (
    '/scratch/spf248/twitter/data/decahose/parsed/users',
    '/scratch/spf248/twitter/data/decahose/parsed/locations',
)
path_to_users, path_to_locations = (
    local_paths if 'samuel' in hostname.lower() else cluster_paths
)

Hostname: c41-04


In [3]:
print('Import Users By Account Locations')
start = timer()

# Read every JSON-lines shard. A shard that fails to parse is reported and
# skipped so that the remaining shards are still loaded.
shard_pattern = os.path.join(path_to_users, 'user-ids-by-account-location-verified-json/*.json')
l = []
for filename in sorted(glob(shard_pattern)):
    try:
        df = pd.read_json(filename, lines=True)
        l.append(df)
    except Exception as err:
        # `Exception` rather than a bare `except`, so that interrupting the
        # kernel (KeyboardInterrupt) still stops this long-running loop.
        print('error importing', filename, '-', repr(err))

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when nothing could be read.
if not l:
    raise ValueError('no user shards could be read from ' + shard_pattern)

users_by_account_location = pd.concat(l, axis=0, ignore_index=True)
users_by_account_location = users_by_account_location.set_index('user_location')['user_id']

# Each 'user_id' entry is evaluated from its string form and only its length
# is kept, so no intermediate per-user list of strings is materialised.
# NOTE(review): `eval` executes arbitrary expressions found in the files; this
# is only acceptable while the shards are trusted, project-generated data.
# Consider `ast.literal_eval` if the entries are plain literals.
users_by_account_location = users_by_account_location.apply(lambda ids: len(eval(ids)))

print('# Locations:', len(users_by_account_location))
print('# Users Total:', users_by_account_location.sum())

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import Users By Account Locations
# Locations: 39779
# Users Total: 92088032
Computing Time: 196 sec


In [6]:
print('Merge With Identified Locations:')
start = timer()

# Load the identified account locations and keep only the columns used below.
account_locations_identified = pd.read_pickle(
    os.path.join(path_to_locations, 'account-locations-identified.pkl'))

account_locations_identified = account_locations_identified[
    ['LOCATION', 'latitude', 'longitude', 'locality_long']].copy()

# Attach the per-location user counts, then aggregate to one row per
# 'locality_long': user counts are summed, coordinates are taken from the
# first row of each group. The result is sorted by descending user count.
users_by_city = (
    pd.merge(
        account_locations_identified,
        users_by_account_location.reset_index().rename(columns={'user_location': 'LOCATION'}),
        on='LOCATION',  # the only shared column; stated explicitly
    )
    # `columns=` replaces the positional axis argument (`drop([...], 1)`),
    # which recent pandas releases no longer accept.
    .drop(columns=['LOCATION'])
    .dropna()
    .groupby('locality_long', as_index=False)
    .agg({'latitude': 'first', 'longitude': 'first', 'user_id': 'sum'})
    .rename(columns={'user_id': 'n_users'})
    .sort_values(by='n_users', ascending=False)
    .reset_index(drop=True)
)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Merge With Identified Locations:
Computing Time: 0 sec


In [7]:
# users_by_city holds one row per city, so its length is the number of cities.
print("# Identified Cities:", len(users_by_city))

# Identified Cities: 7700


In [8]:
# Total number of users across all identified cities.
print(
    "# Users Identified At the City Level:",
    users_by_city.loc[:, 'n_users'].sum(),
)

# Users Identified At the City Level: 57067437


In [11]:
print('Map Twitter Users Per City:')
start = timer()

# World-level base map.
map_city = folium.Map(
    location=[20, 0],
    zoom_start=2,
    tiles='cartodbpositron',
)

# One circle per city; the radius grows with the square root of the user
# count, scaled by 0.01.
for city, lat, lon, n_users in zip(
        users_by_city['locality_long'],
        users_by_city['latitude'],
        users_by_city['longitude'],
        users_by_city['n_users']):
    folium.CircleMarker(
        [lat, lon],
        radius=0.01*np.sqrt(n_users),
        # The hover text must be a Tooltip (or plain string): a Popup passed
        # as `tooltip` is stringified, which shows the object's repr instead
        # of the label.
        # NOTE(review): the label is handed to the tooltip unescaped; escape
        # it if city names can contain HTML markup.
        tooltip=folium.Tooltip(
            city.title() + ' (' + '{:,}'.format(n_users) + ' Users)'),
        # Full colour name: the matplotlib-style shorthand 'b' is not a valid
        # CSS colour for the rendered map.
        color='blue',
        fill=True,
        fill_opacity=0.5,
    ).add_to(map_city)

end = timer()
print('Computing Time:', round(end - start), 'sec')

In [None]:
# Create the output directory if it is missing, so the save does not fail
# after the map has already been built.
os.makedirs('../fig', exist_ok=True)
map_city.save('../fig/map-twitter-users-per-city.html')