# edgelist w/ home-home distance
data preparation for top 50 US metros

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.gridspec as gridspec

import geopandas as gpd
import shapely
from shapely.geometry import Point,Polygon,LineString

import json
import time
from copy import deepcopy

In [11]:
# follower network -- data
edgelist = pd.read_csv('../data/usageousers_city_follower_networks.rpt.gz')

# read in data about individuals -- data2
userinfo = pd.read_csv('../data/usageousers_data_export_with_tract_geoid_top50.csv.gz', index_col=0)
user_geo = userinfo.loc[:,["user_id", "lat_home", "lon_home", "lat_work", "lon_work"]]

In [12]:
# edgelist with geoinfo
edgelist = pd.merge(pd.merge(edgelist, user_geo, left_on="user_id1", right_on="user_id", how="left"),\
               user_geo, left_on="user_id2", right_on="user_id", how="left", suffixes=("1", "2"))

# remove duplicate columns
edgelist = edgelist.loc[:,~edgelist.columns.duplicated()]

In [13]:
# DISTANCE - home-home part

start_time = time.time()

# geometry cols
edgelist["geometry1"] = edgelist.apply(lambda r: Point(r["lon_home1"],r["lat_home1"]), axis=1)
edgelist["geometry2"] = edgelist.apply(lambda r: Point(r["lon_home2"],r["lat_home2"]), axis=1)

geo_edgelist = gpd.GeoDataFrame(edgelist)

# change crs
geo_edgelist = geo_edgelist.set_geometry('geometry2')
geo_edgelist.crs = {'init': 'epsg:4326'}
geo_edgelist = geo_edgelist.to_crs({'init': 'epsg:3857'})

geo_edgelist = geo_edgelist.set_geometry('geometry1')
geo_edgelist.crs = {'init': 'epsg:4326'}
geo_edgelist = geo_edgelist.to_crs({'init': 'epsg:3857'})

# set geometry
geo_edgelist = geo_edgelist.set_geometry('geometry1')

# home-home distance calculation
geo_edgelist['dist_hh'] = geo_edgelist['geometry1'].distance(geo_edgelist['geometry2'])

print("--- %s seconds ---" % (time.time() - start_time))

  return _prepare_from_string(" ".join(pjargs))


--- 390.20663619041443 seconds ---


In [5]:
# DISTANCE - work-work part

start_time = time.time()

# geometry cols
edgelist["geometry1"] = edgelist.apply(lambda r: Point(r["lon_work1"],r["lat_work1"]), axis=1)
edgelist["geometry2"] = edgelist.apply(lambda r: Point(r["lon_work2"],r["lat_work2"]), axis=1)

geo_edgelist2 = gpd.GeoDataFrame(edgelist)

# change crs
geo_edgelist2 = geo_edgelist2.set_geometry('geometry2')
geo_edgelist2.crs = {'init': 'epsg:4326'}
geo_edgelist2 = geo_edgelist2.to_crs({'init': 'epsg:3857'})

geo_edgelist2 = geo_edgelist2.set_geometry('geometry1')
geo_edgelist2.crs = {'init': 'epsg:4326'}
geo_edgelist2 = geo_edgelist2.to_crs({'init': 'epsg:3857'})

# set geometry
geo_edgelist2 = geo_edgelist2.set_geometry('geometry1')

# home-home distance calculation
geo_edgelist2['dist_ww'] = geo_edgelist2['geometry1'].distance(geo_edgelist2['geometry2'])

print("--- %s seconds ---" % (time.time() - start_time))

  return _prepare_from_string(" ".join(pjargs))


--- 428.23562598228455 seconds ---


In [18]:
# combine..
temp = geo_edgelist2.loc[:,["cbsacode", "user_id1", "user_id2", "dist_ww"]]
geo_edgelist = pd.merge(geo_edgelist, temp, on=["cbsacode", "user_id1", "user_id2"], how="left")

In [22]:
# at this point, save the data!!
export = geo_edgelist.drop(columns=['geometry1', 'geometry2'])
export.to_csv("../data/geo_edgelist_top50.csv.gz", compression="gzip", index=False)