In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

import geopandas as gpd
import shapely
from shapely.geometry import Point, Polygon, LineString

import json
import time

## data prep (1) edgelist w/ home-home distance
output : geo_edgelist_top50.csv.gz

In [11]:
# follower network -- data
edgelist = pd.read_csv('../data/usageousers_city_follower_networks.rpt.gz')

# read in data about individuals -- data2
userinfo = pd.read_csv('../data/usageousers_data_export_with_tract_geoid_top50.csv.gz', index_col=0)
user_geo = userinfo.loc[:,["user_id", "lat_home", "lon_home", "lat_work", "lon_work"]]

In [12]:
# edgelist with geoinfo
edgelist = pd.merge(pd.merge(edgelist, user_geo, left_on="user_id1", right_on="user_id", how="left"),\
               user_geo, left_on="user_id2", right_on="user_id", how="left", suffixes=("1", "2"))

# remove duplicate columns
edgelist = edgelist.loc[:,~edgelist.columns.duplicated()]

In [13]:
# DISTANCE - home-home part

start_time = time.time()

# geometry cols
edgelist["geometry1"] = edgelist.apply(lambda r: Point(r["lon_home1"],r["lat_home1"]), axis=1)
edgelist["geometry2"] = edgelist.apply(lambda r: Point(r["lon_home2"],r["lat_home2"]), axis=1)

geo_edgelist = gpd.GeoDataFrame(edgelist)

# change crs
geo_edgelist = geo_edgelist.set_geometry('geometry2')
geo_edgelist.crs = {'init': 'epsg:4326'}
geo_edgelist = geo_edgelist.to_crs({'init': 'epsg:3857'})

geo_edgelist = geo_edgelist.set_geometry('geometry1')
geo_edgelist.crs = {'init': 'epsg:4326'}
geo_edgelist = geo_edgelist.to_crs({'init': 'epsg:3857'})

# set geometry
geo_edgelist = geo_edgelist.set_geometry('geometry1')

# home-home distance calculation
geo_edgelist['dist_hh'] = geo_edgelist['geometry1'].distance(geo_edgelist['geometry2'])

print("--- %s seconds ---" % (time.time() - start_time))

  return _prepare_from_string(" ".join(pjargs))


--- 390.20663619041443 seconds ---


In [5]:
# DISTANCE - work-work part

start_time = time.time()

# geometry cols
edgelist["geometry1"] = edgelist.apply(lambda r: Point(r["lon_work1"],r["lat_work1"]), axis=1)
edgelist["geometry2"] = edgelist.apply(lambda r: Point(r["lon_work2"],r["lat_work2"]), axis=1)

geo_edgelist2 = gpd.GeoDataFrame(edgelist)

# change crs
geo_edgelist2 = geo_edgelist2.set_geometry('geometry2')
geo_edgelist2.crs = {'init': 'epsg:4326'}
geo_edgelist2 = geo_edgelist2.to_crs({'init': 'epsg:3857'})

geo_edgelist2 = geo_edgelist2.set_geometry('geometry1')
geo_edgelist2.crs = {'init': 'epsg:4326'}
geo_edgelist2 = geo_edgelist2.to_crs({'init': 'epsg:3857'})

# set geometry
geo_edgelist2 = geo_edgelist2.set_geometry('geometry1')

# home-home distance calculation
geo_edgelist2['dist_ww'] = geo_edgelist2['geometry1'].distance(geo_edgelist2['geometry2'])

print("--- %s seconds ---" % (time.time() - start_time))

  return _prepare_from_string(" ".join(pjargs))


--- 428.23562598228455 seconds ---


In [18]:
# combine..
temp = geo_edgelist2.loc[:,["cbsacode", "user_id1", "user_id2", "dist_ww"]]
geo_edgelist = pd.merge(geo_edgelist, temp, on=["cbsacode", "user_id1", "user_id2"], how="left")

In [22]:
# at this point, save the data!!
export = geo_edgelist.drop(columns=['geometry1', 'geometry2'])
export.to_csv("../data/geo_edgelist_top50.csv.gz", compression="gzip", index=False)

## data prep (2) degree around home and tract info
output : degree_tab_top50.csv.gz

In [45]:
# census data
census = pd.read_csv('../data/censusdata_top50_2012.csv')

# short names for cbsas
names = pd.read_csv('../data/cbsacode_shortname_tracts.csv', sep = ";", index_col = 0)

In [46]:
# function to create tract geoids
def create_geoid(row):
    state = str(int(row["state"])).zfill(2)
    county = str(int(row["county"])).zfill(3)
    tract = str(int(row["tract"])).zfill(6)
    return "14000US" +state+county+tract

census['geoid'] = census.apply(create_geoid,axis=1)

# add names
census = pd.merge(census, names, on='geoid')

In [47]:
# geojson data, converted to geopandas dataframe
tract_geoms = gpd.GeoDataFrame.from_features(
    [json.loads(e.strip('\n')) for e in open('../data/censustract_geoms_top50.geojson').readlines()]
)

# change projection
init_crs = 4326 # lon,lat
project_crs = 3857 # Cartesian systems
tract_geoms.crs = {'init': 'epsg:' + str(init_crs)}

  return _prepare_from_string(" ".join(pjargs))


In [48]:
# merge census data and geometries of tracts
tract_data = pd.merge(census[['geoid', 'cbsacode', 'short_name', 'population', 'education_bachelor', 'income']],\
    tract_geoms[['geometry', 'full_geoid']],\
    left_on='geoid', right_on='full_geoid', how='left')

# drop those tracts where income < $1000
tract_data = tract_data[(tract_data['income']>1000)]

# median income by cbsacode
tract_data['income_median'] = tract_data['cbsacode'].map(tract_data.groupby('cbsacode')['income'].median().to_dict())
poor = (tract_data['income'] < tract_data["income_median"])
tract_data['poor'] = poor.astype(int).fillna(0)

In [49]:
# variable manipulation
tract_data['log_income'] = np.log(tract_data['income'])
tract_data['log_population'] = np.log(tract_data['population'])
tract_data['BA_share'] = tract_data['education_bachelor']/ tract_data['population']

In [50]:
# read in data about individuals -- userinfo - filter on geoinfo -- user_geo
userinfo = pd.read_csv('../data/usageousers_top50_common_data.csv.gz', index_col=0)
user_geo = userinfo.loc[:,["user_id", "lat_home", "lon_home"]]

# geometry col
user_geo["geometry_h"] = user_geo.apply(lambda r: Point(r["lon_home"],r["lat_home"]), axis=1)

# edgelist with distance
geo_edgelist = pd.read_csv("../data/geo_edgelist_top50.csv.gz")

# filter on important columns
geo_edgelist = geo_edgelist[["user_id1", "user_id2", "dist_hh" ]]

In [51]:
# bin distances
binsize=500
geo_edgelist["dh_bin"] = binsize * (geo_edgelist["dist_hh"] / binsize).map(int) + 500

In [52]:
# degree table -- NOTE -- all ties are mutual
degree_tab = geo_edgelist.groupby("user_id1")["user_id2"].count().reset_index()
degree_tab.columns = ["user_id", "degree"]

In [53]:
# remove users with less than 10 ties (!!!)
degree_tab = degree_tab[degree_tab["degree"] >= 10] 

In [54]:
# loop -- degree at distance around HOME
for d in list(range(500, 10500, 500)):
    
    temp = geo_edgelist.loc[geo_edgelist.dh_bin == d,:].groupby("user_id1")["user_id2"].count().reset_index()
    temp.columns = ["user_id", ("d"+str(d))]
    
    degree_tab = pd.merge(degree_tab, temp, on="user_id", how="left")

# replace NA with 0
degree_tab = degree_tab.fillna(0)

In [55]:
# loop -- cummulative degree around HOME
for d in list(range(500, 10500, 500)):
    
    temp = geo_edgelist.loc[geo_edgelist.dh_bin <= d,:].groupby("user_id1")["user_id2"].count().reset_index()
    temp.columns = ["user_id", ("dcum"+str(d))]
    
    degree_tab = pd.merge(degree_tab, temp, on="user_id", how="left")

# replace NA with 0
degree_tab = degree_tab.fillna(0)

In [56]:
# get the share of each degree
for c in list(degree_tab.columns[22:,]):    
    degree_tab[str(c) + "_share"] = round((degree_tab[str(c)] / degree_tab["degree"]), 3)

In [57]:
# add user geoinfo
degree_tab = pd.merge(degree_tab, user_geo, on="user_id", how="left")
degree_tab = gpd.GeoDataFrame(degree_tab)
degree_tab = degree_tab.set_geometry('geometry_h')
degree_tab.crs = {'init': 'epsg:' + str(init_crs)}

  return _prepare_from_string(" ".join(pjargs))


In [58]:
# spatial join
tract_data = gpd.GeoDataFrame(tract_data)
tract_data = tract_data.set_geometry('geometry')
degree_tab = gpd.sjoin(degree_tab, tract_data[['geometry', 'poor', 'cbsacode', 'short_name']], 'left', 'within')

In [61]:
# clean up
degree_tab = degree_tab.dropna(subset=["short_name"])   # here we lost around 579 / 86.821 users
degree_tab.drop(columns=["index_right", "geometry_h"], inplace=True)

# column reorder
cols = degree_tab.columns.tolist()
dcols = cols[1:62]
newcols = ["user_id", "cbsacode", "short_name", "poor", "lat_home", "lon_home"] + dcols
degree_tab = degree_tab[newcols]

In [83]:
# at this point, save the data!!
degree_tab.to_csv("../data/degree_tab_top50.csv.gz", compression="gzip", index=False)

## data prep (3) local clustering around home
output : clust_tab_top50.csv.gz

In [27]:
# reopen degree_tab
degree_tab = pd.read_csv("../data/degree_tab_top50.csv.gz")
users = list(set(list(degree_tab["user_id"])))

# edgelist with distance
geo_edgelist = pd.read_csv("../data/geo_edgelist_top50.csv.gz")
geo_edgelist = geo_edgelist[["user_id1", "user_id2", "dist_hh" ]]

In [38]:
# clustering around HOME
start_time = time.time()

temp_list = []

for user in users:
    time.sleep(.005)
    r = {}
    r['user_id'] = user
    for dist in [1000, 5000, 10000]:
        # get third edges
        lista = list(geo_edgelist[(geo_edgelist['user_id1'] == user) & (geo_edgelist['dist_hh'] <= dist)]['user_id2'])
        if len(lista) < 2:   # in case the user does not have at least 2 connections
            r['clust' + str(dist)] = np.nan
        else:
            a = geo_edgelist[geo_edgelist['user_id1'].isin(lista)]
            b = a[a['user_id2'].isin(lista)][['user_id1', 'user_id2']]
            # get df with ego network and below the third edges
            c = pd.concat([geo_edgelist[geo_edgelist['user_id1'] == user][['user_id1', 'user_id2']], b], ignore_index = True)

            G = nx.from_pandas_edgelist(c, 'user_id1', 'user_id2')
            #r['clust' + str(dist)] = nx.average_clustering(G)
            r['clust' + str(dist)] = nx.transitivity(G)
    temp_list.append(r)


clust_table = pd.DataFrame(temp_list)

print("--- %s seconds ---" % (time.time() - start_time))

--- 9690.72845506668 seconds ---


In [39]:
# at this point, save the data!!
clust_table.to_csv("../data/clust_tab_top50.csv.gz", compression="gzip", index=False)

## data prep (4) supported ties around home
output : support_tab_top50.csv.gz

In [2]:
# reopen degree_tab
degree_tab = pd.read_csv("../data/degree_tab_top50.csv.gz")
users = list(set(list(degree_tab["user_id"])))

# edgelist with distance
geo_edgelist = pd.read_csv("../data/geo_edgelist_top50.csv.gz")
geo_edgelist = geo_edgelist[["user_id1", "user_id2", "dist_hh" ]]

In [3]:
# supported ties around HOME
# NOTE -- this process only works in case ties are present both ways
start_time = time.time()

temp_list = []

for user in users:
    r = {}
    r['user_id'] = user
    for dist in [1000, 5000, 10000]:
        # get third edges
        lista = list(geo_edgelist[(geo_edgelist['user_id1'] == user) & (geo_edgelist['dist_hh'] <= dist)]['user_id2'])
        if len(lista) < 2:
            r['support' + str(dist)] = np.nan
        else:
            a = geo_edgelist[geo_edgelist['user_id1'].isin(lista)]
            b = a[a['user_id2'].isin(lista)][['user_id1', 'user_id2']]
                       
            r['support' + str(dist)] = len(b['user_id1'].unique()) / len(lista) 
    temp_list.append(r)


support_table = pd.DataFrame(temp_list)

print("--- %s seconds ---" % (time.time() - start_time))

--- 3636.065819978714 seconds ---


In [5]:
# at this point, save the data!!
support_table.to_csv("../data/supp_tab_top50.csv.gz", compression="gzip", index=False)

## data prep (5) census data for regression
output : census_for_regression.csv.gz

In [9]:
# census data
census = pd.read_csv('../data/censusdata_top50_2012.csv')

# short names for cbsas
names = pd.read_csv('../data/cbsacode_shortname_tracts.csv', sep = ";", index_col = 0)

# home tract of users
userinfo = pd.read_csv('../data/usageousers_data_export_with_tract_geoid_top50.csv.gz', index_col=0)
userinfo = userinfo.loc[:,["user_id", "cbsacode", "tract_home"]]

In [10]:
# function to create tract geoids
def create_geoid(row):
    state = str(int(row["state"])).zfill(2)
    county = str(int(row["county"])).zfill(3)
    tract = str(int(row["tract"])).zfill(6)
    return "14000US" +state+county+tract

census['geoid'] = census.apply(create_geoid,axis=1)

# add names
census = pd.merge(census, names, on='geoid')

# keep the key columns
census = census[["geoid", "cbsacode", "short_name", "population", "education_bachelor", "income"]]

In [12]:
# combine info on users
full_userinfo = pd.merge(userinfo, census, left_on=["cbsacode", "tract_home"], right_on=["cbsacode", "geoid"], how="left")

In [17]:
# export
full_userinfo.to_csv("../data/census_for_regression.csv.gz", index=False)

## data prep (6) income of friends
output : edgelist_tractinfo.csv.gz

In [2]:
# census data
census = pd.read_csv('../data/censusdata_top50_2012.csv')

# short names for cbsas
names = pd.read_csv('../data/cbsacode_shortname_tracts.csv', sep = ";", index_col = 0)

In [3]:
# function to create tract geoids
def create_geoid(row):
    state = str(int(row["state"])).zfill(2)
    county = str(int(row["county"])).zfill(3)
    tract = str(int(row["tract"])).zfill(6)
    return "14000US" +state+county+tract

census['geoid'] = census.apply(create_geoid,axis=1)

# add names
census = pd.merge(census, names, on='geoid')

In [4]:
# geojson data, converted to geopandas dataframe
tract_geoms = gpd.GeoDataFrame.from_features(
    [json.loads(e.strip('\n')) for e in open('../data/censustract_geoms_top50.geojson').readlines()]
)

# change projection
init_crs = 4326 # lon,lat
project_crs = 3857 # Cartesian systems
tract_geoms.crs = {'init': 'epsg:' + str(init_crs)}

  return _prepare_from_string(" ".join(pjargs))


In [5]:
# merge census data and geometries of tracts
tract_data = pd.merge(census[['geoid', 'cbsacode', 'short_name', 'population', 'education_bachelor', 'income']],\
    tract_geoms[['geometry', 'full_geoid']],\
    left_on='geoid', right_on='full_geoid', how='left')

# drop those tracts where income < $1000
tract_data = tract_data[(tract_data['income']>1000)]

# median income by cbsacode
tract_data['income_median'] = tract_data['cbsacode'].map(tract_data.groupby('cbsacode')['income'].median().to_dict())
poor = (tract_data['income'] < tract_data["income_median"])
tract_data['poor'] = poor.astype(int).fillna(0)

In [7]:
# edgelist with distance
geo_edgelist = pd.read_csv("../data/geo_edgelist_top50.csv.gz")

# filter on important columns
geo_edgelist = geo_edgelist[["cbsacode", "user_id1", "user_id2", "lon_home1", "lat_home1", "lon_home2", "lat_home2", "dist_hh"]]

# create geometry
geo_edgelist["home_point1"] = geo_edgelist.apply(lambda r: Point(r["lon_home1"],r["lat_home1"]), axis=1)
geo_edgelist["home_point2"] = geo_edgelist.apply(lambda r: Point(r["lon_home2"],r["lat_home2"]), axis=1)
geo_edgelist = gpd.GeoDataFrame(geo_edgelist)

In [8]:
geo_edgelist.shape

(2711185, 10)

In [9]:
# spatial join 1
tract_data = gpd.GeoDataFrame(tract_data)
tract_data = tract_data.set_geometry('geometry')

geo_edgelist = geo_edgelist.set_geometry('home_point1')
geo_edgelist.crs = {'init': 'epsg:' + str(init_crs)}

geo_edgelist = gpd.sjoin(geo_edgelist, tract_data, 'left', 'within')

  return _prepare_from_string(" ".join(pjargs))


In [11]:
# clean up -- 1
geo_edgelist = geo_edgelist[["user_id1", "user_id2", "dist_hh", "home_point2", "geoid", "population", "education_bachelor", "income", "income_median", "poor"]]
geo_edgelist.columns = ["user_id1", "user_id2", "dist_hh", "home_point2", "tract_id1", "population1", "education_bachelor1", "income1", "income_median1", "poor1"]

In [13]:
# spatial join 2
tract_data = gpd.GeoDataFrame(tract_data)
tract_data = tract_data.set_geometry('geometry')

geo_edgelist = geo_edgelist.set_geometry('home_point2')
geo_edgelist.crs = {'init': 'epsg:' + str(init_crs)}

geo_edgelist = gpd.sjoin(geo_edgelist, tract_data, 'left', 'within')

  return _prepare_from_string(" ".join(pjargs))


In [15]:
# clean up -- 2
geo_edgelist = geo_edgelist[["cbsacode", "short_name", "user_id1", "user_id2", "dist_hh", "tract_id1", "population1", "education_bachelor1", "income1", "income_median1", "poor1", "geoid", "population", "education_bachelor", "income", "income_median", "poor"]]
geo_edgelist = geo_edgelist.rename(columns={"geoid": "tract_id2", "population": "population2", "education_bachelor": "education_bachelor2", "income": "income2", "income_median": "income_median2", "poor": "poor2"})

# sort
geo_edgelist = geo_edgelist.sort_values(by=['user_id1', 'user_id2'])

In [16]:
# degree table
degree_tab = pd.read_csv("../data/degree_tab_top50.csv.gz")

# filter geo_edgelist by degree_tab users
dfilter = list(set(list(degree_tab.user_id)))

In [17]:
# filter and keep relevant info
geo_edgelist = geo_edgelist[geo_edgelist["user_id1"].isin(dfilter)]
geo_edgelist = geo_edgelist.dropna(subset=['income2'])

In [19]:
# export
geo_edgelist.to_csv("../data/edgelist_tractinfo.csv.gz", index=False)