In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString
import time
import json

In [18]:
# input data (limited extract) -- semicolon-separated CSV
df = pd.read_csv(
    filepath_or_buffer="../outputs/geoc_inv_US_2011_2020.csv",
    sep=";",
)

In [19]:
# locations table: one row per distinct (lat, lng) pair; the id is the row
# label of the pair's first occurrence in df, shifted by one
locations = (
    df.loc[:, ["lat", "lng"]]
    .drop_duplicates()
    .assign(location_id=lambda coords: coords.index + 1)
)

In [20]:
# add Point geometry for the spatial join
# points_from_xy builds the whole geometry array in one vectorised call
# (x = longitude, y = latitude) instead of creating Point objects one by one
# in a Python loop.
locations = gpd.GeoDataFrame(
    locations,
    geometry=gpd.points_from_xy(locations["lng"], locations["lat"]),
    crs="EPSG:4326",
)

In [21]:
# census tract geoms: the file is read as one GeoJSON feature per line
# - the context manager closes the file handle (the bare open() leaked it)
# - GeoJSON is UTF-8 by specification, so do not depend on the platform default
# - blank lines (e.g. a trailing newline at EOF) are skipped instead of
#   crashing json.loads
with open(
    '../data/shape_files/censustract_geoms_top50.geojson', encoding="utf-8"
) as geojson_file:
    tract_features = [json.loads(line) for line in geojson_file if line.strip()]

tract_geoms = gpd.GeoDataFrame.from_features(tract_features).set_crs("epsg:4326")

In [22]:
def locations_to_census_tracts(points, tracts):
    """Left spatial join of ``points`` onto ``tracts`` using the "within" test.

    Every row of ``points`` is kept (left join); the result of ``gpd.sjoin``
    is returned unchanged.
    """
    # 3rd / 4th positional arguments of sjoin: join type and spatial test
    return gpd.sjoin(points, tracts, "left", "within")

In [26]:
# tag each location with its census tract, then keep only the rows that
# received a non-null full_geoid
locations = locations_to_census_tracts(locations, tract_geoms).dropna(
    subset=["full_geoid"]
)

In [36]:
# add cbsacode: look up each location's tract (full_geoid) in the
# tract -> CBSA table (geoid)
cbsa = pd.read_csv(
    "../data/cbsacode_shortname_tracts.csv",
    sep=";",
    index_col=0,
)

locations = pd.merge(
    left=locations,
    right=cbsa,
    how="left",
    left_on="full_geoid",
    right_on="geoid",
)

In [43]:
# nodelist: key columns ONLY, with lng exposed as lon
nodelist = (
    locations[["location_id", "lng", "lat", "geoid", "cbsacode", "short_name"]]
    .rename({"lng": "lon"}, axis="columns")
)

In [45]:
# preview the first two nodes
nodelist.head(n=2)

Unnamed: 0,location_id,lon,lat,geoid,cbsacode,short_name
0,2,-96.9584,32.8629,14000US48113014312,19100.0,Dallas
1,3,-88.0135,42.0521,14000US17031770202,16980.0,Chicago


In [46]:
# join location IDs (plus the other location columns) to df by coordinates;
# rows of df without a matching location are kept
df2 = df.merge(locations, how="left", on=["lat", "lng"])

In [47]:
def edgelist_construction(df, key_cols, directed):
    """Create a node-node edgelist from group memberships.

    Two nodes are tied whenever they share a value of the grouping column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the two columns named in ``key_cols``.
        It is not modified.
    key_cols : sequence of two column names
        ``[group_column, node_column]`` -- nodes are paired within each group.
    directed : bool
        If truthy, keep both orderings of every pair of distinct nodes.
        Otherwise keep each pair once, with the smaller node first.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``<node_column>1`` and ``<node_column>2``. A pair appears
        once per group in which it co-occurs.

    Raises
    ------
    ValueError
        If ``key_cols`` does not contain exactly two column names.
    """
    if len(key_cols) != 2:
        raise ValueError("key_cols must be [group_column, node_column]")
    group_col, node_col = key_cols

    # Focus the dataframe. Rows with a missing key are dropped *before* the
    # self-join: pandas matches NaN group keys to each other, and in the
    # directed case `NaN != x` is True, so missing nodes would otherwise leak
    # into the result as bogus edges.
    memberships = df[[group_col, node_col]].dropna().drop_duplicates()

    # self-join on the group column -> every ordered node pair per group
    pairs = pd.merge(memberships, memberships, on=group_col, suffixes=("1", "2"))

    # address the endpoints by name rather than by column position
    source_col, target_col = f"{node_col}1", f"{node_col}2"
    source, target = pairs[source_col], pairs[target_col]

    # directed: drop self-loops only; undirected: keep one ordering per pair
    keep = (source != target) if directed else (source < target)

    return pairs.loc[keep, [source_col, target_col]]

In [66]:
# create the (undirected) edgelist and report the wall-clock time it took
start_time = time.time()
el = edgelist_construction(df2, ["appln_id", "location_id"], directed=False)
print("--- %s seconds ---" % round(time.time() - start_time, 3))

--- 0.227 seconds ---


In [67]:
# add the coordinates to the edgelist
# The nodelist columns are suffixed *before* each join, so every incoming
# column name is already unique (location_id1, lon1, ... / location_id2,
# lon2, ...). Relying on merge suffixes instead recreated location_id1 and
# location_id2 as duplicate columns, which pandas warns about and newer
# releases reject with a MergeError. With unique names there is nothing left
# to de-duplicate afterwards.
for endpoint in ("1", "2"):
    el = pd.merge(
        el,
        nodelist.add_suffix(endpoint),
        on="location_id" + endpoint,
        how="left",
    )

  el = pd.merge(


In [68]:
# focus on ties INSIDE metropolitan areas: both endpoints must carry the same
# short_name (the shape is printed before and after the filter)
print(el.shape)
el = el.loc[el["short_name1"].eq(el["short_name2"])]
print(el.shape)

(707750, 12)
(395580, 12)


In [70]:
# preview the first two intra-metro ties
el.head(n=2)

Unnamed: 0,location_id1,location_id2,lon1,lat1,geoid1,cbsacode1,short_name1,lon2,lat2,geoid2,cbsacode2,short_name2
0,6.0,7.0,-122.055,37.3207,14000US06085507701,41940.0,San Jose,-122.126,37.4042,14000US06085510600,41940.0,San Jose
1,8.0,9.0,-96.9449,32.8138,14000US48113014902,19100.0,Dallas,-96.935,32.8611,14000US48113014206,19100.0,Dallas
