In [6]:
!pip install fiona
!pip install shapely
!pip install haversine

Collecting haversine
  Downloading haversine-2.5.1-py2.py3-none-any.whl (6.1 kB)
Installing collected packages: haversine
Successfully installed haversine-2.5.1


In [7]:
%matplotlib inline
import sys, os, time, math, csv
import itertools
import collections

import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import fiona
import shapely.geometry
import scipy.spatial
import haversine

from MigrationData import IRSMigrationData

In [8]:
# Years covered by this notebook: 2004 through 2014 inclusive
# (the stop value of range() is exclusive).
years = range(2004, 2015)

# Project-local loader (imported from MigrationData), pointed at the raw
# migration data directory.
migration_data = IRSMigrationData(data_dir="data/raw/migration/")

## Calculate largest county intersection from migration data

In [12]:
# NOTE(review): `new_fips_sets` is not assigned in any cell visible in this
# export, so this cell fails on a fresh kernel unless the cell that builds it
# is restored. Presumably it holds one set of FIPS codes per year -- confirm.
new_fips_sets

[{'51009',
  '08019',
  '47023',
  '48381',
  '30095',
  '17197',
  '31155',
  '54007',
  '53035',
  '23021',
  '48317',
  '35009',
  '18145',
  '39071',
  '51109',
  '21043',
  '42055',
  '16059',
  '24011',
  '05023',
  '19091',
  '48271',
  '32005',
  '47061',
  '29103',
  '29099',
  '08035',
  '37007',
  '28103',
  '13243',
  '37033',
  '53023',
  '24003',
  '48235',
  '54025',
  '20039',
  '48269',
  '06065',
  '06041',
  '37037',
  '13021',
  '27023',
  '21025',
  '36103',
  '08089',
  '51700',
  '45065',
  '28007',
  '35045',
  '55035',
  '27077',
  '37169',
  '19033',
  '48129',
  '08093',
  '46037',
  '51063',
  '51520',
  '17073',
  '13269',
  '54079',
  '47017',
  '27103',
  '29209',
  '22031',
  '47021',
  '29189',
  '48343',
  '31103',
  '01015',
  '17089',
  '48435',
  '49053',
  '31097',
  '37147',
  '01093',
  '42003',
  '29107',
  '48011',
  '17099',
  '20129',
  '39015',
  '35039',
  '48191',
  '35049',
  '42017',
  '17145',
  '29075',
  '29151',
  '20065',
  '29021',

In [13]:
# Keep only the FIPS codes that are present in every entry of `new_fips_sets`.
joined_set = set(new_fips_sets[0]).intersection(*new_fips_sets[1:])
print("Total of %d locations in continental states that are common to all years of data." % (len(joined_set)))

# Map each code to its integer value, then order the codes by that value.
joined_list = {code: int(code) for code in joined_set}
sorted_joined_list = sorted(joined_list, key=lambda code: joined_list[code])

Total of 3106 locations in continental states that are common to all years of data.


In [33]:
# Display the FIPS codes shared by all years, in ascending numeric order.
sorted_joined_list

['01001',
 '01003',
 '01005',
 '01007',
 '01009',
 '01011',
 '01013',
 '01015',
 '01017',
 '01019',
 '01021',
 '01023',
 '01025',
 '01027',
 '01029',
 '01031',
 '01033',
 '01035',
 '01037',
 '01039',
 '01041',
 '01043',
 '01045',
 '01047',
 '01049',
 '01051',
 '01053',
 '01055',
 '01057',
 '01059',
 '01061',
 '01063',
 '01065',
 '01067',
 '01069',
 '01071',
 '01073',
 '01075',
 '01077',
 '01079',
 '01081',
 '01083',
 '01085',
 '01087',
 '01089',
 '01091',
 '01093',
 '01095',
 '01097',
 '01099',
 '01101',
 '01103',
 '01105',
 '01107',
 '01109',
 '01111',
 '01113',
 '01115',
 '01117',
 '01119',
 '01121',
 '01123',
 '01125',
 '01127',
 '01129',
 '01131',
 '01133',
 '04001',
 '04003',
 '04005',
 '04007',
 '04009',
 '04011',
 '04012',
 '04013',
 '04015',
 '04017',
 '04019',
 '04021',
 '04023',
 '04025',
 '04027',
 '05001',
 '05003',
 '05005',
 '05007',
 '05009',
 '05011',
 '05013',
 '05015',
 '05017',
 '05019',
 '05021',
 '05023',
 '05025',
 '05027',
 '05029',
 '05031',
 '05033',
 '05035',


In [14]:
# Persist the sorted FIPS codes, one code per line (no trailing newline).
output_fn = "data/processed/county_intersection_list_2004_2014.txt"
# The context manager guarantees the file is closed even if the write raises.
with open(output_fn, "w") as f:
    f.write("\n".join(sorted_joined_list))

## Calculate distance matrix

In [15]:
# Build one (geoid, lon, lat) centroid record for every county in the boundary
# shapefile whose GEOID is in `joined_set`; other GEOIDs are reported and skipped.
used_geoids = set()
data = []
# The context manager closes the shapefile even if a record fails to parse.
with fiona.open("data/intermediate/boundary_shapefiles/cb_2015_us_county_500k.shp", "r") as f:
    for row in f:
        geoid = row["properties"]["GEOID"]
        if geoid in joined_set:
            used_geoids.add(geoid)
            geom = shapely.geometry.shape(row['geometry'])
            # Compute the centroid once and reuse it for both coordinates.
            centroid = geom.centroid
            lon, lat = centroid.x, centroid.y
            data.append((geoid, lon, lat))
        else:
            print("GEOID %s not in accepted list" % (geoid))

# report whether we matched all geoids in the input list
missing_geoids = joined_set - used_geoids
assert len(missing_geoids) == 0, "%d GEOIDs in joined_set have no shapefile record" % (len(missing_geoids))

GEOID 11001 not in accepted list
GEOID 46102 not in accepted list


In [28]:
# Order the centroid records by GEOID (tuples compare on their first element first).
data = sorted(data)

# The distance matrix is built from `data` while the migration matrices are
# built from `sorted_joined_list`, so both must list the counties in the same order.
assert [record[0] for record in data] == sorted_joined_list, "centroid records are not aligned with sorted_joined_list"

# Nothing is saved here: np.save needs both a target file and an array, so the
# previous `np.save("data/")` call could only raise TypeError. The centroid
# records are written out as CSV in a separate cell.
data[:5]

[('01001', -86.64274411696223, 32.53492120367398),
 ('01003', -87.72257013576561, 30.7274842772312),
 ('01005', -85.39321118511748, 31.869579549806442),
 ('01007', -87.1264751946078, 32.998627980873536),
 ('01009', -86.56737698099167, 33.98087173475772)]

{'51009',
 '08019',
 '47023',
 '48381',
 '30095',
 '17197',
 '31155',
 '54007',
 '23021',
 '53035',
 '48317',
 '35009',
 '18145',
 '39071',
 '51109',
 '21043',
 '42055',
 '16059',
 '24011',
 '19091',
 '05023',
 '48271',
 '47061',
 '32005',
 '29103',
 '29099',
 '08035',
 '37007',
 '28103',
 '13243',
 '37033',
 '53023',
 '24003',
 '48235',
 '54025',
 '20039',
 '48269',
 '06065',
 '06041',
 '37037',
 '13021',
 '27023',
 '21025',
 '36103',
 '08089',
 '45065',
 '51700',
 '28007',
 '35045',
 '55035',
 '27077',
 '37169',
 '19033',
 '48129',
 '08093',
 '46037',
 '51063',
 '51520',
 '17073',
 '13269',
 '47017',
 '54079',
 '27103',
 '29209',
 '22031',
 '47021',
 '29189',
 '48343',
 '31103',
 '01015',
 '17089',
 '48435',
 '49053',
 '31097',
 '37147',
 '01093',
 '42003',
 '29107',
 '48011',
 '17099',
 '20129',
 '39015',
 '35039',
 '48191',
 '35049',
 '42017',
 '17145',
 '29075',
 '29151',
 '20065',
 '29021',
 '54061',
 '17045',
 '37017',
 '41009',
 '26053',
 '54011',
 '48083',
 '47035',
 '47137',


In [17]:
# Write the centroid records as CSV with a header row; coordinates use "%f"
# formatting (six decimal places).
# The context manager guarantees the file is closed even if a write raises.
with open("data/processed/county_centroid_list.csv","w") as f:
    f.write("geoid,lon,lat\n")
    for fipsCode,lon,lat in data:
        f.write("%s,%f,%f\n" % (fipsCode,lon,lat))

In [18]:
# `data` records are (geoid, lon, lat); the points handed to the haversine
# metric are built in (lat, lon) order.
coords = [(lat, lon) for (_geoid, lon, lat) in data]

# Pairwise distance between every pair of county centroids, computed by
# calling haversine.haversine on each pair of points.
# NOTE(review): distances are in haversine's default unit (kilometres per the
# library documentation) -- confirm for the installed version.
pairwise_metric = haversine.haversine
distance_matrix = scipy.spatial.distance.cdist(coords, coords, metric=pairwise_metric)
np.save("data/processed/county_distance_matrix.npy", distance_matrix)

In [31]:
# Sanity check: dimensions of the pairwise distance matrix.
np.shape(distance_matrix)

(3106, 3106)

## Save migration matrices to file

In [19]:
# Export one migration matrix per year, with counties ordered as in `sorted_joined_list`.
for year in years:
    print(year)
    matrix_path = "data/processed/migration/migration_matrix_%d.npy" % (year)
    migration_matrix = migration_data.get_processed_data(year, sorted_joined_list)
    np.save(matrix_path, migration_matrix)

2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014


In [20]:
# Loop variable left over from the export loop; shows the last year processed.
year

2014

In [21]:
# `migration_matrix` is rebound on every iteration of the export loop, so this
# shows the matrix for the last year processed.
migration_matrix

array([[ 42354,      0,      0, ...,      0,      0,      0],
       [     0, 151534,      0, ...,      0,      0,      0],
       [     0,      0,  17277, ...,      0,      0,      0],
       ...,
       [     0,      0,      0, ...,  18079,      0,      0],
       [     0,      0,      0, ...,      0,   6886,      0],
       [     0,      0,      0, ...,      0,      0,   5749]], dtype=int32)