## Normalized Region --> NYC (Final Map)

#### NB Setup

In [1]:
# read in packages
%matplotlib inline
import pandas as pd
import json
import geopandas as gpd
import numpy as np
from shapely import wkt
import os
import plotly.express as px

In [2]:
# make display wider
# display/HTML are imported from IPython.display: importing them from
# IPython.core.display is deprecated and warns (or fails) on newer IPython
from IPython.display import display, HTML
display(HTML("<style>.container { width:88% !important; }</style>"))

# allow max rows and columns to be displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# set directory explicitly so the relative data/ and Maps/ paths below resolve.
# The default is the original author's checkout; set the RP_COVID_MIGRATION_DIR
# environment variable to point at a different checkout without editing this cell.
PROJECT_DIR = os.environ.get('RP_COVID_MIGRATION_DIR',
                             r'C:\Users\steve\GitHub\rp-covid-migration')
os.chdir(PROJECT_DIR)

#### Read in year data, clean up

In [4]:
def _load_daily_counts(csv_path):
    """Read one wide daily-count CSV and return it in long (melted) form.

    The CSV is expected to have a ``dest_stco`` column plus one column per
    date. ``dest_stco`` is renamed to ``id`` and its first character is
    stripped (presumably a non-numeric prefix protecting the leading zeros of
    the county code -- TODO confirm against the raw file).

    Parameters
    ----------
    csv_path : str
        Path to the wide CSV.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (str), ``variable`` (datetime64) and ``value``.
    """
    wide = pd.read_csv(csv_path)
    # alter file, convert geoid field to string
    wide = wide.rename(columns={'dest_stco': 'id'})
    wide['id'] = wide['id'].str[1:]
    # every column except the identifier is a date column; select them by name
    # rather than by position so the result does not depend on column order
    date_cols = [col for col in wide.columns if col != 'id']
    # melt to one row per (county, date)
    long = pd.melt(wide, id_vars=['id'], value_vars=date_cols)
    # convert variable to DateTime
    long['variable'] = pd.to_datetime(long['variable'])
    return long


# 2020
dff_20 = _load_daily_counts(r'data/nyc_x_us_county/dest_nycxUS_daily_01-09-2020.csv')
# 2019
dff_19 = _load_daily_counts(r'data/nyc_x_us_county/dest_nycxUS_daily_01-09-2019.csv')

#### GeoJSON, centroid creation

In [5]:
# load the county polygons GeoJSON straight into a GeoDataFrame
counties = gpd.read_file(r'data/shapefiles/UScounties.geojson')
# replace each polygon with its centroid point, then save the result as a file
counties['geometry'] = counties.centroid
counties = counties.set_geometry('geometry')
counties.to_file("data/shapefiles/UScounties_centroid.geojson", driver="GeoJSON")
# expose the centroid coordinates as independent lon/lat columns for KeplerGl
counties = counties.assign(lon=counties['geometry'].x,
                           lat=counties['geometry'].y)
counties.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry,lon,lat
0,21,7,516850,0500000US21007,21007,Ballard,6,639387454,69473325,POINT (-88.99926 37.05848),-88.999256,37.058482
1,21,17,516855,0500000US21017,21017,Bourbon,6,750439351,4829777,POINT (-84.21715 38.20673),-84.217151,38.206735
2,21,31,516862,0500000US21031,21031,Butler,6,1103571974,13943044,POINT (-86.68162 37.20728),-86.681623,37.207285
3,21,65,516879,0500000US21065,21065,Estill,6,655509930,6516335,POINT (-83.96431 37.69244),-83.964311,37.692444
4,21,69,516881,0500000US21069,21069,Fleming,6,902727151,7182793,POINT (-83.69666 38.37012),-83.696656,38.370118


In [6]:
# inner-join the county centroids to each year's daily counts on the county code
# (GEOID on the centroid side, id on the daily-count side)
usa_centroid20, usa_centroid19 = (
    pd.merge(counties, daily_counts, left_on="GEOID", right_on="id", how="inner")
    for daily_counts in (dff_20, dff_19)
)


#### Data refinement, complexities

In [54]:
# NOTE: the NYC counties were already removed from the input files in Excel.
# If they are still present in the data, uncomment the lines below to drop them.

# removing the NYC counties 
# nyc_geoid = ['36005','36047','36061','36081','36085'] # select specifc geoid strings
# usa_centroid20 = usa_centroid20[~usa_centroid20['GEOID'].isin(nyc_geoid)]
# usa_centroid19 = usa_centroid19[~usa_centroid19['GEOID'].isin(nyc_geoid)]

In [7]:
# Aleutians West gives odd geography on the map -- remove it from the analysis.
# Reno County seems like an abnormal sink for pings, so it is removed as well.
excluded_names = ['Aleutians West', 'Reno']

# keep only the rows whose county NAME is not in the exclusion list
usa_centroid20 = usa_centroid20[~usa_centroid20['NAME'].isin(excluded_names)].copy()
usa_centroid19 = usa_centroid19[~usa_centroid19['NAME'].isin(excluded_names)].copy()

# view length of dataframes
print(len(usa_centroid20), len(usa_centroid19), sep='\n')

880636
1173110


#### Create function to save these files

In [8]:
# preview the first rows of the 2020 centroid/daily-count frame
usa_centroid20.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry,lon,lat,id,variable,value
0,21,7,516850,0500000US21007,21007,Ballard,6,639387454,69473325,POINT (-88.99926 37.05848),-88.999256,37.058482,21007,2020-01-01,0.0
1,21,7,516850,0500000US21007,21007,Ballard,6,639387454,69473325,POINT (-88.99926 37.05848),-88.999256,37.058482,21007,2020-01-02,0.0
2,21,7,516850,0500000US21007,21007,Ballard,6,639387454,69473325,POINT (-88.99926 37.05848),-88.999256,37.058482,21007,2020-01-03,0.0
3,21,7,516850,0500000US21007,21007,Ballard,6,639387454,69473325,POINT (-88.99926 37.05848),-88.999256,37.058482,21007,2020-01-04,0.0
4,21,7,516850,0500000US21007,21007,Ballard,6,639387454,69473325,POINT (-88.99926 37.05848),-88.999256,37.058482,21007,2020-01-05,0.0


In [9]:
# output-name suffixes used when saving the per-year files
usa20 = 'usa20'
usa19 = 'usa19'
subplaces = 'subplaces'
# default number of decimals trip counts are rounded to (0 = whole numbers)
decimals = 0


def us_func(df, geo_name, ndigits=None):
    """Clean a long-form daily trips frame and save it as a CSV.

    Steps: add formatted date columns, round the trip counts, keep only rows
    with at least one trip, rename the tooltip columns, and write the result
    to ``data/nycxuscounty/nyc_to_{geo_name}.csv``.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``variable`` (dates or date strings) and ``value``
        (numeric trip counts). A ``NAME`` column, if present, is renamed.
    geo_name : str
        Suffix used in the output file name.
    ndigits : int, optional
        Decimals to round ``value`` to. Defaults to the module-level
        ``decimals`` setting when omitted.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame that was written to disk, with ``Name``, ``Date``
        and ``Trips`` columns in place of ``NAME``, ``date`` and ``value``.
    """
    places = decimals if ndigits is None else ndigits
    # work on a copy so the caller's frame is left untouched
    out = df.copy()
    out['date'] = pd.to_datetime(out['variable']).dt.strftime('%m/%d/%Y')
    out['datetime'] = out['date'].astype(str) + ' 0:00'
    #df_centroid.drop(columns = 'geometry')A
    #df['geom'] = df['geom'].apply(wkt.loads)
    # vectorised rounding (round to nearest whole num with the default setting)
    out['value'] = out['value'].astype(float).round(places)
    # selecting rows based on value being greater than 0; filtering before the
    # integer cast also discards missing values, which cannot be cast to int
    out = out.loc[out['value'] >= 1].copy()
    out['value'] = out['value'].astype(int)  # remove decimal
    #renaming tooltip columns for Kepler.gl hover
    out = out.rename(columns={'NAME': 'Name', 'date': 'Date', 'value': 'Trips'})
    #df_centroid = gpd.GeoDataFrame(df_centroid, geometry='geom')
    out.to_csv(f'data/nycxuscounty/nyc_to_{geo_name}.csv', index=False)
    return out

In [10]:
# run the export for each year (writes data/nycxuscounty/nyc_to_<name>.csv)
us_func(usa_centroid20, usa20)
us_func(usa_centroid19, usa19)
# read the intermediary files back in. The county identifiers are kept as text:
# left to type inference they are parsed as integers and lose their leading
# zeros (e.g. GEOID '01089' would come back as 1089)
id_dtypes = {'GEOID': str, 'id': str}
us20 = pd.read_csv('data/nycxuscounty/nyc_to_usa20.csv', dtype=id_dtypes)
us19 = pd.read_csv('data/nycxuscounty/nyc_to_usa19.csv', dtype=id_dtypes)

In [11]:
# preview the first rows of the 2020 frame
us20.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,Name,LSAD,ALAND,AWATER,geometry,lon,lat,id,variable,Trips,Date,datetime
0,21,7,516850,0500000US21007,21007,Ballard,6,639387454,69473325,POINT (-88.99925588833923 37.05848196169255),-88.999256,37.058482,21007,2020-07-01,1,07/01/2020,07/01/2020 0:00
1,21,7,516850,0500000US21007,21007,Ballard,6,639387454,69473325,POINT (-88.99925588833923 37.05848196169255),-88.999256,37.058482,21007,2020-08-10,1,08/10/2020,08/10/2020 0:00
2,21,7,516850,0500000US21007,21007,Ballard,6,639387454,69473325,POINT (-88.99925588833923 37.05848196169255),-88.999256,37.058482,21007,2020-09-09,1,09/09/2020,09/09/2020 0:00
3,21,17,516855,0500000US21017,21017,Bourbon,6,750439351,4829777,POINT (-84.21715057392247 38.20673452494958),-84.217151,38.206735,21017,2020-01-31,1,01/31/2020,01/31/2020 0:00
4,21,17,516855,0500000US21017,21017,Bourbon,6,750439351,4829777,POINT (-84.21715057392247 38.20673452494958),-84.217151,38.206735,21017,2020-06-03,1,06/03/2020,06/03/2020 0:00


In [12]:
# load the date-alignment table, which accounts for the dates being different for each year
dates = pd.read_csv('data/nycxuscounty/date_align_19-20.csv')
# parse the dates, then format them as mm/dd/YYYY text so they can be joined
# against the existing formatted date field
dates = dates.assign(date=pd.to_datetime(dates['date']).dt.strftime('%m/%d/%Y'))
dates.head()

Unnamed: 0,date,date_viz
0,01/02/2019,01/01
1,01/03/2019,01/02
2,01/04/2019,01/03
3,01/05/2019,01/04
4,01/06/2019,01/05


In [13]:
# attach the aligned date_viz label to every row of each year's frame
# (inner join: alignment table `date` against the formatted `Date` column)
us20 = dates.merge(us20, left_on="date", right_on="Date", how="inner")
us19 = dates.merge(us19, left_on="date", right_on="Date", how="inner")
us20.head()

Unnamed: 0,date,date_viz,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,Name,LSAD,ALAND,AWATER,geometry,lon,lat,id,variable,Trips,Date,datetime
0,01/01/2020,01/01,17,91,424247,0500000US17091,17091,Kankakee,6,1752121058,12440760,POINT (-87.86183129275948 41.13771025937172),-87.861831,41.13771,17091,2020-01-01,1,01/01/2020,01/01/2020 0:00
1,01/01/2020,01/01,17,197,1785190,0500000US17197,17197,Will,6,2164927644,34548925,POINT (-87.97864242148339 41.44504270382512),-87.978642,41.445043,17197,2020-01-01,9,01/01/2020,01/01/2020 0:00
2,01/01/2020,01/01,1,89,161570,0500000US01089,1089,Madison,6,2076091216,28831382,POINT (-86.55022471636647 34.7630904735827),-86.550225,34.76309,1089,2020-01-01,2,01/01/2020,01/01/2020 0:00
3,01/01/2020,01/01,1,117,161584,0500000US01117,1117,Shelby,6,2034112797,62537787,POINT (-86.66066541307325 33.26427760112504),-86.660665,33.264278,1117,2020-01-01,4,01/01/2020,01/01/2020 0:00
4,01/01/2020,01/01,5,11,63757,0500000US05011,5011,Bradley,6,1681514472,9532926,POINT (-92.1623982398049 33.46641993334539),-92.162398,33.46642,5011,2020-01-01,1,01/01/2020,01/01/2020 0:00


In [14]:
# order each year by the aligned date and tag it with its year for the visualization
us19 = us19.sort_values(by='date_viz').assign(Year='2019')
us20 = us20.sort_values(by='date_viz').assign(Year='2020')

# stack both years, drop the repetitive fields (many more could be removed, as
# most are unnecessary), and order the combined frame by the formatted Date
df_us = (
    pd.concat([us19, us20], ignore_index=True)
    .drop(columns=['date', 'datetime'])
    .sort_values(by='Date')
)
df_us.head()

Unnamed: 0,date_viz,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,Name,LSAD,ALAND,AWATER,geometry,lon,lat,id,variable,Trips,Date,Year
253747,01/01,17,91,424247,0500000US17091,17091,Kankakee,6,1752121058,12440760,POINT (-87.86183129275948 41.13771025937172),-87.861831,41.13771,17091,2020-01-01,1,01/01/2020,2020
254346,01/01,17,197,1785190,0500000US17197,17197,Will,6,2164927644,34548925,POINT (-87.97864242148339 41.44504270382512),-87.978642,41.445043,17197,2020-01-01,9,01/01/2020,2020
254347,01/01,1,89,161570,0500000US01089,1089,Madison,6,2076091216,28831382,POINT (-86.55022471636647 34.7630904735827),-86.550225,34.76309,1089,2020-01-01,2,01/01/2020,2020
254348,01/01,1,117,161584,0500000US01117,1117,Shelby,6,2034112797,62537787,POINT (-86.66066541307325 33.26427760112504),-86.660665,33.264278,1117,2020-01-01,4,01/01/2020,2020
254349,01/01,5,11,63757,0500000US05011,5011,Bradley,6,1681514472,9532926,POINT (-92.1623982398049 33.46641993334539),-92.162398,33.46642,5011,2020-01-01,1,01/01/2020,2020


#### Creating our Map

In [15]:
# legend colours per year: blue for 2019, orange for 2020
year_colors = {"2019": "#4472C4", "2020": "#ED7D31"}
# tooltip fields: show name, date and trips; hide the raw coordinates
hover_fields = {'Name': True, 'date_viz': True, 'Trips': True, 'lat': False, 'lon': False}

fig = px.scatter_mapbox(
    df_us,
    lat='lat', lon='lon',            # explicitly pass the lat/long columns
    size='Trips',
    color='Year',                    # colour (and legend entry) changes with the year
    color_discrete_map=year_colors,
    animation_frame='date_viz',      # animate over the aligned dates
    animation_group='Year',
    hover_data=hover_fields,
    labels={'date_viz': 'Date '},    # change name for clarity
    size_max=60,
)

# legend box styling and placement
legend_style = dict(
    title='Year',
    title_font_color='white',
    bgcolor='black',
    bordercolor='gray', borderwidth=1,
    font=dict(family='Arial', color='white'),
    yanchor="top", y=0.25,
    xanchor="left", x=0.02,
)
fig.update_layout(
    mapbox_zoom=3,
    mapbox_center={'lat': 37.09, 'lon': -95.71},
    mapbox_style="carto-darkmatter",
    # change size of visualization
    width=950,
    height=780,
    font=dict(family='Arial', color='black'),
    legend=legend_style,
)
## THIS IS TO BE ADDED IF STANDALONE VIZ, we are placing in HTML, this is not necessary

# TITLE
# fig.add_annotation(text = "Where NYC Residents Traveled<br>in the U.S. (by County)",
#                   align = 'left', x = 0.01, y = 0.95, showarrow = False,
#                   bordercolor = None,
#                   bgcolor = None,
#                   font = dict(family = "Arial", color = 'white', size = 19))

# SUBTITLE
# fig.add_annotation(text = 'January 1 to September 30<br>2019 vs. 2020',
#                   align = 'left', x = 0.01, y = 0.87, showarrow = False,
#                   bordercolor = None,
#                   bgcolor = None,
#                   font = dict(family = "Arial", color = 'white', size = 13))

# EXPLANATORY TEXT: two notes that share styling and differ only in text and height
for note_text, note_y in (('Bubble size = # of Trips', 0.01),
                          ('Click in the legend to toggle years on/off', 0.1)):
    fig.add_annotation(text=note_text,
                       align='right', x=0.02, y=note_y, showarrow=False,
                       bordercolor=None, bgcolor=None,
                       font=dict(family='Arial', color='white', size=12))

# shorten the per-frame duration on the first animation button so playback is faster
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 80
# fig.show()
# save html
fig.write_html(r'Maps/html_maps/NYC_to_UScounty.html',
               auto_open=True,
               auto_play=False,
               include_plotlyjs='cdn')
