### ACS IPUMS NHGIS - Basic Data Preparation

In [1]:
%%time 

# Important library for many geopython libraries
!apt install gdal-bin python-gdal python3-gdal 
# Install rtree - Geopandas requirment
!apt install python3-rtree 
# Install Geopandas
!pip install git+git://github.com/geopandas/geopandas.git
# Install descartes - Geopandas requirment
!pip install descartes 
# Install Folium for Geographic data visualization
!pip install folium
# Install plotlyExpress
!pip install plotly_express

'apt' is not recognized as an internal or external command,
operable program or batch file.
'apt' is not recognized as an internal or external command,
operable program or batch file.


Collecting git+git://github.com/geopandas/geopandas.git
  Cloning git://github.com/geopandas/geopandas.git to c:\users\sheld\appdata\local\temp\pip-req-build-cxykp3tc
Building wheels for collected packages: geopandas
  Building wheel for geopandas (setup.py): started
  Building wheel for geopandas (setup.py): finished with status 'done'
  Created wheel for geopandas: filename=geopandas-0.8.0+98.gefea225-py2.py3-none-any.whl size=981829 sha256=912248583ceb2a6dc4d5cf81a3d772e18b21e483b9d68e3b00c0099b7ed09a4f
  Stored in directory: C:\Users\sheld\AppData\Local\Temp\pip-ephem-wheel-cache-0s05hmba\wheels\cf\3e\0b\6475054094c2b1ea054158ac1fdcf749fb92f5b512377e4cf8
Successfully built geopandas


  Running command git clone -q git://github.com/geopandas/geopandas.git 'C:\Users\sheld\AppData\Local\Temp\pip-req-build-cxykp3tc'


Wall time: 1min 3s


In [1]:
import pandas as pd
import numpy as np
import re
import geopandas as gpd
from shapely.geometry import Point
from shapely.geometry import MultiPolygon
from shapely.geometry import Polygon, LineString
import matplotlib
import matplotlib.pyplot as plt 
import folium
import plotly_express as px

pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)

# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import random

In [3]:
def import_csv(content, encoding='unicode_escape'):
    """Read a CSV into a pandas DataFrame.

    Parameters
    ----------
    content : str, path object or file-like object
        Passed straight to ``pandas.read_csv``.
    encoding : str, default 'unicode_escape'
        Text encoding used to decode the file. The default is unchanged from
        the original hard-coded value so existing calls behave the same.
        Note that 'unicode_escape' turns the bytes of a UTF-8 byte-order mark
        into the three characters 'ï»¿' at the start of the first column
        name; pass ``encoding='utf-8-sig'`` to read such files cleanly.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(content, encoding=encoding)

In [4]:
def rename_variables(dataframe, dictionary):
    """Return a copy of ``dataframe`` with its column labels mapped through ``dictionary``.

    Labels that are not keys of ``dictionary`` are kept as they are; the
    frame passed in is not modified.
    """
    return dataframe.rename(columns=dictionary)

In [5]:
def returnNotMatches(a, b):
    """Return the items of ``a`` that do not occur in ``b``, in their original order.

    Duplicates in ``a`` are preserved. When ``b`` is a list or tuple of
    hashable items it is converted to a set once, so each membership test is
    constant-time instead of a linear scan (the notebook calls this with two
    lists of several hundred thousand (GISJOIN, YEAR) tuples). If either
    sequence holds unhashable items, or ``b`` is some other container, the
    plain ``x not in b`` test is used.
    """
    items = list(a)
    if isinstance(b, (list, tuple)):
        try:
            lookup = set(b)
            return [x for x in items if x not in lookup]
        except TypeError:
            # An unhashable element in `a` or `b`: fall back to the linear scan.
            pass
    return [x for x in items if x not in b]

In [6]:
def strip_dict_keys_values(d):
    """Return a copy of ``d`` with surrounding whitespace removed from string keys and values.

    Nested dicts and lists are processed recursively. Anything that is not a
    string, dict or list (numbers, ``None``, NaN read from a blank CSV cell,
    ...) is passed through unchanged. This includes non-string keys, which
    previously raised ``AttributeError`` on ``key.strip()``.
    """
    def clean(value):
        if isinstance(value, dict):
            return strip_dict_keys_values(value)
        if isinstance(value, list):
            return [clean(item) for item in value]
        if isinstance(value, str):
            return value.strip()
        return value

    # Keys that become equal after stripping collapse into one entry; the
    # later one wins, as in any dict comprehension.
    return {clean(key): clean(value) for key, value in d.items()}

In [7]:
def block_groups(list_bg_data):
    """Read a list of block-group shapefiles and stack them into one frame.

    Parameters
    ----------
    list_bg_data : sequence of str
        Paths readable by ``geopandas.read_file``. Any number of files is
        accepted; the earlier version pre-allocated exactly 11 slots, so it
        raised IndexError for longer lists and passed integer placeholders to
        ``pd.concat`` for shorter ones.

    Returns
    -------
    The row-wise concatenation of all files. ``reset_index()`` is called
    without ``drop=True``, so each row's position within its source file is
    kept in an ``index`` column.
    """
    block_group_shp = [gpd.read_file(path) for path in list_bg_data]

    block_group_con = pd.concat(block_group_shp, axis = 0)
    block_group_con = block_group_con.reset_index()

    return block_group_con

In [8]:
def block_groups_us(bg_data):
    """Read the shapefile at ``bg_data`` with geopandas and return the resulting frame.

    Used in this notebook for the nationwide (all US) block-group files.
    """
    return gpd.read_file(bg_data)

In [9]:
def cpi_adjustment(Year_1, Year_2, data_CPI):
    """Calculate the inflation adjustment factor for dollar-denominated variables.

    Uses the R-CPI-U-RS, All items CPI estimates from
    https://www.bls.gov/cpi/research-series/r-cpi-u-rs-home.htm

    Parameters
    ----------
    Year_1 : year whose dollars are being converted (earlier period).
    Year_2 : year whose dollars are the target (later period).
    data_CPI : pandas.DataFrame
        Must contain a year column and an ``AVG`` column. The year column may
        be named ``YEAR`` or carry a leading byte-order mark, either as
        '\\ufeff' or as the mis-decoded 'ï»¿' that results from reading a
        UTF-8-with-BOM file with the 'unicode_escape' encoding.

    Returns
    -------
    ``AVG[Year_2] / AVG[Year_1]``.

    Raises
    ------
    KeyError
        If no year column is found, or a requested year or ``AVG`` is missing.
    """
    bom_prefixes = ("\xef\xbb\xbf", "\ufeff")

    def _plain_name(label):
        # Column label with any byte-order-mark residue and padding removed.
        if not isinstance(label, str):
            return label
        for prefix in bom_prefixes:
            if label.startswith(prefix):
                label = label[len(prefix):]
        return label.strip()

    year_columns = [col for col in data_CPI.columns if _plain_name(col) == "YEAR"]
    if not year_columns:
        raise KeyError("YEAR")

    CPI_i = data_CPI.set_index(year_columns[0])
    CPI_p2 = CPI_i.at[Year_2, "AVG"]
    CPI_p1 = CPI_i.at[Year_1, "AVG"]
    return CPI_p2 / CPI_p1


In [10]:
# ACS block-group table stored as ACS5_2009 (NHGIS extract nhgis0005, dataset ds195 per the file name).
# NOTE(review): absolute, machine-specific path, and a different parent folder from the 2014/2019 tables.
ACS5_2009 = import_csv(r"C:\Users\sheld\Documents\nhgis0005_csv\nhgis0005_ds195_20095_2009_blck_grp.csv")

In [11]:
# ACS block-group table stored as ACS5_2019 (NHGIS extract nhgis0003, dataset ds244 per the file name).
# NOTE(review): absolute, machine-specific path.
ACS5_2019 = import_csv(r'C:\Users\sheld\Documents\Thesis_Data_Full_United_States\nhgis0003_csv\nhgis0003_ds244_20195_2019_blck_grp.csv')

In [12]:
# ACS block-group table stored as ACS5_2014 (NHGIS extract nhgis0004, dataset ds206 per the file name).
# NOTE(review): absolute, machine-specific path.
ACS5_2014 = import_csv(r'C:\Users\sheld\Documents\Thesis_Data_Full_United_States\nhgis0004_csv\nhgis0004_ds206_20145_2014_blck_grp.csv')

In [13]:
# Variable-name lookup tables, one per ACS vintage; a later cell turns each one into a rename dictionary.
# The commented-out lines are the earlier, non-"_Updated" versions of the same files.
# ACS5_2019_names = import_csv(r'C:\Users\sheld\Documents\Thesis_Data_Full_United_States\nhgis_2019_varaible_names.csv')
# ACS5_2014_names = import_csv(r'C:\Users\sheld\Documents\Thesis_Data_Full_United_States\nhgis_2014_variable_names.csv')
# ACS5_2009_names = import_csv(r"C:\Users\sheld\Documents\nhgis0005_csv\nhgis_2009_variable_names.csv")

# NOTE(review): "varaible" in the 2019 path is the spelling of the file on disk as referenced here — do not "fix" it without renaming the file.
ACS5_2019_names = import_csv(r"C:\Users\sheld\Documents\Thesis_Data_Full_United_States\nhgis_2019_varaible_names_Updated.csv")
ACS5_2014_names = import_csv(r"C:\Users\sheld\Documents\Thesis_Data_Full_United_States\nhgis_2014_variable_names_Updated.csv")
ACS5_2009_names = import_csv(r"C:\Users\sheld\Documents\Thesis_Data_Full_United_States\nhgis_2009_variable_names_Updated.csv")

In [14]:
# CPI table consumed by cpi_adjustment(), which reads its year column and its "AVG" column.
# NOTE(review): absolute, machine-specific path.
CPI = import_csv(r'C:\Users\sheld\Documents\Thesis_Data_Full_United_States\r-cpi-u-rs-allitems (1).csv')

The code below builds the dictionaries used to rename variables from their NHGIS codes to descriptive names.

In [15]:
# ACS5_2019_names = ACS5_2019_names.drop(columns = ['1','2','3','4','5','6','7','8','9'])
# ACS5_2014_names = ACS5_2014_names.drop(columns = ['1','2','3','4','5','6','7','8','9'])

In [16]:
# Turn each lookup table into a {first column: second column} dictionary.
# Each row must hold exactly two values, otherwise a ValueError is raised.
# NOTE(review): presumably the columns are (NHGIS code, descriptive name) — confirm against the CSVs.
dictionary_2019 = {code: label for code, label in ACS5_2019_names.values.tolist()}
dictionary_2014 = {code: label for code, label in ACS5_2014_names.values.tolist()}
dictionary_2009 = {code: label for code, label in ACS5_2009_names.values.tolist()}

In [17]:
# Remove stray whitespace from the keys and values of every rename dictionary.
dictionary_2019_new, dictionary_2014_new, dictionary_2009_new = [
    strip_dict_keys_values(mapping)
    for mapping in (dictionary_2019, dictionary_2014, dictionary_2009)
]

In [18]:
# ACS5_2019_new = rename_variables(ACS5_2019, dictionary_2019_new) #rename variables, variables with ending in M are margins of error. 
# ACS5_2014_new = rename_variables(ACS5_2014, dictionary_2014_new)
# ACS5_2009_new = rename_variables(ACS5_2009, dictionary_2009_new)

In [19]:
# block_groups_2019 = ['/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2019_nhgis0002_shape/nhgis0002_shapefile_tl2019_040_blck_grp_2019.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2019_nhgis0002_shape/nhgis0002_shapefile_tl2019_060_blck_grp_2019.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2019_nhgis0002_shape/nhgis0002_shapefile_tl2019_080_blck_grp_2019.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2019_nhgis0002_shape/nhgis0002_shapefile_tl2019_160_blck_grp_2019.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2019_nhgis0002_shape/nhgis0002_shapefile_tl2019_300_blck_grp_2019.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2019_nhgis0002_shape/nhgis0002_shapefile_tl2019_320_blck_grp_2019.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2019_nhgis0002_shape/nhgis0002_shapefile_tl2019_350_blck_grp_2019.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2019_nhgis0002_shape/nhgis0002_shapefile_tl2019_410_blck_grp_2019.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2019_nhgis0002_shape/nhgis0002_shapefile_tl2019_490_blck_grp_2019.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2019_nhgis0002_shape/nhgis0002_shapefile_tl2019_530_blck_grp_2019.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2019_nhgis0002_shape/nhgis0002_shapefile_tl2019_560_blck_grp_2019.zip']
# block_groups_2014 = ['/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2014_nhgis0001_shape/nhgis0001_shapefile_tl2014_040_blck_grp_2014.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2014_nhgis0001_shape/nhgis0001_shapefile_tl2014_060_blck_grp_2014.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2014_nhgis0001_shape/nhgis0001_shapefile_tl2014_080_blck_grp_2014.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2014_nhgis0001_shape/nhgis0001_shapefile_tl2014_160_blck_grp_2014.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2014_nhgis0001_shape/nhgis0001_shapefile_tl2014_300_blck_grp_2014.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2014_nhgis0001_shape/nhgis0001_shapefile_tl2014_320_blck_grp_2014.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2014_nhgis0001_shape/nhgis0001_shapefile_tl2014_350_blck_grp_2014.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2014_nhgis0001_shape/nhgis0001_shapefile_tl2014_410_blck_grp_2014.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2014_nhgis0001_shape/nhgis0001_shapefile_tl2014_490_blck_grp_2014.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2014_nhgis0001_shape/nhgis0001_shapefile_tl2014_530_blck_grp_2014.zip','/content/drive/MyDrive/Colab Notebooks/ACS_IPUMS_NHGIS_Data/ACS5_2014_nhgis0001_shape/nhgis0001_shapefile_tl2014_560_blck_grp_2014.zip']

In [20]:
# ACS5_2019_block_groups = block_groups(block_groups_2019)
# ACS5_2014_block_groups = block_groups(block_groups_2014)

In [21]:
# Nationwide block-group shapefiles, read directly from the zipped NHGIS downloads (one per ACS vintage).
# NOTE(review): the frame named ACS5_2009_block_groups comes from a "tl2010 ... blck_grp_2010" file — confirm this is the intended geography for the 2005-2009 ACS.
# NOTE(review): absolute, machine-specific paths.
ACS5_2019_block_groups = block_groups_us(r'C:\Users\sheld\Documents\Thesis_Data_Full_United_States\nhgis0003_shape\nhgis0003_shapefile_tl2019_us_blck_grp_2019.zip')
ACS5_2014_block_groups = block_groups_us(r'C:\Users\sheld\Documents\Thesis_Data_Full_United_States\nhgis0004_shape\nhgis0004_shapefile_tl2014_us_blck_grp_2014.zip')
ACS5_2009_block_groups = block_groups_us(r'C:\Users\sheld\Documents\nhgis0006_shape\nhgis0006_shapefile_tl2010_us_blck_grp_2010.zip')

In [22]:
# Join the block-group shapefile attributes onto each ACS table by GISJOIN.
# Left joins keep every ACS row; rows with no matching block group get missing values in the shapefile columns.
ACS5_2019_bg = ACS5_2019.merge(ACS5_2019_block_groups, on=['GISJOIN'], how="left")
ACS5_2014_bg = ACS5_2014.merge(ACS5_2014_block_groups, on=['GISJOIN'], how="left")
# 2009 ACS: about 25% of observations had to be dropped in the minimum-distance calculations.
# Joining to the 2014 block groups instead turned out worse (59,999 missing observations),
# so the 2009 ACS stays with the 2009 block groups.
ACS5_2009_bg = ACS5_2009.merge(ACS5_2009_block_groups, on=['GISJOIN'], how="left")
#ACS5_2009_bg = pd.merge(ACS5_2009, ACS5_2014_block_groups, on = ['GISJOIN'], how = "left") #alternative that was tried: 2014 block groups for the 2009 ACS

Inflation-adjust the dollar-denominated multiyear estimates from the earlier periods (2005-2009 and 2010-2014) so that they can be compared in 2019 dollars. The inflation-adjusted estimate for the earlier period can be expressed as:

\begin{equation}
  \hat{X}_{p1,Adj} = \frac{CPI_{p2}}{CPI_{p1}} \hat{X}_{p1}
\end{equation}

$CPI_{p1}$ - is the All Items CPI-U-RS Annual Average for the last year in the earlier time period (P1). 

$CPI_{p2}$ -  is the All Items CPI-U-RS Annual Average for the last year in the most current time period (P2). 

$\hat{X}_{p1}$ - is the published ACS estimate for the earlier time period (P1).

Documentation on inflation-adjusting dollar estimates can be found [here](https://www.census.gov/content/dam/Census/library/publications/2018/acs/acs_general_handbook_2018_ch10.pdf).

[R-CPI-U-RS from the U.S. Bureau of Labor Statistics](https://www.bls.gov/cpi/research-series/r-cpi-u-rs-home.htm).


In [23]:
# CPI ratios (2019 AVG divided by the earlier year's AVG) used to express 2014 and 2009 dollar amounts in 2019 dollars.
Inflation_Adjustment_2014 = cpi_adjustment(2014, 2019, CPI)
Inflation_Adjustment_2009 = cpi_adjustment(2009, 2019, CPI)

In [24]:
# Inflation-adjust the 2014 dollar variables (i.e. rent and lower, median, upper household value) into 2019 dollars.
# NOTE(review): not idempotent — re-running this cell multiplies by the factor again.
dollar_variables = ["ABIHE001", "ABISE001", "ABITE001", "ABIUE001"]
ACS5_2014_bg[dollar_variables] = ACS5_2014_bg[dollar_variables] * Inflation_Adjustment_2014

In [25]:
# Inflation-adjust the 2009 dollar variables (i.e. rent and lower, median, upper household value) into 2019 dollars.
# NOTE(review): not idempotent — re-running this cell multiplies by the factor again.
dollar_variables = ["RRUE001", "RR6E001", "RR7E001", "RR8E001"]
ACS5_2009_bg[dollar_variables] = ACS5_2009_bg[dollar_variables] * Inflation_Adjustment_2009

In [26]:
# ACS5_2014_bg.to_csv(r"C:\Users\sheld\Documents\Thesis_Data_Full_United_States\merged_data_with_block_groups\ACS5_2014_bg.csv")
# ACS5_2019_bg.to_csv(r"C:\Users\sheld\Documents\Thesis_Data_Full_United_States\merged_data_with_block_groups\ACS5_2019_bg.csv")
# ACS5_2009_bg.to_csv(r"C:\Users\sheld\Documents\Thesis_Data_Full_United_States\merged_data_with_block_groups\ACS5_2009_bg.csv")

In [27]:
# The shapefile joined to the 2009 table names these attributes with a "10" suffix and spells the
# shape columns differently. Rename them to the labels the stacked frame later selects by name
# (ALAND, AWATER, INTPTLAT, INTPTLON, Shape_Area, Shape_Leng).
_shape_column_renames_2009 = {
    "ALAND10": "ALAND",
    "AWATER10": "AWATER",
    "INTPTLAT10": "INTPTLAT",
    "INTPTLON10": "INTPTLON",
    "Shape_area": "Shape_Area",
    "Shape_len": "Shape_Leng",
}
ACS5_2009_bg = ACS5_2009_bg.rename(columns=_shape_column_renames_2009)

### Organizing 2005-2009 ACS 

In [28]:
# Add the male and female categories to get education totals for the 2005-2009 data:
# of the 34 columns at positions 146-179, column k is added to column k+17, giving educ0 ... educ16.
# NOTE(review): positional slicing silently picks the wrong columns if the column order changes — confirm positions 146-179.
# NOTE! remember to drop columns 146 to 179 afterwards.
cols = list(ACS5_2009_bg.columns.values)[146:180]
first_half, second_half = cols[:17], cols[17:]
for k in range(17):
    ACS5_2009_bg[f"educ{k}"] = ACS5_2009_bg[first_half[k]] + ACS5_2009_bg[second_half[k]]

### Combine Year Structure Built Variable for Later Period ACS 2010-2014 and 2015-2019

In [29]:
# Build one "built 2000 or later" count per vintage by summing that vintage's post-2000 categories.
# 2015-2019: built 2000 to 2009 + built 2010 to 2013 + built 2014 or later
ACS5_2019_bg["built_2000"] = ACS5_2019_bg["AL0DE004"]+ACS5_2019_bg["AL0DE003"]+ACS5_2019_bg["AL0DE002"]
# 2010-2014: built 2000 to 2009 + built 2010 or later
ACS5_2014_bg["built_2000"] = ACS5_2014_bg["ABHPE003"]+ACS5_2014_bg["ABHPE002"]
# 2005-2009: built 2000 to 2004 + built 2005 or later.
# Uses the estimate columns (RQ2E...). The previous version summed RQ2M002 + RQ2M003, which under
# the NHGIS naming used throughout this notebook (names ending in M are margins of error) are
# margins of error, not counts.
# NOTE(review): confirm RQ2E002 / RQ2E003 are present in the 2009 extract.
ACS5_2009_bg["built_2000"] = ACS5_2009_bg["RQ2E002"] + ACS5_2009_bg["RQ2E003"]

#NOTE! remember to drop the old columns (e.g. RQ2E002, etc.)

In [30]:
# Restore every pandas "display.*" option to its default (this undoes the display settings made in the import cell).
#pd.set_option("display.max_rows", None, "display.max_columns", None)
pd.reset_option('^display.', silent=True)

In [31]:
# Collapse detailed education categories in the 2015-2019 (table ALWGE) and 2010-2014 (table ABC4E)
# frames. Both tables use the same category numbers, so one specification serves both.
educ_groups = {
    "educ2": range(3, 9),    # nursery to grade 4 (categories 003-008)
    "educ3": range(9, 11),   # grades 5 and 6
    "educ4": range(11, 13),  # grades 7 and 8
    "educ9": range(17, 19),  # regular high school diploma and GED combined
}
for frame, table in ((ACS5_2019_bg, "ALWGE"), (ACS5_2014_bg, "ABC4E")):
    for new_col, codes in educ_groups.items():
        parts = [frame[f"{table}{code:03d}"] for code in codes]
        # Add left to right, exactly as a chain of `+` would.
        total = parts[0]
        for part in parts[1:]:
            total = total + part
        frame[new_col] = total



In [32]:
# Rename the columns of each vintage through its cleaned dictionary so the three frames share
# column names before they are stacked.
ACS5_2019_new = ACS5_2019_bg.rename(columns=dictionary_2019_new)
ACS5_2014_new = ACS5_2014_bg.rename(columns=dictionary_2014_new)
ACS5_2009_new = ACS5_2009_bg.rename(columns=dictionary_2009_new)

In [103]:
# Inspection cell.
# NOTE(review): `ACS5_2009_new_states` is not assigned in any cell shown, and this cell's execution
# count (103) is out of sequence — it presumably relies on kernel state from a removed cell; confirm or delete.
ACS5_2009_new_states

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211274 entries, 0 to 211273
Columns: 242 entries, GISJOIN to Built 2000 or later
dtypes: float64(37), geometry(1), int64(188), object(16)
memory usage: 391.7+ MB


In [33]:
#ACS5_2009_new.columns.tolist() NOTE! For 2009 bg shp will have to rename lat and long - remove 10!!!Look for other cols to rename if necessary =

In [34]:
# Stack the three vintages, keeping only the columns common to all of them (join='inner').
# ignore_index=True renumbers the rows 0..n-1; reset_index() then copies that numbering into an
# "index" column, which later cells select by name.
frames = [ACS5_2019_new, ACS5_2014_new, ACS5_2009_new]
ACS5_2005_2019 = pd.concat(
    frames,
    join='inner',
    ignore_index=True,
).reset_index()

In [35]:
# Columns kept from the stacked frame, in the order the later cells rely on: the share
# conversions further down slice the frame by column position, so this order matters.
cols = ['index',
 'GISJOIN',
 'YEAR',
 'REGIONA',
 'DIVISIONA',
 'STATE',
 'STATEA',
 'COUNTY',
 'COUNTYA',
 'COUSUBA',
 'PLACEA',
 'TRACTA',
 'BLKGRPA',
 'CONCITA',
 'AIANHHA',
 'RES_ONLYA',
 'TRUSTA',
 'ANRCA',
 'CBSAA',
 'CSAA',
 'METDIVA',
 'NECTAA',
 'CNECTAA',
 'NECTADIVA',
 'UAA',
 'CDCURRA',
 'SLDUA',
 'SLDLA',
 'SUBMCDA',
 'SDELMA',
 'SDSECA',
 'SDUNIA',
 'PUMA5A',
 'NAME_E',
 'Total Race',
 'White alone',
 'Black or African American alone',
 'American Indian and Alaska Native alone',
 'Asian alone',
 'Native Hawaiian and Other Pacific Islander alone',
 'Some other race alone',
 'Two or more races',
 'Two or more races Two races including Some other race',
 'Two or more races Two races excluding Some other race and three or more races',
 'Total Travel Time to Work',
 'Less than 5 minutes',
 '5 to 9 minutes',
 '10 to 14 minutes',
 '15 to 19 minutes',
 '20 to 24 minutes',
 '25 to 29 minutes',
 '30 to 34 minutes',
 '35 to 39 minutes',
 '40 to 44 minutes',
 '45 to 59 minutes',
 '60 to 89 minutes',
 '90 or more minutes',
 'Total Educational Attainment for the Population 25 Years and Over',
 'No schooling completed',
 'Nursery to 4th grade',
 '5th and 6th grade',
 '7th and 8th grade',
 '9th grade',
 '10th grade',
 '11th grade',
 '12th grade no diploma',
 'High school graduate, GED, or alternative',
 'Some college less than 1 year',
 'Some college 1 or more years no degree',
 "Associate's degree",
 "Bachelor's degree",
 "Master's degree",
 'Professional school degree',
 'Doctorate degree',
 'Total Year Structure Built',
 'Built 2000 or later',
 'Built 1990 to 1999',
 'Built 1980 to 1989',
 'Built 1970 to 1979',
 'Built 1960 to 1969',
 'Built 1950 to 1959',
 'Built 1940 to 1949',
 'Built 1939 or earlier',
 'Total Bedrooms',
 'No bedroom',
 '1 bedroom',
 '2 bedrooms',
 '3 bedrooms',
 '4 bedrooms',
 '5 or more bedrooms',
 'Median gross rent',
 'Lower value quartile (dollars)',
 'Median value (dollars)',
 'Upper value quartile (dollars)',
 'Total Mortgage Status',
 'Housing units with a mortgage contract to purchase or similar debt',
 'Housing units with a mortgage contract to purchase or similar debt With either a second mortgage or home equity loan but not both',
 'Housing units with a mortgage contract to purchase or similar debt With either a second mortgage or home equity loan but not both Second mortgage only',
 'Housing units with a mortgage contract to purchase or similar debt With either a second mortgage or home equity loan but not both Home equity loan only',
 'Housing units with a mortgage contract to purchase or similar debt Both second mortgage and home equity loan',
 'Housing units with a mortgage contract to purchase or similar debt No second mortgage and no home equity loan',
 'Housing units without a mortgage',
 'NAME_M',
 'ALAND',
 'AWATER',
 'INTPTLAT',
 'INTPTLON',
 'Shape_Leng',
 'Shape_Area',
 'geometry']

In [36]:
# Keep only the columns listed in `cols`, in that order (raises KeyError if any label is missing).
ACS5_2005_2019 = ACS5_2005_2019[cols]

In [37]:
# Geographic identifier columns (plus NAME_M) that are dropped from the stacked frame in the next cell.
remove = ['AIANHHA',
 'RES_ONLYA',
 'TRUSTA',
 'ANRCA',
 'CBSAA',
 'CSAA',
 'METDIVA',
 'NECTAA',
 'CNECTAA',
 'NECTADIVA',
 'UAA',
 'CDCURRA',
 'SLDUA',
 'SLDLA',
 'SUBMCDA',
 'SDELMA',
 'SDSECA',
 'SDUNIA',
 'PUMA5A',
 'NAME_M']

In [38]:
# Drop the columns listed in `remove`.
# NOTE(review): not re-runnable — a second run raises KeyError because the columns are already gone.
ACS5_2005_2019 = ACS5_2005_2019.drop(columns=remove)

## Get Data Into % Format

In [39]:
# 'Two or more races' already captures these two sub-categories, so the extra columns are not needed.
cols = [
    "Two or more races Two races including Some other race",
    "Two or more races Two races excluding Some other race and three or more races",
]
ACS5_2005_2019 = ACS5_2005_2019.drop(columns=cols)

In [40]:
# Convert the race counts to shares of "Total Race" (fractions, not multiplied by 100).
# Positions 16-22 are expected to be the seven columns 'White alone' ... 'Two or more races'.
# NOTE(review): positional slice — depends on the column order fixed earlier; not idempotent.
cols = list(ACS5_2005_2019.columns.values)[16:23]
ACS5_2005_2019[cols] = ACS5_2005_2019[cols].div(ACS5_2005_2019["Total Race"], axis=0)

In [41]:
# Convert the travel-time counts to shares of "Total Travel Time to Work".
# Positions 24-35 are expected to be the twelve bins 'Less than 5 minutes' ... '90 or more minutes'.
# NOTE(review): positional slice — depends on the column order fixed earlier; not idempotent.
cols = list(ACS5_2005_2019.columns.values)[24:36]
ACS5_2005_2019[cols] = ACS5_2005_2019[cols].div(ACS5_2005_2019["Total Travel Time to Work"], axis=0)

In [42]:
# Convert the education counts to shares of the population 25 years and over.
# Positions 37-52 are expected to be the sixteen columns 'No schooling completed' ... 'Doctorate degree'.
# NOTE(review): positional slice — depends on the column order fixed earlier; not idempotent.
cols = list(ACS5_2005_2019.columns.values)[37:53]
ACS5_2005_2019[cols] = ACS5_2005_2019[cols].div(
    ACS5_2005_2019["Total Educational Attainment for the Population 25 Years and Over"], axis=0
)

In [43]:
# Convert the year-structure-built counts to shares of "Total Year Structure Built".
# Positions 54-61 are expected to be the eight columns 'Built 2000 or later' ... 'Built 1939 or earlier'.
# NOTE(review): positional slice — depends on the column order fixed earlier; not idempotent.
cols = list(ACS5_2005_2019.columns.values)[54:62]
ACS5_2005_2019[cols] = ACS5_2005_2019[cols].div(ACS5_2005_2019["Total Year Structure Built"], axis=0)

In [44]:
# Convert the bedroom counts to shares of "Total Bedrooms".
# Positions 63-68 are expected to be the six columns 'No bedroom' ... '5 or more bedrooms'.
# NOTE(review): positional slice — depends on the column order fixed earlier; not idempotent.
cols = list(ACS5_2005_2019.columns.values)[63:69]
ACS5_2005_2019[cols] = ACS5_2005_2019[cols].div(ACS5_2005_2019["Total Bedrooms"], axis=0)

In [45]:
# Drop the two sub-categories of "... With either a second mortgage or home equity loan but not both".
# The remaining breakdown is complete without them:
#   'With either a second mortgage or home equity loan but not both'
# + 'Both second mortgage and home equity loan'
# + 'No second mortgage and no home equity loan'
# = 'Housing units with a mortgage contract to purchase or similar debt'
cols = [
    "Housing units with a mortgage contract to purchase or similar debt With either a second mortgage or home equity loan but not both Second mortgage only",
    "Housing units with a mortgage contract to purchase or similar debt With either a second mortgage or home equity loan but not both Home equity loan only",
]
ACS5_2005_2019 = ACS5_2005_2019.drop(columns=cols)

In [46]:
# Convert the mortgage-status counts to shares of "Total Mortgage Status".
# Positions 74-78 are expected to be the five mortgage columns left after the two sub-categories were dropped.
# NOTE(review): positional slice — depends on the column order fixed earlier; not idempotent.
cols = list(ACS5_2005_2019.columns.values)[74:79]
ACS5_2005_2019[cols] = ACS5_2005_2019[cols].div(ACS5_2005_2019["Total Mortgage Status"], axis=0)

### Join Complete DF with Min Dist Wildfires for Each Year

In [47]:
# Minimum wildfire-distance tables, one CSV per year 2005-2018.
# The file names carry an ACS tag: ACS52009 for 2005-2009 and ACS52014 for every later year.
# NOTE(review): 2015-2018 also use the ACS52014 files — confirm this is intended.
# Note: these are only the simplified ("sim") shapes; the 2005-2009 box shapes still need to be recalculated.
_min_dist_dir = r"C:\Users\sheld\Documents\GitHub\econ499"
_min_dist_by_year = {
    year: import_csv(
        rf"{_min_dist_dir}\Min_distances_{year}_ACS5{2009 if year <= 2009 else 2014}_sim.csv"
    )
    for year in range(2005, 2019)
}
(min_dist_2005_sim, min_dist_2006_sim, min_dist_2007_sim, min_dist_2008_sim,
 min_dist_2009_sim, min_dist_2010_sim, min_dist_2011_sim, min_dist_2012_sim,
 min_dist_2013_sim, min_dist_2014_sim, min_dist_2015_sim, min_dist_2016_sim,
 min_dist_2017_sim, min_dist_2018_sim) = (
    _min_dist_by_year[year] for year in range(2005, 2019)
)

In [48]:
# All yearly distance tables in chronological order; later cells loop over this list.
dist = [
    min_dist_2005_sim, min_dist_2006_sim, min_dist_2007_sim, min_dist_2008_sim,
    min_dist_2009_sim, min_dist_2010_sim, min_dist_2011_sim, min_dist_2012_sim,
    min_dist_2013_sim, min_dist_2014_sim, min_dist_2015_sim, min_dist_2016_sim,
    min_dist_2017_sim, min_dist_2018_sim,
]

In [49]:
# Remove every space character from the GISJOIN key of each distance table (modifies the tables in place).
for table in dist:
    table["GISJOIN"] = table["GISJOIN"].str.replace(' ','')

In [50]:
def merge_on_dist(ACS, dist, how="inner"):
    """Merge every frame in ``dist`` onto ``ACS`` by the ``GISJOIN`` column, one after another.

    Parameters
    ----------
    ACS : pandas.DataFrame
        Frame with a ``GISJOIN`` column.
    dist : iterable of pandas.DataFrame
        Frames that each have a ``GISJOIN`` column; they are merged in order.
    how : str, default "inner"
        Join type passed to ``pandas.merge``. With the default, a row of
        ``ACS`` is kept only if its GISJOIN appears in every frame of ``dist``.
        Pass ``how="left"`` to keep all rows of ``ACS`` and get missing
        values where a distance is unavailable.

    Returns
    -------
    pandas.DataFrame
        The merged frame; ``ACS`` itself is returned when ``dist`` is empty.
    """
    for frame in dist:
        ACS = pd.merge(ACS, frame, on="GISJOIN", how=how)
    return ACS

In [90]:
# Summarise every distance table. DataFrame.info() prints its report itself and returns None,
# so it is called directly; wrapping it in print() added a stray "None" after each report.
for table in dist:
    table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44430 entries, 0 to 44429
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   min_dist_2005_sim  44430 non-null  float64
 1   GISJOIN            44430 non-null  object 
dtypes: float64(1), object(1)
memory usage: 694.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44430 entries, 0 to 44429
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   min_dist_2006_sim  44430 non-null  float64
 1   GISJOIN            44430 non-null  object 
dtypes: float64(1), object(1)
memory usage: 694.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44430 entries, 0 to 44429
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   min_dist_2007_sim  44430 non-null  float64
 1   GISJOIN            44430 

In [98]:
# Column-by-column summary (non-null counts and dtypes) of the stacked frame.
ACS5_2005_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651940 entries, 0 to 651939
Data columns (total 86 columns):
 #   Column                                                                                                                             Non-Null Count   Dtype   
---  ------                                                                                                                             --------------   -----   
 0   index                                                                                                                              651940 non-null  int64   
 1   GISJOIN                                                                                                                            651940 non-null  object  
 2   YEAR                                                                                                                               651940 non-null  object  
 3   REGIONA                                                                   

In [99]:
# Non-null count of every column, restricted to the rows whose YEAR is "2005-2009".
ACS5_2005_2019[ACS5_2005_2019["YEAR"]=="2005-2009"].count()

index         211274
GISJOIN       211274
YEAR          211274
REGIONA            0
DIVISIONA          0
               ...  
INTPTLAT      151288
INTPTLON      151288
Shape_Leng    151288
Shape_Area    151288
geometry      151288
Length: 86, dtype: int64

In [53]:
# Rows where INTPTLAT or "Median value (dollars)" is null, grouped by STATE and YEAR.
# NOTE(review): `ACS5_2005_2019_WEST` is not assigned in any cell shown here (execution counts jump
# from 50 to 53) — it presumably came from a removed cell; confirm so the notebook survives Restart & Run All.
ACS5_2005_2019_missing_center = ACS5_2005_2019_WEST[(ACS5_2005_2019_WEST["INTPTLAT"].isnull()) | (ACS5_2005_2019_WEST["Median value (dollars)"].isnull())].groupby(["STATE","YEAR"])

In [54]:
# Per-(STATE, YEAR) descriptive statistics of the flagged rows.
ACS5_2005_2019_missing_center2 = ACS5_2005_2019_missing_center.describe()
#block groups missing either the location (INTPTLAT) or the median property value — the filter combines the two conditions with OR, not AND

In [55]:
# Count of the "index" column within each (STATE, YEAR) group, i.e. the number of flagged rows per group.
ACS5_2005_2019_missing_center2["index", "count"]

STATE         YEAR     
Arizona       2005-2009    1759.0
              2010-2014     178.0
              2015-2019     312.0
California    2005-2009    5802.0
              2010-2014    1120.0
              2015-2019    1675.0
Colorado      2005-2009    1079.0
              2010-2014     121.0
              2015-2019     179.0
Idaho         2005-2009     302.0
              2010-2014      10.0
              2015-2019      26.0
Kansas        2005-2009     700.0
              2010-2014      58.0
              2015-2019      99.0
Montana       2005-2009     312.0
              2010-2014      11.0
              2015-2019      28.0
Nebraska      2005-2009     484.0
              2010-2014      45.0
              2015-2019      70.0
Nevada        2005-2009     506.0
              2010-2014     136.0
              2015-2019     194.0
New Mexico    2005-2009     518.0
              2010-2014      35.0
              2015-2019     104.0
North Dakota  2005-2009     212.0
              2010-2014 

In [56]:
# Total number of flagged rows across all groups.
ACS5_2005_2019_missing_center2["index", "count"].sum()

27295.0

In [57]:
# Merge the yearly minimum wildfire distances onto the ACS frame.
# merge_on_dist joins on GISJOIN one table at a time, so rows whose GISJOIN is absent from any
# distance table do not appear in the result.
ACS5_2005_2019_WESTd = merge_on_dist(ACS5_2005_2019_WEST, dist) #merging min wildfire distances

In [58]:
# Remove the rows that lack INTPTLAT or "Median value (dollars)".
# NOTE(review): reset_index() keeps the previous index as a new column, so this cell is not
# idempotent — re-running it adds another column (or fails once the generated name already exists).
ACS5_2005_2019_WEST = ACS5_2005_2019_WEST.reset_index()
_incomplete = (
    ACS5_2005_2019_WEST["INTPTLAT"].isnull()
    | ACS5_2005_2019_WEST["Median value (dollars)"].isnull()
)
# Index labels of the incomplete rows.
missing = _incomplete[_incomplete].index.tolist()
ACS5_2005_2009_non_missing_bg = ACS5_2005_2019_WEST.copy()
ACS5_2005_2009_non_missing_bg2 = ACS5_2005_2009_non_missing_bg.drop(index=missing)

In [109]:
# Show the retained rows for the 2015-2019 period.
ACS5_2005_2009_non_missing_bg2[ACS5_2005_2009_non_missing_bg2["YEAR"]=="2015-2019"]

Unnamed: 0,level_0,index,GISJOIN,YEAR,REGIONA,DIVISIONA,STATE,STATEA,COUNTY,COUNTYA,...,Housing units with a mortgage contract to purchase or similar debt Both second mortgage and home equity loan,Housing units with a mortgage contract to purchase or similar debt No second mortgage and no home equity loan,Housing units without a mortgage,ALAND,AWATER,INTPTLAT,INTPTLON,Shape_Leng,Shape_Area,geometry
0,3972,3972,G04000109426001,2015-2019,,,Arizona,4,Apache County,1,...,0.0,0.004878,0.995122,1.093781e+09,208117.0,+36.6525496,-109.8475829,201634.794671,1.093990e+09,"POLYGON ((-1205442.684 24822.029, -1205445.870..."
1,3973,3973,G04000109426002,2015-2019,,,Arizona,4,Apache County,1,...,0.0,0.057018,0.942982,4.311277e+08,270973.0,+36.9353993,-109.9078833,97759.964618,4.313987e+08,"POLYGON ((-1204504.841 30811.982, -1204498.821..."
2,3974,3974,G04000109427001,2015-2019,,,Arizona,4,Apache County,1,...,0.0,0.120930,0.879070,4.503639e+08,754604.0,+36.9101983,-109.1774307,125325.976670,4.511185e+08,"POLYGON ((-1146477.066 22667.315, -1146479.085..."
3,3975,3975,G04000109427002,2015-2019,,,Arizona,4,Apache County,1,...,0.0,0.041344,0.958656,1.160413e+09,346444.0,+36.7703339,-109.6112269,335088.987092,1.160759e+09,"POLYGON ((-1170443.973 26051.139, -1170468.175..."
4,3976,3976,G04000109427003,2015-2019,,,Arizona,4,Apache County,1,...,0.0,0.061433,0.938567,5.312467e+08,261769.0,+36.8300288,-109.3653458,161224.612066,5.315086e+08,"POLYGON ((-1170140.302 21362.100, -1170059.640..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69510,217734,217734,G56004509511001,2015-2019,,,Wyoming,56,Weston County,45,...,0.0,0.442516,0.511931,1.994428e+09,1401069.0,+43.9951024,-104.7882557,226756.772928,1.995829e+09,"POLYGON ((-673489.978 777687.447, -673489.997 ..."
69511,217735,217735,G56004509511002,2015-2019,,,Wyoming,56,Weston County,45,...,0.0,0.455064,0.544936,4.105582e+09,3640658.0,+43.7685310,-104.4784990,419217.772769,4.109223e+09,"POLYGON ((-641204.770 774834.021, -641216.984 ..."
69512,217736,217736,G56004509513001,2015-2019,,,Wyoming,56,Weston County,45,...,0.0,0.518779,0.467136,4.070376e+07,0.0,+43.8816028,-104.2575269,28246.773331,4.070376e+07,"POLYGON ((-655231.143 741793.949, -655371.926 ..."
69513,217737,217737,G56004509513002,2015-2019,,,Wyoming,56,Weston County,45,...,0.0,0.500000,0.451807,1.038138e+07,0.0,+43.8541181,-104.1807371,16445.723966,1.038138e+07,"POLYGON ((-652588.350 736715.973, -652705.767 ..."


In [60]:
# Show the frame produced by merging the wildfire distances (min_dist_* columns appended).
ACS5_2005_2019_WESTd

Unnamed: 0,index,GISJOIN,YEAR,REGIONA,DIVISIONA,STATE,STATEA,COUNTY,COUNTYA,COUSUBA,...,min_dist_2009_sim,min_dist_2010_sim,min_dist_2011_sim,min_dist_2012_sim,min_dist_2013_sim,min_dist_2014_sim,min_dist_2015_sim,min_dist_2016_sim,min_dist_2017_sim,min_dist_2018_sim
0,3972,G04000109426001,2015-2019,,,Arizona,4,Apache County,1,,...,120782.03740,94759.52439,115027.99880,25503.269930,125270.03560,85914.48854,111499.68000,133972.656000,100893.33770,107968.383600
1,224305,G04000109426001,2010-2014,,,Arizona,4,Apache County,1,,...,120782.03740,94759.52439,115027.99880,25503.269930,125270.03560,85914.48854,111499.68000,133972.656000,100893.33770,107968.383600
2,444530,G04000109426001,2005-2009,,,Arizona,4,Apache County,1,,...,120782.03740,94759.52439,115027.99880,25503.269930,125270.03560,85914.48854,111499.68000,133972.656000,100893.33770,107968.383600
3,3974,G04000109427001,2015-2019,,,Arizona,4,Apache County,1,,...,69603.85231,74790.55545,78663.90506,49846.002650,102141.75150,98346.86015,48466.86904,83506.334860,92389.39464,71507.596710
4,224307,G04000109427001,2010-2014,,,Arizona,4,Apache County,1,,...,69603.85231,74790.55545,78663.90506,49846.002650,102141.75150,98346.86015,48466.86904,83506.334860,92389.39464,71507.596710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131165,649462,G56004509513003,2005-2009,,,Wyoming,56,Weston County,45,,...,139152.18100,60489.72362,11662.59376,8275.394046,89855.95756,210002.30730,25171.56196,8254.181101,54241.85561,8043.027649
131166,399242,G46011309405001,2010-2014,,,South Dakota,46,Shannon County,113,,...,275581.09980,84938.01376,86864.30068,17649.409310,222388.97080,337930.78760,68052.81444,25872.515480,67201.38599,138345.342900
131167,612941,G46011309405001,2005-2009,,,South Dakota,46,Shannon County,113,,...,275581.09980,84938.01376,86864.30068,17649.409310,222388.97080,337930.78760,68052.81444,25872.515480,67201.38599,138345.342900
131168,399243,G46011309405002,2010-2014,,,South Dakota,46,Shannon County,113,,...,300336.90930,109498.31900,115809.95600,12843.476690,246677.65790,362573.24790,93778.61921,47340.692400,90864.18991,166084.585700


In [61]:
# (GISJOIN, YEAR) pairs of the frame from which rows with a missing location or median price were removed.
gisjoinlist1 = ACS5_2005_2009_non_missing_bg2["GISJOIN"].tolist()
Year_1 = ACS5_2005_2009_non_missing_bg2["YEAR"].tolist()
gisjoinlist1y = [(gisjoin, year) for gisjoin, year in zip(gisjoinlist1, Year_1)]

In [62]:
# (GISJOIN, YEAR) pairs of the frame that was merged with the wildfire distances.
gisjoinlist2 = ACS5_2005_2019_WESTd["GISJOIN"].tolist()
Year2 = ACS5_2005_2019_WESTd["YEAR"].tolist()
gisjoinlist2y = [(gisjoin, year) for gisjoin, year in zip(gisjoinlist2, Year2)]

In [63]:
# (GISJOIN, YEAR) pairs present in the cleaned frame but absent from the wildfire-merged frame.
no_match_gis = returnNotMatches(gisjoinlist1y, gisjoinlist2y) #return non-matching GISJOIN ID's & Year

In [64]:
# Frame of the block groups dropped by the merge with the wildfire distances: the unmatched
# (GISJOIN, YEAR) pairs with their attributes looked up again by a left join.
no_match_gisdf = pd.DataFrame(no_match_gis, columns=["GISJOIN","YEAR"])
no_match_df = no_match_gisdf.merge(
    ACS5_2005_2009_non_missing_bg2,
    on=["GISJOIN","YEAR"],
    how='left',
)

In [65]:
# Each entry is one (GISJOIN, YEAR) pair, so a block group missing in several periods is counted once per period.
len(no_match_gis) #number of block groups dropped when merged with wildfire distances

47245

In [67]:
# Drop the geometry column (the commented line below wrote the result to CSV).
# NOTE(review): not re-runnable — a second run raises KeyError because "geometry" is already gone.
no_match_df = no_match_df.drop(columns=["geometry"])
#no_match_df.to_csv(r"C:\Users\sheld\Documents\GitHub\econ499\no_match_wildfires.csv") 
#exported to map in No_Match_Block_Groups_To_Wildfires notebook

In [69]:
# Dropped block groups belonging to the 2015-2019 ACS period.
no_match_df2015 = no_match_df.loc[no_match_df["YEAR"].eq("2015-2019")]

In [70]:
# Preview the dropped 2015-2019 rows (rich display of the frame).
no_match_df2015

Unnamed: 0,GISJOIN,YEAR,level_0,index,REGIONA,DIVISIONA,STATE,STATEA,COUNTY,COUNTYA,...,Housing units with a mortgage contract to purchase or similar debt With either a second mortgage or home equity loan but not both,Housing units with a mortgage contract to purchase or similar debt Both second mortgage and home equity loan,Housing units with a mortgage contract to purchase or similar debt No second mortgage and no home equity loan,Housing units without a mortgage,ALAND,AWATER,INTPTLAT,INTPTLON,Shape_Leng,Shape_Area
0,G04000109426002,2015-2019,3973,3973,,,Arizona,4,Apache County,1,...,0.000000,0.0,0.057018,0.942982,431127728.0,270973.0,+36.9353993,-109.9078833,97759.964618,4.313987e+08
1,G04000109442011,2015-2019,3986,3986,,,Arizona,4,Apache County,1,...,0.000000,0.0,0.000000,1.000000,1959606.0,84852.0,+36.1734764,-109.5850409,8973.789995,2.044460e+06
2,G04000109442013,2015-2019,3988,3988,,,Arizona,4,Apache County,1,...,0.000000,0.0,0.043771,0.956229,445614000.0,174715.0,+36.1534278,-109.6853411,110697.217829,4.457888e+08
3,G04000109442021,2015-2019,3989,3989,,,Arizona,4,Apache County,1,...,0.000000,0.0,0.064140,0.935860,254398380.0,59830.0,+36.2212815,-109.4813620,104898.985679,2.544582e+08
4,G04000109442022,2015-2019,3990,3990,,,Arizona,4,Apache County,1,...,0.000000,0.0,0.046332,0.953668,174970041.0,58753.0,+36.0864000,-109.4770293,85365.971792,1.750288e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22528,G56003909677023,2015-2019,217703,217703,,,Wyoming,56,Teton County,39,...,0.000000,0.0,0.405751,0.594249,50938454.0,552805.0,+43.5160858,-110.7833214,33464.415314,5.149126e+07
22529,G56003909677024,2015-2019,217704,217704,,,Wyoming,56,Teton County,39,...,0.200000,0.0,0.546429,0.253571,19397589.0,1559524.0,+43.4773691,-110.8410559,33842.490298,2.095712e+07
22530,G56003909678001,2015-2019,217705,217705,,,Wyoming,56,Teton County,39,...,0.098551,0.0,0.675362,0.226087,7611142.0,19647.0,+43.4541108,-110.8166670,12900.767604,7.630790e+06
22531,G56003909678004,2015-2019,217708,217708,,,Wyoming,56,Teton County,39,...,0.000000,0.0,0.320442,0.679558,646867.0,10230.0,+43.4757605,-110.7657732,4276.301165,6.571013e+05


In [71]:
# Yearly minimum-distance tables used for the 2015-2019 period; blanks are
# stripped from GISJOIN in place so the IDs compare equal to the ACS IDs.
# NOTE(review): only 2015-2018 tables are listed - confirm no 2019 table exists.
dist_2015 = [
    min_dist_2015_sim,
    min_dist_2016_sim,
    min_dist_2017_sim,
    min_dist_2018_sim,
]
for i, yearly_dist in enumerate(dist_2015):
    yearly_dist["GISJOIN"] = yearly_dist["GISJOIN"].str.replace(' ','')

In [72]:
# Attach the yearly min_dist_*_sim tables in dist_2015 to the dropped 2015-2019 rows.
# NOTE(review): merge_on_dist is defined outside this section; the displayed
# result has fewer rows than no_match_df2015, so it presumably uses an inner
# join on GISJOIN - confirm.
no_match_df2015_fires = merge_on_dist(no_match_df2015, dist_2015)

In [110]:
# Preview the dropped 2015-2019 rows with their min_dist_2015..2018_sim columns.
no_match_df2015_fires

Unnamed: 0,GISJOIN,YEAR,level_0,index,REGIONA,DIVISIONA,STATE,STATEA,COUNTY,COUNTYA,...,ALAND,AWATER,INTPTLAT,INTPTLON,Shape_Leng,Shape_Area,min_dist_2015_sim,min_dist_2016_sim,min_dist_2017_sim,min_dist_2018_sim
0,G04000109426002,2015-2019,3973,3973,,,Arizona,4,Apache County,1,...,431127728.0,270973.0,+36.9353993,-109.9078833,97759.964618,4.313987e+08,100617.78450,113120.36030,86787.39070,132228.39510
1,G04000109442013,2015-2019,3988,3988,,,Arizona,4,Apache County,1,...,445614000.0,174715.0,+36.1534278,-109.6853411,110697.217829,4.457888e+08,143699.53850,144619.55410,144468.77350,58884.82264
2,G04000109442021,2015-2019,3989,3989,,,Arizona,4,Apache County,1,...,254398380.0,59830.0,+36.2212815,-109.4813620,104898.985679,2.544582e+08,128306.33460,139636.39720,153778.82360,49994.36633
3,G04000109442022,2015-2019,3990,3990,,,Arizona,4,Apache County,1,...,174970041.0,58753.0,+36.0864000,-109.4770293,85365.971792,1.750288e+08,129733.53330,128683.19200,163403.46070,39225.28689
4,G04000109449011,2015-2019,3996,3996,,,Arizona,4,Apache County,1,...,33379851.0,1140284.0,+35.7432324,-109.5049606,34352.201934,3.452013e+07,98221.88091,98196.43338,152046.58250,32693.55892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22050,G56003909677023,2015-2019,217703,217703,,,Wyoming,56,Teton County,39,...,50938454.0,552805.0,+43.5160858,-110.7833214,33464.415314,5.149126e+07,42641.67066,30805.27582,76786.00192,33658.39584
22051,G56003909677024,2015-2019,217704,217704,,,Wyoming,56,Teton County,39,...,19397589.0,1559524.0,+43.4773691,-110.8410559,33842.490298,2.095712e+07,41553.56561,24476.90705,77311.22824,38346.48398
22052,G56003909678001,2015-2019,217705,217705,,,Wyoming,56,Teton County,39,...,7611142.0,19647.0,+43.4541108,-110.8166670,12900.767604,7.630790e+06,38303.55775,24620.47830,80560.01447,37474.19518
22053,G56003909678004,2015-2019,217708,217708,,,Wyoming,56,Teton County,39,...,646867.0,10230.0,+43.4757605,-110.7657732,4276.301165,6.571013e+05,38038.28053,29381.15714,81140.85514,38136.48131


In [78]:
# Dropped 2015-2019 block groups whose minimum distance to a wildfire perimeter
# is at most 20 km (threshold 20000) in at least one available year.
# The previous filter OR-ed the min_dist_2016_sim clause twice; the duplicate
# is removed here, which leaves the selected rows unchanged.
# NOTE(review): only 2015-2018 distance columns exist in this frame - confirm
# the duplicated clause was not meant to reference a 2019 column.
dist_cols_2015 = [f"min_dist_{year}_sim" for year in range(2015, 2019)]
near_fire_2015 = no_match_df2015_fires[dist_cols_2015].le(20000).any(axis=1)
no_match_df_2015_20km = no_match_df2015_fires[near_fire_2015]
no_match_df_2015_20km.count()
#Out of 22533 missing, 7409 are missing within 20km of a 2015-2019 wildfire perimeter
#Sample for 2015-2019 is 64831 with missing med val and locations removed

GISJOIN              7409
YEAR                 7409
level_0              7409
index                7409
REGIONA                 0
                     ... 
Shape_Area           7409
min_dist_2015_sim    7409
min_dist_2016_sim    7409
min_dist_2017_sim    7409
min_dist_2018_sim    7409
Length: 90, dtype: int64

In [79]:
# Yearly minimum-distance tables used for the 2010-2014 period; blanks are
# stripped from GISJOIN in place so the IDs compare equal to the ACS IDs.
dist_2010 = [
    min_dist_2010_sim,
    min_dist_2011_sim,
    min_dist_2012_sim,
    min_dist_2013_sim,
    min_dist_2014_sim,
]
for i, yearly_dist in enumerate(dist_2010):
    yearly_dist["GISJOIN"] = yearly_dist["GISJOIN"].str.replace(' ','')

In [104]:
# Dropped block groups belonging to the 2010-2014 ACS period.
no_match_df2010 = no_match_df.loc[no_match_df["YEAR"].eq("2010-2014")]

In [81]:
# Attach the yearly min_dist_*_sim tables in dist_2010 to the dropped 2010-2014 rows.
# NOTE(review): merge_on_dist is defined outside this section - confirm its join type.
no_match_df2010_fires = merge_on_dist(no_match_df2010, dist_2010)

In [108]:
# Preview the dropped 2010-2014 rows with their min_dist_2010..2014_sim columns.
no_match_df2010_fires #22,895 missing block groups out of 66619 for 2010-2014

Unnamed: 0,GISJOIN,YEAR,level_0,index,REGIONA,DIVISIONA,STATE,STATEA,COUNTY,COUNTYA,...,AWATER,INTPTLAT,INTPTLON,Shape_Leng,Shape_Area,min_dist_2010_sim,min_dist_2011_sim,min_dist_2012_sim,min_dist_2013_sim,min_dist_2014_sim
0,G04000109426002,2010-2014,224306,224306,,,Arizona,4,Apache County,1,...,270973.0,+36.9353993,-109.9078833,97724.570225,4.312342e+08,69562.69733,89743.00298,17068.672500,94213.89966,58951.02258
1,G04000109442013,2010-2014,224321,224321,,,Arizona,4,Apache County,1,...,174729.0,+36.1534278,-109.6853411,110697.216850,4.457888e+08,148349.82270,164308.69720,53948.911050,170681.74990,74273.79664
2,G04000109442021,2010-2014,224322,224322,,,Arizona,4,Apache County,1,...,59843.0,+36.2212815,-109.4813620,104898.985679,2.544582e+08,150770.40880,154403.58450,49343.292380,175500.98510,58734.32418
3,G04000109442022,2010-2014,224323,224323,,,Arizona,4,Apache County,1,...,58753.0,+36.0864000,-109.4770293,85366.423045,1.750288e+08,164838.94450,169048.50880,52170.622630,175977.71240,54429.12401
4,G04000109449011,2010-2014,224329,224329,,,Arizona,4,Apache County,1,...,1140284.0,+35.7432322,-109.5049607,34352.201934,3.452013e+07,165648.81680,162996.01470,70108.330310,153695.46240,62881.20915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22890,G56003909678001,2010-2014,438038,438038,,,Wyoming,56,Teton County,39,...,19837.0,+43.4541108,-110.8166670,12900.766678,7.630796e+06,24107.06631,20767.21713,5732.701265,39561.98770,22997.33197
22891,G56003909678002,2010-2014,438039,438039,,,Wyoming,56,Teton County,39,...,0.0,+43.4730581,-110.7552797,5430.290310,1.056984e+06,21508.68549,26034.65785,3947.992916,34190.51146,24702.08560
22892,G56003909678003,2010-2014,438040,438040,,,Wyoming,56,Teton County,39,...,15355.0,+43.4613456,-110.7758397,9685.722186,4.564372e+06,21889.16749,23932.31900,3588.700003,36263.66360,24446.51518
22893,G56003909678004,2010-2014,438041,438041,,,Wyoming,56,Teton County,39,...,10791.0,+43.4757605,-110.7657732,4243.647376,6.576245e+05,22328.14886,25506.53380,4720.602393,34812.64253,23877.23427


In [82]:
# Dropped 2010-2014 block groups whose minimum distance to a wildfire perimeter
# is at most 20 km (threshold 20000) in at least one year of the period.
dist_cols_2010 = [f"min_dist_{year}_sim" for year in range(2010, 2015)]
near_fire_2010 = no_match_df2010_fires[dist_cols_2010].le(20000).any(axis=1)
no_match_df_2010_20km = no_match_df2010_fires[near_fire_2010]
no_match_df_2010_20km.count()
#Of the 22,895 missing, 10,142 are within 20km of a wildfire perimeter between 2010-2014
#Sample for 2010-2014 is 66619 with missing med val and locations removed

GISJOIN              10142
YEAR                 10142
level_0              10142
index                10142
REGIONA                  0
                     ...  
min_dist_2010_sim    10142
min_dist_2011_sim    10142
min_dist_2012_sim    10142
min_dist_2013_sim    10142
min_dist_2014_sim    10142
Length: 91, dtype: int64

In [83]:
# Yearly minimum-distance tables used for the 2005-2009 period; blanks are
# stripped from GISJOIN in place so the IDs compare equal to the ACS IDs.
dist_2005 = [
    min_dist_2005_sim,
    min_dist_2006_sim,
    min_dist_2007_sim,
    min_dist_2008_sim,
    min_dist_2009_sim,
]
for i, yearly_dist in enumerate(dist_2005):
    yearly_dist["GISJOIN"] = yearly_dist["GISJOIN"].str.replace(' ','')

In [84]:
# Dropped block groups belonging to the 2005-2009 ACS period.
no_match_df2005 = no_match_df.loc[no_match_df["YEAR"].eq("2005-2009")]

In [112]:
# Preview the dropped 2005-2009 rows (rich display of the frame).
no_match_df2005

Unnamed: 0,GISJOIN,YEAR,level_0,index,REGIONA,DIVISIONA,STATE,STATEA,COUNTY,COUNTYA,...,Housing units with a mortgage contract to purchase or similar debt With either a second mortgage or home equity loan but not both,Housing units with a mortgage contract to purchase or similar debt Both second mortgage and home equity loan,Housing units with a mortgage contract to purchase or similar debt No second mortgage and no home equity loan,Housing units without a mortgage,ALAND,AWATER,INTPTLAT,INTPTLON,Shape_Leng,Shape_Area
45428,G04000500006002,2005-2009,444674,444674,,,Arizona,4,Coconino County,5,...,0.254098,0.00000,0.475410,0.270492,1.383479e+07,56132.0,+35.2061730,-111.5427810,19658.759211,1.389092e+07
45429,G04000500010001,2005-2009,444688,444688,,,Arizona,4,Coconino County,5,...,0.000000,0.00000,0.281250,0.718750,4.974070e+05,0.0,+35.1866671,-111.6518764,3610.769508,4.974050e+05
45430,G04001300931013,2005-2009,445390,445390,,,Arizona,4,Maricopa County,13,...,0.000000,0.00000,0.877193,0.122807,2.379140e+05,0.0,+33.5221503,-112.1622073,3285.941115,2.379144e+05
45431,G04001301033042,2005-2009,445459,445459,,,Arizona,4,Maricopa County,13,...,0.000000,0.00000,1.000000,0.000000,2.396100e+05,0.0,+33.6316987,-112.0145031,2800.002264,2.396101e+05
45432,G04001301036093,2005-2009,445497,445497,,,Arizona,4,Maricopa County,13,...,0.139423,0.00000,0.524038,0.336538,7.882730e+05,2061.0,+33.6175960,-112.1034180,4252.671446,7.903354e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47240,G53006700101001,2005-2009,642626,642626,,,Washington,53,Thurston County,67,...,0.000000,0.00000,1.000000,0.000000,1.179667e+06,1394454.0,+47.0532599,-122.9040083,6869.095217,1.179667e+06
47241,G53007300010001,2005-2009,642840,642840,,,Washington,53,Whatcom County,73,...,0.235099,0.02649,0.430464,0.307947,1.202196e+06,0.0,+48.7336452,-122.4833214,5182.024566,1.202196e+06
47242,G53007300101001,2005-2009,642851,642851,,,Washington,53,Whatcom County,73,...,0.000000,0.00000,0.539446,0.460554,3.469230e+09,102473931.0,+48.8219671,-121.3586126,293943.532972,3.571687e+09
47243,G53007500006004,2005-2009,642928,642928,,,Washington,53,Whitman County,75,...,0.000000,0.00000,0.688889,0.311111,3.237790e+05,0.0,+46.7381759,-117.1608204,2400.888338,3.237775e+05


In [85]:
# Attach the yearly min_dist_*_sim tables in dist_2005 to the dropped 2005-2009 rows.
# NOTE(review): merge_on_dist is defined outside this section - confirm its join type.
no_match_df2005_fires = merge_on_dist(no_match_df2005, dist_2005)

In [106]:
# Preview the dropped 2005-2009 rows with their min_dist_2005..2009_sim columns.
no_match_df2005_fires
#706 block groups are missing due to geography changes

Unnamed: 0,GISJOIN,YEAR,level_0,index,REGIONA,DIVISIONA,STATE,STATEA,COUNTY,COUNTYA,...,AWATER,INTPTLAT,INTPTLON,Shape_Leng,Shape_Area,min_dist_2005_sim,min_dist_2006_sim,min_dist_2007_sim,min_dist_2008_sim,min_dist_2009_sim
0,G04000500006002,2005-2009,444674,444674,,,Arizona,4,Coconino County,5,...,56132.0,+35.2061730,-111.5427810,19658.759211,1.389092e+07,51502.46404,13618.656930,33268.94408,11191.608030,14776.230440
1,G04000500010001,2005-2009,444688,444688,,,Arizona,4,Coconino County,5,...,0.0,+35.1866671,-111.6518764,3610.769508,4.974050e+05,41406.85662,3549.868713,32088.60596,8322.441981,22781.481410
2,G04001300931013,2005-2009,445390,445390,,,Arizona,4,Maricopa County,13,...,0.0,+33.5221503,-112.1622073,3285.941115,2.379144e+05,47044.71571,62769.532860,95800.70098,21640.223000,117925.306900
3,G04001301033042,2005-2009,445459,445459,,,Arizona,4,Maricopa County,13,...,0.0,+33.6316987,-112.0145031,2800.002264,2.396101e+05,32237.20861,45039.195970,88557.39657,38911.608710,100214.959300
4,G04001301036093,2005-2009,445497,445497,,,Arizona,4,Maricopa County,13,...,2061.0,+33.6175960,-112.1034180,4252.671446,7.903354e+05,35260.51342,53221.137120,87066.20375,33536.866330,106109.447900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
701,G53006700101001,2005-2009,642626,642626,,,Washington,53,Thurston County,67,...,1394454.0,+47.0532599,-122.9040083,6869.095217,1.179667e+06,153178.99870,44345.218500,126532.50970,151735.207800,73255.062160
702,G53007300010001,2005-2009,642840,642840,,,Washington,53,Whatcom County,73,...,0.0,+48.7336452,-122.4833214,5182.024566,1.202196e+06,124624.45460,136938.370600,121592.13970,87586.479870,33859.400010
703,G53007300101001,2005-2009,642851,642851,,,Washington,53,Whatcom County,73,...,102473931.0,+48.8219671,-121.3586126,293943.532972,3.571687e+09,58485.70736,54447.294870,55808.63030,11984.281570,7571.304485
704,G53007500006004,2005-2009,642928,642928,,,Washington,53,Whitman County,75,...,0.0,+46.7381759,-117.1608204,2400.888338,3.237775e+05,59462.79077,16337.895280,69117.52494,66111.594490,126643.787200


In [86]:
# Dropped 2005-2009 block groups whose minimum distance to a wildfire perimeter
# is at most 20 km (threshold 20000) in at least one year of the period.
dist_cols_2005 = [f"min_dist_{year}_sim" for year in range(2005, 2010)]
near_fire_2005 = no_match_df2005_fires[dist_cols_2005].le(20000).any(axis=1)
no_match_df_2005_20km = no_match_df2005_fires[near_fire_2005]
no_match_df_2005_20km.count()
#Of the 706 missing, 334 are within 20km of a wildfire perimeter
#(total sample in 2005-2009 is 45,541 with missing med val and location data removed)

GISJOIN              334
YEAR                 334
level_0              334
index                334
REGIONA                0
                    ... 
min_dist_2005_sim    334
min_dist_2006_sim    334
min_dist_2007_sim    334
min_dist_2008_sim    334
min_dist_2009_sim    334
Length: 91, dtype: int64

In [112]:
# Summary statistics of the dropped block groups per state and ACS period;
# the "count" of the "index" column is shown as the number of dropped rows
# in each (STATE, YEAR) group.
no_match_df_des = no_match_df.groupby(by=["STATE", "YEAR"]).describe()
no_match_df_des[("index", "count")]

STATE         YEAR     
Arizona       2005-2009      46.0
              2010-2014    2235.0
              2015-2019    2170.0
California    2005-2009    1347.0
              2010-2014    7108.0
              2015-2019    7049.0
Colorado      2005-2009      26.0
              2010-2014    1219.0
              2015-2019    1200.0
Idaho         2005-2009       9.0
              2010-2014     310.0
              2015-2019     305.0
Kansas        2005-2009      13.0
              2010-2014     707.0
              2015-2019     705.0
Montana       2010-2014     270.0
              2015-2019     264.0
Nebraska      2005-2009       9.0
              2010-2014     490.0
              2015-2019     486.0
Nevada        2005-2009      29.0
              2010-2014     990.0
              2015-2019     959.0
New Mexico    2005-2009       7.0
              2010-2014     526.0
              2015-2019     504.0
North Dakota  2005-2009       4.0
              2010-2014     139.0
              2015-2019 

In [113]:
# Sanity check: the per-(STATE, YEAR) counts should add up to len(no_match_gis).
no_match_df_des["index", "count"].sum()

47245.0

In [51]:
# Per-column non-null counts over the rows whose median house value is missing;
# the constant "Count" column is never null, so its count is the number of such rows.
missing_house_prices = (
    ACS5_2005_2019_WESTd
    .loc[lambda acs: acs["Median value (dollars)"].isna()]
    .assign(Count=1)
    .count()
)
##1424 obs with missing median house prices

In [116]:
# Keep only rows that have a median house value.  The stale "index" column is
# dropped first, then reset_index() renumbers the rows and re-inserts the old
# row labels as a column.
# NOTE: not re-runnable once a later cell has dropped "index" again.
ACS5_2005_2019_WESTd = (
    ACS5_2005_2019_WESTd
    .loc[lambda acs: acs["Median value (dollars)"].notna()]
    .drop(columns=["index"])
    .reset_index()
)

In [123]:
# Columns removed from the analysis frame.
cols = [
    'index',
    'REGIONA',
    'DIVISIONA',
    'COUSUBA',
    'PLACEA',
    'CONCITA',
    'Shape_Leng',
    'Shape_Area',
]
ACS5_2005_2019_WESTd = ACS5_2005_2019_WESTd.drop(cols, axis="columns")

In [131]:
# Maps the verbose ACS column labels to short analysis names.  Within each
# table the "_0" entry is the table total and the remaining suffixes follow
# the order of the categories below.
rename_dict = {
    # Race
    "Total Race": "race_0",
    "White alone": "race_1",
    "Black or African American alone": "race_2",
    "American Indian and Alaska Native alone": "race_3",
    "Asian alone": "race_4",
    "Native Hawaiian and Other Pacific Islander alone": "race_5",
    "Some other race alone": "race_6",
    "Two or more races": "race_7",
    # Travel time to work
    "Total Travel Time to Work": "travel_0",
    "Less than 5 minutes": "travel_1",
    "5 to 9 minutes": "travel_2",
    "10 to 14 minutes": "travel_3",
    "15 to 19 minutes": "travel_4",
    "20 to 24 minutes": "travel_5",
    "25 to 29 minutes": "travel_6",
    "30 to 34 minutes": "travel_7",
    "35 to 39 minutes": "travel_8",
    "40 to 44 minutes": "travel_9",
    "45 to 59 minutes": "travel_10",
    "60 to 89 minutes": "travel_11",
    "90 or more minutes": "travel_12",
    # Educational attainment (population 25 years and over)
    "Total Educational Attainment for the Population 25 Years and Over": "educ_0",
    "No schooling completed": "educ_1",
    "Nursery to 4th grade": "educ_2",
    "5th and 6th grade": "educ_3",
    "7th and 8th grade": "educ_4",
    "9th grade": "educ_5",
    "10th grade": "educ_6",
    "11th grade": "educ_7",
    "12th grade no diploma": "educ_8",
    "High school graduate, GED, or alternative": "educ_9",
    "Some college less than 1 year": "educ_10",
    "Some college 1 or more years no degree": "educ_11",
    "Associate's degree": "educ_12",
    "Bachelor's degree": "educ_13",
    "Master's degree": "educ_14",
    "Professional school degree": "educ_15",
    "Doctorate degree": "educ_16",
    # Year structure built
    "Total Year Structure Built": "built_0",
    "Built 2000 or later": "built_1",
    "Built 1990 to 1999": "built_2",
    "Built 1980 to 1989": "built_3",
    "Built 1970 to 1979": "built_4",
    "Built 1960 to 1969": "built_5",
    "Built 1950 to 1959": "built_6",
    "Built 1940 to 1949": "built_7",
    "Built 1939 or earlier": "built_8",
    # Bedrooms
    "Total Bedrooms": "bdrm_0",
    "1 bedroom": "bdrm_1",
    "2 bedrooms": "bdrm_2",
    "3 bedrooms": "bdrm_3",
    "4 bedrooms": "bdrm_4",
    "5 or more bedrooms": "bdrm_5",
    # Rent and home values
    "Median gross rent": "rent",
    "Lower value quartile (dollars)": "lowval",
    "Median value (dollars)": "medval",
    "Upper value quartile (dollars)": "upval",
    # Mortgage status
    "Total Mortgage Status": "mortag_0",
    "Housing units with a mortgage contract to purchase or similar debt": "mortag_1",
    "Housing units with a mortgage contract to purchase or similar debt With either a second mortgage or home equity loan but not both": "mortag_2",
    "Housing units with a mortgage contract to purchase or similar debt Both second mortgage and home equity loan": "mortag_3",
    "Housing units with a mortgage contract to purchase or similar debt No second mortgage and no home equity loan": "mortag_4",
    "Housing units without a mortgage": "mortag_5",
}

In [132]:
# Replace the verbose ACS column labels with the short names in rename_dict.
ACS5_2005_2019_WESTd = ACS5_2005_2019_WESTd.rename(rename_dict, axis="columns")

In [136]:
# Drop the geometry column from the analysis frame.
# NOTE: not re-runnable as written; a second run raises KeyError.
ACS5_2005_2019_WESTd = ACS5_2005_2019_WESTd.drop('geometry', axis="columns")

In [139]:
# One-off export, disabled after first use; a later cell reloads this same
# file with import_csv.
#ACS5_2005_2019_WESTd.to_csv(r"C:\Users\sheld\Documents\Thesis_Data_Full_United_States\merged_data_with_block_groups\ACS5_2005_2019_FULL_DATAFRAME_FOR_ANALYSIS_no_geo.csv")

In [109]:
# Load the USA Wildfire Hazard Potential table.
# NOTE(review): import_csv is defined outside this section, and the path is an
# absolute, machine-specific location - consider a configurable data directory.
whp = import_csv(r"C:\Users\sheld\Downloads\USA_Wildfire_Hazard_Potential_with_Demographics.csv")

In [108]:
# Reload the analysis frame from the geometry-free CSV export.
# NOTE(review): this overwrites the in-memory ACS5_2005_2019_WESTd and depends
# on an absolute, machine-specific path.
ACS5_2005_2019_WESTd = import_csv(r"C:\Users\sheld\Documents\Thesis_Data_Full_United_States\merged_data_with_block_groups\ACS5_2005_2019_FULL_DATAFRAME_FOR_ANALYSIS_no_geo.csv")

In [117]:
def insert_zero(string, index):
    """Return ``string`` with the character "0" inserted before position ``index``.

    Slicing semantics apply: an ``index`` past the end appends the "0".
    """
    prefix, suffix = string[:index], string[index:]
    return prefix + "0" + suffix

In [118]:
def insert_G(string, index):
    """Return ``string`` with the character "G" inserted before position ``index``.

    Slicing semantics apply: an ``index`` past the end appends the "G".
    """
    prefix, suffix = string[:index], string[index:]
    return prefix + "G" + suffix

In [110]:
# Start building a GISJOIN key for the WHP table from its ID column, as text.
whp["GISJOIN"] = whp["ID"].astype(str)

In [112]:
# Left-pad the ID with zeros to 12 characters (restores a dropped leading zero).
whp["GISJOIN"] = whp["GISJOIN"].str.pad(12, side="left", fillchar="0")

In [113]:
# Insert a "0" after the first two characters of the padded ID.
# NOTE: mutates GISJOIN in place - run exactly once per kernel session.
whp["GISJOIN"] = whp["GISJOIN"].apply(insert_zero, args=(2,))

In [115]:
# Insert a second "0" at position 6 (counted after the first insertion).
# NOTE: mutates GISJOIN in place - run exactly once per kernel session.
whp["GISJOIN"] = whp["GISJOIN"].apply(insert_zero, args=(6,))

In [116]:
# Prefix the key with "G" so it has the same form as the ACS GISJOIN values.
# NOTE: mutates GISJOIN in place - run exactly once per kernel session.
whp["GISJOIN"] = whp["GISJOIN"].apply(insert_G, args=(0,))

Unnamed: 0,ï»¿OBJECTID,ID,NAME,STATE_NAME,ST_ABBREV,TOTPOP_CY,POPDENS_CY,TOTHH_CY,TOTHU_CY,OWNER_CY,...,MEAN,STD,SUM_,VARIETY,MAJORITY,MINORITY,MEDIAN,SHAPE_Length,SHAPE_Area,GISJOIN
0,1,60150001011,6.02e+07,California,CA,1925,2570.1,815,960,200,...,0.15,0.36,4.0,2.0,0.0,1.0,0.0,0.11,1.97e-04,06001500001011
1,2,60150001012,6.02e+07,California,CA,1905,6919.7,728,811,262,...,0.00,0.00,0.0,1.0,0.0,0.0,0.0,0.04,7.72e-05,06001500001012
2,3,60150001021,6.02e+07,California,CA,1704,329.3,678,748,513,...,1.65,1.25,297.0,5.0,3.0,4.0,2.0,0.19,1.44e-03,06001500001021
3,4,60150001022,6.02e+07,California,CA,2055,560.3,781,883,575,...,1.40,1.30,168.0,5.0,0.0,4.0,1.0,0.17,9.59e-04,06001500001022
4,5,60150001041,6.02e+07,California,CA,1922,2451.2,762,847,465,...,0.57,0.78,16.0,3.0,0.0,2.0,0.0,0.06,2.20e-04,06001500001041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215747,215748,330079503001,3.30e+08,New Hampshire,NH,645,1.9,313,1079,263,...,0.98,0.13,11844.0,2.0,1.0,0.0,1.0,2.82,9.98e-02,33000709503001
215748,215749,230299557001,2.30e+08,Maine,ME,1271,348.2,659,1090,445,...,1.07,0.88,129.0,3.0,2.0,1.0,1.0,0.36,9.93e-04,23002909557001
215749,215750,230299559001,2.30e+08,Maine,ME,922,13.8,424,603,339,...,1.84,0.44,4381.0,3.0,2.0,0.0,2.0,1.39,1.98e-02,23002909559001
215750,215751,230299559002,2.30e+08,Maine,ME,846,30.9,369,551,332,...,1.79,0.52,1741.0,3.0,2.0,0.0,2.0,1.02,8.15e-03,23002909559002


In [127]:
# Left-join the WHP table onto the ACS frame by GISJOIN: every ACS row is kept,
# and rows without a WHP match get nulls in the WHP columns.
ACS5_2005_2019_WEDTd_WHP = ACS5_2005_2019_WESTd.merge(
    whp,
    how="left",
    on="GISJOIN",
)

In [None]:
##about 249 block groups missing a WHP index

In [138]:
# Rows per state whose MEDIAN value (a column brought in from the WHP table)
# is null after the merge.
missing = (
    ACS5_2005_2019_WEDTd_WHP
    .loc[lambda acs_whp: acs_whp["MEDIAN"].isna()]
    .groupby("STATE")
    .size()
)
missing

STATE
California      233
Colorado          9
South Dakota      4
Texas             3
dtype: int64

In [139]:
# Total number of rows per state in the merged ACS + WHP frame.
rows_by_state = ACS5_2005_2019_WEDTd_WHP.groupby("STATE")
number_states = rows_by_state.size()
number_states

STATE
Arizona          5226
California      44456
Colorado         6537
Idaho            1918
Kansas           4719
Montana          1672
Nebraska         3273
Nevada           2103
New Mexico       2617
North Dakota     1234
Oklahoma         5506
Oregon           5904
South Dakota     1139
Texas           29013
Utah             2884
Washington      10640
Wyoming           905
dtype: int64

In [140]:
# Write the merged ACS + WHP analysis frame to disk.
# NOTE(review): absolute, machine-specific path; the row index is written too
# (no index=False), which presumably yields an "Unnamed: 0" column on reload.
ACS5_2005_2019_WEDTd_WHP.to_csv(r"C:\Users\sheld\Documents\Thesis_Data_Full_United_States\merged_data_with_block_groups\ACS5_2005_2019_FULL_DATAFRAME_FOR_ANALYSIS_no_geo_WHP.csv")