# Los Angeles, CA  Redlining analysis

In [None]:
# Load libraries
import pandas as pd
import numpy as np
import altair as alt
import geopandas as gpd
import pyspark
import censusdata
from pyspark.sql import SparkSession 
from pyspark.sql.functions  import col, when, lit
from pyspark.sql import functions as f

from vega_datasets import data
# Lift Altair's default row cap so full-size DataFrames can be passed to charts.
alt.data_transformers.disable_max_rows()

from io import StringIO
# Use the FiveThirtyEight look for every chart rendered in this notebook.
alt.themes.enable("fivethirtyeight") # visualization theme


### Loading county codes and geolocation.

We would like to explore if redlining was more prevalent in certain areas of the country. For that we will use two more datasets for plotting.

1. https://github.com/btskinner/spatial/blob/master/data/county_centers.csv

2. https://github.com/kjhealy/fips-codes/blob/master/state_and_county_fips_master.csv

In [None]:
# Name lookup keyed by FIPS code. Codes are read as text and left-padded to
# five characters so leading zeros are not lost.
states = pd.read_csv('state_fips.csv', dtype={'fips': str})
states = states.assign(fips=states['fips'].str.zfill(5))

# Per-FIPS coordinates (clon00 / clat00 -- presumably county-centre
# longitude / latitude), with the matching name attached where one exists.
county_centers = pd.read_csv('fipsnames-20221011-151647.csv', dtype={'fips': str})
fips = county_centers[['fips', 'clon00', 'clat00']].merge(
    states[['fips', 'name']],
    on='fips',
    how='left',
)
fips.head(3)

### Load dataset HOLC 

Calculate Historical Redlining Score (HRS) by calculating the grade weights. 

1. Calculate the percentage of weighted area. Ex. area_A divided by area_rated.

2. Multiply by the HOLC grade factor: A=1, B=2, C=3, D=4

3. Final score. The level of redlining goes from 1 to 4, with 1 being low redlining and 4 high.

This methodology was obtained from https://ncrc.org/redlining-score/

In [None]:
# HOLC grades per 2020 census tract; the tract id is kept as text.
holc_rated = pd.read_csv('HOLC_2020_census_tracts.csv', dtype={'geoid20': str})

# NCRC grade factors used for the Historic Redlining Score (HRS).
grade_weights = {'A': 1, 'B': 2, 'C': 3, 'D': 4}

# Share of the rated area that falls in each grade (columns A, B, C, D).
for grade in grade_weights:
    holc_rated[grade] = holc_rated[f'area_{grade}'] / holc_rated['area_rated']

# Weighted shares (columns a, b, c, d) and their row-wise total, the HRS.
for grade, weight in grade_weights.items():
    holc_rated[grade.lower()] = holc_rated[grade] * weight
weighted_cols = [grade.lower() for grade in grade_weights]
# sum() skips NaN by default, so a row whose shares are all NaN gets HRS 0.
holc_rated['HRS'] = holc_rated[weighted_cols].sum(axis=1)

# The first five characters of the tract id are the county code (fips).
holc_rated['fips'] = holc_rated['geoid20'].str[:5]
holc_rated = holc_rated.rename(columns={'geoid20': 'GEOID'})
holc_rated.head(3)

We will join the newly created fips file with the HOLC data to plot counties and their percentage of redlining.

In [None]:
# County-level means of the grade shares and HRS, one row per fips code.
county_cols = ['fips', 'A', 'B', 'C', 'D', 'HRS']
holc_fips = holc_rated[county_cols].groupby('fips').mean().reset_index()

# Integer version of the county code, then attach coordinates and name.
holc_fips['id'] = holc_fips['fips'].astype(int)
holc_fips = holc_fips.merge(fips, how='left', on='fips')

# Long form for plotting: one row per (county, grade) pair.
holc_fipsL = holc_fips.melt(
    id_vars=['fips', 'id', 'name', 'clon00', 'clat00', 'HRS'],
    value_vars=['A', 'B', 'C', 'D'],
    ignore_index=False,
)

holc_fips.head(3)

### Filter Los Angeles County (fips 06037)

In [None]:
# Keep only the Los Angeles County tracts (county code 06037).
# The rename is chained rather than done in place: an in-place rename on a
# `.loc` slice raises SettingWithCopyWarning, while the chained call returns a
# new frame that LA_holc owns. It is a no-op when the tract id column is
# already called GEOID and is kept only for safety.
is_la_county = holc_rated.fips.str.startswith('06037')
LA_holc = holc_rated.loc[is_la_county].rename(columns={'geoid20': 'GEOID'})
LA_holc.head(3)

### Plotting redlining rate by census tract

We will use a GeoJSON file. The shapefile was downloaded from https://www.census.gov/cgi-bin/geo/shapefiles/index.php, and we used the OGRE shape converter (https://ogre.adc4gis.com/) to transform the shapefile to GeoJSON.

In [None]:
# Load the tract polygons and build the full tract GEOID by prefixing the
# county code 06037 to the CT20 tract number.
# NOTE(review): the file name reads "06307" while the county code is 06037 --
# confirm this is the intended file.
cali = gpd.read_file('06307tract_ogre.json').assign(
    GEOID=lambda tracts: '06037' + tracts['CT20']
)
cali.head(3)

In [None]:
# Attach each tract's HRS to its polygon; tracts without a HOLC grade keep a
# missing HRS because this is a left join on the shapes.
# LA_holc is used instead of holc_rated because a later cell re-binds
# holc_rated to a different frame, which would silently change this map when
# the cell is re-run. Every GEOID in `cali` starts with 06037, so the
# LA-only subset yields the same matches.
cali_plus_geo = pd.merge(
    cali[['GEOID', 'geometry']],
    LA_holc[['GEOID', 'HRS']],
    how='left',
    on='GEOID',
)
# Preview a few rows instead of dumping the whole GeoDataFrame.
cali_plus_geo.head(3)

In [None]:
# Choropleth of the census-tract outlines, coloured by HRS with the
# red-yellow-green scheme reversed.
hrs_scale = alt.Scale(scheme="redyellowgreen", reverse=True)
neighborhoods = (
    alt.Chart(cali_plus_geo)
    .mark_geoshape(stroke='lightgray', strokeWidth=0.5)
    .encode(
        color=alt.Color('HRS:Q', scale=hrs_scale),
        tooltip=[alt.Tooltip('HRS:Q')],
    )
    .properties(
        width=500,
        height=500,
        title=alt.TitleParams(text='Los Angeles County HRS Score by Census Tract'),
    )
)
# White background for the rendered chart.
neighborhoods.configure(background='#FFFFFF')

### Census Data

Next, we will compare the demographics of ungraded and graded areas. For this we will use census data available through the Python library CensusData. The library documentation can be found at https://pypi.org/project/CensusData/.

In [None]:
# Download 2015 ACS 5-year estimates for every block group in state 06,
# county 037, and give the table codes readable column names.
acs_columns = {
    'B02001_001E': 'population_total',
    'B02001_002E': 'white_pop',
    'B25081_001E': 'total_houses',
    'B25081_008E': 'houses_wo_mortgage',
    'B25002_001E': 'occupancy_total',
    'B25002_002E': 'occupied',
    'B25002_003E': 'Vacant',
    'B11001_002E': 'families',
}
la_block_groups = censusdata.censusgeo(
    [('state', '06'), ('county', '037'), ('block group', '*')]
)
county_pop = (
    censusdata.download('acs5', 2015, la_block_groups, list(acs_columns))
    .rename(columns=acs_columns)
    # Move the geography label out of the index into a regular 'index' column.
    .reset_index()
)
county_pop

Since the census data does not contain the five-digit FIPS code for each county, we will perform some transformations.

In [None]:
# Build the county code (fips) and the tract-level GEOID from the geography
# label stored in the 'index' column.
# The label is expected to end with the coded hierarchy, e.g.
# "... state:06> county:037> tract:101110> block group:1" (format taken from
# the CensusData documentation -- verify against the displayed frame).
# The 6-digit tract code is read from that coded part rather than from the
# display name ("Census Tract 1011.10"). Display names without a decimal
# suffix did not match the old name-based pattern and ended up with a missing
# GEOID. Capture groups also remove the need for follow-up str.replace calls,
# whose "." handling depends on the pandas version's regex default.
geo_label = county_pop['index'].astype(str)
county_pop['state'] = geo_label.str.extract(r'state:(\d{2})', expand=False)
county_pop['county'] = geo_label.str.extract(r'county:(\d{3})', expand=False)
county_pop['tract'] = geo_label.str.extract(r'tract:(\d{6})', expand=False)

county_pop['fips'] = county_pop['state'] + county_pop['county']
county_pop['GEOID'] = county_pop['fips'] + county_pop['tract']  # census tract number

# The helper columns are no longer needed; 'tract' is kept, as before.
county_pop = county_pop.drop(columns=['state', 'county'])
county_pop

We will calculate the vacancy, mortgage and minority percentages

In [None]:
# Vacancy rate: vacant units over all housing units. Both figures come from
# the same ACS occupancy table (B25002_003E / B25002_001E). The previous
# denominator, total_houses (B25081_001E), comes from the mortgage-status
# table and counts a different set of units, which inflated the rate.
county_pop['vacant_perc'] = county_pop['Vacant'] / county_pop['occupancy_total']

# Share of units in the mortgage-status table that carry a mortgage.
county_pop['mortgage_perc'] = 1 - (county_pop['houses_wo_mortgage'] / county_pop['total_houses'])

# Share of the population that is not counted in white_pop.
county_pop['minority_perc'] = 1 - (county_pop['white_pop'] / county_pop['population_total'])

# NOTE(review): families are divided by the mortgage-status total here; if the
# intent is "family share of households", 'occupied' may be the better
# denominator -- left unchanged pending confirmation.
county_pop['Families_perc'] = county_pop['families'] / county_pop['total_houses']
county_pop

### Joining redlining and census data 

We will join the HRS with current census data, to see the demographic composition and HRS grading together.  

Minority percentage steadily increases from ungraded to grade D. We can also see that the percentage of vacant units doubles between grade A and grade D areas. Mortgage percentage is similar between all areas.

In [None]:
# Merge HOLC scores onto the census rows. The right join keeps every census
# row, including those with no HOLC grade.
# NOTE: this re-binds holc_rated (until now the national HOLC table) to the
# Los Angeles merged frame; restart and run from the top before re-running
# earlier cells that read holc_rated.
# NOTE(review): the HOLC ids come from a 2020-tract column (geoid20) while the
# census download is the 2015 ACS -- confirm the tract ids line up.
census_cols = ['GEOID', 'population_total', 'total_houses', 'families',
    'vacant_perc', 'mortgage_perc', 'minority_perc']
holc_rated = pd.merge(LA_holc, county_pop[census_cols], how='right', on='GEOID')

# Rows without a HOLC match get a sentinel score so they land in the
# 'Ungraded' bin.
holc_rated['HRS'] = holc_rated['HRS'].fillna(0.1)

# pd.cut bins are right-inclusive. The first edge sits at 0.5 rather than 1 so
# that a tract graded entirely 'A' (HRS exactly 1.0) is labelled 'A' instead of
# falling into the (0, 1] 'Ungraded' bin with the 0.1 sentinel.
holc_rated['grade'] = pd.cut(holc_rated['HRS'], bins=[0, 0.5, 1.75, 2.49, 3.3, 4],
    labels=['Ungraded', 'A', 'B', 'C', 'D'])

# Drop rows whose census figures are missing or infinite (division by zero in
# the percentage columns) or that fell outside every grade bin. Only the
# census columns are checked: unmatched rows are NaN in every HOLC-only column,
# so testing all columns would discard every ungraded row.
census_values = holc_rated[census_cols[1:]].replace([np.inf, -np.inf], np.nan)
usable_rows = census_values.notna().all(axis=1) & holc_rated['grade'].notna()
holc_rated = holc_rated[usable_rows]
holc_rated

Los Angeles is one of the largest counties in the nation, so we will explore whether its conditions differed significantly from the rest of the country. Census tract demographics by HRS grade show that 83% of the population lives in census tracts graded 'C' or 'D', and the number of families exceeds the number of houses, which indicates that more than one family may live in a house. Also, the mean percentage of vacant properties increases by over 27% when we compare grade 'B' to grades 'C' and 'D'.
     So we wonder: if there are vacant properties, why do some Los Angeles families decide to live in the same house? Is it possible that they are unable to get mortgages? We decided to use a logistic regression to see if the probability of approval differs based on race in Los Angeles neighborhoods.


In [None]:
# Aggregate values by HRS grade.
# Each entry is (source column, aggregation, output column label).
grade_summary_spec = [
    ('population_total', 'sum', 'population_total'),
    ('total_houses', 'sum', 'total_houses'),
    ('families', 'sum', 'families'),
    ('mortgage_perc', 'mean', 'mean_%mortage'),
    ('minority_perc', 'mean', 'mean-%minority'),
    ('vacant_perc', 'mean', 'mean_%vacant'),
    ('HRS', 'mean', 'mean_HRS'),
]
names = {source: label for source, _, label in grade_summary_spec}
aggregations = {source: how for source, how, _ in grade_summary_spec}

# Counts are summed and rates are averaged within each grade.
holc_ratedagg = holc_rated.groupby('grade').agg(aggregations).rename(columns=names)
holc_ratedagg