In [1]:
import numpy as np
import pandas as pd
from dplython import *
#import plotnine

# Calculate Low-Income Thresholds

In [2]:
# PUMA code ranges (first and last code, both inclusive) for Alameda,
# Contra Costa, and San Francisco counties, in that order
alameda_puma, cc_puma, sf_puma = (
    np.arange(first_code, last_code + 1)
    for first_code, last_code in ((101, 110), (1301, 1309), (7501, 7507))
)

In [3]:
def extend_low_income(thres, upper_N=16):
    """Append thresholds for household sizes 9 up to (but excluding) upper_N.

    Each added entry equals the last listed threshold (thres[-1]) plus 8% of
    the fourth entry (thres[3]) for every person beyond eight.

    Parameters
    ----------
    thres : array-like of numbers
        Thresholds indexed by household size minus one. The callers in this
        notebook pass eight entries (sizes 1-8).
    upper_N : int, optional
        Exclusive upper bound on the household sizes to add; the default of
        16 adds sizes 9 through 15.

    Returns
    -------
    numpy.ndarray
        The input thresholds followed by the added entries.
    """
    four_person_limit = thres[3]
    last_listed_limit = thres[-1]
    # 1, 2, 3, ... = number of persons beyond eight for each added size
    persons_beyond_eight = np.arange(9, upper_N) - 8
    added_limits = persons_beyond_eight * four_person_limit * 0.08 + last_listed_limit
    return np.append(thres, added_limits)

In [4]:
# Low-income thresholds per number of persons in the household (sizes 1-8)
# for Alameda, Contra Costa, and San Francisco, each extended on the spot to
# households of size >8 via extend_low_income.
# NOTE(review): the source and year of these figures are not recorded here -
# confirm they match the vintage of the PUMS file being analysed.
alameda_low_income = extend_low_income(
    np.array([62750, 71700, 80650, 89600, 96800, 103950, 111150, 118300])
)
cc_low_income = extend_low_income(
    np.array([62750, 71700, 80650, 89600, 96800, 103950, 111150, 118300])
)
sf_low_income = extend_low_income(
    np.array([82200, 93950, 105700, 117400, 126800, 136200, 145600, 155000])
)
# collected in the order alameda, contra costa, san francisco
low_income_thresholds = [alameda_low_income, cc_low_income, sf_low_income]

# Consolidate Low-Income Tables

In [5]:
# Load the household PUMS records from the CSV file (path is relative to the
# notebook's working directory)
hca = pd.read_csv(filepath_or_buffer='ss16hca.csv')

In [6]:
# Split the household records by county: keep the rows whose PUMA code is
# listed for that county
alameda, cc, sf = (
    hca.loc[hca.PUMA.isin(county_pumas)]
    for county_pumas in (alameda_puma, cc_puma, sf_puma)
)
# county tables in the order alameda, contra costa, san francisco
tables = [alameda, cc, sf]

In [7]:
# For each county, keep the rows whose FINCP is strictly below the low-income
# threshold looked up by the row's NP value.
# use FINCP or HINCP?
# NOTE(review): FINCP is compared against a threshold selected by NP, while
# the data-completeness cell pairs FINCP with NPF - confirm NP is the intended
# size column. An NP above the length of the extended threshold array would
# raise IndexError.
low_income_tables = []
for table, low_income_threshold in zip(tables, low_income_thresholds):
    # remove NaN household incomes
    has_income = table.FINCP.notna()
    table = table.loc[has_income]
    # per-row threshold: NP of 1 maps to index 0, NP of 2 to index 1, ...
    row_thresholds = low_income_threshold[table.NP - 1]
    is_low_income = table.FINCP < row_thresholds
    low_income_tables.append(table.loc[is_low_income])

In [8]:
def calculate_percentages_from_table(table, table_variable, var_value, weighted=True):
    """Return the percentage of `table` where `table_variable` equals `var_value`.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain `table_variable`; must also contain a WGTP column when
        `weighted` is true.
    table_variable : str
        Name of the column to test.
    var_value : scalar
        Value a row must hold in `table_variable` to be counted.
    weighted : bool, optional
        If true (default), the percentage is a ratio of summed WGTP values;
        otherwise it is a ratio of row counts.

    Returns
    -------
    float
        100 * (matching rows) / (all rows), measured in WGTP or in rows.
        Rows whose `table_variable` is missing never match but still count
        toward the denominator.
    """
    if not weighted:
        row_count = len(table)
        matching_row_count = len(table.loc[table[table_variable] == var_value])
        return 100 * matching_row_count / row_count

    weight_of_all_rows = table.WGTP.sum()
    weight_of_matches = table.loc[table[table_variable] == var_value].WGTP.sum()
    return 100 * weight_of_matches / weight_of_all_rows

# Brief note on data completeness

In [9]:
dpl_sf = DplyFrame(sf)

# Count the San Francisco rows with no missing value across the selected
# columns, once using the family columns (FINCP, NPF) and once using the
# household columns (HINCP, NP)
family_complete_rows = dpl_sf >> select(X.FINCP, X.NPF, X.LAPTOP, X.ACCESS) >> X.dropna() >> nrow()
household_complete_rows = dpl_sf >> select(X.HINCP, X.NP, X.LAPTOP, X.ACCESS) >> X.dropna() >> nrow()

print("There are", family_complete_rows, "complete rows using family income/number of individuals")
print("There are", household_complete_rows, "complete rows using household income/number of individuals")

There are 1684 complete rows using family income/number of individuals
There are 3351 complete rows using household income/number of individuals


# Make some dplython dfs for exploration

In [10]:
# Wrap each county's low-income table in a DplyFrame; low_income_tables is
# ordered alameda (0), contra costa (1), san francisco (2)
dpl_ala_li, dpl_cc_li, dpl_sf_li = (
    DplyFrame(low_income_tables[county_idx]) for county_idx in (0, 1, 2)
)

# Internet 

#### Any access - ACCESS var

In [11]:
# Weighted percentage of low-income households with ACCESS == 3, computed in
# the order Alameda, San Francisco, Contra Costa (low_income_tables 0, 2, 1)
alameda_internet, sf_internet, cc_internet = (
    calculate_percentages_from_table(low_income_tables[county_idx], "ACCESS", 3)
    for county_idx in (0, 2, 1)
)
print('In Alameda County, %0.2f%% of low income households lack access to the Internet at home.' % alameda_internet)
print('In San Francisco County, %0.2f%% of low income households lack access to the Internet at home.' % sf_internet)
print('In Contra Costa County, %0.2f%% of low income households lack access to the Internet at home.' % cc_internet)

In Alameda County, 13.44% of low income households lack access to the Internet at home.
In San Francisco County, 10.73% of low income households lack access to the Internet at home.
In Contra Costa County, 8.65% of low income households lack access to the Internet at home.


#### HISPEED - "Broadband (high speed) Internet service such as cable, fiber optic, or DSL service" - from ACS data dictionary https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMSDataDict16.pdf

In [12]:
# Weighted percentage of low-income households with HISPEED == 2, computed in
# the order Alameda, San Francisco, Contra Costa (low_income_tables 0, 2, 1).
# NOTE(review): households with no Internet access at all may carry a blank
# HISPEED rather than 2, in which case they are not counted here - confirm
# against the data dictionary before reading this as "lack high-speed access".
alameda_internet, sf_internet, cc_internet = (
    calculate_percentages_from_table(low_income_tables[county_idx], "HISPEED", 2)
    for county_idx in (0, 2, 1)
)
print('In Alameda County, %0.2f%% of low income households lack access to high-speed non-data plan internet at home.' % alameda_internet)
print('In San Francisco County, %0.2f%% of low income households lack access to high-speed non-data plan Internet at home.' % sf_internet)
print('In Contra Costa County, %0.2f%% of low income households lack access to high-speed non-data plan Internet at home.' % cc_internet)

In Alameda County, 14.49% of low income households lack access to high-speed non-data plan internet at home.
In San Francisco County, 12.73% of low income households lack access to high-speed non-data plan Internet at home.
In Contra Costa County, 17.01% of low income households lack access to high-speed non-data plan Internet at home.


# Computer (laptop/desktop - from LAPTOP variable)

In [13]:
# Weighted percentage of low-income households with LAPTOP == 2, computed in
# the order Alameda, San Francisco, Contra Costa (low_income_tables 0, 2, 1)
alameda_computer, sf_computer, cc_computer = (
    calculate_percentages_from_table(low_income_tables[county_idx], "LAPTOP", 2)
    for county_idx in (0, 2, 1)
)
print('In Alameda County, %0.2f%% of low income households lack access to a computer at home.' % alameda_computer)
print('In San Francisco County, %0.2f%% of low income households lack access to a computer at home.' % sf_computer)
print('In Contra Costa County, %0.2f%% of low income households lack access to a computer at home.' % cc_computer)

In Alameda County, 23.21% of low income households lack access to a computer at home.
In San Francisco County, 18.10% of low income households lack access to a computer at home.
In Contra Costa County, 22.66% of low income households lack access to a computer at home.


# Combined

In [88]:

#select(X.WGTP, X.LAPTOP, X.SMARTPHONE, X.TABLET, X.COMPOTHX, X.DIALUP, X.ACCESS, X.OTHSVCEX, X.SATELLITE, X.BROADBND)

def calc_percent_general(df, subset_conditional, outer_conditional = "", weighted = True):
    """Percentage of the `outer_conditional` rows that also meet `subset_conditional`.

    Both conditions are strings holding dplython expressions over `X`
    (e.g. "(X.ACCESS == 1) & (X.HISPEED == 2)"). Each one is spliced into a
    `sift(...)` call and run through eval(), so only trusted, hand-written
    strings may be passed.

    Parameters
    ----------
    df : DplyFrame
        Table to filter; must contain a WGTP column when `weighted` is true.
    subset_conditional : str
        Condition selecting the numerator rows. It is applied to `df`
        directly, not to the outer selection, so it has to repeat any outer
        restriction it should respect.
    outer_conditional : str, optional
        Condition selecting the denominator rows. The default "" evaluates
        `sift()` with no condition - presumably keeping every row, which the
        callers that omit this argument rely on.
    weighted : bool, optional
        If true (default), compare summed WGTP values; otherwise compare row
        counts.

    Returns
    -------
    float
        100 * numerator / denominator.
    """
    # Turn the two condition strings into dplython filter verbs.
    keep_subset = eval("sift(" + subset_conditional + ")")
    keep_outer = eval("sift(" + outer_conditional + ")")

    if not weighted:
        subset_total = df >> keep_subset >> nrow()
        total = df >> keep_outer >> nrow()
        return 100*subset_total/total

    # summarize() yields a one-row frame; read the single summed weight from it
    wgtp_subset_sum = (df >> keep_subset >> summarize(wgtp_sum = X.WGTP.sum()))["wgtp_sum"][0]
    wgtp_total_sum = (df >> keep_outer >> summarize(wgtp_sum = X.WGTP.sum()))["wgtp_sum"][0]
    return 100*wgtp_subset_sum/wgtp_total_sum
        

### Some notes below on the implications of the different variables

1) Respondents with internet access (ACCESS == 1) but no HISPEED/SATELLITE/DIALUP service overwhelmingly have a data plan (BROADBND == 1). The exceptions are those for whom OTHSVCEX == 1 (some other, unspecified form of internet access, presumably a public library or similar).

In [92]:

# Denominator: households with ACCESS == 1 and none of HISPEED, SATELLITE or
# DIALUP. Numerator: the same households that also have BROADBND == 1.
outer_conditional_broadbnd = "(X.ACCESS == 1) & (X.HISPEED == 2) & (X.SATELLITE == 2) & (X.DIALUP == 2)"
subset_conditional_broadbnd = outer_conditional_broadbnd + " & (X.BROADBND == 1)"

for message, frame in (
    ('In Alameda County, %0.2f%% of low income households who have internet access but lack conventional access (e.g. through ISP subscription) have a dataplan.', dpl_ala_li),
    ('In San Francisco County, %0.2f%% of low income households who have internet access but lack conventional access (e.g. through ISP subscription) have a dataplan.', dpl_sf_li),
    ('In Contra Costa County, %0.2f%% of low income households who have internet access but lack conventional access (e.g. through ISP subscription) have a dataplan', dpl_cc_li),
):
    print(message % calc_percent_general(frame, subset_conditional_broadbnd, outer_conditional = outer_conditional_broadbnd))

In Alameda County, 99.09% of low income households who have internet access but lack conventional access (e.g. through ISP subscription) have a dataplan.
In San Francisco County, 98.73% of low income households who have internet access but lack conventional access (e.g. through ISP subscription) have a dataplan.
In Contra Costa County, 100.00% of low income households who have internet access but lack conventional access (e.g. through ISP subscription) have a dataplan


In [98]:
# Households with ACCESS == 3, or with none of TABLET, LAPTOP and COMPOTHX
small_screen_internet_conditional = (
    "(X.ACCESS == 3) | "
    "((X.TABLET == 2) & (X.LAPTOP == 2) & (X.COMPOTHX == 2))"
)

# weighted percentages (share of summed WGTP)
for message, frame in (
    ('In Alameda County, %0.2f%% (weighted) of low income households either lack internet access or have at most a small screen device with which to use it (e.g. smartphone).', dpl_ala_li),
    ('In San Francisco County, %0.2f%% (weighted) of low income households either lack internet access or have at most a small screen device with which to use it (e.g. smartphone).', dpl_sf_li),
    ('In Contra Costa County, %0.2f%% (weighted) of low income households either lack internet access or have at most a small screen device with which to use it (e.g. smartphone).', dpl_cc_li),
):
    print(message % calc_percent_general(frame, small_screen_internet_conditional))

# unweighted percentages (share of rows)
for message, frame in (
    ('In Alameda County, %0.2f%% of low income households either lack internet access or have at most a small screen device with which to use it (e.g. smartphone).', dpl_ala_li),
    ('In San Francisco County, %0.2f%% of low income households either lack internet access or have at most a small screen device with which to use it (e.g. smartphone).', dpl_sf_li),
    ('In Contra Costa County, %0.2f%% of low income households either lack internet access or have at most a small screen device with which to use it (e.g. smartphone).', dpl_cc_li),
):
    print(message % calc_percent_general(frame, small_screen_internet_conditional, weighted = False))

In Alameda County, 20.52% (weighted) of low income households either lack internet access or have at most a small screen device with which to use it (e.g. smartphone).
In San Francisco County, 17.77% (weighted) of low income households either lack internet access or have at most a small screen device with which to use it (e.g. smartphone).
In Contra Costa County, 17.92% (weighted) of low income households either lack internet access or have at most a small screen device with which to use it (e.g. smartphone).
In Alameda County, 17.23% of low income households either lack internet access or have at most a small screen device with which to use it (e.g. smartphone).
In San Francisco County, 15.44% of low income households either lack internet access or have at most a small screen device with which to use it (e.g. smartphone).
In Contra Costa County, 14.70% of low income households either lack internet access or have at most a small screen device with which to use it (e.g. smartphone).


In [99]:
# Households with ACCESS == 3, or with none of HISPEED, SATELLITE and DIALUP,
# or with none of TABLET, LAPTOP and COMPOTHX
small_screen_or_only_dataplan_internet_conditional = (
    "((X.ACCESS == 3) | ((X.HISPEED == 2) & (X.SATELLITE == 2) & (X.DIALUP == 2)))"
    " | ((X.TABLET == 2) & (X.LAPTOP == 2) & (X.COMPOTHX == 2))"
)

# weighted percentages (share of summed WGTP)
for message, frame in (
    ('In Alameda County, %0.2f%% (weighted) of low income households either lack internet access or only have access through a dataplan or have at most a small screen device with which to use it (e.g. smartphone).', dpl_ala_li),
    ('In San Francisco County, %0.2f%% (weighted) of low income households either lack internet access or only have access through a dataplan or have at most a small screen device with which to use it (e.g. smartphone).', dpl_sf_li),
    ('In Contra Costa County, %0.2f%% (weighted) of low income households either lack internet access or only have access through a dataplan or have at most a small screen device with which to use it (e.g. smartphone).', dpl_cc_li),
):
    print(message % calc_percent_general(frame, small_screen_or_only_dataplan_internet_conditional))

# unweighted percentages (share of rows)
for message, frame in (
    ('In Alameda County, %0.2f%% of low income households either lack internet access or only have access through a dataplan or have at most a small screen device with which to use it (e.g. smartphone).', dpl_ala_li),
    ('In San Francisco County, %0.2f%% of low income households either lack internet access or only have access through a dataplan or have at most a small screen device with which to use it (e.g. smartphone).', dpl_sf_li),
    ('In Contra Costa County, %0.2f%% of low income households either lack internet access or only have access through a dataplan or have at most a small screen device with which to use it (e.g. smartphone).', dpl_cc_li),
):
    print(message % calc_percent_general(frame, small_screen_or_only_dataplan_internet_conditional, weighted = False))

In Alameda County, 27.66% (weighted) of low income households either lack internet access or only have access through a dataplan or have at most a small screen device with which to use it (e.g. smartphone).
In San Francisco County, 24.47% (weighted) of low income households either lack internet access or only have access through a dataplan or have at most a small screen device with which to use it (e.g. smartphone).
In Contra Costa County, 25.56% (weighted) of low income households either lack internet access or only have access through a dataplan or have at most a small screen device with which to use it (e.g. smartphone).
In Alameda County, 25.07% of low income households either lack internet access or only have access through a dataplan or have at most a small screen device with which to use it (e.g. smartphone).
In San Francisco County, 22.42% of low income households either lack internet access or only have access through a dataplan or have at most a small screen device with which

In [14]:
# Ratio of summed WGTP in Contra Costa's low-income table to summed WGTP in
# the full Contra Costa table
cc_low_income_weight = low_income_tables[1].WGTP.sum()
cc_low_income_weight / cc.WGTP.sum()

0.26992994396490871

In [15]:
# Ratio of summed WGTP in each county's low-income table to summed WGTP in
# that county's full table, printed for alameda, contra costa, san francisco
for county_idx, county_table in enumerate((alameda, cc, sf)):
    print(low_income_tables[county_idx].WGTP.sum() / county_table.WGTP.sum())

0.248337723832
0.269929943965
0.201645524829


| | Alameda | San Francisco | Contra Costa |
|---|---------|----|--------------|
|<b>Lack Computer Access</b> | 19.44% | 16.11% | 17.94% | 
|<b>Lack Computer Access (Weighted)</b> | 23.21% | 18.10% | 22.66% |
|<b>Lack Internet Access</b> | 11.60% | 10.34% | 8.13% | 
|<b>Lack Internet Access (Weighted)</b> | 13.44% | 10.73% | 8.65% |  
|<b>Lack Internet Access OR only have smartphone (no moderate/large screen device)</b> | 17.23% | 15.44% | 14.70% | 
|<b>Lack Internet Access OR only have smartphone (no moderate/large screen device) (Weighted)</b> | 20.52% | 17.77% | 17.92% | 
|<b>Lack internet Access OR only have access through a dataplan OR have at most a small screen device with which to use it</b> | 25.07% | 22.42% | 22.00% | 
|<b>Lack internet Access OR only have access through a dataplan OR have at most a small screen device with which to use it (Weighted)</b> | 27.66% | 24.47% | 25.56% | 
