This notebook adds the demographic data table to the Postgres database. The table has 4 columns: "censusblockgroupid", "categorytype", "groupname", and "total".
The categorytype column stores the category: "Race", "Origin", "Age and Sex", "Income", or "Vehicle Availability".
The groupname column stores the groups each category is subdivided into. For example, Race is subdivided into White, Black or African American, Asian, etc.
The total column stores the population or the count of households (whichever is applicable) for each category and its associated group.

In [1]:
import os

import numpy as np
import pandas as pd
import psycopg2
import sqlalchemy

## processing the data

In [2]:
# Parallel accumulators for the long-format table: position i of each list
# describes one (block group, category, group, count) row.
cbg_id, category_type, group_name, total = [], [], [], []

In [3]:
# Load the five census tables used below (age/sex, race, origin, income,
# vehicle availability), one DataFrame per CSV.
# NOTE(review): the files are parsed with pandas' default dtype inference; if
# census_block_group carries leading zeros in the CSVs they are dropped when
# the column is read as an integer -- TODO confirm whether that matters.
cbg_age, cbg_race, cbg_origin, cbg_income, cbg_vehicle = (
    pd.read_csv(csv_path)
    for csv_path in (
        "safegraph_open_census_data_2019/data/cbg_b01.csv",
        "safegraph_open_census_data_2019/data/cbg_b02.csv",
        "safegraph_open_census_data_2019/data/cbg_b03.csv",
        "safegraph_open_census_data_2019/data/cbg_b19.csv",
        "safegraph_open_census_data_2019/data/cbg_b25.csv",
    )
)

In [4]:
#cbg_age.info()
#cbg_race.info()
#cbg_origin.info()
#cbg_income.info()
#cbg_vehicle.info()

In [5]:
# Reference lookup: table_id -> male age-bracket label. Displayed so the
# column ids used in the next cell can be checked against their meaning.
tableid2agebracket_m = (
    pd.read_csv("safegraph_open_census_data_2019/metadata/table_id_2_age_bracket_male.csv")
    .set_index("table_id")
    .squeeze()
    .to_dict()
)
tableid2agebracket_m

{'B01001e10': '22 to 24 years',
 'B01001e11': '25 to 29 years',
 'B01001e12': '30 to 34 years',
 'B01001e13': '35 to 39 years',
 'B01001e14': '40 to 44 years',
 'B01001e15': '45 to 49 years',
 'B01001e16': '50 to 54 years',
 'B01001e17': '55 to 59 years',
 'B01001e18': '60 and 61 years',
 'B01001e19': '62 to 64 years',
 'B01001e2': 'Total',
 'B01001e20': '65 and 66 years',
 'B01001e21': '67 to 69 years',
 'B01001e22': '70 to 74 years',
 'B01001e23': '75 to 79 years',
 'B01001e24': '80 to 84 years',
 'B01001e25': '85 years and over',
 'B01001e3': 'Under 5 years',
 'B01001e4': '5 to 9 years',
 'B01001e5': '10 to 14 years',
 'B01001e6': '15 to 17 years',
 'B01001e7': '18 and 19 years',
 'B01001e8': '20 years',
 'B01001e9': '21 years'}

In [6]:
# Collapse the fine-grained male age columns into four coarse brackets.
# Per the metadata lookup above, the upper bound in each label is exclusive:
# "18 to 45" ends with the "40 to 44 years" column and "45 to 65" ends with
# the "62 to 64 years" column.
male_age_brackets = {
    "Under 18 years male": ["B01001e3", "B01001e4", "B01001e5", "B01001e6"],
    "18 to 45 years male": ["B01001e7", "B01001e8", "B01001e9", "B01001e10",
                            "B01001e11", "B01001e12", "B01001e13", "B01001e14"],
    "45 to 65 years male": ["B01001e15", "B01001e16", "B01001e17", "B01001e18",
                            "B01001e19"],
    "65 years and over male": ["B01001e20", "B01001e21", "B01001e22", "B01001e23",
                               "B01001e24", "B01001e25"],
}
for bracket_label, source_columns in male_age_brackets.items():
    # Add the columns left to right, exactly like a chained `a + b + c`.
    bracket_total = cbg_age[source_columns[0]]
    for source_column in source_columns[1:]:
        bracket_total = bracket_total + cbg_age[source_column]
    cbg_age[bracket_label] = bracket_total

In [7]:
# Append the male age brackets to the long-format accumulators. Each bracket
# contributes one full pass over the block groups, in the order listed, so
# the four lists stay aligned position by position.
male_age_groups = [
    "Under 18 years male",
    "18 to 45 years male",
    "45 to 65 years male",
    "65 years and over male",
]
age_row_count = len(cbg_age)

cbg_id = cbg_id + list(cbg_age["census_block_group"].values) * len(male_age_groups)
category_type = category_type + ["Age and Sex"] * (age_row_count * len(male_age_groups))
group_name = group_name + [label for label in male_age_groups for _ in range(age_row_count)]
total = total + [count for label in male_age_groups for count in cbg_age[label].values]

In [8]:
# Reference lookup: table_id -> female age-bracket label. Displayed so the
# column ids used in the next cell can be checked against their meaning.
tableid2agebracket_f = (
    pd.read_csv("safegraph_open_census_data_2019/metadata/table_id_2_age_bracket_female.csv")
    .set_index("table_id")
    .squeeze()
    .to_dict()
)
tableid2agebracket_f

{'B01001e26': 'Total',
 'B01001e27': 'Under 5 years',
 'B01001e28': '5 to 9 years',
 'B01001e29': '10 to 14 years',
 'B01001e30': '15 to 17 years',
 'B01001e31': '18 and 19 years',
 'B01001e32': '20 years',
 'B01001e33': '21 years',
 'B01001e34': '22 to 24 years',
 'B01001e35': '25 to 29 years',
 'B01001e36': '30 to 34 years',
 'B01001e37': '35 to 39 years',
 'B01001e38': '40 to 44 years',
 'B01001e39': '45 to 49 years',
 'B01001e40': '50 to 54 years',
 'B01001e41': '55 to 59 years',
 'B01001e42': '60 and 61 years',
 'B01001e43': '62 to 64 years',
 'B01001e44': '65 and 66 years',
 'B01001e45': '67 to 69 years',
 'B01001e46': '70 to 74 years',
 'B01001e47': '75 to 79 years',
 'B01001e48': '80 to 84 years',
 'B01001e49': '85 years and over'}

In [9]:
# Collapse the fine-grained female age columns into four coarse brackets.
# Per the metadata lookup above, the upper bound in each label is exclusive:
# "18 to 45" ends with the "40 to 44 years" column and "45 to 65" ends with
# the "62 to 64 years" column.
female_age_brackets = {
    "Under 18 years female": ["B01001e27", "B01001e28", "B01001e29", "B01001e30"],
    "18 to 45 years female": ["B01001e31", "B01001e32", "B01001e33", "B01001e34",
                              "B01001e35", "B01001e36", "B01001e37", "B01001e38"],
    "45 to 65 years female": ["B01001e39", "B01001e40", "B01001e41", "B01001e42",
                              "B01001e43"],
    "65 years and over female": ["B01001e44", "B01001e45", "B01001e46",
                                 "B01001e47", "B01001e48", "B01001e49"],
}
for bracket_label, source_columns in female_age_brackets.items():
    # Add the columns left to right, exactly like a chained `a + b + c`.
    bracket_total = cbg_age[source_columns[0]]
    for source_column in source_columns[1:]:
        bracket_total = bracket_total + cbg_age[source_column]
    cbg_age[bracket_label] = bracket_total

In [10]:
# Append the female age brackets to the long-format accumulators. Each
# bracket contributes one full pass over the block groups, in the order
# listed, so the four lists stay aligned position by position.
female_age_groups = [
    "Under 18 years female",
    "18 to 45 years female",
    "45 to 65 years female",
    "65 years and over female",
]
age_row_count = len(cbg_age)

cbg_id = cbg_id + list(cbg_age["census_block_group"].values) * len(female_age_groups)
category_type = category_type + ["Age and Sex"] * (age_row_count * len(female_age_groups))
group_name = group_name + [label for label in female_age_groups for _ in range(age_row_count)]
total = total + [count for label in female_age_groups for count in cbg_age[label].values]

In [11]:
# Reference lookup: table_id -> race label, displayed for the next cell.
tableid2race = (
    pd.read_csv("safegraph_open_census_data_2019/metadata/table_id_2_race.csv")
    .set_index("table_id")
    .squeeze()
    .to_dict()
)
tableid2race

{'B02001e2': 'White alone',
 'B02001e3': 'Black or African American alone',
 'B02001e4': 'American Indian and Alaska Native alone',
 'B02001e5': 'Asian alone',
 'B02001e6': 'Native Hawaiian and Other Pacific Islander alone',
 'B02001e7': 'Some other race alone',
 'B02001e8': 'Two or more races'}

In [12]:
# Copy each race column under a readable label (the metadata labels with the
# trailing " alone" dropped); no aggregation is needed for this table.
race_source_columns = {
    "White": "B02001e2",
    "Black or African American": "B02001e3",
    "American Indian and Alaska Native": "B02001e4",
    "Asian": "B02001e5",
    "Native Hawaiian and Other Pacific Islander": "B02001e6",
    "Some other race": "B02001e7",
    "Two or more races": "B02001e8",
}
for race_label, source_column in race_source_columns.items():
    cbg_race[race_label] = cbg_race[source_column]

In [13]:
# Append the race groups to the long-format accumulators. Each group
# contributes one full pass over the block groups, in the order listed, so
# the four lists stay aligned position by position.
race_groups = [
    "White",
    "Black or African American",
    "American Indian and Alaska Native",
    "Asian",
    "Native Hawaiian and Other Pacific Islander",
    "Some other race",
    "Two or more races",
]
race_row_count = len(cbg_race)

cbg_id = cbg_id + list(cbg_race["census_block_group"].values) * len(race_groups)
category_type = category_type + ["Race"] * (race_row_count * len(race_groups))
group_name = group_name + [label for label in race_groups for _ in range(race_row_count)]
total = total + [count for label in race_groups for count in cbg_race[label].values]

In [14]:
# Reference lookup: table_id -> origin label, displayed for the next cell.
tableid2origin = (
    pd.read_csv("safegraph_open_census_data_2019/metadata/table_id_2_origin.csv")
    .set_index("table_id")
    .squeeze()
    .to_dict()
)
tableid2origin

{'B03003e2': 'Not Hispanic or Latino origin',
 'B03003e3': 'Hispanic or Latino origin'}

In [15]:
# Copy the two origin columns under readable labels.
origin_source_columns = {
    "Not Hispanic or Latino": "B03003e2",
    "Hispanic or Latino": "B03003e3",
}
for origin_label, source_column in origin_source_columns.items():
    cbg_origin[origin_label] = cbg_origin[source_column]

In [16]:
# Append the origin groups to the long-format accumulators, one full pass
# over the block groups per group so the four lists stay aligned.
origin_groups = ["Not Hispanic or Latino", "Hispanic or Latino"]
origin_row_count = len(cbg_origin)

cbg_id = cbg_id + list(cbg_origin["census_block_group"].values) * len(origin_groups)
category_type = category_type + ["Origin"] * (origin_row_count * len(origin_groups))
group_name = group_name + [label for label in origin_groups for _ in range(origin_row_count)]
total = total + [count for label in origin_groups for count in cbg_origin[label].values]

In [17]:
# Reference lookup: table_id -> household income bracket label, displayed so
# the column ids used in the next cell can be checked against their meaning.
tableid2incomebracket = (
    pd.read_csv("safegraph_open_census_data_2019/metadata/table_id_2_income_bracket.csv")
    .set_index("table_id")
    .squeeze()
    .to_dict()
)
tableid2incomebracket

{'B19001e1': 'Total',
 'B19001e10': '$45 000 to $49 999',
 'B19001e11': '$50 000 to $59 999',
 'B19001e12': '$60 000 to $74 999',
 'B19001e13': '$75 000 to $99 999',
 'B19001e14': '$100 000 to $124 999',
 'B19001e15': '$125 000 to $149 999',
 'B19001e16': '$150 000 to $199 999',
 'B19001e17': '$200 000 or more',
 'B19001e2': 'Less than $10 000',
 'B19001e3': '$10 000 to $14 999',
 'B19001e4': '$15 000 to $19 999',
 'B19001e5': '$20 000 to $24 999',
 'B19001e6': '$25 000 to $29 999',
 'B19001e7': '$30 000 to $34 999',
 'B19001e8': '$35 000 to $39 999',
 'B19001e9': '$40 000 to $44 999'}

In [18]:
# Collapse the sixteen source income columns into six coarse brackets.
# Upper bounds in the labels are exclusive (e.g. "$10000 to $40000" ends with
# the "$35 000 to $39 999" column shown in the lookup above).
income_bracket_columns = {
    "less than $10000": ["B19001e2"],
    "$10000 to $40000": ["B19001e3", "B19001e4", "B19001e5",
                         "B19001e6", "B19001e7", "B19001e8"],
    "$40000 to $75000": ["B19001e9", "B19001e10", "B19001e11", "B19001e12"],
    "$75000 to $150000": ["B19001e13", "B19001e14", "B19001e15"],
    "$150000 to $200000": ["B19001e16"],
    "$200000 or more": ["B19001e17"],
}
for bracket_label, source_columns in income_bracket_columns.items():
    # Single-column brackets are a plain copy; the rest are summed left to
    # right, exactly like a chained `a + b + c`.
    bracket_total = cbg_income[source_columns[0]]
    for source_column in source_columns[1:]:
        bracket_total = bracket_total + cbg_income[source_column]
    cbg_income[bracket_label] = bracket_total

In [19]:
# Sanity check: column-wise totals of the derived income brackets.
derived_income_columns = [
    "less than $10000",
    "$10000 to $40000",
    "$40000 to $75000",
    "$75000 to $150000",
    "$150000 to $200000",
    "$200000 or more",
]
cbg_income[derived_income_columns].sum(axis=0)

less than $10000       7625516
$10000 to $40000      32395487
$40000 to $75000      30723268
$75000 to $150000     33744660
$150000 to $200000     8184256
$200000 or more        9275515
dtype: int64

In [20]:
# Append the income brackets to the long-format accumulators. Each bracket
# contributes one full pass over the block groups, in the order listed, so
# the four lists stay aligned position by position.
income_groups = [
    "less than $10000",
    "$10000 to $40000",
    "$40000 to $75000",
    "$75000 to $150000",
    "$150000 to $200000",
    "$200000 or more",
]
income_row_count = len(cbg_income)

cbg_id = cbg_id + list(cbg_income["census_block_group"].values) * len(income_groups)
category_type = category_type + ["Income"] * (income_row_count * len(income_groups))
group_name = group_name + [label for label in income_groups for _ in range(income_row_count)]
total = total + [count for label in income_groups for count in cbg_income[label].values]

In [21]:
# Reference lookup: table_id -> vehicle-availability label (split by owner
# and renter), displayed for the next cell.
tableid2vehiclebracket = (
    pd.read_csv("safegraph_open_census_data_2019/metadata/table_id_2_vehicle_bracket.csv")
    .set_index("table_id")
    .squeeze()
    .to_dict()
)
tableid2vehiclebracket

{'B25045e12': 'No vehicle available renter',
 'B25045e16': '1 or more vehicles available renter',
 'B25045e3': 'No vehicle available owner',
 'B25045e7': '1 or more vehicles available owner'}

In [22]:
# Merge the renter and owner columns so availability is reported per block
# group regardless of tenure (renter column first, then owner).
vehicle_bracket_columns = {
    "No vehicle available": ("B25045e12", "B25045e3"),
    "Vehicle available": ("B25045e16", "B25045e7"),
}
for bracket_label, (renter_column, owner_column) in vehicle_bracket_columns.items():
    cbg_vehicle[bracket_label] = cbg_vehicle[renter_column] + cbg_vehicle[owner_column]

In [23]:
# Append the vehicle-availability groups to the long-format accumulators, one
# full pass over the block groups per group so the four lists stay aligned.
vehicle_groups = ["No vehicle available", "Vehicle available"]
vehicle_row_count = len(cbg_vehicle)

cbg_id = cbg_id + list(cbg_vehicle["census_block_group"].values) * len(vehicle_groups)
category_type = category_type + ["Vehicle Availability"] * (vehicle_row_count * len(vehicle_groups))
group_name = group_name + [label for label in vehicle_groups for _ in range(vehicle_row_count)]
total = total + [count for label in vehicle_groups for count in cbg_vehicle[label].values]

In [24]:
# The four accumulators must have the same length; print each one to check.
for column_values in (cbg_id, category_type, group_name, total):
    print(len(column_values))

5508325
5508325
5508325
5508325


In [25]:
# Assemble the long-format table column by column. Unlike
# `pd.DataFrame(list(zip(...)))`, which silently truncates to the shortest
# list, the dict constructor raises ValueError if the accumulators ever get
# out of step, and it avoids materialising one tuple per row first.
demographics_df = pd.DataFrame(
    {
        "censusblockgroupid": cbg_id,
        "categorytype": category_type,
        "groupname": group_name,
        "total": total,
    }
)

In [26]:
# Store the block-group id as text rather than as a number, then show the
# resulting schema.
# NOTE(review): if the ids were parsed as integers when the CSVs were read,
# any leading zero is already gone and this cast will not restore it --
# TODO confirm whether zero-padded ids are expected by consumers of the table.
demographics_df = demographics_df.assign(
    censusblockgroupid=demographics_df["censusblockgroupid"].astype(str)
)
demographics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5508325 entries, 0 to 5508324
Data columns (total 4 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   censusblockgroupid  object
 1   categorytype        object
 2   groupname           object
 3   total               int64 
dtypes: int64(1), object(3)
memory usage: 168.1+ MB


## connecting to database

In [27]:
# Read the connection URL from the environment so database credentials are
# never stored in the notebook or its version-control history.
# Expected form: postgresql://<user>:<password>@<host>:<port>/<database>
conn_string = os.environ.get("DEMOGRAPHICS_DB_URL")
if not conn_string:
    raise RuntimeError(
        "Set the DEMOGRAPHICS_DB_URL environment variable to the Postgres "
        "connection URL (postgresql://<user>:<password>@<host>:<port>/<database>) "
        "before running this cell."
    )
db = sqlalchemy.create_engine(conn_string)
conn = db.connect()

In [28]:
# Write the table to Postgres. if_exists='replace' drops and recreates
# 'demographics' if it already exists; the DataFrame index is not written.
demographics_df.to_sql(
    name='demographics',
    con=conn,
    if_exists='replace',
    index=False,
)

325

In [29]:
# Release the database connection opened with db.connect() earlier in the notebook.
conn.close()