### Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

### Load data

In [2]:
# Read the census-tract table from its path relative to this notebook.
census_csv_path = "../../data/census/census_tract.csv"
census = pd.read_csv(census_csv_path)

# Quick orientation: dimensions and column labels, then a preview of the
# first rows as the cell's rich output.
n_rows_and_cols = census.shape
print("Shape of data", n_rows_and_cols)
print("Columns", census.columns)
census.head(5)

Shape of data (74001, 37)
Columns Index(['CensusTract', 'State', 'County', 'TotalPop', 'Men', 'Women',
       'Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific', 'Citizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment'],
      dtype='object')


Unnamed: 0,CensusTract,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001020100,Alabama,Autauga,1948,940,1008,0.9,87.4,7.7,0.3,...,0.5,2.3,2.1,25.0,943,77.1,18.3,4.6,0.0,5.4
1,1001020200,Alabama,Autauga,2156,1059,1097,0.8,40.4,53.3,0.0,...,0.0,0.7,0.0,23.4,753,77.0,16.9,6.1,0.0,13.3
2,1001020300,Alabama,Autauga,2968,1364,1604,0.0,74.5,18.6,0.5,...,0.0,0.0,2.5,19.6,1373,64.1,23.6,12.3,0.0,6.2
3,1001020400,Alabama,Autauga,4423,2172,2251,10.5,82.8,3.7,1.6,...,0.0,2.6,1.6,25.3,1782,75.7,21.2,3.1,0.0,10.8
4,1001020500,Alabama,Autauga,10763,4922,5841,0.7,68.5,24.8,0.0,...,0.0,0.6,0.9,24.8,5037,67.1,27.6,5.3,0.0,4.2


### Longest and Shortest County names

In [3]:
# Character length of every county name. `.str.len()` yields NaN for a missing
# name instead of raising a TypeError the way `map(len)` does.
county_name_lengths = census.County.str.len()

# `idxmax` / `idxmin` return index *labels* (first occurrence on ties, NaN
# skipped). `np.argmax` / `np.argmin` on a Series return a *position* in current
# pandas, which only coincides with the label for a default 0..n-1 index.
longest_county_name_on_census_dataset_index = county_name_lengths.idxmax()
shortest_county_name_index = county_name_lengths.idxmin()
s_i = shortest_county_name_index  # original short name kept in case other cells use it

# Show both tracts; the boolean mask keeps them in their original row order.
census[census.index.isin([longest_county_name_on_census_dataset_index, shortest_county_name_index])]

Unnamed: 0,CensusTract,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
590,1081040200,Alabama,Lee,5210,2881,2329,3.3,73.2,12.2,0.0,...,14.9,4.8,4.3,18.2,2691,66.6,26.3,5.4,1.7,4.4
1330,2198000100,Alaska,Prince of Wales-Hyder Census Area,2139,1274,865,0.5,58.0,0.1,31.8,...,25.6,10.4,3.0,15.2,954,50.8,39.6,9.4,0.1,14.3


### Largest income error

The income reported for a tract can be off by as much as 123,116 (the largest `IncomeErr` value in the dataset).

In [8]:
# Index label of the tract with the largest IncomeErr value. `idxmax` returns a
# label (skipping NaN), which is what `.loc` expects; `np.argmax` on a Series
# returns a position in current pandas and only matches the label when the
# index is the default 0..n-1 range.
max_income_err_idx = census.IncomeErr.idxmax()

# Double brackets keep the result a one-row DataFrame, like the boolean filter
# used before. The integer label and the DataFrame now live under separate
# names instead of rebinding `max_income_err` to two different kinds of object.
max_income_err = census.loc[[max_income_err_idx]]

print(max_income_err.IncomeErr)
max_income_err

63909    123116.0
Name: IncomeErr, dtype: float64


Unnamed: 0,CensusTract,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
63909,48201210100,Texas,Harris,5799,5299,500,33.5,16.3,48.6,0.0,...,0.0,0.0,9.3,33.0,118,66.9,28.8,4.2,0.0,0.0


### Largest income per capita error

The per capita income can be off by as much as 134,380 (the largest `IncomePerCapErr` value in the dataset). I wonder if this indicates high income inequality.

In [9]:
# Index label of the tract with the largest IncomePerCapErr value. `idxmax`
# returns a label (skipping NaN), which is what `.loc` expects; `np.argmax` on a
# Series returns a position in current pandas and only matches the label when
# the index is the default 0..n-1 range.
max_per_cap_err_idx = census.IncomePerCapErr.idxmax()

# Double brackets keep the result a one-row DataFrame, like the boolean filter
# used before. The integer label and the DataFrame now live under separate
# names instead of rebinding `max_per_cap_err` to two different kinds of object.
max_per_cap_err = census.loc[[max_per_cap_err_idx]]

print(max_per_cap_err.IncomePerCapErr)
max_per_cap_err

27513    134380.0
Name: IncomePerCapErr, dtype: float64


Unnamed: 0,CensusTract,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
27513,21067003916,Kentucky,Fayette,598,282,316,0.0,98.2,1.5,0.0,...,0.0,1.6,9.5,28.1,315,70.5,15.9,13.7,0.0,6.8


### Santa Clara County

In [15]:
# Rows whose County value is exactly "Santa Clara" (the State column is not
# consulted by this filter).
in_santa_clara = census["County"].eq("Santa Clara")
sc_county = census.loc[in_santa_clara]

# Sum of the Men column over the selected tracts.
sc_county["Men"].sum()

939004