### Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Switch matplotlib over to seaborn's default styling.
sns.set()
# IPython magic: draw matplotlib figures inside the notebook output.
%matplotlib inline

### Bokeh Imports

In [2]:
from bokeh.io import output_file, show
from bokeh.models import BasicTicker, ColorBar, LinearColorMapper, ColumnDataSource, PrintfTickFormatter
from bokeh.plotting import figure, output_notebook
from bokeh.transform import transform
from bokeh.palettes import Viridis3, Viridis256

# Send Bokeh output to the notebook so that show() renders plots inline
output_notebook()

### Load data

In [3]:
# Read the tract-level census table, then report its size and column names
# before previewing the first rows.
census_path = "../../data/census/census_tract.csv"
census = pd.read_csv(census_path)

print("Shape of data", census.shape)
print("Columns", census.columns)
census.head(n=5)

Shape of data (74001, 37)
Columns Index(['CensusTract', 'State', 'County', 'TotalPop', 'Men', 'Women',
       'Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific', 'Citizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment'],
      dtype='object')


Unnamed: 0,CensusTract,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001020100,Alabama,Autauga,1948,940,1008,0.9,87.4,7.7,0.3,...,0.5,2.3,2.1,25.0,943,77.1,18.3,4.6,0.0,5.4
1,1001020200,Alabama,Autauga,2156,1059,1097,0.8,40.4,53.3,0.0,...,0.0,0.7,0.0,23.4,753,77.0,16.9,6.1,0.0,13.3
2,1001020300,Alabama,Autauga,2968,1364,1604,0.0,74.5,18.6,0.5,...,0.0,0.0,2.5,19.6,1373,64.1,23.6,12.3,0.0,6.2
3,1001020400,Alabama,Autauga,4423,2172,2251,10.5,82.8,3.7,1.6,...,0.0,2.6,1.6,25.3,1782,75.7,21.2,3.1,0.0,10.8
4,1001020500,Alabama,Autauga,10763,4922,5841,0.7,68.5,24.8,0.0,...,0.0,0.6,0.9,24.8,5037,67.1,27.6,5.3,0.0,4.2


### Checking missing values

In [4]:
# Names of the columns that contain at least one missing value, in their
# original order. DataFrame.isnull().any() tests every column in a single
# vectorized pass instead of walking each Series with Python's built-in any().
missing_cols = census.columns[census.isnull().any().values].tolist()

# Count of missing values in each of those columns.
census[missing_cols].isnull().sum()

Hispanic            690
White               690
Black               690
Native              690
Asian               690
Pacific             690
Income             1100
IncomeErr          1100
IncomePerCap        740
IncomePerCapErr     740
Poverty             835
ChildPoverty       1118
Professional        807
Service             807
Office              807
Construction        807
Production          807
Drive               797
Carpool             797
Transit             797
Walk                797
OtherTransp         797
WorkAtHome          797
MeanCommute         949
PrivateWork         807
PublicWork          807
SelfEmployed        807
FamilyWork          807
Unemployment        802
dtype: int64

### Weird: some tracts have no demographic data

I don't understand how the census works, but I thought you needed people to fill in the data? Let's look at the rows where `Hispanic` is missing.

In [5]:
# Show the tracts whose Hispanic share is missing.
census.loc[census["Hispanic"].isnull()]

Unnamed: 0,CensusTract,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
43,1003990000,Alabama,Baldwin,0,0,0,,,,,...,,,,,0,,,,,
107,1015981902,Alabama,Calhoun,0,0,0,,,,,...,,,,,0,,,,,
108,1015981903,Alabama,Calhoun,0,0,0,,,,,...,,,,,0,,,,,
868,1097990000,Alabama,Mobile,0,0,0,,,,,...,,,,,0,,,,,
1063,1117980000,Alabama,Shelby,0,0,0,,,,,...,,,,,0,,,,,
1460,4012980000,Arizona,La Paz,0,0,0,,,,,...,,,,,0,,,,,
1857,4013113802,Arizona,Maricopa,0,0,0,,,,,...,,,,,0,,,,,
2372,4013980100,Arizona,Maricopa,0,0,0,,,,,...,,,,,0,,,,,
2376,4013980700,Arizona,Maricopa,0,0,0,,,,,...,,,,,0,,,,,
2870,4027980003,Arizona,Yuma,0,0,0,,,,,...,,,,,0,,,,,


### Deleting all rows that have zero population

In [6]:
# Index labels of the tracts whose TotalPop is zero, then drop those rows.
zero_pop_rows = census.loc[census["TotalPop"] == 0].index
census = census.drop(zero_pop_rows, axis=0)

### Let's take a look at the missing values again

In [7]:
# Recompute which columns still contain missing values, in their original
# order. DataFrame.isnull().any() tests every column in a single vectorized
# pass instead of walking each Series with Python's built-in any().
missing_cols = census.columns[census.isnull().any().values].tolist()

# Count of missing values in each of those columns.
census[missing_cols].isnull().sum()

Income             410
IncomeErr          410
IncomePerCap        50
IncomePerCapErr     50
Poverty            145
ChildPoverty       428
Professional       117
Service            117
Office             117
Construction       117
Production         117
Drive              107
Carpool            107
Transit            107
Walk               107
OtherTransp        107
WorkAtHome         107
MeanCommute        259
PrivateWork        117
PublicWork         117
SelfEmployed       117
FamilyWork         117
Unemployment       112
dtype: int64

# Correlation Maps

Thank you to [Shashank Srivastava](https://stackoverflow.com/users/1688792/shashank-srivastava) for [his code](https://stackoverflow.com/questions/39191653/python-bokeh-how-to-make-a-correlation-plot), which the `corr_map` function below is based on.

In [8]:
def corr_map(df, palette=None):
    """Draw a matrix of values (e.g. a slice of a correlation matrix) as a Bokeh heatmap.

    Every cell of ``df`` becomes one rectangle whose fill color encodes the
    cell's value, and a color bar on the right shows the value-to-color scale.
    The figure is displayed with ``show``; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Values to plot. Row labels are placed on the x-axis and column labels
        on the y-axis. The frame passed in is left unmodified.
    palette : sequence of color strings, optional
        Colors that are mapped linearly onto the range of values in ``df``.
        Defaults to ``Viridis256``. A custom list such as
        ``['#d7191c', '#fdae61', '#ffffbf', '#a6d96a', '#1a9641']`` also works.
    """
    if palette is None:
        palette = Viridis256

    # Name the two axes on a renamed copy instead of assigning to
    # df.index.name / df.columns.name, which would alter the caller's frame.
    labelled = (
        df.rename_axis("AllColumns1", axis="index")
          .rename_axis("AllColumns2", axis="columns")
    )

    # Long format: one row per cell with columns AllColumns1, AllColumns2, value.
    cells = labelled.stack().rename("value").reset_index()

    # Spread the palette linearly between the smallest and largest value present.
    mapper = LinearColorMapper(
        palette=palette,
        low=cells["value"].min(),
        high=cells["value"].max())

    # Categorical axes, with labels in order of first appearance.
    # NOTE(review): newer Bokeh releases replaced plot_width/plot_height with
    # width/height (removed in 3.0) - confirm the installed version before upgrading.
    p = figure(
        plot_width=1100,
        plot_height=1000,
        x_range=list(cells["AllColumns1"].drop_duplicates()),
        y_range=list(cells["AllColumns2"].drop_duplicates()))

    # One unit square per cell, colored through the mapper.
    p.rect(
        x="AllColumns1",
        y="AllColumns2",
        width=1,
        height=1,
        source=ColumnDataSource(cells),
        line_color="Black",
        fill_color=transform('value', mapper))

    # Color scale legend.
    color_bar = ColorBar(
        color_mapper=mapper,
        location=(0, 0),
        ticker=BasicTicker(desired_num_ticks=10))

    p.add_layout(color_bar, 'right')

    show(p)

### There are too many columns to show at once, so the correlation map is split into 3 parts

In [9]:
# First part of the correlation matrix: its first 12 rows.
# The text columns (State, County) are excluded explicitly because
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, whereas older
# versions silently skipped it; the result is the same on both.
df_1 = (
    census.drop("CensusTract", axis=1)
    .select_dtypes(include="number")
    .corr()
    .iloc[:12]
)
corr_map(df_1)

W-1005 (SNAPPED_TOOLBAR_ANNOTATIONS): Snapped toolbars and annotations on the same side MAY overlap visually: Figure(id='dd1bcf7d-0b00-4814-8e77-642ce19bd1d9', ...)


In [10]:
# Middle part of the correlation matrix: rows 12 through 20.
# The text columns (State, County) are excluded explicitly because
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, whereas older
# versions silently skipped it; the result is the same on both.
df_2 = (
    census.drop("CensusTract", axis=1)
    .select_dtypes(include="number")
    .corr()
    .iloc[12:21]
)
corr_map(df_2)

W-1005 (SNAPPED_TOOLBAR_ANNOTATIONS): Snapped toolbars and annotations on the same side MAY overlap visually: Figure(id='be720bb5-cfe7-4668-8bd1-70e4b8d508e0', ...)


In [11]:
# Last part of the correlation matrix: every row from position 21 onward
# (the final 13 rows for the 34 numeric columns listed when the data was loaded).
# The text columns (State, County) are excluded explicitly because
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, whereas older
# versions silently skipped it; the result is the same on both.
df_3 = (
    census.drop("CensusTract", axis=1)
    .select_dtypes(include="number")
    .corr()
    .iloc[21:]
)
corr_map(df_3)

W-1005 (SNAPPED_TOOLBAR_ANNOTATIONS): Snapped toolbars and annotations on the same side MAY overlap visually: Figure(id='36547d56-4f01-4881-8f2b-e996a5e4346d', ...)


In [12]:
# Preview the first five rows of the cleaned table.
census.head(n=5)

Unnamed: 0,CensusTract,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001020100,Alabama,Autauga,1948,940,1008,0.9,87.4,7.7,0.3,...,0.5,2.3,2.1,25.0,943,77.1,18.3,4.6,0.0,5.4
1,1001020200,Alabama,Autauga,2156,1059,1097,0.8,40.4,53.3,0.0,...,0.0,0.7,0.0,23.4,753,77.0,16.9,6.1,0.0,13.3
2,1001020300,Alabama,Autauga,2968,1364,1604,0.0,74.5,18.6,0.5,...,0.0,0.0,2.5,19.6,1373,64.1,23.6,12.3,0.0,6.2
3,1001020400,Alabama,Autauga,4423,2172,2251,10.5,82.8,3.7,1.6,...,0.0,2.6,1.6,25.3,1782,75.7,21.2,3.1,0.0,10.8
4,1001020500,Alabama,Autauga,10763,4922,5841,0.7,68.5,24.8,0.0,...,0.0,0.6,0.9,24.8,5037,67.1,27.6,5.3,0.0,4.2
