In [16]:
import numpy as np
import pandas as pd
import math
import random
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import locale
locale.setlocale(locale.LC_ALL, '')
%matplotlib inline
import pickle

# Bokeh is cool.
from jinja2 import Template
from bokeh.io import show,curdoc, output_notebook, export_png, save, export_svgs, output_file
from bokeh.models import Quad, FactorRange, LabelSet, HoverTool, SaveTool, BasicTickFormatter
from bokeh.layouts import row,column,layout,widgetbox
from bokeh.models.widgets import Select,MultiSelect,Panel, Tabs, Dropdown
from bokeh.plotting import ColumnDataSource,figure,reset_output,gridplot
from bokeh.palettes import Spectral6
from bokeh.transform import dodge
from bokeh.embed import file_html
from bokeh.resources import CDN
from bokeh.themes import Theme
from bokeh.document import Document

# Setup Bokeh appropriately...
# Start from a clean Bokeh output state, then send plots to the notebook.
reset_output()
output_notebook()
# Load the project theme from its YAML file and attach it to the current document.
theme = Theme(filename="./HTML_Build/themes/aiga_theme.yaml")
curdoc().theme = theme

# Ensure vector figure output uses editable text, when using matplotlib or seaborn
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
plt.rcParams['svg.fonttype'] = 'none'
# Autolayout on graphs
plt.rcParams['figure.autolayout'] = True

# Read in our preprocessed data
df = pd.read_csv('./export_csv/processed_output_without_column_names.csv', encoding = 'utf8')
# Restore the column metadata used throughout the notebook:
#   continous_cols - columns that summary_cate_col skips (it returns nothing for them)
#   cate_cols      - columns iterated by the main plotting loop
#   pipe_col       - columns whose options are spread over several indicator columns
#   col_dict       - column code -> title used for the exported HTML page
#   pipe_col_dict  - column code -> {option label: indicator column name}
#   cate_cols_v2   - not used in the cells visible here
# NOTE(review): pickle.load can execute arbitrary code; only load a
# SavedVariables.pckl that was produced by a trusted step of this project.
with open('SavedVariables.pckl', 'rb') as f:
    continous_cols, cate_cols, cate_cols_v2, pipe_col, col_dict, pipe_col_dict = pickle.load(f)

In [17]:
# Indicator column that flags AIGA membership (1 = member, 0 = non-member).
AIGA_col = pipe_col_dict['V_32']['AIGA']
# Count each group by summing the boolean mask instead of materialising a filtered frame.
print('The total number of AIGA responses: ' + str((df[AIGA_col] == 1).sum()))
print('The total number of Non-AIGA responses: ' + str((df[AIGA_col] == 0).sum()))

The total number of AIGA responses: 4638
The total number of Non-AIGA responses: 7839


In [18]:
def summary_cate_col(col, AllorAIGA = 'All'):
    """Summarise one categorical survey column as counts, shares and mean salary.

    Reads the notebook globals ``df``, ``pipe_col_dict``, ``pipe_col`` and
    ``continous_cols``.

    Parameters
    ----------
    col : str
        Column code to summarise.
    AllorAIGA : str, default 'All'
        'All' uses every row, 'NotAIGA' keeps rows whose AIGA indicator is 0,
        and any other value keeps rows whose AIGA indicator is 1.

    Returns
    -------
    pandas.DataFrame or None
        One row per option with the columns 'Options', 'Count', 'Percent'
        (Count divided by the number of rows in the selected group),
        'AvgSalary' (mean of column 'V_18' for that option) and 'AIGA' (the
        ``AllorAIGA`` value), sorted by 'Percent' descending.
        ``None`` when ``col`` is listed in ``continous_cols``.
    """
    AIGA_col = pipe_col_dict['V_32']['AIGA']
    Salary_col = 'V_18'

    if AllorAIGA == 'All':
        df_filtered = df
    elif AllorAIGA == "NotAIGA":
        df_filtered = df[df[AIGA_col] == 0]
    else:
        df_filtered = df[df[AIGA_col] == 1]

    if col in continous_cols:
        # Continuous columns have no discrete options to tabulate.
        return None

    if col in pipe_col:
        # Multi-select question: every option has its own 0/1 indicator column.
        # Collect all rows first and build the frame once; growing a frame with
        # DataFrame.append is quadratic and the method no longer exists in
        # pandas >= 2.0.
        records = [
            {
                'Options': key,
                'Count': df_filtered[value].sum(),
                'AvgSalary': df_filtered.loc[df_filtered[value] == 1, Salary_col].mean(),
            }
            for key, value in pipe_col_dict[col].items()
        ]
        # 'Percent' is listed so the column order matches the historical output;
        # it is filled in below.
        dff = pd.DataFrame(records, columns=['Options', 'Count', 'Percent', 'AvgSalary'])
    else:
        # Single-select question: one row per distinct (non-null) value.
        dff = df_filtered[col].value_counts().reset_index()
        dff.columns = ['Options', 'Count']
        # One grouped mean instead of one boolean filter per option.
        salary_by_option = df_filtered.groupby(col)[Salary_col].mean()
        dff['AvgSalary'] = dff['Options'].map(salary_by_option)

    dff['Percent'] = dff.Count / len(df_filtered)
    dff['AIGA'] = AllorAIGA

    return dff.sort_values(by = 'Percent', ascending = False)


# Updated plot function:

In [19]:
def plotHBar_comp(dffs, data_column, label_column, x_axis_label,  y_axis_visible):
    """Draw a grouped horizontal bar chart comparing two respondent groups.

    Parameters
    ----------
    dffs : sequence of two pandas.DataFrame
        First frame is drawn as 'NonAIGA', second as 'AIGA'. Each needs the
        columns 'Options', 'Option_AIGA', 'Salary_label', 'Count_label' and
        'Percent_label', plus ``data_column`` and ``label_column``. The y axis
        categories are taken from the first frame's 'Options'.
    data_column : str
        Column holding the bar lengths. When it is 'Percent' the x axis is
        capped at 1 (100%).
    label_column : str
        Column holding the text printed next to each bar.
    x_axis_label : str
        Label for the x axis.
    y_axis_visible : bool
        Whether to show the category axis.

    Returns
    -------
    bokeh figure with the two bar series and their labels.
    """
    # One data source per group; only the first two frames are used.
    sources = [
        ColumnDataSource(dict(x = np.array(dff[data_column]),
                              y = np.array(dff['Options']),
                              label = np.array(dff[label_column]),
                              category = np.array(dff["Option_AIGA"]),
                              salary = np.array(dff["Salary_label"]),
                              count = np.array(dff["Count_label"]),
                              percent = np.array(dff["Percent_label"])))
        for dff in dffs[:2]
    ]

    # Height grows with the number of options: 70 units for each of the two
    # bars drawn per option.
    h = 70 * len(dffs[0])*2

    # Leave head-room to the right of the longest bar for its label.
    max_count = max(max(dffs[0][data_column]),max(dffs[1][data_column]))
    maxX = 50 + max_count
    maxX += .15*maxX
    if data_column == 'Percent':
        maxX = 1 # 100%

    hover = HoverTool(tooltips=[
        ("Category", "@category"),
        ("Salary", "@salary"),
        ("Respondents", "@count"),
        ("Percent", "@percent"),
    ])

    # Create new plot
    # NOTE(review): plot_height, legend= and render_mode are spelled for the
    # Bokeh release this notebook was written against; newer releases rename
    # or drop them - confirm before upgrading Bokeh.
    plot = figure(
            plot_height=h,
            x_axis_label = x_axis_label,
            x_range=(-maxX*0.01, maxX),
            y_range=list(dffs[0]['Options']),
            toolbar_location="right",
            tools=[hover, SaveTool()]
            )
    plot.toolbar.active_inspect = [hover]
    plot.xaxis.formatter = BasicTickFormatter(use_scientific=False)
    plot.ygrid.visible = False
    # The caller hides the category axis on the right-hand plot of a pair.
    plot.yaxis.visible = y_axis_visible

    # Create nested bar plot: the two groups are dodged above/below each
    # category position.
    for source, color, offset, legend in zip(sources, ["#111111", "#c6007e"], [0.22,-0.22], ['NonAIGA','AIGA']):
        plot.hbar(y = dodge('y', offset, range=plot.y_range),
                  right = 'x',
                  height = 0.4,
                  source = source,
                  color = color,
                  legend = legend)

        # Add label; the y offset scales the dodge value by 140, presumably so
        # the text lines up with its dodged bar.
        pct_count_Labels = LabelSet(x='x', y='y', text='label', text_font_size='8pt', text_font_style="bold", text_color="#444444", level='glyph',
                x_offset=10, y_offset= offset*140, source=source, render_mode='canvas', text_baseline = 'middle')
        plot.add_layout(pct_count_Labels)

    return (plot)


# Updated main function:

In [20]:
percent_tabs = []
salary_tabs = []

figureCount = 0

# The page template is the same for every column, so read it once.
with open('./HTML_Build/templates/index.jinja', 'r') as f:
    template = Template(f.read())

for col in cate_cols:
    # Skip the explicitly excluded columns, and any continuous column
    # (summary_cate_col returns None for those).
    if col in ['V_8', 'V_35','V_36'] or col in continous_cols:
        continue

    # temp var to embed plots for each column into a single file
    # versus all questions in a single file.
    plots = []
    figureCount +=1

    # Generate one summary frame per group: [NotAIGA, AIGA]
    dffs = []
    for AIGA in ['NotAIGA','AIGA']:

        # call summary function
        dff = summary_cate_col(col, AIGA)

        # create label columns for dff
        dff['Option_AIGA'] = list(zip(dff['Options'], dff['AIGA']))
        dff['Percent_label'] = dff['Percent'].apply(lambda x: "{0:.2f}%".format(x*100))
        dff['Count_label'] = dff['Count'].apply(lambda x: "{0:n}".format(x))
        # Missing or non-positive averages become 0.
        dff['AvgSalary'] = dff['AvgSalary'].apply(lambda x: int(x) if x > 0 else 0)
        # Always a string, so the label column has a single type.
        dff['Salary_label'] = dff['AvgSalary'].apply(lambda x: "{0:n}".format(x))
        dff['Percent_Count_label'] = [count + ' (' + percent + ')'
                                      for (percent, count) in zip(dff['Percent_label'], dff['Count_label'])]

        # append
        dffs.append(dff)

    # Test 1, Percentage
    # Sort ascending by share of respondents: the first frame's order becomes
    # the y_range, and categorical y factors are laid out bottom-to-top, so the
    # largest share ends up at the top of the chart.
    dffs = [group.sort_values(by = 'Percent', ascending = True) for group in dffs]

    AIGA_Comp = plotHBar_comp(dffs,
                              data_column = 'Percent',
                              label_column = 'Percent_Count_label',
                              x_axis_label = 'Percentage of Respondents',
                              y_axis_visible = True)

    tab = Panel(child = AIGA_Comp, title ="Percentage_Compare_for: " + col) # just for testing, to be modified
    percent_tabs.append(tab)
    plots.append(AIGA_Comp)

    # Test 2, Salary
    AIGA_Comp = plotHBar_comp(dffs,
                              data_column = 'AvgSalary',
                              label_column = 'Salary_label',
                              x_axis_label = 'Average Salary',
                              y_axis_visible = False)

    tab = Panel(child = AIGA_Comp, title ="Salary_Compare_for: " + col) # just for testing, to be modified
    salary_tabs.append(tab)
    plots.append(AIGA_Comp)

    embedPlots = row(plots)

    # Save embedded using the template so the page is styled
    plotTitle = col_dict[col]
    html = file_html(embedPlots,CDN,title=plotTitle,template=template)
    # Named output_path (not output_file) so Bokeh's imported output_file()
    # function is not overwritten by a string.
    output_path = "./HTML_Build/" + str(figureCount) + ' - ' + plotTitle.replace(':','') + '.html'

    # Write UTF-8 bytes explicitly so non-ASCII titles or labels cannot fail
    # or be mangled by the platform's default text encoding.
    with open(output_path, 'wb') as f:
        f.write(html.encode('utf-8'))
 

<br/>
<br/>
# Map average salary by county

In [21]:
# Confirm python version
import sys
sys.version

'2.7.15 |Anaconda, Inc.| (default, May  1 2018, 18:37:05) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [22]:
# Re-read the processed survey export into `df` for the mapping section.
# NOTE(review): this rebinds the same `df` that summary_cate_col reads as a
# global, and the cells below filter and merge it - re-running the earlier
# comparison cells afterwards will operate on the changed frame.
df = pd.read_csv('./export_csv/processed_output_without_column_names.csv', encoding = 'utf8')

### Read in the ZIP code lookup table (`zip_data`) that I created

In [23]:
# Keep ZIP as text (object dtype) so it can be joined against the string zip
# codes built from the survey column. The builtin `object` is used because the
# `np.object` alias is deprecated and missing from current NumPy releases.
zip_data = pd.read_csv('./export_csv/zip_data.csv', encoding = 'utf8', dtype = {'ZIP': object})
# Replace the bare county code with a (STATE, COUNTY) tuple key.
zip_data['COUNTY'] = list(zip(zip_data['STATE'], zip_data['COUNTY']))
zip_data.head(3)

Unnamed: 0,ZIP,POP,AREA_MILE,STATE,COUNTY,POP_DEN,Type
0,601,17800,64.348,72,"(72, 1)",276.620874,Rural
1,602,39716,30.621,72,"(72, 3)",1297.018386,Suburban
2,603,51565,31.616,72,"(72, 5)",1630.977986,Suburban


In [24]:
# Number of survey rows whose zip code column (V_36) is empty.
print('There are ' + str(df['V_36'].isnull().sum()) + ' records without zip codes')

There are 2224 records without zip codes


In [25]:
# exclude null and merge with zip data
# Take an explicit copy of the filtered rows: assigning a column on a filtered
# slice triggers SettingWithCopyWarning and may not write through.
df = df.loc[df['V_36'].notnull()].copy()
# Zipcode --> convert from numeric to string. Going through int() drops any
# leading zeros, which matches the ZIP values shown in zip_data (e.g. 601).
df['V_36'] = df['V_36'].apply(lambda x : str(int(x)))
# Default inner join: survey rows whose zip code is not in zip_data are dropped.
df = pd.merge(df, zip_data, left_on = 'V_36', right_on = 'ZIP')

In [27]:
# Row count of the merged frame.
print('There are ' + str(df.shape[0]) + ' records after merging')

There are 9455 records after merging


###  Summarize salary by county

In [28]:
# Mean salary (V_18) per (STATE, COUNTY) key.
ByCounty = df.groupby('COUNTY', as_index=False).agg({'V_18': 'mean'})
# Build the lookup in one step. The previous iterrows loop bound a global named
# `row`, which overwrote the bokeh.layouts.row function used by the export cell.
Salary = dict(zip(ByCounty['COUNTY'], ByCounty['V_18']))

In [29]:
# Spot-check one entry of the lookup; keys are (state, county) tuples.
Salary[(1,3)] # State 1, County 3

45000.0

In [30]:
from bokeh.sampledata import us_states, us_counties, unemployment
from bokeh.plotting import figure, show, output_file

# Work on copies of the sample-data dictionaries so the deletions below do not
# touch the module-level data.
us_states = us_states.data.copy()
us_counties = us_counties.data.copy()

del us_states["HI"]
del us_states["AK"]

state_xs = [us_states[code]["lons"] for code in us_states]
state_ys = [us_states[code]["lats"] for code in us_states]

# State/territory codes whose counties are left off the map.
EXCLUDED_STATES = ("ak", "hi", "pr", "gu", "vi", "mp", "as")
# Each colour covers one 25,000-wide salary band; the last colour also covers
# everything above the top band.
SALARY_BIN_WIDTH = 25000
MISSING_COLOR = "black"

colors = ["#F1EEF6", "#D4B9DA", "#C994C7", "#DF65B0", "#DD1C77", "#980043"]

# Build outlines and colours in a single pass so the three lists cannot drift
# out of alignment.
county_xs = []
county_ys = []
county_colors = []

for county_id, county in us_counties.items():
    if county["state"] in EXCLUDED_STATES:
        continue

    county_xs.append(county["lons"])
    county_ys.append(county["lats"])

    rate = Salary.get(county_id)
    if rate is None or pd.isnull(rate):
        # No respondents in this county, or none of them reported a salary.
        # A NaN mean would otherwise make int() raise ValueError.
        county_colors.append(MISSING_COLOR)
    else:
        # Clamp to the palette so out-of-range values cannot index past either end.
        idx = max(0, min(len(colors) - 1, int(rate / SALARY_BIN_WIDTH)))
        county_colors.append(colors[idx])

output_file("choropleth.html", title="choropleth.py example")

p = figure(title="Average Salary", toolbar_location="left",
    plot_width=1100, plot_height=700)

p.patches(county_xs, county_ys, fill_color=county_colors, fill_alpha=0.7,
    line_color="white", line_width=0.5)
p.patches(state_xs, state_ys, fill_alpha=0.0,
    line_color="#884444", line_width=2)

# NOTE(review): the recorded output of this cell is "IOPub data rate exceeded",
# presumably because notebook output is still active from the setup cell and the
# county geometry is large. Consider save(p) or raising the notebook's
# iopub_data_rate_limit - confirm the intended display target.
show(p)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
