In [258]:
import math
from bokeh.plotting import figure, output_file, show, save, ColumnDataSource, output_notebook
from bokeh.models.tools import HoverTool
from bokeh.transform import factor_cmap
from bokeh.palettes import Blues8
from bokeh.embed import components
import bokeh.models as bmo
import bokeh.io as bio
import pandas as pd
import numpy as np
from bokeh.palettes import d3

In [104]:
# Show every column when a wide DataFrame is displayed (None removes the column limit).
# pandas is imported as `pd` in this notebook, so the option must be set through that alias.
pd.set_option('display.max_columns', None)

## getting started with bokeh

In [4]:
# Send Bokeh output to the notebook's output cells.
# Alternative kept for reference: write the figure to a standalone HTML file instead.
# output_file('index.html')
output_notebook()

In [2]:
# Read in csv
# The path is relative to the notebook's working directory. The plotting cell that
# follows reads the 'Car', 'Horsepower', 'Price' and 'Image' columns of this frame.
df = pd.read_csv('bokeh_tutorial/cars.csv')

In [7]:
# Create ColumnDataSource from data frame
source = ColumnDataSource(df)

# Car names in data-frame row order; used both as the categorical y-axis
# factors and as the factors of the color mapping below.
car_list = source.data['Car'].tolist()

# Horizontal bar chart with one row per car
p = figure(
    y_range=car_list,
    plot_width=800,
    plot_height=600,
    title='Cars With Top Horsepower',
    x_axis_label='Horsepower',
    tools="pan,box_select,zoom_in,zoom_out,save,reset"
)

# Render glyph: each bar runs from 0 to the car's 'Horsepower' value.
# NOTE(review): Blues8 is an 8-color palette; if cars.csv holds more than
# 8 cars, confirm how the extra factors should be colored.
p.hbar(
    y='Car',
    right='Horsepower',
    left=0,
    height=0.4,
    fill_color=factor_cmap(
      'Car',
      palette=Blues8,
      factors=car_list
    ),
    fill_alpha=0.9,
    source=source,
    # legend_field creates one legend entry per distinct value of the 'Car'
    # column. It replaces the older `legend=` keyword, which newer Bokeh
    # releases deprecate, and matches the other figures in this notebook.
    legend_field='Car'
)

# Add Legend
p.legend.orientation = 'vertical'
p.legend.location = 'top_right'
p.legend.label_text_font_size = '10px'

# Add Tooltips: the @-fields are filled in from the hovered row of `source`
hover = HoverTool()
hover.tooltips = """
  <div>
    <h3>@Car</h3>
    <div><strong>Price: </strong>@Price</div>
    <div><strong>HP: </strong>@Horsepower</div>
    <div><img src="@Image" alt="" width="200" /></div>
  </div>
"""
p.add_tools(hover)

# Show results
show(p)

# Save file
# save(p)

# Print out div and script
# script, div = components(p)
# print(div)
# print(script)



# HW3: Data Visualization

In [59]:
# enable output to notebook
# NOTE(review): output_notebook() is already called in an earlier cell of this
# notebook; this second call looks redundant — confirm before removing it.
output_notebook()

In [60]:
# Load the homework data set (path relative to the notebook's working directory).
# Later cells group it by 'Major_category' and derive ratio columns from its job counts.
data_df = pd.read_csv("homework3_data.csv")

In [121]:
# College_ratio: college jobs as a fraction of (college + non-college) jobs.
# Two alternative denominators were tried and left disabled: adding
# 'Low_wage_jobs' to the sum, and dividing by 'Total'.
data_df['College_ratio'] = data_df['College_jobs'].div(
    data_df['College_jobs'].add(data_df['Non_college_jobs'])
)

# Full_time_ratio: full-time, year-round count as a fraction of 'Total'.
data_df['Full_time_ratio'] = data_df['Full_time_year_round'].div(data_df['Total'])

In [122]:
# Preview the first rows, including the derived College_ratio and Full_time_ratio columns.
data_df.head()

Unnamed: 0,Rank,Major_code,Major,Total,Men,Women,Major_category,ShareWomen,Sample_size,Employed,Full_time,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs,College_ratio,Full_time_ratio
0,1,2419,PETROLEUM ENGINEERING,2339.0,2057.0,282.0,Engineering,0.120564,36,1976,1849,270,1207,37,0.018381,110000,95000,125000,1534,364,193,0.808219,0.516032
1,2,2416,MINING AND MINERAL ENGINEERING,756.0,679.0,77.0,Engineering,0.101852,7,640,556,170,388,85,0.117241,75000,55000,90000,350,257,50,0.576606,0.513228
2,3,2415,METALLURGICAL ENGINEERING,856.0,725.0,131.0,Engineering,0.153037,3,648,558,133,340,16,0.024096,73000,50000,105000,456,176,0,0.721519,0.397196
3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,1258.0,1123.0,135.0,Engineering,0.107313,16,758,1069,150,692,40,0.050125,70000,43000,80000,529,102,0,0.838352,0.550079
4,5,2405,CHEMICAL ENGINEERING,32260.0,21239.0,11021.0,Engineering,0.341631,289,25694,23170,5180,16697,1672,0.061098,65000,50000,75000,18314,4440,972,0.804869,0.517576


## college ratio by major category

In [217]:
# Average every numeric column per major category. numeric_only=True keeps the
# text 'Major' column out of the mean, which otherwise raises in current pandas.
df_copy = data_df.groupby('Major_category', as_index=False).mean(numeric_only=True)

# Build sorted_df from the grouped frame so this cell runs on a fresh kernel
# (it previously read `sorted_df` before anything had defined it).
# recalc_college_ratio is computed from the averaged job counts rather than by
# averaging the per-major College_ratio values.
sorted_df = df_copy.assign(
    recalc_college_ratio=df_copy['College_jobs']
    / (df_copy['College_jobs'] + df_copy['Non_college_jobs'])
).sort_values(by=['recalc_college_ratio'], ascending=False)
sorted_df.head()

Unnamed: 0,Major_category,Rank,Major_code,Total,Men,Women,ShareWomen,Sample_size,Employed,Full_time,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs,College_ratio,Full_time_ratio,recalc_college_ratio
6,Education,130.375,2387.6875,34945.5625,6470.375,28475.1875,0.748507,296.375,29989.9375,24878.6875,7537.0625,18001.9375,1560.5625,0.051702,32350.0,26590.625,38562.5,21169.5625,7610.0625,2554.375,0.713745,0.530426,0.735575
7,Engineering,22.62069,2489.896552,18537.344828,14079.551724,4457.793103,0.238889,169.862069,14495.586207,13167.827586,2935.724138,9963.862069,1028.172414,0.063334,57382.758621,41555.172414,70448.275862,9302.310345,3530.448276,864.793103,0.679889,0.530041,0.724888
8,Health,96.5,5937.916667,38602.5,6293.083333,32309.416667,0.795152,326.166667,31012.25,24568.25,9549.333333,19034.833333,1851.083333,0.06592,36825.0,26166.666667,50250.0,20453.416667,9208.0,2605.833333,0.51609,0.430015,0.689563
5,Computers & Mathematics,57.0,2702.727273,27182.545455,18975.0,8207.545455,0.311772,260.0,21626.727273,18867.727273,4842.727273,14468.727273,1670.272727,0.084256,42745.454545,29290.909091,58090.909091,12532.636364,6769.363636,1466.909091,0.604446,0.545741,0.649292
13,Physical Sciences,67.6,5022.8,18547.9,9539.0,9008.9,0.508683,113.7,13923.1,11285.2,4344.4,8563.5,788.0,0.046511,41890.0,28350.0,57290.0,7655.2,4946.9,1407.8,0.531653,0.496891,0.607454


In [218]:
# Map each major category to a Category20 color. The mapper is keyed on the
# category order of data_df, the same construction the scatter plots in this
# notebook use, so a category is drawn in the same color in every figure.
# NOTE(review): d3['Category20'] is indexed by palette size; confirm the number
# of categories stays within the sizes Bokeh provides.
palette = d3['Category20'][len(data_df['Major_category'].unique())]
color_map = bmo.CategoricalColorMapper(factors=data_df['Major_category'].unique(), palette=palette)

# Bar chart of the recalculated college ratio per category. The x-axis order
# follows sorted_df, which an earlier cell sorts by descending ratio.
plot = figure(
    x_range=list(sorted_df["Major_category"]),
    y_range=[0, 1],
    plot_width=800,
    plot_height=600,
    y_axis_label='College Ratio',
    tools="pan,box_select,zoom_in,zoom_out,save,reset"
)
plot.vbar(
    x="Major_category",
    width=0.5,
    top="recalc_college_ratio",
    # Use the shared mapper instead of a second, differently ordered color map.
    color={'field': 'Major_category', 'transform': color_map},
    source=sorted_df,
)
# Tilt the category labels 45 degrees
plot.xaxis.major_label_orientation = math.pi/4

# Tooltip shows the hovered bar's category name
hover = HoverTool()
hover.tooltips = """
  <div>
    <h3>@Major_category</h3>
  </div>
"""
plot.add_tools(hover)

# No fill behind the plot area or the surrounding border
plot.background_fill_color = None
plot.border_fill_color = None

show(plot)

## college ratio vs salary

In [269]:
# Scatter plot: median salary (x) against college ratio (y), one marker per row of data_df.

# colors: one Category20 entry per major category
palette = d3['Category20'][len(data_df['Major_category'].unique())]
color_map = bmo.CategoricalColorMapper(
    palette=palette,
    factors=data_df['Major_category'].unique(),
)

# figure (no title is set)
p = figure(
    tools="pan,box_select,zoom_in,zoom_out,save,reset",
    x_axis_label='Median',
    y_axis_label='College Ratio',
    plot_height=600,
    plot_width=800,
)

# glyphs, colored and grouped in the legend by major category
p.scatter(
    source=data_df,
    x='Median',
    y='College_ratio',
    color=dict(field='Major_category', transform=color_map),
    fill_alpha=0.9,
    legend_field='Major_category',
)

# x axis: fixed ticks every 20,000, labeled in thousands ("20K" ... "100K")
tick_values = (20000, 40000, 60000, 80000, 100000)
p.xaxis.ticker = bmo.FixedTicker(ticks=[float(v) for v in tick_values])
p.xaxis.major_label_overrides = {v: '{}K'.format(v // 1000) for v in tick_values}

# legend: vertical, bottom-right corner, small text, mostly transparent background
p.legend.location = 'bottom_right'
p.legend.orientation = 'vertical'
p.legend.label_text_font_size = '10px'
p.legend.background_fill_alpha = 0.2

# same light fill (#fbfbfb) for the plot area and the border around it
p.border_fill_color = "#fbfbfb"
p.background_fill_color = "#fbfbfb"

# tooltip shows the hovered row's 'Major'
hover = HoverTool(tooltips="""
  <div>
    <h3>@Major</h3>
  </div>
""")
p.add_tools(hover)

# Show results
show(p)

In [124]:
# Scatter plot of the 'P75th' column (x) against college ratio (y).
# The title and x-axis label name P75th, the column that is actually plotted;
# they previously said "Median", copied over from the median plot.
# NOTE(review): P75th is presumably the 75th-percentile salary — confirm
# against the data set's documentation.
p = figure(
    plot_width=800,
    plot_height=600,
    title='College ratio vs P75th',
    y_axis_label='College Ratio',
    x_axis_label='P75th',
    tools="pan,box_select,zoom_in,zoom_out,save,reset"
)

# colors: one Category20 entry per major category
palette = d3['Category20'][len(data_df['Major_category'].unique())]
color_map = bmo.CategoricalColorMapper(factors=data_df['Major_category'].unique(), palette=palette)


# Render glyph, colored and grouped in the legend by major category
p.scatter(
    x='P75th',
    y='College_ratio',
    fill_alpha=0.9,
    source=data_df,
    color={'field': 'Major_category', 'transform': color_map},
    legend_field='Major_category',
)

# Add Legend
p.legend.orientation = 'vertical'
p.legend.location = 'bottom_right'
p.legend.label_text_font_size = '10px'
p.legend.padding = 10

# Add Tooltips: show the hovered row's 'Major'
hover = HoverTool()
hover.tooltips = """
  <div>
    <h3>@Major</h3>
  </div>
"""
p.add_tools(hover)

# Show results
show(p)

In [247]:
# Row mask for the salary correlations in the next cells. A row is kept only
# when College_ratio and all three salary columns are present, so a missing
# salary value cannot turn a correlation into NaN. This matches the later
# cells, which check both columns of each pair.
valid_indices = data_df[['College_ratio', 'P25th', 'Median', 'P75th']].notnull().all(axis=1)

In [248]:
# Correlation matrix of College_ratio and P25th over the rows selected by valid_indices
np.corrcoef(
    data_df.loc[valid_indices, 'College_ratio'],
    data_df.loc[valid_indices, 'P25th'],
)

array([[1.        , 0.36256817],
       [0.36256817, 1.        ]])

In [249]:
# Correlation matrix of College_ratio and Median over the rows selected by valid_indices
np.corrcoef(
    data_df.loc[valid_indices, 'College_ratio'],
    data_df.loc[valid_indices, 'Median'],
)

array([[1.        , 0.37382139],
       [0.37382139, 1.        ]])

In [250]:
# Correlation matrix of College_ratio and P75th over the rows selected by valid_indices
np.corrcoef(
    data_df.loc[valid_indices, 'College_ratio'],
    data_df.loc[valid_indices, 'P75th'],
)

array([[1.        , 0.31865473],
       [0.31865473, 1.        ]])

## college ratio vs full time ratio

In [129]:
# Scatter plot: full-time ratio (x) against college ratio (y), one marker per row of data_df.

# colors: one Category20 entry per major category
palette = d3['Category20'][len(data_df['Major_category'].unique())]
color_map = bmo.CategoricalColorMapper(
    palette=palette,
    factors=data_df['Major_category'].unique(),
)

# figure
p = figure(
    title='College ratio vs Full time ratio',
    tools="pan,box_select,zoom_in,zoom_out,save,reset",
    x_axis_label='Full Time Ratio',
    y_axis_label='College Ratio',
    plot_height=600,
    plot_width=800,
)

# glyphs, colored and grouped in the legend by major category
p.scatter(
    source=data_df,
    x='Full_time_ratio',
    y='College_ratio',
    color=dict(field='Major_category', transform=color_map),
    fill_alpha=0.9,
    legend_field='Major_category',
)

# legend: vertical, bottom-right corner, small text
p.legend.location = 'bottom_right'
p.legend.orientation = 'vertical'
p.legend.label_text_font_size = '10px'

# tooltip shows the hovered row's 'Major'
hover = HoverTool(tooltips="""
  <div>
    <h3>@Major</h3>
  </div>
""")
p.add_tools(hover)

# Show results
show(p)

In [130]:
# Keep rows where both columns are present, then correlate them.
valid_indices = data_df[['College_ratio', 'Full_time_ratio']].notnull().all(axis=1)
np.corrcoef(
    data_df.loc[valid_indices, 'College_ratio'],
    data_df.loc[valid_indices, 'Full_time_ratio'],
)

array([[ 1.        , -0.02364018],
       [-0.02364018,  1.        ]])

## college ratio vs unemployment

In [268]:
# Scatter plot: unemployment rate (x) against college ratio (y), one marker per row of data_df.

# colors: one Category20 entry per major category
palette = d3['Category20'][len(data_df['Major_category'].unique())]
color_map = bmo.CategoricalColorMapper(
    palette=palette,
    factors=data_df['Major_category'].unique(),
)

# figure (no title is set)
p = figure(
    tools="pan,box_select,zoom_in,zoom_out,save,reset",
    x_axis_label='Unemployment Rate',
    y_axis_label='College Ratio',
    plot_height=600,
    plot_width=800,
)

# glyphs, colored and grouped in the legend by major category
p.scatter(
    source=data_df,
    x='Unemployment_rate',
    y='College_ratio',
    color=dict(field='Major_category', transform=color_map),
    fill_alpha=0.9,
    legend_field='Major_category',
)

# legend: vertical, bottom-right corner, small text, mostly transparent background
p.legend.location = 'bottom_right'
p.legend.orientation = 'vertical'
p.legend.label_text_font_size = '10px'
p.legend.background_fill_alpha = 0.2

# same light fill (#fbfbfb) for the plot area and the border around it
p.border_fill_color = "#fbfbfb"
p.background_fill_color = "#fbfbfb"

# tooltip shows the hovered row's 'Major'
hover = HoverTool(tooltips="""
  <div>
    <h3>@Major</h3>
  </div>
""")
p.add_tools(hover)

# Show results
show(p)

In [252]:
# Keep rows where both columns are present, then correlate them.
valid_indices = data_df[['College_ratio', 'Unemployment_rate']].notnull().all(axis=1)
np.corrcoef(
    data_df.loc[valid_indices, 'College_ratio'],
    data_df.loc[valid_indices, 'Unemployment_rate'],
)

array([[ 1.        , -0.28131872],
       [-0.28131872,  1.        ]])

In [203]:
# Keep rows where both columns are present, then correlate them.
valid_indices = data_df[['College_ratio', 'ShareWomen']].notnull().all(axis=1)
np.corrcoef(
    data_df.loc[valid_indices, 'College_ratio'],
    data_df.loc[valid_indices, 'ShareWomen'],
)

array([[ 1.        , -0.12960308],
       [-0.12960308,  1.        ]])

In [266]:
# Write the figure currently bound to `p` to plot.png at 800x600.
# NOTE(review): `p` is rebound by several plotting cells, so which figure is
# exported depends on which of them ran last — confirm the intended plot.
bio.export_png(p, filename="plot.png", height=600, width=800)

'/home/sb/grad/dm/homeworks/hw3/plot.png'