<a href="https://colab.research.google.com/github/simon-mellergaard/datavis/blob/main/Experiments/bokeh_exp_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preprocessing

In [1]:
import os
import sys
# Colab bootstrap: this branch only runs when the 'google.colab' module is
# loaded, i.e. when the notebook is executed inside Google Colab.
if 'google.colab' in sys.modules:
    %cd /content/
    # Remove any existing local checkout so the clone below starts clean
    if os.path.isdir("datavis"):
        !rm -rf {"datavis"}
    # Fetch the repository and switch into the Experiments folder, which is
    # the working directory the relative '../Data/...' path used later expects.
    !git clone https://github.com/simon-mellergaard/datavis.git
    %cd /content/datavis/Experiments

/content
Cloning into 'datavis'...
remote: Enumerating objects: 280, done.[K
remote: Counting objects: 100% (42/42), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 280 (delta 18), reused 28 (delta 9), pack-reused 238 (from 1)[K
Receiving objects: 100% (280/280), 15.78 MiB | 9.08 MiB/s, done.
Resolving deltas: 100% (142/142), done.
/content/datavis/Experiments


In [2]:
import pandas as pd
# Read the combined UFM workbook; row 0 of the sheet supplies the column labels.
df_raw = pd.read_excel(
    '../Data/DATA_UFM_combined.xlsx',
    header=0,
)

# Columns to keep, declared per theme and stitched together (in order) below.

################## EDUCATION INFORMATION ###########################
_education_info_cols = [
    # Names of the education
    'udbud_id', 'titel', 'educational_category', 'displaydocclass', 'hovedinsttx',
    # Region
    'instregiontx', 'instkommunetx',
    # Grades coefficients
    'optagne', 'kvote_1_kvotient',
]

####################### EDUCATIONAL VARIABLES ########################
_education_likert_cols = [
    # Academic environment
    'fagligmiljo_likert', 'arbmedstud_likert',
    'medstuderende_likert', 'udbytte_undervisning_likert',
    # Social environment and well-being
    'socialtmiljo_likert', 'ensom_likert',
    'stress_daglig_likert', 'tilpas_likert',
    # Teachers
    'undervisere_engagerede_likert', 'undervisere_feedback_likert',
    'undervisere_hjaelp_likert', 'undervisere_kontakt_likert',
]

_education_continuous_cols = [
    'afbrud',                # dropout
    'tidsforbrug_p50',       # time spent on studies
    'tidsforbrug_arbejde',   # time spent on student job
]

# Teaching activity
_teaching_activity_cols = [
    'uddaktivitet_opgaver_pct', 'uddaktivitet_praktik_pct',
    'uddaktivitet_udlandsophold_pct', 'uddaktivitet_undervisning_pct',
]

# Teaching form
_teaching_form_cols = [
    'undervisningsform_p1',  # primary teaching form
]

########################## Job data ##################################
_job_continuous_cols = [
    'arbejdstid_timer', 'ledighed_nyudd', 'maanedloen_nyudd', 'maanedloen_10aar',
]

_job_likert_cols = [
    'ruster_til_job_likert', 'relevans_overens_udd_job_likert',
]

# Final selection: concatenation order defines the column order of the frame.
cols = [
    *_education_info_cols,
    *_education_likert_cols,
    *_education_continuous_cols,
    *_teaching_activity_cols,
    *_teaching_form_cols,
    *_job_continuous_cols,
    *_job_likert_cols,
]

# Restrict the raw frame to the selected columns.
selected = df_raw[cols]

# udbud_id 999999 marks rows describing an education at national level;
# set those aside (id column retained) ...
data_whole_edu = selected[selected['udbud_id'].eq(999999)]

# ... and keep the remaining rows without the id column.
# data_na still contains rows with missing values.
data_na = (
    selected[selected['udbud_id'].ne(999999)]
    .drop(columns=['udbud_id'])
    .copy()
)

# data: the same rows with every incomplete row removed.
data = data_na.dropna()

## Input Experiment

In [3]:
from bokeh.models import ColumnDataSource, Whisker
from bokeh.plotting import figure, show
from bokeh.models import TextInput
from bokeh.layouts import layout
from bokeh.io import output_notebook

output_notebook()

# Count rows per educational category once and derive BOTH the category
# labels and the bar heights from that single Series. value_counts() orders
# by descending count whereas unique() orders by first appearance, so taking
# labels from one and heights from the other pairs bars with the wrong labels.
category_counts = data['educational_category'].value_counts()
categories = category_counts.index.tolist()

p = figure(x_range=categories, height=400,
           tools="", toolbar_location=None, title="Counts of Educational Categories")

# Bar chart of educational_category counts; `colors` is the renderer returned
# by vbar, whose glyph carries the fill colour linked to the text box below.
colors = p.vbar(x=categories, top=category_counts.tolist(), width=0.5)

p.xgrid.grid_line_color = None
p.y_range.start = 0

# Text box seeded with the bars' current fill colour; js_link pushes every
# edit of the box's value onto the glyph's fill_color property.
textinput = TextInput(value=colors.glyph.fill_color, title="Label:")
textinput.js_link("value", colors.glyph, "fill_color")

# Bound to a separate name so the imported `layout` function is not replaced
# by its own return value (which would break any later layout(...) call).
input_layout = layout([textinput], [p])

show(input_layout)


## Brush and linking experiment

In [4]:
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show
from bokeh.models import BoxSelectTool, BoxZoomTool, LassoSelectTool

# Three toy series over the same x values.
x = list(range(21))
y0 = x
y1 = [20 - value for value in x]
y2 = [abs(value - 10) for value in x]

# Settings shared by every panel and by every scatter glyph
panel_size = dict(width=250, height=250, title=None)
marker_look = dict(size=10, alpha=0.5)

# First panel: default tools; owns the ranges the other panels borrow
s1 = figure(**panel_size)
s1.scatter(x, y0, color="navy", **marker_look)

# Second panel: shares both the x and the y range with s1
s2 = figure(
    x_range=s1.x_range,
    y_range=s1.y_range,
    tools=[BoxSelectTool(), LassoSelectTool(), BoxZoomTool()],
    **panel_size,
)
s2.scatter(x, y1, marker="triangle", color="firebrick", **marker_look)

# Third panel: shares only the x range with s1
s3 = figure(
    x_range=s1.x_range,
    tools=[BoxSelectTool(), LassoSelectTool(), BoxZoomTool()],
    **panel_size,
)
s3.scatter(x, y2, marker="square", color="olive", **marker_look)

# Arrange the panels in a single row and let the grid's toolbar auto-hide
p = gridplot([[s1, s2, s3]])
p.toolbar.autohide = True

show(p)

In [5]:
import numpy as np

from bokeh.models import BoxSelectTool, BoxZoomTool, LassoSelectTool
from bokeh.plotting import figure, show

# Seeded generator so the scatter shows the same 200 points on every
# Restart & Run All instead of a fresh random cloud each time.
rng = np.random.default_rng(42)
x = rng.random(size=200)
y = rng.random(size=200)

# Basic plot setup
plot = figure(width=400, height=400, title='Select and Zoom',
              tools="box_select,box_zoom,lasso_select,reset")

plot.scatter(x, y, size=5)

# Restyle the rectangle drawn while box-selecting: filled, no outline
select_overlay = plot.select_one(BoxSelectTool).overlay

select_overlay.fill_color = "firebrick"
select_overlay.line_color = None

# Restyle the rectangle drawn while box-zooming: thick solid outline, no fill
zoom_overlay = plot.select_one(BoxZoomTool).overlay

zoom_overlay.line_color = "olive"
zoom_overlay.line_width = 8
zoom_overlay.line_dash = "solid"
zoom_overlay.fill_color = None

# Lasso outline drawn with a [10, 10] dash pattern
plot.select_one(LassoSelectTool).overlay.line_dash = [10, 10]

show(plot)

# Data visualizations

In [6]:
# Make a bokeh plot with a histogram of 'tidsforbrug_p50' column
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

output_notebook()

# Bin the column into 20 equal-width bins: `hist` holds the per-bin counts,
# `edges` the 21 bin boundaries.
hist, edges = np.histogram(data['tidsforbrug_p50'], bins=20)

# Axis labels are set so the figure can be read on its own.
p = figure(title='Histogram of tidsforbrug_p50', background_fill_color="#fafafa",
           x_axis_label='tidsforbrug_p50', y_axis_label='Count')

# One rectangle per bin, spanning consecutive edges and rising from 0 to the count
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color="navy",
       line_color="white", alpha=0.5)

# Anchor the count axis at zero; the default auto-range pads below the bars
p.y_range.start = 0

show(p)