In [1]:
import altair as alt
import pandas as pd

# Data
## Input File Format


- Columns are `attribute 1, attribute 2, ... attribute N, set 1, set2, ..., set M` where `set` columns contain either `1` or `0`, `1` indicating the '⬤' representation in UpSet)

# Process data

DeLUCS hit and misses are from c1normDeLUCS.png
Also, [SJSU - Ebay slides](https://docs.google.com/presentation/d/1_dlGqlGHNXgbPxo1PruNfi8C1a-fcNqzBPsIBoeHA9I/edit?usp=sharing) are being used to double check hit and misses

I am plotting mbarc genome size so slide 29 is being used


In [2]:
from enum import Enum

# better enum class name??
class Tools(Enum):
    DELUCSHIT = "DeLUCS hit"
    DELUCSMISS = "DeLUCS miss"
    PHYLOFLASHHIT = "phyloFlash hit"
    PHYLOFLASHMISS = "phyloFlash miss"

In [3]:
source = pd.read_csv("meetHitMiss.csv")

df = pd.DataFrame(source)


# Custom upset plot

In [4]:
width=2000
height=800
height_ratio=0.6
# currently no horizontal bar graph
horizontal_bar_chart_width=300

# Species labels size
bottom_x_axis_labels_size = 16
bottom_x_axis_labels_weight = alt.FontWeight("normal")

# bar chart settings
vertical_bar_label_size=12
vertical_bar_tick_font_size = 14
vertical_bar_thickness=40
vertical_bar_title_font_size = 35

line_chart_font_size = 15

# matrix view settings
glyph_size = 200
matrix_label_size = 13
dot_line_connection_size=2
matrix_title_size = 23

# on mouse hover tooltip show Genome size value
tooltip = [
    alt.Tooltip("GC", title="GC"),
]

vertical_bar_chart_height = height * height_ratio
matrix_height = (height - vertical_bar_chart_height) * 0.4
matrix_width = (width - horizontal_bar_chart_width)

main_color = "#3b528b"
highlight_color = "#5ec962"

# on mouse hover over Species
mouse_hover_selection = alt.selection_multi(on="mouseover", fields=["Species"])
# applies highlight on mouse hover of Species
# main_color when no mouse hover
#  note the ~
brush_color = alt.condition(~mouse_hover_selection, alt.value(main_color), alt.value(highlight_color))


##### end tuning parameters



# sorting by genome_size Descending to be used by upset plot
df = df.sort_values(by="GC", ascending=False)
# no dot in a hit row = tool misses
upset_plot_sorting = alt.SortField(field='GC', order='descending')

In [5]:
# vertical bar chart
vertical_bar = alt.Chart(df.copy(), title='Hot Springs GC Content').mark_bar(color=main_color, size=vertical_bar_thickness).encode(
    x=alt.X(
        "Species", 
        axis=alt.Axis(grid=False, labels=False, ticks=False, domain=True),
        title=None,
        sort=upset_plot_sorting,
    ),
    y=alt.Y(
        "",
        axis=alt.Axis(
            grid=False,
            orient='left',
            labelFontSize=vertical_bar_tick_font_size,
            ),
        title=None,
        scale=alt.Scale(type="log")
    ),
    tooltip=tooltip,
    color=brush_color
).properties(
    width=matrix_width,
    height=vertical_bar_chart_height
)

vertical_bar.configure_title(fontSize=10000)

vertical_bar_text = vertical_bar.mark_text(
    color=main_color,
    dy=-10,
    size=vertical_bar_label_size,
    fontWeight="bold",
).encode(
    text=alt.Text("GC", format=",.0f")
)

vertical_bar_line = vertical_bar.mark_line(color=highlight_color, opacity=1).encode(
    color=alt.value(highlight_color),
    y=alt.Y(
            '16s_GC:Q',
           )
    )

line_text = vertical_bar_line.mark_text(
    color=main_color,
    align='center',
    baseline='middle',
    dy=-19,
    fontWeight='bold',
    fontSize=line_chart_font_size,
).encode(
    text=alt.Text("16s_GC:Q")
)

bardata = pd.DataFrame({
    'y':['16S'],
    'x':['Whole Genome']
                       
})

legendline = alt.Chart(bardata).mark_tick(
        size=15, 
        thickness=3, 
        orient='horizontal', 
        color='black',
        opacity=1
).encode(
        color=alt.value(highlight_color),
            y=alt.Y('y:N',
                    axis=alt.Axis(orient='right', 
                                  titleFontSize=0, 
                                  labelFontSize=18, 
                                  titleX=80, 
                                  titleY=-5,
                                  titleAngle=0
                                 ),

                   ),
    )

legendbar = alt.Chart(bardata).mark_square(size=200, color="#303030").encode(
    y=alt.Y('x:N', 
            axis=alt.Axis(orient='right', 
                          titleFontSize=0, 
                          labelFontSize=18, 
                          titleX=80, 
                          titleY=-5,
                          titleAngle=0
                         ),
    
           ),
    )


# Combines the separate components to create the vertical bar chart
# mouse interaction added
vertical_bar_chart = (vertical_bar + vertical_bar_text + vertical_bar_line).add_selection(
    mouse_hover_selection
)


In [6]:
# matrix view selections
selection2 = alt.selection_multi(on="mouseover", fields=["Species", "value"])
circle_color = alt.condition(~selection2, alt.Color("value:N", scale=alt.Scale(domain=[0,1], range=["#440154", "#440154"]), legend=None), alt.value(highlight_color))

In [7]:
# UpSet glyph view (matrix view) of hits
# value of 0 = miss
matrix_base = alt.Chart(df.copy()).mark_circle(
    size=glyph_size,
    opacity=1
).transform_fold(
    [Tools.PHYLOFLASHHIT.value, Tools.DELUCSHIT.value]
).encode(
    x=alt.X(
        "Species",
        axis=alt.Axis(
            grid=False,
            ticks=False,
            domain=False,
            labelFontSize=bottom_x_axis_labels_size,
            labelFontWeight=bottom_x_axis_labels_weight,
            labelLimit=10000,
            labelAngle=-45,
            titleY=300.0
            ),
        title="Species",
        sort=None
    ),
    y=alt.Y(
        "key:N",
        axis=alt.Axis(grid=False, ticks=False, domain=False, orient='right', labelFontWeight="bold"),
        title=None,
        sort=["phyloFlash hit","DeLUCS hit"]
    ),
    color=circle_color,
).properties(
    height=matrix_height,
    width= matrix_width
)

# keeps miss dots from being highlighted on mouse hover
grey_glyph_size = glyph_size + 30
circle_grey_miss = matrix_base.mark_circle(size=grey_glyph_size, opacity=1).transform_filter(
    (alt.datum["value"] == 0)
).encode(
    color=alt.value("#E6E6E6")
)

# connect dots new changes might of broken but might no longer be needed
# line_connect_dots = matrix_base.mark_bar(size=dot_line_connection_size, color=main_color).transform_filter(
#     alt.datum["value"] == 1
# ).encode(
#     y=alt.Y("min(key):N"),
#     y2=alt.Y2("max(key):N"),
#     color=alt.value(main_color)
# )

# highlights every other row of matrix view to distinguish hit and miss rows 
zebra_strip_matrix_view = matrix_base.mark_rect().transform_filter(
    (alt.datum.key == "DeLUCS hit")
).encode(
    color=alt.value("#F7F7F7")
)

# Combines the separate components to create the matrix view
# order seems matter
# mouse interaction added
# Duplicate `circle` is to properly show tooltips and on mouse color highlighting
matrix_view = (matrix_base + zebra_strip_matrix_view + matrix_base + circle_grey_miss).add_selection(
    mouse_hover_selection,
    selection2
)




In [8]:
# combine all charts to make final upset plot
upset_plot = alt.hconcat(
    vertical_bar_chart,
    legendline + legendbar
)

upset_plot = alt.vconcat(
    upset_plot,
    matrix_view
)

# final graph settings tuning
upset_plot.configure_view(
    stroke=None
).configure_axis(
    titleFontSize=matrix_title_size,
    labelFontSize=matrix_label_size
).configure_title(
    fontSize=vertical_bar_title_font_size
)


ValueError:  encoding field is specified without a type; the type cannot be inferred because it does not match any column in the data.

alt.VConcatChart(...)