In [1]:
import altair as alt
import pandas as pd

# Data
## Input File Format


- Columns are `attribute 1, attribute 2, ..., attribute N, set 1, set 2, ..., set M`, where each `set` column contains either `1` or `0` (`1` indicates the '⬤' representation in UpSet).

# Process data

DeLUCS hits and misses are taken from `c1normDeLUCS.png`.
The [SJSU - Ebay slides](https://docs.google.com/presentation/d/1_dlGqlGHNXgbPxo1PruNfi8C1a-fcNqzBPsIBoeHA9I/edit?usp=sharing) are also used to double-check the hits and misses.

I am plotting mbarc genome size, so slide 29 is being used.


In [2]:
from enum import Enum

# TODO: pick a more descriptive name for this enum -- its members are
# hit/miss labels per tool rather than the tools themselves.
class Tools(Enum):
    """Display labels for each tool's hit and miss outcome.

    The "hit" values are passed to ``transform_fold`` when the matrix view
    is built, so they have to match column names in the plotted data.
    """

    # DeLUCS
    DELUCSHIT = "DeLUCS hit"
    DELUCSMISS = "DeLUCS miss"

    # phyloFlash
    PHYLOFLASHHIT = "phyloFlash hit"
    PHYLOFLASHMISS = "phyloFlash miss"

In [3]:
# Load the per-species table and the SPAdes repeat measurements.
source = pd.read_csv("meet_correct.csv")
file = pd.read_csv("Repeats - spade - hot spring.csv")

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed; sort by species and renumber the rows 0..N-1.
df = source.sort_values(by="Species").reset_index(drop=True)
# Keep only the columns used below (a column missing from the CSV becomes NaN).
df2 = pd.DataFrame(file, columns=['Species', 'Spade Whole Genome', 'Spade 16S'])
print(df['Species'])

# Look the repeat measurements up by species name rather than by row position.
# `df` has just been re-ordered by species while `df2` is still in file order,
# so a plain `df2[...] / df[...]` would pair row i with row i and silently mix
# up species unless the repeats CSV happened to be sorted the same way.
# Names are stripped because some species labels carry trailing whitespace.
species_key = df['Species'].astype(str).str.strip()
repeats_by_species = (
    df2.assign(Species=df2['Species'].astype(str).str.strip())
    .drop_duplicates(subset='Species')  # Series.map needs a unique lookup index
    .set_index('Species')
)

# Surface species with no repeat data instead of letting them turn into
# unexplained NaN bars in the plot.
unmatched = species_key[~species_key.isin(repeats_by_species.index)]
if not unmatched.empty:
    print("WARNING: no repeat data found for:", unmatched.tolist())

# Stored as a fraction (0-1); the charts render it with a '%' number format.
df['Repetitiveness (%)'] = (
    species_key.map(repeats_by_species['Spade Whole Genome']) / df['genome_size']
)
df['16S bp'] = species_key.map(repeats_by_species['Spade 16S'])
df.to_csv('./hotsprings_repetitiveness_percent.csv')
print(df[['Species','Repetitiveness (%)']].sort_values(by="Repetitiveness (%)", ascending=False))

0             Caldimicrobium thiodismutans strain TF1
1     Candidatus Caldatribacterium saccharof. SpSt-82
2            Candidatus Solibacter usitatus Ellin6076
3         Candidatus Thermochlorobacteriaceae GBChlB 
4                  Chloracidobacterium thermophilum B
5                     Chloroflexus aggregans DSM 9485
6                    Chloroflexus aurantiacus J-10-fl
7                   Chloroflexus aurantiacus Y-400-fl
8                Chloroflexus islandicus strain isl-2
9                               Chloroflexus sp. MS-G
10                           Chloroflexus sp. Y-396-1
11                     Elioraea tepidiphila DSM 17972
12           Gloeomargarita lithophora Alchichica-D10
13                    Ignavibacterium album JCM 16511
14                         Meiothermus ruber DSM 1279
15                      Meiothermus silvanus DSM 9946
16                     Meiothermus taiwanensis WR-220
17                     Oscillochloris trichoides DG-6
18                          

# Custom upset plot

In [4]:
##### begin tuning parameters

# Overall canvas size in pixels. height_ratio is the share of `height`
# given to the vertical bar chart (see vertical_bar_chart_height below).
width=2000
height=800
height_ratio=0.6
# currently no horizontal bar graph; this width is still subtracted from the
# total width when matrix_width is computed below
horizontal_bar_chart_width=300

# Species labels size
bottom_x_axis_labels_size = 20
bottom_x_axis_labels_weight = alt.FontWeight("normal")

# bar chart settings
vertical_bar_label_size=16
vertical_bar_tick_font_size = 20
vertical_bar_thickness=40
vertical_bar_title_font_size = 35

line_chart_font_size = 20

# matrix view settings
glyph_size = 400
matrix_label_size = 20
dot_line_connection_size=2
matrix_title_size = 23

# On mouse hover, show the repetitiveness of the hovered species.
# The field holds a fraction, so it is labelled and formatted as a percentage
# to match the text drawn above each bar (previously titled "Size", a
# leftover from the genome-size version of this plot).
tooltip = [
    alt.Tooltip("Repetitiveness (%)", title="Repetitiveness", format=".2%"),
]

# Derived layout sizes
vertical_bar_chart_height = height * height_ratio
matrix_height = (height - vertical_bar_chart_height) * 0.4
matrix_width = (width - horizontal_bar_chart_width)

main_color = "#b80000"
highlight_color = "#5ec962"

# on mouse hover over Species
mouse_hover_selection = alt.selection_multi(on="mouseover", fields=["Species"])
# applies highlight on mouse hover of Species
# main_color when no mouse hover
#  note the ~ : the condition is negated, so the first value (main_color) is
#  used for marks that are NOT in the selection
brush_color = alt.condition(~mouse_hover_selection, alt.value(main_color), alt.value(highlight_color))


##### end tuning parameters



# sorting by Repetitiveness Descending to be used by upset plot
df = df.sort_values(by="Repetitiveness (%)", ascending=False)
# no dot in a hit row = tool misses
# The same sort definition is shared by the x axes of the bar chart and the
# matrix view so their columns line up.
upset_plot_sorting = alt.SortField(field='Repetitiveness (%)', order='descending')
print(upset_plot_sorting)
print(df['Species'])

SortField({
  field: 'Repetitiveness (%)',
  order: 'descending'
})
22                 Roseiflexus castenholzii DSM 13941
23                               Roseiflexus sp. RS-1
8                Chloroflexus islandicus strain isl-2
6                    Chloroflexus aurantiacus J-10-fl
7                   Chloroflexus aurantiacus Y-400-fl
5                     Chloroflexus aggregans DSM 9485
10                           Chloroflexus sp. Y-396-1
17                     Oscillochloris trichoides DG-6
19                         Pseudanabaena sp. PCC 7367
12           Gloeomargarita lithophora Alchichica-D10
9                               Chloroflexus sp. MS-G
18                          Pseudanabaena sp. ABRG5-3
4                  Chloracidobacterium thermophilum B
13                    Ignavibacterium album JCM 16511
29                          Thermus aquaticus Y51MC23
24                  Synechococcus sp. JA-2-3B'a(2-13)
15                      Meiothermus silvanus DSM 9946
0             

In [5]:
# Vertical bar chart: one bar per species, bar height = repetitiveness
# (a fraction, rendered as a percentage by the axis and label formats).
vertical_bar = alt.Chart(df.copy(), title='Hot Spring Repetitiveness (Whole Genome %)').mark_bar(color=main_color, size=vertical_bar_thickness).encode(
    x=alt.X(
        "Species",
        # Species names are drawn once, under the matrix view, so the bar
        # chart's own x labels and ticks are hidden.
        axis=alt.Axis(grid=False, labels=False, ticks=False, domain=True),
        title=None,
        sort=upset_plot_sorting,
    ),
    y=alt.Y(
        "Repetitiveness (%)",
        axis=alt.Axis(
            grid=False,
            orient='left',
            labelFontSize=vertical_bar_tick_font_size,
            tickCount=5,
            format='%'
            ),
        title=None,
        # Series.max() skips missing values; the builtin max() gives an
        # order-dependent result when the column contains NaN.
        scale=alt.Scale(type="symlog", domain=[0, df['Repetitiveness (%)'].max()])
    ),
    tooltip=tooltip,
    color=brush_color
).properties(
    width=matrix_width,
    height=vertical_bar_chart_height
)

# NOTE: a bare `vertical_bar.configure_title(fontSize=10000)` call used to sit
# here. configure_* methods return a new chart rather than modifying the one
# they are called on, so the discarded result had no effect; it was removed.
# The title font size is set on the final concatenated chart instead.

# Percentage label drawn just above each bar; it reuses the bar chart's
# encodings (position, hover colour, tooltip) and only swaps the mark.
vertical_bar_text = vertical_bar.mark_text(
    color=main_color,
    dy=-10,
    size=vertical_bar_label_size,
    fontWeight="bold",
).encode(
    text=alt.Text("Repetitiveness (%)", format=".2%")
)

# Combines the separate components to create the vertical bar chart
# mouse interaction added
vertical_bar_chart = (vertical_bar + vertical_bar_text).add_selection(
    mouse_hover_selection
)



In [6]:
# Hover selection for the matrix view, keyed on the species column together
# with the folded "value" field.
selection2 = alt.selection_multi(on="mouseover", fields=["Species", "value"])

# Glyph colour: both ends of the colour range are main_color, so every glyph
# outside the hover selection is drawn in main_color whatever its value; a
# glyph inside the selection switches to highlight_color.
circle_color = alt.condition(
    ~selection2,
    alt.Color(
        "value:N",
        scale=alt.Scale(domain=[0,1], range=[main_color, main_color]),
        legend=None,
    ),
    alt.value(highlight_color),
)

In [7]:
# UpSet glyph view (matrix view) of hits
# value of 0 = miss

# Rows of the matrix, top to bottom. Defined once from the Tools enum so the
# fold, the row order and the zebra stripe cannot drift apart (the row order
# and stripe filter previously repeated these labels as string literals).
hit_rows = [Tools.PHYLOFLASHHIT.value, Tools.DELUCSHIT.value]

matrix_base = alt.Chart(df.copy()).mark_circle(
    size=glyph_size,
    opacity=1
).transform_fold(
    # Turns the per-tool hit columns into long form: one record per
    # (species, tool) with the column name in `key` and its content in `value`.
    hit_rows
).encode(
    x=alt.X(
        "Species",
        axis=alt.Axis(
            grid=False,
            ticks=False,
            domain=False,
            labelFontSize=bottom_x_axis_labels_size,
            labelFontWeight=bottom_x_axis_labels_weight,
            labelLimit=10000,  # large limit so long species names are not truncated
            labelAngle=-45,
            titleY=300.0
            ),
        title="Species",
        sort=upset_plot_sorting
    ),
    y=alt.Y(
        "key:N",
        axis=alt.Axis(grid=False, ticks=False, domain=False, orient='right', labelFontWeight="bold"),
        title=None,
        sort=hit_rows
    ),
    color=circle_color,
).properties(
    height=matrix_height,
    width= matrix_width
)

# keeps miss dots from being highlighted on mouse hover
# (drawn slightly larger than the base glyph so they fully cover it)
grey_glyph_size = glyph_size + 30
circle_grey_miss = matrix_base.mark_circle(size=grey_glyph_size, opacity=1).transform_filter(
    (alt.datum["value"] == 0)
).encode(
    color=alt.value("#E6E6E6")
)

# highlights every other row of matrix view to distinguish hit and miss rows
zebra_strip_matrix_view = matrix_base.mark_rect().transform_filter(
    (alt.datum.key == Tools.DELUCSHIT.value)
).encode(
    color=alt.value("#F7F7F7")
)

# Combines the separate components to create the matrix view.
# Layer order matters: later layers are drawn on top of earlier ones.
# mouse interaction added
# The duplicate `matrix_base` is intentional: it is needed to properly show
# tooltips and on-mouse colour highlighting.
matrix_view = (matrix_base + zebra_strip_matrix_view + matrix_base + circle_grey_miss).add_selection(
    mouse_hover_selection,
    selection2
)




In [8]:
# Assemble the final UpSet-style figure: bar chart row on top, hit/miss
# matrix underneath. The top row is a one-item hconcat -- presumably the slot
# where a horizontal bar chart would sit; none is drawn at the moment.
upset_plot = alt.vconcat(
    alt.hconcat(vertical_bar_chart),
    matrix_view,
)

# Figure-wide styling. This expression is deliberately the last one in the
# cell so the configured chart is what gets rendered; configure_* returns a
# new chart, so `upset_plot` itself is left without these settings.
upset_plot.configure_view(stroke=None).configure_axis(
    titleFontSize=matrix_title_size,
    labelFontSize=matrix_label_size,
).configure_title(fontSize=vertical_bar_title_font_size)


  for col_name, dtype in df.dtypes.iteritems():
