In [94]:
import altair as alt
import pandas as pd

# Data
## Input File Format
The input file format is identical to "Options 1: File" in UpSetR-shiny (https://github.com/hms-dbmi/UpSetR-shiny)

- Columns are `attribute 1, attribute 2, ..., attribute N, set 1, set 2, ..., set M`, where each `set` column contains either `1` or `0` (`1` indicating the '⬤' representation in UpSet).

# Visualization
#### The UpSetAltair visualizations contain three main views: 

(1) **vertical bar chart** on the top showing the cardinality of each intersecting set;

(2) **matrix view** on the bottom-left showing the intersecting set;

~~(3) **horizontal bar chart** on the bottom-right showing the cardinality of each set.~~

# Process data

DeLUCS hits and misses are taken from c1normDeLUCS.png.
Also, the [SJSU - Ebay slides](https://docs.google.com/presentation/d/1_dlGqlGHNXgbPxo1PruNfi8C1a-fcNqzBPsIBoeHA9I/edit?usp=sharing) are used to double-check the hits and misses.

I am plotting the MBARC genome size, so slide 29 is used.


In [95]:
from enum import Enum
import pandas as pd
import altair as alt

# TODO: find a more descriptive name for this enum class
class Tools(Enum):
    """Column labels for the per-tool hit/miss membership flags.

    Each member's value is the exact column name used in the data frame;
    the "hit" values also serve as the row keys of the matrix view.
    """

    # DeLUCS membership columns
    DELUCSHIT = "DeLUCS hit"
    DELUCSMISS = "DeLUCS miss"

    # phyloFlash membership columns
    PHYLOFLASHHIT = "phyloFlash hit"
    PHYLOFLASHMISS = "phyloFlash miss"

In [96]:
# Load the repeat measurements and the genome sizes
source = pd.read_csv("Repeats - spade - mbarc.csv")
size = pd.read_csv("size.csv")

df_size = pd.DataFrame(size, columns=['Species', 'Whole Genome'])

df = pd.DataFrame(source, columns=['Species', 'Spade Whole Genome', 'Spade 16S'])

# The columns taken from df_size below are matched to df by row index, not by
# species name. Fail loudly if the two tables do not share the same index;
# otherwise unmatched rows would silently become NaN.
if not df.index.equals(df_size.index):
    raise ValueError(
        "Row index mismatch between the repeats table "
        f"({len(df)} rows) and the size table ({len(df_size)} rows)"
    )
# NOTE(review): this still assumes both files list the species in the same
# order -- confirm, or merge on 'Species' if the names match across files.

# new columns 1 | 0 for membership (1 = True)
# phyloFlash hit | phyloFlash miss | DeLUCS hit | DeLUCS miss

# most of phyloFlash hit so set all columns to 1 and change the few that miss
df[Tools.PHYLOFLASHHIT.value] = 1
df[Tools.PHYLOFLASHMISS.value] = 0
# most of DeLUCS hit so set all columns to 1 and change the few that miss
df[Tools.DELUCSHIT.value] = 1
df[Tools.DELUCSMISS.value] = 0

print(df_size['Whole Genome'])
print(df['Spade Whole Genome'])

df['Genome Size'] = df_size['Whole Genome']
# Fraction of the genome reported as repeats (plotted later with a '%' format)
df['Whole Genome'] = (df['Spade Whole Genome']/df_size['Whole Genome'])
df['16S'] = df['Spade 16S']

# NOTE(review): this file is written before the miss flags are adjusted in the
# following cells, so its hit/miss columns still hold the defaults set above.
df.to_csv('./mbarc_repetitiveness_percent.csv')

print(df['Whole Genome'])

0     3256683
1     3843301
2     3750771
3     3309401
4     4991181
5     4873567
6     4855529
7     4639675
8     5608040
9     2166381
10    3603458
11    3223876
12    3540114
13    3721669
14    3788356
15    4314118
16    6543312
17    2051896
18    4600489
19    4460105
20    4600800
21    3157527
22    4653970
23    1852441
24    5227858
25    4355525
Name: Whole Genome, dtype: int64
0      29174
1      70506
2     100227
3      11249
4      71266
5      39920
6      54706
7      74419
8      52946
9      34521
10     48568
11    101001
12     13339
13     72806
14     84127
15     60479
16    110621
17     24769
18     38301
19     35096
20     77761
21     42605
22     18231
23     23146
24     60377
25     75299
Name: Spade Whole Genome, dtype: int64
0     0.008958
1     0.018345
2     0.026722
3     0.003399
4     0.014278
5     0.008191
6     0.011267
7     0.016040
8     0.009441
9     0.015935
10    0.013478
11    0.031329
12    0.003768
13    0.019563
14    0.022207
1

### DeLUCS miss [total 4](https://drive.google.com/file/d/1PSZ_mhcrcZkt2JlWj6S-4rmYLKxkI_PA/view?usp=sharing)

Olsenella_uli, Salmonella_bongori, Segniliparus_rotundus, Streptococcus_pyogenes

In [97]:
# Species listed as DeLUCS misses: flip their flags from "hit" to "miss"
delucs_missed = [
    "Olsenella_uli",
    "Salmonella_bongori",
    "Segniliparus_rotundus",
    "Streptococcus_pyogenes",
]
is_delucs_miss = df["Species"].isin(delucs_missed)

df.loc[is_delucs_miss, Tools.DELUCSMISS.value] = 1
df.loc[is_delucs_miss, Tools.DELUCSHIT.value] = 0

### phyloFlash miss [total 4](https://docs.google.com/spreadsheets/d/1bfky2TWEFXRBDVJEu07mVmbjfhVjrO01BbkltOo915A/edit#gid=939002145)

E.coli, Nocardiopsis_dassonvillei, Salmonella_bongori, Fervidobacterium_pennivorans

In [98]:
# Species listed as phyloFlash misses: flip their flags from "hit" to "miss"
phyloflash_missed = [
    "Nocardiopsis_dassonvillei",
    "E.coli",
    "Salmonella_bongori",
    "Fervidobacterium_pennivorans",
]
is_phyloflash_miss = df["Species"].isin(phyloflash_missed)

df.loc[is_phyloflash_miss, Tools.PHYLOFLASHMISS.value] = 1
df.loc[is_phyloflash_miss, Tools.PHYLOFLASHHIT.value] = 0

# Custom upset plot

In [99]:
# --- overall figure geometry ---
width = 2000
height = 800
# fraction of the total height taken by the vertical bar chart
height_ratio = 0.6
# currently no horizontal bar graph (its width is still subtracted from the matrix width)
horizontal_bar_chart_width = 300

# --- Species labels under the matrix ---
bottom_x_axis_labels_size = 16
bottom_x_axis_labels_weight = alt.FontWeight("normal")

# --- bar chart settings ---
vertical_bar_label_size = 14
vertical_bar_tick_font_size = 16
vertical_bar_thickness = 40
vertical_bar_title_font_size = 35

# line_chart_font_size = 15

# --- matrix view settings ---
glyph_size = 200
matrix_label_size = 13
dot_line_connection_size = 2
matrix_title_size = 23

# --- colours ---
main_color = "#3b528b"
highlight_color = "#5ec962"

# --- sizes derived from the values above ---
vertical_bar_chart_height = height * height_ratio
matrix_height = (height - vertical_bar_chart_height) * 0.4
matrix_width = width - horizontal_bar_chart_width

# hover tooltip: shows the "Whole Genome" column under the title "Size"
tooltip = [alt.Tooltip("Whole Genome", title="Size")]

# selection driven by hovering over a Species
mouse_hover_selection = alt.selection_multi(on="mouseover", fields=["Species"])

# main_color while a mark is NOT selected (note the ~), highlight_color on hover
brush_color = alt.condition(
    ~mouse_hover_selection,
    alt.value(main_color),
    alt.value(highlight_color),
)


##### end tuning parameters



# sorting by Whole Genome Descending to be used by upset plot
df = df.sort_values(by="Whole Genome", ascending=False)
# no dot in a hit row = tool misses, so the "miss" columns are not plotted.
# errors="ignore" keeps this cell re-runnable: without it a second run raises
# KeyError because the columns have already been dropped from df.
df = df.drop(
    columns=[Tools.PHYLOFLASHMISS.value, Tools.DELUCSMISS.value],
    errors="ignore",
)
# upset_plot_sorting = alt.SortField(field='Whole Genome', order='descending')
# NOTE(review): df has no column named 'index' in this notebook -- confirm this
# sort field resolves the way the bar chart expects.
upset_plot_sorting = alt.SortField(field='index')

In [115]:
# vertical bar chart: one bar per Species, bar height = "Whole Genome" fraction

# Upper end of the y scale. Series.max() skips NaN, whereas the built-in max()
# gives an order-dependent result when a NaN is present.
whole_genome_max = float(df["Whole Genome"].max())

vertical_bar = alt.Chart(df.copy(), title='MBARC-26 Repetitiveness (Whole Genome %)').mark_bar(color=main_color, size=vertical_bar_thickness).encode(
    x=alt.X(
        "Species",
        axis=alt.Axis(grid=False, labels=False, ticks=False, domain=True),
        title=None,
        sort=upset_plot_sorting,
    ),
    y=alt.Y(
        "Whole Genome",
        axis=alt.Axis(
            grid=False,
            orient='left',
            labelFontSize=vertical_bar_tick_font_size,
            tickCount=6,
            format='%'
            ),
        title=None,
        scale=alt.Scale(type="symlog", domain=[0, whole_genome_max])
    ),
    tooltip=tooltip,
    color=brush_color
).properties(
    width=matrix_width,
    height=vertical_bar_chart_height
)

# A bare `vertical_bar.configure_title(fontSize=10000)` call used to sit here.
# configure_* methods return a new chart and the result was discarded, so the
# call had no effect; the title font size is set on the combined chart instead.

# value labels drawn just above each bar, formatted as a percentage
vertical_bar_text = vertical_bar.mark_text(
    color=main_color,
    dy=-10,
    size=vertical_bar_label_size,
    fontWeight='bold',
).encode(
    text=alt.Text("Whole Genome", format=".2%")
)

# vertical_bar_line = vertical_bar.mark_line(color=highlight_color, opacity=1).encode(
#     color=alt.value(highlight_color),
#     y=alt.Y(
#             '16S:Q',
#            )
#     )

# line_text = vertical_bar_line.mark_text(
#     color=main_color,
#     align='center',
#     baseline='middle',
#     dy=-19,
#     fontWeight='bold',
#     fontSize=line_chart_font_size,
# ).encode(
#     text=alt.Text("16S:Q")
# )

# Single-row frame holding the label text used by the legend marker
legend_labels = ['Repetitiveness (%)']
bardata = pd.DataFrame({'x': legend_labels})

# legendline = alt.Chart(bardata).mark_tick(
#         size=15, 
#         thickness=3, 
#         orient='horizontal', 
#         color='black',
#         opacity=1
# ).encode(
#         color=alt.value(highlight_color),
#             y=alt.Y('y:N',
#                     axis=alt.Axis(orient='right', 
#                                   titleFontSize=0, 
#                                   labelFontSize=18, 
#                                   titleX=80, 
#                                   titleY=-5,
#                                   titleAngle=0
#                                  ),

#                    ),
#     )

# Square marker with a right-hand axis label, built from bardata
legend_axis = alt.Axis(
    orient='right',
    titleFontSize=0,
    labelFontSize=18,
    titleX=80,
    titleY=-5,
    titleAngle=0,
)

legendbar = (
    alt.Chart(bardata)
    .mark_square(size=200, color="#303030")
    .encode(y=alt.Y('x:N', axis=legend_axis))
)


def _dashed_rule(level, color):
    """Return a dashed horizontal rule drawn at y == level in the given color."""
    rule_data = pd.DataFrame({'y': [level]})
    return alt.Chart(rule_data).mark_rule(color=color, strokeDash=[5,5]).encode(y='y')


# reference lines at 0.01 and 0.02 on the bar chart's y axis
band0 = _dashed_rule(.01, 'cyan')
band1 = _dashed_rule(.02, 'magenta')

# Layer bars, bar labels and both reference lines, then attach the hover selection
layered_bars = vertical_bar + vertical_bar_text + band0 + band1
vertical_bar_chart = layered_bars.add_selection(mouse_hover_selection)



In [116]:
# matrix view selection: hovering selects on both Species and the folded value
selection2 = alt.selection_multi(on="mouseover", fields=["Species", "value"])

# Both values (0 and 1) map to the same colour when not hovered; legend hidden
glyph_scale = alt.Scale(domain=[0,1], range=["#440154", "#440154"])
unselected_glyph_color = alt.Color("value:N", scale=glyph_scale, legend=None)

circle_color = alt.condition(
    ~selection2,
    unselected_glyph_color,
    alt.value(highlight_color),
)

In [117]:
# UpSet glyph view (matrix view): one dot per Species and per "hit" column.
# A folded value of 0 means the tool missed that Species.
hit_columns = [Tools.PHYLOFLASHHIT.value, Tools.DELUCSHIT.value]

species_axis = alt.Axis(
    grid=False,
    ticks=False,
    domain=False,
    labelFontSize=bottom_x_axis_labels_size,
    labelFontWeight=bottom_x_axis_labels_weight,
    labelLimit=10000,
    labelAngle=-45,
)
tool_axis = alt.Axis(
    grid=False,
    ticks=False,
    domain=False,
    orient='right',
    labelFontWeight="bold",
)

matrix_base = (
    alt.Chart(df.copy())
    .mark_circle(size=glyph_size, opacity=1)
    .transform_fold(hit_columns)
    .encode(
        # sort=None keeps the row order of the data frame
        x=alt.X("Species", axis=species_axis, title="Species", sort=None),
        y=alt.Y("key:N", axis=tool_axis, title=None, sort=["phyloFlash hit","DeLUCS hit"]),
        color=circle_color,
    )
    .properties(height=matrix_height, width=matrix_width)
)

# Slightly larger grey dots for the misses (value == 0); layered last below so
# that miss dots are not highlighted on mouse hover
grey_glyph_size = glyph_size + 30
circle_grey_miss = (
    matrix_base
    .mark_circle(size=grey_glyph_size, opacity=1)
    .transform_filter(alt.datum["value"] == 0)
    .encode(color=alt.value("#E6E6E6"))
)

# Light background rectangles on the "DeLUCS hit" row to tell the rows apart
zebra_strip_matrix_view = (
    matrix_base
    .mark_rect()
    .transform_filter(alt.datum.key == "DeLUCS hit")
    .encode(color=alt.value("#F7F7F7"))
)

# Assemble the matrix view. Layer order matters: matrix_base is repeated after
# the zebra strip so tooltips and hover highlighting show properly.
matrix_layers = matrix_base + zebra_strip_matrix_view + matrix_base + circle_grey_miss
matrix_view = matrix_layers.add_selection(
    mouse_hover_selection,
    selection2,
)




In [118]:
# Final upset plot: the bar chart (wrapped in a one-item hconcat) stacked above the matrix view
upset_plot = alt.vconcat(
    alt.hconcat(vertical_bar_chart),
    matrix_view,
)

# Final styling. The configured chart is the cell's last expression, so it is
# what gets displayed; `upset_plot` itself keeps the unconfigured chart.
(
    upset_plot
    .configure_view(stroke=None)
    .configure_axis(titleFontSize=matrix_title_size, labelFontSize=matrix_label_size)
    .configure_title(fontSize=vertical_bar_title_font_size)
)
