In [1]:
import altair as alt
import pandas as pd

# Data
## Input File Format
The input file format is identical to "Options 1: File" in UpSetR-shiny (https://github.com/hms-dbmi/UpSetR-shiny)

- Columns are `attribute 1, attribute 2, ... attribute N, set 1, set2, ..., set M` where `set` columns contain either `1` or `0`, `1` indicating the '⬤' representation in UpSet)

# Visualization
#### The UpSetAltair visualizations contain three main views: 

(1) **vertical bar chart** on the top showing the cardinality of each intersecting set;

(2) **matrix view** on the bottom-left showing the intersecting set;

~~(3) **horizontal bar chart** on the bottom-right showing the cardinality of each set.~~

# Process data

DeLUCS hit and misses are from c1normDeLUCS.png
Also, [SJSU - Ebay slides](https://docs.google.com/presentation/d/1_dlGqlGHNXgbPxo1PruNfi8C1a-fcNqzBPsIBoeHA9I/edit?usp=sharing) are being used to double check hit and misses

I am plotting mbarc genome size so slide 29 is being used


In [2]:
from enum import Enum
import pandas as pd
import altair as alt

# better enum class name??
class Tools(Enum):
    DELUCSHIT = "DeLUCS hit"
    DELUCSMISS = "DeLUCS miss"
    PHYLOFLASHHIT = "phyloFlash hit"
    PHYLOFLASHMISS = "phyloFlash miss"

In [3]:
source = pd.read_csv("Repeats - spade - mbarc.csv")
size = pd.read_csv("size.csv")

df_size = pd.DataFrame(size, columns=['Species', 'Whole Genome'])

df = pd.DataFrame(source, columns=['Species', 'Spade Whole Genome', 'Spade 16S'])
# new columns 1 | 0 for membership (1 = True)
# phyloFlash hit | phyloFlash miss | DeLUCS hit | DeLUCS miss

# most of phyloFlash hit so set all columns to 1 and change the few that miss
df[Tools.PHYLOFLASHHIT.value] = 1
df[Tools.PHYLOFLASHMISS.value] = 0
# most of DeLUCS hit so set all columns to 1 and change the few that miss
df[Tools.DELUCSHIT.value] = 1
df[Tools.DELUCSMISS.value] = 0



df['Genome Size'] = df_size['Whole Genome']
df['Whole Genome'] = (df['Spade Whole Genome']/df_size['Whole Genome'])
df['16S'] = df['Spade 16S']

df['set'] = 1

#Set up hotSprings
source = pd.read_csv("meet_correct.csv")
file = pd.read_csv("Repeats - spade - hot spring.csv")

meet_correct = pd.DataFrame(source,columns=['Species', 'genome_size','phyloFlash hit','DeLUCS hit'])
repeat = pd.DataFrame(file, columns=['Species', 'Spade Whole Genome', 'Spade 16S'])

print(meet_correct['genome_size'])

meet_correct['Whole Genome'] = repeat['Spade Whole Genome']
meet_correct['16S'] = repeat['Spade 16S']
meet_correct['set'] = 0


meet_correct = meet_correct.sort_values(by="Species")
meet_correct = meet_correct.reset_index(drop=True)
print(meet_correct['genome_size'])

meet_correct['Whole Genome'] = (repeat['Spade Whole Genome']/meet_correct['genome_size'])

print(df['Whole Genome'])

df = pd.concat([df, meet_correct])
df

0     4304237
1     2158963
2     2035182
3     3097457
4     2802273
5     3249394
6     9965640
7     3695372
8     5723298
9     5136885
10    5801598
11    2932766
12    4373075
13    3046682
14    5258541
15    5268950
16    4684931
17    2353197
18    4770266
19    4890986
20    3049282
21    3059647
22    2165165
23    4557046
24    1521037
25    4796642
26    2135342
27    1814952
28    1871474
29    3658997
30    1915222
Name: genome_size, dtype: int64
0     1814952
1     2353197
2     9965640
3     3059647
4     3695372
5     4684931
6     5258541
7     5268950
8     5136885
9     4770266
10    4890986
11    4304237
12    3049282
13    3658997
14    3097457
15    3249394
16    2802273
17    4373075
18    4796642
19    4557046
20    2165165
21    2135342
22    5723298
23    5801598
24    3046682
25    2932766
26    1521037
27    1915222
28    1871474
29    2158963
30    2035182
Name: genome_size, dtype: int64
0     0.008958
1     0.018345
2     0.026722
3     0.003399
4     0.

Unnamed: 0,Species,Spade Whole Genome,Spade 16S,phyloFlash hit,phyloFlash miss,DeLUCS hit,DeLUCS miss,Genome Size,Whole Genome,16S,set,genome_size
0,Clostridium_perfringens,29174.0,0.0,1,0.0,1,0.0,3256683.0,0.008958,0,1,
1,Clostridium_thermocellum,70506.0,0.0,1,0.0,1,0.0,3843301.0,0.018345,0,1,
2,Coraliomargarita_akajimensis,100227.0,0.0,1,0.0,1,0.0,3750771.0,0.026722,0,1,
3,Corynebacterium_glutamicum,11249.0,0.0,1,0.0,1,0.0,3309401.0,0.003399,0,1,
4,Desulfosporosinus_acidiphilus,71266.0,0.0,1,0.0,1,0.0,4991181.0,0.014278,0,1,
5,Desulfosporosinus_meridiei,39920.0,0.0,1,0.0,1,0.0,4873567.0,0.008191,0,1,
6,Desulfotomaculum_gibsoniae,54706.0,0.0,1,0.0,1,0.0,4855529.0,0.011267,0,1,
7,E.coli,74419.0,0.0,1,0.0,1,0.0,4639675.0,0.01604,0,1,
8,Echinicola_vietnamensis,52946.0,0.0,1,0.0,1,0.0,5608040.0,0.009441,0,1,
9,Fervidobacterium_pennivorans,34521.0,0.0,1,0.0,1,0.0,2166381.0,0.015935,0,1,


### DeLUCS miss [total 4](https://drive.google.com/file/d/1PSZ_mhcrcZkt2JlWj6S-4rmYLKxkI_PA/view?usp=sharing)

Olsenella_uli, Salmonella_bongori, Segniliparus_rotundus, Streptococcus_pyogenes

In [4]:
df.loc[df['DeLUCS hit'] == 1 , Tools.DELUCSHIT.value] = 1
df.loc[df['DeLUCS hit'] == 1 , Tools.DELUCSMISS.value] = 0

df.loc[df['DeLUCS hit'] == 0 , Tools.DELUCSMISS.value] = 1
df.loc[df['DeLUCS hit'] == 0 , Tools.DELUCSHIT.value] = 0

df.loc[df["Species"] == "Olsenella_uli", Tools.DELUCSMISS.value] = 1
df.loc[df["Species"] == "Olsenella_uli", Tools.DELUCSHIT.value] = 0

df.loc[df["Species"] == "Salmonella_bongori", Tools.DELUCSMISS.value] = 1
df.loc[df["Species"] == "Salmonella_bongori", Tools.DELUCSHIT.value] = 0

df.loc[df["Species"] == "Segniliparus_rotundus", Tools.DELUCSMISS.value] = 1
df.loc[df["Species"] == "Segniliparus_rotundus", Tools.DELUCSHIT.value] = 0


df.loc[df["Species"] == "Streptococcus_pyogenes", Tools.DELUCSMISS.value] = 1
df.loc[df["Species"] == "Streptococcus_pyogenes", Tools.DELUCSHIT.value] = 0

### phyloFlash miss [total 4](https://docs.google.com/spreadsheets/d/1bfky2TWEFXRBDVJEu07mVmbjfhVjrO01BbkltOo915A/edit#gid=939002145)

E.coli, Nocardiopsis_dassonvillei, Salmonella_bongori, Fervidobacterium_pennivorans

In [5]:
df.loc[df["phyloFlash hit"] == 1, Tools.PHYLOFLASHHIT.value] = 1
df.loc[df["phyloFlash hit"] == 1, Tools.PHYLOFLASHMISS.value] = 0

df.loc[df["phyloFlash hit"] == 0, Tools.PHYLOFLASHMISS.value] = 1
df.loc[df["phyloFlash hit"] == 0, Tools.PHYLOFLASHHIT.value] = 0

df.loc[df["Species"] == "Nocardiopsis_dassonvillei", Tools.PHYLOFLASHMISS.value] = 1
df.loc[df["Species"] == "Nocardiopsis_dassonvillei", Tools.PHYLOFLASHHIT.value] = 0

df.loc[df["Species"] == "E.coli", Tools.PHYLOFLASHMISS.value] = 1
df.loc[df["Species"] == "E.coli", Tools.PHYLOFLASHHIT.value] = 0

df.loc[df["Species"] == "Salmonella_bongori", Tools.PHYLOFLASHMISS.value] = 1
df.loc[df["Species"] == "Salmonella_bongori", Tools.PHYLOFLASHHIT.value] = 0

df.loc[df["Species"] == "Fervidobacterium_pennivorans", Tools.PHYLOFLASHMISS.value] = 1
df.loc[df["Species"] == "Fervidobacterium_pennivorans", Tools.PHYLOFLASHHIT.value] = 0

# Custom upset plot

In [6]:
width=3000
height=800
height_ratio=0.6
# currently no horizontal bar graph
horizontal_bar_chart_width=300

# Species labels size
bottom_x_axis_labels_size = 20
bottom_x_axis_labels_weight = alt.FontWeight("normal")

# bar chart settings
vertical_bar_label_size=15
vertical_bar_tick_font_size=20
vertical_bar_thickness=40
vertical_bar_title_font_size=35

# line_chart_font_size = 15

# matrix view settings
glyph_size = 400
matrix_label_size = 20
dot_line_connection_size=2
matrix_title_size = 23

# on mouse hover tooltip show Genome size value
tooltip = [
    alt.Tooltip("Whole Genome", title="Size"),
]

vertical_bar_chart_height = height * height_ratio
matrix_height = (height - vertical_bar_chart_height) * 0.4
matrix_width = (width - horizontal_bar_chart_width)

main_color = "#3b528b"
highlight_color = "#5ec962"

# on mouse hover over Species
mouse_hover_selection = alt.selection_multi(on="mouseover", fields=["Species"])
# applies highlight on mouse hover of Species
# main_color when no mouse hover
#  note the ~
brush_color = alt.condition(~mouse_hover_selection, alt.value(main_color), alt.value(highlight_color))


##### end tuning parameters



# sorting by Whole Genome Descending to be used by upset plot
df = df.sort_values(by="Whole Genome", ascending=False)
# no dot in a hit row = tool misses
df = df.drop([Tools.PHYLOFLASHMISS.value, Tools.DELUCSMISS.value], axis=1)
# upset_plot_sorting = alt.SortField(field='Whole Genome', order='descending')
upset_plot_sorting = alt.SortField(field='index')

In [7]:
# vertical bar chart
vertical_bar = alt.Chart(df.copy(), title='Repetitiveness (Whole Genome %)').mark_bar(color=main_color, size=vertical_bar_thickness).encode(
    x=alt.X(
        "Species", 
        axis=alt.Axis(grid=False, labels=False, ticks=False, domain=True),
        title=None,
        sort=upset_plot_sorting,
    ),
    y=alt.Y(
        "Whole Genome",
        axis=alt.Axis(
            grid=False,
            orient='left',
            labelFontSize=vertical_bar_tick_font_size,
            tickCount=6,
            format='%'
            ),
        title=None,
        scale=alt.Scale(type="symlog",domain=[0,max(df["Whole Genome"])])
    ),
    tooltip=tooltip,
    color=alt.condition(alt.datum.set == 1, alt.value(main_color), alt.value("#b80000") )
).properties(
    width=matrix_width,
    height=vertical_bar_chart_height
)

vertical_bar.configure_title(fontSize=10000)

vertical_bar_text = vertical_bar.mark_text(
    color=main_color, 
    dy=-10,
    size=vertical_bar_label_size,
    fontWeight='bold',
).encode(
    text=alt.Text("Whole Genome", format=".2%")
)

# vertical_bar_line = vertical_bar.mark_line(color=highlight_color, opacity=1).encode(
#     color=alt.value(highlight_color),
#     y=alt.Y(
#             '16S:Q',
#            )
#     )

# line_text = vertical_bar_line.mark_text(
#     color=main_color,
#     align='center',
#     baseline='middle',
#     dy=-19,
#     fontWeight='bold',
#     fontSize=line_chart_font_size,
# ).encode(
#     text=alt.Text("16S:Q")
# )

bardata = pd.DataFrame({
    'x': ['Hot Springs'],
    'y': ['MBARC']
                       
})

legendline = alt.Chart(bardata).mark_square(size=400, color=main_color).encode(
            y=alt.Y('y:N',
                    axis=alt.Axis(orient='right', 
                                  titleFontSize=0, 
                                  labelFontSize=26, 
                                  titleX=80, 
                                  titleY=-5,
                                  titleAngle=0
                                 ),

                   ),
    ).properties(
    height=60
)

legendbar = alt.Chart(bardata).mark_square(size=400, color='#b80000').encode(
    y=alt.Y('x:N', 
            axis=alt.Axis(orient='right', 
                          titleFontSize=0, 
                          labelFontSize=26,
                          titleX=80, 
                          titleY=-5,
                          titleAngle=0
                         ),
    
           ),
    )

# Combines the separate components to create the vertical bar chart
# mouse interaction added
vertical_bar_chart = ( vertical_bar + vertical_bar_text).add_selection(
    mouse_hover_selection
)



In [8]:
# matrix view selections
selection2 = alt.selection_multi(on="mouseover", fields=["Species", "value"])
circle_color = alt.condition(~selection2, alt.Color("value:N", scale=alt.Scale(domain=[0,1], range=[main_color, main_color]), legend=None), alt.value(highlight_color))

In [9]:
# UpSet glyph view (matrix view) of hits
# value of 0 = miss
matrix_base = alt.Chart(df.copy()).mark_circle(
    size=glyph_size,
    opacity=1
).transform_fold(
    [Tools.PHYLOFLASHHIT.value, Tools.DELUCSHIT.value]
).encode(
    x=alt.X(
        "Species",
#         axis=alt.Axis(grid=False, ticks=False, domain=False, labelFontSize=bottom_x_axis_labels_size, labelFontWeight=bottom_x_axis_labels_weight, labelLimit=10000, labelAngle=-45),
        axis=alt.Axis(
            grid=False,
            ticks=False,
            domain=False,
            labelFontSize=bottom_x_axis_labels_size,
            labelFontWeight=bottom_x_axis_labels_weight,
            labelLimit=10000,
            labelAngle=-45,
            titleY=300.0
            ),
        title="Species",
        sort=None
    ),
    y=alt.Y(
        "key:N",
        axis=alt.Axis(grid=False, ticks=False, domain=False, orient='right', labelFontWeight="bold"),
        title=None,
        sort=["phyloFlash hit","DeLUCS hit"]
    ),
    color=circle_color,
).properties(
    height=matrix_height,
    width= matrix_width
)

# keeps miss dots from being highlighted on mouse hover
grey_glyph_size = glyph_size + 30
circle_grey_miss = matrix_base.mark_circle(size=grey_glyph_size, opacity=1).transform_filter(
    (alt.datum["value"] == 0)
).encode(
    color=alt.value("#E6E6E6")
)

# highlights every other row of matrix view to distinguish hit and miss rows 
zebra_strip_matrix_view = matrix_base.mark_rect().transform_filter(
    (alt.datum.key == "DeLUCS hit")
).encode(
    color=alt.value("#F7F7F7")
)

# Combines the separate components to create the matrix view
# order seems matter
# mouse interaction added
# Duplicate `circle` is to properly show tooltips and on mouse color highlighting
matrix_view = (matrix_base + zebra_strip_matrix_view + matrix_base + circle_grey_miss).add_selection(
    mouse_hover_selection,
    selection2
)




In [10]:
# combine all charts to make final upset plot
upset_plot = alt.hconcat(
    vertical_bar_chart, legendline + legendbar
)

upset_plot = alt.vconcat(
    upset_plot,
    matrix_view
)

# final graph settings tuning
upset_plot.configure_view(
    stroke=None
).configure_axis(
    titleFontSize=matrix_title_size,
    labelFontSize=matrix_label_size
).configure_title(
    fontSize=vertical_bar_title_font_size
)


  for col_name, dtype in df.dtypes.iteritems():
