From b989178cc342152140585366c075284a79e8b069 Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Wed, 10 May 2023 11:36:24 -0400
Subject: [PATCH 01/13] Adding stable base for UpSet plots w/ single or
multiple group color modes.
---
.../plotly/plotly/figure_factory/__init__.py | 2 +
.../plotly/plotly/figure_factory/_upset.py | 262 ++++++++++++++++++
2 files changed, 264 insertions(+)
create mode 100644 packages/python/plotly/plotly/figure_factory/_upset.py
diff --git a/packages/python/plotly/plotly/figure_factory/__init__.py b/packages/python/plotly/plotly/figure_factory/__init__.py
index 0a41dca1ba2..4b82aeda542 100644
--- a/packages/python/plotly/plotly/figure_factory/__init__.py
+++ b/packages/python/plotly/plotly/figure_factory/__init__.py
@@ -25,6 +25,7 @@
from plotly.figure_factory._streamline import create_streamline
from plotly.figure_factory._table import create_table
from plotly.figure_factory._trisurf import create_trisurf
+from plotly.figure_factory._upset import create_upset
from plotly.figure_factory._violin import create_violin
if optional_imports.get_module("pandas") is not None:
@@ -65,5 +66,6 @@ def create_ternary_contour(*args, **kwargs):
"create_table",
"create_ternary_contour",
"create_trisurf",
+ "create_upset",
"create_violin",
]
diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py
new file mode 100644
index 00000000000..14966369c3a
--- /dev/null
+++ b/packages/python/plotly/plotly/figure_factory/_upset.py
@@ -0,0 +1,262 @@
+from __future__ import absolute_import
+
+from plotly import exceptions, optional_imports
+import plotly.graph_objects as go
+import plotly.express as px
+
+pd = optional_imports.get_module("pandas")
+np = optional_imports.get_module("numpy")
+
+
+CHART_TYPES = ['bar', 'box', 'violin']
+
+
+def create_upset(data_frame, x=None, color=None, title=None, sort_by='Counts', asc=False, mode='Counts',
+ max_subsets=50, subset_bgcolor='#C9C9C9', subset_fgcolor='#000000', category_orders=None,
+ color_discrete_sequence=None, color_discrete_map=None, log_y=False, barmode='group', textangle=0):
+ plot_obj = _Upset(**locals())
+ upset_plot = plot_obj.make_upset_plot()
+ return upset_plot, plot_obj
+
+
+def _expand_subset_column(df):
+ # TODO: Fill in this method for alternate data representation
+ # TODO: Add input for subset_column
+ pass
+
+
+def _make_binary(t):
+ """
+ Converts tuple of 0,1s to binary number. Used in _transform_upset_data for sort order.
+ """
+ return sum([t[i] * 2**i for i in range(len(t))])
+
+
+def _transform_upset_data(df):
+ """
+ Takes raw data of binary vectors for set inclusion and produces counts over each.
+ """
+ intersect_counts = pd.DataFrame({'Intersections': list(df.value_counts().to_dict().keys()),
+ 'Counts': list(df.value_counts().to_dict().values())})
+ return intersect_counts
+
+
+def _sort_intersect_counts(df, sort_by='Counts', asc=True):
+ """
+ Takes output from _transform_upset_data and sorts by method requested.
+ """
+ key = None if (sort_by == 'Counts') else lambda x: x.apply(lambda y: (sum(y), _make_binary(y)))
+ df = df.sort_values(by=sort_by, key=key, ascending=asc)
+ return df
+
+
+class _Upset:
+ """
+ Represents builder object for UpSet plot. Refer to figure_factory.create_upset() for full docstring.
+ """
+
+ def __init__(self, data_frame, x=None, color=None, title=None, sort_by='Counts', asc=False, mode='Counts',
+ max_subsets=50, subset_bgcolor='#C9C9C9', subset_fgcolor='#000000', category_orders=None,
+ color_discrete_sequence=None, color_discrete_map=None, log_y=False, barmode='group', textangle=0):
+
+ # Plot inputs and settings
+ self.df = data_frame
+ self.x = x
+ self.color = color
+ self.title = title
+ self.sort_by = sort_by
+ self.asc = asc
+ self.mode = mode
+ self.max_subsets = max_subsets,
+ self.subset_bgcolor = subset_bgcolor
+ self.subset_fgcolor = subset_fgcolor
+ self.category_orders = category_orders
+ self.color_discrete_sequence = color_discrete_sequence
+ self.color_discrete_map = color_discrete_map
+ self.log_y = log_y
+ self.barmode = barmode
+ self.textangle = textangle
+
+ # TODO: Refactor code for "common plot args" that can be reused for eventual box/violin plots
+
+ # Figure-building specific attributes
+ self.fig = go.Figure()
+ self.intersect_counts = pd.DataFrame()
+ self.subset_col_names = [c for c in data_frame.columns if c != x and c != color]
+ self.switchboard_heights = []
+
+ # Validate inputs
+ self.validate_upset_inputs()
+
+ # DEBUG
+ self.test = None
+
+ def make_upset_plot(self):
+ # Create intersect_counts df depending on if color provided
+ color = self.color
+ df = self.df
+ if color is not None:
+ # TODO: Consider refactor using groupby instead of looping over colors
+ for c in df[color].unique():
+ sub_df = df[df[color] == c].drop(columns=[color])
+ if self.x is not None:
+ # TODO: Check counting code for clustering by x value for distribution plots
+ new_df = sub_df.groupby(self.x).apply(lambda x: _transform_upset_data(x.drop(columns=['self.x'])))
+ new_df = new_df.reset_index()[[self.x, 'Intersections', 'Counts']]
+ else:
+ new_df = _transform_upset_data(sub_df)
+ # Sort subgroup in requested order
+ new_df = _sort_intersect_counts(new_df, sort_by=self.sort_by, asc=self.asc).reset_index(drop=True).reset_index()
+ new_df[color] = c
+ self.intersect_counts = pd.concat([self.intersect_counts, new_df])
+ # TODO: Need to saturate each cluster with 0 value for subsets in one but not other...
+ else:
+ self.intersect_counts = _transform_upset_data(df)
+ self.intersect_counts = _sort_intersect_counts(self.intersect_counts, sort_by=self.sort_by, asc=self.asc)
+ self.intersect_counts = self.intersect_counts.reset_index(drop=True).reset_index()
+
+ # Rescale for percents if requested
+ mode = self.mode
+ if mode == 'Percent':
+ if color is not None:
+ denom = self.intersect_counts.groupby(color).sum().reset_index()
+ denom_dict = dict(zip(denom[color], denom['Counts']))
+ self.intersect_counts['Counts'] = round(self.intersect_counts['Counts'] / self.intersect_counts[color].map(denom_dict), 2)
+ else:
+ self.intersect_counts['Counts'] = round(self.intersect_counts['Counts'] / self.intersect_counts['Counts'].sum(), 2)
+
+ # Create 3 main components for figure
+ self.make_primary_plot()
+ self.make_switchboard()
+ self.make_margin_plot()
+
+ # Add title
+ self.fig.update_layout(title=self.title, title_x=0.5)
+
+ return self.fig
+
+ def validate_upset_inputs(self):
+ # Check sorting inputs are valid
+ sort_by = self.sort_by
+ try:
+ assert (sort_by == 'Counts') or (sort_by == 'Intersections')
+ except AssertionError:
+ raise ValueError(f'Invalid input for "sort_by". Must be either "Counts" or "Intersections" but you provided {sort_by}')
+
+ # Check mode is either Counts or Percent
+ mode = self.mode
+ try:
+ assert (mode == 'Counts') or (mode == 'Percent')
+ except AssertionError:
+ raise ValueError(f'Invalid input for "mode". Must be either "Counts" or "Percent" but you provided {mode}')
+
+ def make_primary_plot(self):
+ bar_args = {
+ 'color': self.color,
+ 'category_orders': self.category_orders,
+ 'color_discrete_sequence': self.color_discrete_sequence,
+ 'color_discrete_map': self.color_discrete_map,
+ 'barmode': self.barmode,
+ 'log_y': self.log_y
+ }
+
+ # TODO: Override default hover info for something more useful
+ self.fig = px.bar(self.intersect_counts, x='index', y='Counts', text='Counts', **bar_args)
+ self.fig.update_traces(textposition='outside', cliponaxis=False, textangle=self.textangle)
+ self.fig.update_layout(plot_bgcolor='#FFFFFF', xaxis_visible=False, xaxis_showticklabels=False,
+ yaxis_visible=False, yaxis_showticklabels=False)
+
+ def make_switchboard(self):
+ """
+ Method to add subset points to input fig px.bar chart in the style of UpSet plot.
+ Returns updated figure, and list of heights of dots for downstream convenience.
+ """
+ # Compute coordinates for bg subset scatter points
+ d = len(self.subset_col_names)
+ num_bars = len(self.fig.data[0]['x'])
+ x_bg_scatter = np.repeat(self.fig.data[0]['x'], d)
+ y_scatter_offset = 0.2 # Offsetting ensures bars will hover just above the subset scatterplot
+ y_max = (1 + y_scatter_offset) * max([max(bar['y']) for bar in self.fig.data])
+ self.switchboard_heights = [-y_max / d * i - y_scatter_offset * y_max for i in list(range(d))]
+ y_bg_scatter = num_bars * self.switchboard_heights
+
+ # Add bg subset scatter points to figure below bar chart
+ self.fig.add_trace(go.Scatter(x=x_bg_scatter, y=y_bg_scatter, mode='markers', showlegend=False,
+ marker=dict(size=16, color=self.subset_bgcolor)))
+ self.fig.update_layout(xaxis=dict(showgrid=False, zeroline=False), yaxis=dict(showgrid=True, zeroline=False),
+ margin=dict(t=40, l=150))
+
+ # Compute list of intersections
+ intersections = None
+ if self.color is not None:
+ # Pull out full list of possible intersection combinations from first color grouping
+ query = self.intersect_counts[self.color] == self.intersect_counts[self.color].iloc[0]
+ intersections = list(self.intersect_counts[query]['Intersections'])
+ else:
+ intersections = list(self.intersect_counts['Intersections'])
+
+ # Then fill in subset markers with fg color
+ x = 0
+ for s in intersections:
+ x_subsets = []
+ y_subsets = []
+ y = 0
+ for e in s:
+ if e:
+ x_subsets += [x]
+ y_subsets += [-y_max / d * y - y_scatter_offset * y_max]
+ y += 1
+ x += 1
+ # TODO: Add hover information for subset/intersection description
+ self.fig.add_trace(go.Scatter(x=x_subsets, y=y_subsets, mode='markers+lines', showlegend=False,
+ marker=dict(size=16, color=self.subset_fgcolor, showscale=False)))
+
+ def make_margin_plot(self):
+ """
+ Method to add left margin count px.bar chart in style of UpSet plot.
+ """
+ # Group and count according to color input
+ color = self.color
+ counts_df = self.df.groupby(color).sum().reset_index() if color is not None else self.df.sum().reset_index().rename(
+ columns={'index': 'variable', 0: 'value'})
+
+ bar_args = {
+ 'color': self.color,
+ 'category_orders': self.category_orders,
+ 'color_discrete_sequence': self.color_discrete_sequence,
+ 'color_discrete_map': self.color_discrete_map,
+ 'barmode': self.barmode,
+ 'log_y': self.log_y
+ }
+
+ # Create counts px.bar chart
+ plot_df = counts_df.melt(id_vars=color) if color is not None else counts_df
+ if self.mode == 'Percent':
+ if color is not None:
+ denom = plot_df.groupby(color).sum().reset_index()
+ denom_dict = dict(zip(denom[color], denom['value']))
+ plot_df['value'] = round(plot_df['value'] / plot_df[color].map(denom_dict), 2)
+ else:
+ plot_df['value'] = round(plot_df['value'] / plot_df['value'].sum(), 2)
+ counts_bar = px.bar(plot_df, x='value', y='variable', orientation='h', text='value', **bar_args)
+ counts_bar.update_traces(textposition='outside', cliponaxis=False)
+ # TODO: Change hover info to be more useful
+
+ # Add subset names as text into plot
+ subset_names = self.subset_col_names
+ # subset_names = counts_bar.data[0]['y']
+ max_name_len = max([len(s) for s in subset_names])
+ annotation_center = -1 + -0.01 * max_name_len
+ for i, s in enumerate(subset_names):
+ self.fig.add_annotation(x=annotation_center, y=self.switchboard_heights[i], text=s, showarrow=False,
+ font=dict(size=12, color='#000000'), align='left')
+
+ # Reflect horizontally the bars while preserving labels; Shift heights to match input subset scatter heights
+ for trace in counts_bar.data:
+ trace['x'] = -trace['x'] / max(trace['x'])
+ trace['y'] = self.switchboard_heights
+ counts_bar.update_traces(base=annotation_center - 1, showlegend=False)
+
+ # Add counts chart traces to main fig
+ for trace in counts_bar.data:
+ self.fig.add_trace(trace)
From e06813ba235b999a7fc5523423d08b0200d8d26e Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Wed, 10 May 2023 14:47:23 -0400
Subject: [PATCH 02/13] Added functionality to allow user to specify column of
lists/tuples for subset inclusion.
---
.../plotly/plotly/figure_factory/_upset.py | 294 +++++++++++++-----
1 file changed, 218 insertions(+), 76 deletions(-)
diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py
index 14966369c3a..2506320d3ac 100644
--- a/packages/python/plotly/plotly/figure_factory/_upset.py
+++ b/packages/python/plotly/plotly/figure_factory/_upset.py
@@ -8,21 +8,49 @@
np = optional_imports.get_module("numpy")
-CHART_TYPES = ['bar', 'box', 'violin']
-
-
-def create_upset(data_frame, x=None, color=None, title=None, sort_by='Counts', asc=False, mode='Counts',
- max_subsets=50, subset_bgcolor='#C9C9C9', subset_fgcolor='#000000', category_orders=None,
- color_discrete_sequence=None, color_discrete_map=None, log_y=False, barmode='group', textangle=0):
+CHART_TYPES = ["bar", "box", "violin"]
+
+
+def create_upset(
+ data_frame,
+ x=None,
+ color=None,
+ title=None,
+ sort_by="Counts",
+ asc=False,
+ mode="Counts",
+ max_subsets=50,
+ subset_column=None,
+ subset_order=None,
+ subset_bgcolor="#C9C9C9",
+ subset_fgcolor="#000000",
+ category_orders=None,
+ color_discrete_sequence=None,
+ color_discrete_map=None,
+ log_y=False,
+ barmode="group",
+ textangle=0,
+):
plot_obj = _Upset(**locals())
upset_plot = plot_obj.make_upset_plot()
+ # TODO: Create tests for plotter
return upset_plot, plot_obj
-def _expand_subset_column(df):
- # TODO: Fill in this method for alternate data representation
- # TODO: Add input for subset_column
- pass
+def _expand_subset_column(df, subset_column, subset_order):
+ """
+ Takes a column of iterables and expands into binary columns representing inclusion. Also returns subset_names.
+ """
+ subset_names = (
+ subset_order
+ if subset_order is not None
+ else list(df[subset_column].explode().unique())
+ )
+ new_df = df.copy()
+ for name in subset_names:
+ new_df[name] = new_df[subset_column].apply(lambda x: int(name in x))
+ new_df = new_df[subset_names]
+ return new_df, subset_names
def _make_binary(t):
@@ -36,16 +64,24 @@ def _transform_upset_data(df):
"""
Takes raw data of binary vectors for set inclusion and produces counts over each.
"""
- intersect_counts = pd.DataFrame({'Intersections': list(df.value_counts().to_dict().keys()),
- 'Counts': list(df.value_counts().to_dict().values())})
+ intersect_counts = pd.DataFrame(
+ {
+ "Intersections": list(df.value_counts().to_dict().keys()),
+ "Counts": list(df.value_counts().to_dict().values()),
+ }
+ )
return intersect_counts
-def _sort_intersect_counts(df, sort_by='Counts', asc=True):
+def _sort_intersect_counts(df, sort_by="Counts", asc=True):
"""
Takes output from _transform_upset_data and sorts by method requested.
"""
- key = None if (sort_by == 'Counts') else lambda x: x.apply(lambda y: (sum(y), _make_binary(y)))
+ key = (
+ None
+ if (sort_by == "Counts")
+ else lambda x: x.apply(lambda y: (sum(y), _make_binary(y)))
+ )
df = df.sort_values(by=sort_by, key=key, ascending=asc)
return df
@@ -55,9 +91,27 @@ class _Upset:
Represents builder object for UpSet plot. Refer to figure_factory.create_upset() for full docstring.
"""
- def __init__(self, data_frame, x=None, color=None, title=None, sort_by='Counts', asc=False, mode='Counts',
- max_subsets=50, subset_bgcolor='#C9C9C9', subset_fgcolor='#000000', category_orders=None,
- color_discrete_sequence=None, color_discrete_map=None, log_y=False, barmode='group', textangle=0):
+ def __init__(
+ self,
+ data_frame,
+ x=None,
+ color=None,
+ title=None,
+ sort_by="Counts",
+ asc=False,
+ mode="Counts",
+ max_subsets=50,
+ subset_column=None,
+ subset_order=None,
+ subset_bgcolor="#C9C9C9",
+ subset_fgcolor="#000000",
+ category_orders=None,
+ color_discrete_sequence=None,
+ color_discrete_map=None,
+ log_y=False,
+ barmode="group",
+ textangle=0,
+ ):
# Plot inputs and settings
self.df = data_frame
@@ -67,7 +121,9 @@ def __init__(self, data_frame, x=None, color=None, title=None, sort_by='Counts',
self.sort_by = sort_by
self.asc = asc
self.mode = mode
- self.max_subsets = max_subsets,
+ self.max_subsets = max_subsets
+ self.subset_column = subset_column
+ self.subset_order = subset_order
self.subset_bgcolor = subset_bgcolor
self.subset_fgcolor = subset_fgcolor
self.category_orders = category_orders
@@ -82,7 +138,7 @@ def __init__(self, data_frame, x=None, color=None, title=None, sort_by='Counts',
# Figure-building specific attributes
self.fig = go.Figure()
self.intersect_counts = pd.DataFrame()
- self.subset_col_names = [c for c in data_frame.columns if c != x and c != color]
+ self.subset_names = None
self.switchboard_heights = []
# Validate inputs
@@ -92,6 +148,22 @@ def __init__(self, data_frame, x=None, color=None, title=None, sort_by='Counts',
self.test = None
def make_upset_plot(self):
+ # If subset_column provided, expand into standard wider format
+ if self.subset_column is not None:
+ color_column = self.df[self.color] if self.color is not None else None
+ x_column = self.df[self.x] if self.x is not None else None
+ self.df, self.subset_names = _expand_subset_column(
+ self.df, self.subset_column, self.subset_order
+ )
+ if self.color is not None:
+ self.df = pd.concat([self.df, color_column], axis=1)
+ if self.x is not None:
+ self.df = pd.concat([self.df, x_column], axis=1)
+ else:
+ self.subset_names = [
+ c for c in self.df.columns if c != self.x and c != self.color
+ ]
+
# Create intersect_counts df depending on if color provided
color = self.color
df = self.df
@@ -101,29 +173,47 @@ def make_upset_plot(self):
sub_df = df[df[color] == c].drop(columns=[color])
if self.x is not None:
# TODO: Check counting code for clustering by x value for distribution plots
- new_df = sub_df.groupby(self.x).apply(lambda x: _transform_upset_data(x.drop(columns=['self.x'])))
- new_df = new_df.reset_index()[[self.x, 'Intersections', 'Counts']]
+ new_df = sub_df.groupby(self.x).apply(
+ lambda x: _transform_upset_data(x.drop(columns=["self.x"]))
+ )
+ new_df = new_df.reset_index()[[self.x, "Intersections", "Counts"]]
else:
new_df = _transform_upset_data(sub_df)
# Sort subgroup in requested order
- new_df = _sort_intersect_counts(new_df, sort_by=self.sort_by, asc=self.asc).reset_index(drop=True).reset_index()
+ new_df = (
+ _sort_intersect_counts(new_df, sort_by=self.sort_by, asc=self.asc)
+ .reset_index(drop=True)
+ .reset_index()
+ )
new_df[color] = c
self.intersect_counts = pd.concat([self.intersect_counts, new_df])
# TODO: Need to saturate each cluster with 0 value for subsets in one but not other...
else:
self.intersect_counts = _transform_upset_data(df)
- self.intersect_counts = _sort_intersect_counts(self.intersect_counts, sort_by=self.sort_by, asc=self.asc)
- self.intersect_counts = self.intersect_counts.reset_index(drop=True).reset_index()
+ self.intersect_counts = _sort_intersect_counts(
+ self.intersect_counts, sort_by=self.sort_by, asc=self.asc
+ )
+ self.intersect_counts = self.intersect_counts.reset_index(
+ drop=True
+ ).reset_index()
# Rescale for percents if requested
mode = self.mode
- if mode == 'Percent':
+ if mode == "Percent":
if color is not None:
denom = self.intersect_counts.groupby(color).sum().reset_index()
- denom_dict = dict(zip(denom[color], denom['Counts']))
- self.intersect_counts['Counts'] = round(self.intersect_counts['Counts'] / self.intersect_counts[color].map(denom_dict), 2)
+ denom_dict = dict(zip(denom[color], denom["Counts"]))
+ self.intersect_counts["Counts"] = round(
+ self.intersect_counts["Counts"]
+ / self.intersect_counts[color].map(denom_dict),
+ 2,
+ )
else:
- self.intersect_counts['Counts'] = round(self.intersect_counts['Counts'] / self.intersect_counts['Counts'].sum(), 2)
+ self.intersect_counts["Counts"] = round(
+ self.intersect_counts["Counts"]
+ / self.intersect_counts["Counts"].sum(),
+ 2,
+ )
# Create 3 main components for figure
self.make_primary_plot()
@@ -139,32 +229,45 @@ def validate_upset_inputs(self):
# Check sorting inputs are valid
sort_by = self.sort_by
try:
- assert (sort_by == 'Counts') or (sort_by == 'Intersections')
+ assert (sort_by == "Counts") or (sort_by == "Intersections")
except AssertionError:
- raise ValueError(f'Invalid input for "sort_by". Must be either "Counts" or "Intersections" but you provided {sort_by}')
+ raise ValueError(
+ f'Invalid input for "sort_by". Must be either "Counts" or "Intersections" but you provided {sort_by}'
+ )
# Check mode is either Counts or Percent
mode = self.mode
try:
- assert (mode == 'Counts') or (mode == 'Percent')
+ assert (mode == "Counts") or (mode == "Percent")
except AssertionError:
- raise ValueError(f'Invalid input for "mode". Must be either "Counts" or "Percent" but you provided {mode}')
+ raise ValueError(
+ f'Invalid input for "mode". Must be either "Counts" or "Percent" but you provided {mode}'
+ )
def make_primary_plot(self):
bar_args = {
- 'color': self.color,
- 'category_orders': self.category_orders,
- 'color_discrete_sequence': self.color_discrete_sequence,
- 'color_discrete_map': self.color_discrete_map,
- 'barmode': self.barmode,
- 'log_y': self.log_y
+ "color": self.color,
+ "category_orders": self.category_orders,
+ "color_discrete_sequence": self.color_discrete_sequence,
+ "color_discrete_map": self.color_discrete_map,
+ "barmode": self.barmode,
+ "log_y": self.log_y,
}
# TODO: Override default hover info for something more useful
- self.fig = px.bar(self.intersect_counts, x='index', y='Counts', text='Counts', **bar_args)
- self.fig.update_traces(textposition='outside', cliponaxis=False, textangle=self.textangle)
- self.fig.update_layout(plot_bgcolor='#FFFFFF', xaxis_visible=False, xaxis_showticklabels=False,
- yaxis_visible=False, yaxis_showticklabels=False)
+ self.fig = px.bar(
+ self.intersect_counts, x="index", y="Counts", text="Counts", **bar_args
+ )
+ self.fig.update_traces(
+ textposition="outside", cliponaxis=False, textangle=self.textangle
+ )
+ self.fig.update_layout(
+ plot_bgcolor="#FFFFFF",
+ xaxis_visible=False,
+ xaxis_showticklabels=False,
+ yaxis_visible=False,
+ yaxis_showticklabels=False,
+ )
def make_switchboard(self):
"""
@@ -172,28 +275,45 @@ def make_switchboard(self):
Returns updated figure, and list of heights of dots for downstream convenience.
"""
# Compute coordinates for bg subset scatter points
- d = len(self.subset_col_names)
- num_bars = len(self.fig.data[0]['x'])
- x_bg_scatter = np.repeat(self.fig.data[0]['x'], d)
- y_scatter_offset = 0.2 # Offsetting ensures bars will hover just above the subset scatterplot
- y_max = (1 + y_scatter_offset) * max([max(bar['y']) for bar in self.fig.data])
- self.switchboard_heights = [-y_max / d * i - y_scatter_offset * y_max for i in list(range(d))]
+ d = len(self.subset_names)
+ num_bars = len(self.fig.data[0]["x"])
+ x_bg_scatter = np.repeat(self.fig.data[0]["x"], d)
+ y_scatter_offset = (
+ 0.2 # Offsetting ensures bars will hover just above the subset scatterplot
+ )
+ y_max = (1 + y_scatter_offset) * max([max(bar["y"]) for bar in self.fig.data])
+ self.switchboard_heights = [
+ -y_max / d * i - y_scatter_offset * y_max for i in list(range(d))
+ ]
y_bg_scatter = num_bars * self.switchboard_heights
# Add bg subset scatter points to figure below bar chart
- self.fig.add_trace(go.Scatter(x=x_bg_scatter, y=y_bg_scatter, mode='markers', showlegend=False,
- marker=dict(size=16, color=self.subset_bgcolor)))
- self.fig.update_layout(xaxis=dict(showgrid=False, zeroline=False), yaxis=dict(showgrid=True, zeroline=False),
- margin=dict(t=40, l=150))
+ self.fig.add_trace(
+ go.Scatter(
+ x=x_bg_scatter,
+ y=y_bg_scatter,
+ mode="markers",
+ showlegend=False,
+ marker=dict(size=16, color=self.subset_bgcolor),
+ )
+ )
+ self.fig.update_layout(
+ xaxis=dict(showgrid=False, zeroline=False),
+ yaxis=dict(showgrid=True, zeroline=False),
+ margin=dict(t=40, l=150),
+ )
# Compute list of intersections
intersections = None
if self.color is not None:
# Pull out full list of possible intersection combinations from first color grouping
- query = self.intersect_counts[self.color] == self.intersect_counts[self.color].iloc[0]
- intersections = list(self.intersect_counts[query]['Intersections'])
+ query = (
+ self.intersect_counts[self.color]
+ == self.intersect_counts[self.color].iloc[0]
+ )
+ intersections = list(self.intersect_counts[query]["Intersections"])
else:
- intersections = list(self.intersect_counts['Intersections'])
+ intersections = list(self.intersect_counts["Intersections"])
# Then fill in subset markers with fg color
x = 0
@@ -208,8 +328,15 @@ def make_switchboard(self):
y += 1
x += 1
# TODO: Add hover information for subset/intersection description
- self.fig.add_trace(go.Scatter(x=x_subsets, y=y_subsets, mode='markers+lines', showlegend=False,
- marker=dict(size=16, color=self.subset_fgcolor, showscale=False)))
+ self.fig.add_trace(
+ go.Scatter(
+ x=x_subsets,
+ y=y_subsets,
+ mode="markers+lines",
+ showlegend=False,
+ marker=dict(size=16, color=self.subset_fgcolor, showscale=False),
+ )
+ )
def make_margin_plot(self):
"""
@@ -217,44 +344,59 @@ def make_margin_plot(self):
"""
# Group and count according to color input
color = self.color
- counts_df = self.df.groupby(color).sum().reset_index() if color is not None else self.df.sum().reset_index().rename(
- columns={'index': 'variable', 0: 'value'})
+ counts_df = (
+ self.df.groupby(color).sum().reset_index()
+ if color is not None
+ else self.df.sum()
+ .reset_index()
+ .rename(columns={"index": "variable", 0: "value"})
+ )
bar_args = {
- 'color': self.color,
- 'category_orders': self.category_orders,
- 'color_discrete_sequence': self.color_discrete_sequence,
- 'color_discrete_map': self.color_discrete_map,
- 'barmode': self.barmode,
- 'log_y': self.log_y
+ "color": self.color,
+ "category_orders": self.category_orders,
+ "color_discrete_sequence": self.color_discrete_sequence,
+ "color_discrete_map": self.color_discrete_map,
+ "barmode": self.barmode,
+ "log_y": self.log_y,
}
# Create counts px.bar chart
plot_df = counts_df.melt(id_vars=color) if color is not None else counts_df
- if self.mode == 'Percent':
+ if self.mode == "Percent":
if color is not None:
denom = plot_df.groupby(color).sum().reset_index()
- denom_dict = dict(zip(denom[color], denom['value']))
- plot_df['value'] = round(plot_df['value'] / plot_df[color].map(denom_dict), 2)
+ denom_dict = dict(zip(denom[color], denom["value"]))
+ plot_df["value"] = round(
+ plot_df["value"] / plot_df[color].map(denom_dict), 2
+ )
else:
- plot_df['value'] = round(plot_df['value'] / plot_df['value'].sum(), 2)
- counts_bar = px.bar(plot_df, x='value', y='variable', orientation='h', text='value', **bar_args)
- counts_bar.update_traces(textposition='outside', cliponaxis=False)
+ plot_df["value"] = round(plot_df["value"] / plot_df["value"].sum(), 2)
+ counts_bar = px.bar(
+ plot_df, x="value", y="variable", orientation="h", text="value", **bar_args
+ )
+ counts_bar.update_traces(textposition="outside", cliponaxis=False)
# TODO: Change hover info to be more useful
# Add subset names as text into plot
- subset_names = self.subset_col_names
+ subset_names = self.subset_names
# subset_names = counts_bar.data[0]['y']
max_name_len = max([len(s) for s in subset_names])
annotation_center = -1 + -0.01 * max_name_len
for i, s in enumerate(subset_names):
- self.fig.add_annotation(x=annotation_center, y=self.switchboard_heights[i], text=s, showarrow=False,
- font=dict(size=12, color='#000000'), align='left')
+ self.fig.add_annotation(
+ x=annotation_center,
+ y=self.switchboard_heights[i],
+ text=s,
+ showarrow=False,
+ font=dict(size=12, color="#000000"),
+ align="left",
+ )
# Reflect horizontally the bars while preserving labels; Shift heights to match input subset scatter heights
for trace in counts_bar.data:
- trace['x'] = -trace['x'] / max(trace['x'])
- trace['y'] = self.switchboard_heights
+ trace["x"] = -trace["x"] / max(trace["x"])
+ trace["y"] = self.switchboard_heights
counts_bar.update_traces(base=annotation_center - 1, showlegend=False)
# Add counts chart traces to main fig
From cef0fb84417ef39d62bb0b047d02f52875a9f24e Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Wed, 10 May 2023 16:34:56 -0400
Subject: [PATCH 03/13] Padded intersection counts with zeros when color groups
had some missing subsets.
---
.../plotly/plotly/figure_factory/_upset.py | 69 +++++++++++--------
1 file changed, 41 insertions(+), 28 deletions(-)
diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py
index 2506320d3ac..cf8bc78a3e7 100644
--- a/packages/python/plotly/plotly/figure_factory/_upset.py
+++ b/packages/python/plotly/plotly/figure_factory/_upset.py
@@ -7,7 +7,6 @@
pd = optional_imports.get_module("pandas")
np = optional_imports.get_module("numpy")
-
CHART_TYPES = ["bar", "box", "violin"]
@@ -37,7 +36,7 @@ def create_upset(
return upset_plot, plot_obj
-def _expand_subset_column(df, subset_column, subset_order):
+def _expand_subset_column(df, subset_column, subset_order=None):
"""
Takes a column of iterables and expands into binary columns representing inclusion. Also returns subset_names.
"""
@@ -166,30 +165,44 @@ def make_upset_plot(self):
# Create intersect_counts df depending on if color provided
color = self.color
- df = self.df
- if color is not None:
- # TODO: Consider refactor using groupby instead of looping over colors
- for c in df[color].unique():
- sub_df = df[df[color] == c].drop(columns=[color])
- if self.x is not None:
- # TODO: Check counting code for clustering by x value for distribution plots
- new_df = sub_df.groupby(self.x).apply(
- lambda x: _transform_upset_data(x.drop(columns=["self.x"]))
- )
- new_df = new_df.reset_index()[[self.x, "Intersections", "Counts"]]
- else:
- new_df = _transform_upset_data(sub_df)
- # Sort subgroup in requested order
- new_df = (
- _sort_intersect_counts(new_df, sort_by=self.sort_by, asc=self.asc)
- .reset_index(drop=True)
- .reset_index()
+ # TODO: Add grouping by x value input
+ if self.color is not None:
+ intersect_df = self.df.groupby(self.color).apply(
+ lambda df: _transform_upset_data(
+ df.drop(columns=[self.color])
+ ).reset_index(drop=True)
+ )
+
+ # Fill in counts for subsets where count is zero for certain color groups
+ filled_df = (
+ intersect_df.pivot_table(
+ index="Intersections",
+ columns=[self.color],
+ values="Counts",
+ fill_value=0,
+ )
+ .unstack()
+ .reset_index()
+ .rename(columns={0: "Counts"})
+ )
+
+ # Perform sorting within each color group
+ # WARNING: If sort_by="Counts" it will be ignored here since this won't make sense when using groups
+ self.intersect_counts = (
+ filled_df.groupby(self.color)
+ .apply(
+ lambda df: _sort_intersect_counts(
+ df.drop(columns=[self.color]),
+ sort_by="Intersections",
+ asc=self.asc,
+ ).reset_index()
)
- new_df[color] = c
- self.intersect_counts = pd.concat([self.intersect_counts, new_df])
- # TODO: Need to saturate each cluster with 0 value for subsets in one but not other...
+ .reset_index()
+ .drop(columns=["index"])
+ .rename(columns={"level_1": "index"})
+ )
else:
- self.intersect_counts = _transform_upset_data(df)
+ self.intersect_counts = _transform_upset_data(self.df)
self.intersect_counts = _sort_intersect_counts(
self.intersect_counts, sort_by=self.sort_by, asc=self.asc
)
@@ -200,12 +213,12 @@ def make_upset_plot(self):
# Rescale for percents if requested
mode = self.mode
if mode == "Percent":
- if color is not None:
- denom = self.intersect_counts.groupby(color).sum().reset_index()
- denom_dict = dict(zip(denom[color], denom["Counts"]))
+ if self.color is not None:
+ denom = self.intersect_counts.groupby(self.color).sum().reset_index()
+ denom_dict = dict(zip(denom[self.color], denom["Counts"]))
self.intersect_counts["Counts"] = round(
self.intersect_counts["Counts"]
- / self.intersect_counts[color].map(denom_dict),
+ / self.intersect_counts[self.color].map(denom_dict),
2,
)
else:
From df2531870506697d36e297dbbf28585b23d0d01b Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Wed, 10 May 2023 17:16:26 -0400
Subject: [PATCH 04/13] Added more useful hover data for switchboard.
---
.../plotly/plotly/figure_factory/_upset.py | 40 ++++++++++++-------
1 file changed, 26 insertions(+), 14 deletions(-)
diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py
index cf8bc78a3e7..a41bd5babb8 100644
--- a/packages/python/plotly/plotly/figure_factory/_upset.py
+++ b/packages/python/plotly/plotly/figure_factory/_upset.py
@@ -120,6 +120,7 @@ def __init__(
self.sort_by = sort_by
self.asc = asc
self.mode = mode
+ # TODO: Implement max_subsets in code
self.max_subsets = max_subsets
self.subset_column = subset_column
self.subset_order = subset_order
@@ -267,7 +268,6 @@ def make_primary_plot(self):
"log_y": self.log_y,
}
- # TODO: Override default hover info for something more useful
self.fig = px.bar(
self.intersect_counts, x="index", y="Counts", text="Counts", **bar_args
)
@@ -287,6 +287,18 @@ def make_switchboard(self):
Method to add subset points to input fig px.bar chart in the style of UpSet plot.
Returns updated figure, and list of heights of dots for downstream convenience.
"""
+ # Compute list of intersections
+ intersections = None
+ if self.color is not None:
+ # Pull out full list of possible intersection combinations from first color grouping
+ query = (
+ self.intersect_counts[self.color]
+ == self.intersect_counts[self.color].iloc[0]
+ )
+ intersections = list(self.intersect_counts[query]["Intersections"])
+ else:
+ intersections = list(self.intersect_counts["Intersections"])
+
# Compute coordinates for bg subset scatter points
d = len(self.subset_names)
num_bars = len(self.fig.data[0]["x"])
@@ -301,6 +313,14 @@ def make_switchboard(self):
y_bg_scatter = num_bars * self.switchboard_heights
# Add bg subset scatter points to figure below bar chart
+ labels = np.repeat(
+ [
+ "+".join([x for x, y in zip(self.subset_names, s) if y != 0])
+ for s in intersections
+ ],
+ d,
+ )
+ labels = ["None" if x == "" else x for x in labels]
self.fig.add_trace(
go.Scatter(
x=x_bg_scatter,
@@ -308,6 +328,8 @@ def make_switchboard(self):
mode="markers",
showlegend=False,
marker=dict(size=16, color=self.subset_bgcolor),
+ text=labels,
+ hovertemplate="%{text}",
)
)
self.fig.update_layout(
@@ -316,18 +338,6 @@ def make_switchboard(self):
margin=dict(t=40, l=150),
)
- # Compute list of intersections
- intersections = None
- if self.color is not None:
- # Pull out full list of possible intersection combinations from first color grouping
- query = (
- self.intersect_counts[self.color]
- == self.intersect_counts[self.color].iloc[0]
- )
- intersections = list(self.intersect_counts[query]["Intersections"])
- else:
- intersections = list(self.intersect_counts["Intersections"])
-
# Then fill in subset markers with fg color
x = 0
for s in intersections:
@@ -340,7 +350,6 @@ def make_switchboard(self):
y_subsets += [-y_max / d * y - y_scatter_offset * y_max]
y += 1
x += 1
- # TODO: Add hover information for subset/intersection description
self.fig.add_trace(
go.Scatter(
x=x_subsets,
@@ -348,6 +357,9 @@ def make_switchboard(self):
mode="markers+lines",
showlegend=False,
marker=dict(size=16, color=self.subset_fgcolor, showscale=False),
+ text=["+".join([x for x, y in zip(self.subset_names, s) if y != 0])]
+ * sum(s),
+ hovertemplate="%{text}",
)
)
From a6f04a722c360d092f743b1cd46a90c56613db22 Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Thu, 11 May 2023 12:07:16 -0400
Subject: [PATCH 05/13] Refactored plot args, updated hovers, etc
---
.../plotly/plotly/figure_factory/_upset.py | 103 ++++++++++++------
1 file changed, 71 insertions(+), 32 deletions(-)
diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py
index a41bd5babb8..d22b3eb4a0b 100644
--- a/packages/python/plotly/plotly/figure_factory/_upset.py
+++ b/packages/python/plotly/plotly/figure_factory/_upset.py
@@ -7,7 +7,7 @@
pd = optional_imports.get_module("pandas")
np = optional_imports.get_module("numpy")
-CHART_TYPES = ["bar", "box", "violin"]
+VALID_PLOT_TYPES = ["bar", "box", "violin"]
def create_upset(
@@ -18,7 +18,7 @@ def create_upset(
sort_by="Counts",
asc=False,
mode="Counts",
- max_subsets=50,
+ max_subsets=20,
subset_column=None,
subset_order=None,
subset_bgcolor="#C9C9C9",
@@ -30,6 +30,7 @@ def create_upset(
barmode="group",
textangle=0,
):
+ # TODO: Add docstring and webpage documentation
plot_obj = _Upset(**locals())
upset_plot = plot_obj.make_upset_plot()
# TODO: Create tests for plotter
@@ -52,13 +53,6 @@ def _expand_subset_column(df, subset_column, subset_order=None):
return new_df, subset_names
-def _make_binary(t):
- """
- Converts tuple of 0,1s to binary number. Used in _transform_upset_data for sort order.
- """
- return sum([t[i] * 2**i for i in range(len(t))])
-
-
def _transform_upset_data(df):
"""
Takes raw data of binary vectors for set inclusion and produces counts over each.
@@ -72,6 +66,13 @@ def _transform_upset_data(df):
return intersect_counts
+def _make_binary(t):
+ """
+ Converts tuple of 0,1s to binary number. Used in _transform_upset_data for sort order.
+ """
+ return sum([t[i] * 2**i for i in range(len(t))])
+
+
def _sort_intersect_counts(df, sort_by="Counts", asc=True):
"""
Takes output from _transform_upset_data and sorts by method requested.
@@ -96,10 +97,11 @@ def __init__(
x=None,
color=None,
title=None,
+ plot_type="bar",
sort_by="Counts",
asc=False,
mode="Counts",
- max_subsets=50,
+ max_subsets=20,
subset_column=None,
subset_order=None,
subset_bgcolor="#C9C9C9",
@@ -110,6 +112,11 @@ def __init__(
log_y=False,
barmode="group",
textangle=0,
+ boxmode="group",
+ points="outliers",
+ notched=False,
+ violinmode="group",
+ box=False,
):
# Plot inputs and settings
@@ -117,6 +124,7 @@ def __init__(
self.x = x
self.color = color
self.title = title
+ self.plot_type = plot_type
self.sort_by = sort_by
self.asc = asc
self.mode = mode
@@ -132,8 +140,37 @@ def __init__(
self.log_y = log_y
self.barmode = barmode
self.textangle = textangle
+ self.boxmode = (boxmode,)
+ self.points = (points,)
+ self.notched = (notched,)
+ self.violinmode = (violinmode,)
+ self.box = box
+
+ # Aggregate common plotting args
+ self.common_plot_args = {
+ "color": self.color,
+ "category_orders": self.category_orders,
+ "color_discrete_sequence": self.color_discrete_sequence,
+ "color_discrete_map": self.color_discrete_map,
+ "log_y": self.log_y,
+ }
+
+ # Collect plot specific args
+ self.bar_args = {
+ "barmode": self.barmode,
+ }
- # TODO: Refactor code for "common plot args" that can be reused for eventual box/violin plots
+ self.box_args = {
+ "boxmode": self.boxmode,
+ "points": self.points,
+ "notched": self.notched,
+ }
+
+ self.violin_args = {
+ "violinmode": self.violinmode,
+ "box": self.box,
+ "points": self.points,
+ }
# Figure-building specific attributes
self.fig = go.Figure()
@@ -164,6 +201,7 @@ def make_upset_plot(self):
c for c in self.df.columns if c != self.x and c != self.color
]
+ self.test = self.df.copy()
# Create intersect_counts df depending on if color provided
color = self.color
# TODO: Add grouping by x value input
@@ -258,15 +296,17 @@ def validate_upset_inputs(self):
f'Invalid input for "mode". Must be either "Counts" or "Percent" but you provided {mode}'
)
+ # Check plot_type is valid
+ plot_type = self.plot_type
+ try:
+ assert plot_type in VALID_PLOT_TYPES
+ except AssertionError:
+ raise ValueError(
+ f'Invalid input for "plot_type". Must be one of "bar", "box", or "violin" but you provided {plot_type}'
+ )
+
def make_primary_plot(self):
- bar_args = {
- "color": self.color,
- "category_orders": self.category_orders,
- "color_discrete_sequence": self.color_discrete_sequence,
- "color_discrete_map": self.color_discrete_map,
- "barmode": self.barmode,
- "log_y": self.log_y,
- }
+ bar_args = {**self.common_plot_args, **self.bar_args}
self.fig = px.bar(
self.intersect_counts, x="index", y="Counts", text="Counts", **bar_args
@@ -329,7 +369,7 @@ def make_switchboard(self):
showlegend=False,
marker=dict(size=16, color=self.subset_bgcolor),
text=labels,
- hovertemplate="%{text}",
+ hovertemplate="%{text}",
)
)
self.fig.update_layout(
@@ -359,7 +399,7 @@ def make_switchboard(self):
marker=dict(size=16, color=self.subset_fgcolor, showscale=False),
text=["+".join([x for x, y in zip(self.subset_names, s) if y != 0])]
* sum(s),
- hovertemplate="%{text}",
+ hovertemplate="%{text}",
)
)
@@ -377,15 +417,6 @@ def make_margin_plot(self):
.rename(columns={"index": "variable", 0: "value"})
)
- bar_args = {
- "color": self.color,
- "category_orders": self.category_orders,
- "color_discrete_sequence": self.color_discrete_sequence,
- "color_discrete_map": self.color_discrete_map,
- "barmode": self.barmode,
- "log_y": self.log_y,
- }
-
# Create counts px.bar chart
plot_df = counts_df.melt(id_vars=color) if color is not None else counts_df
if self.mode == "Percent":
@@ -397,11 +428,19 @@ def make_margin_plot(self):
)
else:
plot_df["value"] = round(plot_df["value"] / plot_df["value"].sum(), 2)
+
+ hover_data = {"variable": False}
+ bar_args = {**self.common_plot_args, **self.bar_args}
counts_bar = px.bar(
- plot_df, x="value", y="variable", orientation="h", text="value", **bar_args
+ plot_df,
+ x="value",
+ y="variable",
+ orientation="h",
+ text="value",
+ hover_data=hover_data,
+ **bar_args,
)
counts_bar.update_traces(textposition="outside", cliponaxis=False)
- # TODO: Change hover info to be more useful
# Add subset names as text into plot
subset_names = self.subset_names
From c654b807edc2aefb4cb40333f488f01a0a26e81f Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Thu, 11 May 2023 21:39:24 -0400
Subject: [PATCH 06/13] Added main docstring, fixed some functionality for
grouping, etc
---
.../plotly/plotly/figure_factory/_upset.py | 245 +++++++++++++-----
1 file changed, 175 insertions(+), 70 deletions(-)
diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py
index d22b3eb4a0b..76fa72aefd8 100644
--- a/packages/python/plotly/plotly/figure_factory/_upset.py
+++ b/packages/python/plotly/plotly/figure_factory/_upset.py
@@ -9,12 +9,13 @@
VALID_PLOT_TYPES = ["bar", "box", "violin"]
-
+# TODO: Add webpage documentation
def create_upset(
data_frame,
x=None,
color=None,
title=None,
+ plot_type="bar",
sort_by="Counts",
asc=False,
mode="Counts",
@@ -27,10 +28,89 @@ def create_upset(
color_discrete_sequence=None,
color_discrete_map=None,
log_y=False,
+ show_yaxis=False,
barmode="group",
textangle=0,
+ boxmode="group",
+ points="outliers",
+ notched=False,
+ violinmode="group",
+ box=False,
):
- # TODO: Add docstring and webpage documentation
+ """
+ Creates an UpSet plot, a scalable alternative to Venn diagrams. The interface supports a flexible range of use cases
+ and input data formats.
+
+ :param (pandas.DataFrame) data_frame: a DataFrame either in wide format with subset/intersection inclusion data, or
+ with a column in condensed format; see the tutorial for more details
+ :param (str) x: (optional) column name in data_frame for data point labels, e.g. sample name to cluster intersection
+ observations by
+ :param (str) color: (optional) column name in data_frame for grouping intersection counts, similar to plotly.express
+ inputs
+ :param (str) title: (optional) title for plot
+ :param (str) plot_type: (default="bar") type of plot to visualize intersection count data; must be one of "bar", "box", or "violin";
+ the latter two should only be used if x is provided, in which case they represent the distribution of intersection
+ counts (across color groups)
+ :param (str) sort_by: (default="Counts") order in which counts are displayed; must be one of "Counts" or "Intersections";
+ ignored if color is provided
+ :param (bool) asc: (default=False) sort in ascending order
+ :param (str) mode: (default="Counts") how to represent counts; must be one of "Counts" or "Percent"
+ :param (int) max_subsets: (default=20) maximum number of intersection subsets to display
+ :param (str) subset_column: (optional) if data is formatted in condensed form, input column name here with that data;
+ do not use if data is already formatted in wide format
+ :param (list) subset_order: (optional) if subset_column is provided, use this list of entries to specify order of labels
+ :param (str) subset_bgcolor: (default="#C9C9C9") color for background dots on switchboard
+ :param (str) subset_fgcolor: (default="#000000") color for foreground dots on switchboard
+ :param (dict) category_orders: (optional) specify order for groups in color, as in plotly.express inputs
+ :param (list) color_discrete_sequence: (optional) list of colors to use for color input, as in plotly.express inputs
+ :param (dict) color_discrete_map: (optional) map of color categories to color, as in plotly.express inputs
+ :param (bool) log_y: (default=False) use logarithmic y scale
+ :param (bool) show_yaxis: (default=False) show y-axis tickmarks
+ :param (str) barmode: (default="group") argument passed to plotly.express.bar when selected for plotting
+ :param (int) textangle: (default=0) angle to use when displaying counts above bars in a bar chart
+ :param (str) boxmode: (default="group") argument passed to plotly.express.box when selected for plotting
+ :param (str) points: (default="outliers") argument passed to plotly.express.box when selected for plotting
+ :param (bool) notched: (default=False) argument passed to plotly.express.box when selected for plotting
+ :param (str) violinmode: (default="group") argument passed to plotly.express.violin when selected for plotting
+ :param (bool) box: (default=False) argument passed to plotly.express.violin when selected for plotting
+
+ :rtype (plotly.graph_objects.Figure): returns UpSet plot rendered according to input settings.
+
+ Example 1: Simple Counts
+
+ >>> import plotly.express as px
+ >>> import plotly.figure_factory as ff
+
+ >>> df = px.data.iris()
+ >>> # Create 4 subsets defined by qualitative "large" conditions
+ >>> df['SL'] = df['sepal_length'].apply(lambda x: int(x > 6))
+ >>> df['SW'] = df['sepal_width'].apply(lambda x: int(x > 3))
+ >>> df['PL'] = df['petal_length'].apply(lambda x: int(x > 3))
+ >>> df['PW'] = df['petal_width'].apply(lambda x: int(x > 1))
+
+ >>> df = df[['species', 'SL', 'SW', 'PL', 'PW']]
+ >>> # Only use columns with inclusion in subset (0/1) values for this example
+ >>> fig = ff.create_upset(df.drop(columns=['species']), color_discrete_sequence=['#000000'])
+ >>> fig.show()
+
+ Example 2: Counting by Group
+
+ >>> # Continued from Example 1
+ >>> fig = ff.create_upset(df, color='species', asc=True)
+ >>> fig.show()
+
+ Example 3: Tracking Variance of Counts Across a Category
+
+ >>> # Continued from Example 1
+ >>> import numpy as np
+
+ >>> np.random.seed(100)
+ >>> # Add a dummy variable for "day entry was observed" to track variation of subset counts across the days
+ >>> df['day'] = np.random.randint(0, 5, len(df))
+ >>> fig = ff.create_upset(df.drop(columns=['species']), x='day', plot_type='box', show_yaxis=True)
+ >>> fig.update_layout(yaxis_side="right")
+ >>> fig.show()
+ """
plot_obj = _Upset(**locals())
upset_plot = plot_obj.make_upset_plot()
# TODO: Create tests for plotter
@@ -110,6 +190,7 @@ def __init__(
color_discrete_sequence=None,
color_discrete_map=None,
log_y=False,
+ show_yaxis=False,
barmode="group",
textangle=0,
boxmode="group",
@@ -128,7 +209,6 @@ def __init__(
self.sort_by = sort_by
self.asc = asc
self.mode = mode
- # TODO: Implement max_subsets in code
self.max_subsets = max_subsets
self.subset_column = subset_column
self.subset_order = subset_order
@@ -138,12 +218,13 @@ def __init__(
self.color_discrete_sequence = color_discrete_sequence
self.color_discrete_map = color_discrete_map
self.log_y = log_y
+ self.show_yaxis = show_yaxis
self.barmode = barmode
self.textangle = textangle
- self.boxmode = (boxmode,)
- self.points = (points,)
- self.notched = (notched,)
- self.violinmode = (violinmode,)
+ self.boxmode = boxmode
+ self.points = points
+ self.notched = notched
+ self.violinmode = violinmode
self.box = box
# Aggregate common plotting args
@@ -201,22 +282,20 @@ def make_upset_plot(self):
c for c in self.df.columns if c != self.x and c != self.color
]
- self.test = self.df.copy()
# Create intersect_counts df depending on if color provided
- color = self.color
- # TODO: Add grouping by x value input
- if self.color is not None:
- intersect_df = self.df.groupby(self.color).apply(
- lambda df: _transform_upset_data(
- df.drop(columns=[self.color])
- ).reset_index(drop=True)
+ groups = [x for x in [self.color, self.x] if x is not None]
+ if len(groups) > 0:
+ intersect_df = self.df.groupby(groups).apply(
+ lambda df: _transform_upset_data(df.drop(columns=groups)).reset_index(
+ drop=True
+ )
)
# Fill in counts for subsets where count is zero for certain color groups
filled_df = (
intersect_df.pivot_table(
index="Intersections",
- columns=[self.color],
+ columns=groups,
values="Counts",
fill_value=0,
)
@@ -228,18 +307,26 @@ def make_upset_plot(self):
# Perform sorting within each color group
# WARNING: If sort_by="Counts" it will be ignored here since this won't make sense when using groups
self.intersect_counts = (
- filled_df.groupby(self.color)
+ filled_df.groupby(groups)
.apply(
lambda df: _sort_intersect_counts(
- df.drop(columns=[self.color]),
+ df.drop(columns=groups),
sort_by="Intersections",
asc=self.asc,
).reset_index()
)
.reset_index()
.drop(columns=["index"])
- .rename(columns={"level_1": "index"})
+ .rename(
+ columns={"level_1": "index", "level_2": "index"}
+ ) # Not sure how to tell the two apart...
)
+
+ # Truncate subsets if necessary
+ self.intersect_counts = self.intersect_counts.groupby(groups).head(
+ self.max_subsets
+ )
+
else:
self.intersect_counts = _transform_upset_data(self.df)
self.intersect_counts = _sort_intersect_counts(
@@ -249,8 +336,11 @@ def make_upset_plot(self):
drop=True
).reset_index()
+ self.intersect_counts = self.intersect_counts.head(self.max_subsets)
+
# Rescale for percents if requested
mode = self.mode
+ # TODO: Check this input still works with all the subsetting changes...
if mode == "Percent":
if self.color is not None:
denom = self.intersect_counts.groupby(self.color).sum().reset_index()
@@ -306,20 +396,34 @@ def validate_upset_inputs(self):
)
def make_primary_plot(self):
- bar_args = {**self.common_plot_args, **self.bar_args}
+ plot_function = None
+ args = {}
+ update_traces = {}
+
+ if self.plot_type == "bar":
+ plot_function = px.bar
+ args = {**self.common_plot_args, **self.bar_args, "text": "Counts"}
+ update_traces = {
+ "textposition": "outside",
+ "cliponaxis": False,
+ "textangle": self.textangle,
+ }
+ elif self.plot_type == "box":
+ plot_function = px.box
+ args = {**self.common_plot_args, **self.box_args}
+ elif self.plot_type == "violin":
+ plot_function = px.violin
+ args = {**self.common_plot_args, **self.violin_args}
+
+ self.fig = plot_function(self.intersect_counts, x="index", y="Counts", **args)
+ self.fig.update_traces(**update_traces)
- self.fig = px.bar(
- self.intersect_counts, x="index", y="Counts", text="Counts", **bar_args
- )
- self.fig.update_traces(
- textposition="outside", cliponaxis=False, textangle=self.textangle
- )
self.fig.update_layout(
plot_bgcolor="#FFFFFF",
xaxis_visible=False,
xaxis_showticklabels=False,
- yaxis_visible=False,
- yaxis_showticklabels=False,
+ yaxis_visible=self.show_yaxis,
+ yaxis_showticklabels=self.show_yaxis,
)
def make_switchboard(self):
@@ -327,22 +431,13 @@ def make_switchboard(self):
Method to add subset points to input fig px.bar chart in the style of UpSet plot.
Returns updated figure, and list of heights of dots for downstream convenience.
"""
- # Compute list of intersections
- intersections = None
- if self.color is not None:
- # Pull out full list of possible intersection combinations from first color grouping
- query = (
- self.intersect_counts[self.color]
- == self.intersect_counts[self.color].iloc[0]
- )
- intersections = list(self.intersect_counts[query]["Intersections"])
- else:
- intersections = list(self.intersect_counts["Intersections"])
+ # Pull out full list of possible intersection combinations
+ intersections = list(self.intersect_counts["Intersections"].unique())
# Compute coordinates for bg subset scatter points
d = len(self.subset_names)
- num_bars = len(self.fig.data[0]["x"])
- x_bg_scatter = np.repeat(self.fig.data[0]["x"], d)
+ num_bars = len(intersections)
+ x_bg_scatter = np.repeat(range(num_bars), d)
y_scatter_offset = (
0.2 # Offsetting ensures bars will hover just above the subset scatterplot
)
@@ -367,7 +462,7 @@ def make_switchboard(self):
y=y_bg_scatter,
mode="markers",
showlegend=False,
- marker=dict(size=16, color=self.subset_bgcolor),
+ marker=dict(size=16, color=self.subset_bgcolor, showscale=False),
text=labels,
hovertemplate="%{text}",
)
@@ -375,7 +470,7 @@ def make_switchboard(self):
self.fig.update_layout(
xaxis=dict(showgrid=False, zeroline=False),
yaxis=dict(showgrid=True, zeroline=False),
- margin=dict(t=40, l=150),
+ margin=dict(t=0, l=0),
)
# Then fill in subset markers with fg color
@@ -407,18 +502,28 @@ def make_margin_plot(self):
"""
Method to add left margin count px.bar chart in style of UpSet plot.
"""
- # Group and count according to color input
+ # Group and count according to inputs
color = self.color
- counts_df = (
- self.df.groupby(color).sum().reset_index()
- if color is not None
- else self.df.sum()
- .reset_index()
- .rename(columns={"index": "variable", 0: "value"})
- )
+ groups = [x for x in [self.color, self.x] if x is not None]
+ # if len(groups) > 0:
+ # counts_df = self.df.groupby(groups).sum().reset_index()
+ if self.color is not None:
+ counts_df = self.df.groupby(self.color).sum().reset_index()
+ if self.x is not None:
+ counts_df = counts_df.drop(columns=[self.x])
+ else:
+ counts_df = (
+ self.df.sum()
+ .reset_index()
+ .rename(columns={"index": "variable", 0: "value"})
+ )
# Create counts px.bar chart
- plot_df = counts_df.melt(id_vars=color) if color is not None else counts_df
+ plot_df = (
+ counts_df.melt(id_vars=[self.color])
+ if self.color is not None
+ else counts_df
+ )
if self.mode == "Percent":
if color is not None:
denom = plot_df.groupby(color).sum().reset_index()
@@ -429,25 +534,24 @@ def make_margin_plot(self):
else:
plot_df["value"] = round(plot_df["value"] / plot_df["value"].sum(), 2)
- hover_data = {"variable": False}
- bar_args = {**self.common_plot_args, **self.bar_args}
- counts_bar = px.bar(
- plot_df,
- x="value",
- y="variable",
- orientation="h",
- text="value",
- hover_data=hover_data,
- **bar_args,
+ plot_function = px.bar
+ update_traces = {"textposition": "outside", "cliponaxis": False}
+ args = {
+ **self.common_plot_args,
+ **self.bar_args,
+ "text": "value",
+ "hover_data": {"variable": False},
+ }
+
+ counts_fig = plot_function(
+ plot_df, x="value", y="variable", orientation="h", **args
)
- counts_bar.update_traces(textposition="outside", cliponaxis=False)
+ counts_fig.update_traces(**update_traces)
# Add subset names as text into plot
- subset_names = self.subset_names
- # subset_names = counts_bar.data[0]['y']
- max_name_len = max([len(s) for s in subset_names])
+ max_name_len = max([len(s) for s in self.subset_names])
annotation_center = -1 + -0.01 * max_name_len
- for i, s in enumerate(subset_names):
+ for i, s in enumerate(self.subset_names):
self.fig.add_annotation(
x=annotation_center,
y=self.switchboard_heights[i],
@@ -458,11 +562,12 @@ def make_margin_plot(self):
)
# Reflect horizontally the bars while preserving labels; Shift heights to match input subset scatter heights
- for trace in counts_bar.data:
+ for trace in counts_fig.data:
trace["x"] = -trace["x"] / max(trace["x"])
trace["y"] = self.switchboard_heights
- counts_bar.update_traces(base=annotation_center - 1, showlegend=False)
+ # if self.plot_type == 'bar':
+ counts_fig.update_traces(base=annotation_center - 1, showlegend=False)
# Add counts chart traces to main fig
- for trace in counts_bar.data:
+ for trace in counts_fig.data:
self.fig.add_trace(trace)
From 537ec39c80c609227c7dbdcada211489d862fa94 Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Thu, 11 May 2023 22:26:32 -0400
Subject: [PATCH 07/13] Changed margins to fix title issue; added webdocs
---
doc/python/upset-plots.md | 143 ++++++++++++++++++
.../plotly/plotly/figure_factory/_upset.py | 6 +-
2 files changed, 146 insertions(+), 3 deletions(-)
create mode 100644 doc/python/upset-plots.md
diff --git a/doc/python/upset-plots.md b/doc/python/upset-plots.md
new file mode 100644
index 00000000000..a701f538850
--- /dev/null
+++ b/doc/python/upset-plots.md
@@ -0,0 +1,143 @@
+---
+jupyter:
+ jupytext:
+ notebook_metadata_filter: all
+ text_representation:
+ extension: .md
+ format_name: markdown
+ format_version: '1.1'
+ jupytext_version: 1.2.3
+ kernelspec:
+ display_name: Python 3
+ language: python
+ name: python3
+ language_info:
+ codemirror_mode:
+ name: ipython
+ version: 3
+ file_extension: .py
+ mimetype: text/x-python
+ name: python
+ nbconvert_exporter: python
+ pygments_lexer: ipython3
+ version: 3.7.3
+ plotly:
+ description: How to make an UpSet plot in Python, which can be used to display counts of
+      arbitrarily complex set intersections.
+ display_as: scientific
+ language: python
+ layout: base
+ name: UpSet Plots
+ order: 10
+ permalink: python/upset-plots/
+ thumbnail: thumbnail/quiver-plot.jpg
+---
+
+[UpSet plots](https://en.wikipedia.org/wiki/UpSet_Plot) allow you to visualize data that counts the different intersections
+of subsets inside a set. This could arise from actual intersections, or from counting tag occurrences on data which need not be
+disjoint. Data used in this method must be in one of two forms: wide or condensed. If the latter is provided, then the
+data will be transformed into the wide format before proceeding with the plot generation.
+
+#### A Simple UpSet Plot
+```python
+import plotly.express as px
+import plotly.figure_factory as ff
+
+df = px.data.iris()
+
+# Create categorical non-disjoint tags for "large" features
+df['SL'] = df['sepal_length'].apply(lambda x: int(x > 6))
+df['SW'] = df['sepal_width'].apply(lambda x: int(x > 3))
+df['PL'] = df['petal_length'].apply(lambda x: int(x > 3))
+df['PW'] = df['petal_width'].apply(lambda x: int(x > 1))
+
+df = df[['SL', 'SW', 'PL', 'PW']] # data in "wide" form
+fig = ff.create_upset(df, color_discrete_sequence=['#000000'])
+fig.show()
+```
+
+
+#### Using Condensed Format
+
+Sometimes it's more convenient to have data where one column is given as a list of (possibly) overlapping tags that a data
+point has. This can be thought of as dividing our dataset into a family of subsets, one for each tag. UpSet plots can help
+analyze how different combinations of these tags are distributed in the data.
+
+As long as the entries in this column are a list/tuple, this method can handle the preprocessing step of getting the
+data into the "wide" format like above. We simulate this below.
+
+```python
+import plotly.express as px
+import plotly.figure_factory as ff
+
+df = px.data.iris()
+
+# Create categorical non-disjoint tags for "large" features
+df['SL'] = df['sepal_length'].apply(lambda x: int(x > 6))
+df['SW'] = df['sepal_width'].apply(lambda x: int(x > 3))
+df['PL'] = df['petal_length'].apply(lambda x: int(x > 3))
+df['PW'] = df['petal_width'].apply(lambda x: int(x > 1))
+
+# Simulate "tags" column
+df['tags'] = df['sepal_length'].apply(lambda x: ['SL'] if x > 6 else ['']) + df['sepal_width'].apply(lambda x: ['SW'] if x > 3 else ['']) + \
+ df['petal_length'].apply(lambda x: ['PL'] if x > 3 else ['']) + df['petal_width'].apply(lambda x: ['PW'] if x > 1 else [''])
+df['tags'] = df['tags'].apply(lambda x: [y for y in x if y != ''])
+
+# Note we can (optionally) choose the order for how the method unpacks the tags
+fig = ff.create_upset(df, subset_column='tags', subset_order=['PW', 'SW', 'PL', 'SL'], color_discrete_sequence=['#000000'])
+fig.show()
+```
+
+#### Grouping Data by Color
+
+This method supports two ways of grouping data to visualize counts of subset intersections. The first, shown here,
+allows you to see how these counts vary by subset in parallel across categories described by another column.
+
+```python
+import plotly.express as px
+import plotly.figure_factory as ff
+
+df = px.data.iris()
+
+# Create categorical non-disjoint tags for "large" features
+df['SL'] = df['sepal_length'].apply(lambda x: int(x > 6))
+df['SW'] = df['sepal_width'].apply(lambda x: int(x > 3))
+df['PL'] = df['petal_length'].apply(lambda x: int(x > 3))
+df['PW'] = df['petal_width'].apply(lambda x: int(x > 1))
+
+df = df[['species', 'SL', 'SW', 'PL', 'PW']] # data in "wide" form, with extra "species" column
+# Note: ONLY the extra color column was kept, as rest of columns are inferred to make "wide" format subset data
+fig = ff.create_upset(df, color='species', asc=True) # Can toggle in "asc" order
+fig.show()
+```
+
+#### Visualizing Distributions of Counts by Subset
+
+The other way to group data is to provide a column which provides a label for different clusters of observations. This
+could be e.g. the day of the observation, different samples in biology, or any other way of dividing up the same
+observations in different situations. This technique lets you see how the different subset counts vary across this
+dimension.
+
+```python
+import plotly.express as px
+import plotly.figure_factory as ff
+import numpy as np
+
+df = px.data.iris()
+
+# Create categorical non-disjoint tags for "large" features
+df['SL'] = df['sepal_length'].apply(lambda x: int(x > 6))
+df['SW'] = df['sepal_width'].apply(lambda x: int(x > 3))
+df['PL'] = df['petal_length'].apply(lambda x: int(x > 3))
+df['PW'] = df['petal_width'].apply(lambda x: int(x > 1))
+df = df[['SL', 'SW', 'PL', 'PW']]
+
+# Simulate random "day" of observation
+np.random.seed(100)
+df['day'] = np.random.randint(0, 5, len(df))
+
+fig = ff.create_upset(df, x='day', plot_type='box', show_yaxis=True, title='Variation of Tags by Day')
+fig.update_layout(yaxis_side="right")
+fig.show()
+```
+
diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py
index 76fa72aefd8..d9455836f49 100644
--- a/packages/python/plotly/plotly/figure_factory/_upset.py
+++ b/packages/python/plotly/plotly/figure_factory/_upset.py
@@ -1,6 +1,6 @@
from __future__ import absolute_import
-from plotly import exceptions, optional_imports
+from plotly import optional_imports
import plotly.graph_objects as go
import plotly.express as px
@@ -9,7 +9,7 @@
VALID_PLOT_TYPES = ["bar", "box", "violin"]
-# TODO: Add webpage documentation
+
def create_upset(
data_frame,
x=None,
@@ -470,7 +470,7 @@ def make_switchboard(self):
self.fig.update_layout(
xaxis=dict(showgrid=False, zeroline=False),
yaxis=dict(showgrid=True, zeroline=False),
- margin=dict(t=0, l=0),
+ margin=dict(t=40, l=40),
)
# Then fill in subset markers with fg color
From 41f9c400889b6455e77b203d8323a12b91edf374 Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Thu, 11 May 2023 23:03:44 -0400
Subject: [PATCH 08/13] Added some simple tests and removed some debugging code
---
.../plotly/plotly/figure_factory/_upset.py | 11 ++---
.../test_figure_factory.py | 49 +++++++++++++++++++
2 files changed, 53 insertions(+), 7 deletions(-)
diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py
index d9455836f49..0ffc279721e 100644
--- a/packages/python/plotly/plotly/figure_factory/_upset.py
+++ b/packages/python/plotly/plotly/figure_factory/_upset.py
@@ -113,8 +113,7 @@ def create_upset(
"""
plot_obj = _Upset(**locals())
upset_plot = plot_obj.make_upset_plot()
- # TODO: Create tests for plotter
- return upset_plot, plot_obj
+ return upset_plot
def _expand_subset_column(df, subset_column, subset_order=None):
@@ -124,7 +123,9 @@ def _expand_subset_column(df, subset_column, subset_order=None):
subset_names = (
subset_order
if subset_order is not None
- else list(df[subset_column].explode().unique())
+ else [
+ x for x in df[subset_column].explode().unique() if not pd.isnull(x)
+ ] # Remove empty subset = NaN
)
new_df = df.copy()
for name in subset_names:
@@ -262,9 +263,6 @@ def __init__(
# Validate inputs
self.validate_upset_inputs()
- # DEBUG
- self.test = None
-
def make_upset_plot(self):
# If subset_column provided, expand into standard wider format
if self.subset_column is not None:
@@ -340,7 +338,6 @@ def make_upset_plot(self):
# Rescale for percents if requested
mode = self.mode
- # TODO: Check this input still works with all the subsetting changes...
if mode == "Percent":
if self.color is not None:
denom = self.intersect_counts.groupby(self.color).sum().reset_index()
diff --git a/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py b/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py
index 8783dce1ab4..49a9843d456 100644
--- a/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py
+++ b/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py
@@ -4516,3 +4516,52 @@ def test_build_dataframe(self):
assert len(fig6.frames) == n_frames
assert len(fig7.frames) == n_frames
assert fig6.data[0].geojson == fig1.data[0].geojson
+
+
+class TestUpset(TestCaseNoTemplate):
+ # Test compatibilities between using wide format input data vs condensed
+ def test_wide_vs_condensed(self):
+ np.random.seed(0)
+
+ a = np.random.randint(0, 2, 1000)
+ b = np.random.randint(0, 2, 1000)
+ c = np.random.randint(0, 2, 1000)
+ color = np.random.randint(0, 3, 1000).astype(str)
+
+ df = pd.DataFrame({"a": a, "b": b, "c": c, "color": color})
+ fig1 = ff.create_upset(df.drop(columns=["color"]))
+ fig2 = ff.create_upset(
+ df.drop(columns=["color"]), sort_by="Intersections", asc=False
+ )
+ fig3 = ff.create_upset(df, color="color")
+
+ for tag in ["a", "b", "c"]:
+ df[tag] = df[tag].map({1: [tag], 0: [""]})
+
+ df["tags"] = df["a"] + df["b"] + df["c"]
+ df["tags"] = df["tags"].apply(lambda x: [y for y in x if y != ""])
+
+ fig4 = ff.create_upset(
+ df.drop(columns=["a", "b", "c", "color"]), subset_column="tags"
+ )
+ fig5 = ff.create_upset(
+ df.drop(columns=["a", "b", "c"]),
+ subset_column="tags",
+ sort_by="Intersections",
+ asc=False,
+ )
+ fig6 = ff.create_upset(
+ df.drop(columns=["a", "b", "c"]), subset_column="tags", color="color"
+ )
+
+ for data in zip(fig1.data, fig4.data):
+ self.assert_fig_equal(data[0], data[1])
+ self.assert_fig_equal(fig1.layout, fig4.layout)
+
+ for data in zip(fig2.data, fig5.data):
+ self.assert_fig_equal(data[0], data[1])
+ self.assert_fig_equal(fig2.layout, fig5.layout)
+
+ for data in zip(fig3.data, fig6.data):
+ self.assert_fig_equal(data[0], data[1])
+ self.assert_fig_equal(fig3.layout, fig6.layout)
From dbfab5df61d1859937af975bac9f5e3a32b91395 Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Fri, 12 May 2023 09:42:20 -0400
Subject: [PATCH 09/13] Fixed inheritance for test
---
.../test_optional/test_figure_factory/test_figure_factory.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py b/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py
index 49a9843d456..c128f62a581 100644
--- a/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py
+++ b/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py
@@ -4518,7 +4518,7 @@ def test_build_dataframe(self):
assert fig6.data[0].geojson == fig1.data[0].geojson
-class TestUpset(TestCaseNoTemplate):
+class TestUpset(NumpyTestUtilsMixin, TestCaseNoTemplate):
# Test compatibilities between using wide format input data vs condensed
def test_wide_vs_condensed(self):
np.random.seed(0)
From 1af712f2cdb270a20e9ca3f39785a1d6da604c95 Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Fri, 12 May 2023 10:01:38 -0400
Subject: [PATCH 10/13] Changed order for subset labels in test
---
.../test_figure_factory/test_figure_factory.py | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py b/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py
index c128f62a581..3a0ca92cd84 100644
--- a/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py
+++ b/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py
@@ -4542,16 +4542,22 @@ def test_wide_vs_condensed(self):
df["tags"] = df["tags"].apply(lambda x: [y for y in x if y != ""])
fig4 = ff.create_upset(
- df.drop(columns=["a", "b", "c", "color"]), subset_column="tags"
+ df.drop(columns=["a", "b", "c", "color"]),
+ subset_column="tags",
+ subset_order=["a", "b", "c"],
)
fig5 = ff.create_upset(
df.drop(columns=["a", "b", "c"]),
subset_column="tags",
+ subset_order=["a", "b", "c"],
sort_by="Intersections",
asc=False,
)
fig6 = ff.create_upset(
- df.drop(columns=["a", "b", "c"]), subset_column="tags", color="color"
+ df.drop(columns=["a", "b", "c"]),
+ subset_column="tags",
+ subset_order=["a", "b", "c"],
+ color="color",
)
for data in zip(fig1.data, fig4.data):
From 38f7c8f55036221674642c18098a2991507fcdaa Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Fri, 12 May 2023 11:19:44 -0400
Subject: [PATCH 11/13] Updated permalink in notebook doc
---
doc/python/upset-plots.md | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/doc/python/upset-plots.md b/doc/python/upset-plots.md
index a701f538850..0b9b454a346 100644
--- a/doc/python/upset-plots.md
+++ b/doc/python/upset-plots.md
@@ -29,8 +29,7 @@ arbitrarily complex set intersections.
layout: base
name: UpSet Plots
order: 10
- permalink: python/quiver-plots/
- thumbnail: thumbnail/quiver-plot.jpg
+ permalink: python/upset-plots/
---
[UpSet plots](https://en.wikipedia.org/wiki/UpSet_Plot) allow you to visualize data that counts different intersections
From fd2420db455a2c289545bc891fc567da97cf962d Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Fri, 12 May 2023 12:13:59 -0400
Subject: [PATCH 12/13] Updated CHANGELOG
---
CHANGELOG.md | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 880774085a9..931648b5d46 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,10 @@ This project adheres to [Semantic Versioning](http://semver.org/).
- Fixed another compatibility issue with Pandas 2.0, just affecting `px.*(line_close=True)` [[#4190](https://github.com/plotly/plotly.py/pull/4190)]
- Added some rounding to the `make_subplots` function to handle situations where the user-input specs cause the domain to exceed 1 by small amounts https://github.com/plotly/plotly.py/pull/4153
+### Added
+
+ - Added implementation of [UpSet plots](https://en.wikipedia.org/wiki/UpSet_Plot) in `plotly.figure_factory` via the `create_upset` function [[#4204](https://github.com/plotly/plotly.py/pull/4204)]
+
## [5.14.1] - 2023-04-05
### Fixed
From 4e0f60f701defcea0ef16f3c56d9b5768911872d Mon Sep 17 00:00:00 2001
From: rickymagner <81349869+rickymagner@users.noreply.github.com>
Date: Fri, 12 May 2023 15:35:16 -0400
Subject: [PATCH 13/13] Fixed some bugs in scaling/labeling margin plot
---
.../python/plotly/plotly/figure_factory/_upset.py | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py
index 0ffc279721e..8f02659e90f 100644
--- a/packages/python/plotly/plotly/figure_factory/_upset.py
+++ b/packages/python/plotly/plotly/figure_factory/_upset.py
@@ -304,6 +304,7 @@ def make_upset_plot(self):
# Perform sorting within each color group
# WARNING: If sort_by="Counts" it will be ignored here since this won't make sense when using groups
+ # TODO: Make sensible behavior for sort by "Counts" in this case
self.intersect_counts = (
filled_df.groupby(groups)
.apply(
@@ -523,13 +524,18 @@ def make_margin_plot(self):
)
if self.mode == "Percent":
if color is not None:
- denom = plot_df.groupby(color).sum().reset_index()
+ denom = (
+ self.df.groupby(color)
+ .apply(lambda df: len(df))
+ .reset_index()
+ .rename(columns={0: "value"})
+ )
denom_dict = dict(zip(denom[color], denom["value"]))
plot_df["value"] = round(
plot_df["value"] / plot_df[color].map(denom_dict), 2
)
else:
- plot_df["value"] = round(plot_df["value"] / plot_df["value"].sum(), 2)
+ plot_df["value"] = round(plot_df["value"] / len(self.df), 2)
plot_function = px.bar
update_traces = {"textposition": "outside", "cliponaxis": False}
@@ -559,10 +565,10 @@ def make_margin_plot(self):
)
# Reflect horizontally the bars while preserving labels; Shift heights to match input subset scatter heights
+ max_x = max([max(t["x"]) for t in counts_fig.data])
for trace in counts_fig.data:
- trace["x"] = -trace["x"] / max(trace["x"])
+ trace["x"] = -trace["x"] / max_x
trace["y"] = self.switchboard_heights
- # if self.plot_type == 'bar':
counts_fig.update_traces(base=annotation_center - 1, showlegend=False)
# Add counts chart traces to main fig