From b989178cc342152140585366c075284a79e8b069 Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Wed, 10 May 2023 11:36:24 -0400 Subject: [PATCH 01/13] Adding stable base for UpSet plots w/ single or multiple group color modes. --- .../plotly/plotly/figure_factory/__init__.py | 2 + .../plotly/plotly/figure_factory/_upset.py | 262 ++++++++++++++++++ 2 files changed, 264 insertions(+) create mode 100644 packages/python/plotly/plotly/figure_factory/_upset.py diff --git a/packages/python/plotly/plotly/figure_factory/__init__.py b/packages/python/plotly/plotly/figure_factory/__init__.py index 0a41dca1ba2..4b82aeda542 100644 --- a/packages/python/plotly/plotly/figure_factory/__init__.py +++ b/packages/python/plotly/plotly/figure_factory/__init__.py @@ -25,6 +25,7 @@ from plotly.figure_factory._streamline import create_streamline from plotly.figure_factory._table import create_table from plotly.figure_factory._trisurf import create_trisurf +from plotly.figure_factory._upset import create_upset from plotly.figure_factory._violin import create_violin if optional_imports.get_module("pandas") is not None: @@ -65,5 +66,6 @@ def create_ternary_contour(*args, **kwargs): "create_table", "create_ternary_contour", "create_trisurf", + "create_upset", "create_violin", ] diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py new file mode 100644 index 00000000000..14966369c3a --- /dev/null +++ b/packages/python/plotly/plotly/figure_factory/_upset.py @@ -0,0 +1,262 @@ +from __future__ import absolute_import + +from plotly import exceptions, optional_imports +import plotly.graph_objects as go +import plotly.express as px + +pd = optional_imports.get_module("pandas") +np = optional_imports.get_module("numpy") + + +CHART_TYPES = ['bar', 'box', 'violin'] + + +def create_upset(data_frame, x=None, color=None, title=None, sort_by='Counts', asc=False, mode='Counts', + 
max_subsets=50, subset_bgcolor='#C9C9C9', subset_fgcolor='#000000', category_orders=None, + color_discrete_sequence=None, color_discrete_map=None, log_y=False, barmode='group', textangle=0): + plot_obj = _Upset(**locals()) + upset_plot = plot_obj.make_upset_plot() + return upset_plot, plot_obj + + +def _expand_subset_column(df): + # TODO: Fill in this method for alternate data representation + # TODO: Add input for subset_column + pass + + +def _make_binary(t): + """ + Converts tuple of 0,1s to binary number. Used in _transform_upset_data for sort order. + """ + return sum([t[i] * 2**i for i in range(len(t))]) + + +def _transform_upset_data(df): + """ + Takes raw data of binary vectors for set inclusion and produces counts over each. + """ + intersect_counts = pd.DataFrame({'Intersections': list(df.value_counts().to_dict().keys()), + 'Counts': list(df.value_counts().to_dict().values())}) + return intersect_counts + + +def _sort_intersect_counts(df, sort_by='Counts', asc=True): + """ + Takes output from _transform_upset_data and sorts by method requested. + """ + key = None if (sort_by == 'Counts') else lambda x: x.apply(lambda y: (sum(y), _make_binary(y))) + df = df.sort_values(by=sort_by, key=key, ascending=asc) + return df + + +class _Upset: + """ + Represents builder object for UpSet plot. Refer to figure_factory.create_upset() for full docstring. 
+ """ + + def __init__(self, data_frame, x=None, color=None, title=None, sort_by='Counts', asc=False, mode='Counts', + max_subsets=50, subset_bgcolor='#C9C9C9', subset_fgcolor='#000000', category_orders=None, + color_discrete_sequence=None, color_discrete_map=None, log_y=False, barmode='group', textangle=0): + + # Plot inputs and settings + self.df = data_frame + self.x = x + self.color = color + self.title = title + self.sort_by = sort_by + self.asc = asc + self.mode = mode + self.max_subsets = max_subsets, + self.subset_bgcolor = subset_bgcolor + self.subset_fgcolor = subset_fgcolor + self.category_orders = category_orders + self.color_discrete_sequence = color_discrete_sequence + self.color_discrete_map = color_discrete_map + self.log_y = log_y + self.barmode = barmode + self.textangle = textangle + + # TODO: Refactor code for "common plot args" that can be reused for eventual box/violin plots + + # Figure-building specific attributes + self.fig = go.Figure() + self.intersect_counts = pd.DataFrame() + self.subset_col_names = [c for c in data_frame.columns if c != x and c != color] + self.switchboard_heights = [] + + # Validate inputs + self.validate_upset_inputs() + + # DEBUG + self.test = None + + def make_upset_plot(self): + # Create intersect_counts df depending on if color provided + color = self.color + df = self.df + if color is not None: + # TODO: Consider refactor using groupby instead of looping over colors + for c in df[color].unique(): + sub_df = df[df[color] == c].drop(columns=[color]) + if self.x is not None: + # TODO: Check counting code for clustering by x value for distribution plots + new_df = sub_df.groupby(self.x).apply(lambda x: _transform_upset_data(x.drop(columns=['self.x']))) + new_df = new_df.reset_index()[[self.x, 'Intersections', 'Counts']] + else: + new_df = _transform_upset_data(sub_df) + # Sort subgroup in requested order + new_df = _sort_intersect_counts(new_df, sort_by=self.sort_by, 
asc=self.asc).reset_index(drop=True).reset_index() + new_df[color] = c + self.intersect_counts = pd.concat([self.intersect_counts, new_df]) + # TODO: Need to saturate each cluster with 0 value for subsets in one but not other... + else: + self.intersect_counts = _transform_upset_data(df) + self.intersect_counts = _sort_intersect_counts(self.intersect_counts, sort_by=self.sort_by, asc=self.asc) + self.intersect_counts = self.intersect_counts.reset_index(drop=True).reset_index() + + # Rescale for percents if requested + mode = self.mode + if mode == 'Percent': + if color is not None: + denom = self.intersect_counts.groupby(color).sum().reset_index() + denom_dict = dict(zip(denom[color], denom['Counts'])) + self.intersect_counts['Counts'] = round(self.intersect_counts['Counts'] / self.intersect_counts[color].map(denom_dict), 2) + else: + self.intersect_counts['Counts'] = round(self.intersect_counts['Counts'] / self.intersect_counts['Counts'].sum(), 2) + + # Create 3 main components for figure + self.make_primary_plot() + self.make_switchboard() + self.make_margin_plot() + + # Add title + self.fig.update_layout(title=self.title, title_x=0.5) + + return self.fig + + def validate_upset_inputs(self): + # Check sorting inputs are valid + sort_by = self.sort_by + try: + assert (sort_by == 'Counts') or (sort_by == 'Intersections') + except AssertionError: + raise ValueError(f'Invalid input for "sort_by". Must be either "Counts" or "Intersections" but you provided {sort_by}') + + # Check mode is either Counts or Percent + mode = self.mode + try: + assert (mode == 'Counts') or (mode == 'Percent') + except AssertionError: + raise ValueError(f'Invalid input for "mode". 
Must be either "Counts" or "Percent" but you provided {mode}') + + def make_primary_plot(self): + bar_args = { + 'color': self.color, + 'category_orders': self.category_orders, + 'color_discrete_sequence': self.color_discrete_sequence, + 'color_discrete_map': self.color_discrete_map, + 'barmode': self.barmode, + 'log_y': self.log_y + } + + # TODO: Override default hover info for something more useful + self.fig = px.bar(self.intersect_counts, x='index', y='Counts', text='Counts', **bar_args) + self.fig.update_traces(textposition='outside', cliponaxis=False, textangle=self.textangle) + self.fig.update_layout(plot_bgcolor='#FFFFFF', xaxis_visible=False, xaxis_showticklabels=False, + yaxis_visible=False, yaxis_showticklabels=False) + + def make_switchboard(self): + """ + Method to add subset points to input fig px.bar chart in the style of UpSet plot. + Returns updated figure, and list of heights of dots for downstream convenience. + """ + # Compute coordinates for bg subset scatter points + d = len(self.subset_col_names) + num_bars = len(self.fig.data[0]['x']) + x_bg_scatter = np.repeat(self.fig.data[0]['x'], d) + y_scatter_offset = 0.2 # Offsetting ensures bars will hover just above the subset scatterplot + y_max = (1 + y_scatter_offset) * max([max(bar['y']) for bar in self.fig.data]) + self.switchboard_heights = [-y_max / d * i - y_scatter_offset * y_max for i in list(range(d))] + y_bg_scatter = num_bars * self.switchboard_heights + + # Add bg subset scatter points to figure below bar chart + self.fig.add_trace(go.Scatter(x=x_bg_scatter, y=y_bg_scatter, mode='markers', showlegend=False, + marker=dict(size=16, color=self.subset_bgcolor))) + self.fig.update_layout(xaxis=dict(showgrid=False, zeroline=False), yaxis=dict(showgrid=True, zeroline=False), + margin=dict(t=40, l=150)) + + # Compute list of intersections + intersections = None + if self.color is not None: + # Pull out full list of possible intersection combinations from first color grouping + query = 
self.intersect_counts[self.color] == self.intersect_counts[self.color].iloc[0] + intersections = list(self.intersect_counts[query]['Intersections']) + else: + intersections = list(self.intersect_counts['Intersections']) + + # Then fill in subset markers with fg color + x = 0 + for s in intersections: + x_subsets = [] + y_subsets = [] + y = 0 + for e in s: + if e: + x_subsets += [x] + y_subsets += [-y_max / d * y - y_scatter_offset * y_max] + y += 1 + x += 1 + # TODO: Add hover information for subset/intersection description + self.fig.add_trace(go.Scatter(x=x_subsets, y=y_subsets, mode='markers+lines', showlegend=False, + marker=dict(size=16, color=self.subset_fgcolor, showscale=False))) + + def make_margin_plot(self): + """ + Method to add left margin count px.bar chart in style of UpSet plot. + """ + # Group and count according to color input + color = self.color + counts_df = self.df.groupby(color).sum().reset_index() if color is not None else self.df.sum().reset_index().rename( + columns={'index': 'variable', 0: 'value'}) + + bar_args = { + 'color': self.color, + 'category_orders': self.category_orders, + 'color_discrete_sequence': self.color_discrete_sequence, + 'color_discrete_map': self.color_discrete_map, + 'barmode': self.barmode, + 'log_y': self.log_y + } + + # Create counts px.bar chart + plot_df = counts_df.melt(id_vars=color) if color is not None else counts_df + if self.mode == 'Percent': + if color is not None: + denom = plot_df.groupby(color).sum().reset_index() + denom_dict = dict(zip(denom[color], denom['value'])) + plot_df['value'] = round(plot_df['value'] / plot_df[color].map(denom_dict), 2) + else: + plot_df['value'] = round(plot_df['value'] / plot_df['value'].sum(), 2) + counts_bar = px.bar(plot_df, x='value', y='variable', orientation='h', text='value', **bar_args) + counts_bar.update_traces(textposition='outside', cliponaxis=False) + # TODO: Change hover info to be more useful + + # Add subset names as text into plot + subset_names = 
self.subset_col_names + # subset_names = counts_bar.data[0]['y'] + max_name_len = max([len(s) for s in subset_names]) + annotation_center = -1 + -0.01 * max_name_len + for i, s in enumerate(subset_names): + self.fig.add_annotation(x=annotation_center, y=self.switchboard_heights[i], text=s, showarrow=False, + font=dict(size=12, color='#000000'), align='left') + + # Reflect horizontally the bars while preserving labels; Shift heights to match input subset scatter heights + for trace in counts_bar.data: + trace['x'] = -trace['x'] / max(trace['x']) + trace['y'] = self.switchboard_heights + counts_bar.update_traces(base=annotation_center - 1, showlegend=False) + + # Add counts chart traces to main fig + for trace in counts_bar.data: + self.fig.add_trace(trace) From e06813ba235b999a7fc5523423d08b0200d8d26e Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Wed, 10 May 2023 14:47:23 -0400 Subject: [PATCH 02/13] Added functionality to allow user to specify column of lists/tuples for subset inclusion. 
--- .../plotly/plotly/figure_factory/_upset.py | 294 +++++++++++++----- 1 file changed, 218 insertions(+), 76 deletions(-) diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py index 14966369c3a..2506320d3ac 100644 --- a/packages/python/plotly/plotly/figure_factory/_upset.py +++ b/packages/python/plotly/plotly/figure_factory/_upset.py @@ -8,21 +8,49 @@ np = optional_imports.get_module("numpy") -CHART_TYPES = ['bar', 'box', 'violin'] - - -def create_upset(data_frame, x=None, color=None, title=None, sort_by='Counts', asc=False, mode='Counts', - max_subsets=50, subset_bgcolor='#C9C9C9', subset_fgcolor='#000000', category_orders=None, - color_discrete_sequence=None, color_discrete_map=None, log_y=False, barmode='group', textangle=0): +CHART_TYPES = ["bar", "box", "violin"] + + +def create_upset( + data_frame, + x=None, + color=None, + title=None, + sort_by="Counts", + asc=False, + mode="Counts", + max_subsets=50, + subset_column=None, + subset_order=None, + subset_bgcolor="#C9C9C9", + subset_fgcolor="#000000", + category_orders=None, + color_discrete_sequence=None, + color_discrete_map=None, + log_y=False, + barmode="group", + textangle=0, +): plot_obj = _Upset(**locals()) upset_plot = plot_obj.make_upset_plot() + # TODO: Create tests for plotter return upset_plot, plot_obj -def _expand_subset_column(df): - # TODO: Fill in this method for alternate data representation - # TODO: Add input for subset_column - pass +def _expand_subset_column(df, subset_column, subset_order): + """ + Takes a column of iterables and expands into binary columns representing inclusion. Also returns subset_names. 
+ """ + subset_names = ( + subset_order + if subset_order is not None + else list(df[subset_column].explode().unique()) + ) + new_df = df.copy() + for name in subset_names: + new_df[name] = new_df[subset_column].apply(lambda x: int(name in x)) + new_df = new_df[subset_names] + return new_df, subset_names def _make_binary(t): @@ -36,16 +64,24 @@ def _transform_upset_data(df): """ Takes raw data of binary vectors for set inclusion and produces counts over each. """ - intersect_counts = pd.DataFrame({'Intersections': list(df.value_counts().to_dict().keys()), - 'Counts': list(df.value_counts().to_dict().values())}) + intersect_counts = pd.DataFrame( + { + "Intersections": list(df.value_counts().to_dict().keys()), + "Counts": list(df.value_counts().to_dict().values()), + } + ) return intersect_counts -def _sort_intersect_counts(df, sort_by='Counts', asc=True): +def _sort_intersect_counts(df, sort_by="Counts", asc=True): """ Takes output from _transform_upset_data and sorts by method requested. """ - key = None if (sort_by == 'Counts') else lambda x: x.apply(lambda y: (sum(y), _make_binary(y))) + key = ( + None + if (sort_by == "Counts") + else lambda x: x.apply(lambda y: (sum(y), _make_binary(y))) + ) df = df.sort_values(by=sort_by, key=key, ascending=asc) return df @@ -55,9 +91,27 @@ class _Upset: Represents builder object for UpSet plot. Refer to figure_factory.create_upset() for full docstring. 
""" - def __init__(self, data_frame, x=None, color=None, title=None, sort_by='Counts', asc=False, mode='Counts', - max_subsets=50, subset_bgcolor='#C9C9C9', subset_fgcolor='#000000', category_orders=None, - color_discrete_sequence=None, color_discrete_map=None, log_y=False, barmode='group', textangle=0): + def __init__( + self, + data_frame, + x=None, + color=None, + title=None, + sort_by="Counts", + asc=False, + mode="Counts", + max_subsets=50, + subset_column=None, + subset_order=None, + subset_bgcolor="#C9C9C9", + subset_fgcolor="#000000", + category_orders=None, + color_discrete_sequence=None, + color_discrete_map=None, + log_y=False, + barmode="group", + textangle=0, + ): # Plot inputs and settings self.df = data_frame @@ -67,7 +121,9 @@ def __init__(self, data_frame, x=None, color=None, title=None, sort_by='Counts', self.sort_by = sort_by self.asc = asc self.mode = mode - self.max_subsets = max_subsets, + self.max_subsets = max_subsets + self.subset_column = subset_column + self.subset_order = subset_order self.subset_bgcolor = subset_bgcolor self.subset_fgcolor = subset_fgcolor self.category_orders = category_orders @@ -82,7 +138,7 @@ def __init__(self, data_frame, x=None, color=None, title=None, sort_by='Counts', # Figure-building specific attributes self.fig = go.Figure() self.intersect_counts = pd.DataFrame() - self.subset_col_names = [c for c in data_frame.columns if c != x and c != color] + self.subset_names = None self.switchboard_heights = [] # Validate inputs @@ -92,6 +148,22 @@ def __init__(self, data_frame, x=None, color=None, title=None, sort_by='Counts', self.test = None def make_upset_plot(self): + # If subset_column provided, expand into standard wider format + if self.subset_column is not None: + color_column = self.df[self.color] if self.color is not None else None + x_column = self.df[self.x] if self.x is not None else None + self.df, self.subset_names = _expand_subset_column( + self.df, self.subset_column, self.subset_order + ) + if 
self.color is not None: + self.df = pd.concat([self.df, color_column], axis=1) + if self.x is not None: + self.df = pd.concat([self.df, x_column], axis=1) + else: + self.subset_names = [ + c for c in self.df.columns if c != self.x and c != self.color + ] + # Create intersect_counts df depending on if color provided color = self.color df = self.df @@ -101,29 +173,47 @@ def make_upset_plot(self): sub_df = df[df[color] == c].drop(columns=[color]) if self.x is not None: # TODO: Check counting code for clustering by x value for distribution plots - new_df = sub_df.groupby(self.x).apply(lambda x: _transform_upset_data(x.drop(columns=['self.x']))) - new_df = new_df.reset_index()[[self.x, 'Intersections', 'Counts']] + new_df = sub_df.groupby(self.x).apply( + lambda x: _transform_upset_data(x.drop(columns=["self.x"])) + ) + new_df = new_df.reset_index()[[self.x, "Intersections", "Counts"]] else: new_df = _transform_upset_data(sub_df) # Sort subgroup in requested order - new_df = _sort_intersect_counts(new_df, sort_by=self.sort_by, asc=self.asc).reset_index(drop=True).reset_index() + new_df = ( + _sort_intersect_counts(new_df, sort_by=self.sort_by, asc=self.asc) + .reset_index(drop=True) + .reset_index() + ) new_df[color] = c self.intersect_counts = pd.concat([self.intersect_counts, new_df]) # TODO: Need to saturate each cluster with 0 value for subsets in one but not other... 
else: self.intersect_counts = _transform_upset_data(df) - self.intersect_counts = _sort_intersect_counts(self.intersect_counts, sort_by=self.sort_by, asc=self.asc) - self.intersect_counts = self.intersect_counts.reset_index(drop=True).reset_index() + self.intersect_counts = _sort_intersect_counts( + self.intersect_counts, sort_by=self.sort_by, asc=self.asc + ) + self.intersect_counts = self.intersect_counts.reset_index( + drop=True + ).reset_index() # Rescale for percents if requested mode = self.mode - if mode == 'Percent': + if mode == "Percent": if color is not None: denom = self.intersect_counts.groupby(color).sum().reset_index() - denom_dict = dict(zip(denom[color], denom['Counts'])) - self.intersect_counts['Counts'] = round(self.intersect_counts['Counts'] / self.intersect_counts[color].map(denom_dict), 2) + denom_dict = dict(zip(denom[color], denom["Counts"])) + self.intersect_counts["Counts"] = round( + self.intersect_counts["Counts"] + / self.intersect_counts[color].map(denom_dict), + 2, + ) else: - self.intersect_counts['Counts'] = round(self.intersect_counts['Counts'] / self.intersect_counts['Counts'].sum(), 2) + self.intersect_counts["Counts"] = round( + self.intersect_counts["Counts"] + / self.intersect_counts["Counts"].sum(), + 2, + ) # Create 3 main components for figure self.make_primary_plot() @@ -139,32 +229,45 @@ def validate_upset_inputs(self): # Check sorting inputs are valid sort_by = self.sort_by try: - assert (sort_by == 'Counts') or (sort_by == 'Intersections') + assert (sort_by == "Counts") or (sort_by == "Intersections") except AssertionError: - raise ValueError(f'Invalid input for "sort_by". Must be either "Counts" or "Intersections" but you provided {sort_by}') + raise ValueError( + f'Invalid input for "sort_by". 
Must be either "Counts" or "Intersections" but you provided {sort_by}' + ) # Check mode is either Counts or Percent mode = self.mode try: - assert (mode == 'Counts') or (mode == 'Percent') + assert (mode == "Counts") or (mode == "Percent") except AssertionError: - raise ValueError(f'Invalid input for "mode". Must be either "Counts" or "Percent" but you provided {mode}') + raise ValueError( + f'Invalid input for "mode". Must be either "Counts" or "Percent" but you provided {mode}' + ) def make_primary_plot(self): bar_args = { - 'color': self.color, - 'category_orders': self.category_orders, - 'color_discrete_sequence': self.color_discrete_sequence, - 'color_discrete_map': self.color_discrete_map, - 'barmode': self.barmode, - 'log_y': self.log_y + "color": self.color, + "category_orders": self.category_orders, + "color_discrete_sequence": self.color_discrete_sequence, + "color_discrete_map": self.color_discrete_map, + "barmode": self.barmode, + "log_y": self.log_y, } # TODO: Override default hover info for something more useful - self.fig = px.bar(self.intersect_counts, x='index', y='Counts', text='Counts', **bar_args) - self.fig.update_traces(textposition='outside', cliponaxis=False, textangle=self.textangle) - self.fig.update_layout(plot_bgcolor='#FFFFFF', xaxis_visible=False, xaxis_showticklabels=False, - yaxis_visible=False, yaxis_showticklabels=False) + self.fig = px.bar( + self.intersect_counts, x="index", y="Counts", text="Counts", **bar_args + ) + self.fig.update_traces( + textposition="outside", cliponaxis=False, textangle=self.textangle + ) + self.fig.update_layout( + plot_bgcolor="#FFFFFF", + xaxis_visible=False, + xaxis_showticklabels=False, + yaxis_visible=False, + yaxis_showticklabels=False, + ) def make_switchboard(self): """ @@ -172,28 +275,45 @@ def make_switchboard(self): Returns updated figure, and list of heights of dots for downstream convenience. 
""" # Compute coordinates for bg subset scatter points - d = len(self.subset_col_names) - num_bars = len(self.fig.data[0]['x']) - x_bg_scatter = np.repeat(self.fig.data[0]['x'], d) - y_scatter_offset = 0.2 # Offsetting ensures bars will hover just above the subset scatterplot - y_max = (1 + y_scatter_offset) * max([max(bar['y']) for bar in self.fig.data]) - self.switchboard_heights = [-y_max / d * i - y_scatter_offset * y_max for i in list(range(d))] + d = len(self.subset_names) + num_bars = len(self.fig.data[0]["x"]) + x_bg_scatter = np.repeat(self.fig.data[0]["x"], d) + y_scatter_offset = ( + 0.2 # Offsetting ensures bars will hover just above the subset scatterplot + ) + y_max = (1 + y_scatter_offset) * max([max(bar["y"]) for bar in self.fig.data]) + self.switchboard_heights = [ + -y_max / d * i - y_scatter_offset * y_max for i in list(range(d)) + ] y_bg_scatter = num_bars * self.switchboard_heights # Add bg subset scatter points to figure below bar chart - self.fig.add_trace(go.Scatter(x=x_bg_scatter, y=y_bg_scatter, mode='markers', showlegend=False, - marker=dict(size=16, color=self.subset_bgcolor))) - self.fig.update_layout(xaxis=dict(showgrid=False, zeroline=False), yaxis=dict(showgrid=True, zeroline=False), - margin=dict(t=40, l=150)) + self.fig.add_trace( + go.Scatter( + x=x_bg_scatter, + y=y_bg_scatter, + mode="markers", + showlegend=False, + marker=dict(size=16, color=self.subset_bgcolor), + ) + ) + self.fig.update_layout( + xaxis=dict(showgrid=False, zeroline=False), + yaxis=dict(showgrid=True, zeroline=False), + margin=dict(t=40, l=150), + ) # Compute list of intersections intersections = None if self.color is not None: # Pull out full list of possible intersection combinations from first color grouping - query = self.intersect_counts[self.color] == self.intersect_counts[self.color].iloc[0] - intersections = list(self.intersect_counts[query]['Intersections']) + query = ( + self.intersect_counts[self.color] + == self.intersect_counts[self.color].iloc[0] 
+ ) + intersections = list(self.intersect_counts[query]["Intersections"]) else: - intersections = list(self.intersect_counts['Intersections']) + intersections = list(self.intersect_counts["Intersections"]) # Then fill in subset markers with fg color x = 0 @@ -208,8 +328,15 @@ def make_switchboard(self): y += 1 x += 1 # TODO: Add hover information for subset/intersection description - self.fig.add_trace(go.Scatter(x=x_subsets, y=y_subsets, mode='markers+lines', showlegend=False, - marker=dict(size=16, color=self.subset_fgcolor, showscale=False))) + self.fig.add_trace( + go.Scatter( + x=x_subsets, + y=y_subsets, + mode="markers+lines", + showlegend=False, + marker=dict(size=16, color=self.subset_fgcolor, showscale=False), + ) + ) def make_margin_plot(self): """ @@ -217,44 +344,59 @@ def make_margin_plot(self): """ # Group and count according to color input color = self.color - counts_df = self.df.groupby(color).sum().reset_index() if color is not None else self.df.sum().reset_index().rename( - columns={'index': 'variable', 0: 'value'}) + counts_df = ( + self.df.groupby(color).sum().reset_index() + if color is not None + else self.df.sum() + .reset_index() + .rename(columns={"index": "variable", 0: "value"}) + ) bar_args = { - 'color': self.color, - 'category_orders': self.category_orders, - 'color_discrete_sequence': self.color_discrete_sequence, - 'color_discrete_map': self.color_discrete_map, - 'barmode': self.barmode, - 'log_y': self.log_y + "color": self.color, + "category_orders": self.category_orders, + "color_discrete_sequence": self.color_discrete_sequence, + "color_discrete_map": self.color_discrete_map, + "barmode": self.barmode, + "log_y": self.log_y, } # Create counts px.bar chart plot_df = counts_df.melt(id_vars=color) if color is not None else counts_df - if self.mode == 'Percent': + if self.mode == "Percent": if color is not None: denom = plot_df.groupby(color).sum().reset_index() - denom_dict = dict(zip(denom[color], denom['value'])) - 
plot_df['value'] = round(plot_df['value'] / plot_df[color].map(denom_dict), 2) + denom_dict = dict(zip(denom[color], denom["value"])) + plot_df["value"] = round( + plot_df["value"] / plot_df[color].map(denom_dict), 2 + ) else: - plot_df['value'] = round(plot_df['value'] / plot_df['value'].sum(), 2) - counts_bar = px.bar(plot_df, x='value', y='variable', orientation='h', text='value', **bar_args) - counts_bar.update_traces(textposition='outside', cliponaxis=False) + plot_df["value"] = round(plot_df["value"] / plot_df["value"].sum(), 2) + counts_bar = px.bar( + plot_df, x="value", y="variable", orientation="h", text="value", **bar_args + ) + counts_bar.update_traces(textposition="outside", cliponaxis=False) # TODO: Change hover info to be more useful # Add subset names as text into plot - subset_names = self.subset_col_names + subset_names = self.subset_names # subset_names = counts_bar.data[0]['y'] max_name_len = max([len(s) for s in subset_names]) annotation_center = -1 + -0.01 * max_name_len for i, s in enumerate(subset_names): - self.fig.add_annotation(x=annotation_center, y=self.switchboard_heights[i], text=s, showarrow=False, - font=dict(size=12, color='#000000'), align='left') + self.fig.add_annotation( + x=annotation_center, + y=self.switchboard_heights[i], + text=s, + showarrow=False, + font=dict(size=12, color="#000000"), + align="left", + ) # Reflect horizontally the bars while preserving labels; Shift heights to match input subset scatter heights for trace in counts_bar.data: - trace['x'] = -trace['x'] / max(trace['x']) - trace['y'] = self.switchboard_heights + trace["x"] = -trace["x"] / max(trace["x"]) + trace["y"] = self.switchboard_heights counts_bar.update_traces(base=annotation_center - 1, showlegend=False) # Add counts chart traces to main fig From cef0fb84417ef39d62bb0b047d02f52875a9f24e Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Wed, 10 May 2023 16:34:56 -0400 Subject: [PATCH 03/13] Padded 
intersection counts with zeros when color groups had some missing subsets. --- .../plotly/plotly/figure_factory/_upset.py | 69 +++++++++++-------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py index 2506320d3ac..cf8bc78a3e7 100644 --- a/packages/python/plotly/plotly/figure_factory/_upset.py +++ b/packages/python/plotly/plotly/figure_factory/_upset.py @@ -7,7 +7,6 @@ pd = optional_imports.get_module("pandas") np = optional_imports.get_module("numpy") - CHART_TYPES = ["bar", "box", "violin"] @@ -37,7 +36,7 @@ def create_upset( return upset_plot, plot_obj -def _expand_subset_column(df, subset_column, subset_order): +def _expand_subset_column(df, subset_column, subset_order=None): """ Takes a column of iterables and expands into binary columns representing inclusion. Also returns subset_names. """ @@ -166,30 +165,44 @@ def make_upset_plot(self): # Create intersect_counts df depending on if color provided color = self.color - df = self.df - if color is not None: - # TODO: Consider refactor using groupby instead of looping over colors - for c in df[color].unique(): - sub_df = df[df[color] == c].drop(columns=[color]) - if self.x is not None: - # TODO: Check counting code for clustering by x value for distribution plots - new_df = sub_df.groupby(self.x).apply( - lambda x: _transform_upset_data(x.drop(columns=["self.x"])) - ) - new_df = new_df.reset_index()[[self.x, "Intersections", "Counts"]] - else: - new_df = _transform_upset_data(sub_df) - # Sort subgroup in requested order - new_df = ( - _sort_intersect_counts(new_df, sort_by=self.sort_by, asc=self.asc) - .reset_index(drop=True) - .reset_index() + # TODO: Add grouping by x value input + if self.color is not None: + intersect_df = self.df.groupby(self.color).apply( + lambda df: _transform_upset_data( + df.drop(columns=[self.color]) + ).reset_index(drop=True) + ) + + # Fill in counts for 
subsets where count is zero for certain color groups + filled_df = ( + intersect_df.pivot_table( + index="Intersections", + columns=[self.color], + values="Counts", + fill_value=0, + ) + .unstack() + .reset_index() + .rename(columns={0: "Counts"}) + ) + + # Perform sorting within each color group + # WARNING: If sort_by="Counts" it will be ignored here since this won't make sense when using groups + self.intersect_counts = ( + filled_df.groupby(self.color) + .apply( + lambda df: _sort_intersect_counts( + df.drop(columns=[self.color]), + sort_by="Intersections", + asc=self.asc, + ).reset_index() ) - new_df[color] = c - self.intersect_counts = pd.concat([self.intersect_counts, new_df]) - # TODO: Need to saturate each cluster with 0 value for subsets in one but not other... + .reset_index() + .drop(columns=["index"]) + .rename(columns={"level_1": "index"}) + ) else: - self.intersect_counts = _transform_upset_data(df) + self.intersect_counts = _transform_upset_data(self.df) self.intersect_counts = _sort_intersect_counts( self.intersect_counts, sort_by=self.sort_by, asc=self.asc ) @@ -200,12 +213,12 @@ def make_upset_plot(self): # Rescale for percents if requested mode = self.mode if mode == "Percent": - if color is not None: - denom = self.intersect_counts.groupby(color).sum().reset_index() - denom_dict = dict(zip(denom[color], denom["Counts"])) + if self.color is not None: + denom = self.intersect_counts.groupby(self.color).sum().reset_index() + denom_dict = dict(zip(denom[self.color], denom["Counts"])) self.intersect_counts["Counts"] = round( self.intersect_counts["Counts"] - / self.intersect_counts[color].map(denom_dict), + / self.intersect_counts[self.color].map(denom_dict), 2, ) else: From df2531870506697d36e297dbbf28585b23d0d01b Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Wed, 10 May 2023 17:16:26 -0400 Subject: [PATCH 04/13] Added more useful hover data for switchboard. 
--- .../plotly/plotly/figure_factory/_upset.py | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py index cf8bc78a3e7..a41bd5babb8 100644 --- a/packages/python/plotly/plotly/figure_factory/_upset.py +++ b/packages/python/plotly/plotly/figure_factory/_upset.py @@ -120,6 +120,7 @@ def __init__( self.sort_by = sort_by self.asc = asc self.mode = mode + # TODO: Implement max_subsets in code self.max_subsets = max_subsets self.subset_column = subset_column self.subset_order = subset_order @@ -267,7 +268,6 @@ def make_primary_plot(self): "log_y": self.log_y, } - # TODO: Override default hover info for something more useful self.fig = px.bar( self.intersect_counts, x="index", y="Counts", text="Counts", **bar_args ) @@ -287,6 +287,18 @@ def make_switchboard(self): Method to add subset points to input fig px.bar chart in the style of UpSet plot. Returns updated figure, and list of heights of dots for downstream convenience. 
""" + # Compute list of intersections + intersections = None + if self.color is not None: + # Pull out full list of possible intersection combinations from first color grouping + query = ( + self.intersect_counts[self.color] + == self.intersect_counts[self.color].iloc[0] + ) + intersections = list(self.intersect_counts[query]["Intersections"]) + else: + intersections = list(self.intersect_counts["Intersections"]) + # Compute coordinates for bg subset scatter points d = len(self.subset_names) num_bars = len(self.fig.data[0]["x"]) @@ -301,6 +313,14 @@ def make_switchboard(self): y_bg_scatter = num_bars * self.switchboard_heights # Add bg subset scatter points to figure below bar chart + labels = np.repeat( + [ + "+".join([x for x, y in zip(self.subset_names, s) if y != 0]) + for s in intersections + ], + d, + ) + labels = ["None" if x == "" else x for x in labels] self.fig.add_trace( go.Scatter( x=x_bg_scatter, @@ -308,6 +328,8 @@ def make_switchboard(self): mode="markers", showlegend=False, marker=dict(size=16, color=self.subset_bgcolor), + text=labels, + hovertemplate="%{text}", ) ) self.fig.update_layout( @@ -316,18 +338,6 @@ def make_switchboard(self): margin=dict(t=40, l=150), ) - # Compute list of intersections - intersections = None - if self.color is not None: - # Pull out full list of possible intersection combinations from first color grouping - query = ( - self.intersect_counts[self.color] - == self.intersect_counts[self.color].iloc[0] - ) - intersections = list(self.intersect_counts[query]["Intersections"]) - else: - intersections = list(self.intersect_counts["Intersections"]) - # Then fill in subset markers with fg color x = 0 for s in intersections: @@ -340,7 +350,6 @@ def make_switchboard(self): y_subsets += [-y_max / d * y - y_scatter_offset * y_max] y += 1 x += 1 - # TODO: Add hover information for subset/intersection description self.fig.add_trace( go.Scatter( x=x_subsets, @@ -348,6 +357,9 @@ def make_switchboard(self): mode="markers+lines", 
showlegend=False, marker=dict(size=16, color=self.subset_fgcolor, showscale=False), + text=["+".join([x for x, y in zip(self.subset_names, s) if y != 0])] + * sum(s), + hovertemplate="%{text}", ) ) From a6f04a722c360d092f743b1cd46a90c56613db22 Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Thu, 11 May 2023 12:07:16 -0400 Subject: [PATCH 05/13] Refactored plot args, updated hovers, etc --- .../plotly/plotly/figure_factory/_upset.py | 103 ++++++++++++------ 1 file changed, 71 insertions(+), 32 deletions(-) diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py index a41bd5babb8..d22b3eb4a0b 100644 --- a/packages/python/plotly/plotly/figure_factory/_upset.py +++ b/packages/python/plotly/plotly/figure_factory/_upset.py @@ -7,7 +7,7 @@ pd = optional_imports.get_module("pandas") np = optional_imports.get_module("numpy") -CHART_TYPES = ["bar", "box", "violin"] +VALID_PLOT_TYPES = ["bar", "box", "violin"] def create_upset( @@ -18,7 +18,7 @@ def create_upset( sort_by="Counts", asc=False, mode="Counts", - max_subsets=50, + max_subsets=20, subset_column=None, subset_order=None, subset_bgcolor="#C9C9C9", @@ -30,6 +30,7 @@ def create_upset( barmode="group", textangle=0, ): + # TODO: Add docstring and webpage documentation plot_obj = _Upset(**locals()) upset_plot = plot_obj.make_upset_plot() # TODO: Create tests for plotter @@ -52,13 +53,6 @@ def _expand_subset_column(df, subset_column, subset_order=None): return new_df, subset_names -def _make_binary(t): - """ - Converts tuple of 0,1s to binary number. Used in _transform_upset_data for sort order. - """ - return sum([t[i] * 2**i for i in range(len(t))]) - - def _transform_upset_data(df): """ Takes raw data of binary vectors for set inclusion and produces counts over each. 
@@ -72,6 +66,13 @@ def _transform_upset_data(df): return intersect_counts +def _make_binary(t): + """ + Converts tuple of 0,1s to binary number. Used in _transform_upset_data for sort order. + """ + return sum([t[i] * 2**i for i in range(len(t))]) + + def _sort_intersect_counts(df, sort_by="Counts", asc=True): """ Takes output from _transform_upset_data and sorts by method requested. @@ -96,10 +97,11 @@ def __init__( x=None, color=None, title=None, + plot_type="bar", sort_by="Counts", asc=False, mode="Counts", - max_subsets=50, + max_subsets=20, subset_column=None, subset_order=None, subset_bgcolor="#C9C9C9", @@ -110,6 +112,11 @@ def __init__( log_y=False, barmode="group", textangle=0, + boxmode="group", + points="outliers", + notched=False, + violinmode="group", + box=False, ): # Plot inputs and settings @@ -117,6 +124,7 @@ def __init__( self.x = x self.color = color self.title = title + self.plot_type = plot_type self.sort_by = sort_by self.asc = asc self.mode = mode @@ -132,8 +140,37 @@ def __init__( self.log_y = log_y self.barmode = barmode self.textangle = textangle + self.boxmode = (boxmode,) + self.points = (points,) + self.notched = (notched,) + self.violinmode = (violinmode,) + self.box = box + + # Aggregate common plotting args + self.common_plot_args = { + "color": self.color, + "category_orders": self.category_orders, + "color_discrete_sequence": self.color_discrete_sequence, + "color_discrete_map": self.color_discrete_map, + "log_y": self.log_y, + } + + # Collect plot specific args + self.bar_args = { + "barmode": self.barmode, + } - # TODO: Refactor code for "common plot args" that can be reused for eventual box/violin plots + self.box_args = { + "boxmode": self.boxmode, + "points": self.points, + "notched": self.notched, + } + + self.violin_args = { + "violinmode": self.violinmode, + "box": self.box, + "points": self.points, + } # Figure-building specific attributes self.fig = go.Figure() @@ -164,6 +201,7 @@ def make_upset_plot(self): c for c in 
self.df.columns if c != self.x and c != self.color ] + self.test = self.df.copy() # Create intersect_counts df depending on if color provided color = self.color # TODO: Add grouping by x value input @@ -258,15 +296,17 @@ def validate_upset_inputs(self): f'Invalid input for "mode". Must be either "Counts" or "Percent" but you provided {mode}' ) + # Check plot_type is valid + plot_type = self.plot_type + try: + assert plot_type in VALID_PLOT_TYPES + except AssertionError: + raise ValueError( + f'Invalid input for "plot_type". Must be one of "bar", "box", or "violin" but you provided {plot_type}' + ) + def make_primary_plot(self): - bar_args = { - "color": self.color, - "category_orders": self.category_orders, - "color_discrete_sequence": self.color_discrete_sequence, - "color_discrete_map": self.color_discrete_map, - "barmode": self.barmode, - "log_y": self.log_y, - } + bar_args = {**self.common_plot_args, **self.bar_args} self.fig = px.bar( self.intersect_counts, x="index", y="Counts", text="Counts", **bar_args @@ -329,7 +369,7 @@ def make_switchboard(self): showlegend=False, marker=dict(size=16, color=self.subset_bgcolor), text=labels, - hovertemplate="%{text}", + hovertemplate="%{text}", ) ) self.fig.update_layout( @@ -359,7 +399,7 @@ def make_switchboard(self): marker=dict(size=16, color=self.subset_fgcolor, showscale=False), text=["+".join([x for x, y in zip(self.subset_names, s) if y != 0])] * sum(s), - hovertemplate="%{text}", + hovertemplate="%{text}", ) ) @@ -377,15 +417,6 @@ def make_margin_plot(self): .rename(columns={"index": "variable", 0: "value"}) ) - bar_args = { - "color": self.color, - "category_orders": self.category_orders, - "color_discrete_sequence": self.color_discrete_sequence, - "color_discrete_map": self.color_discrete_map, - "barmode": self.barmode, - "log_y": self.log_y, - } - # Create counts px.bar chart plot_df = counts_df.melt(id_vars=color) if color is not None else counts_df if self.mode == "Percent": @@ -397,11 +428,19 @@ def 
make_margin_plot(self): ) else: plot_df["value"] = round(plot_df["value"] / plot_df["value"].sum(), 2) + + hover_data = {"variable": False} + bar_args = {**self.common_plot_args, **self.bar_args} counts_bar = px.bar( - plot_df, x="value", y="variable", orientation="h", text="value", **bar_args + plot_df, + x="value", + y="variable", + orientation="h", + text="value", + hover_data=hover_data, + **bar_args, ) counts_bar.update_traces(textposition="outside", cliponaxis=False) - # TODO: Change hover info to be more useful # Add subset names as text into plot subset_names = self.subset_names From c654b807edc2aefb4cb40333f488f01a0a26e81f Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Thu, 11 May 2023 21:39:24 -0400 Subject: [PATCH 06/13] Added main docstring, fixed some functionality for grouping, etc --- .../plotly/plotly/figure_factory/_upset.py | 245 +++++++++++++----- 1 file changed, 175 insertions(+), 70 deletions(-) diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py index d22b3eb4a0b..76fa72aefd8 100644 --- a/packages/python/plotly/plotly/figure_factory/_upset.py +++ b/packages/python/plotly/plotly/figure_factory/_upset.py @@ -9,12 +9,13 @@ VALID_PLOT_TYPES = ["bar", "box", "violin"] - +# TODO: Add webpage documentation def create_upset( data_frame, x=None, color=None, title=None, + plot_type="bar", sort_by="Counts", asc=False, mode="Counts", @@ -27,10 +28,89 @@ def create_upset( color_discrete_sequence=None, color_discrete_map=None, log_y=False, + show_yaxis=False, barmode="group", textangle=0, + boxmode="group", + points="outliers", + notched=False, + violinmode="group", + box=False, ): - # TODO: Add docstring and webpage documentation + """ + Creates an UpSet plot, a scalable alternative to Venn diagrams. The interface supports a flexible range of use cases + input data formats. 
+ + :param (pandas.DataFrame) data_frame: a DataFrame either in wide format with subset/intersection inclusion data, or + with a column in condensed format; see the tutorial for more details + :param (str) x: (optional) column name in data_frame for data point labels, e.g. sample name to cluster intersection + observations by + :param (str) color: (optional) column name in data_frame for grouping intersection counts, similar to plotly.express + inputs + :param (str) title: (optional) title for plot + :param (str) plot_type: (default="bar") type of plot to visualize intersection count data; must be one of "bar", "box", or "violin"; + the latter two should only be used if x is provided, in which case they represent the distribution of intersection + counts (across color groups) + :param (str) sort_by: (default="Counts") order in which counts are displayed; must be one of "Counts" or "Intersections"; + ignored if color is provided + :param (bool) asc: (default=False) sort in ascending order + :param (str) mode: (default="Counts") how to represent counts; must be one of "Counts" or "Percent" + :param (int) max_subsets: (default=20) maximum number of intersection subsets to display + :param (str) subset_column: (optional) if data is formatted in condensed form, input column name here with that data; + do not use if data is already formatted in wide format + :param (list) subset_order: (optional) if subset_column is provided, use this list of entries to specify order of labels + :param (str) subset_bgcolor: (default="#C9C9C9") color for background dots on switchboard + :param (str) subset_fgcolor: (default="#000000") color for foreground dots on switchboard + :param (dict) category_orders: (optional) specify order for groups in color, as in plotly.express inputs + :param (list) color_discrete_sequence: (optional) list of colors to use for color input, as in plotly.express inputs + :param (dict) color_discrete_map: (optional) map of color categories to color, as in 
plotly.express inputs + :param (bool) log_y: (default=False) use logarithmic y scale + :param (bool) show_yaxis: (default=False) show y-axis tickmarks + :param (str) barmode: (default="group") argument passed to plotly.express.bar when selected for plotting + :param (int) textangle: (default=0) angle to use when displaying counts above bars in a bar chart + :param (str) boxmode: (default="group") argument passed to plotly.express.box when selected for plotting + :param (str) points: (default="outliers") argument passed to plotly.express.box when selected for plotting + :param (bool) notched: (default=False) argument passed to plotly.express.box when selected for plotting + :param (str) violinmode: (default="group") argument passed to plotly.express.violin when selected for plotting + :param (bool) box: (default=False) argument passed to plotly.express.violin when selected for plotting + + :rtype (plotly.graph_objects.Figure): returns UpSet plot rendered according to input settings. + + Example 1: Simple Counts + + >>> import plotly.express as px + >>> import plotly.figure_factory as ff + + >>> df = px.data.iris() + >>> # Create 4 subsets defined by qualitative "large" conditions + >>> df['SL'] = df['sepal_length'].apply(lambda x: int(x > 6)) + >>> df['SW'] = df['sepal_width'].apply(lambda x: int(x > 3)) + >>> df['PL'] = df['petal_length'].apply(lambda x: int(x > 3)) + >>> df['PW'] = df['petal_width'].apply(lambda x: int(x > 1)) + + >>> df = df[['species', 'SL', 'SW', 'PL', 'PW']] + >>> # Only use columns with inclusion in subset (0/1) values for this example + >>> fig = ff.create_upset(df.drop(columns=['species']), color_discrete_sequence=['#000000']) + >>> fig.show() + + Example 2: Counting by Group + + >>> # Continued from Example 1 + >>> fig = ff.create_upset(df, color='species', asc=True) + >>> fig.show() + + Example 3: Tracking Variance of Counts Across a Category + + >>> # Continued from Example 1 + >>> import numpy as np + + >>> np.random.seed(100) + >>> # 
Add a dummy variable for "day entry was observed" to track variation of subset counts across the days + >>> df['day'] = np.random.randint(0, 5, len(df)) + >>> fig = ff.create_upset(df.drop(columns=['species']), x='day', plot_type='box', show_yaxis=True) + >>> fig.update_layout(yaxis_side="right") + >>> fig.show() + """ plot_obj = _Upset(**locals()) upset_plot = plot_obj.make_upset_plot() # TODO: Create tests for plotter @@ -110,6 +190,7 @@ def __init__( color_discrete_sequence=None, color_discrete_map=None, log_y=False, + show_yaxis=False, barmode="group", textangle=0, boxmode="group", @@ -128,7 +209,6 @@ def __init__( self.sort_by = sort_by self.asc = asc self.mode = mode - # TODO: Implement max_subsets in code self.max_subsets = max_subsets self.subset_column = subset_column self.subset_order = subset_order @@ -138,12 +218,13 @@ def __init__( self.color_discrete_sequence = color_discrete_sequence self.color_discrete_map = color_discrete_map self.log_y = log_y + self.show_yaxis = show_yaxis self.barmode = barmode self.textangle = textangle - self.boxmode = (boxmode,) - self.points = (points,) - self.notched = (notched,) - self.violinmode = (violinmode,) + self.boxmode = boxmode + self.points = points + self.notched = notched + self.violinmode = violinmode self.box = box # Aggregate common plotting args @@ -201,22 +282,20 @@ def make_upset_plot(self): c for c in self.df.columns if c != self.x and c != self.color ] - self.test = self.df.copy() # Create intersect_counts df depending on if color provided - color = self.color - # TODO: Add grouping by x value input - if self.color is not None: - intersect_df = self.df.groupby(self.color).apply( - lambda df: _transform_upset_data( - df.drop(columns=[self.color]) - ).reset_index(drop=True) + groups = [x for x in [self.color, self.x] if x is not None] + if len(groups) > 0: + intersect_df = self.df.groupby(groups).apply( + lambda df: _transform_upset_data(df.drop(columns=groups)).reset_index( + drop=True + ) ) # Fill in 
counts for subsets where count is zero for certain color groups filled_df = ( intersect_df.pivot_table( index="Intersections", - columns=[self.color], + columns=groups, values="Counts", fill_value=0, ) @@ -228,18 +307,26 @@ def make_upset_plot(self): # Perform sorting within each color group # WARNING: If sort_by="Counts" it will be ignored here since this won't make sense when using groups self.intersect_counts = ( - filled_df.groupby(self.color) + filled_df.groupby(groups) .apply( lambda df: _sort_intersect_counts( - df.drop(columns=[self.color]), + df.drop(columns=groups), sort_by="Intersections", asc=self.asc, ).reset_index() ) .reset_index() .drop(columns=["index"]) - .rename(columns={"level_1": "index"}) + .rename( + columns={"level_1": "index", "level_2": "index"} + ) # Not sure how to tell the two apart... ) + + # Truncate subsets if necessary + self.intersect_counts = self.intersect_counts.groupby(groups).head( + self.max_subsets + ) + else: self.intersect_counts = _transform_upset_data(self.df) self.intersect_counts = _sort_intersect_counts( @@ -249,8 +336,11 @@ def make_upset_plot(self): drop=True ).reset_index() + self.intersect_counts = self.intersect_counts.head(self.max_subsets) + # Rescale for percents if requested mode = self.mode + # TODO: Check this input still works with all the subsetting changes... 
if mode == "Percent": if self.color is not None: denom = self.intersect_counts.groupby(self.color).sum().reset_index() @@ -306,20 +396,34 @@ def validate_upset_inputs(self): ) def make_primary_plot(self): - bar_args = {**self.common_plot_args, **self.bar_args} + plot_function = None + args = {} + update_traces = {} + + if self.plot_type == "bar": + plot_function = px.bar + args = {**self.common_plot_args, **self.bar_args, "text": "Counts"} + update_traces = { + "textposition": "outside", + "cliponaxis": False, + "textangle": self.textangle, + } + elif self.plot_type == "box": + plot_function = px.box + args = {**self.common_plot_args, **self.box_args} + elif self.plot_type == "violin": + plot_function = px.violin + args = {**self.common_plot_args, **self.violin_args} + + self.fig = plot_function(self.intersect_counts, x="index", y="Counts", **args) + self.fig.update_traces(**update_traces) - self.fig = px.bar( - self.intersect_counts, x="index", y="Counts", text="Counts", **bar_args - ) - self.fig.update_traces( - textposition="outside", cliponaxis=False, textangle=self.textangle - ) self.fig.update_layout( plot_bgcolor="#FFFFFF", xaxis_visible=False, xaxis_showticklabels=False, - yaxis_visible=False, - yaxis_showticklabels=False, + yaxis_visible=self.show_yaxis, + yaxis_showticklabels=self.show_yaxis, ) def make_switchboard(self): @@ -327,22 +431,13 @@ def make_switchboard(self): Method to add subset points to input fig px.bar chart in the style of UpSet plot. Returns updated figure, and list of heights of dots for downstream convenience. 
""" - # Compute list of intersections - intersections = None - if self.color is not None: - # Pull out full list of possible intersection combinations from first color grouping - query = ( - self.intersect_counts[self.color] - == self.intersect_counts[self.color].iloc[0] - ) - intersections = list(self.intersect_counts[query]["Intersections"]) - else: - intersections = list(self.intersect_counts["Intersections"]) + # Pull out full list of possible intersection combinations + intersections = list(self.intersect_counts["Intersections"].unique()) # Compute coordinates for bg subset scatter points d = len(self.subset_names) - num_bars = len(self.fig.data[0]["x"]) - x_bg_scatter = np.repeat(self.fig.data[0]["x"], d) + num_bars = len(intersections) + x_bg_scatter = np.repeat(range(num_bars), d) y_scatter_offset = ( 0.2 # Offsetting ensures bars will hover just above the subset scatterplot ) @@ -367,7 +462,7 @@ def make_switchboard(self): y=y_bg_scatter, mode="markers", showlegend=False, - marker=dict(size=16, color=self.subset_bgcolor), + marker=dict(size=16, color=self.subset_bgcolor, showscale=False), text=labels, hovertemplate="%{text}", ) @@ -375,7 +470,7 @@ def make_switchboard(self): self.fig.update_layout( xaxis=dict(showgrid=False, zeroline=False), yaxis=dict(showgrid=True, zeroline=False), - margin=dict(t=40, l=150), + margin=dict(t=0, l=0), ) # Then fill in subset markers with fg color @@ -407,18 +502,28 @@ def make_margin_plot(self): """ Method to add left margin count px.bar chart in style of UpSet plot. 
""" - # Group and count according to color input + # Group and count according to inputs color = self.color - counts_df = ( - self.df.groupby(color).sum().reset_index() - if color is not None - else self.df.sum() - .reset_index() - .rename(columns={"index": "variable", 0: "value"}) - ) + groups = [x for x in [self.color, self.x] if x is not None] + # if len(groups) > 0: + # counts_df = self.df.groupby(groups).sum().reset_index() + if self.color is not None: + counts_df = self.df.groupby(self.color).sum().reset_index() + if self.x is not None: + counts_df = counts_df.drop(columns=[self.x]) + else: + counts_df = ( + self.df.sum() + .reset_index() + .rename(columns={"index": "variable", 0: "value"}) + ) # Create counts px.bar chart - plot_df = counts_df.melt(id_vars=color) if color is not None else counts_df + plot_df = ( + counts_df.melt(id_vars=[self.color]) + if self.color is not None + else counts_df + ) if self.mode == "Percent": if color is not None: denom = plot_df.groupby(color).sum().reset_index() @@ -429,25 +534,24 @@ def make_margin_plot(self): else: plot_df["value"] = round(plot_df["value"] / plot_df["value"].sum(), 2) - hover_data = {"variable": False} - bar_args = {**self.common_plot_args, **self.bar_args} - counts_bar = px.bar( - plot_df, - x="value", - y="variable", - orientation="h", - text="value", - hover_data=hover_data, - **bar_args, + plot_function = px.bar + update_traces = {"textposition": "outside", "cliponaxis": False} + args = { + **self.common_plot_args, + **self.bar_args, + "text": "value", + "hover_data": {"variable": False}, + } + + counts_fig = plot_function( + plot_df, x="value", y="variable", orientation="h", **args ) - counts_bar.update_traces(textposition="outside", cliponaxis=False) + counts_fig.update_traces(**update_traces) # Add subset names as text into plot - subset_names = self.subset_names - # subset_names = counts_bar.data[0]['y'] - max_name_len = max([len(s) for s in subset_names]) + max_name_len = max([len(s) for s in 
self.subset_names]) annotation_center = -1 + -0.01 * max_name_len - for i, s in enumerate(subset_names): + for i, s in enumerate(self.subset_names): self.fig.add_annotation( x=annotation_center, y=self.switchboard_heights[i], @@ -458,11 +562,12 @@ def make_margin_plot(self): ) # Reflect horizontally the bars while preserving labels; Shift heights to match input subset scatter heights - for trace in counts_bar.data: + for trace in counts_fig.data: trace["x"] = -trace["x"] / max(trace["x"]) trace["y"] = self.switchboard_heights - counts_bar.update_traces(base=annotation_center - 1, showlegend=False) + # if self.plot_type == 'bar': + counts_fig.update_traces(base=annotation_center - 1, showlegend=False) # Add counts chart traces to main fig - for trace in counts_bar.data: + for trace in counts_fig.data: self.fig.add_trace(trace) From 537ec39c80c609227c7dbdcada211489d862fa94 Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Thu, 11 May 2023 22:26:32 -0400 Subject: [PATCH 07/13] Changed margins to fix title issue; added webdocs --- doc/python/upset-plots.md | 143 ++++++++++++++++++ .../plotly/plotly/figure_factory/_upset.py | 6 +- 2 files changed, 146 insertions(+), 3 deletions(-) create mode 100644 doc/python/upset-plots.md diff --git a/doc/python/upset-plots.md b/doc/python/upset-plots.md new file mode 100644 index 00000000000..a701f538850 --- /dev/null +++ b/doc/python/upset-plots.md @@ -0,0 +1,143 @@ +--- +jupyter: + jupytext: + notebook_metadata_filter: all + text_representation: + extension: .md + format_name: markdown + format_version: '1.1' + jupytext_version: 1.2.3 + kernelspec: + display_name: Python 3 + language: python + name: python3 + language_info: + codemirror_mode: + name: ipython + version: 3 + file_extension: .py + mimetype: text/x-python + name: python + nbconvert_exporter: python + pygments_lexer: ipython3 + version: 3.7.3 + plotly: + description: How to make an UpSet plot in Python, which can be used 
to display counts of + arbitrarily complex set intersections. + display_as: scientific + language: python + layout: base + name: UpSet Plots + order: 10 + permalink: python/upset-plots/ + thumbnail: thumbnail/quiver-plot.jpg +--- + +[UpSet plots](https://en.wikipedia.org/wiki/UpSet_Plot) allow you to visualize data that counts different intersections of +subsets inside a set. This could arise by actual intersections, or counting tag occurrences on data which need not be +disjoint. Data used in this method must be in one of two forms: wide or condensed. If the latter is provided, then the +data will be transformed into the wide format before proceeding with the plot generation. + +#### A Simple UpSet Plot +```python +import plotly.express as px +import plotly.figure_factory as ff + +df = px.data.iris() + +# Create categorical non-disjoint tags for "large" features +df['SL'] = df['sepal_length'].apply(lambda x: int(x > 6)) +df['SW'] = df['sepal_width'].apply(lambda x: int(x > 3)) +df['PL'] = df['petal_length'].apply(lambda x: int(x > 3)) +df['PW'] = df['petal_width'].apply(lambda x: int(x > 1)) + +df = df[['SL', 'SW', 'PL', 'PW']] # data in "wide" form +fig = ff.create_upset(df, color_discrete_sequence=['#000000']) +fig.show() +``` + + +#### Using Condensed Format + +Sometimes it's more convenient to have data where one column is given as a list of (possibly) overlapping tags that each data +point has. This can be thought of as dividing our dataset into a family of subsets, one for each tag. UpSet plots can help +analyze how different combinations of these tags are distributed in the data. + +As long as the entries in this column are a list/tuple, this method can handle the preprocessing step of getting the +data into the "wide" format like above. We simulate this below. 
+ +```python +import plotly.express as px +import plotly.figure_factory as ff + +df = px.data.iris() + +# Create categorical non-disjoint tags for "large" features +df['SL'] = df['sepal_length'].apply(lambda x: int(x > 6)) +df['SW'] = df['sepal_width'].apply(lambda x: int(x > 3)) +df['PL'] = df['petal_length'].apply(lambda x: int(x > 3)) +df['PW'] = df['petal_width'].apply(lambda x: int(x > 1)) + +# Simulate "tags" column +df['tags'] = df['sepal_length'].apply(lambda x: ['SL'] if x > 6 else ['']) + df['sepal_width'].apply(lambda x: ['SW'] if x > 3 else ['']) + \ + df['petal_length'].apply(lambda x: ['PL'] if x > 3 else ['']) + df['petal_width'].apply(lambda x: ['PW'] if x > 1 else ['']) +df['tags'] = df['tags'].apply(lambda x: [y for y in x if y != '']) + +# Note we can (optionally) choose the order for how the method unpacks the tags +fig = ff.create_upset(df, subset_column='tags', subset_order=['PW', 'SW', 'PL', 'SL'], color_discrete_sequence=['#000000']) +fig.show() +``` + +#### Grouping Data by Color + +This method supports two ways of grouping data to visualize counts of subset intersections. The first, shown here, +allows you to see how these counts vary by subset in parallel across categories described by another column. 
+ +```python +import plotly.express as px +import plotly.figure_factory as ff + +df = px.data.iris() + +# Create categorical non-disjoint tags for "large" features +df['SL'] = df['sepal_length'].apply(lambda x: int(x > 6)) +df['SW'] = df['sepal_width'].apply(lambda x: int(x > 3)) +df['PL'] = df['petal_length'].apply(lambda x: int(x > 3)) +df['PW'] = df['petal_width'].apply(lambda x: int(x > 1)) + +df = df[['species', 'SL', 'SW', 'PL', 'PW']] # data in "wide" form, with extra "species" column +# Note: ONLY the extra color column was kept, as rest of columns are inferred to make "wide" format subset data +fig = ff.create_upset(df, color='species', asc=True) # Can toggle in "asc" order +fig.show() +``` + +#### Visualizing Distributions of Counts by Subset + +The other way to group data is to provide a column that labels different clusters of observations. This +could be e.g. the day of the observation, different samples in biology, or any other way of dividing up the same +observations in different situations. This technique lets you see how the different subset counts vary across this +dimension. 
+ +```python +import plotly.express as px +import plotly.figure_factory as ff +import numpy as np + +df = px.data.iris() + +# Create categorical non-disjoint tags for "large" features +df['SL'] = df['sepal_length'].apply(lambda x: int(x > 6)) +df['SW'] = df['sepal_width'].apply(lambda x: int(x > 3)) +df['PL'] = df['petal_length'].apply(lambda x: int(x > 3)) +df['PW'] = df['petal_width'].apply(lambda x: int(x > 1)) +df = df[['SL', 'SW', 'PL', 'PW']] + +# Simulate random "day" of observation +np.random.seed(100) +df['day'] = np.random.randint(0, 5, len(df)) + +fig = ff.create_upset(df, x='day', plot_type='box', show_yaxis=True, title='Variation of Tags by Day') +fig.update_layout(yaxis_side="right") +fig.show() +``` + diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py index 76fa72aefd8..d9455836f49 100644 --- a/packages/python/plotly/plotly/figure_factory/_upset.py +++ b/packages/python/plotly/plotly/figure_factory/_upset.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from plotly import exceptions, optional_imports +from plotly import optional_imports import plotly.graph_objects as go import plotly.express as px @@ -9,7 +9,7 @@ VALID_PLOT_TYPES = ["bar", "box", "violin"] -# TODO: Add webpage documentation + def create_upset( data_frame, x=None, @@ -470,7 +470,7 @@ def make_switchboard(self): self.fig.update_layout( xaxis=dict(showgrid=False, zeroline=False), yaxis=dict(showgrid=True, zeroline=False), - margin=dict(t=0, l=0), + margin=dict(t=40, l=40), ) # Then fill in subset markers with fg color From 41f9c400889b6455e77b203d8323a12b91edf374 Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Thu, 11 May 2023 23:03:44 -0400 Subject: [PATCH 08/13] Added some simple tests and removed some debugging code --- .../plotly/plotly/figure_factory/_upset.py | 11 ++--- .../test_figure_factory.py | 49 +++++++++++++++++++ 2 files changed, 53 
insertions(+), 7 deletions(-) diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py index d9455836f49..0ffc279721e 100644 --- a/packages/python/plotly/plotly/figure_factory/_upset.py +++ b/packages/python/plotly/plotly/figure_factory/_upset.py @@ -113,8 +113,7 @@ def create_upset( """ plot_obj = _Upset(**locals()) upset_plot = plot_obj.make_upset_plot() - # TODO: Create tests for plotter - return upset_plot, plot_obj + return upset_plot def _expand_subset_column(df, subset_column, subset_order=None): @@ -124,7 +123,9 @@ def _expand_subset_column(df, subset_column, subset_order=None): subset_names = ( subset_order if subset_order is not None - else list(df[subset_column].explode().unique()) + else [ + x for x in df[subset_column].explode().unique() if not pd.isnull(x) + ] # Remove empty subset = NaN ) new_df = df.copy() for name in subset_names: @@ -262,9 +263,6 @@ def __init__( # Validate inputs self.validate_upset_inputs() - # DEBUG - self.test = None - def make_upset_plot(self): # If subset_column provided, expand into standard wider format if self.subset_column is not None: @@ -340,7 +338,6 @@ def make_upset_plot(self): # Rescale for percents if requested mode = self.mode - # TODO: Check this input still works with all the subsetting changes... 
if mode == "Percent": if self.color is not None: denom = self.intersect_counts.groupby(self.color).sum().reset_index() diff --git a/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py b/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py index 8783dce1ab4..49a9843d456 100644 --- a/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py +++ b/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py @@ -4516,3 +4516,52 @@ def test_build_dataframe(self): assert len(fig6.frames) == n_frames assert len(fig7.frames) == n_frames assert fig6.data[0].geojson == fig1.data[0].geojson + + +class TestUpset(TestCaseNoTemplate): + # Test compatibilities between using wide format input data vs condensed + def test_wide_vs_condensed(self): + np.random.seed(0) + + a = np.random.randint(0, 2, 1000) + b = np.random.randint(0, 2, 1000) + c = np.random.randint(0, 2, 1000) + color = np.random.randint(0, 3, 1000).astype(str) + + df = pd.DataFrame({"a": a, "b": b, "c": c, "color": color}) + fig1 = ff.create_upset(df.drop(columns=["color"])) + fig2 = ff.create_upset( + df.drop(columns=["color"]), sort_by="Intersections", asc=False + ) + fig3 = ff.create_upset(df, color="color") + + for tag in ["a", "b", "c"]: + df[tag] = df[tag].map({1: [tag], 0: [""]}) + + df["tags"] = df["a"] + df["b"] + df["c"] + df["tags"] = df["tags"].apply(lambda x: [y for y in x if y != ""]) + + fig4 = ff.create_upset( + df.drop(columns=["a", "b", "c", "color"]), subset_column="tags" + ) + fig5 = ff.create_upset( + df.drop(columns=["a", "b", "c"]), + subset_column="tags", + sort_by="Intersections", + asc=False, + ) + fig6 = ff.create_upset( + df.drop(columns=["a", "b", "c"]), subset_column="tags", color="color" + ) + + for data in zip(fig1.data, fig4.data): + self.assert_fig_equal(data[0], data[1]) + self.assert_fig_equal(fig1.layout, fig4.layout) + + for data in 
zip(fig2.data, fig5.data): + self.assert_fig_equal(data[0], data[1]) + self.assert_fig_equal(fig2.layout, fig5.layout) + + for data in zip(fig3.data, fig6.data): + self.assert_fig_equal(data[0], data[1]) + self.assert_fig_equal(fig3.layout, fig6.layout) From dbfab5df61d1859937af975bac9f5e3a32b91395 Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Fri, 12 May 2023 09:42:20 -0400 Subject: [PATCH 09/13] Fixed inheritance for test --- .../test_optional/test_figure_factory/test_figure_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py b/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py index 49a9843d456..c128f62a581 100644 --- a/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py +++ b/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py @@ -4518,7 +4518,7 @@ def test_build_dataframe(self): assert fig6.data[0].geojson == fig1.data[0].geojson -class TestUpset(TestCaseNoTemplate): +class TestUpset(NumpyTestUtilsMixin, TestCaseNoTemplate): # Test compatibilities between using wide format input data vs condensed def test_wide_vs_condensed(self): np.random.seed(0) From 1af712f2cdb270a20e9ca3f39785a1d6da604c95 Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Fri, 12 May 2023 10:01:38 -0400 Subject: [PATCH 10/13] Changed order for subset labels in test --- .../test_figure_factory/test_figure_factory.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py b/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py index c128f62a581..3a0ca92cd84 100644 ---
a/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py +++ b/packages/python/plotly/plotly/tests/test_optional/test_figure_factory/test_figure_factory.py @@ -4542,16 +4542,22 @@ def test_wide_vs_condensed(self): df["tags"] = df["tags"].apply(lambda x: [y for y in x if y != ""]) fig4 = ff.create_upset( - df.drop(columns=["a", "b", "c", "color"]), subset_column="tags" + df.drop(columns=["a", "b", "c", "color"]), + subset_column="tags", + subset_order=["a", "b", "c"], ) fig5 = ff.create_upset( df.drop(columns=["a", "b", "c"]), subset_column="tags", + subset_order=["a", "b", "c"], sort_by="Intersections", asc=False, ) fig6 = ff.create_upset( - df.drop(columns=["a", "b", "c"]), subset_column="tags", color="color" + df.drop(columns=["a", "b", "c"]), + subset_column="tags", + subset_order=["a", "b", "c"], + color="color", ) for data in zip(fig1.data, fig4.data): From 38f7c8f55036221674642c18098a2991507fcdaa Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Fri, 12 May 2023 11:19:44 -0400 Subject: [PATCH 11/13] Updated permalink in notebook doc --- doc/python/upset-plots.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/python/upset-plots.md b/doc/python/upset-plots.md index a701f538850..0b9b454a346 100644 --- a/doc/python/upset-plots.md +++ b/doc/python/upset-plots.md @@ -29,8 +29,7 @@ arbitrarily complex set intersections. 
layout: base name: UpSet Plots order: 10 - permalink: python/quiver-plots/ - thumbnail: thumbnail/quiver-plot.jpg + permalink: python/upset-plots/ --- [UpSet plots](https://en.wikipedia.org/wiki/UpSet_Plot) allow you to visualize data that counts different intersections From fd2420db455a2c289545bc891fc567da97cf962d Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Fri, 12 May 2023 12:13:59 -0400 Subject: [PATCH 12/13] Updated CHANGELOG --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 880774085a9..931648b5d46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ This project adheres to [Semantic Versioning](http://semver.org/). - Fixed another compatibility issue with Pandas 2.0, just affecting `px.*(line_close=True)` [[#4190](https://github.com/plotly/plotly.py/pull/4190)] - Added some rounding to the `make_subplots` function to handle situations where the user-input specs cause the domain to exceed 1 by small amounts https://github.com/plotly/plotly.py/pull/4153 +### Added + + - Added implementation of [UpSet plots](https://en.wikipedia.org/wiki/UpSet_Plot) in `plotly.figure_factory` via the `create_upset` function [[#4204](https://github.com/plotly/plotly.py/pull/4204)] + ## [5.14.1] - 2023-04-05 ### Fixed From 4e0f60f701defcea0ef16f3c56d9b5768911872d Mon Sep 17 00:00:00 2001 From: rickymagner <81349869+rickymagner@users.noreply.github.com> Date: Fri, 12 May 2023 15:35:16 -0400 Subject: [PATCH 13/13] Fixed some bugs in scaling/labeling margin plot --- .../python/plotly/plotly/figure_factory/_upset.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/packages/python/plotly/plotly/figure_factory/_upset.py b/packages/python/plotly/plotly/figure_factory/_upset.py index 0ffc279721e..8f02659e90f 100644 --- a/packages/python/plotly/plotly/figure_factory/_upset.py +++ b/packages/python/plotly/plotly/figure_factory/_upset.py @@
-304,6 +304,7 @@ def make_upset_plot(self): # Perform sorting within each color group # WARNING: If sort_by="Counts" it will be ignored here since this won't make sense when using groups + # TODO: Make sensible behavior for sort by "Counts" in this case self.intersect_counts = ( filled_df.groupby(groups) .apply( @@ -523,13 +524,18 @@ def make_margin_plot(self): ) if self.mode == "Percent": if color is not None: - denom = plot_df.groupby(color).sum().reset_index() + denom = ( + self.df.groupby(color) + .apply(lambda df: len(df)) + .reset_index() + .rename(columns={0: "value"}) + ) denom_dict = dict(zip(denom[color], denom["value"])) plot_df["value"] = round( plot_df["value"] / plot_df[color].map(denom_dict), 2 ) else: - plot_df["value"] = round(plot_df["value"] / plot_df["value"].sum(), 2) + plot_df["value"] = round(plot_df["value"] / len(self.df), 2) plot_function = px.bar update_traces = {"textposition": "outside", "cliponaxis": False} @@ -559,10 +565,10 @@ def make_margin_plot(self): ) # Reflect horizontally the bars while preserving labels; Shift heights to match input subset scatter heights + max_x = max([max(t["x"]) for t in counts_fig.data]) for trace in counts_fig.data: - trace["x"] = -trace["x"] / max(trace["x"]) + trace["x"] = -trace["x"] / max_x trace["y"] = self.switchboard_heights - # if self.plot_type == 'bar': counts_fig.update_traces(base=annotation_center - 1, showlegend=False) # Add counts chart traces to main fig