In [None]:
import os 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


# use this when creating PDF/ CDF
from scipy.stats import gaussian_kde

# Colours of matplotlib's default property cycle; the plotly figures below
# index into this list (cmap[0], cmap[1]) to colour the two classes.
prop_cycle = plt.rcParams['axes.prop_cycle']
cmap = prop_cycle.by_key()['color']

# Figure size (width, height) for all plots
FIGSIZE = (10, 6)

# Font sizes shared by every figure: base text, titles and hover labels
global_font_size = 30
title_font_size = 40
hoverlabel_font_size = 20



In [None]:
# Load bca_wisconsin.csv from the sibling data folder and preview the first rows.
# Note: despite its name, `data_dir` holds the path of the CSV file itself.
csv_path_parts = ('..', 'data', 'bca_wisconsin.csv')
data_dir = os.path.join(*csv_path_parts)
df = pd.read_csv(data_dir)
df.head()

In [None]:
# Create a plotly figure of Texture Mean, with one set of bars per diagnosis class
fig = px.histogram(df, x="texture_mean", color="diagnosis", labels={'texture_mean':'Texture Mean', 'count':'Number of Samples'})

# Modify both traces
fig.update_traces(opacity=0.8,
                  marker_line_width=1,
                  marker_line_color="black")

# Rename and recolour each class trace.
# Plotly Express names these traces "diagnosis=M" / "diagnosis=B" in older
# releases and just "M" / "B" in newer ones. An exact-name selector silently
# matches nothing when the naming scheme differs, so match on the class code
# that follows any "=" instead.
class_style = {"M": ("Malignant", cmap[0]),
               "B": ("Benign", cmap[1])}
for trace in fig.data:
    code = str(trace.name).rsplit("=", 1)[-1]
    if code in class_style:
        display_name, colour = class_style[code]
        trace.update(name=display_name, marker_color=colour)

# Update the bar mode so we can see one set of bars behind another
fig.update_layout(barmode="overlay",
                  yaxis_title_text="Number of Samples",
                  plot_bgcolor="#fff",
                  yaxis_showgrid=False,
                  autosize=True,
                  font_size=global_font_size,
                  title_font_size=title_font_size,
                  hoverlabel_font_size=hoverlabel_font_size)

# Write figure, but also show it to the screen
fig.write_html(os.path.join('..', 'plots', 'texture_mean.html'), auto_open=False, include_plotlyjs='cdn')
fig.show()

In [None]:
# Create a plotly figure of Radius Mean, with one set of bars per diagnosis class
fig = px.histogram(df, x="radius_mean", color="diagnosis", labels={'radius_mean':'Radius Mean'})

# Modify both traces
fig.update_traces(opacity=0.8,
                  marker_line_width=1,
                  marker_line_color="black")

# Rename and recolour each class trace.
# Plotly Express names these traces "diagnosis=M" / "diagnosis=B" in older
# releases and just "M" / "B" in newer ones. An exact-name selector silently
# matches nothing when the naming scheme differs, so match on the class code
# that follows any "=" instead.
class_style = {"M": ("Malignant", cmap[0]),
               "B": ("Benign", cmap[1])}
for trace in fig.data:
    code = str(trace.name).rsplit("=", 1)[-1]
    if code in class_style:
        display_name, colour = class_style[code]
        trace.update(name=display_name, marker_color=colour)

# Update the bar mode so we can see one set of bars behind another
fig.update_layout(barmode="overlay",
                  yaxis_title_text="Number of Samples",
                  plot_bgcolor="#fff",
                  yaxis_showgrid=False,
                  font_size=global_font_size,
                  title_font_size=title_font_size,
                  autosize=True,
                  hoverlabel_font_size=hoverlabel_font_size)

# Write figure, but also show it to the screen
fig.write_html(os.path.join('..', 'plots', 'radius_mean.html'), auto_open=False, include_plotlyjs='cdn')
fig.show()

In [None]:
# Split both features by diagnosis, then fit one Gaussian KDE per class on
# Radius Mean (the "good" feature) and evaluate each on a common grid.

# Row masks for the two diagnosis classes
is_malignant = df['diagnosis'] == 'M'
is_benign = df['diagnosis'] == 'B'

# Per-class feature values
xm_texmean = df.loc[is_malignant, 'texture_mean']
xb_texmean = df.loc[is_benign, 'texture_mean']

xm_radmean = df.loc[is_malignant, 'radius_mean']
xb_radmean = df.loc[is_benign, 'radius_mean']

# Support: observed Radius Mean range over both classes, padded by 5 each side
radmean_all = np.concatenate((np.array(xm_radmean), np.array(xb_radmean)))
x_min = np.min(radmean_all) - 5
x_max = np.max(radmean_all) + 5
# NOTE(review): x_min / x_max are not used by the grid below, which is fixed to [0, 35]
x = np.linspace(0, 35, 1000)

# Calculate the KDE of this dataset.
# A scalar bw_method is used by gaussian_kde as the factor that scales the
# sample standard deviation, so dividing by the class std makes the kernel
# standard deviation equal to `bandwidth` for both classes.
bandwidth = 1
std_m = xm_radmean.std(ddof=1)
std_b = xb_radmean.std(ddof=1)
kdem = gaussian_kde(xm_radmean, bw_method=bandwidth / std_m)
kdeb = gaussian_kde(xb_radmean, bw_method=bandwidth / std_b)

# Class densities evaluated on the grid
ym = kdem.evaluate(x)
yb = kdeb.evaluate(x)


In [None]:
# Two-panel figure for Radius Mean:
#   left  - per-class histograms and KDE densities p(x|class)
#   right - per-class probabilities P(class|x) obtained from those densities
# A single feature value (xloc) is highlighted in both panels.
figname = 'pdf_cdf.html'

# Figure line styles
linewidth = 5

# Feature value of interest
xloc = 16

# Class densities at xloc, evaluated once and reused below (arrays of length 1)
densm_xloc = kdem.evaluate(xloc)
densb_xloc = kdeb.evaluate(xloc)

# Combine into a conditional probability distribution: Bayes' rule with each
# class weighted by a fixed prior of 0.5
p_x  = ym*0.5 + yb*0.5
pm_x = ym*0.5 / p_x
pb_x = yb*0.5 / p_x

# The same conditional probabilities at the single feature value
ylocm = densm_xloc*0.5 / (densm_xloc*0.5 + densb_xloc*0.5)
ylocb = densb_xloc*0.5 / (densm_xloc*0.5 + densb_xloc*0.5)

# One record per class so that each kind of trace can be added in a loop.
# The order (malignant, then benign) fixes the drawing and legend order.
class_specs = [
    dict(name="Malignant", color=cmap[0], samples=xm_radmean,
         pdf=ym, pdf_xloc=densm_xloc, post=pm_x, post_xloc=ylocm),
    dict(name="Benign", color=cmap[1], samples=xb_radmean,
         pdf=yb, pdf_xloc=densb_xloc, post=pb_x, post_xloc=ylocb),
]

# Create a subplot to show PDF / CDF side-by-side
# NOTE(review): the right panel plots P(class|x) (see its y-axis label), which
# is a posterior probability rather than a cumulative distribution; the subplot
# title and output file name are kept unchanged here - confirm the wording.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Probability Density Function", "Cumulative Density Function"))

# add_trace(..., row=, col=) is used throughout; append_trace is deprecated.

## PDF ##
# Add the histograms for each class
for spec in class_specs:
    fig.add_trace(go.Histogram(
        x=spec["samples"],
        histnorm="probability density",
        marker_color=spec["color"],
        showlegend=False,
        name=spec["name"],
        opacity=0.4),
        row=1, col=1)

# Add the density plots
for spec in class_specs:
    fig.add_trace(go.Scatter(
        x=x,
        y=spec["pdf"],
        mode="lines",
        line=dict(
            color=spec["color"],
            width=linewidth
        ),
        name=spec["name"],
        showlegend=False),
        row=1, col=1)

# Add the line corresponding to the feature value of interest, reaching up to
# the larger of the two class densities
fig.add_trace(go.Scatter(
    x=[xloc, xloc],
    y=[0, np.max([densm_xloc, densb_xloc])],
    mode="lines",
    line=dict(
        color="#000",
        dash="dash",
        width=linewidth
    ),
    name="Single Value"),
    row=1, col=1)

# Add the intercepts with the density
for spec in class_specs:
    fig.add_trace(go.Scatter(
        x=[xloc],
        y=spec["pdf_xloc"],
        mode="markers",
        marker_color=spec["color"],
        marker_size=16,
        name=spec["name"] + " Probability",
        showlegend=False),
        row=1, col=1)

## CDF ##
# Plot the conditional probability lines (these carry the legend entries)
for spec in class_specs:
    fig.add_trace(go.Scatter(
        x=x,
        y=spec["post"],
        mode="lines",
        name=spec["name"],
        line=dict(
            color=spec["color"],
            width=linewidth
        )),
        row=1, col=2)

# Add the line for the particular feature value
fig.add_trace(go.Scatter(
    x=[xloc, xloc],
    y=[0, np.max([ylocm, ylocb])],
    mode="lines",
    line=dict(
        color="#000",
        dash="dash",
        width=linewidth
    ),
    name="Single Value",
    showlegend=False),
    row=1, col=2)

# Add the intercepts with the conditional probability curves
for spec in class_specs:
    fig.add_trace(go.Scatter(
        x=[xloc],
        y=[spec["post_xloc"][0]],
        mode="markers",
        marker_color=spec["color"],
        marker_size=16,
        name=spec["name"] + "<br> Probability",
        showlegend=False),
        row=1, col=2)

# Update the left figure
fig.update_xaxes(title_text=r'$x: \textrm{Feature Value}$', range=[5, 35], row=1, col=1)
fig.update_yaxes(title_text=r'$p(x|\omega_{j})$', gridcolor="#e4e4e4", showgrid=False, row=1, col=1)

# Update the right figure
fig.update_xaxes(title_text=r'$x: \textrm{Feature Value}$', range=[5, 35], row=1, col=2)
fig.update_yaxes(title_text=r'$P(\omega_{j}|x)$', gridcolor="#e4e4e4", showgrid=False, row=1, col=2)

fig.update_layout(barmode="overlay",
                  title="",
                  plot_bgcolor="#fff",
                  font_size=global_font_size,
                  title_font_size=title_font_size,
                  hoverlabel_font_size=hoverlabel_font_size)

# Hack to fix the subtitle font size: subplot titles are stored as layout
# annotations, so the title font size setting above does not reach them
for annotation in fig['layout']['annotations']:
    annotation['font']['size'] = title_font_size

# Write figure (MathJax is needed for the LaTeX axis titles), and show it
fig.write_html(os.path.join('..', 'plots', figname), auto_open=False, include_plotlyjs='cdn', include_mathjax='cdn')
fig.show()

In [None]:
# Two-Dimensional Plot: Texture Mean vs Radius Mean scatter, with per-class
# marginal histograms along the top (texture) and right (radius) edges
figname = 'scatter_histogram_plot.html'
fig = go.Figure()

# Pooled (all-class) feature columns; not used by the traces added below
x_texture = df['texture_mean']
x_radius = df['radius_mean']

# (label, colour, texture values, radius values) per class, in drawing order
class_groups = (
    ("Malignant", cmap[0], xm_texmean, xm_radmean),
    ("Benign", cmap[1], xb_texmean, xb_radmean),
)

# Add scatter points on the main axes
for label, colour, texture_vals, radius_vals in class_groups:
    points = go.Scatter(
        x=texture_vals,
        y=radius_vals,
        xaxis='x',
        yaxis='y',
        name=label,
        mode='markers',
        marker=dict(color=colour, size=10, opacity=0.5),
    )
    fig.add_trace(points)

# Right margin: Radius Mean histograms, drawn against the secondary x axis
for label, colour, texture_vals, radius_vals in class_groups:
    radius_bars = go.Histogram(
        y=radius_vals,
        xaxis='x2',
        marker=dict(color=colour, line=dict(width=1, color='#000')),
        name=label + " Radius",
        opacity=0.4,
    )
    fig.add_trace(radius_bars)

# Top margin: Texture Mean histograms, drawn against the secondary y axis
for label, colour, texture_vals, radius_vals in class_groups:
    texture_bars = go.Histogram(
        x=texture_vals,
        yaxis='y2',
        marker=dict(color=colour, line=dict(width=1, color='#000')),
        name=label + " Texture",
        opacity=0.4,
    )
    fig.add_trace(texture_bars)

# Axis settings: the main axes take the first 85% of each direction, the
# marginal histogram axes take the remaining 15%
main_axis = dict(zeroline=False, domain=[0, 0.85], showgrid=True, gridcolor="#e4e4e4")
margin_axis = dict(zeroline=False, domain=[0.85, 1], showgrid=False)

layout_options = dict(
    autosize=False,
    xaxis=dict(main_axis, title_text="Texture Mean"),
    yaxis=dict(main_axis, title_text="Radius Mean"),
    xaxis2=dict(margin_axis, title_text="Radius Mean Distribution"),
    yaxis2=dict(margin_axis, title_text="Texture Mean<br> Distribution"),
    margin=dict(l=350),
    height=700,
    width=1500,
    bargap=0,
    barmode='overlay',
    hovermode='closest',
    showlegend=False,
    plot_bgcolor='#fff',
    font_size=int(global_font_size*.5),
    title_font_size=title_font_size,
    hoverlabel_font_size=hoverlabel_font_size,
)
fig.update_layout(**layout_options)

# Write figure, but also show it to the screen
output_path = os.path.join('..', 'plots', figname)
fig.write_html(output_path, auto_open=False, include_plotlyjs='cdn', include_mathjax='cdn')
fig.show()

In [None]:
# Two-Dimensional Plot: Texture Mean vs Radius Mean scatter, coloured by class
figname = 'scatter_plot.html'
fig = go.Figure()

# Pooled (all-class) feature columns; not used by the traces added below
x_texture = df['texture_mean']
x_radius = df['radius_mean']

# (label, colour, texture values, radius values) per class, in drawing order
class_groups = (
    ("Malignant", cmap[0], xm_texmean, xm_radmean),
    ("Benign", cmap[1], xb_texmean, xb_radmean),
)

# Add scatter points
for label, colour, texture_vals, radius_vals in class_groups:
    points = go.Scatter(
        x=texture_vals,
        y=radius_vals,
        xaxis='x',
        yaxis='y',
        name=label,
        mode='markers',
        marker=dict(color=colour, size=10, opacity=0.5),
    )
    fig.add_trace(points)

# Both axes share the same grid styling and occupy the first 85% of the figure
main_axis = dict(zeroline=False, domain=[0, 0.85], showgrid=True, gridcolor="#e4e4e4")

layout_options = dict(
    autosize=False,
    xaxis=dict(main_axis, title_text="Texture Mean"),
    yaxis=dict(main_axis, title_text="Radius Mean"),
    margin=dict(t=0, l=500),
    height=700,
    width=1400,
    bargap=0,
    barmode='overlay',
    hovermode='closest',
    plot_bgcolor='#fff',
    font_size=int(global_font_size*.5),
    title_font_size=title_font_size,
    hoverlabel_font_size=hoverlabel_font_size,
)
fig.update_layout(**layout_options)

# Write figure, but also show it to the screen
output_path = os.path.join('..', 'plots', figname)
fig.write_html(output_path, auto_open=False, include_plotlyjs='cdn', include_mathjax='cdn')
fig.show()