In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly
import plotly.graph_objs as go

%matplotlib inline

In [2]:
# Load the breast cancer dataset shipped with scikit-learn and wrap its
# feature matrix in a data frame with one named column per feature.
bunch = load_breast_cancer()
df = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# Standardise the features. The scaler is fitted on the full data frame and
# applied to that same frame in one step; the fitted scaler stays available
# as `scaler`.
scaler = StandardScaler()
preprocessed_data = scaler.fit_transform(df)
preprocessed_data

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ..., 
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

In [4]:
# Decompose data (PCA): project the standardised features onto the first
# three principal components.
# svd_solver='full' requests the exact SVD. The default 'auto' policy can
# select the randomized solver for an input of this shape (depending on the
# scikit-learn version), which would make the projection depend on an
# unseeded random state. With only 30 features the exact solver is cheap.
pca = PCA(n_components=3, svd_solver='full')
decomposed_data = pca.fit_transform(preprocessed_data)
decomposed_data

array([[  9.19283683,   1.94858305,  -1.12316549],
       [  2.38780179,  -3.76817178,  -0.52929154],
       [  5.73389628,  -1.07517379,  -0.5517477 ],
       ..., 
       [  1.25617928,  -1.9022967 ,   0.56273   ],
       [ 10.37479406,   1.67201012,  -1.87702979],
       [ -5.4752433 ,  -0.67063681,   1.49044382]])

In [5]:
# Report the share of variance captured by each principal component,
# followed by the total share captured by all of them together.
explained_ratios = pca.explained_variance_ratio_
print(explained_ratios, sum(explained_ratios))

[ 0.44272026  0.18971182  0.09393163] 0.726363709089


In [6]:
# Tabulate the fitted components: one row per principal component, one
# column per original feature.
comp_df = pd.DataFrame(data=pca.components_, columns=bunch.feature_names)
comp_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,0.218902,0.103725,0.227537,0.220995,0.14259,0.239285,0.2584,0.260854,0.138167,0.064363,...,0.227997,0.104469,0.23664,0.224871,0.127953,0.210096,0.228768,0.250886,0.122905,0.131784
1,-0.233857,-0.059706,-0.215181,-0.231077,0.186113,0.151892,0.060165,-0.034768,0.190349,0.366575,...,-0.219866,-0.045467,-0.199878,-0.219352,0.172304,0.143593,0.097964,-0.008257,0.141883,0.275339
2,-0.008531,0.06455,-0.009314,0.0287,-0.104292,-0.074092,0.002734,-0.025563,-0.04024,-0.022574,...,-0.047507,-0.042298,-0.048546,-0.011902,-0.259798,-0.236076,-0.173057,-0.170344,-0.271313,-0.232791


In [7]:
# Connect notebook and download JS files.
# Puts plotly into offline notebook mode; intended to run before the
# plotly.offline.iplot() calls further down.
# NOTE(review): connected=True presumably loads plotly.js over the network
# instead of embedding it in the notebook -- confirm against the plotly docs.
plotly.offline.init_notebook_mode(connected=True)

In [8]:
# Build the plotting frame in one expression: the three PCA coordinates as
# x / y / z plus a 0/1 `malignant` flag.
# The flag is the inverted dataset target; this assumes bunch.target encodes
# benign as 1 and malignant as 0 -- TODO confirm against the dataset docs.
decomposed_df = (
    pd.DataFrame(data=decomposed_data, columns=['x', 'y', 'z'])
    .assign(malignant=1 - bunch.target)
)
decomposed_df.head()

Unnamed: 0,x,y,z,malignant
0,9.192837,1.948583,-1.123165,1
1,2.387802,-3.768172,-0.529292,1
2,5.733896,-1.075174,-0.551748,1
3,7.122953,10.275589,-3.23279,1
4,3.935302,-1.948072,1.389767,1


In [9]:
# Split the projected samples into two frames by diagnosis flag.
malignant = decomposed_df.loc[decomposed_df['malignant'] == 1]
benign = decomposed_df.loc[decomposed_df['malignant'] == 0]

In [10]:
# Marker outline shared by both traces: thin, almost transparent black.
line_style = dict(color='rgba(0, 0, 0, 0.14)', width=0.5)


def _make_scatter(points, color, name):
    """Build the 3D marker trace for one diagnosis group.

    points -- data frame with 'x', 'y' and 'z' columns (PCA coordinates).
    color  -- marker fill colour as a plotly colour string.
    name   -- legend label of the trace.

    Returns a go.Scatter3d trace; both groups share size, opacity and
    outline so that only colour and label distinguish them.
    """
    return go.Scatter3d(
        x=points['x'],
        y=points['y'],
        z=points['z'],
        mode='markers',
        marker=dict(color=color, size=12, opacity=0.8, line=line_style),
        name=name
    )


# Create scatters.
malignant_scatter = _make_scatter(malignant, 'rgb(181, 20, 37)', 'Malignant')
benign_scatter = _make_scatter(benign, 'rgb(5, 99, 226)', 'Benign')

# Create data array. The malignant trace is listed last with the intent that
# it is rendered above the benign one (TODO: can we merge layers somehow?).
data = [benign_scatter, malignant_scatter]

# Create layout. The x / y / z columns hold the first, second and third
# principal component, so label the axes accordingly instead of leaving the
# default x / y / z titles.
layout = go.Layout(
    showlegend=True,
    margin=dict(l=0, r=0, b=0, t=0),
    scene=dict(
        xaxis=dict(title='PC 1'),
        yaxis=dict(title='PC 2'),
        zaxis=dict(title='PC 3')
    )
)

# Render (offline).
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='3d-scatter')

In [11]:
# Show every feature name in full; the data-frame previews above elide the
# middle columns with '...'.
bunch.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='<U23')

In [12]:
# NOTE(review): identical to the previous cell (same expression, same
# output); this looks like a leftover and is a candidate for removal.
bunch.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='<U23')

In [13]:
# Create heat map data: one row per principal component, one column per
# original feature, coloured by the component's weight for that feature.
# Row labels are derived from the number of fitted components so they stay
# in sync with the PCA configuration instead of being hard-coded.
component_labels = ['PC {}'.format(i + 1)
                    for i in range(pca.components_.shape[0])]
data = go.Heatmap(z=pca.components_,
                  x=bunch.feature_names,
                  y=component_labels,
                  colorscale='Viridis')

# Plot heatmap.
plotly.offline.iplot([data], filename='heatmap')