### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import time 

import matplotlib.pyplot as plt

# from sklearn.cross_validation import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.random_projection import GaussianRandomProjection
from scipy.stats import kurtosis

from textwrap import wrap

### Load Dataset

In [2]:
RANDOM_SEED = 25


columns = ['Radius','Texture','Perimeter','Area','Smoothness','Compactness',
           'Concavity','Concave_Points','Symmetry','Fractal_Dimension',
           'Malignant/Benign']

# Read CSV file into pandas df
df = pd.read_csv('../datasets/breast_cancer/breast-cancer-wisconsin.csv',
                 delimiter=',', quotechar='"', names=columns)

### Dataframe without Preprocessing 

In [3]:
print("Printing dataframe head (without any preprocessing)....")
print(df.head(10))

Printing dataframe head (without any preprocessing)....
    Radius  Texture  Perimeter  Area  Smoothness  Compactness Concavity  \
0  1000025        5          1     1           1            2         1   
1  1002945        5          4     4           5            7        10   
2  1015425        3          1     1           1            2         2   
3  1016277        6          8     8           1            3         4   
4  1017023        4          1     1           3            2         1   
5  1017122        8         10    10           8            7        10   
6  1018099        1          1     1           1            2        10   
7  1018561        2          1     2           1            2         1   
8  1033078        2          1     1           1            2         1   
9  1033078        4          2     1           1            2         1   

   Concave_Points  Symmetry  Fractal_Dimension  Malignant/Benign  
0               3         1                  1     

### Dataset Preprocessing 

In [4]:
# Shuffle
df = shuffle(df, random_state=RANDOM_SEED)

# DROP USELESS ROWS AND COLUMNS
df.dropna(inplace=True)
cols = [0]
# Drop ID column (it's not attribute or target)
df.drop(df.columns[cols],axis=1,inplace=True)
# Drop all data points with missing variables  (denoted by '?' entry)
nostrings_row_list = [x.isdigit() for x in df.iloc[:,5]]
df = df[nostrings_row_list]


# Handle categorical data
# df = pd.get_dummies(df)


# Split data into X and y vectors
X = df.ix[:, df.columns != 'Malignant/Benign']
y = df['Malignant/Benign']

# Change 2 -> 0 (benign) and 4 -> 1 (malignant)
y.replace(2, 0, inplace=True)
y.replace(4, 1, inplace=True)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


### Sanity Check on Dataframe 


In [5]:
print("Sanity Check: Printing out dataframe and shape after preprocessing... ")
print(df.head(10))
print("df.shape: ", df.shape)


Sanity Check: Printing out dataframe and shape after preprocessing... 
     Texture  Perimeter  Area  Smoothness  Compactness Concavity  \
437        4          1     1           1            2         1   
511        5          1     1           1            2         1   
215        8          7     8           7            5         5   
684        1          1     1           1            2         1   
302       10         10    10           7            9        10   
341        1          1     1           1            2         1   
608        5         10    10          10           10        10   
366        6         10    10          10            8        10   
205        5         10    10           9            6        10   
270        8          4     7           1            3        10   

     Concave_Points  Symmetry  Fractal_Dimension  Malignant/Benign  
437               1         1                  1                 0  
511               2         1             

### Training and Testing Split, Scaling 

In [6]:
# # Split into 30%  training data, 70% testing data
# X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                     test_size=0.30, random_state=RANDOM_SEED)


# # Apply scaling. Large values of certain features undesireable for NN
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

### Sanity Check on X

In [6]:
print(X)

     Texture  Perimeter  Area  Smoothness  Compactness Concavity  \
437        4          1     1           1            2         1   
511        5          1     1           1            2         1   
215        8          7     8           7            5         5   
684        1          1     1           1            2         1   
302       10         10    10           7            9        10   
341        1          1     1           1            2         1   
608        5         10    10          10           10        10   
366        6         10    10          10            8        10   
205        5         10    10           9            6        10   
270        8          4     7           1            3        10   
586        8         10    10          10            6        10   
264        7          9     4          10           10         3   
554        3          1     1           1            2         1   
509        2          1     1           1       

### Analysis Helper Function 

In [64]:
def analyze_data_list(data_list): 
    arr = np.array(data_list)
    variance = np.var(arr)
    mean = np.mean(arr)
    
    return mean, variance 
    

### Clustering Without Dimensionality Reduction 

In [7]:
clf = KMeans(n_clusters=2, random_state=0)
clf.fit(X)
error = mean_squared_error(y, clf.predict(X))
print("Printing error (without Dim. Reduc.) ... ", error)

Printing error (without Dim. Reduc.) ...  0.03953147877013177


### Randomized Projections - Followed by Clustering 

In [68]:
# NOTE: Need to run this one multiple times, keep the best runs, since it's random 

iterations_per_n_comp = 10

error_list = list()

# Specify number of eignevectors to use for PCA 
# n_comp = 3


# pca = PCA(n_components=n_comp)
# X_dim_reduced = pca.fit(X).transform(X)
# print(pca.explained_variance_ratio_)

# print("Printing X_dim_reduced...")
# print(X_dim_reduced)


for i in range(iterations_per_n_comp): 


    rp = GaussianRandomProjection(n_components=n_comp)
    rp = rp.fit(X)
    X_dim_reduced = rp.transform(X)
    print(rp.components_)


    clf_dimReduced = KMeans(n_clusters=2, random_state=0)
    clf_dimReduced.fit(X_dim_reduced)
    error_dimReduced = mean_squared_error(y, clf_dimReduced.predict(X_dim_reduced))
#     print("Printing error_dimReduced ... ", error_dimReduced)
    
    error_list.append(error_dimReduced)

# Get mean and variance from helper function 
mean, var = analyze_data_list(error_list)

print("Mean: ", mean, "Variance: ", var)

[[-0.42643436 -0.74205514  0.78116886 -0.68258382 -0.74144219 -0.26909874
  -0.65571131 -0.85877011  0.84440656]
 [ 0.78972199  0.42986544  0.46096199  0.28910531  0.30397782 -0.19647131
  -0.00592606  0.37053697  0.56878386]
 [-0.48005542 -0.45890725  0.94084246  0.91607815  0.48496626  0.60216161
  -0.32645795 -0.60490595  0.14584035]]
[[-1.2826074  -0.30600844  0.51440669  0.19177812 -0.16143588 -0.30928614
   0.62831584  0.22866404 -1.36514773]
 [ 0.54388955  0.36112754 -0.12286517 -0.77350558 -1.57119535 -0.13892799
   0.12860376  0.3319936   0.64712882]
 [-0.2624797   0.17914617 -0.65369974  0.22158144  0.16429531  0.66736404
   0.39396168 -0.09101541 -1.02954322]]
[[ 1.34134957e-02  1.71811095e-01 -3.17397983e-01 -1.75111789e-01
   8.09154454e-02  6.48540588e-01  2.78596331e-01 -2.65994370e-01
  -5.62541139e-01]
 [-8.63015631e-01 -1.70047791e-01 -5.71502467e-01 -1.33069007e-01
   2.14822913e-04  6.29413487e-02 -7.45706075e-01 -1.60049780e-01
  -3.55174574e-01]
 [-6.80058103e-01 

NameError: name 'mean' is not defined