### Import Libraries

In [40]:
import numpy as np
import pandas as pd
import time 

import matplotlib.pyplot as plt

# from sklearn.cross_validation import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

from sklearn.cluster import KMeans
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
from scipy.stats import kurtosis

from textwrap import wrap

### Load Dataset

In [2]:
RANDOM_SEED = 25


columns = ['Radius','Texture','Perimeter','Area','Smoothness','Compactness',
           'Concavity','Concave_Points','Symmetry','Fractal_Dimension',
           'Malignant/Benign']

# Read CSV file into pandas df
df = pd.read_csv('../datasets/breast_cancer/breast-cancer-wisconsin.csv',
                 delimiter=',', quotechar='"', names=columns)

### Dataframe without Preprocessing 

In [4]:
print("Printing dataframe head (without any preprocessing)....")
print(df.head(10))

Printing dataframe head (without any preprocessing)....
    Radius  Texture  Perimeter  Area  Smoothness  Compactness Concavity  \
0  1000025        5          1     1           1            2         1   
1  1002945        5          4     4           5            7        10   
2  1015425        3          1     1           1            2         2   
3  1016277        6          8     8           1            3         4   
4  1017023        4          1     1           3            2         1   
5  1017122        8         10    10           8            7        10   
6  1018099        1          1     1           1            2        10   
7  1018561        2          1     2           1            2         1   
8  1033078        2          1     1           1            2         1   
9  1033078        4          2     1           1            2         1   

   Concave_Points  Symmetry  Fractal_Dimension  Malignant/Benign  
0               3         1                  1     

### Dataset Preprocessing 

In [5]:
# Shuffle
df = shuffle(df, random_state=RANDOM_SEED)

# DROP USELESS ROWS AND COLUMNS
df.dropna(inplace=True)
cols = [0]
# Drop ID column (it's not attribute or target)
df.drop(df.columns[cols],axis=1,inplace=True)
# Drop all data points with missing variables  (denoted by '?' entry)
nostrings_row_list = [x.isdigit() for x in df.iloc[:,5]]
df = df[nostrings_row_list]


# Handle categorical data
# df = pd.get_dummies(df)


# Split data into X and y vectors
X = df.ix[:, df.columns != 'Malignant/Benign']
y = df['Malignant/Benign']

# Change 2 -> 0 (benign) and 4 -> 1 (malignant)
y.replace(2, 0, inplace=True)
y.replace(4, 1, inplace=True)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


### Sanity Check on Dataframe 


In [6]:
print("Sanity Check: Printing out dataframe and shape after preprocessing... ")
print(df.head(10))
print("df.shape: ", df.shape)


Sanity Check: Printing out dataframe and shape after preprocessing... 
     Texture  Perimeter  Area  Smoothness  Compactness Concavity  \
437        4          1     1           1            2         1   
511        5          1     1           1            2         1   
215        8          7     8           7            5         5   
684        1          1     1           1            2         1   
302       10         10    10           7            9        10   
341        1          1     1           1            2         1   
608        5         10    10          10           10        10   
366        6         10    10          10            8        10   
205        5         10    10           9            6        10   
270        8          4     7           1            3        10   

     Concave_Points  Symmetry  Fractal_Dimension  Malignant/Benign  
437               1         1                  1                 0  
511               2         1             

### Training and Testing Split, Scaling 

In [None]:
# # Split into 30%  training data, 70% testing data
# X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                     test_size=0.30, random_state=RANDOM_SEED)


# # Apply scaling. Large values of certain features undesireable for NN
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

### Sanity Check on X_train

In [None]:
# print("Sanity Check: Printing out X_train... ")
# print(X_train[:4])
# print("X_train.shape: ", X_train.shape)
# print("X_test.shape: ", X_test.shape)
# print("y_train.shape: ", y_train.shape)
# print("y_test.shape: ", y_test.shape)

### Hyperparameters 

In [14]:
n_comp = 4

In [11]:
print(X)

     Texture  Perimeter  Area  Smoothness  Compactness Concavity  \
437        4          1     1           1            2         1   
511        5          1     1           1            2         1   
215        8          7     8           7            5         5   
684        1          1     1           1            2         1   
302       10         10    10           7            9        10   
341        1          1     1           1            2         1   
608        5         10    10          10           10        10   
366        6         10    10          10            8        10   
205        5         10    10           9            6        10   
270        8          4     7           1            3        10   
586        8         10    10          10            6        10   
264        7          9     4          10           10         3   
554        3          1     1           1            2         1   
509        2          1     1           1       

### Run PCA

In [31]:
pca = PCA(n_components=n_comp)
X_dim_reduced = pca.fit(X).transform(X)
print(pca.explained_variance_ratio_)

print("Printing X_dim_reduced...")
print(X_dim_reduced)


[0.69050756 0.07195066 0.06055921 0.04442012]
Printing X_dim_reduced...
[[-5.36299426e+00  8.05705204e-02 -1.11124737e+00  1.06752071e-01]
 [-4.77418017e+00  1.55436119e-02 -1.84084424e+00  1.12592753e-01]
 [ 9.98768241e+00  3.65294376e+00 -1.66526630e-01 -1.54485169e+00]
 ...
 [-5.06625849e+00  7.06387671e-03 -1.96325633e+00  1.53815806e-01]
 [-4.91982546e+00  9.18695072e-01  1.78457077e+00  4.49582730e-01]
 [ 1.23875561e+01 -7.39214050e-01  3.54122346e+00  2.71633788e+00]]


In [28]:
fitter = GaussianMixture(n_components=2,covariance_type='full',n_init=10,max_iter=200).fit(X)
print(fitter.lower_bound_)
print(X_dim_reduced.shape)

-6.901316640162848
(683, 4)


### Run ICA

In [43]:
# Set n_comp for ICA portion of code 
n_comp = 7
algs = ['parallel','deflation']

# When you're running the real thing, iterate through these with a for loop 
alg = algs[0]
ica = FastICA(n_components=n_comp,whiten=False,algorithm=alg)
# kur0 = sum(kurtosis(X))
ica = ica.fit(X)
x_dimReduced_ICA = ica.transform(X)
kur1 = sum(kurtosis(x_dimReduced_ICA))
print(ica.components_)

[[-0.20826284 -0.08509828 -0.54112874 -0.21770731  0.15470086 -0.16251488
   0.01970548  0.42313939 -0.61594593]
 [ 0.67862971 -0.32251087 -0.27562235 -0.23881558 -0.0136758  -0.4604734
  -0.02277221 -0.29403922  0.05698658]
 [ 0.22004797  0.10357374  0.12929481 -0.65028219 -0.1613688   0.29685289
   0.50083843  0.335072    0.15489801]
 [-0.00788009 -0.29758725 -0.2651934  -0.11453505 -0.34240802  0.67097776
  -0.16981883 -0.41807914 -0.23843427]
 [ 0.48294998  0.22690557  0.03724099  0.47199937 -0.52854154  0.04301909
   0.00997507  0.34569428 -0.30048634]
 [-0.24631531 -0.65017436 -0.01686189  0.06347376 -0.40576129 -0.13069574
  -0.11897638  0.4152056   0.37949198]
 [-0.02193947  0.2082625   0.34878156 -0.47316296 -0.22556674 -0.16004757
  -0.71165382  0.0648583  -0.15316755]
 [-0.30036415 -0.12596234  0.35570647 -0.05939838 -0.3822313  -0.3718944
   0.44457196 -0.35159183 -0.39773402]
 [ 0.25098451 -0.50597106  0.54251307  0.07322412  0.4377419   0.20091707
  -0.03263887  0.1664539

