### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import time 

import matplotlib.pyplot as plt

# from sklearn.cross_validation import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

from sklearn.cluster import KMeans
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
from scipy.stats import kurtosis

from scipy.spatial.distance import cdist

from textwrap import wrap

### Load Dataset

In [2]:
RANDOM_SEED = 25


columns = ['Radius','Texture','Perimeter','Area','Smoothness','Compactness',
           'Concavity','Concave_Points','Symmetry','Fractal_Dimension',
           'Malignant/Benign']

# Read CSV file into pandas df
df = pd.read_csv('../datasets/breast_cancer/breast-cancer-wisconsin.csv',
                 delimiter=',', quotechar='"', names=columns)

### Dataframe without Preprocessing 

In [3]:
print("Printing dataframe head (without any preprocessing)....")
print(df.head(10))

Printing dataframe head (without any preprocessing)....
    Radius  Texture  Perimeter  Area  Smoothness  Compactness Concavity  \
0  1000025        5          1     1           1            2         1   
1  1002945        5          4     4           5            7        10   
2  1015425        3          1     1           1            2         2   
3  1016277        6          8     8           1            3         4   
4  1017023        4          1     1           3            2         1   
5  1017122        8         10    10           8            7        10   
6  1018099        1          1     1           1            2        10   
7  1018561        2          1     2           1            2         1   
8  1033078        2          1     1           1            2         1   
9  1033078        4          2     1           1            2         1   

   Concave_Points  Symmetry  Fractal_Dimension  Malignant/Benign  
0               3         1                  1     

### Dataset Preprocessing 

In [4]:
# Shuffle
df = shuffle(df, random_state=RANDOM_SEED)

# DROP USELESS ROWS AND COLUMNS
df.dropna(inplace=True)
cols = [0]
# Drop ID column (it's not attribute or target)
df.drop(df.columns[cols],axis=1,inplace=True)
# Drop all data points with missing variables  (denoted by '?' entry)
nostrings_row_list = [x.isdigit() for x in df.iloc[:,5]]
df = df[nostrings_row_list]


# Handle categorical data
# df = pd.get_dummies(df)


# Split data into X and y vectors
X = df.ix[:, df.columns != 'Malignant/Benign']
y = df['Malignant/Benign']

# Change 2 -> 0 (benign) and 4 -> 1 (malignant)
y.replace(2, 0, inplace=True)
y.replace(4, 1, inplace=True)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


### Sanity Check on Dataframe 


In [5]:
print("Sanity Check: Printing out dataframe and shape after preprocessing... ")
print(df.head(10))
print("df.shape: ", df.shape)


Sanity Check: Printing out dataframe and shape after preprocessing... 
     Texture  Perimeter  Area  Smoothness  Compactness Concavity  \
437        4          1     1           1            2         1   
511        5          1     1           1            2         1   
215        8          7     8           7            5         5   
684        1          1     1           1            2         1   
302       10         10    10           7            9        10   
341        1          1     1           1            2         1   
608        5         10    10          10           10        10   
366        6         10    10          10            8        10   
205        5         10    10           9            6        10   
270        8          4     7           1            3        10   

     Concave_Points  Symmetry  Fractal_Dimension  Malignant/Benign  
437               1         1                  1                 0  
511               2         1             

### Training and Testing Split, Scaling 

In [None]:
# # Split into 30%  training data, 70% testing data
# X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                     test_size=0.30, random_state=RANDOM_SEED)


# # Apply scaling. Large values of certain features undesireable for NN
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

### Sanity Check on X

In [None]:
print(X)

### ICA - (Followed by Clustering Later)

In [6]:
n_comp_max = 10 

n_comp_list = [x for x in range(1, n_comp_max)]

errorlist = []
errorlist_dimReduced = []
kurtosis_list = [] 

for n_comp in n_comp_list: 
    
    algs = ['parallel','deflation']
    alg = algs[0]
    ica = FastICA(n_components=n_comp,whiten=False,algorithm=alg)
#     kur0 = sum(kurtosis(X))
    ica = ica.fit(X)
    x_dimReduced_ICA = ica.transform(X)
    kurtosis(x_dimReduced_ICA)
    kur1 = sum(kurtosis(x_dimReduced_ICA))
#     print(ica.components_)
#     print("kur0: ", kur0)
    print("kur1: ", kur1)
    kurtosis_list.append(kur1)

    # Without ICA
    clf = KMeans(n_clusters=2, random_state=0)
    clf.fit(X)
    error = mean_squared_error(y, clf.predict(X))
    errorlist.append(error)
    print("Printing error without ICA ... ", error)
    
    # After PCA
    clf_dimReduced = KMeans(n_clusters=2, random_state=0)
    clf_dimReduced.fit(x_dimReduced_ICA)
    error_dimReduced = mean_squared_error(y, clf_dimReduced.predict(x_dimReduced_ICA))
    errorlist_dimReduced.append(error_dimReduced)
    print("Printing error_dimReduced ... ", error_dimReduced)
    
    



kur1:  7.0973433558415735
Printing error without ICA ...  0.03953147877013177
Printing error_dimReduced ...  0.03953147877013177
kur1:  7.195489326455472
Printing error without ICA ...  0.03953147877013177
Printing error_dimReduced ...  0.03953147877013177




kur1:  7.253598631374575
Printing error without ICA ...  0.03953147877013177
Printing error_dimReduced ...  0.03953147877013177
kur1:  8.13078943991839
Printing error without ICA ...  0.03953147877013177
Printing error_dimReduced ...  0.03953147877013177




kur1:  7.1311024960235105
Printing error without ICA ...  0.03953147877013177
Printing error_dimReduced ...  0.03953147877013177




kur1:  7.363833525221575
Printing error without ICA ...  0.03953147877013177
Printing error_dimReduced ...  0.03953147877013177
kur1:  7.896025802812462
Printing error without ICA ...  0.03953147877013177
Printing error_dimReduced ...  0.03953147877013177
kur1:  9.253116128395776




Printing error without ICA ...  0.03953147877013177
Printing error_dimReduced ...  0.03953147877013177
kur1:  7.88480094893176
Printing error without ICA ...  0.03953147877013177
Printing error_dimReduced ...  0.03953147877013177




In [7]:
print(min(errorlist))
print(min(errorlist_dimReduced))

0.03953147877013177
0.03953147877013177


### Clustering after ICA

In [None]:
# clf_dimReduced = KMeans(n_clusters=2, random_state=0)
# clf_dimReduced.fit(X_dim_reduced)
# error_dimReduced = mean_squared_error(y, clf_dimReduced.predict(X_dim_reduced))
# print("Printing error_dimReduced ... ", error_dimReduced)


# Hyperparameters 

# Vary this as needed 
init_method = "k-means++"
# Number of times to run algo with different centroid seeds 
n_init = 1
max_iter = 10 
# Runs each of the n_inits in parallel using specified number of threads
n_jobs = 1


# k means determine k
distortions = []
K = range(1,31)

for k in K:
    print("clustering after pca for k-value: ", k)
    kmeanModel = KMeans(n_clusters=k).fit(X_dim_reduced)
    kmeanModel.fit(X_dim_reduced)
    
    clf_kMeans = KMeans(n_clusters=k, random_state=0)
    clf_kMeans.fit(X)
    distortions.append(sum(np.min(cdist(X_dim_reduced, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])

    
# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing varying k on WBCD (after PCA)')
plt.xlabel('Number of cluster centers')
plt.ylabel('Sum of Squared Error')
plt.show()

### Compare clustering with k = 2 (Dim reduced k-means, vs just k-means)

In [None]:

# for n_comp in n_comp_list: 
#     n_comp = 9
    
#     ica = FastICA(n_components=n_comp,whiten=True,algorithm=alg)
# #     kur0 = sum(kurtosis(X))
#     ica = ica.fit(X)
#     x_dimReduced_ICA = ica.transform(X)
#     kur1 = sum(kurtosis(x_dimReduced_ICA))
# #     print(ica.components_)
# #     print("kur0: ", kur0)
#     print("kur1: ", kur1)

errorlist = []
errorlist_dimReduced = []

n_comp_list = range(1, 10)

algs = ['parallel','deflation']
alg = algs[0]


for n_comp in n_comp_list: 
    ica = FastICA(n_components=n_comp,whiten=True,algorithm=alg)
    ica = ica.fit(X)
    X_dim_reduced = ica.transform(X)

    # Without ICA
    clf = KMeans(n_clusters=2, random_state=0)
    clf.fit(X)
    error = mean_squared_error(y, clf.predict(X))
    errorlist.append(error)
    print("Printing error without PCA ... ", error)
    
    # After ICA
    clf_dimReduced = KMeans(n_clusters=2, random_state=0)
    clf_dimReduced.fit(X_dim_reduced)
    error_dimReduced = mean_squared_error(y, clf_dimReduced.predict(X_dim_reduced))
    errorlist_dimReduced.append(error_dimReduced)
    print("Printing error_dimReduced ... ", error_dimReduced)

In [None]:
print(min(errorlist))
print(min(errorlist_dimReduced))

In [None]:
# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing varying k on WBCD (after PCA)')
plt.xlabel('Number of cluster centers')
plt.ylabel('Sum of Squared Error')
plt.show()

### EM Clustering after PCA (when k = 2) 

In [None]:
errorlist = []
errorlist_dimReduced = []

for n_comp in range(1,10): 
    
    pca = PCA(n_components=n_comp)
    X_dim_reduced = pca.fit(X).transform(X)
    print(pca.explained_variance_ratio_)

    # Without PCA
    fitter = GaussianMixture(n_components=2,covariance_type='full',n_init=10,max_iter=200).fit(X)
    fitter.fit(X)
    error = mean_squared_error(y, fitter.predict(X))
    errorlist.append(error)
    print("Printing error without PCA ... ", error)
    
    # After PCA
    fitter_dimReduced= GaussianMixture(n_components=2,covariance_type='full',n_init=10,max_iter=200).fit(X)
    fitter_dimReduced.fit(X_dim_reduced)
    error_dimReduced = mean_squared_error(y, fitter_dimReduced.predict(X_dim_reduced))
    errorlist_dimReduced.append(error_dimReduced)
    print("Printing error_dimReduced ... ", error_dimReduced)

In [None]:
print(min(errorlist))
print(min(errorlist_dimReduced))