WBCD Dataset Neural Network 

### Import Libraries

In [2]:
import mlrose

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import accuracy_score

### Import more Libraries


In [27]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# from sklearn.cross_validation import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

import warnings
warnings.filterwarnings('ignore')

### Load Dataset

In [5]:
RANDOM_SEED = 25


columns = ['Radius','Texture','Perimeter','Area','Smoothness','Compactness',
           'Concavity','Concave_Points','Symmetry','Fractal_Dimension',
           'Malignant/Benign']

# Read CSV file into pandas df
df = pd.read_csv('../datasets/breast_cancer/breast-cancer-wisconsin.csv',
                 delimiter=',', quotechar='"', names=columns)

### Dataframe without Preprocessing 

In [6]:
print("Printing dataframe head (without any preprocessing)....")
print(df.head(10))

Printing dataframe head (without any preprocessing)....
    Radius  Texture  Perimeter  Area  Smoothness  Compactness Concavity  \
0  1000025        5          1     1           1            2         1   
1  1002945        5          4     4           5            7        10   
2  1015425        3          1     1           1            2         2   
3  1016277        6          8     8           1            3         4   
4  1017023        4          1     1           3            2         1   
5  1017122        8         10    10           8            7        10   
6  1018099        1          1     1           1            2        10   
7  1018561        2          1     2           1            2         1   
8  1033078        2          1     1           1            2         1   
9  1033078        4          2     1           1            2         1   

   Concave_Points  Symmetry  Fractal_Dimension  Malignant/Benign  
0               3         1                  1     

### Dataset Preprocessing 

In [7]:
# Shuffle
df = shuffle(df, random_state=RANDOM_SEED)

# DROP USELESS ROWS AND COLUMNS
df.dropna(inplace=True)
cols = [0]
# Drop ID column (it's not attribute or target)
df.drop(df.columns[cols],axis=1,inplace=True)
# Drop all data points with missing variables  (denoted by '?' entry)
nostrings_row_list = [x.isdigit() for x in df.iloc[:,5]]
df = df[nostrings_row_list]


# Handle categorical data
# df = pd.get_dummies(df)


# Split data into X and y vectors
X = df.ix[:, df.columns != 'Malignant/Benign']
y = df['Malignant/Benign']

# Change 2 -> 0 (benign) and 4 -> 1 (malignant)
y.replace(2, 0, inplace=True)
y.replace(4, 1, inplace=True)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


### Sanity Check on Dataframe 

In [8]:
print("Sanity Check: Printing out dataframe and shape after preprocessing... ")
print(df.head(10))
print("df.shape: ", df.shape)


Sanity Check: Printing out dataframe and shape after preprocessing... 
     Texture  Perimeter  Area  Smoothness  Compactness Concavity  \
437        4          1     1           1            2         1   
511        5          1     1           1            2         1   
215        8          7     8           7            5         5   
684        1          1     1           1            2         1   
302       10         10    10           7            9        10   
341        1          1     1           1            2         1   
608        5         10    10          10           10        10   
366        6         10    10          10            8        10   
205        5         10    10           9            6        10   
270        8          4     7           1            3        10   

     Concave_Points  Symmetry  Fractal_Dimension  Malignant/Benign  
437               1         1                  1                 0  
511               2         1             

### Training and Testing Split, Scaling 

In [9]:
# Split into 30%  training data, 70% testing data
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.30, random_state=RANDOM_SEED)


# Apply scaling. Large values of certain features undesireable for NN
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


### Sanity Check on X_train

In [10]:
print("Sanity Check: Printing out X_train... ")
print(X_train[:4])
print("X_train.shape: ", X_train.shape)
print("X_test.shape: ", X_test.shape)
print("y_train.shape: ", y_train.shape)
print("y_test.shape: ", y_test.shape)


Sanity Check: Printing out X_train... 
[[ 0.21607913  0.26729122  0.89900862  1.44426357  2.63187492  0.94645923
   1.85978038  2.25742091 -0.34434076]
 [-0.1425522   1.54175711  1.22554822  2.47980054  0.356122    1.76653327
   1.45180653  0.66496637 -0.34434076]
 [ 2.00923577 -0.36994172 -0.4071498  -0.62681039 -0.55417916  0.67310122
  -0.99603658 -0.60899726  0.20067541]
 [-1.21844619 -0.68855819 -0.7336894  -0.62681039 -1.00932974 -0.69368885
  -0.99603658 -0.60899726 -0.34434076]]
X_train.shape:  (478, 9)
X_test.shape:  (205, 9)
y_train.shape:  (478,)
y_test.shape:  (205,)


### Test different learning rates (Insert Algo, Sigmoid)  

In [None]:
# Initialize neural network object and fit object - attempt 1

learning_rates = list()

for x in range(0.001, 1, 0.005):
    learning_rates.append(x)
            

# Create list to hold data on each trial
data = []
# Columns for df we'll create after loop 
cols = ["Learning Rate", "training accuracy", "testing accuracy", "training MSE", "testing MSE"]

# COMMENCE TRAINING LOOP 

np.random.seed(5)

for lr in learning_rates: 
    print("Training lr: ", lr)

    lr_nn_model_RHC_sigmoid = mlrose.NeuralNetwork(hidden_nodes = [10,10,10], activation = 'sigmoid', 
                                        algorithm = 'random_hill_climb', 
                                        max_iters = 1000, bias = True, is_classifier = True, 
                                        learning_rate = lr, early_stopping = True, 
                                        clip_max = 5, max_attempts = 100)

    lr_nn_model_RHC_sigmoid.fit(X_train, y_train)
    
    # Prediction based on y_train and y_test 
    y_train_pred = lr_nn_model_RHC_sigmoid.predict(X_train)
    y_test_pred = lr_nn_model_RHC_sigmoid.predict(X_test)
    
    # MSE Values 
    train_err = mean_squared_error(y_train,
                        y_train_pred)
    test_err = mean_squared_error(y_test,
                        y_test_pred) 

    # Accuracy Values
    y_train_accuracy = accuracy_score(y_train, y_train_pred)
    y_test_accuracy = accuracy_score(y_test, y_test_pred)
    
    x = architecture[0]
    y = architecture[1]
    z = architecture[2]
    
    # Append to data list 
    data.append([lr, y_train_accuracy, y_test_accuracy, train_err, test_err])
    
    
# Store results from above into df 

learning_rate_result_df = pd.DataFrame(data, columns=cols)
print(learning_rate_result_df)

# Save to csv 
learning_rate_result_df.to_csv("RHC_Sigmoid_LearningRate_data.csv")


### Test Different Architectures (RHC, Sigmoid)

In [None]:
# Initialize neural network object and fit object - attempt 1

hidden_node_architecture = list()

for x in range(5, 40, 5):
    for y in range(5, 40, 5):
        for z in range(5, 40, 5):
            hidden_node_architecture.append([x,y,z])
            
# hidden_node_architecture = [[5,5,5]]

# Create list to hold data on each trial
data = []
# Columns for df we'll create after loop 
cols = ["Nodes in each layer", "training accuracy", "testing accuracy", "training MSE", "testing MSE"]

# COMMENCE TRAINING LOOP 

np.random.seed(5)

for architecture in hidden_node_architecture: 
    print("Training architecture: ", architecture)

    lr_nn_model_RHC_sigmoid = mlrose.NeuralNetwork(hidden_nodes = architecture, activation = 'sigmoid', 
                                        algorithm = 'random_hill_climb', 
                                        max_iters = 1000, bias = True, is_classifier = True, 
                                        learning_rate = 0.01, early_stopping = True, 
                                        clip_max = 5, max_attempts = 100)

    lr_nn_model_RHC_sigmoid.fit(X_train, y_train)
    
    # Prediction based on y_train and y_test 
    y_train_pred = lr_nn_model_RHC_sigmoid.predict(X_train)
    y_test_pred = lr_nn_model_RHC_sigmoid.predict(X_test)
    
    # MSE Values 
    train_err = mean_squared_error(y_train,
                        y_train_pred)
    test_err = mean_squared_error(y_test,
                        y_test_pred) 

    # Accuracy Values
    y_train_accuracy = accuracy_score(y_train, y_train_pred)
    y_test_accuracy = accuracy_score(y_test, y_test_pred)
    
    x = architecture[0]
    y = architecture[1]
    z = architecture[2]
    
    architecture_tup = (x,y,z)
    
    data.append([architecture_tup, y_train_accuracy, y_test_accuracy, train_err, test_err])
    
    
# Store results from above into df 

result_df = pd.DataFrame(data, columns=cols)
print(result_df)
    


In [None]:
result_df.to_csv("RHC_Sigmoid_Architecture_data.csv")

In [None]:
data.sort(key=lambda x: x[2], reverse=True)
result_df_sorted = pd.DataFrame(data, columns=cols)
result_df_sorted.to_csv("RHC_Sigmoid_Architecture_data(SORTED).csv")


### Test Different Architectures (Gradient Descent, Sigmoid)


In [None]:
# Initialize neural network object and fit object - attempt 1

hidden_node_architecture = list()

for x in range(5, 40, 5):
    for y in range(5, 40, 5):
        for z in range(5, 40, 5):
            hidden_node_architecture.append([x,y,z])

# Create list to hold data on each trial
gd_data = []
# Columns for df we'll create after loop 
cols = ["Nodes in each layer", "training accuracy", "testing accuracy", "training MSE", "testing MSE"]

# COMMENCE TRAINING LOOP 

np.random.seed(5)

for architecture in hidden_node_architecture: 
    print("Training architecture: ", architecture)

    lr_nn_model_RHC_sigmoid = mlrose.NeuralNetwork(hidden_nodes = architecture, activation = 'sigmoid', 
                                        algorithm = 'gradient_descent', 
                                        max_iters = 1000, bias = True, is_classifier = True, 
                                        learning_rate = 0.01, early_stopping = True, 
                                        clip_max = 5, max_attempts = 100)

    lr_nn_model_RHC_sigmoid.fit(X_train, y_train)
    
    # Prediction based on y_train and y_test 
    y_train_pred = lr_nn_model_RHC_sigmoid.predict(X_train)
    y_test_pred = lr_nn_model_RHC_sigmoid.predict(X_test)
    
    # MSE Values 
    train_err = mean_squared_error(y_train,
                        y_train_pred)
    test_err = mean_squared_error(y_test,
                        y_test_pred) 

    # Accuracy Values
    y_train_accuracy = accuracy_score(y_train, y_train_pred)
    y_test_accuracy = accuracy_score(y_test, y_test_pred)
    
    x = architecture[0]
    y = architecture[1]
    z = architecture[2]
    
    architecture_tup = (x,y,z)
    
    gd_data.append([architecture_tup, y_train_accuracy, y_test_accuracy, train_err, test_err])
    
    
# Store results from above into df 

gradient_descent_result_df = pd.DataFrame(gd_data, columns=cols)
print(gradient_descent_result_df)
    


In [None]:
gd_data.sort(key=lambda x: x[2], reverse=True)
gd_result_df_sorted = pd.DataFrame(gd_data, columns=cols)
gd_result_df_sorted.to_csv("WBCD_GD_Sigmoid_Architecture_data(SORTED)_VERSION2.csv")

### Test Different Architectures (Simulated Annealing, Sigmoid)


In [None]:
# Initialize neural network object and fit object - attempt 1

hidden_node_architecture = list()

for x in range(5, 40, 5):
    for y in range(5, 40, 5):
        for z in range(5, 40, 5):
            hidden_node_architecture.append([x,y,z])
            
# hidden_node_architecture = [[5,5,5]]

# Create list to hold data on each trial
sm_data = []
# Columns for df we'll create after loop 
cols = ["Nodes in each layer", "training accuracy", "testing accuracy", "training MSE", "testing MSE"]

# COMMENCE TRAINING LOOP 

np.random.seed(5)

for architecture in hidden_node_architecture: 
    print("Training architecture: ", architecture)

    lr_nn_model_RHC_sigmoid = mlrose.NeuralNetwork(hidden_nodes = architecture, activation = 'sigmoid', 
                                        algorithm = 'simulated_annealing', 
                                        max_iters = 1000, bias = True, is_classifier = True, 
                                        learning_rate = 0.01, early_stopping = True, 
                                        clip_max = 5, max_attempts = 100)

    lr_nn_model_RHC_sigmoid.fit(X_train, y_train)
    
    # Prediction based on y_train and y_test 
    y_train_pred = lr_nn_model_RHC_sigmoid.predict(X_train)
    y_test_pred = lr_nn_model_RHC_sigmoid.predict(X_test)
    
    # MSE Values 
    train_err = mean_squared_error(y_train,
                        y_train_pred)
    test_err = mean_squared_error(y_test,
                        y_test_pred) 

    # Accuracy Values
    y_train_accuracy = accuracy_score(y_train, y_train_pred)
    y_test_accuracy = accuracy_score(y_test, y_test_pred)
    
    x = architecture[0]
    y = architecture[1]
    z = architecture[2]
    
    architecture_tup = (x,y,z)
    
    sm_data.append([architecture_tup, y_train_accuracy, y_test_accuracy, train_err, test_err])
    
    
# Store results from above into df 

simulated_annealing_result_df = pd.DataFrame(sm_data, columns=cols)
print(simulated_annealing_result_df)
    


In [None]:
sm_data.sort(key=lambda x: x[2], reverse=True)
simulated_annealing_result_df = pd.DataFrame(sm_data, columns=cols)
simulated_annealing_result_df.to_csv("WBCD_SM_Sigmoid_Architecture_data.csv")

### Test Different Architectures (Genetic Algorithms, Sigmoid)


In [None]:
# Initialize neural network object and fit object - attempt 1

hidden_node_architecture = list()

for x in range(5, 40, 5):
    for y in range(5, 40, 5):
        for z in range(5, 40, 5):
            hidden_node_architecture.append([x,y,z])
            
# hidden_node_architecture = [[5,5,5]]

# Create list to hold data on each trial
ga_data = []
# Columns for df we'll create after loop 
cols = ["Nodes in each layer", "training accuracy", "testing accuracy", "training MSE", "testing MSE"]

# COMMENCE TRAINING LOOP 

np.random.seed(5)

for architecture in hidden_node_architecture: 
    print("Training architecture: ", architecture)

    lr_nn_model_GA_sigmoid = mlrose.NeuralNetwork(hidden_nodes = architecture, activation = 'sigmoid', 
                                        algorithm = 'genetic_alg', 
                                        max_iters = 1000, bias = True, is_classifier = True, 
                                        learning_rate = 0.01, early_stopping = True, 
                                        pop_size=100,
                                        clip_max = 5, max_attempts = 100)

    lr_nn_model_GA_sigmoid.fit(X_train, y_train)
    
    # Prediction based on y_train and y_test 
    y_train_pred = lr_nn_model_GA_sigmoid.predict(X_train)
    y_test_pred = lr_nn_model_GA_sigmoid.predict(X_test)
    
    # MSE Values 
    train_err = mean_squared_error(y_train,
                        y_train_pred)
    test_err = mean_squared_error(y_test,
                        y_test_pred) 

    # Accuracy Values
    y_train_accuracy = accuracy_score(y_train, y_train_pred)
    y_test_accuracy = accuracy_score(y_test, y_test_pred)
    
    x = architecture[0]
    y = architecture[1]
    z = architecture[2]
    
    architecture_tup = (x,y,z)
    
    ga_data.append([architecture_tup, y_train_accuracy, y_test_accuracy, train_err, test_err])
    
    
# Store results from above into df 

genetic_alg_result_df = pd.DataFrame(ga_data, columns=cols)
print(genetic_alg_result_df)
    


Training architecture:  [5, 5, 5]
Training architecture:  [5, 5, 10]
Training architecture:  [5, 5, 15]
Training architecture:  [5, 5, 20]
Training architecture:  [5, 5, 25]
Training architecture:  [5, 5, 30]
Training architecture:  [5, 5, 35]
Training architecture:  [5, 10, 5]
Training architecture:  [5, 10, 10]
Training architecture:  [5, 10, 15]
Training architecture:  [5, 10, 20]
Training architecture:  [5, 10, 25]
Training architecture:  [5, 10, 30]
Training architecture:  [5, 10, 35]
Training architecture:  [5, 15, 5]
Training architecture:  [5, 15, 10]
Training architecture:  [5, 15, 15]
Training architecture:  [5, 15, 20]
Training architecture:  [5, 15, 25]
Training architecture:  [5, 15, 30]
Training architecture:  [5, 15, 35]
Training architecture:  [5, 20, 5]
Training architecture:  [5, 20, 10]
Training architecture:  [5, 20, 15]
Training architecture:  [5, 20, 20]
Training architecture:  [5, 20, 25]
Training architecture:  [5, 20, 30]
Training architecture:  [5, 20, 35]
Tra

In [None]:
ga_data.sort(key=lambda x: x[2], reverse=True)
genetic_alg_result_df = pd.DataFrame(ga_data, columns=cols)
genetic_alg_result_df.to_csv("WBCD_GA_Sigmoid_Architecture_data.csv")

### Test Specific Architecture. Specify Everything 

In [48]:
# Initialize neural network object and fit object - attempt 1

hidden_node_architecture = list()

# for x in range(5, 40, 5):
#     for y in range(5, 40, 5):
#         for z in range(5, 40, 5):
#             hidden_node_architecture.append([x,y,z])
            
# hidden_node_architecture = [[10, 10, 10], [15,15,15], [20, 30, 30], [30, 30, 10], [30, 25, 20], 
#                             [5,20,20], [30,15,35]]

hidden_node_architecture = [[10, 10, 10], [13, 13, 13], [15, 15, 15] ]
# hidden_node_architecture = [[15, 15, 15] ]

# Create list to hold data on each trial
ga_data = []
# Columns for df we'll create after loop 
cols = ["Nodes in each layer", "training accuracy", "testing accuracy", "training MSE", "testing MSE"]

# COMMENCE TRAINING LOOP 

np.random.seed(5)

for architecture in hidden_node_architecture: 
    print("Training architecture: ", architecture)

    lr_nn_model_GA_sigmoid = mlrose.NeuralNetwork(hidden_nodes = architecture, activation = 'tanh', 
                                        algorithm = 'simulated_annealing', 
                                        max_iters = 1000, bias = True, is_classifier = True, 
                                        learning_rate = 0.01, early_stopping = True, 
                                        pop_size=100,
                                        clip_max = 5, max_attempts = 100)

    lr_nn_model_GA_sigmoid.fit(X_train, y_train)
    
    # Prediction based on y_train and y_test 
    y_train_pred = lr_nn_model_GA_sigmoid.predict(X_train)
    y_test_pred = lr_nn_model_GA_sigmoid.predict(X_test)
    
    # MSE Values 
    train_err = mean_squared_error(y_train,
                        y_train_pred)
    test_err = mean_squared_error(y_test,
                        y_test_pred) 

    # Accuracy Values
    y_train_accuracy = accuracy_score(y_train, y_train_pred)
    y_test_accuracy = accuracy_score(y_test, y_test_pred)
    
    x = architecture[0]
    y = architecture[1]
    z = architecture[2]
    
    architecture_tup = (x,y,z)
    
    ga_data.append([architecture_tup, y_train_accuracy, y_test_accuracy, train_err, test_err])
    
    
# Store results from above into df 

genetic_alg_result_df = pd.DataFrame(ga_data, columns=cols)
print(genetic_alg_result_df)
    


Training architecture:  [10, 10, 10]
Training architecture:  [13, 13, 13]
Training architecture:  [15, 15, 15]
  Nodes in each layer  training accuracy  testing accuracy  training MSE  \
0        (10, 10, 10)           0.822176          0.839024      0.177824   
1        (13, 13, 13)           0.165272          0.180488      0.834728   
2        (15, 15, 15)           0.755230          0.717073      0.244770   

   testing MSE  
0     0.160976  
1     0.819512  
2     0.282927  


In [22]:
ga_data.sort(key=lambda x: x[2], reverse=True)
genetic_alg_result_df = pd.DataFrame(ga_data, columns=cols)
genetic_alg_result_df.to_csv("WBCD_RHC_TEST_data.csv")