In [2]:
import tensorflow as tf
import numpy
import pandas as pd 
from tensorflow.keras import layers 
from tensorflow.keras.utils import to_categorical

from numpy import random

from sklearn import metrics
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

import cPickle as pickle


# Load the three tab-separated inputs in one pass:
#   data   <- 'raw.txt'            (feature matrix)
#   types  <- 'types-numeric.txt'  (numeric class id per row)
#   labels <- 'types-labels.txt'   (string class label per row)
data, types, labels = (
    pd.read_csv(path, sep='\t')
    for path in ('raw.txt', 'types-numeric.txt', 'types-labels.txt')
)


In [3]:
# Seed NumPy's global RNG (`random` is numpy.random here) so the split below
# is reproducible across runs.
random.seed(69)
# Boolean mask over the rows: True (-> training set) where a uniform draw
# in [0, 1) falls below 0.7, False (-> test set) otherwise.
ii = numpy.random.rand(len(data)) < 0.7

np_data = data.values
np_types = types.values
np_labels = labels.values

train = np_data[ii]
test = np_data[~ii]

# types = numbers assigned (0-16)
train_types = np_types[ii]
test_types = np_types[~ii]

# labels = string values assigned (kept for human-readable reporting)
train_labels = np_labels[ii]
test_labels = np_labels[~ii]

# ravel: flatten the per-row arrays to 1-D
r_train_types = train_types.ravel()
r_test_types = test_types.ravel()

r_train_labels = train_labels.ravel()
r_test_labels = test_labels.ravel()

# One-hot encode the numeric type ids for the keras model.
# The width is pinned to the 17 classes (ids 0-16) instead of being inferred
# from the largest id present: the split is random, so a partition that
# happens to lack the highest class would otherwise produce a narrower
# matrix that no longer matches the 17-unit softmax output layer.
NUM_CLASSES = 17
encoded_train = to_categorical(r_train_types, num_classes=NUM_CLASSES)
encoded_test = to_categorical(r_test_types, num_classes=NUM_CLASSES)


In [4]:
# Learning rate: 0.01 to 0.000001
# encoded labels are one-hot encoded
# Test labels are treated with ravel
learnloss = {}   # learning rate -> accuracy on the test set
histories = {}   # learning rate -> Keras per-epoch history dict
def learnLoss(learningRate, epochs, train, encoded_train, test, encoded_test, test_labels):
    """Train a fresh two-hidden-layer network and record how it performed.

    Builds a Sequential model (two 128-unit sigmoid layers followed by a
    17-unit softmax), trains it with RMSProp at ``learningRate`` and scores
    the class predictions on ``test``.

    Args:
        learningRate: step size handed to the RMSProp optimizer; also used as
            the key under which results are stored.
        epochs: number of passes over the training data.
        train: training feature matrix.
        encoded_train: one-hot encoded training targets.
        test: held-out feature matrix (also used as validation data).
        encoded_test: one-hot encoded targets for ``test``.
        test_labels: 1-D array of true class ids for ``test``; compared with
            the predicted class indices.

    Returns:
        The accuracy on ``test``.

    Side effects:
        Stores the accuracy in the module-level ``learnloss`` dict and the
        training history in ``histories``, both keyed by ``learningRate``.
        Prints the accuracy.
    """
    model = tf.keras.Sequential()
    model.add(layers.Dense(128, activation='sigmoid'))
    model.add(layers.Dense(128, activation='sigmoid'))
    model.add(layers.Dense(17, activation='softmax'))
    model.compile(optimizer=tf.train.RMSPropOptimizer(learningRate),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

    # fit() hands back the History callback, so there is no need to reach
    # into model.history afterwards.
    history = model.fit(train, encoded_train, validation_data=(test, encoded_test),
                        epochs=epochs, batch_size=32)

    # Score using only what the caller passed in. An earlier version also
    # built a confusion matrix from the notebook-global `test_types` and then
    # discarded it, which silently tied the function to global state.
    pred_y = model.predict_classes(test)
    accuracy = metrics.accuracy_score(test_labels, pred_y)
    print("Accuracy: ", accuracy)
    learnloss[learningRate] = accuracy
    histories[learningRate] = history.history
    return accuracy

In [5]:
# Accuracy notes from earlier runs (10 epochs unless stated otherwise):
#   lr 0.01    -> ~40%
#   lr 0.001   -> ~63%
#   lr 0.007   -> ~35%
#   lr 0.0009  -> ~68%
#   lr 0.0005  -> ~72%   (50 epochs)
#   lr 0.0002  -> ~73%   (50 epochs)
#   lr 0.00009 -> ~77%   (100 epochs)
#   lr 0.00009 -> ~78%   (200 epochs)

# Log-spaced candidates between 1e-2 and 1e-6; with num=1 only the first
# value (0.01) is generated, so this currently trains a single model.
learningRates = numpy.geomspace(start=1e-2, stop=1e-6, num=1)
print(learningRates)
for lr in learningRates:
    learnLoss(
        learningRate=lr,
        epochs=10,
        train=train,
        encoded_train=encoded_train,
        test=test,
        encoded_test=encoded_test,
        test_labels=r_test_types,
    )

[0.01]
Train on 5574 samples, validate on 2376 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
('Accuracy: ', 0.38173400673400676)


In [None]:
### CAREFUL
### Running this cell DUMPS the current results and replaces whatever was
### previously saved under `filename`.

filename = '500epoch50learn'
with open(filename, 'wb') as fp:
    # Two consecutive pickles in one file: accuracies first, histories second.
    pickle.dump(learnloss, fp)
    pickle.dump(histories, fp)

## code for load -- objects come back in the same order they were dumped
with open(filename, 'rb') as fp:
    learnloss, histories = pickle.load(fp), pickle.load(fp)

In [6]:
### Model debug

# Stand-alone copy of the network for interactive experimentation:
# two 128-unit sigmoid hidden layers, then a softmax with 17 output units
# (one for each class label). No input shape is declared here.
model = tf.keras.Sequential([
    layers.Dense(128, activation='sigmoid'),
    layers.Dense(128, activation='sigmoid'),
    layers.Dense(17, activation='softmax'),
])

# Other configurations that were sketched in this cell (not active):
#   - an initial Dense(64, input_shape=(2403,)) layer
#   - additional Dense(128) layers with sigmoid or relu activations
#   - a Dense(64, activation='sigmoid') layer
#   - AdamOptimizer(0.01) or RMSPropOptimizer(0.01), optionally with
#     tf.keras.losses.categorical_crossentropy and
#     tf.keras.metrics.categorical_accuracy passed explicitly

# Accuracy notes (10 epochs unless stated otherwise):
#   lr 0.01    -> ~40%
#   lr 0.001   -> ~63%
#   lr 0.007   -> ~35%
#   lr 0.0009  -> ~68%
#   lr 0.0005  -> ~72%   (50 epochs)
#   lr 0.0002  -> ~73%   (50 epochs)
#   lr 0.00009 -> ~77%   (100 epochs)
#   lr 0.00009 -> ~78%   (200 epochs)

# Configure training for categorical classification.
model.compile(
    optimizer=tf.train.RMSPropOptimizer(0.00009),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [7]:
# Train the debug model.
#   epochs     - how many times the whole training set is passed through
#   batch_size - number of samples propagated at once
# The test split doubles as validation data. The call is left as a bare
# expression so the cell still displays the returned History object.
model.fit(
    x=train,
    y=encoded_train,
    batch_size=32,
    epochs=10,
    validation_data=(test, encoded_test),
)


Train on 5574 samples, validate on 2376 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1075950d0>

In [8]:
# Predicted class index for every row of the test split.
nn_pred = model.predict_classes(test)

# Confusion matrix and overall accuracy against the numeric type ids.
nn_cm = confusion_matrix(y_true=test_types, y_pred=nn_pred)
print(nn_cm)
print("Accuracy: ", metrics.accuracy_score(y_true=r_test_types, y_pred=nn_pred))

# Same comparison as a labelled table: rows are the string labels of the true
# class, columns are the predicted class indices, with totals in the margins.
y_true = pd.Series(r_test_labels)
nn_pred = pd.Series(nn_pred)

pd.crosstab(
    index=y_true,
    columns=nn_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

[[139   0   2   0   0   0   0   0   1   0   0   0   0   0   2   0   0]
 [  0   0  38   0   0   0  16   0   1   0  32  20  16   6   3   0   0]
 [  0   0 297   0   0   0   1   0  41   2   8   3   0   3  18   0   0]
 [  0   0   2   0   0   0   0   0   0   4   6   0   0   0   0   0   0]
 [  2   0  24   0  69   0   0   0   0   0   9  19  17   0  14   0   0]
 [  0   0   8   0   1   0  11   0   0   1  25   6  13   1   0   0   0]
 [  0   0  26   0   0   0 105   0   0   1  12   4   6   3   0   0   0]
 [  0   0   1   0   0   0   0   0  20   0   0   0   0   3   6   0   0]
 [  0   0  49   0   0   0   0   0 127   0   0   1   0   1   0   0   0]
 [  0   0  46   0   1   0   0   0   0  47  11   2   0  12   0   0   1]
 [  3   0  70   0   2   0   2   0   0   0  57  11  14  11   3   0   0]
 [  0   0   8   0   1   0   0   0   1   0   0 155   4   1   0   0   0]
 [  0   0  25   0   7   0   4   0   0   4  27  29  48   1   1   0   0]
 [  1   0  36   0   0   0   1   0   2   3  21   8   1  96   0   0   0]
 [  0 

Predicted,0,2,4,6,8,9,10,11,12,13,14,16,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
blca,0,38,0,16,1,0,32,20,16,6,3,0,132
brca,0,297,0,1,41,2,8,3,0,3,18,0,373
chol,0,2,0,0,0,4,6,0,0,0,0,0,12
coad,2,24,69,0,0,0,9,19,17,0,14,0,154
esca,0,8,1,11,0,1,25,6,13,1,0,0,66
hnsc,0,26,0,105,0,1,12,4,6,3,0,0,157
kich,0,1,0,0,20,0,0,0,0,3,6,0,30
kirc,0,49,0,0,127,0,0,1,0,1,0,0,178
lich,0,46,1,0,0,47,11,2,0,12,0,1,120
luad,3,70,2,2,0,0,57,11,14,11,3,0,173
