In [50]:
import os
import pandas as pd
import matplotlib.pylab as plt
# data cleaning and manipulation 
import numpy as np
import tensorflow as tf

In [140]:
# The data needs to be split into a training set and a test set
# To use 80/20, set the training size to .8
training_set_size_portion = .8
# Set to True to shuffle the data before you split into training and # test sets
do_shuffle = True
# Keep track of the accuracy score
accuracy_score = 0
# The DNN has hidden units, set the spec for them here
hidden_units_spec = [10,20,10]
n_classes_spec = 2
# Define the temp directory for keeping the model and checkpoints
tmp_dir_spec = "tmp/model"
# The number of training steps
steps_spec = 2000
# The number of epochs
epochs_spec = 15
# File Name
file_name = "clinical.csv"
#input variables
features = ['TISSUE_SOURCE_SITE','ANEUPLOIDY_SCORE']
# Here's the label that we want to predict
labels = ['PERSON_NEOPLASM_CANCER_STATUS']
# Here's the name we'll give our data


In [141]:
my_data = pd.read_csv(file_name, delimiter=',')
my_data.head()

Unnamed: 0,PATIENT_NAME,CANCER_TYPE_ACRONYM,AGE,DAYS_TO_BIRTH,PERSON_NEOPLASM_CANCER_STATUS,RACE,TISSUE_SOURCE_SITE,ANEUPLOIDY_SCORE,GRADE,IGF2,COL1A1,EEF1A1,CLU,H19,GAPDH,CD74,C3,RPL8,RPS18
0,TCGA-04-1348,1,44,16236,1,0,4,5,2,0.009529,0.069028,0.183659,0.028697,0.197605,0.449789,1.0,0.13119,0.145715,0.165588
1,TCGA-04-1357,1,52,18999,1,0,4,12,2,0.040637,0.144819,0.658017,0.501043,0.00417,0.359466,1.0,0.126835,0.359117,0.336082
2,TCGA-04-1362,1,59,21745,1,0,4,16,2,0.493489,0.050296,0.571312,0.086541,0.00064,0.30187,0.125584,0.068805,0.368181,0.304159
3,TCGA-04-1364,1,61,22294,1,0,4,31,2,1.0,0.01131,0.16435,0.012596,0.009681,0.050258,0.004711,0.01259,0.149953,0.119808
4,TCGA-04-1365,1,87,31925,1,0,4,9,2,0.019999,0.052526,0.334952,0.105748,1.0,0.483308,0.421103,0.073854,0.298796,0.221242


In [53]:
# If the doShuffle property is true, we will shuffle with this
# You really SHOULD shuffle to make sure that trends in data don't affect your learning
if do_shuffle:
    randomized_data = my_data.reindex(np.random.permutation(my_data.index))
else:
    randomized_data = my_data


Now that you have randomized data, you can split it. Earlier you specified a training size portion, so calculate how many records should be in the training set based on that, and the rest will be in the test set. This code gives you the size of each set

In [54]:
total_records = len(randomized_data)
training_set_size = int(total_records * training_set_size_portion)
test_set_size = total_records = training_set_size

and this code then splits your data into the training features and labels sets based on the size I specified for my training features and labels. We’ll take from the ‘head’ of the randomized_data.

In [55]:
# Build the training features and labels
training_features = randomized_data.head(training_set_size)[features].copy()
training_labels = randomized_data.head(training_set_size)[labels].copy()
print(training_features.head())
print(training_labels.head())
type(training_labels)

     TISSUE_SOURCE_SITE  ANEUPLOIDY_SCORE
182                  29                17
264                  61                10
24                   13                 7
86                   24                14
111                  24                12
     PERSON_NEOPLASM_CANCER_STATUS
182                              1
264                              0
24                               0
86                               1
111                              1


pandas.core.frame.DataFrame

So, similarly, the records at the ‘tail’ can contain our test set. We’ll take test_set_size amount of them:



In [56]:

testing_features = randomized_data.tail(test_set_size)[features].copy()
testing_labels = randomized_data.tail(test_set_size)[labels].copy()

# Create TensorFlow Feature Columns
The Neural Network classifier expects the feature columns to be specified as tf.feature_column types. As our columns are numbers, we set them to numeric_column types.

In [57]:
feature_columns = [tf.feature_column.numeric_column(key) for key in features]

# Define the Neural Network used to classify the data
Given that we have all our data, we can now create our neural network object that we’ll train on the data. This takes the feature columns that you just created as well as parameters defining the number of hidden units in the neural network, as well as the number of classes. As it trains the network, it saves temporary files and checkpoints as well as the finished model out to the specified model directory.

The hidden units are a direct specification of what the network looks like — so, for example our default here is [10, 20, 10], which means there’ll be a layer of 10 neurons, with each connected to 20 neurons in the next layer, each of which is connected to 10 neurons in the third layer.

The classes are the number of classes we are classifying to. In this case we’re doing if a person has cancer or not, as far as breast cancer goes and it has 2 classifications, so we will train on 2 classes.
give this code if you have classes in breast cancer data  ->     "n_classes=n_classes_spec",

In [144]:
classifier = tf.estimator.DNNClassifier(
    feature_columns=feature_columns, 
    hidden_units=hidden_units_spec,
    model_dir=tmp_dir_spec)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'tmp/model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11fa103c8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


# Train the network
The next step is to train the classifier using the data. to do this you build an input function that specifies the features (aka ‘x’) and the labels (aka ‘y’). This is done by specifiying it as a pandas_input_fn:# 

In [145]:
# Define the training input function
train_input_fn = tf.estimator.inputs.pandas_input_fn(x=training_features,
                                                     y=training_labels['PERSON_NEOPLASM_CANCER_STATUS'],
                                                     num_epochs=epochs_spec,shuffle=True)
    

# 
And now you can train the neural network by giving it the input function, and the number of steps you want to use to train it. Experiment with different step numbers to get different results. In my case 2000 steps

In [146]:
classifier.train(input_fn=train_input_fn, steps=steps_spec)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from tmp/model/model.ckpt-162
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 162 into tmp/model/model.ckpt.
INFO:tensorflow:loss = 65.57825, step = 163
INFO:tensorflow:Saving checkpoints for 189 into tmp/model/model.ckpt.
INFO:tensorflow:Loss for final step: 7.7222624.


<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x11fa10c18>

In [147]:
# Define the test input function
 test_input_fn = tf.estimator.inputs.pandas_input_fn(
                    x=testing_features, 
                    y=testing_labels['PERSON_NEOPLASM_CANCER_STATUS'], 
                    num_epochs=epochs_spec, 
                    shuffle=False)

Now, we can ask the classifier to tell evaluate the test input function, and tell us its accuracy. It goes through the test set, and compares its classifications to the actual values, and uses this to calculate how often it was right, giving us an accuracy score:

In [148]:
# Evaluate accuracy.
accuracy_score = classifier.evaluate(input_fn=test_input_fn)["accuracy"]
print("Accuracy = {}".format(accuracy_score))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-05-07:51:01
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from tmp/model/model.ckpt-189
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-05-07:51:01
INFO:tensorflow:Saving dict for global step 189: accuracy = 0.75784755, accuracy_baseline = 0.75784755, auc = 0.5258054, auc_precision_recall = 0.7684066, average_loss = 0.5576782, global_step = 189, label/mean = 0.75784755, loss = 69.09013, precision = 0.75784755, prediction/mean = 0.754101, recall = 1.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 189: tmp/model/model.ckpt-189
Accuracy = 0.7578475475311279


In [149]:
# Create a prediction set -- this is a list of input features that you want to classify
prediction_set = pd.DataFrame({'TISSUE_SOURCE_SITE':[89], 'ANEUPLOIDY_SCORE':[34]})

In [150]:
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
                      x=prediction_set, 
                      num_epochs=1, 
                      shuffle=False)

In [151]:
# Get a list of the predictions
predictions = list(classifier.predict(input_fn=predict_input_fn))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from tmp/model/model.ckpt-189
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [152]:
print(predictions)

[{'logits': array([2.3756077], dtype=float32), 'logistic': array([0.9149483], dtype=float32), 'probabilities': array([0.08505175, 0.9149483 ], dtype=float32), 'class_ids': array([1]), 'classes': array([b'1'], dtype=object)}]


In [143]:
#predicted_classes = [p["classes"] for p in predictions] 
#results=np.concatenate(predictions) 
#print(predicted_classes)