In [1]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import Imputer
import numpy as np
from sklearn.cross_validation import train_test_split

  from ._conv import register_converters as _register_converters


# Reading the dataset

In [2]:
hcc = pd.read_csv("hcc-data.csv")

In [3]:
hcc.head(n=2)

Unnamed: 0,Gender,Symptoms,Alcohol,Hepatitis B Surface Antigen,Hepatitis B e Antigen,Hepatitis B Core Antibody,Hepatitis C Virus Antibody,Cirrhosis,Endemic Countries,Smoking,...,Alkaline phosphatase,Total Proteins,Creatinine,Number of Nodules,Major dimension of nodule,Direct Bilirubin,Iron,Oxygen Saturation,Ferritin,Class
0,1,0.0,1,0.0,0.0,0.0,0.0,1,0.0,1.0,...,150.0,7.1,0.7,1.0,3.5,0.5,,,,1
1,0,,0,0.0,0.0,0.0,1.0,1,,,...,,,,1.0,1.8,,,,,1


In [4]:
hcc.columns

Index(['Gender', 'Symptoms', 'Alcohol', 'Hepatitis B Surface Antigen',
       'Hepatitis B e Antigen', 'Hepatitis B Core Antibody',
       'Hepatitis C Virus Antibody', 'Cirrhosis', 'Endemic Countries',
       'Smoking', 'Diabetes', 'Obesity', 'Hemochromatosis',
       'Arterial Hypertension', 'Chronic Renal Insufficiency',
       'Human Immunodeficiency Virus', 'Nonalcoholic Steatohepatitis',
       'Esophageal Varices', 'Splenomegaly', 'Portal Hypertension',
       'Portal Vein Thrombosis', 'Liver Metastasis', 'Radiological Hallmark',
       'Age_at_diagnosis', 'Grams of Alcohol per day',
       'Packs of cigarets per year', 'Performance Status',
       'Encefalopathy degree', 'Ascites degree',
       'International Normalised Ratio', 'Alpha-Fetoprotein', 'Haemoglobin',
       'Mean Corpuscular Volume', 'Leukocytes', 'Platelets', 'Albumin',
       'Total Bilirubin', 'Alanine transaminase', 'Aspartate transaminase',
       'Gamma glutamyl transferase', 'Alkaline phosphatase', 'Total P

# Handling the unknown values for continuous attributes

In [5]:
continuous_x = hcc.iloc[:,23:49]

In [6]:
continuous_x.head()

Unnamed: 0,Age_at_diagnosis,Grams of Alcohol per day,Packs of cigarets per year,Performance Status,Encefalopathy degree,Ascites degree,International Normalised Ratio,Alpha-Fetoprotein,Haemoglobin,Mean Corpuscular Volume,...,Gamma glutamyl transferase,Alkaline phosphatase,Total Proteins,Creatinine,Number of Nodules,Major dimension of nodule,Direct Bilirubin,Iron,Oxygen Saturation,Ferritin
0,67,137.0,15.0,0,1.0,1.0,1.53,95.0,13.7,106.6,...,183.0,150.0,7.1,0.7,1.0,3.5,0.5,,,
1,62,0.0,,0,1.0,1.0,,,,,...,,,,,1.0,1.8,,,,
2,78,50.0,50.0,2,1.0,2.0,0.96,5.8,8.9,79.8,...,202.0,109.0,7.0,2.1,5.0,13.0,0.1,28.0,6.0,16.0
3,77,40.0,30.0,0,1.0,1.0,0.95,2440.0,13.4,97.1,...,94.0,174.0,8.1,1.11,2.0,15.7,0.2,,,
4,76,100.0,30.0,0,1.0,1.0,0.94,49.0,14.3,95.1,...,173.0,109.0,6.9,1.8,1.0,9.0,,59.0,15.0,22.0


In [7]:
y_value = hcc.iloc[: , 49]

In [8]:
type(y_value)

pandas.core.series.Series

In [9]:
imputer = Imputer(missing_values = "NaN" ,strategy = 'mean', axis = 0)  

In [10]:
imputer = imputer.fit(continuous_x)

In [11]:
continuous_x = imputer.transform(continuous_x)

In [12]:
print(continuous_x)

[[ 67.         137.          15.         ...  85.59883721  37.02894118
  438.99764706]
 [ 62.           0.          20.46428571 ...  85.59883721  37.02894118
  438.99764706]
 [ 78.          50.          50.         ...  28.           6.
   16.        ]
 ...
 [ 65.          71.00854701  48.         ...  85.59883721  37.02894118
  438.99764706]
 [ 44.          71.00854701  20.46428571 ...  85.59883721  37.02894118
  438.99764706]
 [ 52.          50.          20.46428571 ...  85.59883721  37.02894118
  438.99764706]]


In [13]:
continuous_x = pd.DataFrame(data = continuous_x)

In [14]:
continuous_x = continuous_x.apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [15]:
continuous_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.643836,0.274,0.029412,0.0,0.0,0.0,0.173367,5.2e-05,0.635036,0.740519,...,0.103093,0.151954,0.03262,0.067568,0.2,0.097561,0.013699,0.382138,0.29388,0.19686
1,0.575342,0.0,0.040126,0.0,0.0,0.0,0.146194,0.01066,0.57511,0.511372,...,0.157878,0.215518,0.051591,0.125282,0.2,0.014634,0.062671,0.382138,0.29388,0.19686
2,0.794521,0.1,0.098039,0.5,0.0,0.5,0.030151,3e-06,0.284672,0.205589,...,0.115335,0.110062,0.0316,0.256757,1.0,0.560976,0.0,0.125,0.047619,0.007175
3,0.780822,0.08,0.058824,0.0,0.0,0.0,0.027638,0.001347,0.613139,0.550898,...,0.045747,0.176475,0.042813,0.122973,0.4,0.692683,0.003425,0.382138,0.29388,0.19686
4,0.767123,0.2,0.058824,0.0,0.0,0.0,0.025126,2.6e-05,0.678832,0.510978,...,0.096649,0.110062,0.030581,0.216216,0.2,0.365854,0.062671,0.263393,0.119048,0.009865


In [16]:
X_train, X_test, y_train, y_test = train_test_split(continuous_x,y_value,test_size=0.3, random_state=100)

In [17]:
X_train.columns = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']

continuous_x.columns = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']

X_test.columns = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']

In [18]:
continuous_x

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z
0,0.643836,0.274000,0.029412,0.00,0.0,0.000000,0.173367,5.181333e-05,0.635036,0.740519,...,0.103093,0.151954,0.032620,0.067568,0.2,0.097561,0.013699,0.382138,0.293880,0.196860
1,0.575342,0.000000,0.040126,0.00,0.0,0.000000,0.146194,1.066026e-02,0.575110,0.511372,...,0.157878,0.215518,0.051591,0.125282,0.2,0.014634,0.062671,0.382138,0.293880,0.196860
2,0.794521,0.100000,0.098039,0.50,0.0,0.500000,0.030151,2.540952e-06,0.284672,0.205589,...,0.115335,0.110062,0.031600,0.256757,1.0,0.560976,0.000000,0.125000,0.047619,0.007175
3,0.780822,0.080000,0.058824,0.00,0.0,0.000000,0.027638,1.347147e-03,0.613139,0.550898,...,0.045747,0.176475,0.042813,0.122973,0.4,0.692683,0.003425,0.382138,0.293880,0.196860
4,0.767123,0.200000,0.058824,0.00,0.0,0.000000,0.025126,2.640381e-05,0.678832,0.510978,...,0.096649,0.110062,0.030581,0.216216,0.2,0.365854,0.062671,0.263393,0.119048,0.009865
5,0.753425,0.142017,0.040126,0.25,0.0,0.500000,0.185930,6.009905e-05,0.613139,0.439122,...,0.141108,0.403302,0.017329,0.094595,0.2,0.414634,0.044521,0.236607,0.174603,0.049776
6,0.397260,0.000000,0.000000,0.00,0.0,0.000000,0.140704,7.606286e-05,0.394161,0.648703,...,0.077320,0.214280,0.034659,0.081081,1.0,0.053659,0.071575,0.763393,1.000000,0.651121
7,0.561644,0.142017,0.039216,0.75,0.0,0.000000,0.155779,5.445813e-03,0.423358,0.449102,...,0.103737,0.305215,0.032620,0.043243,0.4,0.365854,0.041096,0.187500,0.198413,0.316592
8,0.410959,0.200000,0.062745,0.25,0.0,0.500000,0.577889,4.198095e-06,0.503650,0.758483,...,0.059278,0.063062,0.022426,0.052703,0.2,0.239024,0.037671,0.379464,0.579365,0.440359
9,0.315068,0.200000,0.000000,0.00,0.0,0.000000,0.070352,3.314286e-07,0.496350,0.365269,...,0.150129,0.308280,0.032620,0.052703,0.2,0.380488,0.020548,0.382138,0.293880,0.196860


In [19]:
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,batch_size=10,num_epochs=1000,shuffle=True)
input_func

<function tensorflow.python.estimator.inputs.pandas_io.pandas_input_fn.<locals>.input_fn>

# Making feature column

In [20]:
variable_names = continuous_x

tf_variables =[]

for index in variable_names:
    tf_variables.append(tf.feature_column.numeric_column(index) )

In [21]:
feat_cols = list(tf_variables)

In [22]:
feat_cols

[_NumericColumn(key='a', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='b', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='c', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='d', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='e', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='f', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='g', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='h', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='i', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='j', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='k', shape=(1,), def

In [33]:
dnn_model = tf.estimator.DNNClassifier(hidden_units=[10,20,15,10,20],feature_columns=feat_cols,n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\USER\\AppData\\Local\\Temp\\tmpqngk9tty', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001B272FD7080>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [34]:
dnn_model.train(input_fn=input_func,steps=1000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\USER\AppData\Local\Temp\tmpqngk9tty\model.ckpt.
INFO:tensorflow:loss = 6.942799, step = 1
INFO:tensorflow:global_step/sec: 74.0423
INFO:tensorflow:loss = 7.2824554, step = 101 (1.335 sec)
INFO:tensorflow:global_step/sec: 103.722
INFO:tensorflow:loss = 2.9960322, step = 201 (0.969 sec)
INFO:tensorflow:global_step/sec: 105.722
INFO:tensorflow:loss = 2.8084726, step = 301 (0.948 sec)
INFO:tensorflow:global_step/sec: 101.482
INFO:tensorflow:loss = 2.4841614, step = 401 (0.984 sec)
INFO:tensorflow:global_step/sec: 113.924
INFO:tensorflow:loss = 0.033145137, step = 501 (0.877 sec)
INFO:tensorflow:global_step/sec: 108.28
INFO:tensorflow:loss = 0.119623944, step = 601 (0.929 sec)
INFO:tensorflow:global_step/sec: 107.554
INFO:tensorflow:loss = 0.02719239, step = 701 (0.925 sec)
INFO:tensorflow:global_step/sec: 118.288
INFO:tensorflow:loss = 0.10712769, step = 801 (0.848 sec)
INFO:tensorflow:global

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1b2741b7d30>

In [35]:
eval_input_func = tf.estimator.inputs.pandas_input_fn(
      x=X_test,
      y=y_test,
      batch_size=10,
      num_epochs=1,
      shuffle=False)

In [36]:
results = dnn_model.evaluate(eval_input_func)

INFO:tensorflow:Starting evaluation at 2018-02-12-00:10:33
INFO:tensorflow:Restoring parameters from C:\Users\USER\AppData\Local\Temp\tmpqngk9tty\model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2018-02-12-00:10:34
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.72, accuracy_baseline = 0.58, auc = 0.7413792, auc_precision_recall = 0.84612536, average_loss = 2.9453745, global_step = 1000, label/mean = 0.58, loss = 29.453745, prediction/mean = 0.7055721


In [37]:
results

{'accuracy': 0.72,
 'accuracy_baseline': 0.58,
 'auc': 0.7413792,
 'auc_precision_recall': 0.84612536,
 'average_loss': 2.9453745,
 'global_step': 1000,
 'label/mean': 0.58,
 'loss': 29.453745,
 'prediction/mean': 0.7055721}

In [None]:
print(tf.__version__)