In [101]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import Imputer
import numpy as np
from sklearn.cross_validation import train_test_split

# Reading the dataset

In [102]:
# Load the HCC dataset from "hcc-data.csv" (the path is relative to the working directory).
hcc = pd.read_csv("hcc-data.csv")

In [103]:
# Preview the first two rows as a quick sanity check of the load.
hcc.head(n=2)

Unnamed: 0,Gender,Symptoms,Alcohol,Hepatitis B Surface Antigen,Hepatitis B e Antigen,Hepatitis B Core Antibody,Hepatitis C Virus Antibody,Cirrhosis,Endemic Countries,Smoking,...,Alkaline phosphatase,Total Proteins,Creatinine,Number of Nodules,Major dimension of nodule,Direct Bilirubin,Iron,Oxygen Saturation,Ferritin,Class
0,1,0.0,1,0.0,0.0,0.0,0.0,1,0.0,1.0,...,150.0,7.1,0.7,1.0,3.5,0.5,,,,1
1,0,,0,0.0,0.0,0.0,1.0,1,,,...,,,,1.0,1.8,,,,,1


In [104]:
# List the column labels; the positional slices used later in the notebook rely on this ordering.
hcc.columns

Index(['Gender', 'Symptoms', 'Alcohol', 'Hepatitis B Surface Antigen',
       'Hepatitis B e Antigen', 'Hepatitis B Core Antibody',
       'Hepatitis C Virus Antibody', 'Cirrhosis', 'Endemic Countries',
       'Smoking', 'Diabetes', 'Obesity', 'Hemochromatosis',
       'Arterial Hypertension', 'Chronic Renal Insufficiency',
       'Human Immunodeficiency Virus', 'Nonalcoholic Steatohepatitis',
       'Esophageal Varices', 'Splenomegaly', 'Portal Hypertension',
       'Portal Vein Thrombosis', 'Liver Metastasis', 'Radiological Hallmark',
       'Age_at_diagnosis', 'Grams of Alcohol per day',
       'Packs of cigarets per year', 'Performance Status',
       'Encefalopathy degree', 'Ascites degree',
       'International Normalised Ratio', 'Alpha-Fetoprotein', 'Haemoglobin',
       'Mean Corpuscular Volume', 'Leukocytes', 'Platelets', 'Albumin',
       'Total Bilirubin', 'Alanine transaminase', 'Aspartate transaminase',
       'Gamma glutamyl transferase', 'Alkaline phosphatase', 'Total P

# Handling the values for continuous attributes

In [105]:
# Select the columns at positions 23-48 (26 columns), starting at 'Age_at_diagnosis':
# these are the continuous measurements.
continuous_x = hcc.iloc[:,23:49]

In [106]:
# Preview the continuous block; missing entries are still present at this stage.
continuous_x.head()

Unnamed: 0,Age_at_diagnosis,Grams of Alcohol per day,Packs of cigarets per year,Performance Status,Encefalopathy degree,Ascites degree,International Normalised Ratio,Alpha-Fetoprotein,Haemoglobin,Mean Corpuscular Volume,...,Gamma glutamyl transferase,Alkaline phosphatase,Total Proteins,Creatinine,Number of Nodules,Major dimension of nodule,Direct Bilirubin,Iron,Oxygen Saturation,Ferritin
0,67,137.0,15.0,0,1.0,1.0,1.53,95.0,13.7,106.6,...,183.0,150.0,7.1,0.7,1.0,3.5,0.5,,,
1,62,0.0,,0,1.0,1.0,,,,,...,,,,,1.0,1.8,,,,
2,78,50.0,50.0,2,1.0,2.0,0.96,5.8,8.9,79.8,...,202.0,109.0,7.0,2.1,5.0,13.0,0.1,28.0,6.0,16.0
3,77,40.0,30.0,0,1.0,1.0,0.95,2440.0,13.4,97.1,...,94.0,174.0,8.1,1.11,2.0,15.7,0.2,,,
4,76,100.0,30.0,0,1.0,1.0,0.94,49.0,14.3,95.1,...,173.0,109.0,6.9,1.8,1.0,9.0,,59.0,15.0,22.0


In [107]:
# Target vector: the column at position 49, which the preview of `hcc` shows as 'Class'.
y_value = hcc.iloc[: , 49]

In [108]:
# Check what kind of object the single-column positional selection produced.
type(y_value)

pandas.core.series.Series

In [109]:
# Imputer for the continuous features: missing entries ("NaN") are replaced by
# the mean of their column (axis=0).
# NOTE(review): `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22 (successor: `sklearn.impute.SimpleImputer`); this cell needs an older release.
imputer = Imputer(
    missing_values="NaN",
    strategy="mean",
    axis=0,
)

In [110]:
# Learn the per-column fill values from the continuous features.
# NOTE(review): this fits on every row, before the train/test split performed later in the
# notebook, so the fill values are influenced by rows that end up in the test set.
imputer = imputer.fit(continuous_x)

In [111]:
# Fill the missing entries. The result is a plain array (see the printed output that follows),
# so the original column labels are no longer attached.
continuous_x = imputer.transform(continuous_x)

In [112]:
# Inspect the imputed array; the printed form is abbreviated with '...'.
print(continuous_x)

[[ 67.         137.          15.         ...  85.59883721  37.02894118
  438.99764706]
 [ 62.           0.          20.46428571 ...  85.59883721  37.02894118
  438.99764706]
 [ 78.          50.          50.         ...  28.           6.
   16.        ]
 ...
 [ 65.          71.00854701  48.         ...  85.59883721  37.02894118
  438.99764706]
 [ 44.          71.00854701  20.46428571 ...  85.59883721  37.02894118
  438.99764706]
 [ 52.          50.          20.46428571 ...  85.59883721  37.02894118
  438.99764706]]


In [113]:
# Wrap the imputed array in a DataFrame again. No labels are supplied, so the
# columns receive default integer names (0..25) until they are renamed later.
continuous_x = pd.DataFrame(continuous_x)

In [114]:
# Min-max scale every continuous column into [0, 1]: (x - min) / (max - min).
# NOTE(review): min/max are computed over all rows, before the train/test split performed
# later in the notebook, so test rows influence the scaling.
col_min = continuous_x.min()
col_range = continuous_x.max() - col_min
# A constant column has zero range, which would turn the whole column into NaN (0 / 0).
# Divide by 1 instead so such a column maps to 0.0; other columns are unaffected.
col_range = col_range.replace(0, 1)
continuous_x = (continuous_x - col_min) / col_range

In [115]:
# Preview the scaled values; the columns still carry their default integer labels here.
continuous_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.643836,0.274,0.029412,0.0,0.0,0.0,0.173367,5.2e-05,0.635036,0.740519,...,0.103093,0.151954,0.03262,0.067568,0.2,0.097561,0.013699,0.382138,0.29388,0.19686
1,0.575342,0.0,0.040126,0.0,0.0,0.0,0.146194,0.01066,0.57511,0.511372,...,0.157878,0.215518,0.051591,0.125282,0.2,0.014634,0.062671,0.382138,0.29388,0.19686
2,0.794521,0.1,0.098039,0.5,0.0,0.5,0.030151,3e-06,0.284672,0.205589,...,0.115335,0.110062,0.0316,0.256757,1.0,0.560976,0.0,0.125,0.047619,0.007175
3,0.780822,0.08,0.058824,0.0,0.0,0.0,0.027638,0.001347,0.613139,0.550898,...,0.045747,0.176475,0.042813,0.122973,0.4,0.692683,0.003425,0.382138,0.29388,0.19686
4,0.767123,0.2,0.058824,0.0,0.0,0.0,0.025126,2.6e-05,0.678832,0.510978,...,0.096649,0.110062,0.030581,0.216216,0.2,0.365854,0.062671,0.263393,0.119048,0.009865


In [116]:
# Disabled: splitting at this point would use only the continuous features. The active split
# is performed later in the notebook on the combined nominal + continuous frame.
#X_train, X_test, y_train, y_test = train_test_split(continuous_x,y_value,test_size=0.3, random_state=100)

In [117]:
# Give the 26 continuous columns short single-letter names, 'a' through 'z', in place of
# the default integer labels.
# NOTE(review): presumably short string names are used because the original labels contain
# spaces, which the TensorFlow feature columns built later may not accept - confirm.
continuous_x.columns = list('abcdefghijklmnopqrstuvwxyz')

In [118]:
# Confirm that the letter names are now attached to the scaled continuous columns.
continuous_x.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z
0,0.643836,0.274,0.029412,0.0,0.0,0.0,0.173367,5.2e-05,0.635036,0.740519,...,0.103093,0.151954,0.03262,0.067568,0.2,0.097561,0.013699,0.382138,0.29388,0.19686
1,0.575342,0.0,0.040126,0.0,0.0,0.0,0.146194,0.01066,0.57511,0.511372,...,0.157878,0.215518,0.051591,0.125282,0.2,0.014634,0.062671,0.382138,0.29388,0.19686
2,0.794521,0.1,0.098039,0.5,0.0,0.5,0.030151,3e-06,0.284672,0.205589,...,0.115335,0.110062,0.0316,0.256757,1.0,0.560976,0.0,0.125,0.047619,0.007175
3,0.780822,0.08,0.058824,0.0,0.0,0.0,0.027638,0.001347,0.613139,0.550898,...,0.045747,0.176475,0.042813,0.122973,0.4,0.692683,0.003425,0.382138,0.29388,0.19686
4,0.767123,0.2,0.058824,0.0,0.0,0.0,0.025126,2.6e-05,0.678832,0.510978,...,0.096649,0.110062,0.030581,0.216216,0.2,0.365854,0.062671,0.263393,0.119048,0.009865


# Handling the values for nominal attributes

In [119]:
# Select the columns at positions 0-22 (23 columns): the nominal attributes, from 'Gender'
# through 'Radiological Hallmark'.
nom_x = hcc.iloc[:,0:23]

In [120]:
# Preview the nominal block; missing entries are still present at this stage.
nom_x.head()

Unnamed: 0,Gender,Symptoms,Alcohol,Hepatitis B Surface Antigen,Hepatitis B e Antigen,Hepatitis B Core Antibody,Hepatitis C Virus Antibody,Cirrhosis,Endemic Countries,Smoking,...,Arterial Hypertension,Chronic Renal Insufficiency,Human Immunodeficiency Virus,Nonalcoholic Steatohepatitis,Esophageal Varices,Splenomegaly,Portal Hypertension,Portal Vein Thrombosis,Liver Metastasis,Radiological Hallmark
0,1,0.0,1,0.0,0.0,0.0,0.0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0,,0,0.0,0.0,0.0,1.0,1,,,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1,0.0,1,1.0,0.0,1.0,0.0,1,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
3,1,1.0,1,0.0,0.0,0.0,0.0,1,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,1,1.0,1,1.0,0.0,1.0,0.0,1,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [121]:
# Imputer for the nominal features: missing entries ("NaN") are replaced by the
# most frequent value of their column (axis=0). This rebinds the name `imputer`,
# which previously held the mean-imputer used for the continuous features.
# NOTE(review): `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22 (successor: `sklearn.impute.SimpleImputer`); this cell needs an older release.
imputer = Imputer(
    missing_values="NaN",
    strategy="most_frequent",
    axis=0,
)

In [122]:
# Learn the per-column fill values from the nominal features.
# NOTE(review): this fits on every row, before the train/test split performed later in the
# notebook, so the fill values are influenced by rows that end up in the test set.
imputer = imputer.fit(nom_x)

In [123]:
# Fill the missing nominal entries; the result is re-wrapped in a DataFrame and re-labelled
# in the cells that follow.
nom_x = imputer.transform(nom_x)

In [125]:
# Wrap the imputed nominal values in a DataFrame again; no labels are supplied here,
# the columns are named in the next step.
nom_x = pd.DataFrame(nom_x)

In [127]:
# Name the 23 nominal columns 'aa', 'ab', ..., 'aw'. The leading 'a' keeps them distinct
# from the single-letter names given to the continuous columns.
nom_x.columns = ['a' + suffix for suffix in 'abcdefghijklmnopqrstuvw']

In [128]:
# Preview the imputed, renamed nominal columns.
nom_x.head()

Unnamed: 0,aa,ab,ac,ad,ae,af,ag,ah,ai,aj,...,an,ao,ap,aq,ar,as,at,au,av,aw
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [129]:
# Nominal columns first, then continuous: this order determines the column order of the
# combined feature frame.
frames = [nom_x,continuous_x]

In [130]:
# Join the two frames side by side (axis=1) into a single 49-column feature frame.
x_data = pd.concat(frames, axis=1)

In [131]:
# Preview the combined feature frame (nominal 'aa'.. columns followed by continuous 'a'.. columns).
x_data.head()

Unnamed: 0,aa,ab,ac,ad,ae,af,ag,ah,ai,aj,...,q,r,s,t,u,v,w,x,y,z
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.103093,0.151954,0.03262,0.067568,0.2,0.097561,0.013699,0.382138,0.29388,0.19686
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.157878,0.215518,0.051591,0.125282,0.2,0.014634,0.062671,0.382138,0.29388,0.19686
2,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.115335,0.110062,0.0316,0.256757,1.0,0.560976,0.0,0.125,0.047619,0.007175
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.045747,0.176475,0.042813,0.122973,0.4,0.692683,0.003425,0.382138,0.29388,0.19686
4,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.096649,0.110062,0.030581,0.216216,0.2,0.365854,0.062671,0.263393,0.119048,0.009865


In [134]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): `train_test_split` is imported from `sklearn.cross_validation`, a module that
# was removed in scikit-learn 0.20 (successor: `sklearn.model_selection`).
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_value,
    test_size=0.3,
    random_state=100,
)

In [135]:
# Apply the 49 feature names to both splits: the nominal names 'aa'..'aw' followed by the
# continuous names 'a'..'z'. Building the list once keeps the two splits in agreement.
# NOTE(review): these look like the same labels `x_data` already carries (see its preview),
# in which case this re-assignment is redundant - confirm.
_nominal_names = ['a' + suffix for suffix in 'abcdefghijklmnopqrstuvw']
_continuous_names = list('abcdefghijklmnopqrstuvwxyz')

X_train.columns = _nominal_names + _continuous_names

X_test.columns = _nominal_names + _continuous_names

In [136]:
# Verify the feature names attached to the training split.
X_train.columns

Index(['aa', 'ab', 'ac', 'ad', 'ae', 'af', 'ag', 'ah', 'ai', 'aj', 'ak', 'al',
       'am', 'an', 'ao', 'ap', 'aq', 'ar', 'as', 'at', 'au', 'av', 'aw', 'a',
       'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
       'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
      dtype='object')

# Input Function

In [137]:
# Training input pipeline: feeds shuffled batches of 10 rows from the training split,
# cycling over the data for up to 1000 epochs.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=10,
    num_epochs=1000,
    shuffle=True,
)
input_func

<function tensorflow.python.estimator.inputs.pandas_io.pandas_input_fn.<locals>.input_fn>

# Making feature columns

In [138]:
# Build one numeric feature column per feature, keyed by the column label of `x_data`.
# The labels are taken explicitly from `x_data.columns` rather than by iterating the
# DataFrame itself, which yields the same labels but hides the intent.
variable_names = x_data.columns

tf_variables = [tf.feature_column.numeric_column(name) for name in variable_names]

In [139]:
# Copy the feature columns into a new list under the name passed to the classifier later on.
feat_cols = list(tf_variables)

In [140]:
# Display the feature columns that were built.
feat_cols

[_NumericColumn(key='aa', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='ab', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='ac', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='ad', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='ae', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='af', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='ag', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='ah', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='ai', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='aj', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='ak', shap

In [160]:
# Binary classifier: a fully connected network with nine hidden layers of the listed widths.
# No model_dir is given, so checkpoints go to a temporary directory (see the logged config).
# NOTE(review): the logged config shows '_tf_random_seed': None, so results presumably vary
# between runs - confirm whether a fixed seed is wanted.
dnn_model = tf.estimator.DNNClassifier(
    feature_columns=feat_cols,
    hidden_units=[10, 20, 10, 20, 20, 20, 20, 20, 20],
    n_classes=2,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\USER\\AppData\\Local\\Temp\\tmp23wfn2ue', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000022083BD9BE0>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [161]:
# Train for 1000 steps, drawing batches from the training input function.
dnn_model.train(input_fn=input_func,steps=1000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\USER\AppData\Local\Temp\tmp23wfn2ue\model.ckpt.
INFO:tensorflow:loss = 6.948332, step = 1
INFO:tensorflow:global_step/sec: 129.008
INFO:tensorflow:loss = 5.680835, step = 101 (0.774 sec)
INFO:tensorflow:global_step/sec: 165.704
INFO:tensorflow:loss = 2.4025407, step = 201 (0.604 sec)
INFO:tensorflow:global_step/sec: 170.256
INFO:tensorflow:loss = 3.7496471, step = 301 (0.587 sec)
INFO:tensorflow:global_step/sec: 151.625
INFO:tensorflow:loss = 1.9552494, step = 401 (0.661 sec)
INFO:tensorflow:global_step/sec: 131.821
INFO:tensorflow:loss = 5.8485045, step = 501 (0.759 sec)
INFO:tensorflow:global_step/sec: 132.776
INFO:tensorflow:loss = 1.864297, step = 601 (0.755 sec)
INFO:tensorflow:global_step/sec: 131.732
INFO:tensorflow:loss = 3.1851919, step = 701 (0.758 sec)
INFO:tensorflow:global_step/sec: 132.785
INFO:tensorflow:loss = 3.96292, step = 801 (0.755 sec)
INFO:tensorflow:global_step/sec

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x22083bd9780>

In [162]:
# Evaluation input pipeline: a single, unshuffled pass over the held-out split
# in batches of 10 rows.
eval_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)

In [163]:
# Score the trained model on the held-out split and keep the returned metrics.
results = dnn_model.evaluate(input_fn=eval_input_func)

INFO:tensorflow:Starting evaluation at 2018-01-26-13:35:07
INFO:tensorflow:Restoring parameters from C:\Users\USER\AppData\Local\Temp\tmp23wfn2ue\model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2018-01-26-13:35:08
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.76, accuracy_baseline = 0.58, auc = 0.82594407, auc_precision_recall = 0.8801939, average_loss = 1.0229053, global_step = 1000, label/mean = 0.58, loss = 10.2290535, prediction/mean = 0.61324286


In [164]:
# Show the evaluation metrics (accuracy, AUC, loss, ...) returned by evaluate().
results

{'accuracy': 0.76,
 'accuracy_baseline': 0.58,
 'auc': 0.82594407,
 'auc_precision_recall': 0.8801939,
 'average_loss': 1.0229053,
 'global_step': 1000,
 'label/mean': 0.58,
 'loss': 10.2290535,
 'prediction/mean': 0.61324286}