In [4]:
'''
autoencoder
'''

import pandas as pd
import os
# import tensorflow.contrib.slim as slim
import tensorflow.contrib.layers as lays
from sklearn import preprocessing
import tensorflow as tf
from math import ceil
import numpy as np
from lib.read_data import dataset,Datasets
from lib.net import autoencoder,feedforward_net
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

def dense_to_one_hot(labels_dense, num_classes):
  '''
  Turn integer class labels into one-hot encoded rows.

  labels_dense: array-like of integer class indices; any shape, it is
    flattened before encoding.
  num_classes: width of each one-hot row.

  Returns a float array of shape (number_of_labels, num_classes) in which
  row i has a single 1.0 at column labels_dense[i].
  '''
  flat_labels = np.asarray(labels_dense).ravel()
  # Row k of the identity matrix is exactly the one-hot vector for class k,
  # so fancy-indexing the identity by label picks the encoded rows.
  identity = np.eye(num_classes)
  return identity[flat_labels]

def read_data_set(data_table,test_size=0.25,random_state=None):
    '''
    Split a pandas DataFrame into train/test partitions and wrap them in a
    Datasets(train=dataset, test=dataset).

    data_table: DataFrame holding the feature columns plus an 'INFO' column
        that is used as the label.
    test_size: forwarded to sklearn's train_test_split (fraction or absolute
        number of rows held out for testing).
    random_state: optional seed forwarded to train_test_split so the split
        can be reproduced; None keeps the split random on every call.

    Every column except 'INFO' becomes a feature; 'INFO' is cast to int8.
    '''
    # Forward the caller's test_size instead of a hard-coded 0.25, so the
    # argument actually takes effect.
    train, test = train_test_split(data_table,
                                   test_size=test_size,
                                   random_state=random_state)
    # Both partitions share the columns of data_table, so build the feature
    # column list once and reuse it.
    feature_cols = [col for col in data_table.columns if col != 'INFO']
    train_x = np.array(train[feature_cols])
    test_x = np.array(test[feature_cols])
    train_y = np.array(train['INFO'],dtype=np.int8)
    test_y = np.array(test['INFO'],dtype=np.int8)
    return Datasets(train=dataset(train_x,train_y),
                    test=dataset(test_x,test_y))

if __name__=='__main__':
    # Experiment driver: train an autoencoder on the feature table, then fit
    # a grid-searched logistic regression on the tensor returned as
    # `ae_bottle` and report sensitivity / specificity on the held-out split.

    # constant
    batch_size = 50
    epoch_num = 100
    lr = 0.001

    # read dataset
    # The table is tab-separated. The location can be overridden through the
    # MYO5B_DATA_PATH environment variable; the default is the path this
    # notebook was originally run with. Other inputs tried previously:
    #   'data/myh7/myh7_myo5b.csv', 'data/myh7/myh7_dummy_no_nan_data.csv'
    data_path = os.environ.get(
        'MYO5B_DATA_PATH',
        '/Users/gcc/projects/myo5b_project/data/dummy_no_nan_data.csv')
    all_data = pd.read_csv(data_path,sep='\t')
    # 'POS' is dropped so it is not used as a model feature.
    all_data = all_data.drop(['POS'],axis=1)
    mvid = read_data_set(all_data)
    # Number of optimisation steps per epoch; the last batch may be partial.
    batch_per_ep = ceil(mvid.train.num_examples/batch_size)

    # ======================== autoencoder ==============================
    # model
    inputs = tf.placeholder(tf.float32,(None, mvid.train.num_features))
    # `labels` is not used by the autoencoder loss below; it is still fed on
    # every run so the graph interface stays as it was.
    labels = tf.placeholder(tf.float32,(None, 2))
    ae_outputs,ae_bottle = autoencoder(inputs)
    # loss and training options: mean squared reconstruction error
    loss_ae = tf.reduce_mean((tf.square(ae_outputs-inputs))) # autoencoder
    train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss_ae)

    # initializer
    init = tf.global_variables_initializer()

    # start training
    with tf.Session() as sess:
        sess.run(init)
        for ep in range(epoch_num):
            for batch_no in range(batch_per_ep):
                batch_data, batch_label = mvid.train.next_batch(batch_size)
                batch_label_onehot = dense_to_one_hot(batch_label,2)
                _, recon_data,error = sess.run([train_op,
                                                ae_outputs,loss_ae],
                                                feed_dict={inputs:batch_data,
                                                labels:batch_label_onehot})
                print('Epoch: {0}\tIteration:{1}\tError: {2}\t'.format(
                ep, batch_no, error
                ))

        # test the trained network on one batch of the held-out split
        # (uses the batch_size constant rather than a repeated literal 50)
        batch_data, batch_label = mvid.test.next_batch(batch_size=batch_size)
        batch_label_onehot = dense_to_one_hot(batch_label,2)
        recon_data, bottle, error = sess.run([ae_outputs,ae_bottle, loss_ae],
        feed_dict={inputs:batch_data,labels:batch_label_onehot})
        print('Test dataset\tError: {0}'.format(error))

        # apply LogisticRegression
        # Run the full train and test splits through the network and keep
        # the `ae_bottle` activations as classifier features.
        batch_label_onehot = dense_to_one_hot(mvid.train.labels,2)
        _,train_ae_features,_ = sess.run([ae_outputs,
                                          ae_bottle,
                                          loss_ae],
                                          feed_dict={inputs:mvid.train.values,
                                          labels:batch_label_onehot})
        batch_label_onehot = dense_to_one_hot(mvid.test.labels,2)
        _,test_ae_features,_ = sess.run([ae_outputs,
                                         ae_bottle,
                                         loss_ae],
                                         feed_dict={inputs:mvid.test.values,
                                         labels:batch_label_onehot})

        # GridSearchCV + Parameters
        class_weight = ['balanced']
        param_grid_logr = [{'logr__penalty':['l1','l2'],
                            'logr__C':[1,2,3,4,5],
                            'logr__class_weight':class_weight}]
        # pipeline
        # The grid includes the 'l1' penalty, so pin a solver that supports
        # both 'l1' and 'l2' instead of relying on the library default.
        pipeline_logr = Pipeline(
            steps=[('logr',LogisticRegression(solver='liblinear'))])
        # display results
        classifier_logr = GridSearchCV(estimator=pipeline_logr,
                                       param_grid=param_grid_logr)

        print('Start training...')
        classifier_logr.fit(train_ae_features,mvid.train.labels)
        print('Model Description:\n',classifier_logr.best_estimator_)
        pred = classifier_logr.predict(test_ae_features)
        # labels=[0, 1] forces a 2x2 matrix even if one class is missing from
        # the test split or from the predictions, so the 4-way unpack is safe.
        tn, fp, fn, tp = confusion_matrix(mvid.test.labels,pred,
                                          labels=[0, 1]).ravel()
        sensitivity = tp/(fn+tp)
        specificity = tn/(fp+tn)
        # Both values use the same fixed-point format (the first placeholder
        # previously lacked the 'f' type).
        print('>>> best model results: sensitivity: {:.{prec}f}\tspecificity: {:.{prec}f}'.\
        format(sensitivity,specificity,prec=3))

#         # use bottleneck as features for training the classifiers
#         batch_label_onehot = dense_to_one_hot(mvid.train.labels,2)
#         _,train_ae_features,_ = sess.run([ae_outputs,
#                                           ae_bottle,
#                                           loss_ae],
#                                           feed_dict={inputs:mvid.train.values,
#                                           labels:batch_label_onehot})
#         batch_label_onehot = dense_to_one_hot(mvid.test.labels,2)
#         _,test_ae_features,_ = sess.run([ae_outputs,ae_bottle, loss_ae],
#                                     feed_dict={
#                                     inputs:mvid.test.values,
#                                     labels:batch_label_onehot
#                                     })
#         # classifier_logr = LogisticRegression(class_weight='balanced')
#         classifier_logr = RandomForestClassifier(n_estimators=50,
#                                                 class_weight='balanced')
#         print(train_ae_features.shape, mvid.train.labels.shape)
#         classifier_logr.fit(train_ae_features,mvid.train.labels)
#         pred = classifier_logr.predict(test_ae_features)
#         tn, fp, fn, tp = confusion_matrix(mvid.test.labels,pred).ravel()
#         sensitivity = tp/(fn+tp)
#         specificity = tn/(fp+tn)
#         print(tn,fp,fn,tp)
#         print(sensitivity,specificity)

Epoch: 0	Iteration:0	Error: 0.12377193570137024	
Epoch: 0	Iteration:1	Error: 0.12322364002466202	
Epoch: 0	Iteration:2	Error: 0.11957110464572906	
Epoch: 0	Iteration:3	Error: 0.11939466744661331	
Epoch: 1	Iteration:0	Error: 0.11298613250255585	
Epoch: 1	Iteration:1	Error: 0.11748310178518295	
Epoch: 1	Iteration:2	Error: 0.11941605806350708	
Epoch: 1	Iteration:3	Error: 0.11313163489103317	
Epoch: 2	Iteration:0	Error: 0.10745300352573395	
Epoch: 2	Iteration:1	Error: 0.10285474359989166	
Epoch: 2	Iteration:2	Error: 0.10481110960245132	
Epoch: 2	Iteration:3	Error: 0.0974973514676094	
Epoch: 3	Iteration:0	Error: 0.09326481074094772	
Epoch: 3	Iteration:1	Error: 0.08747346699237823	
Epoch: 3	Iteration:2	Error: 0.08677118271589279	
Epoch: 3	Iteration:3	Error: 0.07939670234918594	
Epoch: 4	Iteration:0	Error: 0.07922841608524323	
Epoch: 4	Iteration:1	Error: 0.07384588569402695	
Epoch: 4	Iteration:2	Error: 0.07052167505025864	
Epoch: 4	Iteration:3	Error: 0.06700103729963303	
Epoch: 5	Iteration:0	

Epoch: 41	Iteration:0	Error: 0.040760889649391174	
Epoch: 41	Iteration:1	Error: 0.037609681487083435	
Epoch: 41	Iteration:2	Error: 0.0408465713262558	
Epoch: 41	Iteration:3	Error: 0.03922668471932411	
Epoch: 42	Iteration:0	Error: 0.039000656455755234	
Epoch: 42	Iteration:1	Error: 0.03921275958418846	
Epoch: 42	Iteration:2	Error: 0.040052469819784164	
Epoch: 42	Iteration:3	Error: 0.03964289277791977	
Epoch: 43	Iteration:0	Error: 0.03989454358816147	
Epoch: 43	Iteration:1	Error: 0.035902924835681915	
Epoch: 43	Iteration:2	Error: 0.03784279525279999	
Epoch: 43	Iteration:3	Error: 0.03599678725004196	
Epoch: 44	Iteration:0	Error: 0.03818255662918091	
Epoch: 44	Iteration:1	Error: 0.03995104506611824	
Epoch: 44	Iteration:2	Error: 0.03651002421975136	
Epoch: 44	Iteration:3	Error: 0.03838490694761276	
Epoch: 45	Iteration:0	Error: 0.03770457208156586	
Epoch: 45	Iteration:1	Error: 0.036718033254146576	
Epoch: 45	Iteration:2	Error: 0.037626925855875015	
Epoch: 45	Iteration:3	Error: 0.0388432033360

Model Description:
 Pipeline(memory=None,
     steps=[('logr', LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])
>>> best model results: sensitivity: 0.667	specificity: 0.714


In [18]:
# Predicted labels for the test split, as produced by
# classifier_logr.predict(test_ae_features) in the training cell.
pred

array([1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], dtype=int8)

In [19]:
# Reference labels of the test split, shown for comparison with `pred`.
mvid.test.labels

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int8)

In [20]:
# Element-wise (label - prediction) over the test split:
#   0 -> correct, 1 -> false negative, -1 -> false positive.
mis = mvid.test.labels-pred

In [21]:
mis # {0: tp/tn, 1: fn, -1: fp}

array([-1,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,
        1,  0, -1,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,
        0,  0, -1,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0, -1,  0, -1,
       -1,  0, -1,  1, -1,  0, -1,  0, -1,  0,  0], dtype=int8)

In [22]:
# Positions within the test split where label=1 but prediction=0 (false negatives).
np.where(mis==1)

(array([17, 54]),)

In [23]:
# Positions within the test split where label=0 but prediction=1 (false positives).
np.where(mis==-1)

(array([ 0,  4, 16, 19, 21, 32, 33, 36, 44, 48, 50, 51, 53, 55, 57, 59]),)

In [21]:
# Intended to show the feature rows of the false negatives.
# NOTE(review): `mvid` is assigned from read_data_set(), which returns a
# Datasets object rather than a DataFrame, so `.iloc` is presumably not
# available on it - confirm on a fresh kernel. The saved output (7 rows, while
# only 2 false-negative positions were found) looks like it comes from an
# earlier kernel state, and this cell's execution count repeats an earlier one.
# NOTE(review): np.where(mis==1) gives positions inside the shuffled test
# split; confirm they correspond to the rows of whatever frame is indexed.
mvid.iloc[np.where(mis==1)]

Unnamed: 0,CpG,EncExp,EncH3K4Me1,EncH3K4Me3,EncOCC,EncOCCombPVal,EncOCDNasePVal,EncOCDNaseSig,EncOCFairePVal,EncOCFaireSig,...,oAA_N,oAA_P,oAA_Q,oAA_QT,oAA_R,oAA_S,oAA_T,oAA_V,oAA_W,oAA_Y
851,0.137931,0.079128,0.227373,0.163611,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
862,0.034483,0.069761,0.431567,0.09732,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
883,0.241379,0.119472,0.014349,0.06488,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
890,0.172414,0.068273,0.014349,0.008463,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
894,0.0,0.055858,0.041943,0.105783,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
905,0.0,0.007805,0.064018,0.029619,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
908,0.137931,0.015611,0.069536,0.077574,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Intended to show the feature rows of the false positives.
# NOTE(review): `myo5b_with_genename` is not defined in any cell visible in
# this notebook, so this cell presumably relies on leftover kernel state -
# confirm it survives Restart & Run All.
# NOTE(review): np.where(mis==-1) gives positions inside the shuffled test
# split; verify that `.iloc` on this frame addresses the same samples.
myo5b_with_genename.iloc[np.where(mis==-1)]

Unnamed: 0,CpG,EncExp,EncH3K4Me1,EncH3K4Me3,EncOCC,EncOCCombPVal,EncOCDNasePVal,EncOCDNaseSig,EncOCFairePVal,EncOCFaireSig,...,oAA_N,oAA_P,oAA_Q,oAA_QT,oAA_R,oAA_S,oAA_T,oAA_V,oAA_W,oAA_Y
855,0.137931,0.10552,0.115894,0.12976,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
858,0.103448,0.097763,0.041943,0.077574,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,0.241379,0.139644,0.422737,0.084626,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
865,0.37931,0.060517,0.09713,0.141044,0.75,0.0,0.0,0.013333,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
870,0.172414,0.124131,0.069536,0.042313,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
875,0.241379,0.084104,0.041943,0.077574,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
877,0.103448,0.041027,0.069536,0.038082,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
878,0.103448,0.061541,0.080574,0.029619,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
882,0.37931,0.110789,0.014349,0.042313,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
886,0.137931,0.065176,0.069536,0.047955,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
