## Author: Min Shi
## Last updated: 5/18/2021
## Description:
This notebook implements the NetOIF model to forecast gene expression (GE) data.

In [2]:
import time
import tensorflow as tf
import numpy as np
from models import gcn, lstm
from configs import *
from utils import *
import scipy.sparse
import statistics

# 1. Load the GE data

* **`12859_2008_2579_MOESM2_ESM.xls:`** Gene expression of human estrogen-responsive breast cancer cells after 1, 2, 4, 6, 8, 12, 16, 20, 24, 28 and 32 hours of hormonal stimulation
* **`string_interactions.tsv:`** The protein-protein interaction (PPI) network from stringDB

In [3]:
# Seed NumPy's global RNG so NumPy-based randomness is repeatable.
np.random.seed(123)

# Global TensorFlow flag container (flag definitions presumably live in
# `configs` -- verify there).
FLAGS = tf.flags.FLAGS
dataset, train_ratio, window_size = FLAGS.GE, FLAGS.train_ratio, FLAGS.window_size

# Number of time points fed to the model, and the width used for both the
# GCN hidden/output layers and the LSTM state.
time_steps = 7
hidden_dim = hidden_size = 1

In [None]:
# Load the processed expression table; rows are indexed by the 'Search_key' column.
feat_file = os.path.join('datasets/GE', '12859_2008_2579_MOESM2_ESM_processed.xlsx')
feat_df = pd.read_excel(feat_file, sheet_name='ZR75.1_QUA_P4_cluster_named').set_index('Search_key')

print(feat_df.columns)
feats = feat_df.values
# (num_proteins, 8) -> (num_proteins, 1, 8) -> (8, num_proteins, 1):
# one (num_proteins, 1) matrix per column of the sheet. The literal 8 makes
# the reshape fail loudly if the sheet does not have exactly 8 value columns.
feats = feats.reshape([feats.shape[0], 1, 8]).transpose([2, 0, 1])

# Keep the first `time_steps` per-column matrices as a plain list.
feat_list = [feats[i] for i in range(time_steps)]

protein_names = feat_df.index.tolist()

# Map each name to its row index once, instead of an O(n) list scan per edge.
# `setdefault` keeps the FIRST occurrence of a duplicated name, which is what
# `list.index` would have returned.
protein_index = {}
for idx, name in enumerate(protein_names):
    protein_index.setdefault(name, idx)

network_file = os.path.join('datasets/GE', 'string_interactions.tsv')
net_df = pd.read_csv(network_file, sep='\t')

num_proteins = len(protein_names)
adj = np.zeros([num_proteins, num_proteins])
links = []      # edges whose two endpoints both appear in the expression table
no_links = []   # edges skipped (unknown endpoint or unusable score)

# Selecting the columns up front raises KeyError immediately if the TSV does
# not have the expected header, instead of silently skipping every row.
edge_columns = zip(net_df['node1'], net_df['node2'], net_df['combined_score'])
for node1, node2, raw_score in edge_columns:
    v1_idx = protein_index.get(node1)
    v2_idx = protein_index.get(node2)
    if v1_idx is None or v2_idx is None:
        # At least one endpoint is not a row of the expression table.
        no_links.append((node1, node2))
        continue

    try:
        score = float(raw_score)
    except (TypeError, ValueError):
        # Non-numeric score: treat as unusable, as before, rather than abort.
        no_links.append((node1, node2))
        continue

    # The network is stored as an undirected, weighted adjacency matrix.
    adj[v1_idx, v2_idx] = score
    adj[v2_idx, v1_idx] = score

    links.append((node1, node2))

In [None]:
# Report the shapes of the adjacency matrix and the feature tensor.
# NOTE(review): the second label says "rppa" although this notebook loads
# GE data -- confirm whether the label is a leftover.
print('Adjacency matrix:', adj.shape, sep='')
print('Time series rppa data:', feats.shape, sep='')

In [None]:
# Load per-time-step adjacencies and features plus the validation/test node
# indices through the project helper. This rebinds `feats` and `protein_names`.
# NOTE(review): the meaning of the 0.2 argument is defined by
# `load_data_prediction_GE` (presumably a hold-out ratio) -- confirm in utils.
adjs, feats, val_idx, test_idx, protein_names = load_data_prediction_GE(dataset, time_steps, 0.2)

print('feats.size:',feats[0].shape)

# Node and feature counts are read off the first time step.
first_adj, first_feat = adjs[0], feats[0]
num_node = first_adj.shape[0]
num_feat = first_feat.shape[1]
# Every node index is listed as a training index.
train_idx = np.array(range(num_node))

# Replace each adjacency, in place, by its `sparse_to_tuple` form.
# Features are left as loaded (their sparse conversion is disabled).
for step in range(time_steps):
    coo_adj = scipy.sparse.coo_matrix(adjs[step])
    adjs[step] = sparse_to_tuple(coo_adj)

# NOTE(review): `entry[1]` would be the values array if features were sparse
# tuples; with features left unconverted it indexes the second element of
# each feature matrix instead -- confirm this is what the GCN layer expects.
num_features_nonzeros = [entry[1].shape for entry in feats]


In [None]:
# Notebook display: number of entries in `protein_names`.
len(protein_names)

# 2. Prepare the model

In [None]:
# Weight of the raw features when they are blended with the GCN embeddings
# further down (the embeddings receive 1 - alpha).
alpha = 0.9

# Placeholders for all model inputs, created in a fixed order so that the
# auto-generated TensorFlow op names stay stable.
phs = {}
# One sparse adjacency and one dense feature matrix per time step.
phs['adjs'] = [
    tf.sparse_placeholder(tf.float32, shape=(None, None), name="adjs")
    for _ in range(time_steps)
]
phs['feats'] = [
    tf.placeholder(tf.float32, shape=(None, num_feat), name="feats")
    for _ in range(time_steps)
]
# Variable-length index vectors for the three data splits.
for split_name in ('train_idx', 'val_idx', 'test_idx'):
    phs[split_name] = tf.placeholder(tf.int32, shape=(None,), name=split_name)
# Indices of the current mini-batch; the length is fixed to FLAGS.batch_size.
phs['sample_idx'] = tf.placeholder(tf.int32, shape=(FLAGS.batch_size,), name='batch_sample_idx')
# Dropout probability; defaults to 0 when not fed.
phs['dropout_prob'] = tf.placeholder_with_default(0., shape=())
phs['num_features_nonzeros'] = [tf.placeholder(tf.int64) for _ in range(time_steps)]

# Configure the project's graph-convolution layer and apply it to the
# per-time-step adjacency / feature placeholders.
gcn_settings = dict(
    time_steps=time_steps,
    gcn_layers=FLAGS.gcn_layers,
    input_dim=num_feat,
    hidden_dim=hidden_dim,
    output_dim=hidden_size,
    name='nn_fc1',
    num_features_nonzeros=phs['num_features_nonzeros'],
    act=tf.nn.relu,
    dropout_prob=phs['dropout_prob'],
    dropout=True,
)
gcn_model = gcn.GraphConvLayer(**gcn_settings)

# `embeds_list` is indexed by time step when the features are blended below.
embeds_list = gcn_model(adjs=phs['adjs'], feats=phs['feats'], sparse=False)

# Inputs for the LSTM-based prediction model.
# (An earlier variant that patched missing features at the last time step
# with GCN-imputed values is disabled and not used here.)

# Blend raw features and GCN embeddings at every time step:
#   combined = alpha * raw + (1 - alpha) * embedding
combined_feats = [
    tf.add(alpha*phs['feats'][step], (1-alpha)*embeds_list[step])
    for step in range(time_steps)
]

# Slice the blended sequence into train / validation / test samples using
# the project helper.
sample_sets = build_train_samples_prediction(
    embeds_list=combined_feats,
    feats=phs['feats'],
    time_steps=time_steps,
    window_size=window_size,
    val_idx=phs['val_idx'],
    test_idx=phs['test_idx'],
)
x_train, y_train, x_val, y_val, x_test, y_test = sample_sets

# Print the static shape of every sample tensor (inputs first, then targets).
for label, tensor in (("x_train:", x_train), ("x_val:", x_val), ("x_test:", x_test),
                      ("y_train:", y_train), ("y_val:", y_val), ("y_test:", y_test)):
    print(label, tensor.shape)
# Bi-directional LSTM predictor, fed with windows of length `window_size`.
lstm_model = lstm.BiLSTM(hidden_size=hidden_size,
                         seq_len=window_size,
                         holders=phs)

# Select the current mini-batch from the training samples and predict it.
batch_rows = phs['sample_idx']
x_input_seq = tf.gather(x_train, batch_rows)
y_input_seq_real = tf.gather(y_train, batch_rows)
y_input_seq_pred = lstm_model(input_seq=x_input_seq)


def _mse_and_abs_diff(targets, predictions):
    """Return (mean squared error, absolute difference) loss tensors."""
    mse = tf.losses.mean_squared_error(targets, predictions)
    abs_diff = tf.losses.absolute_difference(targets, predictions)
    return mse, abs_diff


with tf.name_scope('optimizer'):
    # Training metrics on the sampled mini-batch.
    print(y_input_seq_real.shape)
    train_mse, train_absolute_diff = _mse_and_abs_diff(y_input_seq_real, y_input_seq_pred)

    # Validation metrics on the full validation split.
    val_input_seq_pred = lstm_model(input_seq=x_val)
    val_mse, val_absolute_diff = _mse_and_abs_diff(y_val, val_input_seq_pred)

    # Test metrics on the full test split.
    test_input_seq_pred = lstm_model(input_seq=x_test)
    test_mse, test_absolute_diff = _mse_and_abs_diff(y_test, test_input_seq_pred)

    # Only the training MSE is minimised.
    optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)
    opt_op = optimizer.minimize(train_mse)

# CPU-only session: up to `n_cpus` CPU devices and inter-op threads,
# and 2 threads inside each op.
n_cpus = 8
session_options = dict(
    device_count={"CPU": n_cpus},
    inter_op_parallelism_threads=n_cpus,
    intra_op_parallelism_threads=2,
)
config = tf.ConfigProto(**session_options)
sess = tf.Session(config=config)

# Initialise every variable created so far.
init_op = tf.global_variables_initializer()
sess.run(init_op)

# Entries common to the training and evaluation feeds: the three index
# vectors plus the per-time-step adjacencies, features and shape info.
shared_feed = {
    phs['train_idx']: train_idx,
    phs['val_idx']: val_idx,
    phs['test_idx']: test_idx,
}
for holder_key, per_step_values in (('adjs', adjs),
                                    ('feats', feats),
                                    ('num_features_nonzeros', num_features_nonzeros)):
    shared_feed.update({phs[holder_key][step]: per_step_values[step]
                        for step in range(time_steps)})

# Training feed: the mini-batch indices start as None and are filled in
# before each training step; dropout comes from the flags.
feed_dict = dict(shared_feed)
feed_dict[phs['sample_idx']] = None
feed_dict[phs['dropout_prob']] = FLAGS.dropout_prob

# Evaluation feed: no mini-batch indices, dropout switched off.
feed_dict_val = dict(shared_feed)
feed_dict_val[phs['dropout_prob']] = 0


def get_batch_idx(epoch):
    """Return the training-sample positions for the mini-batch of `epoch`.

    The batch covers FLAGS.batch_size consecutive positions starting at
    ``FLAGS.batch_size * epoch``; positions wrap around modulo
    ``len(train_idx)``.
    """
    batch_size = FLAGS.batch_size
    num_train = len(train_idx)
    first = batch_size * epoch
    return [position % num_train for position in range(first, first + batch_size)]

# 3. run the model

In [None]:
epochs = FLAGS.epochs
save_step = 10
t = time.time()

# Tensors fetched at each stage, grouped once up front.
train_fetches = (opt_op, train_mse, train_absolute_diff)
val_fetches = (val_mse, val_absolute_diff, x_val)
test_fetches = (test_mse, test_absolute_diff, y_test, test_input_seq_pred)

test_MSEs, test_ADs = [], []
# NOTE(review): variables are not re-initialised between the 20 outer
# iterations, so the collected test metrics come from one continuously
# trained model rather than from independent runs -- confirm this is intended.
for k in range(20):
    for epoch in range(epochs):
        # One optimisation step on the mini-batch for this epoch ...
        batch_samples = get_batch_idx(epoch)
        feed_dict[phs['sample_idx']] = batch_samples
        _, train_MSE, train_AD = sess.run(train_fetches, feed_dict=feed_dict)
        # ... followed by a validation pass with dropout disabled.
        val_MSE, val_AD, x_val_ = sess.run(val_fetches, feed_dict=feed_dict_val)

    # After `epochs` steps, evaluate on the test split and keep the targets
    # and predictions for plotting.
    test_results = sess.run(test_fetches, feed_dict=feed_dict_val)
    test_MSE, test_AD, missing_actual_, missing_predicted_ = test_results
    print("-------test_MSE=", "{:.5f}".format(test_MSE),
          "test_AD=", "{:.5f}".format(test_AD))

    test_MSEs.append(float(test_MSE))
    test_ADs.append(float(test_AD))

# Mean and sample standard deviation of the collected test metrics.
average_MSE, stdev_MSE = statistics.mean(test_MSEs), statistics.stdev(test_MSEs)
average_AD, stdev_AD = statistics.mean(test_ADs), statistics.stdev(test_ADs)
summary = (average_MSE, stdev_MSE, average_AD, stdev_AD)
print('average_MSE=%f, stdev_MSE=%f, average_AD=%f, stdev_AD=%f' % summary)

In [None]:
import matplotlib.ticker as plticker
from matplotlib import colors
import matplotlib.pyplot as plt

protein_names = np.array(protein_names)

# Names of the test-split entries, in the order given by `test_idx`.
# Assumes the rows of the plotted matrices follow the same order -- confirm
# against `build_train_samples_prediction`.
protein_names_test = np.array(protein_names[test_idx])

# `DivergingNorm` was renamed `TwoSlopeNorm` in newer Matplotlib releases;
# prefer the new name and fall back to the old one on older installations.
_zero_centred_norm = getattr(colors, 'TwoSlopeNorm', None) or colors.DivergingNorm

# Label every `tick_step`-th row. Tick positions and labels come from the
# SAME index array, so each label sits on the row it names. (Previously the
# labels were sampled every 10 rows while the locator placed ticks every 20.)
tick_step = 20
tick_positions = np.arange(0, len(protein_names_test), tick_step)
show_protein_names = protein_names_test[tick_positions]


def _draw_heatmap(values, subplot_spec, title):
    """Draw `values` as a zero-centred blue/white/red heat map in a new figure.

    Returns the created (figure, axes, image) triple.
    """
    figure = plt.figure(figsize=(3, 6))
    figure.subplots_adjust(hspace=0, wspace=2.4)

    axis = figure.add_subplot(subplot_spec)
    axis.set_aspect('auto')
    image = axis.matshow(values, cmap='bwr', norm=_zero_centred_norm(vcenter=0), aspect="auto")
    figure.colorbar(image, pad=0.2)
    axis.set_title(title, y=-0.1)

    # Explicit tick positions give every axes its own locator instead of one
    # Locator object shared between two axes.
    axis.set_yticks(tick_positions)
    axis.set_yticklabels(show_protein_names, fontsize=11)
    axis.set_xticks([])
    return figure, axis, image


# Test targets and the model's predictions, each in its own figure.
fig, ax1, cax1 = _draw_heatmap(missing_actual_, 121, 'Actual_28h')
fig1, ax2, cax2 = _draw_heatmap(missing_predicted_, 122, 'Imputed_28h')