## Model with simpler data and TensorFlow

In [1]:
%matplotlib inline
import tensorflow as tf
import numpy as np
import re
import logging 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
import os
import pickle
import pandas as pd

  from ._conv import register_converters as _register_converters


In [2]:
# Logger named after the current module; the helper functions in this notebook log through it.
lg = logging.getLogger(__name__)

## Generating some toy data to play around with

In [3]:
def sim(n, p, null_prop):
    """Simulate a response driven by a sparse linear signal plus noise.

    Parameters
    ----------
    n : int
        Number of samples (rows of the predictor matrix).
    p : int
        Number of predictors (columns of the predictor matrix).
    null_prop : float
        Proportion of predictors without effect; ``int(p * null_prop)``
        randomly chosen coefficients are set to zero.

    Returns
    -------
    y : numpy.ndarray
        Length-``n`` response: the scaled linear predictor plus N(0, 1) noise.
    pred : numpy.ndarray
        ``(n, p)`` matrix whose columns are independent N(0, 1) draws.
    """
    lg.info('n: %s, p: %s', n, p)

    # Draw the predictors one column at a time, then glue them together.
    columns = [np.random.normal(0, 1, n) for _ in range(p)]
    pred = np.column_stack(columns)

    # Effect sizes are normal draws with standard deviation 0.1**2.
    beta = np.random.normal(0, 0.1**2, p)
    n_null = int(p * null_prop)
    zeroed = np.random.choice(range(p), n_null, replace=False)
    beta[zeroed] = 0

    signal = scale(pred.dot(beta.T))
    # scale() is applied a second time before the noise is added.
    y = scale(signal) + np.random.normal(0, 1, n)
    return y, pred

In [4]:
# n = 10000
# p = 10000
# null_prop = 0.99
# y, pred = sim(n, p, null_prop)
# alpha_values = np.arange(0.001, 0.01, 0.001)
# lg.info('Number of alphas: %s', alpha_values)
# x_train, x_test, y_train, y_test = train_test_split(pred, y,
#                                                     test_size=0.1,
#                                                     random_state=42)
# n_train = x_train.shape[0]
# y_train = y_train.reshape(n_train, 1)
# del pred
# num_blocks = 4
# blocks = [[k for k in range(l)] for l in np.repeat(int(p/num_blocks), num_blocks)]

In [5]:
lg = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)

# Full paths of every entry in the genotype batch folder.
batch_folder = '../data/sample_major_1kg/'
batch_files = os.listdir(batch_folder)
batch_files = [os.path.join(batch_folder, k) for k in batch_files]
lg.info('There are %s file present', len(batch_files))

# LD block definitions. The context manager closes the file handle as soon as
# the pickle has been read instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('../data/sim_1000G_chr10.ld_blocks.pickel', 'rb') as ld_blocks_file:
    blocks = pickle.load(ld_blocks_file)
blocks = blocks[10]  # keep only the entry stored under 10 (presumably chromosome 10 -- TODO confirm)
lg.info('There are %s LD blocks to process', len(blocks))

# Subject table; column 'V3' is used as the phenotype vector.
fam = pd.read_table('../data/sim_1000G_chr10.txt')
pheno = fam['V3'].values
# First 100 phenotype values as a (100, 1) column vector.
sub_pheno = pheno[0:100]
sub_pheno = sub_pheno.reshape(100, 1)
lg.info('There are %s subjects', len(pheno))

# Variant table without a header row; column 1 is used as the SNP list and
# its length defines p, the number of model inputs.
bim = pd.read_table('../data/sim_1000G_chr10.bim', header=None)
snps = bim[1].values
p = len(snps)
lg.info('There are %s SNPs', len(snps))

INFO:__main__:There are 25 file present
INFO:__main__:There are 85 LD blocks to process
INFO:__main__:There are 2504 subjects
INFO:__main__:There are 405378 SNPs


In [6]:
def make_block_id(blocks, p):
    """Build one boolean mask of length ``p`` per block.

    Blocks are laid out back to back: the first block covers positions
    ``0 .. len(blocks[0]) - 1``, the second starts right after it, and so on.
    Only the length of each block is used, not its contents.

    Parameters
    ----------
    blocks : iterable of sized objects
        One entry per block; ``len(entry)`` is the block width.
    p : int
        Length of every mask.

    Returns
    -------
    list of numpy.ndarray
        Boolean masks, one per block, in the order of ``blocks``. Positions
        beyond ``p`` are clipped by the slice assignment.
    """
    masks = []
    start = 0
    for idx, block in enumerate(blocks):
        stop = start + len(block)
        block_mask = np.zeros(p, dtype=bool)
        block_mask[start:stop] = True
        masks.append(block_mask)
        start = stop  # the next block begins where this one ended
        if idx % 10 == 0:
            lg.debug('Processing LD block %s', idx)
    return masks

# def make_block_id(snps, blocks):
#     output = list()
#     u = 0
#     for i, b in enumerate(blocks):
#         nn = len(b)
#         mask = np.zeros(len(snps), dtype=bool)
#         mask[u:(u+nn)] = True
#         u+=nn
#         output.append(mask)
#         if i % 10 == 0:
#             lg.debug('Processing LD block %s', i)
#     return output

In [7]:
# One boolean mask of length p per LD block, produced by make_block_id.
bool_blocks = make_block_id(blocks, p)

DEBUG:__main__:Processing LD block 0
DEBUG:__main__:Processing LD block 10
DEBUG:__main__:Processing LD block 20
DEBUG:__main__:Processing LD block 30
DEBUG:__main__:Processing LD block 40
DEBUG:__main__:Processing LD block 50
DEBUG:__main__:Processing LD block 60
DEBUG:__main__:Processing LD block 70
DEBUG:__main__:Processing LD block 80


In [8]:
def n_iterator(X, y, chunk_size=100):
    """Yield consecutive, non-overlapping row batches of ``X`` and ``y``.

    Rows ``0 .. chunk_size - 1`` form the first batch, the next
    ``chunk_size`` rows the second, and so on. Every batch has exactly
    ``chunk_size`` rows; trailing rows that do not fill a complete batch are
    not yielded.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array of shape ``(n, p)``.
    y : numpy.ndarray
        Targets aligned with the rows of ``X``; either length ``n`` or shape
        ``(n, 1)``.
    chunk_size : int, optional
        Number of rows per batch (default 100).

    Yields
    ------
    tuple of numpy.ndarray
        ``(X_batch, y_batch)`` with shapes ``(chunk_size, p)`` and
        ``(chunk_size, 1)``. The batches are slices of the inputs, not copies.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    n, p = X.shape
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')
    # Plain slicing is used on purpose: zipping `chunk_size` references to the
    # same range() object gives zip independent iterators, which produces
    # (0, 0, ...), (1, 1, ...) instead of consecutive groups of row indices.
    for batch in range(n // chunk_size):
        start = batch * chunk_size
        stop = start + chunk_size
        yield (X[start:stop, :].reshape(chunk_size, p),
               y[start:stop].reshape(chunk_size, 1))

In [9]:
# Split the batch files at random: 90% of them (rounded down) are sampled
# without replacement for training, everything else is held out.
dirfold = '../data/sample_major_1kg/'
files = [os.path.join(dirfold, name) for name in os.listdir(dirfold)]
training_files = np.random.choice(files, size=int(len(files) * 0.9), replace=False)
test_files = [path for path in files if path not in training_files]
lg.info('%s files in total, allocating %s for training and %s for validation',
        len(files), len(training_files), len(test_files))

INFO:__main__:25 files in total, allocating 22 for training and 3 for validation


In [19]:
# Show the files that were not sampled for training.
test_files

['../data/sample_major_1kg/sample_major_21.npy',
 '../data/sample_major_1kg/sample_major_0.npy',
 '../data/sample_major_1kg/sample_major_16.npy']

In [20]:
# Show the files that were sampled for training.
training_files

array(['../data/sample_major_1kg/sample_major_8.npy',
       '../data/sample_major_1kg/sample_major_18.npy',
       '../data/sample_major_1kg/sample_major_3.npy',
       '../data/sample_major_1kg/sample_major_15.npy',
       '../data/sample_major_1kg/sample_major_13.npy',
       '../data/sample_major_1kg/sample_major_7.npy',
       '../data/sample_major_1kg/sample_major_17.npy',
       '../data/sample_major_1kg/sample_major_4.npy',
       '../data/sample_major_1kg/sample_major_23.npy',
       '../data/sample_major_1kg/sample_major_9.npy',
       '../data/sample_major_1kg/sample_major_22.npy',
       '../data/sample_major_1kg/sample_major_12.npy',
       '../data/sample_major_1kg/sample_major_14.npy',
       '../data/sample_major_1kg/sample_major_10.npy',
       '../data/sample_major_1kg/sample_major_1.npy',
       '../data/sample_major_1kg/sample_major_5.npy',
       '../data/sample_major_1kg/sample_major_6.npy',
       '../data/sample_major_1kg/sample_major_11.npy',
       '../data/sa

In [10]:
def geno_iterator(paths, y):
    """Yield ``(data, targets)`` batches from saved files in random order.

    Every file is loaded with ``np.load`` and unpacked into a pair
    ``(data, index_vec)``; ``index_vec`` selects the entries of ``y`` that
    belong to the rows of ``data``.

    Parameters
    ----------
    paths : sequence
        Files to load. The sequence itself is left untouched: a shuffled
        copy is iterated, so the caller's ordering is preserved.
    y : numpy.ndarray
        Target vector indexed by each file's ``index_vec``.

    Yields
    ------
    tuple
        ``(data, y_batch)`` where ``y_batch`` is ``y[index_vec]`` reshaped to
        ``(data.shape[0], 1)``.
    """
    order = list(paths)
    np.random.shuffle(order)
    for path in order:
        # The unpacking below implies each file stores a (data, index_vec)
        # pair, presumably as a pickled object array -- current NumPy refuses
        # to load those unless allow_pickle is set. Unpickling can execute
        # arbitrary code, so only feed this function trusted files.
        data, index_vec = np.load(path, allow_pickle=True)
        n_rows = data.shape[0]
        yield data, y[index_vec].reshape(n_rows, 1)

## Define TensorFlow model 

I used placeholders because they are more flexible with regard to batch size.

In [11]:
# Clear the default graph so the model cells below start from an empty graph when re-run.
tf.reset_default_graph()

In [12]:
# Graph inputs. The leading None leaves the batch dimension unspecified, so
# batches of any size can be fed.
Xp = tf.placeholder(tf.float32, [None, p], name='X')  # p columns, p = len(snps)
yp = tf.placeholder(tf.float32, [None, 1], name='y')  # single-column target

##### Define Model

In [13]:
# Static batch dimension of the X placeholder (declared with None).
batch_size = Xp.get_shape()[0]
# Weight initialiser shared by every block layer: normal(0, 0.0001).
rand_norm_init = tf.initializers.random_normal(0, 0.0001)
# Constant all-ones column with one entry per block; multiplying the block
# outputs by it adds them up.
linear_combiner = tf.constant(1.0, shape=[len(bool_blocks), 1])
# define initial linear layer for each block
with tf.variable_scope('Genotypes'):
    collector = list()
    for i, b in enumerate(bool_blocks):
        out_list = list()  # NOTE(review): assigned but never read in this cell
        # L1 penalty attached to this block's dense kernel.
        l1 = tf.contrib.layers.l1_regularizer(scale=0.005, scope=None)
        with tf.variable_scope('LD_block'+ str(i)):
            # Keep only the input columns that belong to block i.
            small_block = tf.boolean_mask(Xp, b, axis=1)
            # Declare the column count explicitly (number of True entries in
            # the mask) -- presumably because boolean_mask leaves it unknown
            # and the dense layer needs it to size its kernel; TODO confirm.
            small_block.set_shape((batch_size, np.sum(b)))
            # One linear unit per block.
            y_ = tf.layers.dense(small_block, 1, kernel_regularizer=l1, kernel_initializer=rand_norm_init)
            collector.append(y_)
            if i % 10 == 0:
                lg.debug('generated variables for LD block #%s', i)
        
# define neural layers
# Concatenate the per-block outputs side by side: one column per block.
collection = tf.concat(collector, name='prediction_matrix', axis=1)
# n1 = tf.layers.dense(collection, 85, name='n1', kernel_initializer=rand_norm_init)
# n2 = tf.layers.dense(n1, 40, name='n2', kernel_initializer=rand_norm_init)
# n3 = tf.layers.dense(n2, 20, name='n3', kernel_initializer=rand_norm_init)
# y_hat = tf.layers.dense(n3, 1, name='output_layer', kernel_initializer=rand_norm_init)
# Prediction = sum of the block outputs (matmul with the all-ones column).
y_hat = tf.matmul(collection, linear_combiner, name='combinging_linear')

DEBUG:__main__:generated variables for LD block #0
DEBUG:__main__:generated variables for LD block #10
DEBUG:__main__:generated variables for LD block #20
DEBUG:__main__:generated variables for LD block #30
DEBUG:__main__:generated variables for LD block #40
DEBUG:__main__:generated variables for LD block #50
DEBUG:__main__:generated variables for LD block #60
DEBUG:__main__:generated variables for LD block #70
DEBUG:__main__:generated variables for LD block #80


In [14]:
# Mean squared error between the fed targets (yp) and the model output (y_hat).
mse = tf.losses.mean_squared_error(yp, y_hat)

In [15]:
# optimizer = tf.train.GradientDescentOptimizer(0.0001).minimize(mse)
# Train on the MSE plus the L1 penalties declared through `kernel_regularizer`.
# tf.layers only records those penalties in the graph's regularization-loss
# collection; they affect the gradients only when they are part of the tensor
# handed to minimize().
optimizer = tf.train.AdagradOptimizer(0.001).minimize(
    mse + tf.losses.get_regularization_loss())

In [16]:
from datetime import datetime
# Timestamp suffix so each run writes to its own TensorBoard directories.
now = datetime.now()
now = now.strftime("%Y%m%d-%H%M%S") 
# Sum of the regularization losses registered in the graph.
l1_loss = tf.losses.get_regularization_loss()
# Rebinds the Python name `mse` to a new tensor (previous mse + l1_loss); ops
# created before this line keep referencing the tensor they were built with.
mse += l1_loss
# Streaming Pearson correlation between predictions and targets; the call
# returns a (value, update_op) pair backed by local variables.
accuracy = tf.contrib.metrics.streaming_pearson_correlation(y_hat, yp, name='correlation')

# accuracy[1] is the update op, so evaluating this summary also advances the
# running statistics.
# NOTE(review): those statistics are shared by every sess.run that evaluates
# `merged`, whichever data is fed -- confirm that this is intended.
sum_accu = tf.summary.scalar('Accuracy', accuracy[1])
sum_loss = tf.summary.scalar('Loss', mse)
sum_l1 = tf.summary.scalar('L1_loss', l1_loss)

# Single op that evaluates every summary defined so far.
merged = tf.summary.merge_all()

# Separate event files for training and test curves; only the training writer
# is given the graph definition.
train_writer = tf.summary.FileWriter('tensorboard/neural_network/train'+now, tf.get_default_graph())
test_writer = tf.summary.FileWriter('tensorboard/neural_network/test'+now)

In [17]:
# Load every held-out file once and stack the batches into a single
# validation set (rows are concatenated along axis 0).
re_assemble_testing = True
if re_assemble_testing:
    genoiter = geno_iterator(test_files, pheno)
    x_parts, y_parts = [], []
    for x, y in genoiter:
        x_parts.append(x)
        y_parts.append(y)
    x_test = np.concatenate(x_parts, axis=0)
    y_test = np.concatenate(y_parts, axis=0)

In [18]:
# Global variables hold the model weights; local variables back the streaming
# correlation metric. Both have to be initialised before the first run.
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
with tf.Session() as sess:
    sess.run(init)
    try:
        for i in range(400):
            # One epoch: a fresh iterator visits the training files in a new
            # random order.
            for x, y in geno_iterator(training_files, pheno):
                _, summary = sess.run([optimizer, merged], feed_dict={Xp: x, yp: y})
            # Only the summary of the last batch of the epoch is recorded.
            train_writer.add_summary(summary, i)
            if i % 10 == 0:
                # Every tenth epoch, evaluate the summaries on the test set.
                summary = sess.run(merged, feed_dict={Xp: x_test, yp: y_test})
                test_writer.add_summary(summary, i)
    finally:
        # Push buffered events to disk even when training is interrupted
        # (e.g. KeyboardInterrupt). The writers stay open so the cell can be
        # run again.
        train_writer.flush()
        test_writer.flush()

KeyboardInterrupt: 