In [1]:
from GENIE3 import *
import sys, os
sys.path.append(os.getcwd())
sys.path.append('/scratch/ab9738/dfdl_imputation/')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
from scipy import stats
import SERGIO.SERGIO.sergio as sergio
from sklearn.metrics import roc_auc_score
from copy import deepcopy

In [123]:
# mean_range = [0,20]
# std_range = [0,5]

In [6]:
def lib_size_effect(scData, mean=4.5, scale=0.7):
    """Rescale every cell so its total expression equals a random library size.

    One library size per (bin, cell) is drawn from a log-normal distribution,
    and each cell's expression column is multiplied by
    ``library_size / column_total`` so the column sums to that library size.

    Parameters
    ----------
    scData : array-like, shape (n_bins, n_genes, n_cells)
        Expression values. The dimensions are read from the input, so any
        3-D shape works (the notebook uses 9 x 100 x 300).
    mean : float
        Mean of the underlying normal distribution of the log-normal draw.
    scale : float
        Standard deviation (sigma) of the underlying normal distribution.

    Returns
    -------
    libFactors : ndarray, shape (n_bins, n_cells)
        The sampled library sizes.
    scaled : ndarray, shape (n_bins, n_genes, n_cells)
        The rescaled expression values.

    Raises
    ------
    ValueError
        If ``scData`` is not 3-dimensional.
    """
    expr = np.asarray(scData, dtype=float)
    if expr.ndim != 3:
        raise ValueError(
            "scData must have shape (n_bins, n_genes, n_cells); got %r" % (expr.shape,)
        )
    n_bins, _, n_cells = expr.shape

    libFactors = np.random.lognormal(mean=mean, sigma=scale, size=(n_bins, n_cells))

    # Total expression of each cell, summed over genes.
    cell_totals = expr.sum(axis=1)

    # A cell with zero total expression cannot be rescaled; give it a factor
    # of 0 so it stays all-zero instead of turning into NaN/inf (0 / 0).
    scale_per_cell = np.zeros_like(libFactors)
    np.divide(libFactors, cell_totals, out=scale_per_cell, where=cell_totals != 0)

    # Broadcast the per-cell factor across the gene axis.
    scaled = expr * scale_per_cell[:, np.newaxis, :]

    return libFactors, scaled

In [7]:
def dropout_indicator(scData, shape = 1, percentile = 45):
    """Draw a random 0/1 mask whose success probability grows with expression.

    The input is log-transformed as ``log(x + 1)``. Each entry's probability
    of drawing a 1 is a logistic function of its log-expression, centred on
    the requested percentile of all log-expression values and with steepness
    ``shape``.

    Parameters
    ----------
    scData : array-like
        Expression values.
    shape : float
        Steepness of the logistic curve.
    percentile : float
        Percentile (0-100) of the log-expression values used as the midpoint,
        i.e. the point where the probability of drawing a 1 is 0.5.

    Returns
    -------
    ndarray of int
        Array of 0/1 values with the same shape as ``scData``. The mask is
        NOT applied here; the caller has to multiply it into the data.
    """
    log_expr = np.log(np.asarray(scData) + 1)
    midpoint = np.percentile(log_expr, percentile)

    # Logistic curve: entries above the midpoint get probability > 0.5
    # (for positive ``shape``).
    keep_prob = 1 / (1 + np.exp(-shape * (log_expr - midpoint)))

    return np.random.binomial(n=1, p=keep_prob)

In [8]:
def convert_to_UMIcounts (scData):
    """Sample integer counts from a Poisson distribution with rate ``scData``.

    Each entry of ``scData`` is used as the Poisson rate of the corresponding
    output entry, so the result has the same shape as the input.
    """
    umi_counts = np.random.poisson(lam=scData)
    return umi_counts

In [9]:
# Build a synthetic training set of (observed statistics, true parameters).
#
# For every sample:
#   1. draw a clean per-(cell type, gene) mean/std and normalise each cell
#      type so its gene means sum to 1,
#   2. simulate clean expression for every cell from those parameters,
#   3. add technical noise: library-size scaling, dropout, Poisson counts,
#   4. record the mean/std of the non-zero observed counts as the input `x`
#      and the clean mean/std as the target `y`.

N_SAMPLES = 10000
# These dimensions must match the shapes that lib_size_effect works with.
N_CELL_TYPES, N_GENES, N_CELLS = 9, 100, 300
SEED = 0

# Seed the global NumPy RNG so a Restart & Run All reproduces the same data.
np.random.seed(SEED)

x_train = []
y_train = []
for iters in tqdm(range(N_SAMPLES)):
    # Clean parameters: the std is the mean divided by a factor in [2, 4).
    y_means = np.random.uniform(0, 20, (N_CELL_TYPES, N_GENES))
    y_factors = np.random.uniform(2, 4, (N_CELL_TYPES, N_GENES))
    y_stds = y_means / y_factors

    # Normalise per cell type; the std is scaled by the same constant so the
    # mean/std ratio is unchanged.
    y_sums = np.sum(y_means, axis=1, keepdims=True)
    y_means /= y_sums
    y_stds /= y_sums

    # Clean expression for every cell, with negative draws clipped to zero.
    init_dataset = np.random.normal(
        loc=y_means[:, :, np.newaxis],
        scale=y_stds[:, :, np.newaxis],
        size=(N_CELL_TYPES, N_GENES, N_CELLS),
    )
    init_dataset[init_dataset < 0] = 0.0

    _, new_data = lib_size_effect(init_dataset)

    # dropout_indicator only returns a 0/1 mask; it has to be applied to the
    # expression data. Feeding the mask itself into the Poisson step would
    # discard the expression values entirely.
    binary_ind = dropout_indicator(new_data)
    drop_data = np.multiply(binary_ind, new_data)

    final_data = convert_to_UMIcounts(drop_data).astype(np.float32)

    # Treat zero counts as missing so they are excluded from the statistics.
    final_data[final_data == 0] = np.nan

    # NOTE: a (cell type, gene) pair whose counts are all zero produces NaN
    # here (NumPy emits a RuntimeWarning); filter or impute downstream.
    x_means = np.nanmean(final_data, axis=-1).astype(y_means.dtype)
    x_stds = np.nanstd(final_data, axis=-1).astype(y_means.dtype)

    x = np.stack([x_means, x_stds], axis=0)
    y = np.stack([y_means, y_stds], axis=0)

    x_train.append(x)
    y_train.append(y)

100%|██████████| 10000/10000 [21:30<00:00,  7.75it/s]


In [10]:
# Stack the per-sample arrays collected in the lists into one array each,
# with the sample index as the leading axis.
xtrain, ytrain = (np.array(samples) for samples in (x_train, y_train))

In [11]:
# Sanity check: the input and target arrays should have matching shapes.
print(*(arr.shape for arr in (xtrain, ytrain)))

(10000, 2, 9, 100) (10000, 2, 9, 100)


In [12]:
# Persist both arrays as .npy files in the current working directory.
for out_path, arr in (('./xtrain.npy', xtrain), ('./ytrain.npy', ytrain)):
    np.save(out_path, arr)