In [1]:
import numpy as np
import pandas as pd
from SERGIO.sergio import sergio

In [18]:
# Simulator handle configured for the 3-cell-type dataset (5000 genes, 3 bins,
# number_sc=1500). In this part of the notebook it is only used for its
# technical-noise helpers; the clean expression is read from disk below.
sim = sergio(
    number_genes=5000,
    number_bins=3,
    number_sc=1500,
    noise_params=1,
    decays=0.8,
    sampling_state=15,
    noise_type='dpd',
)

# Clean (noise-free) expression matrix, header-less CSV loaded as a 2-D ndarray.
expr_clean = pd.read_csv(
    "./SERGIO_analyais/5000.gene.3ct.v3.clean.csv",
    header=None,
).to_numpy()

In [19]:
# expr_clean

In [20]:
# Cut the 2-D matrix column-wise into 3 equal chunks (one per cell type, i.e.
# number_bins) and stack them along a new leading axis.
per_cell_type = np.split(expr_clean, 3, axis=1)
expr = np.stack(per_cell_type, axis=0)

In [21]:
# Add outlier genes to the clean expression.
# NOTE(review): the exact meaning of outlier_prob/mean/scale is defined by
# SERGIO's outlier_effect -- confirm against its documentation.
expr_O = sim.outlier_effect(
    expr,
    outlier_prob=0.01,
    mean=0.8,
    scale=1,
)

In [22]:
"""
Add Library Size Effect
"""
# lib_size_effect returns two values: the library-size factors and the adjusted
# expression, which feeds every dropout level below.
libFactor, expr_O_L = sim.lib_size_effect(expr_O, mean=8, scale=0.4)

"""
Add Dropouts
"""
# One dropout indicator per level; only `percentile` differs (10, 70, 92).
# The calls are issued in the same order as before so random draws line up.
binary_ind1, binary_ind2, binary_ind3 = (
    sim.dropout_indicator(expr_O_L, shape=6, percentile=pct)
    for pct in (10, 70, 92)
)

# Element-wise product of each indicator with the library-size-adjusted data.
expr_O_L_D1, expr_O_L_D2, expr_O_L_D3 = (
    np.multiply(indicator, expr_O_L)
    for indicator in (binary_ind1, binary_ind2, binary_ind3)
)

In [23]:
# Convert each dropout level to UMI counts (same call order as the original
# three statements).
count_matrix1, count_matrix2, count_matrix3 = (
    sim.convert_to_UMIcounts(dropped)
    for dropped in (expr_O_L_D1, expr_O_L_D2, expr_O_L_D3)
)

# Join the per-cell-type slices side by side (axis=1) into one 2-D matrix each.
count_matrix_2d1, count_matrix_2d2, count_matrix_2d3 = (
    np.concatenate(counts, axis=1)
    for counts in (count_matrix1, count_matrix2, count_matrix3)
)

In [24]:
# Fraction of non-zero entries in each count matrix, one line per dropout level.
# The denominator comes from the array itself instead of a hand-written
# cell_types * genes * cells product, so it cannot drift from the simulator
# settings when this cell is reused for another dataset.
for counts in (count_matrix1, count_matrix2, count_matrix3):
    print(np.count_nonzero(counts) / np.size(counts))

0.34540417777777777
0.2157166222222222
0.09069564444444445


In [25]:
# Write each dropout level to its own comma-delimited CSV. The '% 4d' format
# emits integers padded to width 4 with a leading space for non-negatives.
for out_path, counts_2d in (
    ('./SERGIO_analyais/5000.gene.3ct.v3.dp1.csv', count_matrix_2d1),
    ('./SERGIO_analyais/5000.gene.3ct.v3.dp2.csv', count_matrix_2d2),
    ('./SERGIO_analyais/5000.gene.3ct.v3.dp3.csv', count_matrix_2d3),
):
    np.savetxt(out_path, counts_2d, delimiter=',', fmt='% 4d')

In [93]:
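# count_matrix_2d2
# NOTE: this inspection is commented out, so the array displayed below is stale
# output from an earlier, out-of-order execution (In [93]); clear it or re-run.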

array([[2, 1, 2, ..., 0, 1, 1],
       [1, 4, 2, ..., 0, 0, 1],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 1, ..., 1, 0, 1],
       [0, 1, 2, ..., 0, 1, 0],
       [3, 1, 1, ..., 1, 0, 0]])

In [42]:
# Simulator handle for the 5000-gene / 4-cell-type dataset (number_sc=1000).
# Rebinding `sim` and `expr_clean` here replaces the previous dataset's objects.
sim = sergio(
    number_genes=5000,
    number_bins=4,
    number_sc=1000,
    noise_params=1,
    decays=0.8,
    sampling_state=15,
    noise_type='dpd',
)

# Clean (noise-free) expression matrix, header-less CSV loaded as a 2-D ndarray.
expr_clean = pd.read_csv(
    "./SERGIO_analyais/5000.gene.4ct.v3.dyn.clean.csv",
    header=None,
).to_numpy()

In [43]:
# Sanity check on the loaded matrix: 5000 rows and 4 * 1000 = 4000 columns are
# expected for number_genes=5000, number_bins=4, number_sc=1000.
expr_clean.shape

(5000, 4000)

In [44]:
# Cut the 2-D matrix column-wise into 4 equal chunks (4 is the number of cell
# types, i.e. number_bins) and stack them along a new leading axis.
per_cell_type = np.split(expr_clean, 4, axis=1)
expr = np.stack(per_cell_type, axis=0)

In [45]:
# Add outlier genes to the clean expression.
# NOTE(review): the exact meaning of outlier_prob/mean/scale is defined by
# SERGIO's outlier_effect -- confirm against its documentation.
expr_O = sim.outlier_effect(
    expr,
    outlier_prob=0.01,
    mean=0.8,
    scale=1,
)

In [46]:
"""
Add Library Size Effect
"""
# lib_size_effect returns two values: the library-size factors and the adjusted
# expression, which feeds every dropout level below.
libFactor, expr_O_L = sim.lib_size_effect(expr_O, mean=8, scale=0.4)

"""
Add Dropouts
"""
# One dropout indicator per level; only `percentile` differs (10, 70, 92).
# The calls are issued in the same order as before so random draws line up.
binary_ind1, binary_ind2, binary_ind3 = (
    sim.dropout_indicator(expr_O_L, shape=6, percentile=pct)
    for pct in (10, 70, 92)
)

# Element-wise product of each indicator with the library-size-adjusted data.
expr_O_L_D1, expr_O_L_D2, expr_O_L_D3 = (
    np.multiply(indicator, expr_O_L)
    for indicator in (binary_ind1, binary_ind2, binary_ind3)
)

In [47]:
# Convert each dropout level to UMI counts (same call order as the original
# three statements).
count_matrix1, count_matrix2, count_matrix3 = (
    sim.convert_to_UMIcounts(dropped)
    for dropped in (expr_O_L_D1, expr_O_L_D2, expr_O_L_D3)
)

# Join the per-cell-type slices side by side (axis=1) into one 2-D matrix each.
count_matrix_2d1, count_matrix_2d2, count_matrix_2d3 = (
    np.concatenate(counts, axis=1)
    for counts in (count_matrix1, count_matrix2, count_matrix3)
)

In [48]:
# Fraction of non-zero entries in each count matrix, one line per dropout level.
# The denominator comes from the array itself instead of a hand-written
# cell_types * genes * cells product, so it cannot drift from the simulator
# settings when this cell is reused for another dataset.
for counts in (count_matrix1, count_matrix2, count_matrix3):
    print(np.count_nonzero(counts) / np.size(counts))

0.375133
0.2089336
0.1043842


In [49]:
# Write each dropout level to its own comma-delimited CSV. The '% 4d' format
# emits integers padded to width 4 with a leading space for non-negatives.
for out_path, counts_2d in (
    ('./SERGIO_analyais/5000.gene.4ct.v3.dyn.dp1.csv', count_matrix_2d1),
    ('./SERGIO_analyais/5000.gene.4ct.v3.dyn.dp2.csv', count_matrix_2d2),
    ('./SERGIO_analyais/5000.gene.4ct.v3.dyn.dp3.csv', count_matrix_2d3),
):
    np.savetxt(out_path, counts_2d, delimiter=',', fmt='% 4d')

In [133]:
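# count_matrix_2d1.shape
# NOTE: this check is commented out, so the shape displayed below is stale
# output from an earlier, out-of-order execution (In [133]); clear it or re-run.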

(5000, 4000)

In [66]:
# Simulator handle for the 5000-gene / 6-cell-type dataset (number_sc=800).
# Rebinding `sim` and `expr_clean` here replaces the previous dataset's objects.
sim = sergio(
    number_genes=5000,
    number_bins=6,
    number_sc=800,
    noise_params=1,
    decays=0.8,
    sampling_state=15,
    noise_type='dpd',
)

# Clean (noise-free) expression matrix, header-less CSV loaded as a 2-D ndarray.
expr_clean = pd.read_csv(
    "./SERGIO_analyais/5000.gene.6ct.v3.dyn.clean.csv",
    header=None,
).to_numpy()

In [67]:
# Sanity check on the loaded matrix: 5000 rows and 6 * 800 = 4800 columns are
# expected for number_genes=5000, number_bins=6, number_sc=800.
expr_clean.shape

(5000, 4800)

In [68]:
# Cut the 2-D matrix column-wise into 6 equal chunks (6 is the number of cell
# types, i.e. number_bins) and stack them along a new leading axis.
per_cell_type = np.split(expr_clean, 6, axis=1)
expr = np.stack(per_cell_type, axis=0)

In [69]:
# Add outlier genes to the clean expression.
# NOTE(review): the exact meaning of outlier_prob/mean/scale is defined by
# SERGIO's outlier_effect -- confirm against its documentation.
expr_O = sim.outlier_effect(
    expr,
    outlier_prob=0.01,
    mean=0.8,
    scale=1,
)

In [70]:
"""
Add Library Size Effect
"""
# lib_size_effect returns two values: the library-size factors and the adjusted
# expression, which feeds every dropout level below.
libFactor, expr_O_L = sim.lib_size_effect(expr_O, mean=8, scale=0.4)

"""
Add Dropouts
"""
# One dropout indicator per level; only `percentile` differs (10, 70, 92).
# The calls are issued in the same order as before so random draws line up.
binary_ind1, binary_ind2, binary_ind3 = (
    sim.dropout_indicator(expr_O_L, shape=6, percentile=pct)
    for pct in (10, 70, 92)
)

# Element-wise product of each indicator with the library-size-adjusted data.
expr_O_L_D1, expr_O_L_D2, expr_O_L_D3 = (
    np.multiply(indicator, expr_O_L)
    for indicator in (binary_ind1, binary_ind2, binary_ind3)
)

In [71]:
# Convert each dropout level to UMI counts (same call order as the original
# three statements).
count_matrix1, count_matrix2, count_matrix3 = (
    sim.convert_to_UMIcounts(dropped)
    for dropped in (expr_O_L_D1, expr_O_L_D2, expr_O_L_D3)
)

# Join the per-cell-type slices side by side (axis=1) into one 2-D matrix each.
count_matrix_2d1, count_matrix_2d2, count_matrix_2d3 = (
    np.concatenate(counts, axis=1)
    for counts in (count_matrix1, count_matrix2, count_matrix3)
)

In [72]:
# Fraction of non-zero entries in each count matrix, one line per dropout level.
# The denominator comes from the array itself instead of a hand-written
# cell_types * genes * cells product, so it cannot drift from the simulator
# settings when this cell is reused for another dataset.
for counts in (count_matrix1, count_matrix2, count_matrix3):
    print(np.count_nonzero(counts) / np.size(counts))

0.37450979166666665
0.20912708333333332
0.10330295833333333


In [73]:
# Write each dropout level to its own comma-delimited CSV. The '% 4d' format
# emits integers padded to width 4 with a leading space for non-negatives.
for out_path, counts_2d in (
    ('./SERGIO_analyais/5000.gene.6ct.v3.dyn.dp1.csv', count_matrix_2d1),
    ('./SERGIO_analyais/5000.gene.6ct.v3.dyn.dp2.csv', count_matrix_2d2),
    ('./SERGIO_analyais/5000.gene.6ct.v3.dyn.dp3.csv', count_matrix_2d3),
):
    np.savetxt(out_path, counts_2d, delimiter=',', fmt='% 4d')