In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
class Simulator:
    """Generate a synthetic per-gene count table for treatment and control libraries.

    Each library gets a random total read count; that total is spread over the
    genes with random weights. Every gene is also given a guide count and a
    categorical label ("enriched", "depleted", "NTC" or "normal").

    Parameters
    ----------
    num_genes : int
        Number of rows (genes) to simulate.
    avg_num_sgRNAs : float
        Mean of the normal distribution the per-gene guide count is drawn from.
    num_treatment, num_control : int
        Number of treatment / control libraries (count columns).
    min_total, max_total : int
        Range the per-library total is drawn from; ``min_total`` is inclusive,
        ``max_total`` is exclusive (``randint`` semantics).
    total_NTCs : int
        Stored on the instance; not used by any method of this class.
    fraction_enriched, fraction_depleted, fraction_NTC : float
        Share of genes given each label. Each must be non-negative and their
        sum must lie in (0, 1]; the remainder is labelled "normal".
    seed : int or None
        When given, all random draws come from a private
        ``np.random.RandomState(seed)`` so results are reproducible. When
        ``None`` the global ``np.random`` stream is used.

    Raises
    ------
    ValueError
        If a fraction is negative or the fractions do not sum to a value in (0, 1].
    """

    # Slack allowed when checking that the fractions sum to at most 1, so that
    # floating-point noise does not reject a valid input.
    _FRACTION_TOLERANCE = 1e-9

    def __init__(
        self,
        num_genes=200000,
        avg_num_sgRNAs=5,
        num_treatment=2,
        num_control=2,
        min_total=1000,
        max_total=100000,
        total_NTCs=1000,
        fraction_enriched=0.1,
        fraction_depleted=0.1,
        fraction_NTC=0.1,
        seed=None,
    ):
        self.num_genes = num_genes
        self.avg_num_sgRNAs = avg_num_sgRNAs
        self.num_treatment = num_treatment
        self.num_control = num_control
        self.min_total = min_total
        self.max_total = max_total
        self.total_NTCs = total_NTCs

        fractions = (fraction_enriched, fraction_depleted, fraction_NTC)
        if any(fraction < 0.0 for fraction in fractions):
            raise ValueError("Fractions cannot be negative.")

        total_fractions = fraction_enriched + fraction_depleted + fraction_NTC
        # Written as `not (...)` so that a NaN total is rejected as well.
        if not total_fractions > 0.0:
            raise ValueError("Fractions total must be greater than 0.")
        if not total_fractions <= 1.0 + self._FRACTION_TOLERANCE:
            raise ValueError("Fractions total cannot exceed 1.")

        self.fraction_enriched = fraction_enriched
        self.fraction_depleted = fraction_depleted
        self.fraction_NTC = fraction_NTC
        # Clamp so rounding noise in the sum never yields a negative share.
        self.fraction_normal = max(0.0, 1.0 - total_fractions)

        # Unseeded instances keep drawing from numpy's global stream.
        self._rng = np.random if seed is None else np.random.RandomState(seed)

        # One total per library: treatment libraries first, then control libraries.
        self.totals_array = self._rng.randint(
            self.min_total,
            self.max_total,
            size=self.num_treatment + self.num_control,
        )

    def gene(self):
        """Return the gene identifiers ``gene_0`` ... ``gene_{num_genes - 1}``."""
        return [f"gene_{i}" for i in range(self.num_genes)]

    def num_sgRNAs(self):
        """Draw a guide count per gene.

        Counts are drawn from Normal(avg_num_sgRNAs, 1), rounded to whole
        numbers and floored at 1. Returned as a float array of length
        ``num_genes``.
        """
        draws = self._rng.normal(loc=self.avg_num_sgRNAs, scale=1, size=self.num_genes)
        # A rounded draw can be zero or negative; keep at least one guide per gene.
        return np.maximum(np.round(draws), 1.0)

    def sum_array(self, index):
        """Spread ``totals_array[index]`` over the genes with random weights.

        Each entry is rounded individually, so the returned counts sum only
        approximately to the library total.
        """
        weights = self._rng.random_sample(self.num_genes)
        weights /= weights.sum()
        return np.round(weights * self.totals_array[index])

    def setting_treatment_libraries(self):
        """Return one count array per treatment library (totals 0 .. num_treatment - 1)."""
        return [self.sum_array(i) for i in range(self.num_treatment)]

    def setting_control_libraries(self):
        """Return one count array per control library.

        Control totals sit after the treatment totals in ``totals_array``, so
        control ``i`` uses index ``num_treatment + i``.
        """
        return [
            self.sum_array(self.num_treatment + i)
            for i in range(self.num_control)
        ]

    def type_of_change(self):
        """Return one label per gene, grouped as enriched, depleted, NTC, normal.

        Category boundaries are found by rounding the cumulative fractions, and
        "normal" takes whatever remains, so the list always has exactly
        ``num_genes`` entries.
        """
        labelled_fractions = (
            ("enriched", self.fraction_enriched),
            ("depleted", self.fraction_depleted),
            ("NTC", self.fraction_NTC),
        )

        labels = []
        assigned = 0
        cumulative = 0.0
        for label, fraction in labelled_fractions:
            cumulative += fraction
            boundary = min(self.num_genes, int(round(cumulative * self.num_genes)))
            labels += [label] * (boundary - assigned)
            assigned = boundary

        labels += ["normal"] * (self.num_genes - assigned)
        return labels

    def sample(self):
        """Assemble one simulated data set as a DataFrame with ``num_genes`` rows.

        Columns, in order: "gene", "sgRNAs", the treatment count columns
        (labelled 0 .. num_treatment - 1), the control count columns
        (labelled 0 .. num_control - 1) and "type".

        NOTE: treatment and control columns share integer labels, so the result
        has duplicate column names; select those columns by position.
        """
        gene = pd.DataFrame({"gene": self.gene()})
        sgRNAs = pd.DataFrame({"sgRNAs": self.num_sgRNAs()})
        treatment = pd.DataFrame(self.setting_treatment_libraries()).T
        control = pd.DataFrame(self.setting_control_libraries()).T
        type_of_change = pd.DataFrame({"type": self.type_of_change()})

        return pd.concat(
            [gene, sgRNAs, treatment, control, type_of_change],
            axis=1,
            join="inner",
        )

In [4]:
# Demo run: 10,000 genes, three treatment libraries and 70% of genes labelled
# "depleted"; every other argument keeps its default. The table is printed as
# plain text.
trial = Simulator(
    num_genes=10000,
    num_treatment=3,
    fraction_depleted=0.7,
)
simulated_counts = trial.sample()
print(simulated_counts)


           gene  sgRNAs    0    1     2    0     1      type
0        gene_0     5.0  0.0  3.0  16.0  6.0  13.0  enriched
1        gene_1     4.0  5.0  1.0  10.0  3.0  16.0  enriched
2        gene_2     5.0  3.0  3.0   4.0  6.0  13.0  enriched
3        gene_3     4.0  7.0  0.0  14.0  2.0   8.0  enriched
4        gene_4     5.0  5.0  1.0   7.0  3.0  12.0  enriched
...         ...     ...  ...  ...   ...  ...   ...       ...
9995  gene_9995     6.0  4.0  5.0   7.0  6.0   5.0    normal
9996  gene_9996     4.0  3.0  1.0  11.0  0.0   1.0    normal
9997  gene_9997     5.0  5.0  1.0  11.0  1.0  11.0    normal
9998  gene_9998     3.0  3.0  5.0  15.0  6.0  13.0    normal
9999  gene_9999     5.0  6.0  4.0  14.0  6.0   9.0    normal

[10000 rows x 8 columns]
