In [1]:
from GENIE3 import *
import sys, os
sys.path.append(os.getcwd())
sys.path.append('/scratch/ab9738/dfdl_imputation/')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
from scipy import stats
import SERGIO.SERGIO.sergio as sergio
from sklearn.metrics import roc_auc_score
from copy import deepcopy

In [2]:
# Load two pre-generated .npy arrays (iteration 0) from the DS1 imputation folder.
# NOTE(review): the directory is DS1 but the file names start with DS6 — confirm
# that these are the intended files.
ds1_clean = np.load('../SERGIO/imputation_data/DS1/DS6_clean_iter_0.npy')
ds1_expr = np.load('../SERGIO/imputation_data/DS1/DS6_expr_iter_0.npy')

# Load Simulation

In [3]:
def parse_dataset_name(folder_name):
    """Extract dataset parameters from a ``De-noised_...`` folder name.

    Two naming schemes are recognised; they differ only in whether the fourth
    number is preceded by ``dynamics_``. Matching is anchored at the start of
    ``folder_name`` only, so trailing characters are tolerated.

    Returns
    -------
    dict or None
        On a match: integer entries ``number_genes``, ``number_bins``,
        ``number_sc``, ``dynamics`` and ``dataset_id``, plus ``pattern``, a
        format template for the matched naming scheme. ``None`` when the name
        follows neither scheme.
    """
    # (regex, format template) pairs, tried in order; the first match wins.
    naming_schemes = (
        (
            r'De-noised_(\d+)G_(\d+)T_(\d+)cPerT_dynamics_(\d+)_DS(\d+)',
            "De-noised_{number_genes}G_{number_bins}T_{number_sc}cPerT_dynamics_{dynamics}_DS{dataset_id}",
        ),
        (
            r'De-noised_(\d+)G_(\d+)T_(\d+)cPerT_(\d+)_DS(\d+)',
            "De-noised_{number_genes}G_{number_bins}T_{number_sc}cPerT_{dynamics}_DS{dataset_id}",
        ),
    )
    # Capture groups appear in exactly this order in both regexes.
    field_names = ('number_genes', 'number_bins', 'number_sc', 'dynamics', 'dataset_id')

    for regex, template in naming_schemes:
        found = re.match(regex, folder_name)
        if found is None:
            continue
        info = {field: int(text) for field, text in zip(field_names, found.groups())}
        info["pattern"] = template
        return info
    return None

def get_datasets(data_dir='../SERGIO/data_sets'):
    """List the datasets found in ``data_dir``, ordered by dataset id.

    Parameters
    ----------
    data_dir : str, optional
        Directory whose entries are dataset folders. Defaults to the path
        that was previously hard-coded, so existing calls are unaffected.

    Returns
    -------
    list of dict
        One ``parse_dataset_name`` result per recognised entry, sorted by
        ascending ``dataset_id``. Entries that ``parse_dataset_name`` does
        not recognise are skipped.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` if ``data_dir`` cannot be listed.
    """
    parsed = (parse_dataset_name(entry) for entry in os.listdir(data_dir))
    recognised = (info for info in parsed if info)
    return sorted(recognised, key=lambda info: info['dataset_id'])

In [5]:
# Use the dataset with the smallest dataset_id (get_datasets sorts ascending).
# Raises IndexError if no dataset folder was recognised.
data_info = get_datasets()[0]

In [6]:
# Build the simulator with the dimensions parsed from the dataset folder name;
# the remaining arguments are fixed values for this notebook.
sim = sergio.sergio(
    number_genes=data_info["number_genes"],
    number_bins=data_info["number_bins"],
    number_sc=data_info["number_sc"],
    noise_params=1,
    decays=0.8,
    sampling_state=15,
    noise_type='dpd',
)

# Ground Truth

In [7]:
# Interaction file of the 100-gene DS1 dataset and an all-zero 100x100
# adjacency matrix; the parsing cell below sets gt[regulator, target] = 1.
target_file = '../SERGIO/data_sets/De-noised_100G_9T_300cPerT_4_DS1/Interaction_cID_4.txt'
gt = np.zeros((100,100))

In [8]:
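# Alternative dataset (DS2, 400 genes per the folder name).
# NOTE(review): gt would presumably need to be 400x400 for this file, not 100x100.
# target_file = '../SERGIO/data_sets/De-noised_400G_9T_300cPerT_5_DS2/Interaction_cID_5.txt'
# gt = np.zeros((100,100))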

In [9]:
# Fill gt from the interaction file. Each line is comma-separated:
#   field 0            -> target gene index
#   field 1            -> number of regulators (num_regs)
#   fields 2..1+num_regs -> regulator gene indices
# Numbers may be written as floats, hence int(float(...)).
with open(target_file, 'r') as f:  # closes the file even if reading fails
    Lines = f.readlines()

for line in Lines:
    if not line.strip():
        # Tolerate blank lines instead of failing on float('').
        continue
    line_list = line.split(',')
    target_index = int(float(line_list[0]))
    num_regs = int(float(line_list[1]))
    for i in range(num_regs):
        try:
            reg_index = int(float(line_list[i+2]))
            gt[reg_index,target_index] = 1
        except (ValueError, IndexError):
            # Best-effort: skip a regulator entry that is missing, not a
            # number, or outside gt. Unlike a bare `except`, this no longer
            # hides KeyboardInterrupt or unrelated programming errors.
            continue

# Generate Noisy Data

In [10]:
def get_noisy_expr(sim, expr, percentile=45):
    """Run ``expr`` through the simulator's noise steps and return UMI counts.

    The steps are applied in this order: outlier genes, library-size effect,
    dropouts, conversion to UMI counts.

    Parameters
    ----------
    sim : object
        Must provide ``outlier_effect``, ``lib_size_effect``,
        ``dropout_indicator`` and ``convert_to_UMIcounts``.
    expr : array-like
        Expression data handed to ``sim.outlier_effect``.
    percentile : int, optional
        Forwarded to ``sim.dropout_indicator`` (default 45).

    Returns
    -------
    The result of ``sim.convert_to_UMIcounts`` on the dropout-masked data.
    """
    # 1. Add outlier genes.
    with_outliers = sim.outlier_effect(expr, outlier_prob=0.01, mean=5, scale=1)

    # 2. Add the library-size effect (the first return value is not used here).
    _, with_lib_size = sim.lib_size_effect(with_outliers, mean=4.5, scale=0.7)

    # 3. Add dropouts by multiplying element-wise with the indicator.
    keep_mask = sim.dropout_indicator(with_lib_size, shape=8, percentile=percentile)
    with_dropouts = np.multiply(keep_mask, with_lib_size)

    # 4. Convert to UMI counts.
    return sim.convert_to_UMIcounts(with_dropouts)

In [12]:
# Five noisy replicates of the same expression data. Each replicate is the
# output of get_noisy_expr joined along axis 1.
# NOTE(review): no random seed is set in the visible cells, so these are
# presumably not reproducible across kernel restarts.
(ds1_noisy_1, ds1_noisy_2, ds1_noisy_3, ds1_noisy_4, ds1_noisy_5) = [
    np.concatenate(get_noisy_expr(sim, ds1_expr), axis=1)
    for _ in range(5)
]

In [43]:
# Stack the five noisy replicates side by side (along axis 1).
ds1_noisy = np.concatenate(
    [ds1_noisy_1, ds1_noisy_2, ds1_noisy_3, ds1_noisy_4, ds1_noisy_5],
    axis=1,
)

In [13]:
def substitute_dataset(ds1, n_cell_types=9, cells_per_type=300):
    """Replace each cell-type block with Gaussian samples fitted to its non-zero values.

    Zeros are treated as missing. For every gene (row) and every cell type
    (a block of ``cells_per_type`` consecutive columns) the mean and variance
    of the non-zero entries are computed, and the whole block row is
    overwritten with draws from ``Normal(mean, sqrt(variance))``. Negative
    draws are clipped to 0. A gene with no non-zero entry in a block ends up
    as all zeros in that block.

    The number of genes is taken from ``ds1.shape[0]``; it used to be fixed
    at 100, which failed for smaller inputs and left extra rows of larger
    inputs unsubstituted.

    Parameters
    ----------
    ds1 : numpy.ndarray
        2-D array, genes x cells, of a floating dtype (NaN is written into
        it). **Modified in place.** Columns beyond
        ``n_cell_types * cells_per_type`` are not resampled.
    n_cell_types : int, optional
        Number of consecutive column blocks to process (default 9).
    cells_per_type : int, optional
        Number of columns per block (default 300).

    Returns
    -------
    numpy.ndarray
        The same array object as ``ds1``.

    Notes
    -----
    Draws come from the global ``np.random`` state, one call per gene per
    block, in the same order as before.
    """
    import warnings

    n_genes = ds1.shape[0]

    ds1[ds1 == 0] = np.nan
    for i in range(n_cell_types):
        # Basic slicing returns a view, so writes below land in ds1 itself.
        ds1_cell_type = ds1[:, i * cells_per_type:(i + 1) * cells_per_type]
        with warnings.catch_warnings():
            # Rows with no observed value yield NaN statistics by design (they
            # become 0 at the end); silence numpy's "empty slice" /
            # "degrees of freedom" RuntimeWarnings for exactly these two calls.
            warnings.simplefilter('ignore', category=RuntimeWarning)
            mean_array = np.nanmean(ds1_cell_type, axis=1)
            var_array = np.nanvar(ds1_cell_type, axis=1)
        block_width = ds1_cell_type.shape[1]
        for j in range(n_genes):
            ds1_cell_type[j, :] = np.random.normal(
                loc=mean_array[j],
                scale=np.sqrt(var_array[j]),
                size=block_width,
            )
    ds1[ds1 < 0] = 0.0
    # NaN left over from all-missing rows (or untouched columns) becomes 0.
    np.nan_to_num(ds1, copy=False)
    return ds1

In [47]:
# Substitute each noisy replicate. astype() makes a float32 copy, so the
# in-place edits done by substitute_dataset do not touch the ds1_noisy_* arrays.
# np.float32 is spelled out: the bare name `float32` was only available as a
# side effect of the wildcard `from GENIE3 import *`.
(
    ds1_substitute_1,
    ds1_substitute_2,
    ds1_substitute_3,
    ds1_substitute_4,
    ds1_substitute_5,
) = [
    substitute_dataset(noisy.astype(np.float32))
    for noisy in (ds1_noisy_1, ds1_noisy_2, ds1_noisy_3, ds1_noisy_4, ds1_noisy_5)
]

  mean_array = np.nanmean(ds1_cell_type, axis=1)
  var_array = np.nanvar(ds1_cell_type, axis=1)


In [48]:
# Stack the five substituted replicates side by side (along axis 1).
ds1_substitute = np.concatenate(
    [
        ds1_substitute_1,
        ds1_substitute_2,
        ds1_substitute_3,
        ds1_substitute_4,
        ds1_substitute_5,
    ],
    axis=1,
)

In [23]:
# Run GENIE3 once per substituted replicate. The input is transposed before
# being passed in, and genes are named "0", "1", ... by row index.
(VIM_1, VIM_2, VIM_3, VIM_4, VIM_5) = [
    GENIE3(
        substituted.T,
        nthreads=80,
        ntrees=100,
        regulators='all',
        gene_names=[str(gene) for gene in range(substituted.shape[0])],
    )
    for substituted in (
        ds1_substitute_1,
        ds1_substitute_2,
        ds1_substitute_3,
        ds1_substitute_4,
        ds1_substitute_5,
    )
]

Tree method: RF
K: sqrt
Number of trees: 100


running jobs on 80 threads
Elapsed time: 67.64 seconds
Tree method: RF
K: sqrt
Number of trees: 100


running jobs on 80 threads
Elapsed time: 84.82 seconds
Tree method: RF
K: sqrt
Number of trees: 100


running jobs on 80 threads
Elapsed time: 69.46 seconds
Tree method: RF
K: sqrt
Number of trees: 100


running jobs on 80 threads
Elapsed time: 75.47 seconds
Tree method: RF
K: sqrt
Number of trees: 100


running jobs on 80 threads
Elapsed time: 67.65 seconds


In [37]:
# AUROC of the fifth replicate's scores against the ground truth, over every
# entry of the flattened matrices (diagonal included).
roc_auc_score(gt.ravel(), VIM_5.ravel())

0.538589405101224

In [36]:
# AUROC of the summed scores of replicates 2, 3 and 5.
# NOTE(review): replicates 1 and 4 are left out — confirm this is intentional.
roc_auc_score(gt.ravel(), (VIM_2 + VIM_3 + VIM_5).ravel())

0.5355119446049154

In [44]:
# GENIE3 on the concatenated noisy data (no substitution).
# Note: this rebinds VIM_1, which earlier held the result for ds1_substitute_1.
VIM_1 = GENIE3(
    ds1_noisy.T,
    nthreads=80,
    ntrees=100,
    regulators='all',
    gene_names=[str(gene) for gene in range(ds1_noisy.shape[0])],
)

Tree method: RF
K: sqrt
Number of trees: 100


running jobs on 80 threads
Elapsed time: 77.90 seconds


In [46]:
# AUROC for the concatenated noisy data (VIM_1 as rebound in the cell above).
roc_auc_score(gt.ravel(), VIM_1.ravel())

0.4988370501576329

In [49]:
# GENIE3 on the concatenated substituted data.
# Note: this rebinds VIM_2, which earlier held the result for ds1_substitute_2.
VIM_2 = GENIE3(
    ds1_substitute.T,
    nthreads=80,
    ntrees=100,
    regulators='all',
    gene_names=[str(gene) for gene in range(ds1_substitute.shape[0])],
)

Tree method: RF
K: sqrt
Number of trees: 100


running jobs on 80 threads
Elapsed time: 431.39 seconds


In [50]:
# AUROC for the concatenated substituted data (VIM_2 as rebound in the cell above).
roc_auc_score(gt.ravel(), VIM_2.ravel())

0.5008064657305776