# Imports

In [141]:
from GENIE3 import *
import sys, os
sys.path.append(os.getcwd())
sys.path.append('/scratch/ab9738/dfdl_imputation/')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
from scipy import stats
import SERGIO.SERGIO.sergio as sergio
from sklearn.metrics import roc_auc_score, roc_curve
from copy import deepcopy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from scipy.stats import pearsonr

# Load Data

In [11]:
# ds1_clean_1 = np.load('../SERGIO/imputation_data/DS1/iterations_seperate/DS6_clean_iter_1.npy')
# ds1_expr_1 = np.load('../SERGIO/imputation_data/DS1/iterations_seperate/DS6_expr_iter_1.npy')
# ds1_clean_2 = np.load('../SERGIO/imputation_data/DS1/iterations_seperate/DS6_clean_iter_2.npy')
# ds1_expr_2 = np.load('../SERGIO/imputation_data/DS1/iterations_seperate/DS6_expr_iter_2.npy')
# ds1_clean_3 = np.load('../SERGIO/imputation_data/DS1/iterations_seperate/DS6_clean_iter_3.npy')
# ds1_expr_3 = np.load('../SERGIO/imputation_data/DS1/iterations_seperate/DS6_expr_iter_3.npy')
# ds1_clean_4 = np.load('../SERGIO/imputation_data/DS1/iterations_seperate/DS6_clean_iter_4.npy')
# ds1_expr_4 = np.load('../SERGIO/imputation_data/DS1/iterations_seperate/DS6_expr_iter_4.npy')
# ds1_clean_5 = np.load('../SERGIO/imputation_data/DS1/iterations_seperate/DS6_clean_iter_5.npy')
# ds1_expr_5 = np.load('../SERGIO/imputation_data/DS1/iterations_seperate/DS6_expr_iter_5.npy')

In [72]:
# Load the three DS2 iterations (0, 1, 2). Every iteration is stored as a pair
# of .npy files: a "clean" array and an "expr" array. Only the "expr" arrays
# are passed on to the noise-generation step later in this notebook.
ds2_clean_0, ds2_expr_0 = (
    np.load('../SERGIO/imputation_data/DS2/DS6_clean_iter_0.npy'),
    np.load('../SERGIO/imputation_data/DS2/DS6_expr_iter_0.npy'),
)
ds2_clean_1, ds2_expr_1 = (
    np.load('../SERGIO/imputation_data/DS2/DS6_clean_iter_1.npy'),
    np.load('../SERGIO/imputation_data/DS2/DS6_expr_iter_1.npy'),
)
ds2_clean_2, ds2_expr_2 = (
    np.load('../SERGIO/imputation_data/DS2/DS6_clean_iter_2.npy'),
    np.load('../SERGIO/imputation_data/DS2/DS6_expr_iter_2.npy'),
)

# Load Simulation

In [73]:
def parse_dataset_name(folder_name):
    """
    Extract the dataset parameters encoded in a SERGIO data-set folder name.

    Two naming layouts are recognised (matched from the start of the name, in
    this order):

    * ``De-noised_<genes>G_<bins>T_<cells>cPerT_dynamics_<dynamics>_DS<id>``
    * ``De-noised_<genes>G_<bins>T_<cells>cPerT_<dynamics>_DS<id>``

    Returns a dict with the integer fields ``number_genes``, ``number_bins``,
    ``number_sc``, ``dynamics`` and ``dataset_id`` plus a ``pattern`` entry
    holding the format-string template of the layout that matched.
    Returns ``None`` when the name follows neither layout.
    """
    # (regular expression, template stored under "pattern") for each layout.
    layouts = (
        (
            r'De-noised_(\d+)G_(\d+)T_(\d+)cPerT_dynamics_(\d+)_DS(\d+)',
            "De-noised_{number_genes}G_{number_bins}T_{number_sc}cPerT_dynamics_{dynamics}_DS{dataset_id}",
        ),
        (
            r'De-noised_(\d+)G_(\d+)T_(\d+)cPerT_(\d+)_DS(\d+)',
            "De-noised_{number_genes}G_{number_bins}T_{number_sc}cPerT_{dynamics}_DS{dataset_id}",
        ),
    )
    # Capture groups appear in this order in both layouts.
    field_names = ('number_genes', 'number_bins', 'number_sc', 'dynamics', 'dataset_id')

    for regex, template in layouts:
        hit = re.match(regex, folder_name)
        if hit is None:
            continue
        info = {key: int(raw) for key, raw in zip(field_names, hit.groups())}
        info["pattern"] = template
        return info
    return None

def get_datasets(data_dir='../SERGIO/data_sets'):
    """
    List the SERGIO data sets found in a directory.

    Parameters
    ----------
    data_dir : str, optional
        Directory whose entries are inspected. Defaults to
        ``'../SERGIO/data_sets'`` (the location previously hard-coded here).

    Returns
    -------
    list of dict
        One ``parse_dataset_name`` result per entry whose name follows a
        recognised layout, sorted by ``dataset_id`` in ascending order.
        Entries that cannot be parsed are ignored.

    Notes
    -----
    ``os.listdir`` returns names in arbitrary order and the sort is stable,
    so the relative order of entries sharing a ``dataset_id`` is unspecified.
    """
    datasets = []
    for folder_name in os.listdir(data_dir):
        dataset_info = parse_dataset_name(folder_name)
        if dataset_info:
            datasets.append(dataset_info)
    return sorted(datasets, key=lambda x: x['dataset_id'])

In [74]:
# Select the second entry (index 1) of the data sets sorted by dataset_id.
# NOTE(review): which folder this resolves to depends on the contents of
# ../SERGIO/data_sets; presumably the 400-gene DS2 set used below - confirm.
data_info = get_datasets()[1]

In [75]:
# Build a SERGIO simulator sized from the selected data set (gene, bin and
# cells-per-bin counts come from the parsed folder name). In this notebook the
# object is only used for its technical-noise helpers (outlier_effect,
# lib_size_effect, dropout_indicator, convert_to_UMIcounts).
# NOTE(review): noise_params, decays, sampling_state and noise_type are
# hard-coded; their meaning is defined by the SERGIO package - confirm the
# values match the ones used to generate the loaded data.
sim = sergio.sergio(
        number_genes=data_info["number_genes"],
        number_bins=data_info["number_bins"], 
        number_sc=data_info["number_sc"],
        noise_params=1,
        decays=0.8, 
        sampling_state=15,
        noise_type='dpd'
    )

# Ground Truth

In [162]:
# Interaction file of the 100-gene DS1 network and an all-zero adjacency
# matrix; the cell below sets gt[regulator, target] = 1 for every listed edge.
# NOTE(review): gt is 100x100 here, while the DS2 pipeline further down
# (substitute_dataset / GENIE3) assumes 400 genes - make sure the ground truth
# matches the data set being scored before running the AUROC cells.
target_file = '../SERGIO/data_sets/De-noised_100G_9T_300cPerT_4_DS1/Interaction_cID_4.txt'
gt = np.zeros((100,100))

In [163]:
# target_file = '../SERGIO/data_sets/De-noised_400G_9T_300cPerT_5_DS2/Interaction_cID_5.txt'
# gt = np.zeros((400,400))  # the DS2 network has 400 genes (the `400G` in the folder name)

In [164]:
# Fill the ground-truth adjacency matrix from the interaction file.
# Each non-empty line is read as comma-separated numbers:
#   field 0            -> target gene index
#   field 1            -> number of regulators
#   fields 2 .. 2+n-1  -> regulator gene indices
# Any further fields on the line are ignored here.
with open(target_file, 'r') as f:
    Lines = f.readlines()

skipped_edges = 0
for line in Lines:
    if not line.strip():
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        continue
    line_list = line.split(',')
    target_index = int(float(line_list[0]))
    num_regs = int(float(line_list[1]))
    for i in range(num_regs):
        try:
            reg_index = int(float(line_list[i+2]))
            gt[reg_index,target_index] = 1
        except (ValueError, IndexError):
            # Best effort, as before: an unparsable/missing field or an index
            # outside gt is skipped - but counted, so that a wrongly sized gt
            # does not silently lose edges.
            skipped_edges += 1

if skipped_edges:
    print(f"Warning: skipped {skipped_edges} regulator entries from {target_file} "
          f"(unparsable field or index outside gt with shape {gt.shape})")

# Generate Noisy Data

In [78]:
def get_noisy_expr(sim, expr, percentile=45):
    """
    Run `expr` through the simulator's technical-noise steps, in order:
    outlier genes, library-size effect, dropouts, conversion to UMI counts.

    Parameters
    ----------
    sim : object providing ``outlier_effect``, ``lib_size_effect``,
        ``dropout_indicator`` and ``convert_to_UMIcounts``.
    expr : expression data handed to the first noise step.
    percentile : forwarded to ``sim.dropout_indicator`` (default 45).

    Returns
    -------
    Whatever ``sim.convert_to_UMIcounts`` returns for the noisy expression.
    """
    # Step 1: outlier genes.
    with_outliers = sim.outlier_effect(expr, outlier_prob=0.01, mean=5, scale=1)

    # Step 2: library-size effect; only the second returned item is used.
    _, with_lib_size = sim.lib_size_effect(with_outliers, mean=4.5, scale=0.7)

    # Step 3: dropouts - multiply by the binary keep/drop indicator.
    keep_mask = sim.dropout_indicator(with_lib_size, shape=8, percentile=percentile)
    with_dropouts = np.multiply(keep_mask, with_lib_size)

    # Step 4: convert to UMI counts.
    return sim.convert_to_UMIcounts(with_dropouts)

In [12]:
# ds1_noisy_1 = np.concatenate(get_noisy_expr(sim, ds1_expr_1), axis=1)
# ds1_noisy_2 = np.concatenate(get_noisy_expr(sim, ds1_expr_2), axis=1)
# ds1_noisy_3 = np.concatenate(get_noisy_expr(sim, ds1_expr_3), axis=1)
# ds1_noisy_4 = np.concatenate(get_noisy_expr(sim, ds1_expr_4), axis=1)
# ds1_noisy_5 = np.concatenate(get_noisy_expr(sim, ds1_expr_5), axis=1)

In [115]:
# Add technical noise to each DS2 iteration (in order 0, 1, 2) and join the
# pieces returned by get_noisy_expr along axis 1 into a single 2-D matrix.
ds2_noisy_0, ds2_noisy_1, ds2_noisy_2 = (
    np.concatenate(get_noisy_expr(sim, clean_expr), axis=1)
    for clean_expr in (ds2_expr_0, ds2_expr_1, ds2_expr_2)
)

# Denoising by Substitution

In [13]:
# def substitute_dataset(ds1):
#     ds1[ds1 == 0] = np.nan
#     for i in range(9):
#         ds1_cell_type = ds1[:,i*300:(i+1)*300]
#         mean_array = np.nanmean(ds1_cell_type, axis=1)
#         var_array = np.nanvar(ds1_cell_type, axis=1)
#         for j in range(100):
#             ds1_cell_type[j,:] = np.random.normal(loc=mean_array[j],scale=np.sqrt(var_array[j]),size=300)
#     ds1[ds1<0] = 0.0
#     np.nan_to_num(ds1,copy=False)
#     return(ds1)

In [131]:
def substitute_dataset(ds2, n_cell_types=9, cells_per_type=300):
    """
    Replace each gene's values, cell type by cell type, with samples from a
    normal distribution fitted to that gene's non-zero values.

    The columns of `ds2` are processed in consecutive blocks of
    `cells_per_type` columns (one block per cell type). Within a block, for
    every row (gene), the mean and variance of the non-zero entries are
    computed and the WHOLE row of the block - not only its zeros - is
    overwritten with normal samples. Negative samples are clipped to 0 and
    remaining NaNs are set to 0.

    Parameters
    ----------
    ds2 : numpy.ndarray, 2-D, floating dtype
        Rows are treated as genes, columns as cells. The number of genes is
        taken from ``ds2.shape[0]`` (it used to be fixed at 400, which raised
        an IndexError for smaller matrices such as a 100-gene data set).
        The array is MODIFIED IN PLACE; pass a copy to keep the original.
        An integer array cannot hold the intermediate NaNs.
    n_cell_types : int, optional
        Number of column blocks to process (default 9).
    cells_per_type : int, optional
        Number of columns in each block (default 300).

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in.

    Notes
    -----
    Sampling uses the global ``np.random`` state, so results are only
    reproducible if a seed was set beforehand. A gene with no non-zero value
    in a block makes ``np.nanmean`` / ``np.nanvar`` emit RuntimeWarnings;
    such rows presumably end up as all zeros after the final NaN
    replacement - verify if this matters.
    """
    n_genes = ds2.shape[0]

    # Zeros are treated as missing so that the nan-aware statistics skip them.
    ds2[ds2 == 0] = np.nan
    for i in range(n_cell_types):
        # Basic slicing returns a view: writing to it updates ds2 itself.
        ds2_cell_type = ds2[:, i*cells_per_type:(i+1)*cells_per_type]
        mean_array = np.nanmean(ds2_cell_type, axis=1)
        var_array = np.nanvar(ds2_cell_type, axis=1)
        # Avoid a degenerate (zero-width) distribution for genes whose
        # non-zero values are all identical.
        var_array[var_array == 0] = 0.05
        for j in range(n_genes):
            ds2_cell_type[j, :] = np.random.normal(
                loc=mean_array[j], scale=np.sqrt(var_array[j]), size=cells_per_type
            )
    ds2[ds2 < 0] = 0.0
    np.nan_to_num(ds2, copy=False, nan=0.0)
    return ds2

In [129]:
# ds1_substitute_1 = substitute_dataset(ds1_noisy_1.astype(float32))
# ds1_substitute_2 = substitute_dataset(ds1_noisy_2.astype(float32))
# ds1_substitute_3 = substitute_dataset(ds1_noisy_3.astype(float32))
# ds1_substitute_4 = substitute_dataset(ds1_noisy_4.astype(float32))
# ds1_substitute_5 = substitute_dataset(ds1_noisy_5.astype(float32))

In [132]:
# Denoise each noisy DS2 matrix by substitution. astype() returns a float32
# copy, so the in-place edits made by substitute_dataset do not touch the
# ds2_noisy_* arrays. The dtype is spelled np.float32: a bare `float32` is only
# defined if the star import at the top of the notebook happens to export it.
ds2_substitute_0 = substitute_dataset(ds2_noisy_0.astype(np.float32))
ds2_substitute_1 = substitute_dataset(ds2_noisy_1.astype(np.float32))
ds2_substitute_2 = substitute_dataset(ds2_noisy_2.astype(np.float32))

  mean_array = np.nanmean(ds2_cell_type, axis=1)
  var_array = np.nanvar(ds2_cell_type, axis=1)


# Generating VIMs

In [83]:
# VIM_1 = GENIE3(np.transpose(ds1_substitute_1), nthreads=80, ntrees=100, regulators='all',\
#                         gene_names=[str(s) for s in range(np.transpose(ds1_substitute_1).shape[1])])
# VIM_2 = GENIE3(np.transpose(ds1_substitute_2), nthreads=80, ntrees=100, regulators='all',\
#                         gene_names=[str(s) for s in range(np.transpose(ds1_substitute_2).shape[1])])
# VIM_3 = GENIE3(np.transpose(ds1_substitute_3), nthreads=80, ntrees=100, regulators='all',\
#                         gene_names=[str(s) for s in range(np.transpose(ds1_substitute_3).shape[1])])
# VIM_4 = GENIE3(np.transpose(ds1_substitute_4), nthreads=80, ntrees=100, regulators='all',\
#                         gene_names=[str(s) for s in range(np.transpose(ds1_substitute_4).shape[1])])
# VIM_5 = GENIE3(np.transpose(ds1_substitute_5), nthreads=80, ntrees=100, regulators='all',\
#                         gene_names=[str(s) for s in range(np.transpose(ds1_substitute_5).shape[1])])

In [135]:
# Run GENIE3 on every substituted DS2 matrix. The matrices are transposed
# before the call, gene names are the column indices of the transposed matrix
# as strings, and every gene is allowed as a regulator (regulators='all').
# Each call is kept as its own statement so that results already computed
# survive if a later call fails; in the recorded run each call took roughly
# 8-11 minutes on 80 threads (see the output below).
# NOTE(review): nthreads=80 is hard-coded for the original machine.
VIM_0 = GENIE3(np.transpose(ds2_substitute_0), nthreads=80, ntrees=100, regulators='all',\
                        gene_names=[str(s) for s in range(np.transpose(ds2_substitute_0).shape[1])])
VIM_1 = GENIE3(np.transpose(ds2_substitute_1), nthreads=80, ntrees=100, regulators='all',\
                        gene_names=[str(s) for s in range(np.transpose(ds2_substitute_1).shape[1])])
VIM_2 = GENIE3(np.transpose(ds2_substitute_2), nthreads=80, ntrees=100, regulators='all',\
                        gene_names=[str(s) for s in range(np.transpose(ds2_substitute_2).shape[1])])

Tree method: RF
K: sqrt
Number of trees: 100


running jobs on 80 threads
Elapsed time: 637.32 seconds
Tree method: RF
K: sqrt
Number of trees: 100


running jobs on 80 threads
Elapsed time: 495.68 seconds
Tree method: RF
K: sqrt
Number of trees: 100


running jobs on 80 threads
Elapsed time: 621.66 seconds


# Aggregation Boosting

In [136]:
# roc_auc_score(gt.flatten(), (VIM_1+VIM_2+VIM_3+VIM_4+VIM_5).flatten())

In [137]:
# AUROC of the aggregated network: the three importance matrices are summed
# element-wise and scored against the flattened ground-truth adjacency matrix.
# NOTE(review): gt and the VIMs must have the same number of entries; gt is
# created as 100x100 in the Ground Truth section while substitute_dataset
# assumes 400 genes - confirm the shapes agree on a fresh kernel.
roc_auc_score(gt.flatten(), (VIM_0+VIM_1+VIM_2).flatten())

0.5781322286053313

In [67]:
# Recovery of the AUROC gap: (score - lower baseline) / (upper baseline - lower baseline).
# The result is a fraction (about 0.60), not a percentage.
# NOTE(review): all three numbers are hard-coded. 0.588 does not match the
# 0.578 shown by the aggregated-AUROC cell above, and 0.433 / 0.690 are
# presumably baseline AUROCs from other runs - confirm and compute them here.
(0.588-0.433)/(0.690-0.433)

0.603112840466926

In [140]:
# AUROC of a single importance matrix (VIM_2 only), for comparison with the
# aggregated score computed from VIM_0 + VIM_1 + VIM_2.
roc_auc_score(gt.flatten(), (VIM_2).flatten())

0.5364386121186775

# Pearson Correlation

In [144]:
def get_pearson_correlation(ds):
    """
    Pearson correlation between every pair of rows of `ds`.

    `ds` is used as given (no transpose): each row is one variable and each
    column one observation - in this notebook, presumably genes x cells.

    Returns an (n_rows, n_rows) matrix whose entry [a, b] is the correlation
    between row a and row b. A row with no variance has a zero norm; the
    affected denominators are replaced by 1, so every correlation involving
    such a row (including with itself) comes out as 0 instead of NaN.
    """
    # Centre every row on its own mean.
    centered = ds - np.mean(ds, axis=1, keepdims=True)

    # Denominator: product of the centred rows' Euclidean norms, pair by pair.
    row_norms = np.linalg.norm(centered, axis=1, keepdims=True)
    norm_products = np.dot(row_norms, row_norms.T)
    norm_products[norm_products == 0] = 1  # guard against division by zero

    # Numerator: dot products between centred rows.
    return np.dot(centered, centered.T) / norm_products

In [157]:
# Score the substituted DS1 matrices with Pearson correlation instead of GENIE3.
# The VIM_* names are reused, so the GENIE3 results held in VIM_1 and VIM_2
# are overwritten here.
# NOTE(review): in the part of the notebook visible here, ds1_substitute_1..5
# are only assigned in commented-out cells, so this cell raises NameError on a
# fresh kernel unless the DS1 cells are re-enabled.
VIM_1 = get_pearson_correlation(ds1_substitute_1)
VIM_2 = get_pearson_correlation(ds1_substitute_2)
VIM_3 = get_pearson_correlation(ds1_substitute_3)
VIM_4 = get_pearson_correlation(ds1_substitute_4)
VIM_5 = get_pearson_correlation(ds1_substitute_5)

In [173]:
# AUROC of the summed Pearson correlation matrices against the ground truth.
# NOTE(review): the raw (signed) correlations are used as scores and the
# diagonal (self-correlation) entries are included - confirm this is intended.
roc_auc_score(gt.flatten(), (VIM_1+VIM_2+VIM_3+VIM_4+VIM_5).flatten())

0.5350571886453445