In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import os
import sys
current_dir = os.getcwd()

# Import GENIE3
from GENIE3.GENIE3 import GENIE3
import baselines.scScope.scscope.scscope as scScope
from baselines.SAUCIE.SAUCIE import SAUCIE, Loader
from baselines.MAGIC.magic import magic

# Import imputation methods
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# For deep learning-based imputation (DeepImpute)
# !pip install deepimpute
from baselines.deepimpute.deepimpute.multinet import MultiNet

# For graph convolutional networks
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from scipy.sparse.linalg import bicgstab
import scprep
from sklearn.metrics import mean_squared_error

2024-09-18 20:14:08.267891: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-18 20:14:09.304762: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-18 20:14:10.809903: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-18 20:14:10.810902: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
non-resource variables are not supported in the long term


In [3]:
# Load your dataset
def load_data(
    clean_path='./SERGIO/imputation_data/DS1/iterations_seperate/DS6_clean_iter_0.npy',
    noisy_path='./SERGIO/imputation_data/DS1/iterations_seperate/DS6_45_iter_0.npy',
):
    """Load the clean and the noisy expression matrices as float32 arrays.

    Parameters
    ----------
    clean_path : str or path-like
        ``.npy`` file holding the clean matrix. Defaults to the file this
        notebook was originally hard-wired to.
    noisy_path : str or path-like
        ``.npy`` file holding the noisy matrix. Defaults to the file this
        notebook was originally hard-wired to.

    Returns
    -------
    (ds1_clean, ds1_noisy) : tuple of np.ndarray
        Both arrays cast to ``np.float32``.
        NOTE(review): the rest of the notebook treats them as
        (genes, cells) matrices of identical shape -- confirm for new files.
    """
    ds1_clean = np.load(clean_path).astype(np.float32)
    ds1_noisy = np.load(noisy_path).astype(np.float32)
    return ds1_clean, ds1_noisy

# Load ground truth network
def load_ground_truth(target_file, num_genes):
    """Build the ground-truth regulator -> target matrix from an interaction file.

    Each non-empty line of ``target_file`` is comma separated: the first
    token is the target gene index, the second the number of regulators, and
    the next ``num_regs`` tokens are the regulator indices. Any further
    tokens on the line are ignored. Indices are used as-is (zero-based).

    Parameters
    ----------
    target_file : str or path-like
        Path of the interaction file.
    num_genes : int
        Number of genes; the result is ``(num_genes, num_genes)``.

    Returns
    -------
    np.ndarray
        Matrix with ``gt[regulator, target] == 1`` for every listed
        interaction and 0 elsewhere.

    Raises
    ------
    IndexError
        If a line lists fewer regulators than it announces, or if a gene
        index lies outside ``[0, num_genes)``. Negative indices are rejected
        explicitly because NumPy would otherwise wrap them around silently.
    ValueError
        If a token cannot be parsed as a number.
    """
    gt = np.zeros((num_genes, num_genes))
    with open(target_file, 'r') as f:
        # Stream the file instead of materialising every line up front.
        for line_no, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no interaction.
                continue
            line_list = stripped.split(',')
            target_index = int(float(line_list[0]))
            num_regs = int(float(line_list[1]))
            reg_tokens = line_list[2:2 + num_regs]
            if len(reg_tokens) < num_regs:
                raise IndexError(
                    f"{target_file}, line {line_no}: expected {num_regs} "
                    f"regulators but found {len(reg_tokens)}"
                )
            for token in reg_tokens:
                reg_index = int(float(token))
                if not (0 <= reg_index < num_genes and 0 <= target_index < num_genes):
                    raise IndexError(
                        f"{target_file}, line {line_no}: gene index out of range "
                        f"for {num_genes} genes (regulator {reg_index}, "
                        f"target {target_index})"
                    )
                gt[reg_index, target_index] = 1
    return gt

# Build the adjacency matrix
def build_adjacency_matrix(num_genes, interactions_file, index_base=0):
    """Build a gene-gene adjacency matrix from an interaction file.

    Each non-empty line is comma separated: the first token is a gene index,
    the second the number of linked genes, and the next ``num_targets``
    tokens are the linked gene indices. Any further tokens are ignored.

    The previous implementation subtracted 1 from every index. This notebook
    passes the same file to ``load_ground_truth``, which uses the indices
    unshifted, so the shift turned gene 0 into index -1 and silently wrote
    into the last row/column. Indices are therefore treated as zero-based by
    default; pass ``index_base=1`` for files that really are one-based.

    NOTE(review): the matrix is filled as ``[first_token, listed_gene]``,
    whereas ``load_ground_truth`` fills ``[listed_gene, first_token]`` from
    the same file -- confirm that this edge orientation is the intended one.

    Parameters
    ----------
    num_genes : int
        Number of genes; the result is ``(num_genes, num_genes)``.
    interactions_file : str or path-like
        Path of the interaction file.
    index_base : int, optional
        Value of the smallest gene index used in the file (0 or 1).

    Returns
    -------
    np.ndarray
        Matrix with 1 at ``[gene, linked_gene]`` for each listed pair.

    Raises
    ------
    IndexError
        If a shifted index falls outside ``[0, num_genes)``.
    ValueError
        If a token cannot be parsed as a number.
    """
    adjacency_matrix = np.zeros((num_genes, num_genes))
    with open(interactions_file, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Skip blank lines rather than failing on float('').
                continue
            tokens = stripped.split(',')
            gene = int(float(tokens[0])) - index_base
            num_targets = int(float(tokens[1]))
            targets = [int(float(t)) - index_base for t in tokens[2:2 + num_targets]]
            for target in targets:
                # Reject negatives explicitly: NumPy would wrap them around.
                if not (0 <= gene < num_genes and 0 <= target < num_genes):
                    raise IndexError(
                        f"{interactions_file}, line {line_no}: gene index out of "
                        f"range for {num_genes} genes (got {gene}, {target} after "
                        f"subtracting index_base={index_base})"
                    )
                adjacency_matrix[gene, target] = 1
    return adjacency_matrix

def knn_imputation(ds1, n_neighbors=5, cells_per_type=300):
    """Impute zero entries with a distance-weighted KNN imputer, block by block.

    Zeros are treated as missing. The columns (cells) are split into
    consecutive blocks of ``cells_per_type`` cells and each block is imputed
    independently, using the cells of that block as samples and the genes as
    features.

    By default ``KNNImputer`` drops features that are entirely missing at fit
    time, which made the write-back fail with a shape mismatch whenever a
    gene had no observed value inside a block. Such genes are now excluded
    from the imputer call and end up as 0 in the result.

    Parameters
    ----------
    ds1 : np.ndarray
        Expression matrix of shape (genes, cells). Not modified.
    n_neighbors : int, optional
        Number of neighbouring cells used by ``KNNImputer``.
    cells_per_type : int, optional
        Block width in cells (previously hard-coded to 300).

    Returns
    -------
    np.ndarray
        Imputed matrix, same shape as ``ds1``, with no NaNs and no negative
        values. Cells beyond the last complete block are not imputed; their
        missing entries stay 0.
    """
    if cells_per_type <= 0:
        raise ValueError("cells_per_type must be a positive integer")

    ds1 = np.array(ds1, copy=True)
    if not np.issubdtype(ds1.dtype, np.floating):
        # Integer arrays cannot hold NaN.
        ds1 = ds1.astype(np.float64)
    # Replace zeros with NaN to mark missing values
    ds1[ds1 == 0] = np.nan

    ds1_imputed = np.copy(ds1)
    num_genes, num_cells = ds1.shape
    num_cell_types = num_cells // cells_per_type

    for i in range(num_cell_types):
        start_idx = i * cells_per_type
        end_idx = start_idx + cells_per_type

        # KNNImputer expects (samples, features) == (cells, genes).
        block_T = ds1[:, start_idx:end_idx].T

        # Genes with at least one observed value in this block; the others
        # would be silently dropped by the imputer.
        has_observation = ~np.isnan(block_T).all(axis=0)
        if not has_observation.any():
            continue

        imputer = KNNImputer(n_neighbors=n_neighbors, weights='distance')
        block_imputed_T = imputer.fit_transform(block_T[:, has_observation])

        ds1_imputed[has_observation, start_idx:end_idx] = block_imputed_T.T

    # Whatever is still missing (empty genes, trailing cells) becomes zero.
    ds1_imputed = np.nan_to_num(ds1_imputed)
    ds1_imputed[ds1_imputed < 0] = 0.0

    return ds1_imputed

def iterative_imputation(ds1, cells_per_type=300):
    """Impute zero entries with scikit-learn's IterativeImputer, block by block.

    Zeros are treated as missing. The columns (cells) are split into
    consecutive blocks of ``cells_per_type`` cells and each block is imputed
    independently with ``IterativeImputer(max_iter=10, random_state=0)``,
    using cells as samples and genes as features.

    By default the imputer discards features that are entirely missing at
    fit time, which made the write-back fail with a shape mismatch whenever
    a gene had no observed value inside a block. Such genes are now excluded
    from the imputer call and end up as 0 in the result.

    Parameters
    ----------
    ds1 : np.ndarray
        Expression matrix of shape (genes, cells). Not modified.
    cells_per_type : int, optional
        Block width in cells (previously hard-coded to 300).

    Returns
    -------
    np.ndarray
        Imputed matrix, same shape as ``ds1``, with no NaNs and no negative
        values. Cells beyond the last complete block are not imputed; their
        missing entries stay 0.
    """
    if cells_per_type <= 0:
        raise ValueError("cells_per_type must be a positive integer")

    ds1 = np.array(ds1, copy=True)
    if not np.issubdtype(ds1.dtype, np.floating):
        # Integer arrays cannot hold NaN.
        ds1 = ds1.astype(np.float64)
    ds1[ds1 == 0] = np.nan
    ds1_imputed = np.copy(ds1)
    num_genes, num_cells = ds1.shape
    num_cell_types = num_cells // cells_per_type

    for i in range(num_cell_types):
        start_idx = i * cells_per_type
        end_idx = start_idx + cells_per_type

        # (cells, genes) layout expected by the imputer.
        block_T = ds1[:, start_idx:end_idx].T

        # Only genes observed at least once in this block can be imputed.
        has_observation = ~np.isnan(block_T).all(axis=0)
        if not has_observation.any():
            continue

        imputer = IterativeImputer(max_iter=10, random_state=0)
        block_imputed_T = imputer.fit_transform(block_T[:, has_observation])
        ds1_imputed[has_observation, start_idx:end_idx] = block_imputed_T.T

    # Whatever is still missing (empty genes, trailing cells) becomes zero.
    ds1_imputed = np.nan_to_num(ds1_imputed)
    ds1_imputed[ds1_imputed < 0] = 0.0
    return ds1_imputed

def deep_learning_imputation(ds1):
    """Impute a (genes, cells) matrix with DeepImpute's MultiNet, 300 cells at a time.

    Zeros are first marked as NaN; each complete block of 300 cells is then
    transposed to (cells, genes), its NaNs are set back to 0, and a fresh
    ``MultiNet`` is fitted on and applied to that block. The predictions
    replace the block in the result. Remaining NaN/inf values are zeroed and
    negative values are clipped to 0 before returning.
    """
    block_width = 300  # Adjust cells per type if necessary

    marked = ds1.copy()
    marked[marked == 0] = np.nan
    _, total_cells = marked.shape
    result = np.copy(marked)

    for block_no in range(total_cells // block_width):
        cols = slice(block_no * block_width, block_no * block_width + block_width)

        # DeepImpute works on a cells x genes frame without NaNs.
        frame = pd.DataFrame(marked[:, cols].T).fillna(0)

        net = MultiNet()
        net.fit(frame)
        predicted = net.predict(frame)

        result[:, cols] = predicted.to_numpy().T

    # Replace any NaNs or infs with zeros, then drop negative predictions.
    result = np.nan_to_num(result, nan=0.0, posinf=0.0, neginf=0.0)
    result[result < 0] = 0.0
    return result

def graph_convolutional_imputation(ds1, adjacency_matrix, num_epochs=100, learning_rate=0.01):
    """Impute zero entries with a two-layer GCN trained on the observed entries.

    Genes are the graph nodes and the cells are the node features. The
    network is trained to reproduce the observed (non-zero) entries, and its
    output is then used to fill the missing ones.

    The previous version fed the NaN-marked matrix straight into the network.
    Every layer mixes all features of a node, so one NaN made the whole
    output (and the loss) NaN and nothing was ever learned. Missing entries
    are now presented to the network as 0 and only excluded from the loss.
    The mask is also computed once up front, so ``num_epochs=0`` no longer
    hits an undefined variable.

    Parameters
    ----------
    ds1 : np.ndarray
        Expression matrix of shape (genes, cells). Not modified.
    adjacency_matrix : np.ndarray
        (genes, genes) matrix; each non-zero entry ``[i, j]`` becomes an
        edge ``i -> j`` in ``edge_index``.
    num_epochs : int, optional
        Number of full-batch training steps.
    learning_rate : float, optional
        Adam learning rate.

    Returns
    -------
    np.ndarray
        float32 matrix in which observed entries are kept and missing
        entries are replaced by the network output; negatives are clipped
        to 0.
    """
    ds1 = np.array(ds1, dtype=np.float32)
    num_genes, num_cells = ds1.shape

    # Zeros, and any NaNs already present, are the entries to impute.
    missing = (ds1 == 0) | np.isnan(ds1)

    # Network input / regression target: observed values, missing ones as 0.
    x = torch.tensor(np.where(missing, 0.0, ds1), dtype=torch.float)
    mask = torch.tensor(missing, dtype=torch.bool)
    observed = ~mask

    if not bool(mask.any()) or not bool(observed.any()):
        # Nothing to impute, or nothing to learn from: skip training.
        ds1_imputed = x.numpy()
        ds1_imputed[ds1_imputed < 0] = 0.0
        return ds1_imputed

    # Convert adjacency matrix to edge index, shape (2, num_edges).
    edge_index = torch.tensor(np.array(adjacency_matrix.nonzero()), dtype=torch.long)

    class GCN(torch.nn.Module):
        """Two GCN layers mapping cell features -> hidden -> cell features."""

        def __init__(self, num_features, hidden_channels):
            super(GCN, self).__init__()
            self.conv1 = GCNConv(num_features, hidden_channels)
            self.conv2 = GCNConv(hidden_channels, num_features)

        def forward(self, x, edge_index):
            x = self.conv1(x, edge_index)
            x = F.relu(x)
            x = self.conv2(x, edge_index)
            return x

    num_features = num_cells  # Features are cells
    hidden_channels = 64

    model = GCN(num_features, hidden_channels)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = torch.nn.MSELoss()

    # Training loop: the loss only looks at observed entries.
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        output = model(x, edge_index)
        loss = loss_fn(output[observed], x[observed])
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')

    # Impute missing values; observed values are left untouched.
    model.eval()
    with torch.no_grad():
        imputed = model(x, edge_index)
        x[mask] = imputed[mask]
    ds1_imputed = x.numpy()
    ds1_imputed[ds1_imputed < 0] = 0.0
    return ds1_imputed

def graph_diffusion_imputation(ds1, adjacency_matrix, alpha=0.5, max_iter=100):
    """Fill zero entries by solving a graph-diffusion system for each cell.

    Zeros are marked as missing. The adjacency matrix is row-normalised into
    ``P`` (rows with zero degree stay all-zero) and the system matrix
    ``(1 + 1e-5) * I - alpha * P`` is built once. For every cell that has
    missing genes, the system is solved with BiCGSTAB using the cell's
    expression vector (missing entries as 0) as right-hand side, and only
    the missing entries take the solution's values. A warning is printed
    when the solver reports non-convergence.

    Returns a matrix of the same shape with NaNs replaced by 0 and negative
    values clipped to 0. The input array is not modified.
    """
    marked = ds1.copy()
    marked[marked == 0] = np.nan
    result = np.copy(marked)
    n_genes, n_cells = marked.shape

    # Row-normalise the adjacency matrix; 1/0 -> inf is reset to 0.
    row_degree = np.sum(adjacency_matrix, axis=1)
    with np.errstate(divide='ignore'):
        inv_degree = np.diag(1.0 / row_degree)
        inv_degree[np.isinf(inv_degree)] = 0.0
    transition = np.nan_to_num(inv_degree.dot(adjacency_matrix))

    # Small diagonal shift on top of I - alpha * P.
    regulariser = 1e-5
    system = np.eye(n_genes) - alpha * transition + regulariser * np.eye(n_genes)

    for cell_idx in range(n_cells):
        column = result[:, cell_idx]
        holes = np.isnan(column)
        if not np.any(holes):
            continue  # fully observed cell, nothing to solve

        rhs = np.nan_to_num(column)
        start = np.zeros(n_genes)
        solution, info = bicgstab(system, rhs, x0=start, maxiter=max_iter)
        if info != 0:
            print(f"Warning: BiCGSTAB did not converge for cell {cell_idx}, info: {info}")

        column[holes] = solution[holes]
        result[:, cell_idx] = column

    result = np.nan_to_num(result)
    result[result < 0] = 0.0
    return result

def saucie_imputation(ds1):
    """Replace each complete 300-cell block by its SAUCIE reconstruction.

    The (genes, cells) matrix is processed in consecutive blocks of 300
    cells. For every block the TensorFlow default graph is reset, a new
    SAUCIE model is trained for 1000 steps on the transposed block
    (cells x genes), and the model's reconstruction overwrites the block.
    Zeros are passed to the model unchanged (they are not marked as NaN).
    Negative values are clipped to 0 and NaNs replaced by 0 at the end.
    """
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

    # Number of cells per cell type assumed by this notebook.
    block_width = 300

    source = ds1.copy()
    _, total_cells = source.shape
    reconstructed = np.copy(source)

    for block_no in range(total_cells // block_width):
        first = block_no * block_width
        cols = slice(first, first + block_width)
        cells_by_genes = source[:, cols].T

        # Fresh graph and model for every block.
        tf.reset_default_graph()
        model = SAUCIE(cells_by_genes.shape[1])
        model.train(Loader(cells_by_genes, shuffle=True), steps=1000)

        # Unshuffled loader so the reconstruction keeps the cell order.
        recon_cells_by_genes = model.get_reconstruction(Loader(cells_by_genes, shuffle=False))
        reconstructed[:, cols] = recon_cells_by_genes.T

    reconstructed[reconstructed < 0] = 0.0
    return np.nan_to_num(reconstructed)

def magic_imputation(ds1, cells_per_type=300, min_cells=5):
    """Impute a (genes, cells) matrix with MAGIC, one block of cells at a time.

    For each complete block of ``cells_per_type`` cells, rarely expressed
    genes are left out, the remaining data is library-size normalised and
    square-root transformed with scprep, and MAGIC (``t='auto'``,
    ``n_pca=20``) is run on it.

    The previous version filtered rare genes with
    ``scprep.filter.filter_rare_genes`` and then assigned the MAGIC output
    to the full gene range of the block. As soon as one gene was filtered
    out, the output had fewer genes than the destination and the assignment
    raised a shape error. The filter mask is now computed explicitly and
    only the kept genes are written back; filtered genes keep their input
    values.

    NOTE(review): the values written back are in the normalised, sqrt-
    transformed space, while the pipeline compares them to raw clean data
    for MSE -- confirm whether they should be mapped back first.

    Parameters
    ----------
    ds1 : np.ndarray
        Expression matrix of shape (genes, cells). Not modified.
    cells_per_type : int, optional
        Block width in cells (previously hard-coded to 300).
    min_cells : int, optional
        A gene is kept in a block only if it is non-zero in at least this
        many cells (previously passed to ``filter_rare_genes``).

    Returns
    -------
    np.ndarray
        Matrix of the same shape with negatives clipped to 0 and NaNs set
        to 0. Cells beyond the last complete block are returned unchanged.
    """
    if cells_per_type <= 0:
        raise ValueError("cells_per_type must be a positive integer")

    ds1 = ds1.copy()
    num_genes, num_cells = ds1.shape
    ds1_imputed = np.copy(ds1)
    num_cell_types = num_cells // cells_per_type

    for i in range(num_cell_types):
        start_idx = i * cells_per_type
        end_idx = start_idx + cells_per_type
        ds1_cell_type_T = ds1[:, start_idx:end_idx].T  # (cells, genes)

        # Presumably the same rule filter_rare_genes applies with its default
        # cutoff (count > 0 in at least min_cells cells); computed here so
        # the kept genes are known for the write-back.
        keep_genes = np.sum(ds1_cell_type_T > 0, axis=0) >= min_cells
        if not keep_genes.any():
            # No gene passes the filter: leave this block as it is.
            continue

        ds1_filtered_T = ds1_cell_type_T[:, keep_genes]
        ds1_normalized_T = scprep.normalize.library_size_normalize(ds1_filtered_T)
        ds1_sqrt_T = scprep.transform.sqrt(ds1_normalized_T)
        magic_operator = magic.MAGIC(
            t='auto',
            n_pca=20,
            n_jobs=-1,
        )
        ds1_imputed_T = magic_operator.fit_transform(ds1_sqrt_T)

        # Write back only the genes MAGIC actually saw.
        ds1_imputed[keep_genes, start_idx:end_idx] = np.asarray(ds1_imputed_T).T

    ds1_imputed[ds1_imputed < 0] = 0.0
    ds1_imputed = np.nan_to_num(ds1_imputed)
    return ds1_imputed

def scscope_imputation(ds1, cells_per_type=300):
    """Replace each complete block of cells by its scScope reconstruction.

    The (genes, cells) matrix is processed in consecutive blocks of
    ``cells_per_type`` cells. For every block a scScope model with latent
    dimension 15 is trained on the transposed block (cells x genes) and the
    second value returned by ``scScope.predict`` overwrites the block.

    The previous version replaced zeros by NaN before training. NaNs fed to
    a neural network propagate through the weights and the loss, so the
    reconstruction came back as NaN and was zeroed at the end. Zeros are now
    passed through unchanged, as in the SAUCIE and MAGIC wrappers of this
    notebook; ``use_mask=True`` is kept.
    NOTE(review): presumably ``use_mask=True`` makes scScope exclude zero
    entries from its loss -- confirm against the scScope documentation.

    Parameters
    ----------
    ds1 : np.ndarray
        Expression matrix of shape (genes, cells). Not modified.
    cells_per_type : int, optional
        Block width in cells (previously hard-coded to 300).

    Returns
    -------
    np.ndarray
        Matrix of the same shape with negatives clipped to 0 and NaNs set
        to 0. Cells beyond the last complete block are returned unchanged.
    """
    if cells_per_type <= 0:
        raise ValueError("cells_per_type must be a positive integer")

    ds1 = ds1.copy()
    num_genes, num_cells = ds1.shape
    ds1_imputed = np.copy(ds1)
    num_cell_types = num_cells // cells_per_type

    for i in range(num_cell_types):
        start_idx = i * cells_per_type
        end_idx = start_idx + cells_per_type
        ds1_cell_type_T = ds1[:, start_idx:end_idx].T  # (cells, genes)
        DI_model = scScope.train(
            ds1_cell_type_T,
            15,
            use_mask=True,
            batch_size=64,
            max_epoch=1000,
            epoch_per_check=100,
            T=2,
            exp_batch_idx_input=[],
            encoder_layers=[],
            decoder_layers=[],
            learning_rate=0.0001,
            beta1=0.05,
            num_gpus=1)
        # Only the second of the three returned values is used.
        _, rec_ds1_cell_type_T, _ = scScope.predict(ds1_cell_type_T, DI_model)
        ds1_imputed[:, start_idx:end_idx] = rec_ds1_cell_type_T.T

    ds1_imputed[ds1_imputed < 0] = 0.0
    ds1_imputed = np.nan_to_num(ds1_imputed)
    return ds1_imputed

def run_pipeline(imputation_method, method_name, ds1_noisy, ds1_clean, gt, adjacency_matrix=None):
    """Impute the noisy data with one method and score the result.

    The imputer is called as ``imputation_method(ds1_noisy)``, or as
    ``imputation_method(ds1_noisy, adjacency_matrix)`` when ``method_name``
    contains the substring 'Graph'. Two scores are printed and returned:

    * the MSE between the flattened clean and imputed matrices;
    * the ROC AUC of the flattened GENIE3 output (computed on the
      transposed imputed matrix) against the flattened ``gt``.

    Returns
    -------
    (roc_auc, mse) : tuple of float
    """
    print(f"Running {method_name}...")

    # Graph-based imputers are recognised by name and get the adjacency matrix.
    extra_args = (adjacency_matrix,) if 'Graph' in method_name else ()
    imputed = imputation_method(ds1_noisy, *extra_args)

    # Imputation quality against the clean reference.
    mse = mean_squared_error(ds1_clean.flatten(), imputed.flatten())
    print(f"MSE after {method_name}: {mse:.4f}")

    # Network inference on the transposed matrix, scored against gt.
    imputed_T = imputed.T
    gene_labels = [str(idx) for idx in range(imputed_T.shape[1])]
    importance = GENIE3(
        imputed_T,
        nthreads=80,
        ntrees=100,
        regulators='all',
        gene_names=gene_labels,
    )
    roc_auc = roc_auc_score(gt.flatten(), importance.flatten())
    print(f"ROC AUC Score after {method_name}: {roc_auc:.4f}\n")

    return roc_auc, mse

In [5]:
# Inputs: expression matrices, ground-truth network and gene adjacency.
ds1_clean, ds1_noisy = load_data()
num_genes = ds1_noisy.shape[0]
target_file = './SERGIO/data_sets/De-noised_100G_9T_300cPerT_4_DS1/Interaction_cID_4.txt'
gt = load_ground_truth(target_file, num_genes)
adjacency_matrix = build_adjacency_matrix(num_genes, target_file)

# Baseline 1: GENIE3 on the clean data.
print("Evaluating Clean Data...")
ds1_clean_T = ds1_clean.T
clean_gene_names = [str(s) for s in range(ds1_clean_T.shape[1])]
VIM_clean = GENIE3(ds1_clean_T, nthreads=80, ntrees=100, regulators='all',
                   gene_names=clean_gene_names)
roc_auc_clean = roc_auc_score(gt.flatten(), VIM_clean.flatten())
print(f"ROC AUC Score for Clean Data: {roc_auc_clean:.4f}\n")

# Baseline 2: GENIE3 on the noisy data, without any imputation.
print("Evaluating Noisy Data...")
ds1_noisy_T = ds1_noisy.T
noisy_gene_names = [str(s) for s in range(ds1_noisy_T.shape[1])]
VIM_noisy = GENIE3(ds1_noisy_T, nthreads=80, ntrees=100, regulators='all',
                   gene_names=noisy_gene_names)
roc_auc_noisy = roc_auc_score(gt.flatten(), VIM_noisy.flatten())
print(f"ROC AUC Score for Noisy Data: {roc_auc_noisy:.4f}\n")

# Reference error of the noisy data itself.
mse_noisy = mean_squared_error(ds1_clean.flatten(), ds1_noisy.flatten())
print(f"MSE between Noisy Data and Clean Data: {mse_noisy:.4f}\n")

# Imputation methods to compare, keyed by display name. Names containing
# 'Graph' mark the methods that also take the adjacency matrix.
methods = {
    'KNN Imputation': knn_imputation,
    'Iterative Imputation': iterative_imputation,
    'DeepImpute': deep_learning_imputation,
    'Graph Convolutional Network Imputation': graph_convolutional_imputation,
    'Graph Diffusion Imputation': graph_diffusion_imputation,
    'SAUCIE': saucie_imputation,
    'MAGIC': magic_imputation,
    'scScope': scscope_imputation
}

# Run every method; a failing method is reported and skipped so that the
# remaining ones still run.
results = {}
mse_results = {}
for method_name, method_func in methods.items():
    graph_args = (adjacency_matrix,) if 'Graph' in method_name else ()
    try:
        roc_auc, mse = run_pipeline(method_func, method_name, ds1_noisy, ds1_clean, gt, *graph_args)
    except Exception as e:
        print(f"An error occurred with {method_name}: {e}\n")
    else:
        results[method_name] = roc_auc
        mse_results[method_name] = mse

# Summary tables.
print("Summary of ROC AUC Scores:")
print(f"Clean Data: {roc_auc_clean:.4f}")
print(f"Noisy Data: {roc_auc_noisy:.4f}")
for method_name, roc_auc in results.items():
    print(f"{method_name}: {roc_auc:.4f}")

print("\nSummary of MSE Values:")
print(f"Noisy Data: {mse_noisy:.4f}")
for method_name, mse in mse_results.items():
    print(f"{method_name}: {mse:.4f}")

Evaluating Clean Data...
Tree method: RF
K: sqrt
Number of trees: 100


running jobs on 80 threads
Elapsed time: 117.49 seconds
ROC AUC Score for Clean Data: 0.6816

Evaluating Noisy Data...
Tree method: RF
K: sqrt
Number of trees: 100


running jobs on 80 threads
Elapsed time: 24.34 seconds
ROC AUC Score for Noisy Data: 0.4306

MSE between Noisy Data and Clean Data: 126.0642

Summary of ROC AUC Scores:
Clean Data: 0.6816
Noisy Data: 0.4306

Summary of MSE Values:
Noisy Data: 126.0642
