# Reproducing the genscore benchmark

In [1]:
# Standard-library, scientific-stack and project modules used by the cells below.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cosmoDA.cosmoDA as go
from sklearn.metrics import confusion_matrix
import simulation.sim_helpers as sim
import pickle

# importlib is used by the estimation cells to reload `go` before each run.
import importlib

# rpy2 is the bridge to R; `importr` loads the R packages bound at the end of this cell.
import rpy2
from rpy2.robjects.packages import importr
from rpy2.robjects import numpy2ri

# Switch on rpy2's global numpy <-> R conversion for the whole session.
# NOTE(review): newer rpy2 releases reportedly deprecate the global activate() in
# favour of local converters - confirm against the rpy2 version in use.
rpy2.robjects.numpy2ri.activate()

import warnings
import anndata as ad
# Silence FutureWarning and anndata's ImplicitModificationWarning session-wide.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=ad.ImplicitModificationWarning)
# Wider numpy reprs: 10 edge items per axis, 180 characters per line.
np.set_printoptions(edgeitems=10,linewidth=180)

# Handles to the R packages `genscore` and `base`.
genscore = importr("genscore")
base = importr("base")

## Data generation - no covariate

In [2]:
# Sample sizes. n1 is the size handed to genscore.gen when the datasets are drawn.
# NOTE(review): n2 appears unused in this part of the notebook - confirm before removing.
n1, n2 = 80, 1000

# Number of components (dimension of the simplex domain and of K1).
p = 100

# Sparsity settings. s1 is forwarded as `spars` to genscore.cov_cons.
# NOTE(review): s2 appears unused in this part of the notebook - confirm before removing.
s1, s2 = 2, 7

# Simplex domain
domain = genscore.make_domain(type="simplex", p=p)

In [3]:
# Ground-truth interaction matrix: the negated "band" matrix from genscore.cov_cons.
K1 = -genscore.cov_cons("band", p, seed=1, spars=s1)

# Replace every diagonal entry by minus the sum of the off-diagonal entries of its
# column, so that each column of K1 sums to zero.
for col in range(p):
    column_total = np.sum(K1[:, col])
    K1[col, col] = -(column_total - K1[col, col])

K1

array([[ 1.        , -0.66666667, -0.33333333, -0.        , -0.        , -0.        , -0.        , -0.        , -0.        , -0.        , ..., -0.        , -0.        ,
        -0.        , -0.        , -0.        , -0.        , -0.        , -0.        , -0.        , -0.        ],
       [-0.66666667,  1.66666667, -0.66666667, -0.33333333, -0.        , -0.        , -0.        , -0.        , -0.        , -0.        , ..., -0.        , -0.        ,
        -0.        , -0.        , -0.        , -0.        , -0.        , -0.        , -0.        , -0.        ],
       [-0.33333333, -0.66666667,  2.        , -0.66666667, -0.33333333, -0.        , -0.        , -0.        , -0.        , -0.        , ..., -0.        , -0.        ,
        -0.        , -0.        , -0.        , -0.        , -0.        , -0.        , -0.        , -0.        ],
       [-0.        , -0.33333333, -0.66666667,  2.        , -0.66666667, -0.33333333, -0.        , -0.        , -0.        , -0.        , ..., -0.        

In [4]:
# Arguments shared by every call to genscore.gen; only the seed changes per dataset.
gen_kwargs = dict(
    setting="log_log_sum0",
    abs=True,
    eta=np.repeat(-1, p),
    K=K1,
    domain=domain,
    finite_infinity=100,
    burn_in=1000,
    thinning=100,
    verbose=False,
    remove_outofbound=True,
)

# Draw 50 datasets of (nominal) size n1, using the dataset index as the seed.
datasets_1 = []
for seed in range(50):
    print(f"Dataset {seed}/50")
    datasets_1.append(genscore.gen(n1, seed=seed, **gen_kwargs))


Dataset 0/50
Dataset 1/50
Dataset 2/50
Dataset 3/50
Dataset 4/50
Dataset 5/50
Dataset 6/50
Dataset 7/50
Dataset 8/50
Dataset 9/50
Dataset 10/50
Dataset 11/50
Dataset 12/50
Dataset 13/50
Dataset 14/50
Dataset 15/50
Dataset 16/50
Dataset 17/50
Dataset 18/50
Dataset 19/50
Dataset 20/50
Dataset 21/50
Dataset 22/50
Dataset 23/50
Dataset 24/50
Dataset 25/50
Dataset 26/50
Dataset 27/50
Dataset 28/50
Dataset 29/50
Dataset 30/50
Dataset 31/50
Dataset 32/50
Dataset 33/50
Dataset 34/50
Dataset 35/50
Dataset 36/50
Dataset 37/50
Dataset 38/50
Dataset 39/50
Dataset 40/50
Dataset 41/50
Dataset 42/50
Dataset 43/50
Dataset 44/50
Dataset 45/50
Dataset 46/50
Dataset 47/50
Dataset 48/50
Dataset 49/50


In [5]:
# Name of this benchmark run; it selects the output folder under data/simulations/K_recovery.
benchmark_name = "K_recovery_n80"
benchmark_dir = f"../../data/simulations/K_recovery/{benchmark_name}"

# exist_ok=True makes the cell idempotent and avoids the check-then-create race of
# a separate os.path.exists() test.
os.makedirs(benchmark_dir, exist_ok=True)

# Persist the generated datasets so later runs can skip the generation step.
with open(f'{benchmark_dir}/datasets_1.pkl', 'wb') as f:
    pickle.dump(datasets_1, f)

In [6]:
# Benchmark folder to read from (repeated here so this cell can run without the save cell).
benchmark_name = "K_recovery_n80"
datasets_path = f'../../data/simulations/K_recovery/{benchmark_name}/datasets_1.pkl'

# Reload the pickled datasets. pickle.load can execute arbitrary code, so this must
# only ever be pointed at files written by this notebook.
with open(datasets_path, 'rb') as datasets_fh:
    datasets_1 = pickle.load(datasets_fh)

## Solutions

In [7]:
# Optimiser settings forwarded to go.estimate.
maxit = 1000
tol = 1e-8

# Settings of the h function forwarded to go.estimate (`mode` / `h_param1`).
h_mode = "pow"
h_param1 = 2

# Regularisation path: nlambda values, evenly spaced on the log scale from 1 down to 1e-6.
# Alternative linear grid: np.linspace(1, 1e-5, nlambda)
nlambda = 100
log_lambda_grid = np.linspace(np.log(1), np.log(1e-6), nlambda)
lambda1s_new = np.exp(log_lambda_grid)

# NOTE(review): a and b appear unused in this part of the notebook - confirm before removing.
a = 0
b = 0

In [9]:
# Re-import cosmoDA so that edits to the module on disk are picked up.
importlib.reload(go)

# The ground-truth support does not depend on the dataset or on lambda, so it is
# computed once here instead of once per (dataset, lambda) pair.
# `offdiag` masks out the p diagonal entries of a flattened p x p matrix.
offdiag = ~np.eye(p, dtype=bool).ravel()
gt = (K1 != 0).ravel()[offdiag]

# Use the number of datasets actually available instead of a hard-coded 50.
n_datasets = len(datasets_1)

results_nocov = []
K_dfs = []
for d in range(n_datasets):
    print(f"Dataset {d}/{n_datasets}")
    # mode=h_mode is passed explicitly so this run uses the same h function as the
    # covariate run, which already forwards it.
    res = go.estimate(datasets_1[d], cov=None, tol=tol, maxit=maxit, centered=False, symmetric="symmetric", scale="", lambda1s=lambda1s_new, mode=h_mode, h_param1=h_param1, BIC_refit=True, return_raw=True, return_elts=False, diagonal_multiplier=None, cv_fold=5, verbose=2)
    results_nocov.append(res)

    # Per-lambda support-recovery counts for this dataset.
    tps = []
    tns = []
    fps = []
    fns = []
    n_nonzeros = []
    for i in range(nlambda):
        est_support = (res["raw_estimates"][i] != 0)
        # Off-diagonal support of the estimate, aligned element-wise with `gt`.
        K_qual = est_support.ravel()[offdiag]
        # Non-zero count of the whole estimate (diagonal included).
        n_nonzeros.append(np.sum(est_support))

        # Fixing the labels guarantees a 2x2 matrix, so the four-way unpacking
        # cannot fail on a degenerate (single-class) input.
        tn, fp, fn, tp = confusion_matrix(gt, K_qual, labels=[False, True]).ravel()
        tns.append(tn)
        fps.append(fp)
        fns.append(fn)
        tps.append(tp)

    K_df_1 = pd.DataFrame({
        "dataset": d,
        "lambda": res["lambda1s"],
        "n_nonzero": n_nonzeros,
        "tn": tns,
        "fp": fps,
        "fn": fns,
        "tp": tps
    })
    K_dfs.append(K_df_1)

    # Checkpoint after every dataset so an interrupted run keeps its finished tables.
    with open(f'../../data/simulations/K_recovery/{benchmark_name}/K_dfs_1.pkl', 'wb') as f:
        pickle.dump(K_dfs, f)




Dataset 0/50
Calculating estimates.
Lambda 0/100
Lambda 10/100
Lambda 20/100
Lambda 30/100
Lambda 40/100
Lambda 50/100
Lambda 60/100
Lambda 70/100
Lambda 80/100
Lambda 90/100
Fold 1/5
Lambda 0/100
Lambda 10/100
Lambda 20/100
Lambda 30/100
Lambda 40/100
Lambda 50/100
Lambda 60/100
Lambda 70/100
Lambda 80/100
Lambda 90/100
Fold 2/5
Lambda 0/100
Lambda 10/100
Lambda 20/100
Lambda 30/100
Lambda 40/100
Lambda 50/100
Lambda 60/100
Lambda 70/100
Lambda 80/100
Lambda 90/100
Fold 3/5
Lambda 0/100
Lambda 10/100
Lambda 20/100
Lambda 30/100
Lambda 40/100
Lambda 50/100
Lambda 60/100
Lambda 70/100
Lambda 80/100
Lambda 90/100
Fold 4/5
Lambda 0/100
Lambda 10/100
Lambda 20/100
Lambda 30/100
Lambda 40/100
Lambda 50/100
Lambda 60/100
Lambda 70/100
Lambda 80/100
Lambda 90/100
Fold 5/5
Lambda 0/100
Lambda 10/100
Lambda 20/100
Lambda 30/100
Lambda 40/100
Lambda 50/100
Lambda 60/100
Lambda 70/100
Lambda 80/100
Lambda 90/100
Done.
Dataset 1/50
Calculating estimates.
Lambda 0/100
Lambda 10/100
Lambda 20/100
La

KeyboardInterrupt: 

In [None]:
# Stack the per-dataset count tables and let sim.get_scores process the combined frame.
K_df_nocov = sim.get_scores(pd.concat(K_dfs))
K_df_nocov

In [None]:
# Output folder of the current benchmark run.
nocov_dir = f'../../data/simulations/K_recovery/{benchmark_name}'

# Estimation results of the no-covariate run, one entry per dataset.
with open(f'{nocov_dir}/results_nocov.pkl', 'wb') as results_fh:
    pickle.dump(results_nocov, results_fh)

# Scored summary table of the no-covariate run.
with open(f'{nocov_dir}/K_df_nocov.pkl', 'wb') as scores_fh:
    pickle.dump(K_df_nocov, scores_fh)

In [None]:
# Re-import cosmoDA so that edits to the module on disk are picked up.
importlib.reload(go)

# Binary covariate: first half of the samples 0, second half 1.
# np.repeat needs an integer repeat count; `n1/2` is a float and is rejected by
# NumPy, so integer division is used.
# NOTE(review): assumes n1 is even and every dataset still has n1 rows after
# genscore.gen(..., remove_outofbound=True) - confirm.
cov = np.repeat((0, 1), n1 // 2)

# The ground-truth support does not depend on the dataset or on lambda, so it is
# computed once here instead of once per (dataset, lambda) pair.
# `offdiag` masks out the p diagonal entries of a flattened p x p matrix.
offdiag = ~np.eye(p, dtype=bool).ravel()
gt = (K1 != 0).ravel()[offdiag]

# Use the number of datasets actually available instead of a hard-coded 50.
n_datasets = len(datasets_1)

results_cov = []
K_dfs_cov = []
for d in range(n_datasets):
    print(f"Dataset {d}/{n_datasets}")
    res = go.estimate(datasets_1[d], cov=cov, tol=tol, maxit=maxit, centered=False, symmetric="symmetric", scale="", lambda1s=lambda1s_new, mode=h_mode, h_param1=h_param1, BIC_refit=True, return_raw=True, return_elts=False, diagonal_multiplier=None, cv_fold=5, verbose=2)
    results_cov.append(res)

    # Per-lambda support-recovery counts for this dataset.
    tps = []
    tns = []
    fps = []
    fns = []
    n_nonzeros = []
    for i in range(nlambda):
        est_support = (res["raw_estimates"][i] != 0)
        # Off-diagonal support of the estimate, aligned element-wise with `gt`.
        K_qual = est_support.ravel()[offdiag]
        # Non-zero count of the whole estimate (diagonal included).
        n_nonzeros.append(np.sum(est_support))

        # Fixing the labels guarantees a 2x2 matrix, so the four-way unpacking
        # cannot fail on a degenerate (single-class) input.
        tn, fp, fn, tp = confusion_matrix(gt, K_qual, labels=[False, True]).ravel()
        tns.append(tn)
        fps.append(fp)
        fns.append(fn)
        tps.append(tp)

    K_df_1 = pd.DataFrame({
        "dataset": d,
        "lambda": res["lambda1s"],
        "n_nonzero": n_nonzeros,
        "tn": tns,
        "fp": fps,
        "fn": fns,
        "tp": tps
    })
    K_dfs_cov.append(K_df_1)

    # Checkpoint after every dataset so an interrupted run keeps its finished tables.
    with open(f'../../data/simulations/K_recovery/{benchmark_name}/K_dfs_cov.pkl', 'wb') as f:
        pickle.dump(K_dfs_cov, f)

Dataset 0/50
Calculating estimates.
Lambda 0/100
Lambda 10/100
Lambda 20/100
Lambda 30/100
Lambda 40/100
Lambda 50/100
Lambda 60/100
Lambda 70/100
Lambda 80/100
Lambda 90/100
Fold 1/5
Lambda 0/100
Lambda 10/100
Lambda 20/100
Lambda 30/100
Lambda 40/100
Lambda 50/100
Lambda 60/100
Lambda 70/100
Lambda 80/100
Lambda 90/100
Fold 2/5
Lambda 0/100
Lambda 10/100
Lambda 20/100
Lambda 30/100
Lambda 40/100
Lambda 50/100
Lambda 60/100
Lambda 70/100
Lambda 80/100
Lambda 90/100
Fold 3/5
Lambda 0/100
Lambda 10/100
Lambda 20/100
Lambda 30/100
Lambda 40/100
Lambda 50/100
Lambda 60/100
Lambda 70/100
Lambda 80/100
Lambda 90/100
Fold 4/5
Lambda 0/100
Lambda 10/100
Lambda 20/100
Lambda 30/100
Lambda 40/100
Lambda 50/100
Lambda 60/100
Lambda 70/100
Lambda 80/100
Lambda 90/100
Fold 5/5
Lambda 0/100
Lambda 10/100
Lambda 20/100
Lambda 30/100
Lambda 40/100
Lambda 50/100
Lambda 60/100
Lambda 70/100
Lambda 80/100
Lambda 90/100
Done.
Dataset 1/50
Calculating estimates.
Lambda 0/100
Lambda 10/100
Lambda 20/100
La

In [None]:
# Stack the per-dataset count tables and let sim.get_scores process the combined frame.
K_df_cov = sim.get_scores(pd.concat(K_dfs_cov))
K_df_cov

In [None]:
# Output folder of the current benchmark run.
cov_dir = f'../../data/simulations/K_recovery/{benchmark_name}'

# Estimation results of the covariate run, one entry per dataset.
with open(f'{cov_dir}/results_cov.pkl', 'wb') as results_fh:
    pickle.dump(results_cov, results_fh)

# Scored summary table of the covariate run.
with open(f'{cov_dir}/K_df_cov.pkl', 'wb') as scores_fh:
    pickle.dump(K_df_cov, scores_fh)