## Imports

In [1]:
from efaar_benchmarking.data_loading import *
from efaar_benchmarking.efaar import *
from efaar_benchmarking.constants import *
from efaar_benchmarking.benchmarking import *
import pandas as pd
from matplotlib_venn import venn3
import matplotlib.pyplot as plt
from upsetplotly import UpSetPlotly
import pickle

  self.seed = seed
  self.dl_pin_memory_gpu_training = (
  from .autonotebook import tqdm as notebook_tqdm


## Check overlaps of the benchmark annotation sources (Supp Fig 1).

In [2]:
# For every benchmark annotation source, collect the set of unordered entity pairs it
# lists, then plot how those sets overlap across sources.
benchmark_sources = {}
for src in BENCHMARK_SOURCES:
    res = pd.read_csv(f'../efaar_benchmarking/benchmark_annotations/{src}.txt')
    # Drop self-pairs (entity1 == entity2).
    res = res[res.entity1 != res.entity2]
    # Sort each pair so (A, B) and (B, A) collapse to the same key. Building the set
    # straight from the two columns avoids a row-wise DataFrame.apply and avoids
    # assigning a new column onto a filtered frame (SettingWithCopyWarning).
    benchmark_sources[src] = {tuple(sorted(pair)) for pair in zip(res.entity1, res.entity2)}

# One UpSet sample per annotation source, named after the source.
usp = UpSetPlotly(samples=list(benchmark_sources.values()), sample_names=list(benchmark_sources.keys()))
usp.plot(order_by='decreasing')

# Size of the union of all per-source pair sets.
print(f'The number of unique interacting gene pairs in all {len(BENCHMARK_SOURCES)} sources is {len(set().union(*list(benchmark_sources.values())))}')

143252


## Check the number of total genes and expressed genes (for Table 1).

In [3]:
# Folders holding the pre-aggregation metadata pickles and the expression tables.
res_folder = 'data'
expression_data_folder = 'expression_data'

# PERISCOPE EXPRESSION COUNT
# A perturbed gene is counted as expressed when its value in the HeLa table is non-zero.
metadata = pd.read_pickle(f'{res_folder}/PERISCOPE_pre_agg_metadata.pkl')
all_genes = set(metadata[PERISCOPE_PERT_LABEL_COL])
expr = pd.read_csv(f'{expression_data_folder}/HeLa_expression.csv')
expr.columns = ['gene', 'tpm']
# Keep only the text before the first space of each gene entry.
expr['gene'] = expr['gene'].map(lambda name: name.split(' ')[0])
is_zero_tpm = expr['tpm'] == 0
exp_genes = all_genes & set(expr.loc[~is_zero_tpm, 'gene'])
unexp_genes = all_genes & set(expr.loc[is_zero_tpm, 'gene'])
print('PERISCOPE', len(exp_genes), len(unexp_genes), len(all_genes))

# JUMP EXPRESSION COUNT
# The U2OS table is collapsed to one median zfpkm per gene; a gene is counted as
# expressed when that median is >= -3 and as unexpressed when it is < -3.
metadata = pd.read_pickle(f'{res_folder}/JUMP_pre_agg_metadata.pkl')
all_genes = set(metadata[JUMP_PERT_LABEL_COL])
expr = pd.read_csv(f'{expression_data_folder}/U2OS_expression.csv', index_col=0)
expr = expr.groupby('gene')['zfpkm'].agg('median').reset_index()
exp_genes = all_genes & set(expr.loc[expr['zfpkm'] >= -3, 'gene'])
unexp_genes = all_genes & set(expr.loc[expr['zfpkm'] < -3, 'gene'])
print('JUMP', len(exp_genes), len(unexp_genes), len(all_genes))

## Check unfiltered cell-count features in JUMP (Supp Fig 2).

In [None]:
features_orig, metadata = load_cpg16_crispr()

# One 100-bin histogram per object-count feature, each shown as its own figure.
for column, axis_label in (
    ('Cytoplasm_Number_Object_Number', 'Number of cytoplasm objects'),
    ('Nuclei_Number_Object_Number', 'Number of nuclei objects'),
):
    features_orig[column].hist(bins=100)
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')
    plt.show()

## Check overlaps of identified protein complexes across the four perturbative maps (Section 4.3.4 & Fig 2).

In [None]:
# These pickle files only include expressed genes.
# These pickle files only include expressed genes.
# TODO: the user does not know how to generate these files.
# NOTE(security): pickle.load can execute arbitrary code while deserialising; only load
# map files that come from a trusted source.
with open('data/jump_aggr_tvn128_map.pkl', 'rb') as infile:
    jump_map_data = pickle.load(infile)

with open('data/periscope_aggr_tvn128_map.pkl', 'rb') as infile:
    periscope_map_data = pickle.load(infile)

with open('data/replogle_aggr_tvn128_map.pkl', 'rb') as infile:
    gwps_map_data = pickle.load(infile)

# Number of unique perturbation labels in each map (JUMP, PERISCOPE, GWPS). Printed
# explicitly: a bare expression in the middle of a cell is evaluated but never shown.
print(
    len(jump_map_data.metadata[JUMP_PERT_LABEL_COL].unique()),
    len(periscope_map_data.metadata[PERISCOPE_PERT_LABEL_COL].unique()),
    len(gwps_map_data.metadata[GWPS_PERT_LABEL_COL].unique()),
)

jump_metrics = cluster_benchmark(jump_map_data, pert_col=JUMP_PERT_LABEL_COL)
periscope_metrics = cluster_benchmark(periscope_map_data, pert_col=PERISCOPE_PERT_LABEL_COL)
gwps_metrics = cluster_benchmark(gwps_map_data, pert_col=GWPS_PERT_LABEL_COL)

# A cluster is kept as significant for a map when its ks_pval is at or below this cutoff.
thr = .01
metdict = {'GWPS': gwps_metrics, 'JUMP': jump_metrics, 'PERISCOPE': periscope_metrics}
sigs = {name: set(metrics[metrics.ks_pval <= thr].cluster) for name, metrics in metdict.items()}

# Persist each map's significant clusters as a one-column CSV.
for name, clusters in sigs.items():
    pd.Series(list(clusters)).to_csv(f'data/{name}_significant_clusters.csv', index=False)

# Three-way Venn diagram of the significant-cluster sets, labelled by map name.
venn3(list(sigs.values()), list(sigs.keys()))
plt.title('Overlap of significant clusters')
plt.show()


## Confirm the identified relationships of C18orf21 and C1orf131 in GWPS (Section 4.3.5).

In [None]:
# (heading, query gene, extra keyword arguments) for each lookup in the GWPS map.
# Only the C18orf21 query overrides topx; C1orf131 uses the function's default.
top_similar_queries = (
    ('Top relationships of C18orf21 in GWPS:', 'C18orf21', {'topx': 75}),
    ('Top relationships of C1orf131 in GWPS:', 'C1orf131', {}),
)
for heading, gene, extra_kwargs in top_similar_queries:
    print(heading)
    print(compute_top_similars(gwps_map_data, GWPS_PERT_LABEL_COL, gene, **extra_kwargs))