In [35]:
## Load results from co-eQTL pipeline and save as .csv for further downstream analyses
## Filter co-eQTL testing files, e.g. all tests for significant gene-pairs, eGenes etc.

# Libraries

In [36]:
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
from matplotlib import pyplot as plt
from scipy.sparse import issparse

In [37]:
import math

In [38]:
import os

In [39]:
from sklearn.metrics import mean_squared_error

In [40]:
import scipy.stats as stats
import numpy as np

In [41]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.utils import Bunch
from sklearn.tree import plot_tree
from sklearn.feature_selection import mutual_info_classif
import pandas as pd 
import numpy as np

In [42]:
import random

In [43]:
import datetime

In [44]:
# import the regressor 
from sklearn.tree import DecisionTreeRegressor 

In [45]:
import seaborn as sns

# Parameters

In [46]:
### which co-eqtl mapping results

In [47]:
coeqtl_results_path = 'co_qtls_decision_tree/'   # path to final mapping results (based on decision tree filter)
coeqtl_results_path_old = 'co_qtls_sceqtlgen/'   # path to initial evaluation phase co-eQTL results

In [48]:
### Path to the co-EQTL Files

In [49]:
data_path = '../data/current/coeqtl_mapping/'

In [None]:
### Path to the eQTL files from sceQTLGen

In [50]:
data_path_eqtl = '../data/current/eqtl_mapping/'

In [51]:
eqtl_results = 'sc_eqtlgen_final'

In [None]:
### Path to store results

In [52]:
result_path = '../results/current/F3/'

In [53]:
result_path_analysis =  "../data/current/coeqtl_mapping/" + coeqtl_results_path + "analysis"

In [None]:
### Path to meta-data files

In [56]:
data_path_meta = '../data/current/meta-data'

In [None]:
### Parameters for determination of significant co-eQTLs

In [54]:
p_value_sign = 0.05

In [57]:
# Sample filter setting. Further down, a value > 0 selects the single-dataset
# result files whose name ends in 'TRUE' and 0 selects the 'FALSE' ones; the
# number itself is also embedded in the meta-analysis input and output file names.
n_sample_filter = 133

In [58]:
# Multiple-testing mode. 'eGene_bf' keeps rows with p_val_mt_eGene < p_q_cutoff;
# any other value (e.g. 'p_val_mt') keeps rows by thresholding the p_val_mt column.
# The value is also appended to several output file names.
mt_correction = 'eGene_bf' # eGene_bf, p_val_mt

In [59]:
# Suffix appended (after '_') to the file name of the single-dataset gene-pair
# test export; empty string means no SNP filter label.
snp_filter = ''

# Data

## Load the significant co-eQTLs

In [None]:
### Load the nominally significant co-eQTLs from each single-cell dataset

In [61]:
### Define cell-types and datasets for which to execute

In [62]:
datasets_to_load = ['oneK1K']

#datasets_to_load = ['oneK1K', 'Franke_v2'  , 'Franke_v3', 'wijst', 'multiome']

In [63]:
cell_types = ['NK', 'CD4_T', 'CD8_T', 'Mono', 'B']

In [64]:
# Load the co-eQTL result table of every dataset / cell-type combination and stack
# them into one DataFrame, tagging each row with its dataset and cell type.

# The file-name suffix depends only on whether a sample filter is active, so it is
# derived once up front instead of on every iteration.
if n_sample_filter > 0:
    n_sample_filter_param = 'TRUE'
elif n_sample_filter == 0:
    n_sample_filter_param = 'FALSE'
else:
    raise ValueError(f"n_sample_filter must be >= 0, got {n_sample_filter}")

# Tables are gathered in a list and concatenated once at the end, which avoids
# re-copying the accumulated frame on every iteration.
coeqtl_frames = []

for i in datasets_to_load:
    for j in cell_types:
        # Alternative file name including the SNP filter:
        # f"{result_path_analysis}_{i}/{j}/F3_Co_E_Genes" + n_sample_filter_param + '_' + snp_filter + ".csv"
        file_path = f"{result_path_analysis}_{i}/{j}/F3_Co_E_Genes" + n_sample_filter_param + ".csv"

        # The existence check has to come before os.path.getmtime, which raises
        # FileNotFoundError on a missing path.
        if not os.path.exists(file_path):
            print(f"Skipping missing file: {file_path}")
            continue

        # Show the modification time so it is visible which pipeline run is loaded.
        print(datetime.datetime.fromtimestamp(os.path.getmtime(file_path)))

        result_file = pd.read_csv(file_path)
        result_file['dataset'] = i
        result_file['cell_type'] = j
        coeqtl_frames.append(result_file)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was found.
coeqtl_results = pd.concat(coeqtl_frames, ignore_index=True) if coeqtl_frames else pd.DataFrame()
del coeqtl_frames

2025-03-14 16:52:36
2025-03-19 11:34:45
2025-03-14 14:48:45
2025-03-14 14:44:43
2025-03-14 15:52:22


In [69]:
### Filter the file based on multiple testing correction method

In [70]:
# eGene-level correction: keep the rows whose eGene-corrected p-value lies below
# the cutoff stored on the same row.
if mt_correction == 'eGene_bf':
    passes_cutoff = coeqtl_results['p_val_mt_eGene'].lt(coeqtl_results['p_q_cutoff'])
    coeqtl_results = coeqtl_results.loc[passes_cutoff]

In [72]:
## Keep only Bonferroni-significant results (applies to every mode other than 'eGene_bf').
## The threshold comes from the p_value_sign parameter instead of a repeated literal.
if mt_correction != 'eGene_bf':
    coeqtl_results = coeqtl_results[coeqtl_results['p_val_mt'] < p_value_sign]

In [73]:
### Load the significant co-eQTLs from the meta-analysis 

In [75]:
# Load the significant meta-analysis co-eQTLs of every cell type and stack them
# into one DataFrame, tagging each row with its cell type.
cell_types = [ 'NK', 'B', 'CD8_T', 'Mono', 'CD4_T']

# Per-cell-type tables are gathered in a list and concatenated once at the end;
# growing the frame inside the loop would re-copy all previously loaded rows each time.
meta_frames = []

for j in cell_types:
    # Alternative file name including the SNP filter:
    # ... + '/Meta_Analysis_Sign_results' + str(n_sample_filter) + '_' + snp_filter + '.csv'
    file_path = data_path + coeqtl_results_path + '/meta_analysis/' + 'meta_5ds/' + j + '/Meta_Analysis_Sign_results' + str(n_sample_filter) + '.csv'

    if not os.path.exists(file_path):
        print(f"Skipping missing file: {file_path}")
        continue

    result_file = pd.read_csv(file_path)
    # Show the modification time so it is visible which pipeline run is loaded.
    print(datetime.datetime.fromtimestamp(os.path.getmtime(file_path)))

    result_file['cell_type'] = j
    meta_frames.append(result_file)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was found.
meta_results = pd.concat(meta_frames, ignore_index=True) if meta_frames else pd.DataFrame()
del meta_frames

2024-10-15 10:47:28
2025-02-12 16:10:20
2024-10-15 11:00:27
2025-02-12 17:13:29
2024-10-15 11:39:25


In [None]:
### Filter meta-analysis results based on multiple correction p-value threshold

In [80]:
# eGene-level correction: keep the meta-analysis rows whose eGene-corrected
# p-value lies below the cutoff stored on the same row.
if mt_correction == 'eGene_bf':
    passes_cutoff = meta_results['p_val_mt_eGene'].lt(meta_results['p_q_cutoff'])
    meta_results = meta_results.loc[passes_cutoff]

In [81]:
meta_results

Unnamed: 0.1,Unnamed: 0,coQTL,eGene,feature_chromosome,feature_start,feature_end,coeGene,feature_id,snp_id,snp_position,...,triplet,is_rb_mt,p_val_mt,amount_tests_eGene,p_val_mt_eGene,p_bh_cutoff,p_q_cutoff,type,cell_type,q_value
0,1,12:56007301:G:A;RPS26_RPL41,RPS26,12,56041351,56044676,RPL41,RPL41_RPS26,12:56007301:G:A,56007301,...,12:56007301:G:A;RPL41_RPS26,1,0.000000,172827,0.000000,0.001380,0.001380,unique,NK,
1,2,12:56007301:G:A;RPS26_RPS27,RPS26,12,56041351,56044676,RPS27,RPS26_RPS27,12:56007301:G:A,56007301,...,12:56007301:G:A;RPS26_RPS27,1,0.000000,172827,0.000000,0.001380,0.001380,unique,NK,
2,3,12:56007301:G:A;RPS26_RPS29,RPS26,12,56041351,56044676,RPS29,RPS26_RPS29,12:56007301:G:A,56007301,...,12:56007301:G:A;RPS26_RPS29,1,0.000000,172827,0.000000,0.001380,0.001380,unique,NK,
3,4,12:56042145:C:G;RPS26_RPL41,RPS26,12,56041351,56044676,RPL41,RPL41_RPS26,12:56042145:C:G,56042145,...,12:56042145:C:G;RPL41_RPS26,1,0.000000,172827,0.000000,0.001380,0.001380,unique,NK,
4,5,12:56042145:C:G;RPS26_RPS27,RPS26,12,56041351,56044676,RPS27,RPS26_RPS27,12:56042145:C:G,56042145,...,12:56042145:C:G;RPS26_RPS27,1,0.000000,172827,0.000000,0.001380,0.001380,unique,NK,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23789512,8693916,11:65885771:C:T;FAU_CTSW,CTSW_FAU,11,65120628,65122473,CTSW_FAU,CTSW_FAU,11:65885771:C:T,65885771,...,11:65885771:C:T;CTSW_FAU,1,0.151367,162076,0.000154,0.002728,0.002728,duplicate,CD4_T,
23789513,8693917,18:24001745:T:C;TTC39C_TTC39C-AS1,TTC39C_TTC39C-AS1,18,23992773,24135610,TTC39C_TTC39C-AS1,TTC39C_TTC39C-AS1,18:24001745:T:C,24001745,...,18:24001745:T:C;TTC39C_TTC39C-AS1,0,0.169862,8053,0.000009,0.002728,0.002728,duplicate,CD4_T,
23789514,8693918,6:32658495:C:A;HLA-DRB1_HLA-DQB1,HLA-DQB1_HLA-DRB1,6,32578769,32589848,HLA-DQB1_HLA-DRB1,HLA-DQB1_HLA-DRB1,6:32658495:C:A,32658495,...,6:32658495:C:A;HLA-DQB1_HLA-DRB1,0,0.209483,1742214,0.002290,0.002728,0.002728,duplicate,CD4_T,
23789533,8693937,12:8971336:T:C;A2M_KLRB1,A2M_KLRB1,12,9067664,9116229,A2M_KLRB1,A2M_KLRB1,12:8971336:T:C,8971336,...,12:8971336:T:C;A2M_KLRB1,0,1.558180,97252,0.000951,0.002728,0.002728,duplicate,CD4_T,


In [84]:
# Bonferroni mode (every mode other than 'eGene_bf'): threshold the corrected
# p-value with the p_value_sign parameter instead of a repeated literal.
if mt_correction != 'eGene_bf':
    meta_results = meta_results[meta_results['p_val_mt'] < p_value_sign]

In [89]:
### Define the significant gene pairs and eGenes for which all tests are saved

In [None]:
### All significant gene-pairs

In [90]:
# Distinct gene-pair identifiers that are significant in a single dataset or in the meta-analysis.
all_sign_feature_ids = pd.concat([coeqtl_results['feature_id'], meta_results['feature_id']])
sign_gene_pairs = all_sign_feature_ids.unique()

In [None]:
### All significant eGenes

In [94]:
# Distinct (eGene, SNP) combinations among the significant results of each source.
egene_snp_cols = ['eGene', 'snp_id']
sign_egenes_sc_datasets = coeqtl_results[egene_snp_cols].drop_duplicates()
sign_egenes_meta = meta_results[egene_snp_cols].drop_duplicates()

In [95]:
# Add the combined "eGene;snp_id" key to both tables.
for egene_table in (sign_egenes_sc_datasets, sign_egenes_meta):
    egene_table['egene_snp_id'] = egene_table['eGene'] + ';' + egene_table['snp_id']

In [96]:
# Stack the single-dataset and meta-analysis eGene-SNP tables; the index is renumbered.
sign_egenes = pd.concat([sign_egenes_sc_datasets, sign_egenes_meta], ignore_index=True)

In [97]:
# Drop combinations that occur in both sources (rows identical in all columns).
sign_egenes = sign_egenes.drop_duplicates()

In [98]:
sign_egenes

Unnamed: 0,eGene,snp_id,egene_snp_id
0,RPS26,12:55991020:G:A,RPS26;12:55991020:G:A
1,RPS26,12:55995509:T:C,RPS26;12:55995509:T:C
2,RPS26,12:56001170:A:G,RPS26;12:56001170:A:G
3,RPS26,12:56007301:G:A,RPS26;12:56007301:G:A
4,RPS26,12:56042145:C:G,RPS26;12:56042145:C:G
...,...,...,...
2001,GLIPR1_KRR1,12:75493570:A:G,GLIPR1_KRR1;12:75493570:A:G
2005,CTSW_FAU,11:65885771:C:T,CTSW_FAU;11:65885771:C:T
2007,HLA-DQB1_HLA-DRB1,6:32658495:C:A,HLA-DQB1_HLA-DRB1;6:32658495:C:A
2008,A2M_KLRB1,12:8971336:T:C,A2M_KLRB1;12:8971336:T:C


In [99]:
# Number of distinct eGenes in the combined table
sign_egenes['eGene'].unique().size

362

In [100]:
# Number of distinct eGene-SNP combinations in the combined table
sign_egenes['egene_snp_id'].unique().size

1161

## Get the tests for each sign gene-pair (in the meta-analysis)

In [75]:
datasets_to_load = ['meta_analysis']

In [76]:
cell_types = ['CD4_T', 'Mono', 'NK', 'B', 'CD8_T']

In [None]:
# Collect, per cell type, every meta-analysis test that was run on a significant
# gene pair (feature_id contained in sign_gene_pairs).

# Filtered tables are gathered in a list and concatenated once at the end rather
# than re-copying the accumulated frame on every iteration.
meta_pair_frames = []

for i in datasets_to_load:
    for j in cell_types:
        file_path = data_path + '/' +coeqtl_results_path  + '/'+ i + '/'  + 'meta_5ds/' + j + '/meta_qtl_results_all.txt.gz'

        if not os.path.exists(file_path):
            print(f"Skipping missing file: {file_path}")
            continue

        result_file = pd.read_csv(file_path, sep='\t')

        # .copy() turns the filtered rows into an independent frame, so the column
        # assignments below do not write into a slice of the full table.
        result_file = result_file[result_file['feature_id'].isin(sign_gene_pairs)].copy()
        result_file['dataset'] = i
        result_file['cell_type'] = j
        meta_pair_frames.append(result_file)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was found.
executed_tests = pd.concat(meta_pair_frames, ignore_index=True) if meta_pair_frames else pd.DataFrame()
del meta_pair_frames

In [None]:
pd.unique(executed_tests['cell_type'])

In [None]:
pd.unique(executed_tests['dataset'])

In [None]:
executed_tests

In [None]:
### Save the file for further analysis

In [None]:
# Write the meta-analysis tests on significant gene pairs; the file name carries the sample-filter value.
output_file = data_path + coeqtl_results_path + '/tests_sign_pairs/Tests_executed_on_sign_gene_pairs_meta' +  str(n_sample_filter)  + '.csv'
# to_csv does not create missing directories, so make sure the target folder exists first.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
# NOTE(review): the row index is written as an unnamed first column (to_csv default) — confirm downstream readers expect it.
executed_tests.to_csv(output_file)

## Get the tests for each sign gene-pair (in the single-datasets)

In [101]:
datasets_to_load = ['oneK1K', 'Franke_v2'  , 'Franke_v3', 'wijst', 'multiome']

In [102]:
cell_types = ['CD4_T', 'Mono', 'NK', 'B', 'CD8_T']

In [103]:
# Collect, for every dataset / cell type, all single-dataset tests that were run
# on a significant gene pair (feature_id contained in sign_gene_pairs).

# Filtered tables are gathered in a list and concatenated once at the end rather
# than re-copying the accumulated frame on every iteration.
single_pair_frames = []

for i in datasets_to_load:
    for j in cell_types:
        # qtl_results_all_unqid.csv.gz is read (instead of F3_decision_tree_input.csv.gz)
        # so that all tests are available, not only those filtered based on n_sample.
        file_path = data_path + '/' +coeqtl_results_path  + '/'+ i + '/' + j + '/qtl_results_all_unqid.csv.gz'

        if not os.path.exists(file_path):
            print(f"Skipping missing file: {file_path}")
            continue

        result_file = pd.read_csv(file_path)

        # .copy() turns the filtered rows into an independent frame, so the column
        # assignments below do not write into a slice of the full table.
        result_file = result_file[result_file['feature_id'].isin(sign_gene_pairs)].copy()
        result_file['dataset'] = i
        result_file['cell_type'] = j
        single_pair_frames.append(result_file)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was found.
executed_tests = pd.concat(single_pair_frames, ignore_index=True) if single_pair_frames else pd.DataFrame()
del single_pair_frames

In [104]:
pd.unique(executed_tests['cell_type'])

array(['CD4_T', 'Mono', 'NK', 'B', 'CD8_T'], dtype=object)

In [105]:
pd.unique(executed_tests['dataset'])

array(['oneK1K', 'Franke_v2', 'Franke_v3', 'wijst', 'multiome'],
      dtype=object)

In [106]:
executed_tests

Unnamed: 0.1,Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,closeGenes,z_score,triplet,n_samples,n_e_samples,gene1,gene2,dataset,cell_type
0,42372,A1BG_RPS26,12:55975722:A:G,0.397208,,,,12,56041351,56044676,...,55975722.0,False,-8.469810e-01,A1BG_RPS26;12:55975722:A:G,992,992,RPS26,A1BG,oneK1K,CD4_T
1,42373,A1BG_RPS26,12:55991020:G:A,0.506004,,,,12,56041351,56044676,...,55991020.0,False,-6.653110e-01,A1BG_RPS26;12:55991020:G:A,992,992,RPS26,A1BG,oneK1K,CD4_T
2,42374,A1BG_RPS26,12:55991795:C:T,0.850009,,,,12,56041351,56044676,...,55991795.0,False,1.891558e-01,A1BG_RPS26;12:55991795:C:T,992,992,RPS26,A1BG,oneK1K,CD4_T
3,42375,A1BG_RPS26,12:55995509:T:C,0.512736,,,,12,56041351,56044676,...,55995509.0,False,-6.548161e-01,A1BG_RPS26;12:55995509:T:C,992,992,RPS26,A1BG,oneK1K,CD4_T
4,42376,A1BG_RPS26,12:56001170:A:G,0.458205,,,,12,56041351,56044676,...,56001170.0,False,-7.420924e-01,A1BG_RPS26;12:56001170:A:G,992,992,RPS26,A1BG,oneK1K,CD4_T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3634060,86741372,HLA-DRB1_RPS28,6:32457527:A:G,1.000000,,,,6,32578769,32589848,...,32457527.0,False,9.521116e-17,,24,24,RPS28,HLA-DRB1,multiome,CD8_T
3634061,86741373,HLA-DRB1_RPS28,6:32459604:G:T,1.000000,,,,6,32578769,32589848,...,32459604.0,False,9.521116e-17,,24,24,RPS28,HLA-DRB1,multiome,CD8_T
3634062,86741421,HLA-DRB1_UBA52,6:33072321:C:T,1.000000,,,,6,32578769,32589848,...,33072321.0,False,-4.760558e-17,,24,24,UBA52,HLA-DRB1,multiome,CD8_T
3634063,86741422,HLA-DRB1_UBA52,6:33092341:G:A,1.000000,,,,6,32578769,32589848,...,33092341.0,False,-4.760558e-17,,24,24,UBA52,HLA-DRB1,multiome,CD8_T


In [107]:
### Save the file for further analysis

In [108]:
# Write the single-dataset tests on significant gene pairs; the file name carries
# the sample-filter value, the correction mode and the SNP-filter label.
output_file = data_path + coeqtl_results_path + '/tests_sign_pairs/Tests_executed_on_sign_gene_pairs' +   str(n_sample_filter)  +  mt_correction + '_' + snp_filter + '.csv'
# to_csv does not create missing directories, so make sure the target folder exists first.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
executed_tests.to_csv(output_file)

## Get tests for each sign egene (in the single datasets)

In [83]:
datasets_to_load = ['oneK1K']

In [84]:
cell_types = ['CD4_T', 'Mono', 'NK', 'B', 'CD8_T']
# ['CD4_T', 'Mono', 'NK', 'B', 'CD8_T']

In [85]:
# Collect, for every dataset / cell type, all single-dataset tests that involve a
# significant eGene-SNP combination (either gene of the tested pair may be the eGene).

# Columns that are not needed afterwards. They are skipped while parsing, so they
# never have to be held in memory; a file that lacks one of them is still accepted.
unneeded_columns = {
    'empirical_feature_p_value',
    'feature_chromosome',
    'feature_start',
    'feature_end',
    'alpha_param',
    'beta_param',
}

# The set of significant "eGene;snp_id" identifiers is the same for every file,
# so it is built once outside the loop.
sign_egene_snp_ids = pd.unique(sign_egenes['egene_snp_id'])

# Filtered tables are gathered in a list and concatenated once at the end.
egene_test_frames = []

for i in datasets_to_load:
    for j in cell_types:
        file_path = data_path + '/' +coeqtl_results_path  + '/'+ i + '/' + j + '/F3_decision_tree_input.csv.gz'

        if not os.path.exists(file_path):
            print(f"Skipping missing file: {file_path}")
            continue

        result_file = pd.read_csv(file_path, usecols=lambda column: column not in unneeded_columns)

        # Helper keys used for filtering; they stay in the exported table.
        result_file['gene1_snp_id'] = result_file['gene1'] + ';' + result_file['snp_id']
        result_file['gene2_snp_id'] = result_file['gene2'] + ';' + result_file['snp_id']

        # Keep a test when gene1 or gene2, combined with the tested SNP, is a significant eGene-SNP pair.
        is_sign_egene_test = (
            result_file['gene1_snp_id'].isin(sign_egene_snp_ids)
            | result_file['gene2_snp_id'].isin(sign_egene_snp_ids)
        )
        # .copy() makes the filtered rows independent of the full table before columns are added.
        result_file = result_file[is_sign_egene_test].copy()
        result_file['dataset'] = i
        result_file['cell_type'] = j
        egene_test_frames.append(result_file)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was found.
executed_tests = pd.concat(egene_test_frames, ignore_index=True) if egene_test_frames else pd.DataFrame()
del egene_test_frames

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/icb/corinna.losert/miniconda3/envs/scgrn_R_4_1/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1734574/1546546559.py", line 13, in <module>
    result_file = pd.read_csv(file_path)
  File "/home/icb/corinna.losert/miniconda3/envs/scgrn_R_4_1/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/home/icb/corinna.losert/miniconda3/envs/scgrn_R_4_1/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 626, in _read
    return parser.read(nrows)
  File "/home/icb/corinna.losert/miniconda3/envs/scgrn_R_4_1/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 1923, in read
    ) = self._engine.read(  # type: ignore[attr-defined]
  File "/home/icb/corinna.losert/miniconda3/envs/scgrn_R_4_1/lib/python3.9/site-packages/pandas/io/parse

TypeError: object of type 'NoneType' has no len()

In [None]:
### Save the file for further analysis

In [None]:
# Write the single-dataset tests on significant eGenes as a gzip-compressed CSV;
# the file name carries the sample-filter value and the correction mode.
output_file = data_path + coeqtl_results_path + '/tests_sign_pairs/Tests_executed_on_sign_egenes' +   str(n_sample_filter)  +  mt_correction + '.csv.gz'
# to_csv does not create missing directories, so make sure the target folder exists first.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
executed_tests.to_csv(output_file, compression='gzip')

In [None]:
#executed_tests

## Get tests for each sign egene (in the meta-analysis)

In [109]:
datasets_to_load = ['meta_analysis']

In [110]:
cell_types = [ 'Mono', 'NK', 'B', 'CD8_T', 'CD4_T']
# ['CD4_T', 'Mono', 'NK', 'B', 'CD8_T']

In [111]:
# NOTE(review): displays the loop variable `j` left over from whichever loop ran
# last; looks like a debugging leftover — confirm it can be removed.
j

'CD8_T'

In [None]:
# Collect, per cell type, every meta-analysis test whose eGene-SNP combination is
# significant, keeping only the columns needed for the export.

# Only these columns are kept, so only these are parsed from the (large) result files.
keep_columns = ['coQTL', 'eGene',  'coeGene', 'feature_id', 'snp_id', 'snp_position', 'closeGenes', 'p_value', 'n_samples']

# The set of significant "eGene;snp_id" identifiers is the same for every file,
# so it is built once outside the loop.
sign_egene_snp_ids = pd.unique(sign_egenes['egene_snp_id'])

# Filtered tables are gathered in a list and concatenated once at the end.
meta_egene_frames = []

for i in datasets_to_load:
    for j in cell_types:
        file_path = data_path + '/' +coeqtl_results_path  + '/'+ i + '/'  + 'meta_5ds/' + j + '/meta_qtl_results_all.txt.gz'

        if not os.path.exists(file_path):
            print(f"Skipping missing file: {file_path}")
            continue

        result_file = pd.read_csv(file_path, sep='\t', usecols=keep_columns)

        # Keep the tests whose "eGene;snp_id" key belongs to a significant eGene-SNP pair.
        egene_snp_key = result_file['eGene'] + ';' + result_file['snp_id']
        is_sign_egene_test = egene_snp_key.isin(sign_egene_snp_ids)

        # usecols returns columns in file order; selecting keep_columns restores the
        # intended order, and .copy() detaches the rows before columns are added.
        result_file = result_file.loc[is_sign_egene_test, keep_columns].copy()
        result_file['dataset'] = i
        result_file['cell_type'] = j
        meta_egene_frames.append(result_file)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was found.
executed_tests = pd.concat(meta_egene_frames, ignore_index=True) if meta_egene_frames else pd.DataFrame()
del meta_egene_frames

In [None]:
### Save the file for further analysis

In [None]:
# Write the meta-analysis tests on significant eGenes as a gzip-compressed CSV;
# the file name carries the sample-filter value and the correction mode.
output_file = data_path + coeqtl_results_path + '/tests_sign_pairs/Tests_executed_on_sign_egenes_meta_5ds' +   str(n_sample_filter)  +  mt_correction + '.csv.gz'
# to_csv does not create missing directories, so make sure the target folder exists first.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
executed_tests.to_csv(output_file, compression='gzip')

In [None]:
# Combined "eGene;snp_id" identifier for every executed test.
executed_tests['ident'] = executed_tests['eGene'].str.cat(executed_tests['snp_id'], sep=';')

In [None]:
### Annotate each test with whether it is a significant co-eQTL in the meta-analysis

In [None]:
# Flag every significant meta-analysis result. assign() returns a new frame, so the
# flag is not written into meta_results while it is still a filtered slice.
meta_results = meta_results.assign(coeQTL=1)

In [None]:
executed_tests

In [None]:
# Attach the significance flag to every executed test (left join: tests without a
# significant counterpart get a missing flag).
merge_keys = ['eGene', 'coeGene', 'snp_id', 'cell_type']
# The flag is constant, so repeated key rows carry no extra information; dropping them
# guarantees the left join cannot multiply rows of executed_tests.
sign_flags = meta_results[merge_keys + ['coeQTL']].drop_duplicates(subset=merge_keys)
result = pd.merge(executed_tests, sign_flags, on=merge_keys, how='left')

In [None]:
# Number of significant meta-analysis co-eQTLs in monocytes (vectorised sum, no chained indexing)
meta_results.loc[meta_results['cell_type'] == 'Mono', 'coeQTL'].sum()

In [None]:
# Tests without a matching significant result have no flag after the merge; set those to 0.
coeqtl_flag = result['coeQTL']
result['coeQTL'] = coeqtl_flag.where(coeqtl_flag.notna(), 0)

In [None]:
result

In [None]:
# Number of B-cell tests flagged as significant co-eQTL (vectorised sum, no chained indexing)
result.loc[result['cell_type'] == 'B', 'coeQTL'].sum()

In [None]:
# Total number of tests flagged as significant co-eQTL (vectorised sum)
result['coeQTL'].sum()

In [None]:
# Write the annotated meta-analysis tests as a gzip-compressed CSV; the file name
# carries the sample-filter value and the correction mode.
output_file = data_path + coeqtl_results_path + '/tests_sign_pairs/Tests_executed_on_sign_egenes_meta_5ds_incl_annot' +   str(n_sample_filter)  +  mt_correction + '.csv.gz'
# to_csv does not create missing directories, so make sure the target folder exists first.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
result.to_csv(output_file, compression='gzip')