In [1]:
## Combine all the summary statistics for the gene-pairs as input for the decision tree

# Libraries

In [2]:
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
from matplotlib import pyplot as plt
from scipy.sparse import issparse

In [3]:
import os

In [4]:
import scipy.stats as stats
import numpy as np

# Parameters

In [None]:
### Define the dataset and cell type for which to prepare the summary statistics

In [5]:
### dataset

In [6]:
# Dataset identifier; it is inserted into the analysis folder name ("analysis_<dataset>").
dataset = 'oneK1K'

In [7]:
# Cell type to process; it names the sub-folder inside the dataset's analysis folder.
cell_type = 'CD8_T'

In [None]:
### Define which summary stats to use

In [8]:
# Which per-gene expression statistics to combine with the correlation statistics:
#   'data' -> F6_Expression_Summary_Stats_all_data.csv (statistics on normalized expression)
#   'RNA'  -> F6_Expression_Summary_Stats_all.csv (presumably statistics on raw counts - confirm)
# The same switch selects the name of the combined output file written at the end.
expr_type = 'data'

In [9]:
### Path to the co-eQTL files

In [10]:
# Root folder of the co-eQTL mapping run.
data_path = '../data/current/coeqtl_mapping/co_qtls_sceqtlgen/'

# The last path component selects the mapping run; co_qtls_sceqtlgen can be
# replaced by co_qtls_decision_tree:
#   co_qtls_sceqtlgen     - contains all files for the initial mapping run
#   co_qtls_decision_tree - files for the final, decision-tree-filtered run

In [11]:
# Results folder of step F6.
# NOTE(review): no cell visible in this notebook reads or writes this path - confirm it is still needed.
result_path = '../results/current/F6/'

In [12]:
# Folder holding the summary statistics of one dataset / cell type; every file in
# this notebook is read from and written to it.
# It is derived from `data_path` so that switching the mapping run
# (co_qtls_sceqtlgen <-> co_qtls_decision_tree) only has to be done in one place.
result_path_analysis = data_path + "analysis_" + dataset + "/" + cell_type + "/"

# Functions

In [13]:
def get_r_thres(n, p_thres=0.975):
    """Return the critical absolute Pearson correlation for a sample of size ``n``.

    A correlation ``r`` computed from ``n`` observations is tested with
    ``t = r * sqrt(df / (1 - r**2))`` on ``df = n - 2`` degrees of freedom.
    Solving for ``r`` at the critical ``t`` gives
    ``r = sqrt(1 / (df / t**2 + 1))``.

    Parameters
    ----------
    n : int or numpy.ndarray
        Number of observations the correlation is based on.
    p_thres : float, default 0.975
        Quantile of the t distribution used as critical value
        (0.975 corresponds to a two-sided test at alpha = 0.05).

    Returns
    -------
    float or numpy.ndarray
        Critical absolute correlation. ``nan`` where ``n <= 2``, because the
        t distribution is undefined for non-positive degrees of freedom.
    """
    # The quantile has to use the same n - 2 degrees of freedom as the t-to-r
    # conversion below; taking it at n degrees of freedom gives a threshold
    # that is too small.
    df = n - 2
    t = stats.t.ppf(p_thres, df)
    return np.sqrt(1 / ((df / t ** 2) + 1))

# Data

In [None]:
### Get summary stats calculated in previous steps

## 1) Correlation summary stats (mean correlation, max correlation etc. per gene-pair)

In [15]:
# Show the folder the correlation summary statistics are read from.
result_path_analysis

'../data/current/coeqtl_mapping/co_qtls_decision_tree/analysis_oneK1K/CD8_T/'

In [16]:
# Load the per-gene-pair correlation summary statistics from the analysis folder.
correlation_stats_file = result_path_analysis + 'F6_Correlation_Summary_Stats_all.csv'
cor_summary_stats = pd.read_csv(correlation_stats_file)

In [17]:
# Keep only the gene-pair identifier and the correlation statistics used downstream.
correlation_columns = [
    'gene_pair',
    'mean_correlation',
    'mean_abs_correlation',
    'var_correlation',
    'max_correlation',
    'n_NA',
    'n_not_NA',
]
cor_summary_stats = cor_summary_stats[correlation_columns]

In [18]:
# Number of gene pairs (rows) and retained columns after the column selection.
cor_summary_stats.shape

(47030365, 7)

In [19]:
# Preview the selected correlation statistics (pandas truncates the display of the full frame).
cor_summary_stats

Unnamed: 0,gene_pair,mean_correlation,mean_abs_correlation,var_correlation,max_correlation,n_NA,n_not_NA
0,SUGP1_TNFRSF1B,0.004962,0.062006,0.006261,0.417952,519,496
1,SUGP1_TNFRSF25,0.006427,0.053139,0.004873,0.268827,730,285
2,SUGP1_TOE1,0.013783,0.056209,0.004832,0.150000,977,38
3,SUGP1_TOMM20,-0.005369,0.057851,0.005376,0.309478,495,520
4,SUGP1_TOR1AIP1,0.004191,0.057628,0.005581,0.386313,549,466
...,...,...,...,...,...,...,...
47030360,ZMAT1_ZMAT5,-0.004078,0.049333,0.003627,0.182506,909,106
47030361,ZMAT5_ZNF41,-0.001395,0.049932,0.004497,0.324365,842,173
47030362,ZMAT5_ZNF75D,-0.002744,0.048654,0.003704,0.215466,836,179
47030363,ZMAT5_ZNF81,-0.008842,0.040711,0.002374,0.136821,951,64


In [20]:
### Extract gene1 and gene2 from the gene-pair identifier

In [21]:
# Split the "<gene1>_<gene2>" identifier into its two genes:
#   gene1 = text before the first underscore, gene2 = text after the last underscore.
# `assign` returns a new frame instead of writing columns into the column subset
# selected above, which avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run.
cor_summary_stats = cor_summary_stats.assign(
    gene1=cor_summary_stats['gene_pair'].str.replace('_.*', '', regex=True),
    gene2=cor_summary_stats['gene_pair'].str.replace('.*_', '', regex=True),
)

## 2) Expression summary stats (mean expression, variance, percentage zero etc. per gene)

In [23]:
# Show the folder the expression summary statistics are read from.
result_path_analysis

'../data/current/coeqtl_mapping/co_qtls_decision_tree/analysis_oneK1K/CD8_T/'

In [24]:
# Load the per-gene expression statistics that match `expr_type`.
# The explicit lookup fails loudly on an unknown `expr_type`; two independent
# `if` statements would leave `expression_summary_stats` undefined (or stale
# from an earlier run of this cell).
expression_stats_files = {
    'data': 'F6_Expression_Summary_Stats_all_data.csv',  # alternative with normalized data
    'RNA': 'F6_Expression_Summary_Stats_all.csv',
}
if expr_type not in expression_stats_files:
    raise ValueError(
        "expr_type must be one of " + str(sorted(expression_stats_files))
        + ", got " + repr(expr_type)
    )
expression_summary_stats = pd.read_csv(result_path_analysis + expression_stats_files[expr_type])

In [25]:
# Preview the per-gene expression statistics (one row per gene).
expression_summary_stats

Unnamed: 0.1,Unnamed: 0,gene,mean_expression,sd_sample_mean_expression,mean_variance,sd_sample_variance,sum_non_zero,mean_non_zero,mean_percentage_zero,sum_zero,mean_zero,mean_coef_var
0,1,A1BG,0.092249,0.037742,0.065164,0.033825,17261,23.014667,86.841141,123757,165.009333,2.918516
1,2,A1BG-AS1,0.031895,0.010598,0.021689,0.010834,1280,13.061224,94.874179,25859,263.867347,4.718715
2,3,A2M,0.182415,0.108957,0.148347,0.096971,29558,36.581683,79.311047,115079,142.424505,2.379295
3,4,A2M-AS1,0.091136,0.036155,0.070729,0.035675,13406,24.242315,87.887734,98584,178.271248,3.053253
4,5,A2ML1-AS1,0.024959,0.008998,0.023648,0.009368,56,11.200000,96.976541,1998,399.600000,6.311343
...,...,...,...,...,...,...,...,...,...,...,...,...
13029,13030,ZXDC,0.085154,0.023146,0.063677,0.026809,16015,22.089655,88.103565,123789,170.743448,2.991849
13030,13031,ZYG11B,0.043337,0.014220,0.033271,0.016787,4078,14.512456,93.839469,65746,233.971530,4.247124
13031,13032,ZYX,0.086619,0.023932,0.061678,0.024970,17030,23.751743,87.390285,122369,170.668061,2.927361
13032,13033,ZZEF1,0.129043,0.030666,0.099616,0.041669,25942,29.580388,83.016948,129504,147.667047,2.449801


In [26]:
### Add for gene1

In [27]:
# Prefix every column with "gene1_" (the key column "gene" becomes "gene1_gene")
# so the statistics of the first gene stay distinguishable after merging.
expression_summary_stats_gene1 = expression_summary_stats.add_prefix('gene1_')

In [28]:
# Attach the expression statistics of the first gene to every gene pair.
# The left join keeps one output row per gene pair; `validate` raises a
# MergeError if a gene occurs more than once in the expression table, which
# would otherwise silently duplicate gene-pair rows.
summary_stats = pd.merge(
    cor_summary_stats,
    expression_summary_stats_gene1,
    how='left',
    left_on='gene1',
    right_on='gene1_gene',
    validate='many_to_one',
)

In [29]:
# Sanity check: a left merge should leave the number of gene-pair rows unchanged.
summary_stats.shape

(47030365, 21)

In [30]:
### Add for gene2

In [31]:
# Prefix every column with "gene2_" (the key column "gene" becomes "gene2_gene")
# so the statistics of the second gene stay distinguishable after merging.
expression_summary_stats_gene2 = expression_summary_stats.add_prefix('gene2_')

In [32]:
# Attach the expression statistics of the second gene to every gene pair.
# The left join keeps one output row per gene pair; `validate` raises a
# MergeError if a gene occurs more than once in the expression table, which
# would otherwise silently duplicate gene-pair rows.
summary_stats = pd.merge(
    summary_stats,
    expression_summary_stats_gene2,
    how='left',
    left_on='gene2',
    right_on='gene2_gene',
    validate='many_to_one',
)

In [33]:
# Sanity check: the row count should still be unchanged after the second left merge.
summary_stats.shape

(47030365, 33)

## 3) Paired expression summary stats (mean paired zero, zero one cell etc.)

In [35]:
# The paired-expression statistics are optional: read them when the file exists,
# otherwise fall back to an empty table that only carries the expected columns.
paired_stats_file = result_path_analysis + 'Paired_expression_summary_all.csv'
if not os.path.isfile(paired_stats_file):
    # No file available: build an empty frame with the column layout of the file.
    columns = [
        'gene_pair',
        'n_sample',
        'mean_paired_non_zero',
        'mean_paired_zero',
        'mean_zero_one_cell',
        'mean_non_zero_one_cell',
    ]
    paired_expression_stats = pd.DataFrame(columns=columns)
else:
    paired_expression_stats = pd.read_csv(paired_stats_file)
    paired_expression_stats = pd.DataFrame(columns=columns)
    

In [36]:
#paired_expression_stats = pd.read_csv(result_path_analysis  + 'Paired_expression_summary_all.csv')

In [37]:
# Preview the paired-expression statistics (no rows when the summary file was not found).
paired_expression_stats

Unnamed: 0,gene_pair,n_sample,mean_paired_non_zero,mean_paired_zero,mean_zero_one_cell,mean_non_zero_one_cell


In [38]:
# Attach the paired-expression statistics to every gene pair. Pairs without a
# match (or all pairs, when the paired table is empty) get missing values.
# `validate` raises a MergeError if a gene pair occurs more than once in the
# paired table, which would otherwise silently duplicate rows.
summary_stats = pd.merge(
    summary_stats,
    paired_expression_stats,
    how='left',
    left_on='gene_pair',
    right_on='gene_pair',
    validate='many_to_one',
)

In [39]:
# Sanity check: the row count should still be unchanged after merging the paired statistics.
summary_stats.shape

(47030365, 38)

## Save the combined dataset for usage in decision tree analysis

In [41]:
# Inspect the first six rows of the combined table before it is written to disk.
summary_stats.head(6)

Unnamed: 0,gene_pair,mean_correlation,mean_abs_correlation,var_correlation,max_correlation,n_NA,n_not_NA,gene1,gene2,gene1_Unnamed: 0,...,gene2_mean_non_zero,gene2_mean_percentage_zero,gene2_sum_zero,gene2_mean_zero,gene2_mean_coef_var,n_sample,mean_paired_non_zero,mean_paired_zero,mean_zero_one_cell,mean_non_zero_one_cell
0,SUGP1_TNFRSF1B,0.004962,0.062006,0.006261,0.417952,519,496,SUGP1,TNFRSF1B,10862,...,30.10388,83.250444,122461,153.267835,2.592668,,,,,
1,SUGP1_TNFRSF25,0.006427,0.053139,0.004873,0.268827,730,285,SUGP1,TNFRSF25,10862,...,17.587822,91.102698,84039,196.812646,3.614241,,,,,
2,SUGP1_TOE1,0.013783,0.056209,0.004832,0.15,977,38,SUGP1,TOE1,10862,...,12.814815,95.51605,16191,299.833333,5.019318,,,,,
3,SUGP1_TOMM20,-0.005369,0.057851,0.005376,0.309478,495,520,SUGP1,TOMM20,10862,...,56.672746,65.005954,106352,107.752786,1.561175,,,,,
4,SUGP1_TOR1AIP1,0.004191,0.057628,0.005581,0.386313,549,466,SUGP1,TOR1AIP1,10862,...,19.943425,89.678985,118782,181.623853,3.223934,,,,,
5,SUGP1_TOR1AIP2,-2.3e-05,0.055735,0.004781,0.216183,557,458,SUGP1,TOR1AIP2,10862,...,19.947853,89.846062,119090,182.653374,3.268995,,,,,


In [42]:
## Save

In [43]:
# Write the combined table next to its inputs; the file name follows `expr_type`.
# The explicit lookup fails loudly on an unknown `expr_type`; two independent
# `if` statements would silently write nothing.
summary_stats_files = {
    'data': 'F6_Summary_Stats_all_data.csv',  # alternative with normalized expression stats
    'RNA': 'F6_Summary_Stats_all.csv',
}
if expr_type not in summary_stats_files:
    raise ValueError(
        "expr_type must be one of " + str(sorted(summary_stats_files))
        + ", got " + repr(expr_type)
    )
# The row index is still written as the first, unnamed column (to_csv default),
# so the file layout is unchanged for downstream readers.
summary_stats.to_csv(result_path_analysis + summary_stats_files[expr_type])