In [1]:
import pandas as pd
from scipy.stats import ttest_ind, mannwhitneyu
from tqdm import tqdm 
import numpy as np
from statsmodels.stats.multitest import fdrcorrection

In [2]:
# Tab-separated input files for the two HeLa TNFa datasets (R1 and R2).
inp1, inp2 = '../HeLa_TNFa_R1.txt', '../HeLa_TNFa_R2.txt'

# Read each file into its own DataFrame.
df1, df2 = (pd.read_csv(path, sep='\t') for path in (inp1, inp2))

In [3]:
# Keep only the rows whose gfp_norm value is strictly greater than 300.
df1 = df1.loc[df1['gfp_norm'].gt(300)]
df2 = df2.loc[df2['gfp_norm'].gt(300)]

In [4]:
# Normalize: shift df2's correlations so that the mean of its
# 'Non-Targeting Control' rows matches the corresponding mean in df1.

# Control correlation values from each dataset
control1 = df1.loc[df1['gene'].eq('Non-Targeting Control'), 'correlation'].tolist()
control2 = df2.loc[df2['gene'].eq('Non-Targeting Control'), 'correlation'].tolist()

# Per-dataset control means
ctl_mean1, ctl_mean2 = np.mean(control1), np.mean(control2)

# Additive offset and multiplicative factor between the two control means
delta = ctl_mean1 - ctl_mean2
ratio = ctl_mean1 / ctl_mean2

# Apply the additive shift (original note: "works better").
# The multiplicative alternative, df2.correlation * ratio, is left disabled.
df2['correlation'] = df2['correlation'] + delta

delta

np.float64(-0.1566241260367558)

In [5]:
# Stack both screening datasets row-wise into one frame
# (the row labels of df1 and df2 are carried over unchanged).
df = pd.concat(objs=(df1, df2), axis=0)

In [6]:
# Correlation values of all 'Non-Targeting Control' rows in the combined data
control = df.loc[df['gene'].eq('Non-Targeting Control'), 'correlation'].tolist()
print(len(control))

# Mean of the pooled control correlations
ctl_mean = np.mean(control)

43717


In [7]:
# Per-gene statistics against the pooled control values:
#   count  - number of rows (cells) for the gene
#   fold   - mean correlation of the gene divided by the control mean
#   pvalue - Mann-Whitney U p-value of the gene's correlations vs. control
res = {}

# Group df by genes (built once and reused for the iteration below)
groupb = df.groupby('gene')

# Iterate over the groups directly: each step yields the gene name together
# with its rows, so no extra groupby or per-gene get_group lookup is needed.
for gene, gene_rows in tqdm(groupb, total=groupb.ngroups):

    # Number of cells (rows) recorded for this gene
    numcells = len(gene_rows)

    # Only test genes with at least 10 cells
    if numcells < 10:
        continue

    # Correlation values for this gene
    gen = gene_rows.correlation.to_list()

    # Mean correlation expressed relative to the control mean
    meangenes = np.mean(gen) / ctl_mean

    # Test
    # Alternative left disabled: ttest_ind(gen, control, equal_var=False)[1]
    p = mannwhitneyu(gen, control)[1]

    # Store the result keyed by gene
    res[gene] = (numcells, meangenes, p)

# One row per gene; naming the columns here also keeps the frame well-formed
# (empty, with the three columns) when no gene passed the cell-count threshold.
res = pd.DataFrame.from_dict(res, orient='index', columns=['count', 'fold', 'pvalue'])
res = res.sort_values(by='pvalue', ascending=False)
res['count'] = res['count'].astype(int)

100%|████████████████████████████████████| 19112/19112 [01:21<00:00, 235.86it/s]


In [8]:
# FDR correction of the per-gene p-values using fdrcorrection with its
# default settings; element [1] of its result is stored as 'pvalue_fdr'.
res['pvalue_fdr'] = fdrcorrection(res['pvalue'].to_numpy())[1]

In [9]:
# Create -log10 values for the raw and the FDR-adjusted p-values
res['pvalue_-log10'] = -res['pvalue'].apply(np.log10)
res['pvalue_fdr_-log10'] = -res['pvalue_fdr'].apply(np.log10)

# Save as tsv. The separator is passed by keyword: giving it positionally
# is deprecated in pandas and triggers a FutureWarning.
res.to_csv('screening_results_HeLa_TNFa.tsv', sep='\t')

  res.to_csv('screening_results_HeLa_TNFa.tsv', '\t')
