In [1]:
import pandas as pd
from scipy.stats import ttest_ind, mannwhitneyu
from tqdm import tqdm 
import numpy as np
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import normaltest

In [2]:
# Paths of the two tab-separated input tables
# (presumably screen replicates R1 and R2, judging by the file names).
inp1 = '../HeLa_IL1b_R1.txt'
inp2 = '../HeLa_IL1b_R2.txt'

# Load data: both tables are parsed with identical settings.
df1, df2 = (pd.read_csv(path, sep='\t') for path in (inp1, inp2))

In [3]:
# Keep only the rows whose gfp_norm value is strictly greater than 300;
# the same cutoff is applied to both tables.
df1 = df1.loc[df1['gfp_norm'] > 300]
df2 = df2.loc[df2['gfp_norm'] > 300]

In [4]:
# Normalize: shift the correlation values of df2 so that the mean correlation
# of its 'Non-Targeting Control' rows matches the corresponding mean in df1.

control1 = df1.loc[df1['gene'] == 'Non-Targeting Control', 'correlation'].to_list()
control2 = df2.loc[df2['gene'] == 'Non-Targeting Control', 'correlation'].to_list()

# Without control rows the means below would be NaN and the shift would
# silently turn every df2 correlation into NaN, so fail loudly instead.
if not control1 or not control2:
    raise ValueError("No 'Non-Targeting Control' rows found in df1 and/or df2")

ctl_mean1 = np.mean(control1)
ctl_mean2 = np.mean(control2)

# Offset that moves the df2 control mean onto the df1 control mean.
delta = ctl_mean1 - ctl_mean2

# assign() returns a new frame with the shifted column. This avoids
# attribute-style assignment on a frame that was produced by boolean
# filtering (the chained-assignment pattern behind SettingWithCopyWarning).
df2 = df2.assign(correlation=df2['correlation'] + delta)

delta

np.float64(-0.29297591617813756)

In [5]:
# Stack both screening datasets row-wise into one frame.
# Original row labels are kept, so index values may repeat.
df = pd.concat(objs=(df1, df2))

In [6]:
# optional: random downsampling to 50% of the rows
# (unseeded: pass random_state=<int> to df.sample for a reproducible subset)
#df = df.sample(frac=.5)

In [7]:
# Collect the correlation values of all 'Non-Targeting Control' rows of the
# combined frame as a plain Python list and report how many there are.
control = df.loc[df['gene'] == 'Non-Targeting Control', 'correlation'].tolist()
print(len(control))

# Mean control correlation; serves as the denominator of the fold change.
ctl_mean = np.mean(control)

58759


In [8]:
# Per-gene comparison against the Non-Targeting Control values.
# For every gene with at least `min_cells` rows, record:
#   count  - number of rows (cells) for the gene
#   fold   - mean correlation of the gene divided by the control mean
#   pvalue - Mann-Whitney U p-value of gene values vs. control values
# Note: the 'Non-Targeting Control' group itself is also part of the loop
# and is therefore tested against its own values.
min_cells = 10

# Convert the control list to an array once instead of once per test call.
control_values = np.asarray(control)

res = {}

# Group df by genes (built once and reused for the iteration below)
groupb = df.groupby('gene')

# Loop through genes in df; each iteration receives the gene's rows directly,
# so no repeated get_group() lookups are needed.
for gene, gene_rows in tqdm(groupb, total=groupb.ngroups):

    # Calculate number of cells
    numcells = len(gene_rows)

    # Skip genes with fewer than min_cells cells
    if numcells < min_cells:
        continue

    # Retrieve values for gene
    gen = gene_rows['correlation'].to_numpy()

    # Calculate fold change
    fc = np.mean(gen) / ctl_mean

    # Test (alternative that was tried: Welch's t-test)
    #p = ttest_ind(gen, control, equal_var=False)[1]
    p = mannwhitneyu(gen, control_values)[1]

    # Save result for this gene
    res[gene] = (numcells, fc, p)

# One row per gene. from_dict with explicit columns also yields a valid
# (empty) frame when no gene passed the cell-count threshold.
res = pd.DataFrame.from_dict(res, orient='index', columns=['count', 'fold', 'pvalue'])
# Rows are ordered from the largest to the smallest p-value.
res = res.sort_values(by='pvalue', ascending=False)
res['count'] = res['count'].astype(int)

100%|████████████████████████████████████| 19114/19114 [01:57<00:00, 163.27it/s]


In [9]:
# FDR correction: fdrcorrection returns a tuple whose second element holds the
# adjusted p-values, in the same order as the input p-values.
fdr_result = fdrcorrection(res['pvalue'])
res['pvalue_fdr'] = fdr_result[1]

In [10]:
# Create -log10 values for the raw and the FDR-adjusted p-values.
# A p-value of exactly 0 maps to +inf.
res['pvalue_-log10'] = -res['pvalue'].apply(np.log10)
res['pvalue_fdr_-log10'] = -res['pvalue_fdr'].apply(np.log10)

# Save as tsv. The separator is passed by keyword: handing it over
# positionally is deprecated in pandas and triggers a FutureWarning.
res.to_csv('screening_results_HeLa_IL1b.tsv', sep='\t')

  res.to_csv('screening_results_HeLa_IL1b.tsv', '\t')
