In [1]:
import pandas as pd
from scipy.stats import ttest_ind, mannwhitneyu
from tqdm import tqdm 
import numpy as np
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import normaltest

In [2]:
inp1 = '../THP1_PrgI-PA_R1a.txt'
inp2 = '../THP1_PrgI-PA_R1b.txt'
inp3 = '../THP1_PrgI-PA_R2a.txt'
inp4 = '../THP1_PrgI-PA_R2b.txt'

# Load data
df1 = pd.read_csv(inp1, sep='\t')
df2 = pd.read_csv(inp2, sep='\t')
df3 = pd.read_csv(inp3, sep='\t')
df4 = pd.read_csv(inp4, sep='\t')

In [3]:
# Filter for gfp mean > 30
df1 = df1[df1.gfp_norm > 30]
df2 = df2[df2.gfp_norm > 30]
df3 = df3[df3.gfp_norm > 30]
df4 = df4[df4.gfp_norm > 30]

In [4]:
#normalize

control1 = df1[df1.gene == 'Non-Targeting Control'].ratio.to_list()
control2 = df2[df2.gene == 'Non-Targeting Control'].ratio.to_list()
control3 = df3[df3.gene == 'Non-Targeting Control'].ratio.to_list()
control4 = df4[df4.gene == 'Non-Targeting Control'].ratio.to_list()

ctl_mean1 = np.mean(control1)
ctl_mean2 = np.mean(control2)
ctl_mean3 = np.mean(control3)
ctl_mean4 = np.mean(control4)

delta2 = ctl_mean1-ctl_mean2
delta3 = ctl_mean1-ctl_mean3
delta4 = ctl_mean1-ctl_mean4

df2.ratio = df2.ratio+delta2
df3.ratio = df3.ratio+delta3
df4.ratio = df4.ratio+delta4

In [5]:
# combine screening datasets
df = pd.concat([df1,df2,df3,df4])

In [6]:
# obtain list of control values
control = df[df.gene == 'Non-Targeting Control'].ratio.to_list()

# obtain mean of control values
ctl_mean = np.mean(control)

In [7]:
res = {}

# Group df by genes
groupb = df.groupby('gene')

# Loop through genes in df
for gene in tqdm(df.groupby('gene').groups):
    
    # Retrieve list of ratios for gene
    gen = groupb.get_group(gene).ratio.to_list()
    
    # Number of cells
    numcells = len(groupb.get_group(gene))
    
    #if at least 3 cells, perform test and save to list
    if numcells > 9:
        # Mean
        meangenes = np.mean(gen)/ctl_mean
    
        # Test
        #p = ttest_ind(gen, control, equal_var=False)[1]
        p = mannwhitneyu(gen, control)[1]
        
        #save to list
        res[gene] = (numcells, meangenes, p)
    
res = pd.DataFrame(res).T
res.columns = ['count', 'fold', 'pvalue']
res = res.sort_values(by='pvalue', ascending=False)
res['count'] = res['count'].astype(int)

100%|████████████████████████████████████| 18982/18982 [00:47<00:00, 397.39it/s]


In [8]:
#fdr correction:
res['pvalue_fdr'] = fdrcorrection(res['pvalue'])[1]

In [11]:
# Create log values for p values
res['pvalue_-log10'] = -res['pvalue'].apply(np.log10)
res['pvalue_fdr_-log10'] = -res['pvalue_fdr'].apply(np.log10)

# Save as tsv
res.to_csv('screening_results_THP1_PrgI-PA.tsv', '\t')

  res.to_csv('screening_results_THP1_PrgI-PA.tsv', '\t')
