In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import pysam
import collections
import os
import re
import scipy
from statsmodels.stats import multitest

#Collect the different mouse/ont BAM paths
bam_stem = '/scratch/groups/horence/rob/data/MERFISH_scRNAseq/10X_mapping/merged_by_mouse_celltype'

mice = set()
onts = set()
mouse_onts = collections.defaultdict(dict)

for p in glob.glob(os.path.join(bam_stem,'*.bam')):
    name = os.path.basename(p)
    mouse = name.split('_')[0]
    ont = re.findall('Mouse._(.*).bam',name)[0]
    mice.add(mouse)
    onts.add(ont)
    mouse_onts[mouse][ont] = p
    
long_region = ('chr5',147561364,147564691)
trunc_region = ('chr5',147634360,147640217)

counts = {
    'mouse':[],
    'ont':[],
    'long':[],
    'trunc':[],
}

for mouse,ont_dict in mouse_onts.items():
    
    for ont,bam_path in ont_dict.items():
        
        with pysam.AlignmentFile(bam_path) as bam:
            counts['mouse'].append(mouse)
            counts['ont'].append(ont)
            counts['long'].append(sum(1 for r in bam.fetch(*long_region)))
            counts['trunc'].append(sum(1 for r in bam.fetch(*trunc_region)))
        
counts = pd.DataFrame(counts)
counts

Unnamed: 0,mouse,ont,long,trunc
0,Mouse4,Macrophage,0,0
1,Mouse4,Sncg,0,0
2,Mouse4,L56_NP,20,6
3,Mouse4,Oligo,1,1
4,Mouse4,Lamp5,1,3
...,...,...,...,...
73,Mouse3,L6_CT,11,22
74,Mouse3,Sst,2,7
75,Mouse3,SMC,0,1
76,Mouse3,Astro,0,0


In [22]:
counts = counts[
    counts['long'].gt(20) |
    counts['trunc'].gt(20)
].sort_values('long')

counts['total'] = counts['long']+counts['trunc']

counts

Unnamed: 0,mouse,ont,long,trunc,total
22,Mouse1,VLMC,5,21,26
53,Mouse2,L6b,6,33,39
45,Mouse2,VLMC,8,29,37
73,Mouse3,L6_CT,11,22,33
66,Mouse3,L5_IT,22,17,39
77,Mouse3,Pvalb,24,10,34
49,Mouse2,L23_IT,24,22,46
39,Mouse2,L6_CT,26,44,70
29,Mouse1,L23_IT,32,18,50
8,Mouse4,L23_IT,34,12,46


In [39]:
counts_agg = counts.groupby('ont').sum()
counts_agg['fraction_long'] = counts_agg['long']/counts_agg['total']
counts_agg = counts_agg.sort_values('fraction_long')
counts_agg

Unnamed: 0_level_0,long,trunc,total,fraction_long
ont,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
L6b,6,33,39,0.153846
VLMC,13,50,63,0.206349
L6_CT,83,112,195,0.425641
Endo,3177,2586,5763,0.551275
L5_IT,262,179,441,0.594104
L23_IT,90,52,142,0.633803
Pvalb,24,10,34,0.705882


In [40]:
counts_agg['p_binom_twosided'] = counts_agg.apply(
    lambda r: scipy.stats.binom_test(x = r.long, n = r.total, p = 0.5, alternative='two-sided'),
    axis = 1,
)

counts_agg

Unnamed: 0_level_0,long,trunc,total,fraction_long,p_binom_twosided
ont,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
L6b,6,33,39,0.153846,1.429926e-05
VLMC,13,50,63,0.206349,3.015952e-06
L6_CT,83,112,195,0.425641,0.04467252
Endo,3177,2586,5763,0.551275,7.332571e-15
L5_IT,262,179,441,0.594104,9.025156e-05
L23_IT,90,52,142,0.633803,0.00180171
Pvalb,24,10,34,0.705882,0.02430651


In [41]:
_,adj_p,_,_ = multitest.multipletests(
    counts_agg['p_binom_twosided'],
    alpha = 0.05,
    method = 'fdr_bh',
)
counts_agg['p_binom_twosided_BH'] = adj_p
counts_agg

Unnamed: 0_level_0,long,trunc,total,fraction_long,p_binom_twosided,p_binom_twosided_BH
ont,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
L6b,6,33,39,0.153846,1.429926e-05,3.336494e-05
VLMC,13,50,63,0.206349,3.015952e-06,1.055583e-05
L6_CT,83,112,195,0.425641,0.04467252,0.04467252
Endo,3177,2586,5763,0.551275,7.332571e-15,5.1328e-14
L5_IT,262,179,441,0.594104,9.025156e-05,0.0001579402
L23_IT,90,52,142,0.633803,0.00180171,0.002522394
Pvalb,24,10,34,0.705882,0.02430651,0.0283576


In [43]:
counts_agg['significant'] = counts_agg['p_binom_twosided_BH'].le(0.05)
counts_agg

Unnamed: 0_level_0,long,trunc,total,fraction_long,p_binom_twosided,p_binom_twosided_BH,significant
ont,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L6b,6,33,39,0.153846,1.429926e-05,3.336494e-05,True
VLMC,13,50,63,0.206349,3.015952e-06,1.055583e-05,True
L6_CT,83,112,195,0.425641,0.04467252,0.04467252,True
Endo,3177,2586,5763,0.551275,7.332571e-15,5.1328e-14,True
L5_IT,262,179,441,0.594104,9.025156e-05,0.0001579402,True
L23_IT,90,52,142,0.633803,0.00180171,0.002522394,True
Pvalb,24,10,34,0.705882,0.02430651,0.0283576,True


In [45]:
from scipy.stats import fisher_exact

In [47]:
table = counts_agg.loc[['VLMC','L5_IT'],['long','trunc']]
table

Unnamed: 0_level_0,long,trunc
ont,Unnamed: 1_level_1,Unnamed: 2_level_1
VLMC,13,50
L5_IT,262,179


In [49]:
oddsr,p = fisher_exact(table,alternative='two-sided')
print(p)

8.932322865779636e-09
