In [92]:
import pandas as pd
import numpy as np
import io
import os
import glob
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

In [2]:
def read_vcf(path):
    """Load a VCF file into a DataFrame, skipping the '##' meta-information lines.

    The '#CHROM ...' header line supplies the column names, and the '#CHROM'
    column is renamed to 'CHROM'. The eight fixed VCF columns are read with
    explicit dtypes (POS as int, the others as str); any remaining columns
    are left to pandas' type inference.
    """
    fixed_column_types = {
        '#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
        'QUAL': str, 'FILTER': str, 'INFO': str,
    }
    with open(path, 'r') as handle:
        # Everything except the '##' meta lines is handed to the CSV parser.
        body = ''.join(line for line in handle if not line.startswith('##'))
    table = pd.read_csv(io.StringIO(body), dtype=fixed_column_types, sep='\t')
    return table.rename(columns={'#CHROM': 'CHROM'})

In [5]:
# The 17-sample subset VCF was generated with "bcftools view -s <sample IDs>"
# from the genotype file pangenie_merged_bi_nosnvs.vcf.gz.
# The path below is relative to this notebook, so the file has to be placed in ../data/.

pangenie_hgsvc_17sample = read_vcf("../data/pangenie_merged_bi_nosnvs_hgsvc_9add8sample.vcf")

In [6]:
# Sanity check: (rows, columns) of the loaded VCF table.
pangenie_hgsvc_17sample.shape

(1129810, 26)

In [7]:
# The variant identifier is taken from the third ';'-separated INFO field, written as "ID=<identifier>".
# The literal "ID=" prefix is removed with an anchored regex: str.lstrip('ID=') would instead strip any
# leading run of the characters 'I', 'D' and '=', eating into identifiers that start with those letters.
pangenie_hgsvc_17sample['INFO_ID'] = (
    pangenie_hgsvc_17sample['INFO']
    .str.split(';', expand=True)[2]
    .str.replace(r'^ID=', '', regex=True)
)
# Split the table into deletions and insertions using the type tag embedded in the identifier.
pangenie_DEL_hgsvc17sample = pangenie_hgsvc_17sample[pangenie_hgsvc_17sample['INFO_ID'].str.contains('DEL')]
pangenie_INS_hgsvc17sample = pangenie_hgsvc_17sample[pangenie_hgsvc_17sample['INFO_ID'].str.contains('INS')]

In [8]:
# INFO_ID is '-'-separated; its third token is stored as the variant type.
pangenie_hgsvc_17sample['type'] = pangenie_hgsvc_17sample['INFO_ID'].str.split('-', expand=True)[2]
col_list = pangenie_hgsvc_17sample['type'].tolist()
# Tally the number of records per variant type.
pd.Series(col_list).value_counts()

DEL    672432
INS    457378
dtype: int64

## deletion 

In [9]:
# Keep four site-annotation columns followed by the 17 per-sample genotype columns.
pangenie_DEL_hgsvc17sample_new = pangenie_DEL_hgsvc17sample.loc[:, [
    'CHROM', 'FILTER', 'INFO_ID', 'FORMAT',
    'HG00512', 'HG00513', 'HG00514',
    'HG00731', 'HG00732', 'HG00733',
    'NA19238', 'NA19239', 'NA19240',
    'NA18507', 'NA18505', 'NA18508', 'NA18486',
    'NA19099', 'NA19141', 'NA18516', 'NA18522',
]]

In [10]:
# Take an independent copy so the column assignments below do not write into a slice of the parent table.
pangenie_DEL_hgsvc17sample_new = pangenie_DEL_hgsvc17sample_new.copy()
# Every column from position 4 onward is reduced to its first three characters.
for col in pangenie_DEL_hgsvc17sample_new.columns[4:]:
    pangenie_DEL_hgsvc17sample_new[col] = pangenie_DEL_hgsvc17sample_new[col].str.slice(0, 3)

In [11]:
# Recode genotypes to a binary carrier code: '0/0' -> '0'; '0/1', '1/0' and '1/1' -> '1'.
# Only the columns from position 4 onward (the genotype columns) are touched; the annotation
# columns (CHROM, FILTER, INFO_ID, FORMAT) are no longer scanned or at risk of being rewritten.
for col in pangenie_DEL_hgsvc17sample_new.columns[4:]:
    # String view computed once per column instead of once per genotype pattern.
    calls = pangenie_DEL_hgsvc17sample_new[col].astype(str).str[:3]
    is_ref = calls == '0/0'
    is_alt = calls.isin(['0/1', '1/0', '1/1'])
    pangenie_DEL_hgsvc17sample_new.loc[is_ref, col] = '0'
    pangenie_DEL_hgsvc17sample_new.loc[is_alt, col] = '1'

In [12]:
# Sanity check: (rows, columns) of the recoded deletion table.
pangenie_DEL_hgsvc17sample_new.shape

(672432, 21)

In [13]:
# Function to convert to integer if possible
def convert_to_int(val):
    """Return ``int(val)`` when the conversion succeeds, otherwise ``val`` unchanged.

    Values that cannot be parsed as an integer (e.g. a missing-genotype token)
    and values of an unsupported type (e.g. ``None``) are passed through as-is,
    so the function is safe to apply cell-by-cell to a mixed-content table.
    """
    try:
        return int(val)
    except (ValueError, TypeError):
        # ValueError: unparsable string or NaN; TypeError: None or another non-numeric object.
        return val

In [14]:
# Convert the '0'/'1' genotype codes of the 17 sample columns (positions 4-20) to integers in place;
# values that are not integer-like (e.g. the missing-genotype token) are left untouched.
# The slice is bounded at 21 so that re-running this cell does not sweep the 'Sum' column
# created below into the conversion and into its own total.
pangenie_DEL_hgsvc17sample_new.iloc[:, 4:21] = pangenie_DEL_hgsvc17sample_new.iloc[:, 4:21].applymap(convert_to_int)
# Numeric copy used only for counting: anything non-numeric counts as 0.
numeric_data = pangenie_DEL_hgsvc17sample_new.iloc[:, 4:21].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
# Number of samples whose genotype code is 1.
pangenie_DEL_hgsvc17sample_new['Sum'] = numeric_data.sum(axis=1)
# Keep variants with a carrier count of at least 5 (note: the threshold is 5 despite the "_atleast1" suffix).
pangenie_DEL_hgsvc17sample_new_atleast1 = pangenie_DEL_hgsvc17sample_new[pangenie_DEL_hgsvc17sample_new['Sum'] >= 5]

In [15]:
# Function to count occurrences of missing genotype ".:."
def count_dot_colon(row):
    """Return how many entries of `row` are equal to the missing-genotype token '.:.'."""
    missing_mask = (row == '.:.')
    return missing_mask.sum()

In [17]:
pangenie_DEL_hgsvc17sample_new_atleast1 = pangenie_DEL_hgsvc17sample_new_atleast1.copy()
# Per variant, count the samples whose genotype is the missing token '.:.'.
# This is a vectorised comparison over the 17 sample columns (positions 4-20) rather than a
# Python-level apply per row; the bounded slice also keeps 'Sum' (and, on re-run, 'missing_geno')
# out of the scan.
pangenie_DEL_hgsvc17sample_new_atleast1['missing_geno'] = (
    pangenie_DEL_hgsvc17sample_new_atleast1.iloc[:, 4:21].eq('.:.').sum(axis=1)
)

In [18]:
# Largest number of missing genotypes observed for any deletion.
pangenie_DEL_hgsvc17sample_new_atleast1['missing_geno'].max()

11

In [19]:
# Sanity check: (rows, columns) after the carrier-count filter.
pangenie_DEL_hgsvc17sample_new_atleast1.shape

(227073, 23)

In [20]:
# Require fewer than 50% of samples to have a missing genotype:
# with 17 samples this means at most 8 missing calls.
pangenie_DEL_hgsvc17sample_new_atleast1_filter = pangenie_DEL_hgsvc17sample_new_atleast1.loc[
    pangenie_DEL_hgsvc17sample_new_atleast1['missing_geno'].lt(9)
]
pangenie_DEL_hgsvc17sample_new_atleast1_filter.shape

(227063, 23)

In [21]:
# Largest carrier count among the retained deletions.
pangenie_DEL_hgsvc17sample_new_atleast1_filter['Sum'].max()

17

In [43]:
# Work on an independent copy: the table is a filtered slice, and adding columns to it directly
# triggers pandas' SettingWithCopy behaviour.
pangenie_DEL_hgsvc17sample_new_atleast1_filter = pangenie_DEL_hgsvc17sample_new_atleast1_filter.copy()
# INFO_ID is '-'-separated; tokens 1, 2 and 3 are used as start position, type and length.
# The identifier is split once, and the numeric fields are converted by name rather than by
# column position.
del_id_parts = pangenie_DEL_hgsvc17sample_new_atleast1_filter['INFO_ID'].str.split('-', expand=True)
pangenie_DEL_hgsvc17sample_new_atleast1_filter['START_POS'] = del_id_parts[1].astype(int)
pangenie_DEL_hgsvc17sample_new_atleast1_filter['TYPE'] = del_id_parts[2]
pangenie_DEL_hgsvc17sample_new_atleast1_filter['LEN'] = del_id_parts[3].astype(int)
# A deletion spans LEN positions starting at START_POS, so its last position is START_POS + LEN - 1.
pangenie_DEL_hgsvc17sample_new_atleast1_filter['END_POS'] = (
    pangenie_DEL_hgsvc17sample_new_atleast1_filter['START_POS']
    + pangenie_DEL_hgsvc17sample_new_atleast1_filter['LEN']
    - 1
)
# Final layout: 8 annotation columns, the 17 sample columns, then the carrier count.
pangenie_DEL_hgsvc17sample_new_atleast1_filter_new = pangenie_DEL_hgsvc17sample_new_atleast1_filter[[
'CHROM', 'START_POS','END_POS', 'TYPE','LEN', 'FILTER', 'INFO_ID', 'FORMAT', 'HG00512','HG00513','HG00514','HG00731','HG00732','HG00733','NA19238','NA19239','NA19240','NA18507','NA18505','NA18508','NA18486','NA19099','NA19141','NA18516','NA18522', 'Sum'
    ]]

In [44]:
# Sanity check: (rows, columns) of the re-ordered deletion table.
pangenie_DEL_hgsvc17sample_new_atleast1_filter_new.shape

(227063, 26)

In [45]:
# Keep deletions whose length is at least 50.
pangenie_DEL_hgsvc17sample_new_atleast1_50 = pangenie_DEL_hgsvc17sample_new_atleast1_filter_new.loc[
    pangenie_DEL_hgsvc17sample_new_atleast1_filter_new['LEN'].ge(50)
]
pangenie_DEL_hgsvc17sample_new_atleast1_50.shape

(12126, 26)

In [46]:
# Keep deletions for which at least one sample has genotype code 0, so both genotype groups exist.
# The slice 8:25 covers exactly the 17 sample columns (the trailing 'Sum' column is excluded), and the
# axis is passed by keyword because pandas >= 2.0 no longer accepts it positionally for DataFrame.any.
pangenie_DEL_hgsvc17sample_new_atleast1_50_zero = pangenie_DEL_hgsvc17sample_new_atleast1_50[
    pangenie_DEL_hgsvc17sample_new_atleast1_50.iloc[:, 8:25].eq(0).any(axis=1)
]
pangenie_DEL_hgsvc17sample_new_atleast1_50_zero.shape

(10548, 26)

In [47]:
# Export the filtered deletion genotype table as a tab-separated file with a header row and no index.
# This file is read back later in the notebook for the genotype/boundary-score tests.
pangenie_DEL_hgsvc17sample_new_atleast1_50_zero.to_csv('../preprocess_data/pangenie_svs_17sample_ALL_DEL_atleast5_50_genotype_only01.bed', index = False, sep='\t', header=True)

In [48]:
# Coordinates-only view of the filtered deletions: chromosome, start, end and identifier.
pangenie_DEL_hgsvc17sample_new_atleast1_50_start_end = pangenie_DEL_hgsvc17sample_new_atleast1_50_zero.loc[
    :, ['CHROM', 'START_POS', 'END_POS', 'INFO_ID']
]
pangenie_DEL_hgsvc17sample_new_atleast1_50_start_end.shape

(10548, 4)

In [49]:
# Export the deletion coordinates as a tab-separated file with a header row and no index.
# NOTE(review): the file has a .bed extension but includes a header line — confirm downstream tools expect that.
pangenie_DEL_hgsvc17sample_new_atleast1_50_start_end.to_csv('../preprocess_data/pangenie_svs_17sample_ALL_DEL_atleast5_50_only01_new.bed', index = False, sep='\t', header=True)

## insertion

In [31]:
# Keep four site-annotation columns followed by the 17 per-sample genotype columns.
pangenie_INS_hgsvc17sample_new = pangenie_INS_hgsvc17sample.loc[:, [
    'CHROM', 'FILTER', 'INFO_ID', 'FORMAT',
    'HG00512', 'HG00513', 'HG00514',
    'HG00731', 'HG00732', 'HG00733',
    'NA19238', 'NA19239', 'NA19240',
    'NA18507', 'NA18505', 'NA18508', 'NA18486',
    'NA19099', 'NA19141', 'NA18516', 'NA18522',
]]

In [32]:
# Take an independent copy so the column assignments below do not write into a slice of the parent table.
pangenie_INS_hgsvc17sample_new = pangenie_INS_hgsvc17sample_new.copy()
# Every column from position 4 onward is reduced to its first three characters.
for col in pangenie_INS_hgsvc17sample_new.columns[4:]:
    pangenie_INS_hgsvc17sample_new[col] = pangenie_INS_hgsvc17sample_new[col].str.slice(0, 3)

In [33]:
# Recode genotypes to a binary carrier code: '0/0' -> '0'; '0/1', '1/0' and '1/1' -> '1'.
# Only the columns from position 4 onward (the genotype columns) are touched; the annotation
# columns (CHROM, FILTER, INFO_ID, FORMAT) are no longer scanned or at risk of being rewritten.
for col in pangenie_INS_hgsvc17sample_new.columns[4:]:
    # String view computed once per column instead of once per genotype pattern.
    calls = pangenie_INS_hgsvc17sample_new[col].astype(str).str[:3]
    is_ref = calls == '0/0'
    is_alt = calls.isin(['0/1', '1/0', '1/1'])
    pangenie_INS_hgsvc17sample_new.loc[is_ref, col] = '0'
    pangenie_INS_hgsvc17sample_new.loc[is_alt, col] = '1'

In [34]:
# Sanity check: (rows, columns) of the recoded insertion table.
pangenie_INS_hgsvc17sample_new.shape

(457378, 21)

In [35]:
# Convert the '0'/'1' genotype codes of the 17 sample columns (positions 4-20) to integers in place;
# values that are not integer-like (e.g. the missing-genotype token) are left untouched.
# The slice is bounded at 21 so that re-running this cell does not sweep the 'Sum' column
# created below into the conversion and into its own total.
pangenie_INS_hgsvc17sample_new.iloc[:, 4:21] = pangenie_INS_hgsvc17sample_new.iloc[:, 4:21].applymap(convert_to_int)
# Numeric copy used only for counting: anything non-numeric counts as 0.
numeric_data = pangenie_INS_hgsvc17sample_new.iloc[:, 4:21].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
# Number of samples whose genotype code is 1.
pangenie_INS_hgsvc17sample_new['Sum'] = numeric_data.sum(axis=1)
# Keep variants with a carrier count of at least 5 (note: the threshold is 5 despite the "_atleast1" suffix).
pangenie_INS_hgsvc17sample_new_atleast1 = pangenie_INS_hgsvc17sample_new[pangenie_INS_hgsvc17sample_new['Sum'] >= 5]

In [36]:
pangenie_INS_hgsvc17sample_new_atleast1 = pangenie_INS_hgsvc17sample_new_atleast1.copy()
# Per variant, count the samples whose genotype is the missing token '.:.'.
# This is a vectorised comparison over the 17 sample columns (positions 4-20) rather than a
# Python-level apply per row; the bounded slice also keeps 'Sum' (and, on re-run, 'missing_geno')
# out of the scan.
pangenie_INS_hgsvc17sample_new_atleast1['missing_geno'] = (
    pangenie_INS_hgsvc17sample_new_atleast1.iloc[:, 4:21].eq('.:.').sum(axis=1)
)

In [37]:
# Largest number of missing genotypes observed for any insertion.
pangenie_INS_hgsvc17sample_new_atleast1['missing_geno'].max()

12

In [38]:
# Sanity check: (rows, columns) after the carrier-count filter.
pangenie_INS_hgsvc17sample_new_atleast1.shape

(218847, 23)

In [39]:
# Require fewer than 50% of samples to have a missing genotype:
# with 17 samples this means at most 8 missing calls.
pangenie_INS_hgsvc17sample_new_atleast1_filter = pangenie_INS_hgsvc17sample_new_atleast1.loc[
    pangenie_INS_hgsvc17sample_new_atleast1['missing_geno'].lt(9)
]
pangenie_INS_hgsvc17sample_new_atleast1_filter.shape

(218842, 23)

In [40]:
# Largest carrier count among the retained insertions.
pangenie_INS_hgsvc17sample_new_atleast1_filter['Sum'].max()

17

In [41]:
# Build the coordinate columns on the missing-genotype-filtered table, exactly as the deletion branch
# does. Deriving them from the unfiltered `..._atleast1` table would let insertions with 9 or more
# missing genotypes through to the exported set, silently skipping the filter computed above.
pangenie_INS_hgsvc17sample_new_atleast1_filter = pangenie_INS_hgsvc17sample_new_atleast1_filter.copy()
# INFO_ID is '-'-separated; tokens 1, 2 and 3 are used as start position, type and length.
ins_id_parts = pangenie_INS_hgsvc17sample_new_atleast1_filter['INFO_ID'].str.split('-', expand=True)
pangenie_INS_hgsvc17sample_new_atleast1_filter['START_POS'] = ins_id_parts[1].astype(int)
pangenie_INS_hgsvc17sample_new_atleast1_filter['TYPE'] = ins_id_parts[2]
pangenie_INS_hgsvc17sample_new_atleast1_filter['LEN'] = ins_id_parts[3].astype(int)
# An insertion is recorded at a single position: END_POS is set equal to START_POS.
pangenie_INS_hgsvc17sample_new_atleast1_filter['END_POS'] = pangenie_INS_hgsvc17sample_new_atleast1_filter['START_POS']
# Final layout: 8 annotation columns, the 17 sample columns, then the carrier count.
pangenie_INS_hgsvc17sample_new_atleast1_new = pangenie_INS_hgsvc17sample_new_atleast1_filter[[
'CHROM', 'START_POS','END_POS', 'TYPE','LEN', 'FILTER', 'INFO_ID', 'FORMAT', 'HG00512','HG00513','HG00514','HG00731','HG00732','HG00733','NA19238','NA19239','NA19240','NA18507','NA18505','NA18508','NA18486','NA19099','NA19141','NA18516','NA18522', 'Sum'
    ]]

In [42]:
# Keep insertions whose length is at least 50.
pangenie_INS_hgsvc17sample_new_atleast1_50 = pangenie_INS_hgsvc17sample_new_atleast1_new.loc[
    pangenie_INS_hgsvc17sample_new_atleast1_new['LEN'].ge(50)
]
pangenie_INS_hgsvc17sample_new_atleast1_50.shape

(17056, 26)

In [50]:
# Keep insertions for which at least one sample has genotype code 0, so both genotype groups exist.
# The slice 8:25 covers exactly the 17 sample columns (the trailing 'Sum' column is excluded), and the
# axis is passed by keyword because pandas >= 2.0 no longer accepts it positionally for DataFrame.any.
pangenie_INS_hgsvc17sample_new_atleast1_50_zero = pangenie_INS_hgsvc17sample_new_atleast1_50[
    pangenie_INS_hgsvc17sample_new_atleast1_50.iloc[:, 8:25].eq(0).any(axis=1)
]
pangenie_INS_hgsvc17sample_new_atleast1_50_zero.shape

(14008, 26)

In [51]:
# Export the filtered insertion genotype table as a tab-separated file with a header row and no index.
# NOTE(review): this file name carries a "_new" suffix while the deletion counterpart does not — keep
# the name used when the table is read back in sync with this one.
pangenie_INS_hgsvc17sample_new_atleast1_50_zero.to_csv('../preprocess_data/pangenie_svs_17sample_ALL_INS_atleast5_50_genotype_only01_new.bed', index = False, sep='\t', header=True)

In [52]:
# Coordinates-only view of the filtered insertions: chromosome, start, end and identifier.
pangenie_INS_hgsvc17sample_new_atleast1_50_start_end = pangenie_INS_hgsvc17sample_new_atleast1_50_zero.loc[
    :, ['CHROM', 'START_POS', 'END_POS', 'INFO_ID']
]
pangenie_INS_hgsvc17sample_new_atleast1_50_start_end.shape

(14008, 4)

In [54]:
# Export the insertion coordinates as a tab-separated file with a header row and no index.
# NOTE(review): the file has a .bed extension but includes a header line — confirm downstream tools expect that.
pangenie_INS_hgsvc17sample_new_atleast1_50_start_end.to_csv('../preprocess_data/pangenie_svs_17sample_ALL_INS_atleast5_50_only01.bed', index = False, sep='\t', header=True)

### SVs' impact on boundary strength 

In [55]:
# Load the header-less SV/boundary tables and label their eight columns:
# four SV columns followed by four boundary columns.
all_DEL_merged = pd.read_csv(
    '../data/all_DEL_17merged_cutoff_flank_inter30_5kb_boundaries_100kb_only01_atleast5.bed',
    sep='\t',
    header=None,
).set_axis(
    ['CHROM', 'START_POS', 'END_POS', 'INFO_ID', 'BOUND_CHR', 'FLANK_START', 'FLANK_END', 'BOUND_SCORE'],
    axis=1,
)
all_INS_merged = pd.read_csv(
    '../data/all_INS_17merged_cutoff_flank_inter30_5kb_boundaries_100kb_only01_atleast5.bed',
    sep='\t',
    header=None,
).set_axis(
    ['CHROM', 'START_POS', 'END_POS', 'INFO_ID', 'BOUND_CHR', 'FLANK_START', 'FLANK_END', 'BOUND_SCORE'],
    axis=1,
)

In [56]:
# Sanity check: (rows, columns) of the deletion/boundary table.
all_DEL_merged.shape

(10590, 8)

In [57]:
# Sanity check: (rows, columns) of the insertion/boundary table.
all_INS_merged.shape

(14045, 8)

In [58]:
# Keep only rows whose boundary chromosome (column 4, 'BOUND_CHR') is not the '.' placeholder,
# then renumber the rows from 0.
all_DEL_boundary_1 = all_DEL_merged[all_DEL_merged['BOUND_CHR'].ne('.')]
all_INS_boundary_1 = all_INS_merged[all_INS_merged['BOUND_CHR'].ne('.')]
all_DEL_boundary_2 = all_DEL_boundary_1.reset_index(drop=True)
all_INS_boundary_2 = all_INS_boundary_1.reset_index(drop=True)

In [59]:
# Sanity check: (rows, columns) of deletion rows with a non-placeholder boundary.
all_DEL_boundary_2.shape

(1517, 8)

In [60]:
# Sanity check: (rows, columns) of insertion rows with a non-placeholder boundary.
all_INS_boundary_2.shape

(2119, 8)

In [61]:
# Number of distinct deletion identifiers among the rows with a non-placeholder boundary.
len(all_DEL_boundary_2['INFO_ID'].drop_duplicates())

1475

In [62]:
# Number of distinct insertion identifiers among the rows with a non-placeholder boundary.
len(all_INS_boundary_2['INFO_ID'].drop_duplicates())

2082

In [67]:
# Per-sample boundary-score file identifiers. They are listed in the same order as the genotype
# columns of the SV tables (only prefixes/suffixes differ), which matters because the
# genotype/boundary-score tests further down pair the two by column position.
sample_ids = ['HG00512','HG00513','HG00514_1','HG00731','HG00732','HG00733_1','NA19238','NA19239','NA19240_1',
              'GM18507','GM18505','GM18508','GM18486','GM19099','GM19141','GM18516','GM18522']

# Columns that identify one boundary flank in both the SV tables and the per-sample files.
boundary_keys = ['BOUND_CHR', 'FLANK_START', 'FLANK_END']

for i in sample_ids:
    # Read the file by its exact path: a missing file now raises FileNotFoundError naming the path,
    # instead of an IndexError from indexing an empty glob result.
    sample_path = f"../data/sample_bound/{i}_BS_cutoff_inter30_10kb.boundaries_100kb_rep.bed"
    sample_merged_BS = pd.read_csv(sample_path, sep='\t', header=None)
    # Columns 0-3 are the merged boundary flank; columns 4-6 and 8 are the sample-level boundary.
    sample_merged_BS = sample_merged_BS[[0, 1, 2, 3, 4, 5, 6, 8]]
    sample_merged_BS.columns = ['BOUND_CHR', 'FLANK_START', 'FLANK_END', 'BOUND_SCORE','BOUND_CHR_samp', 'BOUND_START_samp', 'BOUND_END_samp', 'BOUND_SCORE_samp']
    # One score per flank: the median over all sample boundaries falling into that flank.
    sample_merged_BS_1 = sample_merged_BS.groupby(boundary_keys, as_index=False)['BOUND_SCORE_samp'].median()
    # The merged column is assigned back by row position, so the left join must not add rows;
    # validate='many_to_one' makes pandas raise if the per-flank scores are ever not unique.
    all_DEL_boundary_2[i] = all_DEL_boundary_2.merge(
        sample_merged_BS_1, how="left", on=boundary_keys, validate="many_to_one"
    )['BOUND_SCORE_samp']
    all_INS_boundary_2[i] = all_INS_boundary_2.merge(
        sample_merged_BS_1, how="left", on=boundary_keys, validate="many_to_one"
    )['BOUND_SCORE_samp']

In [68]:
# Sanity check: the 17 per-sample score columns have been appended to the deletion table.
all_DEL_boundary_2.shape

(1517, 25)

In [69]:
# Sanity check: the 17 per-sample score columns have been appended to the insertion table.
all_INS_boundary_2.shape

(2119, 25)

#### Mann-Whitney U test for genotypes and boundary BS

In [70]:
# Per row, count how many of the columns from position 8 onward (the per-sample scores) are missing.
all_DEL_boundary_2['num_NaNs'] = all_DEL_boundary_2.iloc[:, 8:].isna().sum(axis=1)
all_INS_boundary_2['num_NaNs'] = all_INS_boundary_2.iloc[:, 8:].isna().sum(axis=1)

In [73]:
# Drop rows where all 17 per-sample scores are missing; rows with some missing scores are kept.
all_DEL_boundary_2_noNA = all_DEL_boundary_2.loc[all_DEL_boundary_2['num_NaNs'].lt(17)]
all_INS_boundary_2_noNA = all_INS_boundary_2.loc[all_INS_boundary_2['num_NaNs'].lt(17)]

In [74]:
# A missing boundary score means the sample has no boundary there, so it is replaced with 0.
# Exact duplicate rows are then removed and the rows renumbered from 0.
all_DEL_boundary_3 = (
    all_DEL_boundary_2_noNA
    .fillna(0)
    .drop_duplicates()
    .reset_index(drop=True)
)
all_DEL_boundary_3.shape

(1290, 26)

In [75]:
# A missing boundary score is replaced with 0, as for the deletions.
# Exact duplicate rows are then removed and the rows renumbered from 0.
all_INS_boundary_3 = (
    all_INS_boundary_2_noNA
    .fillna(0)
    .drop_duplicates()
    .reset_index(drop=True)
)
all_INS_boundary_3.shape

(1806, 26)

In [76]:
# Identifiers of all deletion rows, and the de-duplicated set of them.
all_DEL_boundary_id = all_DEL_boundary_3['INFO_ID'].tolist()
DEL_list = [*set(all_DEL_boundary_id)]
len(DEL_list)

1267

In [77]:
# Identifiers of all insertion rows, and the de-duplicated set of them.
all_INS_boundary_id = all_INS_boundary_3['INFO_ID'].tolist()
INS_list = [*set(all_INS_boundary_id)]
len(INS_list)

1787

In [79]:
# Reload the genotype tables exported earlier in this notebook. Every column is read as str so the
# genotype codes can be compared as the strings '0' and '1'.
DEL_genotype = pd.read_csv('../preprocess_data/pangenie_svs_17sample_ALL_DEL_atleast5_50_genotype_only01.bed', sep='\t', header=0, dtype=str)
# The insertion export cell writes its file with a "_new" suffix; the un-suffixed name is not produced
# by this notebook, so reading it would pick up a stale or missing file.
INS_genotype = pd.read_csv('../preprocess_data/pangenie_svs_17sample_ALL_INS_atleast5_50_genotype_only01_new.bed', sep='\t', header=0, dtype=str)

In [80]:
# Drop the last column of each reloaded table.
DEL_genotype_1 = DEL_genotype.iloc[:, :-1]
INS_genotype_1 = INS_genotype.iloc[:, :-1]
# Keep only the SVs that occur in the boundary tables.
DEL_boundary_genotype = DEL_genotype_1.loc[DEL_genotype_1['INFO_ID'].isin(all_DEL_boundary_id)]
INS_boundary_genotype = INS_genotype_1.loc[INS_genotype_1['INFO_ID'].isin(all_INS_boundary_id)]
# Fix the column order: 8 annotation columns followed by the 17 sample columns.
DEL_boundary_genotype = DEL_boundary_genotype.loc[:, [
    'CHROM', 'START_POS', 'END_POS', 'TYPE', 'LEN', 'FILTER', 'INFO_ID', 'FORMAT',
    'HG00512', 'HG00513', 'HG00514',
    'HG00731', 'HG00732', 'HG00733',
    'NA19238', 'NA19239', 'NA19240',
    'NA18507', 'NA18505', 'NA18508', 'NA18486',
    'NA19099', 'NA19141', 'NA18516', 'NA18522',
]]
INS_boundary_genotype = INS_boundary_genotype.loc[:, [
    'CHROM', 'START_POS', 'END_POS', 'TYPE', 'LEN', 'FILTER', 'INFO_ID', 'FORMAT',
    'HG00512', 'HG00513', 'HG00514',
    'HG00731', 'HG00732', 'HG00733',
    'NA19238', 'NA19239', 'NA19240',
    'NA18507', 'NA18505', 'NA18508', 'NA18486',
    'NA19099', 'NA19141', 'NA18516', 'NA18522',
]]

In [81]:
# Renumber the rows from 0, then show the table's (rows, columns).
DEL_boundary_genotype_1 = DEL_boundary_genotype.reset_index(drop=True)
DEL_boundary_genotype_1.shape

(1267, 25)

In [82]:
# Renumber the rows from 0, then show the table's (rows, columns).
INS_boundary_genotype_1 = INS_boundary_genotype.reset_index(drop=True)
INS_boundary_genotype_1.shape

(1787, 25)

In [88]:
### test for genotypes 0 and 1 (binary genotypes, only 0/0 and others)
# For every deletion/boundary row, compare the boundary scores of the samples with genotype '0'
# against those of the samples with genotype '1' using a two-sided exact Mann-Whitney U test.
# Genotypes and scores are paired by column position (17 sample columns in both tables).
# NOTE(review): method="exact" applies no tie correction, and scores filled with 0 can tie — confirm
# this is acceptable.
pvalue = []

for i in range(len(all_DEL_boundary_3)):
    sv_id = all_DEL_boundary_3.loc[i, 'INFO_ID']
    sv_genotypes = DEL_boundary_genotype_1[DEL_boundary_genotype_1['INFO_ID'] == sv_id].iloc[:, 8:]
    if sv_genotypes.empty:
        # No genotype record for this SV: the test cannot be run.
        pvalue.append(np.nan)
        continue

    # If the identifier occurs more than once, the first genotype row is used.
    genotypes_list = sv_genotypes.iloc[0].astype(str).tolist()
    boundary_score_list = all_DEL_boundary_3.iloc[i, 8:25].tolist()

    # Split the scores by genotype code; samples with any other code (e.g. missing) are left out.
    ref_scores = [score for gt, score in zip(genotypes_list, boundary_score_list)
                  if gt == '0' and pd.notna(score)]
    alt_scores = [score for gt, score in zip(genotypes_list, boundary_score_list)
                  if gt == '1' and pd.notna(score)]

    try:
        _, p = mannwhitneyu(x=ref_scores, y=alt_scores, method="exact", alternative='two-sided')
    except ValueError:
        # Untestable row (e.g. one genotype group is empty). NaN is used instead of the string 'NA'
        # so the p-value column stays numeric for the multiple-testing step.
        p = np.nan

    pvalue.append(p)

all_DEL_boundary_3['pvalue_0_1'] = pvalue

In [89]:
# Sanity check: the p-value column has been appended to the deletion table.
all_DEL_boundary_3.shape

(1290, 27)

In [90]:
### test for genotypes 0 and 1 (binary genotypes, only 0/0 and others)
# For every insertion/boundary row, compare the boundary scores of the samples with genotype '0'
# against those of the samples with genotype '1' using a two-sided exact Mann-Whitney U test.
# Genotypes and scores are paired by column position (17 sample columns in both tables).
# NOTE(review): method="exact" applies no tie correction, and scores filled with 0 can tie — confirm
# this is acceptable.
pvalue = []

for i in range(len(all_INS_boundary_3)):
    sv_id = all_INS_boundary_3.loc[i, 'INFO_ID']
    sv_genotypes = INS_boundary_genotype_1[INS_boundary_genotype_1['INFO_ID'] == sv_id].iloc[:, 8:]
    if sv_genotypes.empty:
        # No genotype record for this SV: the test cannot be run.
        pvalue.append(np.nan)
        continue

    # If the identifier occurs more than once, the first genotype row is used.
    genotypes_list = sv_genotypes.iloc[0].astype(str).tolist()
    boundary_score_list = all_INS_boundary_3.iloc[i, 8:25].tolist()

    # Split the scores by genotype code; samples with any other code (e.g. missing) are left out.
    ref_scores = [score for gt, score in zip(genotypes_list, boundary_score_list)
                  if gt == '0' and pd.notna(score)]
    alt_scores = [score for gt, score in zip(genotypes_list, boundary_score_list)
                  if gt == '1' and pd.notna(score)]

    try:
        _, p = mannwhitneyu(x=ref_scores, y=alt_scores, method="exact", alternative='two-sided')
    except ValueError:
        # Untestable row (e.g. one genotype group is empty). NaN is used instead of the string 'NA'
        # so the p-value column stays numeric for the multiple-testing step.
        p = np.nan

    pvalue.append(p)

all_INS_boundary_3['pvalue_0_1'] = pvalue

In [91]:
# Sanity check: the p-value column has been appended to the insertion table.
all_INS_boundary_3.shape

(1806, 27)

In [94]:
def fdr_correction(group):
    """Return a copy of `group` with a 'qvalue' column of Benjamini-Hochberg adjusted p-values.

    Parameters
    ----------
    group : pandas.DataFrame
        Table with a 'pvalue_0_1' column. Entries that are not numeric
        (e.g. an 'NA' placeholder) or are NaN are treated as untestable.

    Returns
    -------
    pandas.DataFrame
        Copy of `group` with an added 'qvalue' column. The adjustment is
        computed over the testable p-values of this table only; untestable
        rows receive NaN. The input frame is not modified.
    """
    corrected = group.copy()
    pvals = pd.to_numeric(corrected['pvalue_0_1'], errors='coerce')
    testable = pvals.notna()
    qvalues = pd.Series(np.nan, index=corrected.index, dtype=float)
    if testable.any():
        # multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf).
        _, adjusted, _, _ = multipletests(pvals[testable].to_numpy(), method='fdr_bh')
        qvalues[testable] = adjusted
    corrected['qvalue'] = qvalues
    return corrected

In [99]:
# NOTE(review): grouping by INFO_ID applies the Benjamini-Hochberg adjustment separately within each
# SV's rows, so an SV with a single row keeps qvalue == pvalue. Confirm that this per-SV correction
# is intended rather than one correction across all tests.
corrected_del = all_DEL_boundary_3.groupby('INFO_ID').apply(fdr_correction)
# Rows with an adjusted p-value below 0.05.
all_DEL_boundary_3_sig = corrected_del[corrected_del['qvalue'] < 0.05]
all_DEL_boundary_3_sig.shape

(35, 28)

In [100]:
# NOTE(review): grouping by INFO_ID applies the Benjamini-Hochberg adjustment separately within each
# SV's rows, so an SV with a single row keeps qvalue == pvalue. Confirm that this per-SV correction
# is intended rather than one correction across all tests.
corrected_ins = all_INS_boundary_3.groupby('INFO_ID').apply(fdr_correction)
# Rows with an adjusted p-value below 0.05.
all_INS_boundary_3_sig = corrected_ins[corrected_ins['qvalue'] < 0.05]
all_INS_boundary_3_sig.shape

(57, 28)

In [101]:
# Number of distinct deletions with at least one row below the q-value cutoff.
len(all_DEL_boundary_3_sig['INFO_ID'].drop_duplicates())

34

In [102]:
# Number of distinct insertions with at least one row below the q-value cutoff.
len(all_INS_boundary_3_sig['INFO_ID'].drop_duplicates())

57

### load significant discovery set

In [106]:
# Load the discovery-set tables (tab-separated, with a header row) for deletions and insertions.
# Judging by the file names, the first pair holds all tested rows with p-values and the "_sig"
# pair holds the significant subset — TODO confirm against how those files were produced.
all_DEL_boundary_dis  = pd.read_csv('../data/final_ALL_DEL_merged_BS_flank_26sample_pvalue_only01_fillna.bed', sep='\t', header=0)
all_INS_boundary_dis  = pd.read_csv('../data/final_ALL_INS_merged_BS_flank_26sample_pvalue_only01_fillna.bed', sep='\t', header=0)
all_DEL_boundary_dis_sig = pd.read_csv('../data/all_DEL_boundary_3_sig_10kbflank.bed', sep='\t', header=0)
all_INS_boundary_dis_sig = pd.read_csv('../data/all_INS_boundary_3_sig_10kbflank.bed', sep='\t', header=0)

In [107]:
# Number of distinct deletions in the discovery-set table.
len(all_DEL_boundary_dis['INFO_ID'].drop_duplicates())

1740

In [109]:
# Number of distinct insertions in the discovery-set table.
len(all_INS_boundary_dis['INFO_ID'].drop_duplicates())

2510

In [110]:
# Number of distinct deletions in the discovery-set "_sig" table.
len(all_DEL_boundary_dis_sig['INFO_ID'].drop_duplicates())

66

In [111]:
# Number of distinct insertions in the discovery-set "_sig" table.
len(all_INS_boundary_dis_sig['INFO_ID'].drop_duplicates())

119

In [112]:
# Rows significant in this 17-sample set whose SV also appears in the discovery-set "_sig" table.
sig_DEL_boundary_in_dis = all_DEL_boundary_3_sig.loc[
    all_DEL_boundary_3_sig['INFO_ID'].isin(all_DEL_boundary_dis_sig['INFO_ID'])
]
sig_INS_boundary_in_dis = all_INS_boundary_3_sig.loc[
    all_INS_boundary_3_sig['INFO_ID'].isin(all_INS_boundary_dis_sig['INFO_ID'])
]

In [113]:
# Number of distinct deletions significant here that also appear in the discovery-set "_sig" table.
len(sig_DEL_boundary_in_dis['INFO_ID'].drop_duplicates())

4

In [114]:
# Number of distinct insertions significant here that also appear in the discovery-set "_sig" table.
len(sig_INS_boundary_in_dis['INFO_ID'].drop_duplicates())

3

In [115]:
# All tested rows (significant or not) whose SV appears in the discovery-set "_sig" table.
DEL_boundary_in_dis = all_DEL_boundary_3.loc[
    all_DEL_boundary_3['INFO_ID'].isin(all_DEL_boundary_dis_sig['INFO_ID'])
]
INS_boundary_in_dis = all_INS_boundary_3.loc[
    all_INS_boundary_3['INFO_ID'].isin(all_INS_boundary_dis_sig['INFO_ID'])
]

In [116]:
# Number of distinct tested deletions that appear in the discovery-set "_sig" table.
len(DEL_boundary_in_dis['INFO_ID'].drop_duplicates())

40

In [117]:
# Number of distinct tested insertions that appear in the discovery-set "_sig" table.
len(INS_boundary_in_dis['INFO_ID'].drop_duplicates())

80

In [118]:
# Rank all deletion rows from the smallest to the largest q-value.
all_DEL_boundary_sort = corrected_del.sort_values("qvalue")
all_DEL_boundary_sort.shape

(1290, 28)

In [119]:
# Rank all insertion rows from the smallest to the largest q-value.
all_INS_boundary_sort = corrected_ins.sort_values("qvalue")
all_INS_boundary_sort.shape

(1806, 28)

### top 25% 

In [120]:
# First quarter of the q-value ranking; the row count is rounded with Python's round().
all_DEL_boundary_sort_25 = all_DEL_boundary_sort.iloc[:round(len(all_DEL_boundary_sort)*(1/4))]
all_DEL_boundary_sort_25.shape

(322, 28)

In [121]:
# First quarter of the q-value ranking; the row count is rounded with Python's round().
all_INS_boundary_sort_25 = all_INS_boundary_sort.iloc[:round(len(all_INS_boundary_sort)*(1/4))]
all_INS_boundary_sort_25.shape

(452, 28)

In [135]:
# Rows in the top quarter whose SV is among the tested SVs found in the discovery-set "_sig" table.
sig_DEL_boundary_sort_25 = all_DEL_boundary_sort_25.loc[
    all_DEL_boundary_sort_25['INFO_ID'].isin(DEL_boundary_in_dis['INFO_ID'])
]
sig_INS_boundary_sort_25 = all_INS_boundary_sort_25.loc[
    all_INS_boundary_sort_25['INFO_ID'].isin(INS_boundary_in_dis['INFO_ID'])
]

In [136]:
# Number of distinct discovery-set deletions recovered in the top quarter.
len(sig_DEL_boundary_sort_25['INFO_ID'].drop_duplicates())

14

In [137]:
# Number of distinct discovery-set insertions recovered in the top quarter.
len(sig_INS_boundary_sort_25['INFO_ID'].drop_duplicates())

24

### top 50% 

In [138]:
# First half of the q-value ranking; the row count is rounded with Python's round().
all_DEL_boundary_sort_half = all_DEL_boundary_sort.iloc[:round(len(all_DEL_boundary_sort)*(2/4))]
all_DEL_boundary_sort_half.shape

(645, 28)

In [139]:
# First half of the q-value ranking; the row count is rounded with Python's round().
all_INS_boundary_sort_half = all_INS_boundary_sort.iloc[:round(len(all_INS_boundary_sort)*(2/4))]
all_INS_boundary_sort_half.shape

(903, 28)

In [140]:
# Rows in the top half whose SV is among the tested SVs found in the discovery-set "_sig" table.
sig_DEL_boundary_sort_half = all_DEL_boundary_sort_half.loc[
    all_DEL_boundary_sort_half['INFO_ID'].isin(DEL_boundary_in_dis['INFO_ID'])
]
sig_INS_boundary_sort_half = all_INS_boundary_sort_half.loc[
    all_INS_boundary_sort_half['INFO_ID'].isin(INS_boundary_in_dis['INFO_ID'])
]

In [141]:
# Number of distinct discovery-set deletions recovered in the top half.
len(sig_DEL_boundary_sort_half['INFO_ID'].drop_duplicates())

26

In [142]:
# Number of distinct discovery-set insertions recovered in the top half.
len(sig_INS_boundary_sort_half['INFO_ID'].drop_duplicates())

48

### top 75% 

In [143]:
# First three quarters of the q-value ranking; the row count is rounded with Python's round().
all_DEL_boundary_sort_75 = all_DEL_boundary_sort.iloc[:round(len(all_DEL_boundary_sort)*(3/4))]
all_DEL_boundary_sort_75.shape

(968, 28)

In [144]:
# First three quarters of the q-value ranking; the row count is rounded with Python's round().
all_INS_boundary_sort_75 = all_INS_boundary_sort.iloc[:round(len(all_INS_boundary_sort)*(3/4))]
all_INS_boundary_sort_75.shape

(1354, 28)

In [145]:
# Rows in the top three quarters whose SV is among the tested SVs found in the discovery-set "_sig" table.
sig_DEL_boundary_sort_75 = all_DEL_boundary_sort_75.loc[
    all_DEL_boundary_sort_75['INFO_ID'].isin(DEL_boundary_in_dis['INFO_ID'])
]
sig_INS_boundary_sort_75 = all_INS_boundary_sort_75.loc[
    all_INS_boundary_sort_75['INFO_ID'].isin(INS_boundary_in_dis['INFO_ID'])
]

In [146]:
# Number of distinct discovery-set deletions recovered in the top three quarters.
len(sig_DEL_boundary_sort_75['INFO_ID'].drop_duplicates())

34

In [147]:
# Number of distinct discovery-set insertions recovered in the top three quarters.
len(sig_INS_boundary_sort_75['INFO_ID'].drop_duplicates())

67