In [1]:
import glob
import os
import subprocess

import pandas as pd

import main

In [2]:
# Preview a single per-TF table (AGO2), keeping only the columns used later on:
# chromosome, position and the two FDR-corrected p-value columns.
tsv_filename = "/home/resecmo/vigg/release_Zanthar/release_dump/TF/AGO2_HUMAN.tsv"
ago2 = pd.read_csv(
    tsv_filename,
    sep="\t",
    usecols=[0, 1, 14, 15],
)
ago2.head()

Unnamed: 0,#chr,pos,fdrp_bh_ref,fdrp_bh_alt
0,chr1,2406687,0.696739,0.57911
1,chr1,2506456,0.61702,0.652714
2,chr1,2506868,0.719304,0.627044
3,chr1,2506954,0.528243,0.693341
4,chr1,2507389,0.088303,0.98968


In [21]:
def copy_significant_snvs(src_files, dst_dir, p_thr=0.05, keep_counts=None):
    """Write the significant SNVs of every TSV table to a BED file in ``dst_dir``.

    A row is significant when its ``fdrp_bh_ref`` or ``fdrp_bh_alt`` value is
    strictly below ``p_thr``.  For each source table the selected
    ``(#chr, pos)`` pairs are handed to ``main.write_positions_in_bed`` together
    with the path ``<dst_dir>/<table name without extension>.bed``.

    Parameters
    ----------
    src_files : list of str
        Paths of tab-separated tables.  Columns 0, 1, 14 and 15 are read and
        are expected to be named ``#chr``, ``pos``, ``fdrp_bh_ref`` and
        ``fdrp_bh_alt``.
    dst_dir : str
        Output directory; it is created when it does not exist yet.
    p_thr : float
        Significance threshold applied to both p-value columns.
    keep_counts : list or None
        Empty list that receives, in ``src_files`` order, the number of
        significant SNVs found in each table.  Pass ``None`` to discard the
        counts.

    Raises
    ------
    ValueError
        If ``keep_counts`` is neither ``None`` nor an empty list.
    """
    if keep_counts != [] and keep_counts is not None:
        raise ValueError("keep_counts should be empty list or None")
    if keep_counts is None:
        # The caller is not interested in the counts: collect them in a
        # throwaway list so the append below still works.
        keep_counts = []
    if len(src_files) == 0:
        print("No source files given, nothing to copy")
        return

    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)

    print(f"Copying SNVs from {os.path.dirname(src_files[0])} to {dst_dir}, p_value threshold = {p_thr}")
    for file_no, tsv_filename in enumerate(src_files):
        table_name = os.path.basename(tsv_filename)
        if file_no % 50 == 0:
            print(f"Doing {file_no}/{len(src_files)}: {table_name}")
        bed_filename = os.path.join(dst_dir, os.path.splitext(table_name)[0] + ".bed")

        df = pd.read_csv(tsv_filename, sep='\t', usecols=[0, 1, 14, 15])

        # Vectorised selection; comparisons with NaN are False, so rows with a
        # missing p-value are only kept when the other column is significant.
        is_significant = (df["fdrp_bh_ref"] < p_thr) | (df["fdrp_bh_alt"] < p_thr)
        hits = df.loc[is_significant, ["#chr", "pos"]]
        snps = list(zip(hits["#chr"], hits["pos"]))

        keep_counts.append(len(snps))
        main.write_positions_in_bed(snps, bed_filename)

In [23]:
# Export the significant SNVs of every per-TF table into adastra_snps/,
# collecting the per-table SNV counts in significant_snv_counts.
tsv_files = glob.glob("/home/resecmo/vigg/release_Zanthar/release_dump/TF/*")

significant_snv_counts = []
copy_significant_snvs(
    src_files=tsv_files,
    dst_dir="adastra_snps",
    p_thr=0.05,
    keep_counts=significant_snv_counts,
)

Copying SNVs from /home/resecmo/vigg/release_Zanthar/release_dump/TF to adastra_snps, p_value threshold = 0.05
Doing 0/1140: ZN222_HUMAN.tsv
Doing 50/1140: HNRPK_HUMAN.tsv
Doing 100/1140: MZF1_HUMAN.tsv
Doing 150/1140: ARI3A_HUMAN.tsv
Doing 200/1140: ZN674_HUMAN.tsv
Doing 250/1140: ZN697_HUMAN.tsv
Doing 300/1140: ETS2_HUMAN.tsv
Doing 350/1140: ZN423_HUMAN.tsv
Doing 400/1140: ZN623_HUMAN.tsv
Doing 450/1140: NR1H3_HUMAN.tsv
Doing 500/1140: ZN224_HUMAN.tsv
Doing 550/1140: ENL_HUMAN.tsv
Doing 600/1140: CDK12_HUMAN.tsv
Doing 650/1140: IRF5_HUMAN.tsv
Doing 700/1140: HNRPL_HUMAN.tsv
Doing 750/1140: ZN660_HUMAN.tsv
Doing 800/1140: BICRA_HUMAN.tsv
Doing 850/1140: PHX2B_HUMAN.tsv
Doing 900/1140: GATA1_HUMAN.tsv
Doing 950/1140: ZN212_HUMAN.tsv
Doing 1000/1140: ANDR_HUMAN.tsv
Doing 1050/1140: KLF9_HUMAN.tsv
Doing 1100/1140: HLF_HUMAN.tsv


In [4]:
# List everything found in adastra_snps/ and peek at the first three paths.
bed_files = glob.glob("adastra_snps/*")
bed_files[0:3]

['adastra_snps/ZN443_HUMAN.bed',
 'adastra_snps/SALL3_HUMAN.bed',
 'adastra_snps/UBP7_HUMAN.bed']

In [15]:
def find_intersections(bed_files,
                       cistrome_bed="/home/resecmo/vigg/cistrome_hg38/hg38_cistrome/HNF4A_HUMAN.A.bed",
                       out_dir="intersections"):
    """Intersect ``cistrome_bed`` with every BED file and save the results.

    For each file in ``bed_files`` this runs
    ``bedtools intersect -a <cistrome_bed> -b <bed file>`` and writes the
    captured standard output to ``<out_dir>/with_<bed file name>``.  The number
    of output lines (newline characters) is summed over all files and printed
    at the end.

    Parameters
    ----------
    bed_files : list of str
        Paths of the BED files passed to bedtools as ``-b``.
    cistrome_bed : str
        Path of the BED file passed to bedtools as ``-a``.
    out_dir : str
        Directory receiving the result files; created when missing.

    Notes
    -----
    A non-zero bedtools exit code is reported on standard output together
    with bedtools' stderr; whatever was produced on stdout is still written
    and counted, and processing continues with the next file.
    """
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    intersection_counts = []
    for file_no, bed_filename in enumerate(bed_files):
        if file_no % 50 == 0:
            print(f"Doing {bed_filename}")

        result = subprocess.run(
            ["bedtools", "intersect", "-a", cistrome_bed, "-b", bed_filename],
            capture_output=True,
        )
        if result.returncode != 0:
            # Surface the failure instead of silently storing an empty result.
            reason = result.stderr.decode(errors="replace").strip()
            print(f"bedtools failed for {bed_filename} (exit code {result.returncode}): {reason}")

        intersection_filename = os.path.join(out_dir, "with_" + os.path.basename(bed_filename))
        with open(intersection_filename, "wb") as output_file:
            output_file.write(result.stdout)
        # One newline per reported line.
        intersection_counts.append(result.stdout.count(b'\n'))

    print(f"{sum(intersection_counts)} intersections found")

In [16]:
# Re-scan adastra_snps/ and run the intersection step over every file found there.
bed_files = glob.glob("adastra_snps/*")
find_intersections(bed_files=bed_files)

Doing adastra_snps/ZN443_HUMAN.bed
Doing adastra_snps/STA5B_HUMAN.bed
Doing adastra_snps/ZN776_HUMAN.bed
Doing adastra_snps/HXC5_HUMAN.bed
Doing adastra_snps/PRDM2_HUMAN.bed
Doing adastra_snps/MITF_HUMAN.bed
Doing adastra_snps/FOXO1_HUMAN.bed
Doing adastra_snps/PPARG_HUMAN.bed
Doing adastra_snps/CBP_HUMAN.bed
Doing adastra_snps/RING1_HUMAN.bed
Doing adastra_snps/CCAR2_HUMAN.bed
Doing adastra_snps/PMEPA_HUMAN.bed
Doing adastra_snps/RUNX2_HUMAN.bed
Doing adastra_snps/ZN619_HUMAN.bed
Doing adastra_snps/MYRF_HUMAN.bed
Doing adastra_snps/ZN629_HUMAN.bed
Doing adastra_snps/PBX4_HUMAN.bed
Doing adastra_snps/ZN513_HUMAN.bed
Doing adastra_snps/ZN662_HUMAN.bed
Doing adastra_snps/INO80_HUMAN.bed
Doing adastra_snps/ZN770_HUMAN.bed
Doing adastra_snps/THA_HUMAN.bed
Doing adastra_snps/ZBT11_HUMAN.bed
33018 intersections found
