Remove archetype prediction redundancies among archetypes that overlap MPRA library. 

Use sliding window and median archetype length to select archetype w best score. 

In [4]:
import os, sys

sys.path.append(os.getcwd()) # append current working directory

import config_readwrite as crw  # custom script for reading config
import pandas as pd
import pybedtools as pbt

import chr_functions

# config

In [19]:
TILES

'/wynton/group/ahituv/biomarin/library_1/Design/design_tiles_regions.bed'

In [20]:
config, cfn = crw.read(os.path.join(os.getcwd(), "config.neuron.mpra.ini"))

# read
section = "design"


LIB = "lib2"

if LIB == "lib1":
    TILES = config[section]["full_bed"]
    PATH = config[section]["PATH"]
    PREFIX = "unq_tiles.lib1"
    VIERSTRA = os.path.join(PATH, f"{PREFIX}.x.vierstra.archetypes.bed")
    CLEANED = os.path.join(PATH, f"{PREFIX}.x.vierstra.archetypes.cleaned.bed")
    CLEANED_TABLE = os.path.join(PATH, f"{PREFIX}.x.vierstra.archetypes.cleaned.table.bed")


    section = "tfbs"
    crw.check(config, section)

    config[section]["vierstra"] = VIERSTRA  # write the tiles
    config[section]["vierstra.cleaned"] = CLEANED  # write the tiles
    config[section]["vierstra.cleaned.table"] = CLEANED_TABLE # max TFBS motif score, no redundancies within 13 bp. Not strand specific

else:
    
    TILES = "/wynton/group/ahituv/biomarin/library_2/Design/biomarin-lib2-hg38-final.bed"
    PATH = config[section]["PATH2"]
    PREFIX = "biomarin-lib2-hg38-final"
    VIERSTRA = os.path.join(PATH, f"{PREFIX}.x.vierstra.archetypes.bed")
    CLEANED = os.path.join(PATH, f"{PREFIX}.x.vierstra.archetypes.cleaned.bed")
    CLEANED_TABLE = os.path.join(PATH, f"{PREFIX}.x.vierstra.archetypes.cleaned.table.bed")



    section = "tfbs2"
    crw.check(config, section)

    config[section]["vierstra"] = VIERSTRA  # write the tiles
    config[section]["vierstra.cleaned"] = CLEANED  # write the tiles
    config[section]["vierstra.cleaned.table"] = CLEANED_TABLE # max TFBS motif score, no redundancies within 13 bp. Not strand specific

crw.write(config, cfn)

# tiles 

In [9]:
df = pd.read_csv(VIERSTRA, sep='\t', nrows=3)

if "tile.coor" not in (list(df)):
    if LIB == "lib2":
        names = ["#chr", 'start_tile', "end_tile", "id", "#chr_arch", "start_arch", "end_arch", "group",
        'arch', "strand", "model", "num_models"]
    else:
        names = ["#chr", 'start_tile', "end_tile", "#chr_arch", "start_arch", "end_arch", "group",
        'arch', "strand", "model", "num_models"]
    df = pd.read_csv(VIERSTRA, sep='\t', header=None, names=names)

    ## rearrange columns for bedtools merging

    rearranged_cols = [ "#chr_arch", "start_arch", "end_arch", "group",
            'arch', "strand", "model", "num_models"]

    # rearrange columns
    df = df[rearranged_cols]

    # sort and save
    df.sort_values(by=list(df.columns[:3])).drop_duplicates().to_csv(VIERSTRA, sep='\t', index=False)
else:
    print('already rearranged columns')
    df = pd.read_csv(VIERSTRA, sep='\t')

  df = pd.read_csv(VIERSTRA, sep='\t', header=None, names=names)


## prepare dataframe for sliding window

In [10]:
rearranged_cols = [ "#chr_arch", "start_arch", "end_arch", "group",
        'arch', "strand", "model", "num_models"]

# rearrange columns
df = df[rearranged_cols]

# sort and save
df = df.sort_values(by=list(df.columns[:3])).drop_duplicates()

In [11]:
df.head()

Unnamed: 0,#chr_arch,start_arch,end_arch,group,arch,strand,model,num_models
2775711,chr10,623934,623942,HD/2,7.5948,+,HXB4_HUMAN.H11MO.0.B,3
2775712,chr10,623934,623943,HD/17,8.9578,+,Hoxc10.mouse_homeodomain_1,11
2775713,chr10,623934,623943,HD/18,7.6673,+,Hoxc10.mouse_homeodomain_2,10
2775714,chr10,623937,623949,POU/2,9.4311,-,POU4F2_POU_1,7
2775715,chr10,623943,623960,ZNF85,6.8064,-,ZNF85_HUMAN.H11MO.0.C,1


## sliding window function

In [12]:
def slideWindowCollapseGroup(df):
    print(df.shape)

    collection = {}  # collect results

    # cols to query
    cols = [ "#chr_arch", "start_arch", "end_arch", "group","strand", "arch"]

    # compute length 
    df["len"] = df.end_arch-df.start_arch

    # compute median length
    median_len = df.len.describe()["50%"]

    # per chromosome, query starts
    for chr_ in set(df['#chr_arch']):
        print(chr_)
        test= df.loc[df['#chr_arch'] == chr_].copy()

        for start in test['start_arch'].unique():

            # test windows + median tfbs len range
            window=test.loc[test['start_arch'].between(start, int(start+median_len)), cols]

            # no group in window
            if window.shape[0] ==0:
                continue

            # if unique groups in window only
            elif len(set(window["group"])) == window.shape[0]:

                # add to collection
                collection[f'{chr_}:{start}']= window

                continue

            # possibly non-unique
            else:
                # get max score among tfs
                # preserve strand information
                max_scores = window.groupby(["group", "strand"])['arch'].max().reset_index()  

                # right join to keep only max_scores
                window = pd.merge(window, max_scores, how ="right")

                collection[f'{chr_}:{start}']= window[cols]

    # combine results
    cleaned = pd.concat(collection.values()).drop_duplicates()
    print("got rid of N redundacies=", df.shape[0]-cleaned.shape[0], 
          "old:", df.shape, 
          "new", cleaned.shape)
    
    return cleaned

## Sliding window to remove redundant TFBS archetype predictions

In [13]:
cleaned = slideWindowCollapseGroup(df)

(490268, 8)
chr13
chr21
chrY
chr15
chr6
chr7
chrX
chr9
chr20
chr19
chr8
chr18
chr12
chr22
chr10
chr5
chr14
chr3
chr11
chr16
chr4
chr17
got rid of N redundacies= 68331 old: (490268, 9) new (421937, 6)


## write cleaned file

In [14]:
cleaned.head()

Unnamed: 0,#chr_arch,start_arch,end_arch,group,strand,arch
0,chr13,19864629,19864643,CTCF,+,8.128
1,chr13,19864627,19864647,GC-tract,+,6.129
2,chr13,19864641,19864661,GC-tract,-,8.0062
3,chr13,19864637,19864653,HEN1,-,4.5627
4,chr13,19864633,19864650,KLF/SP/2,+,7.9062


In [16]:
cleaned.shape, df.shape

((421937, 6), (490268, 9))

In [17]:
cleaned.to_csv(CLEANED, sep='\t', index=False)

# intersect tiles back into cleaned archetype dataset 

In [21]:
A = pbt.BedTool(TILES)
B = pbt.BedTool(CLEANED)

C = os.path.join(PATH, f"{PREFIX}.x.vierstra.archetypes.cleaned.re-intersect.lib1.bed")

A.intersect(B, wa=True, wb=True, output=C)

<BedTool(/wynton/group/ahituv/biomarin/library_2/Design/biomarin-lib2-hg38-final.x.vierstra.archetypes.cleaned.re-intersect.lib1.bed)>

# format and pivot table for archetype data

In [23]:
if LIB == "lib1":
    keepcols =[0,1,2,5,10,14,15,16]
    names = ["#chr", 'start_tile', "end_tile","cell_origin","enh_id", "TFArch", "TFStrand", "TFscore"]
else:
    keepcols =[0,1,2,3,7,8,9]
    names = ["#chr", 'start_tile', "end_tile","id", "TFArch", "TFStrand", "TFscore"]
df = pd.read_csv(C, sep='\t', header=None, 
                usecols =keepcols, names=names
                ).drop_duplicates()
print(df.shape)

df.head()

  df = pd.read_csv(C, sep='\t', header=None,


(4471990, 7)


Unnamed: 0,#chr,start_tile,end_tile,id,TFArch,TFStrand,TFscore
0,chr22,31898081,31898351,shuffle_64,GC-tract,+,9.8098
1,chr22,31898081,31898351,shuffle_64,ZNF354,-,7.3128
2,chr22,31898081,31898351,shuffle_64,PRDM4,-,2.186
3,chr22,31898081,31898351,shuffle_64,LEF1,-,7.804
4,chr22,31898081,31898351,shuffle_64,IRF/2,+,7.8743


## add tile coordinates back in

In [24]:
df = chr_functions.makeCoorAnnot(df, "#chr", 'start_tile', "end_tile", "tile")
df.head()

Unnamed: 0,#chr,start_tile,end_tile,id,TFArch,TFStrand,TFscore,tile.coor
0,chr22,31898081,31898351,shuffle_64,GC-tract,+,9.8098,chr22:31898081-31898351
1,chr22,31898081,31898351,shuffle_64,ZNF354,-,7.3128,chr22:31898081-31898351
2,chr22,31898081,31898351,shuffle_64,PRDM4,-,2.186,chr22:31898081-31898351
3,chr22,31898081,31898351,shuffle_64,LEF1,-,7.804,chr22:31898081-31898351
4,chr22,31898081,31898351,shuffle_64,IRF/2,+,7.8743,chr22:31898081-31898351


## groupby max score regardless of strand (creates redundancies. too hard.) and pivot

In [25]:
table = pd.pivot(df.groupby(["tile.coor", "TFArch"])["TFscore"].max().reset_index(), 
        index="tile.coor", 
         columns="TFArch", 
         values="TFscore"
        ).reset_index()

table.head()

TFArch,tile.coor,AHR,AIRE,AP1/1,AP1/2,ARI5A,ARI5B,BATF,BCL6/1,BCL6/2,...,ZNF652,ZNF667,ZNF680,ZNF708,ZNF713,ZNF768,ZNF784,ZNF85,ZSCAN3,ZSCAN4
0,chr10:101031026-101031296,,,,,,,,,7.4566,...,,8.4637,,7.1276,,8.8228,,,7.1462,
1,chr10:101031046-101031316,,,,,,,,,,...,,8.4637,,7.1276,,8.8228,,,7.1462,
2,chr10:101031066-101031336,,,,,,,,,,...,,8.4637,,7.1276,,4.3834,,,,
3,chr10:101031086-101031356,,,,,,,,,,...,,8.4637,,7.1276,,4.3834,,,,
4,chr10:101031106-101031376,,,,,,,,,,...,,8.4637,,7.1276,,4.3834,,,,


## write table

In [26]:
table.drop_duplicates().to_csv(CLEANED_TABLE, sep='\t', index=False)

In [28]:
table.drop_duplicates().shape

(34913, 283)

In [27]:
CLEANED_TABLE

'/wynton/group/ahituv/biomarin/library_2/Design/biomarin-lib2-hg38-final.x.vierstra.archetypes.cleaned.table.bed'

In [None]:
tabl