20240124

SarahFong

Take the output from DiffBind (summarized peaks and count matrix) and prepare the data for model training:
1. Clean diffbind normalized count data (remove bad coordinates)
2. Mean normalized read count: take mean of normalized count peaks, per condition
3. Trim peaks: center 270 bp
4. Format dataframe for LegNet, multitask
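5. Split into training, validation, and test sets by chromosome - hold out 2 chromosomes for testing and 1 for validation (fixed in the params below, or randomly sampled when not specified).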
6. Add fold num to training set

In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import config_readwrite as crw
import matplotlib.pyplot as plt
import numpy as np
import os, sys
import pandas as pd

from scipy import stats

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import plot_params as pp
pp.fonts()

('sans-serif', 'Arial', 18)

In [2]:
# Load the shared config file that sits in the current working directory.
# NOTE(review): crw.read appears to return the parsed config together with the
# path it was read from (cfn is later handed to crw.write) - confirm in
# config_readwrite.
config_name =os.path.join(os.getcwd(), "config.multi.ini")
config, cfn = crw.read(config_name)

# load data

## params

In [3]:
# cell line label; used as a prefix for input/output file names and for the
# config section name
CL = 'hepg2'

# PATH: directory holding the ATAC-seq count files (the notebook changes into it below)
# DATA_PATH: directory the train/val/test files are written to
PATH = "/wynton/group/ahituv/data/US-MPRA/ATAC-seq"
DATA_PATH = "/wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/deepstarr"

# chromosomes held out for testing (list) and for validation (single name)
TEST_CHR = ["chr8", "chr17"]
VAL_CHR = "chr12"


# peak information
# PEAK_SIZE: width in bp of the centered peak window (see notebook header)
# MIN_READ_DEPTH: threshold used by minReadDepthFilter
PEAK_SIZE = 270
MIN_READ_DEPTH = 4

# genome information
# hg38 chromosome sizes file and genome FASTA
HG38= "/wynton/group/ahituv/data/dna/hg38/hg38.chrom.sizes"
FA_HG38="/wynton/group/ahituv/data/dna/hg38/hg38.fa"


# column subsets of the peak table; fa_cols (identifier, sequence) is what
# gets passed to writeFa
cols = ['coor.type', "seq", "ctrl", "US", "fold_num"]
fa_cols = ['coor.type', "seq"]


# work from the ATAC-seq directory so relative file names resolve against it
os.chdir(PATH)

### set prefix

In [4]:
# Dataset variant to build. The prefix selects which filters are applied to
# the peak table and is also used as the prefix of every output file name.
PREFIX = "class.all"

if PREFIX == "class.all":
    # keep every peak, including peaks accessible in both conditions
    QUANTILE_FILTER = False
    MIN_READ_DEPTH_FILTER = False
    JOINT_ACCESSIBLE = True
    CLASS_LABEL = True

elif PREFIX == "class.nojoint":
    # apply the read depth filter and drop peaks accessible in both conditions
    QUANTILE_FILTER = False
    MIN_READ_DEPTH_FILTER = True
    JOINT_ACCESSIBLE = False
    CLASS_LABEL = True

else:
    # fail here instead of with a NameError when the flags are first used
    raise ValueError(
        f"unknown PREFIX {PREFIX!r}; expected 'class.all' or 'class.nojoint'"
    )
    

## base files

In [5]:
# normalized read count files: all samples, control only, treated only
ALL = f"{CL}_counts.txt"
CONTROL = f"{CL}c_counts.txt"
TREATED = f"{CL}t_counts.txt"

# DESEQ2 information
DIFF = './diffbind_results/hepg2_deseq2.csv'
# Swap the extension with splitext. str.strip(".csv") removes any of the
# characters '.', 'c', 's', 'v' from BOTH ends (including the leading '.'),
# so it only produced a usable path by accident.
DIFF_BED = os.path.splitext(DIFF)[0] + ".bed"

# write training, test files
FULL = f"{CL}.centered.coor.mean.reads.full.tsv"
HELD_OUT = f"{CL}.heldoutchromosomes.tsv"
HELD_OUT_FA = f"{CL}.heldoutchromosomes.fa"
TRAINING = f"{CL}.training.tsv"

## base config

In [6]:
# Record the ATAC-seq input/output file locations in the config section for
# this cell line. Values use "%(path)s" so they are expressed relative to the
# section's "path" entry.
section = f"{CL}-ATAC"
crw.check(config, section)

config[section]["path"]= PATH

# normalized read counts
config[section]["all_norm_read_counts"] = "%(path)s/" + ALL
config[section]["ctrl_norm_read_counts"] = "%(path)s/" +CONTROL
config[section]["trmt_norm_read_counts"] = "%(path)s/" +TREATED

# differential accessibility results (csv and bed)
config[section]["diff_bind_results"] = "%(path)s/" +DIFF
config[section]["diff_bind_results_bed"] = "%(path)s/" +DIFF_BED

# held-out chromosome and training files
config[section]["HELD_OUT"] = "%(path)s/" +HELD_OUT
config[section]["HELD_OUT_FA"] = "%(path)s/" +HELD_OUT_FA
config[section]["TRAINING"] = "%(path)s/" +TRAINING


## deepstarr files

In [7]:
# file inputs for training, testing
# every name is prefixed with PREFIX so each dataset variant gets its own files
## x
# sequences, one FASTA file per split
TRAIN_FA = f"{PREFIX}.Sequences_Train.fa"
VAL_FA = f"{PREFIX}.Sequences_Val.fa"
TEST_FA = f"{PREFIX}.Sequences_Test.fa"

## y
# target values, one tab-separated file per split
TRAIN_TARGET = f"{PREFIX}.Sequences_activity_Train.txt"
VAL_TARGET = f"{PREFIX}.Sequences_activity_Val.txt"
TEST_TARGET = f"{PREFIX}.Sequences_activity_Test.txt"

## deepstarr config 

In [8]:
# Settings shared by every deepstarr dataset variant.
section = "Hepg2.atac.deepstarr"
crw.check(config, section)

# deepstarr config
config[section]["data_path"] = DATA_PATH
config[section]["held_out_chr"] = ",".join(TEST_CHR)
config[section]["val_chr"] = VAL_CHR


# deepstarr prefix config: file names and filter settings for this PREFIX
section = f"Hepg2.atac.deepstarr.{PREFIX}"
crw.check(config, section)

config[section]["train_fa"] = TRAIN_FA
config[section]["val_fa"] = VAL_FA
config[section]["test_fa"] = TEST_FA

config[section]["train_target"] = TRAIN_TARGET
config[section]["val_target"] = VAL_TARGET
# legacy misspelled key, kept so existing readers of the config keep working
config[section]["tval_target"] = VAL_TARGET
config[section]["test_target"] = TEST_TARGET


# record which filters were applied; the read depth threshold is only written
# when that filter is active
config[section]["filter_min_read_depth"] = str(MIN_READ_DEPTH_FILTER)
if MIN_READ_DEPTH_FILTER:
    config[section]["min_read_depth"] = str(MIN_READ_DEPTH)

config[section]["filter_quantile"] = str(QUANTILE_FILTER)
config[section]["filter_jointaccessible"] = str(JOINT_ACCESSIBLE)
config[section]["classlabel"] = str(CLASS_LABEL)

In [9]:
# persist the updated config via the project helper, passing the cfn value
# returned by crw.read
crw.write(config, cfn)

# functions

## make chromosome list

In [10]:
def chrList():
    """Build the chromosome names chr1..chr22 followed by chrX and chrY.

    Returns
    -------
    list of str
        A new 24-element list on every call.
    """
    autosomes = [f"chr{number}" for number in range(1, 23)]
    sex_chromosomes = ["chrX", "chrY"]

    return autosomes + sex_chromosomes

## write fa

In [11]:
def writeFa(heldout_df, heldout_fa):
    """Write the first two columns of a dataframe as FASTA records.

    Parameters
    ----------
    heldout_df : pandas.DataFrame
        First column is used as the record header, second as the sequence.
    heldout_fa : str
        Output file path; the file is overwritten.
    """
    with open(heldout_fa, "w") as handle:
        for _, record in heldout_df.iterrows():
            # only the leading two values of each row are used
            seq_id, sequence = record.iloc[:2]
            handle.write(f">{seq_id}\n{sequence}\n")

## train test split on chromosome

In [12]:
def _sampleChromosomes(df, n, exclude):
    """Randomly pick n distinct chromosomes present in df["#chr"], skipping exclude."""
    candidates = [
        name for name in sorted(df["#chr"].dropna().unique())
        if name not in exclude
    ]
    # replace=False guarantees distinct chromosomes; numpy raises ValueError
    # when fewer than n candidates are available
    return list(np.random.choice(candidates, n, replace=False))


def splitTrainTestVal(df, val_chr_list=None, test_chr_list=None):
    """Split a dataframe into train, validation, and test sets by chromosome.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "#chr" column with chromosome names.
    val_chr_list : list of str, str, or None
        Validation chromosome(s). When None, one chromosome is sampled at
        random from those in df that are not test chromosomes.
    test_chr_list : list of str, str, or None
        Test chromosome(s). When None, two distinct chromosomes are sampled
        at random from those in df that are not validation chromosomes.

    Returns
    -------
    train, val, test : pandas.DataFrame
        Independent copies; train holds every row whose chromosome is in
        neither the test nor the validation list.

    Raises
    ------
    ValueError
        If random sampling is requested but df has too few chromosomes left.
    """
    # accept a single chromosome name as well as a list (isin needs list-like)
    if isinstance(val_chr_list, str):
        val_chr_list = [val_chr_list]
    if isinstance(test_chr_list, str):
        test_chr_list = [test_chr_list]

    # randomly sample test chromosomes (n=2)
    if test_chr_list is None:
        print('randomly sampling chromosomes for test')
        reserved = set(val_chr_list) if val_chr_list is not None else set()
        test_chr_list = _sampleChromosomes(df, 2, reserved)

    # randomly sample validation chromosome (n=1), never a test chromosome
    if val_chr_list is None:
        print('randomly sampling chromosomes for val')
        val_chr_list = _sampleChromosomes(df, 1, set(test_chr_list))

    # separate held out chromosomes from training chromosomes
    is_test = df["#chr"].isin(test_chr_list)
    is_val = df["#chr"].isin(val_chr_list)

    test = df.loc[is_test].copy()
    val = df.loc[is_val].copy()
    train = df.loc[~is_test & ~is_val].copy()

    return train, val, test


## filters

In [13]:
def minReadDepthFilter(table, min_read_depth):
    """Keep rows where "ctrl" or "US" is strictly above min_read_depth.

    Prints the table shape before and after filtering and returns a copy of
    the surviving rows.
    """
    print("before read depth filter:", table.shape)

    ctrl_passes = table["ctrl"] > min_read_depth
    us_passes = table["US"] > min_read_depth
    kept = table.loc[ctrl_passes | us_passes].copy()

    print("after:", kept.shape)

    return kept

def quantileFilter(table, quantile=0.99):
    """Upper quantile filter for read count values.

    Keeps rows whose "ctrl" AND "US" values are both strictly below that
    column's `quantile` threshold. Prints the table shape before and after.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain numeric "ctrl" and "US" columns.
    quantile : float
        Quantile in [0, 1] used as the per-column upper cutoff.

    Returns
    -------
    pandas.DataFrame
        A copy of the surviving rows (original index preserved).
    """
    print("before quantile filter:", table.shape)

    # per-column cutoffs
    thresholds = table[["ctrl", "US"]].quantile(quantile)
    ctrl_thresh, us_thresh = thresholds["ctrl"], thresholds["US"]

    # filter table; copy so that later column assignments (e.g. classLabel)
    # act on an independent frame, as the other filters here already do
    below_both = (table["ctrl"] < ctrl_thresh) & (table["US"] < us_thresh)
    table = table.loc[below_both].copy()

    print("after:", table.shape)

    return table

def jointAccessibleFilter(table):
    """Keep only rows with signal in exactly one of "ctrl" / "US", then shuffle.

    Rows with ctrl > 0 and US == 0, and rows with ctrl == 0 and US > 0, are
    retained; everything else is dropped. The result is row-shuffled and
    re-indexed from 0. Prints the table shape before and after.
    """
    print("before joint_accessible filter:", table.shape)

    # condition-specific rows
    ctrl_specific = (table["ctrl"] > 0) & (table["US"] == 0)
    us_specific = (table["ctrl"] == 0) & (table["US"] > 0)

    # stack ctrl-only rows on top of US-only rows, then shuffle
    exclusive = pd.concat([table.loc[ctrl_specific], table.loc[us_specific]])
    shuffled = exclusive.sample(frac=1).reset_index(drop=True)

    print("after:", shuffled.shape)

    return shuffled

## binarize

def classLabel(table):
    """Binarize the "ctrl" and "US" read count columns in place.

    Every value equal to 0 becomes 0 and every other value becomes 1
    (missing values compare unequal to 0 and are therefore labelled 1).

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain "ctrl" and "US" columns; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `table` object, for convenient reassignment.
    """
    for column in ("ctrl", "US"):
        # vectorized comparison instead of a per-element Python lambda
        table[column] = table[column].ne(0).astype(int)

    return table

# scramble df

def dfShuffle(df):
    """Return the rows of df in random order with a fresh 0..n-1 index."""
    permuted = df.sample(frac=1)
    permuted = permuted.reset_index(drop=True)
    return permuted

# Main

In [14]:
# display the file name of the full peak table that is read in the next cell
FULL

'hepg2.centered.coor.mean.reads.full.tsv'

In [15]:
# Load the full peak table (tab-separated). FULL is a bare file name, so it is
# resolved against the current working directory.
table = pd.read_csv(FULL, sep='\t')
# Build a "<type>|<seq.id>" identifier per row; it is the first column of
# fa_cols and therefore becomes the FASTA header written by writeFa.
table["coor.type"] = table["type"] + "|" + table["seq.id"]

##  min read depth filter

In [16]:
# Apply the filters selected for this PREFIX, in order.

## min read depth filter
if MIN_READ_DEPTH_FILTER:
    table = minReadDepthFilter(table, MIN_READ_DEPTH)

## quantile filter
if QUANTILE_FILTER:
    table = quantileFilter(table, quantile=0.99)

## joint accessible filter
# JOINT_ACCESSIBLE False means peaks accessible in both conditions are removed
if not JOINT_ACCESSIBLE:
    table = jointAccessibleFilter(table)

## apply class label
if CLASS_LABEL:
    table = classLabel(table)

## train on all atac peaks

In [17]:
# count rows for each (ctrl, US) label combination to check class balance
table.groupby(['ctrl', "US"])['#chr'].count()

ctrl  US
0     1     16560
1     0     26540
      1     39415
Name: #chr, dtype: int64

In [18]:
# all split files are written relative to the deepstarr data directory
os.chdir(DATA_PATH)

# shuffle row order first, then split by chromosome
table = dfShuffle(table)
train, val, test = splitTrainTestVal(
    table,
    val_chr_list=[VAL_CHR],
    test_chr_list=TEST_CHR,
)

# x: one FASTA file per split
for split_df, fasta_name in ((test, TEST_FA), (train, TRAIN_FA), (val, VAL_FA)):
    writeFa(split_df[fa_cols], fasta_name)

# y: one tab-separated target file per split, columns in the order US, ctrl
cols = ["US", "ctrl"]
for split_df, target_name in ((test, TEST_TARGET), (train, TRAIN_TARGET), (val, VAL_TARGET)):
    split_df[cols].to_csv(target_name, sep='\t', index=False)