In [1]:
import config_readwrite as crw
import os, sys
import pandas as pd

# config

In [2]:
# config.ini sits in the parent of the current working directory.
cfn = os.path.join(os.path.split(os.getcwd())[0], "config.ini")

# crw.read hands back two values: the config object and a config file name,
# which replaces the path built above.
config, cfn = crw.read(cfn)

In [3]:
section = "agarwal_mpra"
# NOTE(review): crw.check presumably ensures this section exists in config — confirm.
crw.check(config, section)

# Source directory (Agarwal 2023 MPRA files) and directory for derived outputs.
AGARWAL_PATH = "/wynton/home/ahituv/fongsl/MPRA/agarwal_2023"
PATH = "/wynton/home/ahituv/fongsl/EMF/US/data"

FA = os.path.join(AGARWAL_PATH, "joint_library.fa")      # library fasta (input)
OUT_PRE = os.path.join(PATH, "AGARWAL.seqs.txt")         # sequences only (output)
OUT_ALL = os.path.join(PATH, "AGARWAL.seqs.all.tsv")     # seq.id + sequence (output)

# Register every path under the section, in the same order as before.
for option, value in [
    ("PATH", AGARWAL_PATH),
    ("FA", FA),
    ("seq_only", OUT_PRE),
    ("seq.id_seq", OUT_ALL),
]:
    config[section][option] = value

# write sequence to file

In [4]:
# Read the fasta file into a dict mapping sequence id -> sequence.
#
# - A header is any line that *starts* with ">"; the id is everything after it.
# - Sequence lines are concatenated, so records wrapped over several lines
#   are kept whole instead of being truncated to their first line.
# - Blank lines are ignored rather than stored under an empty id.
# - If an id appears twice, the later record replaces the earlier one.
fa_dict = {}
with open(FA, "r") as reader:
    key = None           # id of the record currently being read
    starts_record = False  # True until the first sequence line of a record is seen
    for line in reader:
        line = line.strip()
        if not line:
            continue  # skip blank lines (e.g. trailing newline at end of file)

        if line.startswith(">"):  # handle sequence id
            key = line[1:]
            starts_record = True
        elif key is None:
            # sequence data with no header above it: the file is malformed
            raise ValueError(f"sequence line found before any '>' header in {FA}")
        elif starts_record:  # first sequence line of this record
            fa_dict[key] = line
            starts_record = False
        else:  # continuation line of a wrapped sequence
            fa_dict[key] += line


# make fasta dataframe

In [5]:
# One row per fasta record: the sequence id and its sequence.
df = pd.DataFrame(
    {
        "seq.id": list(fa_dict.keys()),
        "seq": list(fa_dict.values()),
    }
)

df.head()

Unnamed: 0,seq.id,seq
0,chr1:19461350-19461550,GTAGGACCAGGCTCTGCCTCCTTCTCTCTCCCCCGACCCACCTGCC...
1,chr10:88965538-88965738,TGTACTTGCCTTAGGAGGTCAAACACATTAGGTATACCAACATATC...
2,chr10:89029900-89030100,TAGCTCAACACAAATCCATCACTGAAAGCAGTGCAACCAAGGCAGC...
3,chr10:89032143-89032343,GACCCTAAATCAGTATGGGGAAAGGTCCCCACTGCAGCACTTTCTA...
4,chr10:89111910-89112110,GTGATCTCAGCTTACTGCAACCTCTGCCTCCTGGGTTCAAGCAATT...


## write sequences only

In [6]:
# Save only the sequences, one per line (no header row, no index column).
df["seq"].to_csv(OUT_PRE, sep='\t', header=False, index=False)

# Sanity check: length of the last sequence in the table.
# Read it from df rather than from a loop variable left over from the
# fasta-parsing cell, so this cell does not depend on hidden kernel state.
len(df["seq"].iloc[-1])

200

## write seq.id + sequence

In [7]:
# Write the two-column table (seq.id, seq) as tab-separated text,
# without a header row or an index column.
df.to_csv(OUT_ALL, sep="\t", header=False, index=False)

df.head()

Unnamed: 0,seq.id,seq
0,chr1:19461350-19461550,GTAGGACCAGGCTCTGCCTCCTTCTCTCTCCCCCGACCCACCTGCC...
1,chr10:88965538-88965738,TGTACTTGCCTTAGGAGGTCAAACACATTAGGTATACCAACATATC...
2,chr10:89029900-89030100,TAGCTCAACACAAATCCATCACTGAAAGCAGTGCAACCAAGGCAGC...
3,chr10:89032143-89032343,GACCCTAAATCAGTATGGGGAAAGGTCCCCACTGCAGCACTTTCTA...
4,chr10:89111910-89112110,GTGATCTCAGCTTACTGCAACCTCTGCCTCCTGGGTTCAAGCAATT...


# write activity

In [8]:
PREFIX = "AGARWAL.seqs"

# Inputs shipped with the Agarwal data
TRUTH = os.path.join(AGARWAL_PATH, "all_cell_types_summary.csv")  # measured log2(rna/dna) per cell type
PRED_COOR = os.path.join(AGARWAL_PATH, "joint_library.bed")  # hg38 coordinates + element name

# Files written later in this notebook
INFO = os.path.join(AGARWAL_PATH, "joint_info.tsv")  # merged table: seq.id, seq, name, activity, bins
MULTI_BIN = os.path.join(PATH, "agarwal.multibin.txt")  # seq + one bin column per cell type

# Register the paths in the config section (key order kept as before).
config[section]["MPRA_ACTIVITY"] = TRUTH
config[section]["BED_COOR"] = PRED_COOR
config[section]["MPRA_INFO"] = INFO

# Both multibin keys point at the same file.
config[section]["MULTIBIN"] = MULTI_BIN
config[section]["MULTIBIN_training"] = MULTI_BIN

crw.write(config, cfn)

## load

## MPRA activity

In [9]:
# Measured MPRA activity: one row per element name, one column per cell type.
truth = pd.read_csv(TRUTH, sep=",")

truth.head()

Unnamed: 0,name,HepG2 [log2(rna/dna)],K562 [log2(rna/dna)],WTC11 [log2(rna/dna)]
0,WTC11_seq9998_F,-1.018,-1.018,-1.254
1,WTC11_seq9987_F,0.32,-0.439,-1.505
2,WTC11_seq998_F,0.003,-0.217,-0.462
3,WTC11_seq9970_F,-0.794,-1.026,-1.414
4,WTC11_seq9967_F,-0.485,-0.592,-1.346


## load bedfile

In [10]:
# Load the tab-separated bed file and append a "seq.id" column of the form
# "<chr>:<start>-<stop>", built from the hg38 coordinate columns.
bed = pd.read_csv(PRED_COOR, sep='\t').assign(
    **{
        "seq.id": lambda coords: (
            coords["#chr"]
            + ":"
            + coords["start.hg38"].map(str)
            + "-"
            + coords["stop.hg38"].map(str)
        )
    }
)
bed.head()

Unnamed: 0,#chr,start.hg38,stop.hg38,name,str.hg38,seq.id
0,chr1,19461350,19461550,WTC11_seq1000_F,+,chr1:19461350-19461550
1,chr10,88965538,88965738,WTC11_seq10002_F,+,chr10:88965538-88965738
2,chr10,89029900,89030100,WTC11_seq10004_F,+,chr10:89029900-89030100
3,chr10,89032143,89032343,WTC11_seq10005_F,+,chr10:89032143-89032343
4,chr10,89111910,89112110,WTC11_seq10007_F,+,chr10:89111910-89112110


In [11]:
# Attach the element name to every sequence by joining on the coordinate id.
# The join key is spelled out so that a column accidentally shared by both
# frames cannot silently become part of the key.
seq_id = pd.merge(df, bed[["seq.id", "name"]], how="inner", on="seq.id")

# Attach the measured activity by joining on the element name.
merged = pd.merge(seq_id, truth, how="inner", on="name")
merged.head()

Unnamed: 0,seq.id,seq,name,HepG2 [log2(rna/dna)],K562 [log2(rna/dna)],WTC11 [log2(rna/dna)]
0,chr1:19461350-19461550,GTAGGACCAGGCTCTGCCTCCTTCTCTCTCCCCCGACCCACCTGCC...,WTC11_seq1000_F,0.633,0.08,-0.387
1,chr10:88965538-88965738,TGTACTTGCCTTAGGAGGTCAAACACATTAGGTATACCAACATATC...,WTC11_seq10002_F,-1.157,-1.147,-1.507
2,chr10:89029900-89030100,TAGCTCAACACAAATCCATCACTGAAAGCAGTGCAACCAAGGCAGC...,WTC11_seq10004_F,-0.066,0.186,0.655
3,chr10:89032143-89032343,GACCCTAAATCAGTATGGGGAAAGGTCCCCACTGCAGCACTTTCTA...,WTC11_seq10005_F,0.846,1.212,-0.637
4,chr10:89111910-89112110,GTGATCTCAGCTTACTGCAACCTCTGCCTCCTGGGTTCAAGCAATT...,WTC11_seq10007_F,-0.784,-0.63,-0.612


In [12]:
# Number of equal-frequency activity bins per cell line.
N_BINS = 18

# For each cell line, add a "<cell line>_q" column holding the quantile bin
# (0 .. N_BINS-1, stored as float) of its log2(rna/dna) activity.
# labels=False makes qcut return the integer bin index directly, so no
# numpy import is needed to build the label array.
for CL in ["WTC11", "HepG2", "K562"]:
    merged[f'{CL}_q'] = pd.qcut(
        merged[f'{CL} [log2(rna/dna)]'], N_BINS, labels=False
    ).astype(float)

merged.head()

Unnamed: 0,seq.id,seq,name,HepG2 [log2(rna/dna)],K562 [log2(rna/dna)],WTC11 [log2(rna/dna)],WTC11_q,HepG2_q,K562_q
0,chr1:19461350-19461550,GTAGGACCAGGCTCTGCCTCCTTCTCTCTCCCCCGACCCACCTGCC...,WTC11_seq1000_F,0.633,0.08,-0.387,12.0,15.0,12.0
1,chr10:88965538-88965738,TGTACTTGCCTTAGGAGGTCAAACACATTAGGTATACCAACATATC...,WTC11_seq10002_F,-1.157,-1.147,-1.507,2.0,1.0,0.0
2,chr10:89029900-89030100,TAGCTCAACACAAATCCATCACTGAAAGCAGTGCAACCAAGGCAGC...,WTC11_seq10004_F,-0.066,0.186,0.655,15.0,11.0,13.0
3,chr10:89032143-89032343,GACCCTAAATCAGTATGGGGAAAGGTCCCCACTGCAGCACTTTCTA...,WTC11_seq10005_F,0.846,1.212,-0.637,10.0,16.0,16.0
4,chr10:89111910-89112110,GTGATCTCAGCTTACTGCAACCTCTGCCTCCTGGGTTCAAGCAATT...,WTC11_seq10007_F,-0.784,-0.63,-0.612,10.0,3.0,5.0


## write merged file w/ info

In [13]:
# Write the full merged table (ids, sequences, names, activity, bins) as
# tab-separated text with a header row and without the index column.
merged.to_csv(INFO, sep="\t", header=True, index=False)

## write multipred file 
- predict multiple bins

In [14]:
# Sequence plus the three per-cell-line bin columns, de-duplicated,
# written as tab-separated text with a header row.
(
    merged.loc[:, ["seq", 'WTC11_q', 'HepG2_q', 'K562_q']]
    .drop_duplicates()
    .to_csv(MULTI_BIN, sep='\t', index=False)
)

## write cell-specific input prediction file

In [15]:
# One headerless two-column file (sequence, bin) per cell line.
for CL in ["WTC11", "HepG2", "K562"]:
    PREFIX2 = "AGARWAL.seqs." + CL + ".binnedMPRA"
    INPUT_DATA = os.path.join(PATH, PREFIX2 + ".txt")  # file to write

    # record the path in the config whether or not the file is rewritten
    config[section][PREFIX2] = INPUT_DATA

    # an existing file is left untouched
    if not os.path.exists(INPUT_DATA):
        cell_line_bins = merged[["seq", f'{CL}_q']].drop_duplicates()
        cell_line_bins.to_csv(INPUT_DATA, sep='\t', header=False, index=False)

crw.write(config, cfn)

In [16]:
# Build a training/test split per cell line and per training-set size.
#
# size > 1  -> absolute number of rows to sample for training
# size <= 1 -> fraction of rows to sample for training
#
# An existing training file is reused so the split stays stable across runs;
# otherwise a new sample is drawn and written. The test set is always every
# row of `merged` whose sequence is not in the training set.
SEED = 42  # fixed seed so a fresh run reproduces the same sample

for CL in ["WTC11", "HepG2", "K562"]:
    cols = ['seq', f'{CL}_q']

    for size in [5000, 0.9]:

        PREFIX_TRAIN, PREFIX_TEST = f"training.{CL}.{size}", f"test.{CL}.{size}"
        INPUT_TRAINING = os.path.join(PATH, f"{PREFIX_TRAIN}.txt")  # file to write
        INPUT_TESTING = os.path.join(PATH, f"{PREFIX_TEST}.txt")  # file to write

        # Guard on the training file itself (not on a path left over from an
        # earlier cell), so the first run samples instead of trying to read a
        # file that does not exist yet.
        if os.path.exists(INPUT_TRAINING):
            training = pd.read_csv(INPUT_TRAINING, sep='\t', header=None, names=cols)
        else:
            if size > 1:
                training = merged.sample(n=size, random_state=SEED)  # sample size
            else:
                training = merged.sample(frac=size, random_state=SEED)

            training[cols].drop_duplicates().to_csv(
                INPUT_TRAINING, sep='\t', header=False, index=False)

        # test set: every sequence that is not part of the training set
        test = merged.loc[~merged["seq"].isin(set(training['seq']))]
        test[cols].drop_duplicates().to_csv(
            INPUT_TESTING, sep='\t', header=False, index=False)

        config[section][PREFIX_TRAIN] = INPUT_TRAINING  # write to config
        config[section][PREFIX_TEST] = INPUT_TESTING  # write to config

crw.write(config, cfn)

## multibin training

In [17]:
# Build a training/test split over all three bin columns at once, for each
# training-set size (size > 1: row count, size <= 1: fraction of rows).
#
# An existing training file is reused instead of drawing a new sample, so the
# in-memory `training`/`test` frames always match the files on disk. The test
# set is every row of `merged` whose sequence is not in the training set.
SEED = 42  # fixed seed so a fresh run reproduces the same sample

cols = ["seq", 'WTC11_q', 'HepG2_q', 'K562_q']

for size in [5000, 0.9]:

    PREFIX_TRAIN, PREFIX_TEST = f"training.multibin.{size}", f"test.multibin.{size}"

    INPUT_TRAINING = os.path.join(PATH, f"{PREFIX_TRAIN}.txt")  # file to write
    INPUT_TESTING = os.path.join(PATH, f"{PREFIX_TEST}.txt")  # file to write

    if os.path.exists(INPUT_TRAINING):
        # reuse the split that was written on an earlier run
        training = pd.read_csv(INPUT_TRAINING, sep='\t', header=None, names=cols)
    else:
        if size > 1:
            training = merged.sample(n=size, random_state=SEED)  # sample size
        else:
            training = merged.sample(frac=size, random_state=SEED)

        training[cols].drop_duplicates().to_csv(
            INPUT_TRAINING, sep='\t', header=False, index=False)

    # Derived from the training set actually in use, so rewriting it is
    # repeatable and also restores a missing test file.
    test = merged.loc[~merged["seq"].isin(set(training['seq']))]
    test[cols].drop_duplicates().to_csv(
        INPUT_TESTING, sep='\t', header=False, index=False)

    config[section][PREFIX_TRAIN] = INPUT_TRAINING  # write to config
    config[section][PREFIX_TEST] = INPUT_TESTING  # write to config

crw.write(config, cfn)

In [20]:
# Preview the held-out rows left by the last loop iteration above.
test.loc[:, cols].head()

Unnamed: 0,seq,WTC11_q,HepG2_q,K562_q
19,CGAGTAGGGAACTGGAATGGGAGAGGATTAGGGAGGCGCCCTCCCT...,10.0,2.0,2.0
25,GGTGTGGGCATGGAGCCCAGCTGACCGAGGCAGCTGGGAACTGTTT...,16.0,15.0,11.0
30,GGACACTGTTGGAGACGCTTCCTCTCCACCTCTCTGGGAAAGTCTT...,8.0,9.0,9.0
44,GAGCCTCCACTTGGTGCTGATCTGTCTTGAATGCCTTTCTTTGAAA...,3.0,1.0,0.0
45,CACACACGGTACTGGTAGCACAGGCGTACACCTGGTGGGCATTGTG...,14.0,10.0,10.0
