In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os, sys
import pandas as pd
import seaborn as sns

# custom scripts
sys.path.append("/Users/sarahfong/tools/py_")
import config_readwrite as crw
import plot_params as pp
pp.fonts()

# config 
name = "local_config.ini"
# crw.read_config is a project helper; presumably it returns the parsed config
# and the config file path -- TODO confirm against config_readwrite
config, cfn = crw.read_config(name)

# paths used by the rest of the notebook, read from the LIBRARY section
section = "LIBRARY"
MPRA_LIBRARY = config[section]["15mer_constructs"]
RE = config[section]["results"]

# create the results directory, including any missing parent directories;
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable
os.makedirs(RE, exist_ok=True)

# load library

In [2]:
# read the tab-separated construct table at the path taken from the config (MPRA_LIBRARY)
df = pd.read_csv(MPRA_LIBRARY, sep='\t')

## single-nullomer inserts only

In [3]:
# restrict the library to rows whose experiment id is "single-null"
one_null = df[df["exp.id"].eq("single-null")].copy()

one_null.head()

Unnamed: 0,id,exp.id,exp.seq.id,exp.coor.id,coor,str,seq_frag,200bp_insert,200bp_insert+adaptors,gc_frac,len
701,single-null-0,single-null,0.0,0.0,chr22:42533418-42533618,+,TCGATCGCGAAGTGG,TTCTGAAAAGAGCATAGAAGAAATAATGACGTAAGCTGTCCTCTCT...,AGGACCGGATCAACTTTCTGAAAAGAGCATAGAAGAAATAATGACG...,0.495,230
703,single-null-0,single-null,0.0,1.0,chr17:81472134-81472334,+,TCGATCGCGAAGTGG,GATCCGAGGACAGCCATGCCACTCTCCGGTCACATGACCCACGCCA...,AGGACCGGATCAACTGATCCGAGGACAGCCATGCCACTCTCCGGTC...,0.58,230
705,single-null-0,single-null,0.0,2.0,chr1:23562395-23562595,+,TCGATCGCGAAGTGG,TGCCCAGCTGTGGGTCCCGGTCCCACAGCTGTGTTGATCTAAGACT...,AGGACCGGATCAACTTGCCCAGCTGTGGGTCCCGGTCCCACAGCTG...,0.66,230
707,single-null-0,single-null,0.0,3.0,chr10:68901084-68901284,+,TCGATCGCGAAGTGG,TAATCAGTGCATGCTTCCTTTATGGAGATTGGCCAGGGTTTTACTC...,AGGACCGGATCAACTTAATCAGTGCATGCTTCCTTTATGGAGATTG...,0.525,230
709,single-null-0,single-null,0.0,4.0,chr1:150977355-150977555,+,TCGATCGCGAAGTGG,GATGATGCGACCAGCTTTGTCAGCACGCTTGGGGAGCATACACACC...,AGGACCGGATCAACTGATGATGCGACCAGCTTTGTCAGCACGCTTG...,0.6,230


## write fa

In [4]:
def writeFa(ID, df, cols, RE):
    """Write the rows of `df` to a FASTA file named `<ID>.fa` inside `RE`.

    Each row becomes one two-line FASTA record:

        >{id}|{coor}|{seq_frag}
        {200bp_insert}

    Parameters
    ----------
    ID : str
        Basename (without extension) of the output file.
    df : pandas.DataFrame
        Must contain the columns "id", "coor", "seq_frag" and "200bp_insert".
    cols : list
        Not used by this function; kept so existing calls keep working.
    RE : str
        Directory the file is written into. It is not created here, so it
        must already exist.

    Returns
    -------
    str
        Path of the written file.
    """
    # make fa file
    out = os.path.join(RE, f"{ID}.fa")

    with open(out, "w") as writer:
        for _, row in df.iterrows():

            # first line in fa entry; every field goes through str() so that
            # non-string values (numeric ids, missing coordinates) do not raise
            # TypeError. A missing seq_frag is still written as "nan".
            tag = ">" + "|".join(str(row[col]) for col in ("id", "coor", "seq_frag"))

            # second line in fa entry
            seq = row["200bp_insert"]

            # write
            writer.write(f"{tag}\n{seq}\n")

    # no explicit close(): the with-block has already closed the file
    return out

## get one nullomer inserted into 100 scaffolds

In [None]:
# cols to extract: the fields writeFa reads for the FASTA header
# (id, coor, seq_frag) and for the sequence line (200bp_insert)
cols = ["id", "coor", "200bp_insert", "seq_frag"]

In [5]:
ID = "single-null-0"

# unique rows for this construct id, limited to the FASTA columns
test = df.loc[df["id"].eq(ID), cols].drop_duplicates()

# write the .fa file and display its path
single_null_out = writeFa(ID, test, cols, RE)
single_null_out

'/Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/single-null-0.fa'

## randomly select 30 nullomer ids

In [15]:
# randomly sample single-null ids
# - sorted unique ids: iteration order of a set of strings can differ between
#   kernels, so the candidate list is put in a fixed order first
# - seeded generator: the same ids are drawn on every Restart & Run All
# - replace=False: sampling with replacement (the default) can repeat an id,
#   which would silently produce fewer than 30 distinct .fa files
single_null_ids = sorted(set(one_null["id"]))
rng = np.random.default_rng(42)
random_30 = rng.choice(
    single_null_ids,
    size=min(30, len(single_null_ids)),  # cannot draw more unique ids than exist
    replace=False,
)

for ID in random_30:

    # subset data
    test = df.loc[df["id"]==ID, cols].copy().drop_duplicates()

    # write fa
    single_null_out = writeFa(ID, test, cols, RE)

    print("wrote", single_null_out)

wrote /Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/single-null-204.fa
wrote /Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/single-null-50.fa
wrote /Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/single-null-81.fa
wrote /Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/single-null-223.fa
wrote /Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/single-null-330.fa
wrote /Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/single-null-969.fa
wrote /Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/single-null-410.fa
wrote /Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/single-null-763.fa
wrote /Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/single-null-464.fa
wrote /Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/single-null-959.fa
wrote /Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/single-nu

## get the concatemer-nullomers (a random sample of 19 is written to the .fa)

In [31]:
ID = "concat-null"
cols = ["id", "coor", "200bp_insert", "seq_frag"]

# subset data: every unique row of the concat-null experiment
test = df.loc[df["exp.id"]==ID, cols].copy().drop_duplicates()

# write fa: only a random sample of 19 rows goes into the file;
# random_state pins the draw so a re-run writes the same 19 records
concat_out = writeFa(ID, test.sample(n=19, random_state=42), cols, RE)
concat_out

'/Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/concat-null.fa'

In [32]:
# preview the full de-duplicated concat-null subset (not only the 19 rows sampled for the .fa)
test.head()

Unnamed: 0,id,coor,200bp_insert,seq_frag
100800,concat-null-0,concat_template-chr22:42533418-42533618,TTTATTACGGTCGCGGGGTATTCGCTAGCGGGTATCTATCGGCGCG...,TATTACGGTCGCGGGGTATTCGCTAGCGGGTATCTATCGGCGCGGA...
100801,concat-null-1,concat_template-chr22:42533418-42533618,TTTTGACGCGATCGAGGCGTAGAGTTACGCGGCGTATACGATACGG...,TTGACGCGATCGAGGCGTAGAGTTACGCGGCGTATACGATACGGGT...
100802,concat-null-2,concat_template-chr22:42533418-42533618,TTCGTCGACGATACTGGCGAACCGTAAACGGGACGCGCGATAGTTG...,CGTCGACGATACTGGCGAACCGTAAACGGGACGCGCGATAGTTGGT...
100803,concat-null-3,concat_template-chr22:42533418-42533618,TTTATGATACCGCGCGGCGTATAGTCGGACGGACACGCGTAACGTG...,TATGATACCGCGCGGCGTATAGTCGGACGGACACGCGTAACGTGGC...
100804,concat-null-4,concat_template-chr22:42533418-42533618,TTAAAGTTAACGCGCGGCATATTCGACCGCGGCTCGACACGTATCG...,AAAGTTAACGCGCGGCATATTCGACCGCGGCTCGACACGTATCGGA...


## endogenous sequence .fa

In [49]:
ID = "endog"
cols = ["id", "coor", "200bp_insert", "seq_frag"]

# unique rows whose id is "endog", limited to the FASTA columns
test = df.loc[df["id"].eq(ID), cols].drop_duplicates()

# write the .fa file and display its path
out = writeFa(ID, test, cols, RE)
out

'/Users/sarahfong/Desktop/local_data/MPRA/nullomers/library/results/endog.fa'

In [50]:
# show the endogenous rows that were written to endog.fa
test

Unnamed: 0,id,coor,200bp_insert,seq_frag
700,endog,chr22:42533418-42533618,TTCTGAAAAGAGCATAGAAGAAATAATGACGTAAGCTGTCCTCTCT...,
702,endog,chr17:81472134-81472334,GATCCGAGGACAGCCATGCCACTCTCCGGTCACATGACCCACGCCA...,
704,endog,chr1:23562395-23562595,TGCCCAGCTGTGGGTCCCGGTCCCACAGCTGTGTTGATCTAAGACT...,
706,endog,chr10:68901084-68901284,TAATCAGTGCATGCTTCCTTTATGGAGATTGGCCAGGGTTTTACTC...,
708,endog,chr1:150977355-150977555,GATGATGCGACCAGCTTTGTCAGCACGCTTGGGGAGCATACACACC...,
...,...,...,...,...
890,endog,chr22:45553841-45554041,GCTTCCCTCTGGCAAAACAAATTCCATCCCTGGTTAGTACCCGCCA...,
892,endog,chr17:40909375-40909575,CAGGACTTGTATGGTTGATCTTTGTCCAACCCGAGCCTTGGATCAG...,
894,endog,chr10:100111785-100111985,TGAGCCACCACGCCCGGCCAATCCCCTGTGTTTTTCCTTAAAGAAT...,
896,endog,chr6:127642085-127642285,AGCCTTCCTACACTGCCAGGATGCCAGGGTATTCTCTTTGGGACCT...,
