# concatenate nullomers into concatemers. 
- make matching kmer concatemers using bowtie. 

In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction
import config_readwrite as crw

import glob
import gzip
import numpy as np
import os, sys
import pandas as pd
import subprocess as sp

In [2]:
CL, MER, NMUTS, BUILD, FO_ONLY = "common", "15", "2", "hs1", True
config, cfn = crw.read(os.path.join(os.path.dirname(os.getcwd()), "config.ini"))

In [3]:
# number of sequences per construct
NSEQS = 13

TSV_MUT= f'/wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.{MER}mers.2mut.nulls.fo.pam.tsv'
TSV_MUTCONCAT= f'/wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.{MER}mers.2mut.nulls.fo.pam.CONCAT-{NSEQS}.tsv'
seed = config["seed"][f'{MER}mer_morethan1bprelated']

section = f"{MER}.firstorder.pam"
crw.check(config,section)

config[section]["tsv"] = TSV_MUT
config[section][f"concat-{NSEQS}"] = TSV_MUTCONCAT

crw.write(config, cfn)

# load data

In [4]:
df = pd.read_csv(seed, header=None,  names=[f"{MER}_pam_fo"])

df.shape

(4224, 1)

## shuffle nullomers

In [5]:
# shuffle the dataframe
df_shuffled = df.sample(frac=1).to_numpy()

## split shuffled nullomers, NSEQS per array

In [6]:
# split the array into equally sized parts w/ nseqs per part
df_splits = np.array_split(df_shuffled, int(len(df_shuffled)/NSEQS)+1)

In [7]:
len(df_splits)

325

# make sequences from shuffled, split arrays 

In [8]:
def makeSeq(splits):
    
    collection = {}
    for i, sampled_set in enumerate(splits):
    
        seq = "".join(sampled_set.ravel())
    
        # gc fraction
        gc = gc_fraction(seq)
        collection[i]=[seq, sampled_set.ravel(), gc]
    return collection

In [9]:
collection = makeSeq(df_splits)
collection[0]

['15-firstorder.1743585\tGTCACCGGTTATCGG15-firstorder.1852869\tTAACTGTTCGGCGGG15-firstorder.1492625\tGCCGATTACAACCGG15-firstorder.1856763\tTAACGCTCCTATCGG15-firstorder.1411224\tGACGATCCGTTACGG15-firstorder.1489232\tGCCGGAATATTGCGG15-firstorder.2280317\tTCGTACTAATGTCGG15-firstorder.398570\tATCGCACATCGTCGG15-firstorder.1033072\tCGCTCATTCGATCGG15-firstorder.300895\tAGCGGTAATTCGCGG15-firstorder.860359\tCGCATAGGTAATCGG15-firstorder.1670136\tGGGTCGATTAACGGG15-firstorder.437421\tATCGTCGACTCGAGG',
 array(['15-firstorder.1743585\tGTCACCGGTTATCGG',
        '15-firstorder.1852869\tTAACTGTTCGGCGGG',
        '15-firstorder.1492625\tGCCGATTACAACCGG',
        '15-firstorder.1856763\tTAACGCTCCTATCGG',
        '15-firstorder.1411224\tGACGATCCGTTACGG',
        '15-firstorder.1489232\tGCCGGAATATTGCGG',
        '15-firstorder.2280317\tTCGTACTAATGTCGG',
        '15-firstorder.398570\tATCGCACATCGTCGG',
        '15-firstorder.1033072\tCGCTCATTCGATCGG',
        '15-firstorder.300895\tAGCGGTAATTCGCGG',
       

# check if GC content is greater than 70%:

In [10]:
for i in collection.values():
    if i[2]>=0.70:
        print("whoa")
        print(i[0], i[2])
        

# Bowtie align to common MPRA tiles, make natural kmers

In [11]:
BOWTIE_PATH = "/wynton/home/ahituv/fongsl/bin/bowtie"
FA = config[CL]["fasta_hs1"]

## build index for common MPRA
- MPRA lifted over to HS1

In [12]:
FA_INDEX = FA.strip(".fa") + ".ebwt"

cmd = [os.path.join(BOWTIE_PATH, "bowtie-build"),
       
       FA,
       FA_INDEX
       
      ]
if os.path.exists(FA_INDEX) is False:
    sp.call(" ".join(cmd), shell=True)

Settings:
  Output files: "/wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.liftOver.to.hs1.ebwt.*.ebwt"
  Line rate: 6 (line is 64 bytes)
  Lines per side: 1 (side is 64 bytes)
  Offset rate: 5 (one in 32)
  FTable chars: 10
  Strings: unpacked
  Max bucket size: default
  Max bucket size, sqrt multiplier: default
  Max bucket size, len divisor: 4
  Difference-cover sample period: 1024
  Endianness: little
  Actual local endianness: little
  Sanity checking: disabled
  Assertions: disabled
  Random seed: 0
  Sizeofs: void*:8, int:4, long:8, size_t:8
Input files DNA, FASTA:
  /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.liftOver.to.hs1.fa
Reading reference sizes
  Time reading reference sizes: 00:00:00
Calculating joined length
Writing header
Reserving space for joined string
Joining reference sequences
  Time to join reference sequences: 00:00:00
bmax according to bmaxDivN setting: 2960267
Using parameters --bmax 2220201 --dcv 1024
  Doing ahead-of-time memory 

  bucket 2: 50%
  bucket 2: 60%
  bucket 2: 70%
  bucket 2: 80%
  bucket 2: 90%
  bucket 2: 100%
  Sorting block of length 735701 for bucket 2
  (Using difference cover)
  Sorting block time: 00:00:00
Returning block of 735702 for bucket 2
Getting block 3 of 7
  Reserving size (2220201) for bucket 3
  Calculating Z arrays for bucket 3
  Entering block accumulator loop for bucket 3:
  bucket 3: 10%
  bucket 3: 20%
  bucket 3: 30%
  bucket 3: 40%
  bucket 3: 50%
  bucket 3: 60%
  bucket 3: 70%
  bucket 3: 80%
  bucket 3: 90%
  bucket 3: 100%
  Sorting block of length 1663033 for bucket 3
  (Using difference cover)
  Sorting block time: 00:00:00
Returning block of 1663034 for bucket 3
Getting block 4 of 7
  Reserving size (2220201) for bucket 4
  Calculating Z arrays for bucket 4
  Entering block accumulator loop for bucket 4:
  bucket 4: 10%
  bucket 4: 20%
  bucket 4: 30%
  bucket 4: 40%
  bucket 4: 50%
  bucket 4: 60%
  bucket 4: 70%
  bucket 4: 80%
  bucket 4: 90%
  bucket 4: 100%
  S

## align seed to common mpra
-- allow for 3 gaps

In [13]:
def bowTieOneFile(file, index, nsubs):
    """
    run bowtie on 1 file w/ index file, write to out file
    
    input
        file (str) - path to file with sequences to evaluate w/ bowtie (just a text file)
        index (str) - path to index fa file (run before!)
        nsubs (int) - number of substitutions to tolerate in alignment. 
    
    method
        1. make variable to bowtie bin, outfile to write
        2. compile command w/ 
            index file, 
            file to run, 
            -v (number of substitutions to look for), 
            and where to write results
        3. launch the command 
    return 
        out (str) - path to written file
    """
    
    #1
    BOWTIE = os.path.join("/wynton/home/ahituv/fongsl/bin/bowtie", "bowtie")
    OUT = os.path.split(file)[0] + f".bowtie.v{nsubs}.tsv"
    
    #2
    cmd = [BOWTIE,
       "-x", index,
       "-r", file, 
       "-v", str(nsubs),  
       ">", OUT,
      ]
    
    #3
    print(" ".join(cmd))

    sp.call(" ".join(cmd), shell=True)
    
    return OUT

## tolerate 3 substitutions

In [14]:
nsubs = 3
out = bowTieOneFile(seed, FA_INDEX, nsubs)

/wynton/home/ahituv/fongsl/bin/bowtie/bowtie -x /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.liftOver.to.hs1.ebwt -r /wynton/home/ahituv/fongsl/dna/hs1/kmers/15mers/SEED_15mer.firstorder.pam.purine.nohomopoly.GC.morethan.1bp_related.tsv -v 3 > /wynton/home/ahituv/fongsl/dna/hs1/kmers/15mers.bowtie.v3.tsv


# reads processed: 4224
# reads with at least one alignment: 0 (0.00%)
# reads that failed to align: 4224 (100.00%)
No alignments


##### results 

- v 3
    -  reads processed: 4224
    -  reads with at least one alignment: 57784 (100.00%)
    -  reads that failed to align: 0 (0.00%)
Reported 57784 alignments

# find matched kmers

In [15]:
bowtie = pd.read_csv(out, sep='\t', header=None, 
                     names=['id', "strand", "coor", "alignPos", "seq", 
                           "ascii", "m","ref>alt" ]
                    )
bowtie["kmer"]= ""
bowtie.head()

Unnamed: 0,id,strand,coor,alignPos,seq,ascii,m,ref>alt,kmer


## make kmers by inserting ref sequence into nullomers

In [21]:
# dictionary to collect kmer match
kmer_dict = {}

for row in bowtie.iterrows():
    seq = row[1]["seq"] 
    muts = row[1]["ref>alt"].split(",")
    
    # copy the nullomer sequence
    null = seq[:]

    # mutate the nullomer sequence back to ref
    for m in muts:
        alt = m.split(">")[1]
        ref = (m.split(">")[0]).split(":")[1]
        pos = int(m.split(":")[0])
        
        # make ref seq, one base at a time. 
        seq = seq[:pos] + ref + seq[pos+1:]

    # add to dictionary
    kmer_dict[null]=seq

len(kmer_dict.keys())

57270

# make kmer - concatemer dataframe

In [23]:
def prettifySeq(original, mut):
    """
    prettify sequence. 
        All matching bases will be written as "."
        All non matching bases will be written w mutated base identity. 
    """
    prettyseq = ""
    for o, m in zip(original, mut):
        if o != m:
            prettyseq += m
        else:
            prettyseq += "."

    return prettyseq

In [24]:
results = {}
for n, i in enumerate(collection.values()):
    
    concate_null, array, gc_null = i[0], i[1], i[2]
    
    # build kmer sequence given nullomer sequence order. 
    seq = ""
    
    for null in array.tolist():
        
        # consider both the forward and reverse complement. 
        rev = Seq(null).reverse_complement()
        
        if null in kmer_dict:
            seq+=kmer_dict[null]

        elif rev in kmer_dict:  # if the reverse complement is in the dictionary
            seq += Seq(kmer_dict[rev]).reverse_complement() # reverse the reverse complement. 
            
        else:
            print("no kmer", null)
    
    # get kmer GC fraction
    gc_kmer = gc_fraction(seq)
    
    # add dataframe of results to results dictionary. 
    results[n] = pd.DataFrame([concate_null, str(seq), gc_null, gc_kmer, ",".join(array.tolist())]).T
    
    # print example of the kmer and nullomer sequence differences
    if n ==0:
        print(prettifySeq(seq, concate_null))


..........G..G.........T............T..G...........C.......G.A......C.......G..G.C........A...........C....G.......T.....T........T...........A......GA..............C.......G......G....G....G....


In [25]:
# combine all the dataframes
matched = pd.concat(results.values())

# name columns
matched.columns = ["nullConcat", "kmerConcat", "nullGC", "kmerGC", "null_array"]

matched.head()

Unnamed: 0,nullConcat,kmerConcat,nullGC,kmerGC,null_array
0,TATTACGGTCGCGGGGTATTCGCTAGCGGGTATCTATCGGCGCGGA...,TATTACGGTCCCGCGGTATTCGCAAGCGGGTATCTAACGCCGCGGA...,0.589744,0.579487,"TATTACGGTCGCGGG,GTATTCGCTAGCGGG,TATCTATCGGCGCG..."
0,TTGACGCGATCGAGGCGTAGAGTTACGCGGCGTATACGATACGGGT...,CTGAGGTGATCGAGGCGGAGAGTTACGAGGCGTATACCAAACGGGT...,0.579487,0.564103,"TTGACGCGATCGAGG,CGTAGAGTTACGCGG,CGTATACGATACGG..."
0,CGTCGACGATACTGGCGAACCGTAAACGGGACGCGCGATAGTTGGT...,CGTCGATGATACGGGAGAACCGGAAACGGGACGCGCGATAGTTGGT...,0.589744,0.589744,"CGTCGACGATACTGG,CGAACCGTAAACGGG,ACGCGCGATAGTTG..."
0,TATGATACCGCGCGGCGTATAGTCGGACGGACACGCGTAACGTGGC...,TATCATACCGCGCGGCATATTGTCGGACGGACACGCCTAACGGGGC...,0.6,0.584615,"TATGATACCGCGCGG,CGTATAGTCGGACGG,ACACGCGTAACGTG..."
0,AAAGTTAACGCGCGGCATATTCGACCGCGGCTCGACACGTATCGGA...,AAAGTGAAGGCGCGGCATATTCGACCCCGGCTCGCCACGGATGGGA...,0.594872,0.620513,"AAAGTTAACGCGCGG,CATATTCGACCGCGG,CTCGACACGTATCG..."


In [26]:
matched.shape

(4445, 5)

## write matched to csv

In [27]:
matched.to_csv(TSV_MUTCONCAT, sep='\t', index=False)

In [28]:
TSV_MUTCONCAT

'/wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.15mers.2mut.nulls.fo.pam.CONCAT-13.tsv'

# other tools, analyses

In [132]:
def bowTieOneSeq(seq):
    
    cmd = [os.path.join(BOWTIE_PATH, "bowtie"),
       "-x", FA_INDEX,
       "-c", seq, 
       "-v 2", 
       ">", out,
      ]
    print(" ".join(cmd))
    
    if os.path.exists(FA_INDEX) is False:
        sp.call(" ".join(cmd), shell=True)

In [224]:
i = 'TCGCTAGTCGCGTGG'
o = "CCACGCGACCAGCGA"
prettifySeq(i, o)

prettifySeq(Seq(i).reverse_complement(), o)

'.........C.....'