# concatenate nullomers into concatemers. 
- make matching kmer concatemers using bowtie. 

In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction
import config_readwrite as crw

import glob
import gzip
import numpy as np
import os, sys
import pandas as pd
import subprocess as sp

In [2]:
CL, MER, NMUTS, BUILD, FO_ONLY = "common", "15", "2", "hs1", True
config, cfn = crw.read(os.path.join(os.path.dirname(os.getcwd()), "config.ini"))

In [3]:
# number of sequences per construct
NSEQS = 13

TSV_MUT= f'/wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.{MER}mers.2mut.nulls.fo.pam.tsv'
TSV_MUTCONCAT= f'/wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.{MER}mers.2mut.nulls.fo.pam.CONCAT-{NSEQS}.tsv'
seed = config["seed"][f'{MER}mer_morethan1bprelated']

section = f"{MER}.firstorder.pam"
crw.check(config,section)

config[section]["tsv"] = TSV_MUT
config[section][f"concat-{NSEQS}"] = TSV_MUTCONCAT

crw.write(config, cfn)

# load data

In [23]:
df = pd.read_csv(seed, sep='\t',  header=None,  names=[f"{MER}_pam_fo"])

df.shape

(4224, 1)

## shuffle nullomers

In [24]:
# shuffle the dataframe
df_shuffled = df.sample(frac=1).to_numpy()

In [25]:
df_shuffled

array([['TATTCGCTCAACCGG'],
       ['AACGAATGTCGACGG'],
       ['TAACGTCGAGTGCGG'],
       ...,
       ['CGCGACATACAACGG'],
       ['CCTTCGCGTAAACGG'],
       ['TCGGTACGAAGTCGG']], dtype=object)

## split shuffled nullomers, NSEQS per array

In [26]:
# split the array into equally sized parts w/ nseqs per part
df_splits = np.array_split(df_shuffled, int(len(df_shuffled)/NSEQS)+1)

In [27]:
len(df_splits)

325

# make sequences from shuffled, split arrays 

In [28]:
def makeSeq(splits):
    
    collection = {}
    for i, sampled_set in enumerate(splits):
    
        seq = "".join(sampled_set.ravel())
    
        # gc fraction
        gc = gc_fraction(seq)
        collection[i]=[seq, sampled_set.ravel(), gc]
    return collection

In [29]:
collection = makeSeq(df_splits)
collection[0]

['TATTCGCTCAACCGGAACGAATGTCGACGGTAACGTCGAGTGCGGTAAATGACGACGCGGGGCGATAACCTACGGTCTTCGACGCGTAGGGTTAATAGTCGCGGGCGAGCGAACAATCGGACTATCGTGAACCGGCGTTGTGCACAACGGAGCGTATACCGTCGGCGCAAATTCGACCGGCGACGAATGGTACGG',
 array(['TATTCGCTCAACCGG', 'AACGAATGTCGACGG', 'TAACGTCGAGTGCGG',
        'TAAATGACGACGCGG', 'GGCGATAACCTACGG', 'TCTTCGACGCGTAGG',
        'GTTAATAGTCGCGGG', 'CGAGCGAACAATCGG', 'ACTATCGTGAACCGG',
        'CGTTGTGCACAACGG', 'AGCGTATACCGTCGG', 'CGCAAATTCGACCGG',
        'CGACGAATGGTACGG'], dtype=object),
 0.5743589743589743]

# check if GC content is greater than 70%:

In [30]:
for i in collection.values():
    if i[2]>=0.70:
        print("whoa")
        print(i[0], i[2])
        

# make kmer - concatemer dataframe

In [31]:
results = {}
for n, i in enumerate(collection.values()):
    
    concate_null, array, gc_null = i[0], i[1], i[2]

    
    for null in array.tolist():
        
        # consider both the forward and reverse complement. 
        rev = Seq(null).reverse_complement()

    # add dataframe of results to results dictionary. 
    results[n] = pd.DataFrame([concate_null, gc_null, ",".join(array.tolist())]).T



In [32]:
# combine all the dataframes
matched = pd.concat(results.values())

In [33]:
# name columns
matched.columns = ["nullConcat","nullGC", "null_array"]

In [34]:
matched.head()

Unnamed: 0,nullConcat,nullGC,null_array
0,TATTCGCTCAACCGGAACGAATGTCGACGGTAACGTCGAGTGCGGT...,0.574359,"TATTCGCTCAACCGG,AACGAATGTCGACGG,TAACGTCGAGTGCG..."
0,TTCGAACGCAAGCGGTAATTACGTCGACGGGTTGCGAGTAGTCGGC...,0.564103,"TTCGAACGCAAGCGG,TAATTACGTCGACGG,GTTGCGAGTAGTCG..."
0,CCATCGATCGAACGGCCGGTATTGGATCGGCTAGATCGACGGAGGC...,0.584615,"CCATCGATCGAACGG,CCGGTATTGGATCGG,CTAGATCGACGGAG..."
0,ACCGCTATATTGCGGTTATTGCGCGAGCGGTATCGACGAAGGCGGG...,0.589744,"ACCGCTATATTGCGG,TTATTGCGCGAGCGG,TATCGACGAAGGCG..."
0,AAGCGCGTCTATCGGCCTTAGTTCGGACGGTTCGATACGGGTCGGC...,0.584615,"AAGCGCGTCTATCGG,CCTTAGTTCGGACGG,TTCGATACGGGTCG..."


In [35]:
matched.shape

(325, 3)

## write matched to csv

In [36]:
matched.to_csv(TSV_MUTCONCAT, sep='\t', index=False)

TSV_MUTCONCAT

'/wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.15mers.2mut.nulls.fo.pam.CONCAT-13.tsv'