In [79]:
from Bio.Align import PairwiseAligner
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.SeqUtils import gc_fraction
import config_readwrite as crw
from collections import Counter
import gzip
from itertools import product, combinations

import numpy as np
import pandas as pd
import os, sys
import matplotlib.pyplot as plt
import seaborn as sns
import plot_params as pp
pp.fonts()

import subprocess as sp

# NMER: nullomer length; it is interpolated into the config keys (e.g. f"{NMER}mer_fo")
#       and into the FASTA record names written below.
# MIN_HOMOPOLYMER_LEN: shortest single-base run handed to makeHomopolymer(); sequences
#       containing a run of at least this length are filtered out.
NMER, MIN_HOMOPOLYMER_LEN = 15, 4

# functions

    PAM = "NGG"
    A|G in pos 19 or 20 (position 11,12 of nullomer)
    no homopolymer
    gc content between 40-60%


In [20]:
# Load the project config that sits in the current working directory.
config_tag = "config"
config, cfn = crw.read_config(os.path.join(os.getcwd(), config_tag))

# Section handles, so each path below is a single lookup.
nullomer_section = config["nullomers"]
seed_section = config["seed"]

# First-order nullomers (input): raw TSV and its FASTA conversion.
NULLS_TSV = nullomer_section[f"{NMER}mer_fo"]
NULLS_FA = nullomer_section[f"{NMER}mer_fo_fa"]

# Nullomers that pass the PAM / purine / homopolymer / GC filters.
SEED_FA = seed_section[f"{NMER}mer_fa"]
SEED_TSV = seed_section[f"{NMER}mer_tsv"]

# Filtered nullomers split by relatedness: TSV outputs ...
SEED_1bp = seed_section[f"{NMER}mer_1bprelated"]
SEED_morethan1bp = seed_section[f"{NMER}mer_morethan1bprelated"]

# ... and the matching FASTA outputs.
SEED_1bp_fa = seed_section[f"{NMER}mer_1bprelated_fa"]
SEED_morethan1bp_fa = seed_section[f"{NMER}mer_morethan1bprelated_fa"]

# Results location for the seed step.
RE = seed_section["results"]

# convert to fasta

In [3]:
# Convert the nullomer TSV to FASTA once; skipped when the FASTA already exists.
if not os.path.exists(NULLS_FA):
    # The reader is opened first, so a missing/unreadable TSV raises before an
    # empty FASTA is created (an empty FASTA would satisfy the guard above on
    # every later run). Both handles are closed by `with`, even on error.
    with open(NULLS_TSV, "r") as reader, open(NULLS_FA, "w") as writer:
        for n, line in enumerate(reader):
            # `line` is written unchanged (trailing newline and any extra
            # tab-separated columns included); the FASTA-reading cell strips
            # everything after the first tab.
            writer.write(f">{NMER}-firstorder.{n}\n{line}")

# read the fasta

In [4]:
# Build {record title: sequence} from the nullomer FASTA.
seqs = {}
with open(NULLS_FA, "r") as reader:
    # SimpleFastaParser yields (title, sequence) tuples; unpack them directly.
    for seq_id, seq in SimpleFastaParser(reader):
        # keep only the text before the first tab (drops extra TSV columns)
        seqs[seq_id] = seq.split("\t")[0]
# NOTE: `seq_id` and `seq` stay bound to the last (unsplit) record after the loop.

In [5]:
# peek at the first three (id, sequence) pairs
list(seqs.items())[:3]

[('15-firstorder.0', 'AAAACGCGTCGGCGA'),
 ('15-firstorder.1', 'AAAATCGTCGGACGT'),
 ('15-firstorder.2', 'AAAATCGCGCTTCGA')]

# find the pam sites

In [6]:
def assessPAM(seq):
    """Return True when the last two items of `seq` spell "GG" (the GG of an NGG PAM).

    input
        seq (str or sequence of single-character strings)

    return
        (bool) - True if seq ends in "GG", otherwise False
    """
    last_two = "".join(seq[-2:])
    return last_two == "GG"

In [7]:
def assessPur(seq):
    """Return True when a purine (A or G) sits 4th or 5th from the end of `seq`.

    Assumes the PAM (NGG) occupies seq[-3:], so the two bases checked are the
    ones immediately upstream of the PAM.

    input
        seq (str) - sequence, at least 4 long (5 if seq[-4] is not a purine)

    return
        (bool) - True if seq[-4] or seq[-5] is "A" or "G"
    """
    purines = ("A", "G")

    # seq[-4] is tested first; seq[-5] is only read when seq[-4] is not a purine
    return seq[-4] in purines or seq[-5] in purines

In [8]:
def makeHomopolymer(seq_len, min_homopolymer_len):
    """List every single-base run of A, C, G and T to screen sequences against.

    input
        seq_len (int) - longest run to generate (inclusive)
        min_homopolymer_len (int) - shortest run to generate (inclusive)

    return
        (list of str) - runs grouped by base in A, C, G, T order, each group
                        ordered from shortest to longest
    """
    # run lengths from the minimum up to and including the full sequence length
    run_lengths = np.arange(min_homopolymer_len, seq_len + 1)

    return [base * run_len for base in "ACGT" for run_len in run_lengths]

In [9]:
def assessNoHomopolymers(seq, homoploymer_list):
    """Return True when none of the given homopolymers occurs in `seq`.

    input
        seq (str) - sequence to screen
        homoploymer_list (list of str) - runs to look for, e.g. from makeHomopolymer()

    return
        (bool) - False if any listed run is a substring of seq, otherwise True
    """
    return not any(run in seq for run in homoploymer_list)

In [10]:
def assessGcContent(seq):
    """Return True when the GC fraction of `seq` lies between 0.40 and 0.60, inclusive.

    input
        seq (str) - sequence passed to gc_fraction()

    return
        (bool) - True if 0.40 <= gc fraction <= 0.60, otherwise False
    """
    gc = gc_fraction(seq)

    # both bounds are inclusive
    return 0.40 <= gc <= 0.60

In [11]:
def writeDictToFa(result_dict, outfile):
    """Write a {record id: sequence} dictionary to `outfile` in FASTA format.

    input
        result_dict (dict) - keys become ">key" header lines, values the sequence lines
        outfile (str) - path to write; an existing file is overwritten

    Records are written in the dictionary's iteration order.
    """
    records = (f">{key}\n{value}\n" for key, value in result_dict.items())

    # the context manager closes the file; no explicit close() is needed
    with open(outfile, "w") as writer:
        writer.writelines(records)

# main

In [21]:
"""
filter nullomers

input 
    NMER (int) - length of the nullomer sequences
    MIN_HOMOPOLYMER_LEN (int) - the minimum homopolymer length to be excluded

method
    1. make list of homopolymers to look out for, based on MIN_HOMOPOLYMER_LEN
    2. parse through first order sequences
        2.1 count the number of sequences that 
            meet the pam, purine, nohomopolymer or gc content rules
    3. Pam? 
    4. purine in pos 19 or 20?
    5. no homopolymers? 
    6. meets gc requirements (gc content >=40%, <= 60%)
    7. if all requirements met, add to list
    8. report the number of sequences that pass each requirement. 
    9. make a dataframe of the sequences that meet requirements. 
    10. sort sequences, write sequences that meet requirements to file
    11. make dictionary and write fa file of sequences that meet requirements. 

    If SEED_TSV already exists, the filtered sequences are read back from it
    instead, and `candidates` is rebuilt from the file so later cells can use it.

Note: the filters are nested, so each count is the number of sequences that
passed that filter AND every filter before it.
"""
if not os.path.exists(SEED_TSV):
    #1 make once - homopolymer list to reference against.
    # The longest possible run is the nullomer length itself, so use NMER
    # rather than a loop variable left over from an earlier cell.
    homopolymer_list = makeHomopolymer(NMER, MIN_HOMOPOLYMER_LEN)

    #2 parse through candidate sequences
    candidates = {}
    
    #2.1
    pam, pur, nohomo, gc_ = 0, 0, 0, 0
    
    for seq_id, seq in seqs.items():
        
        #3
        if assessPAM(seq) is True:  # seq contains PAM? 
            pam += 1
            
            #4
            if assessPur(seq) is True:
                pur += 1
                
                #5
                if assessNoHomopolymers(seq, homopolymer_list) is True:
                    nohomo += 1
                    
                    #6
                    if assessGcContent(seq) is True:
                        gc_ += 1
                        
                        #7
                        candidates[seq_id] = seq
    #8
    print(list(candidates.items())[:3], "npam",
          pam, "n pur", pur, 
          "n no homopolymer", nohomo, 
          "n gc<=60, >=40", gc_)

    #9 make into a dataframe to sort based on str similarity.
    # Naming the columns up front also keeps an empty result well-formed.
    df = pd.DataFrame(list(candidates.items()), columns=["id", "seq"])

    #10 sort sequences
    df = df.sort_values(by="seq").reset_index(drop=True)
    df.to_csv(SEED_TSV, sep='\t', index=False)
    
    #11 make dictionary of sorted sequences
    sorted_candidates = dict(zip(df["id"], df["seq"]))

    # write dictionary to fa
    writeDictToFa(sorted_candidates, SEED_FA)
else:
    df = pd.read_csv(SEED_TSV, sep='\t')

    # rebuild the candidate dictionary from the cached file so downstream
    # cells do not fail with a NameError on a fresh kernel
    candidates = dict(zip(df["id"], df["seq"]))

df = df[["id", "seq"]]
df.head()

[('15-firstorder.701', 'AAACCGATCGTGCGG'), ('15-firstorder.743', 'AAACCGCGCTTACGG'), ('15-firstorder.790', 'AAACGATTCGAGCGG')] npam 128910 n pur 110649 n no homopolymer 107449 n gc<=60, >=40 60682


# Pairwise alignment to remove identical sequences. 

In [63]:
def PairwiseAlign(seq1, seq2):
    """Score the pairwise alignment of two sequences, ignoring their last 3 bases.

    input
        seq1, seq2 (str) - sequences whose final 3 characters are the PAM site

    return
        alignment score of seq1[:-3] against seq2[:-3] from a default
        PairwiseAligner

    NOTE(review): the score comes from PairwiseAligner's default scoring
    scheme, which allows gaps; callers treat it as "number of matching bases".
    Confirm that this is the intended notion of similarity (it is not a
    position-by-position mismatch count).
    """
    # The aligner does not depend on the inputs, so build it once and reuse it
    # instead of constructing a new one on every call.
    aligner = getattr(PairwiseAlign, "_aligner", None)
    if aligner is None:
        aligner = PairwiseAlign._aligner = PairwiseAligner()

    # .score() returns the same optimal score as .align(...).score without
    # creating the alignments object.
    return aligner.score(seq1[:-3], seq2[:-3])  #[:-3] ignores the pam site

# find alignments 1 bp away
- randomly pick one alignment to keep, one to toss. 

In [None]:
""" 
find onebpaway_matchsize for nullomer of length NMER
    - 3 bases, to ignore for pam site  
    - 1 base, reflects 1bp difference between matching sequence 
"""
onebpaway_matchsize = NMER - (3 + 1)

# fixed seed so the shuffle and the keep/toss choices are reproducible
ONE_BP_RANDOM_SEED = 0

"""
pairwise align the sequences, looking for matches.

input 
    df["seq"] (column of filtered nullomers) - 
            nullomers filtered for first order, PAM, PUR 
            in pos 19|20, GC content, no homopolymers
            
    SEED_1bp_fa (str) - file to write all the nullomers 1bp 
            away in distance, ignoring pam. 
    
method
    1. if SEED_1bp_fa does not exist, 
        perform pairwise alignments to find sequences that 
        match by 1 bp. Otherwise reload the removed sequences from SEED_1bp_fa.
    2. make a list of the nullomer sequences.
        2.1 shuffle the list for randomness (seeded). 
    3. make one_bp (set), to collect all sequences related by 1bp
    4. per sequence (s1) in the list
        4.1 skip s1 if it has already been marked for removal
        4.2 test pairwise alignment against every sequence after s1 in the list,
            so each pair is scored once
        4.3 skip a second sequence (s2) that has already been marked for
            removal, or that is identical with s1
    5. pairwise align, score how many bases match for two aligned sequences, ignoring pam site
    6. count the number of times that score appears
    7. test if alignment score matches all bases but one base (or more)
        7.1 if so, randomly choose one sequence to remove from pool;
            when s1 is the one removed, stop comparing it to other sequences

return
    one_bp (set) - all sequences that are related by one basepair to remove. 
                    Note, this is not all the 1bp away sequences because some
                    are randomly chosen to remain in the pool
    counter (dictionary) - counter of how many times pairs of sequences had a matching score. 
                    Empty when one_bp is reloaded from SEED_1bp_fa.
        

"""
#1 slow step - the original, two-pass version took about 13 minutes to run
if not os.path.exists(SEED_1bp_fa):

    rng = np.random.default_rng(ONE_BP_RANDOM_SEED)

    #2 df is defined whether or not the filter step was read from cache
    pool = df["seq"].tolist()

    #2.1 shuffle, for diversity sake
    rng.shuffle(pool)
    n_pool = len(pool)
    
    #3
    one_bp, counter = set(), Counter()
    
    #4
    for i, s1 in enumerate(pool):

        # 4.1
        if s1 in one_bp:
            continue

        #4.2 only look forward: pairs with earlier sequences were already scored
        for j in range(i + 1, n_pool):
            s2 = pool[j]

            #4.3
            if s2 in one_bp or s2 == s1:
                continue

            #5 
            score = PairwiseAlign(s1, s2)

            #6 count score distribution
            counter[score] += 1

            #7 if the alignment score is at least the onebpaway matchsize
            if score >= onebpaway_matchsize:

                #7.1 randomly add one of the sequences to be removed.
                # Keep the other in the pool.
                if rng.random() < 0.5:
                    one_bp.add(s1)
                    # s1 has left the pool; it must not eliminate anything else
                    break
                one_bp.add(s2)

else:
    # cached run: recover the removed sequences from the FASTA written earlier
    with open(SEED_1bp_fa, "r") as reader:
        one_bp = {removed_seq for _, removed_seq in SimpleFastaParser(reader)}

    # the score distribution is not cached
    counter = Counter()

# last top-level expression, so the notebook displays it
len(one_bp)

In [None]:
# alignment score -> number of sequence pairs that produced that score
counter

# save results

In [30]:
def writeTsvFa(df, out_tsv, out_fa):
    """Sort sequences and write them as both a TSV and a FASTA file.

    input
        df (pd.DataFrame) - either has "id" and "seq" columns, or has unnamed
            columns in the order (idx, id, seq) or (id, seq)
        out_tsv (str) - path for the tab-separated id/seq table
        out_fa (str) - path for the FASTA file

    raises
        ValueError - if "seq" is not a column and df has neither 2 nor 3 columns

    The caller's dataframe is never modified.
    """
    print(df.shape)

    # rename columns
    if "seq" not in df.columns:
        n_cols = df.shape[1]
        if n_cols not in (2, 3):
            raise ValueError(
                f"expected 2 (id, seq) or 3 (idx, id, seq) columns, got {n_cols}"
            )
        # rename on a copy so the frame passed in keeps its own labels
        df = df.copy()
        df.columns = ["idx", "id", "seq"][-n_cols:]
    
    # sort by seq
    df = df.sort_values(by="seq")
    
    #save id, seq to tsv
    df[["id", "seq"]].to_csv(out_tsv, sep='\t', index=False)

    # make dictionary of candidates
    sorted_candidates = dict(zip(df["id"], df["seq"]))

    # write dictionary to fa
    writeDictToFa(sorted_candidates, out_fa)
    

## write sequences related by 1 bp

In [33]:
# Sequences marked for removal (related to another sequence by 1 bp).
if not os.path.exists(SEED_1bp):
    # df carries named columns ("id", "seq"), so select the sequence column
    # by name; a positional label such as df[1] raises KeyError here.
    related = df.loc[df["seq"].isin(one_bp)]
    out_tsv, out_fa = SEED_1bp, SEED_1bp_fa
    writeTsvFa(related, out_tsv, out_fa)

else:
    related = pd.read_csv(SEED_1bp, sep='\t')

## write sequence related by more than 1 bp

In [46]:
# Sequences that stay in the pool (not marked as 1 bp-related).
if not os.path.exists(SEED_morethan1bp):
    # df carries named columns ("id", "seq"), so select the sequence column
    # by name; a positional label such as df[1] raises KeyError here.
    less_related = df.loc[~df["seq"].isin(one_bp)]

    out_tsv, out_fa = SEED_morethan1bp, SEED_morethan1bp_fa

    # write tsv, fa
    writeTsvFa(less_related, out_tsv, out_fa)

else:
    less_related = pd.read_csv(SEED_morethan1bp, sep='\t')

#view
less_related

Unnamed: 0,id,seq
0,15-firstorder.2100,AAACAACGATCGCGG
1,15-firstorder.1087,AAACATCGTACGCGG
2,15-firstorder.3439,AAACCCGACGTACGG
3,15-firstorder.3182,AAACCGCGCGATAGG
4,15-firstorder.981,AAACCGTCGATGCGG
...,...,...
4218,15-firstorder.2479500,TTTGACGAATCGCGG
4219,15-firstorder.2479390,TTTGCGGTCGTACGG
4220,15-firstorder.2479145,TTTGGCGATCGACGG
4221,15-firstorder.2479018,TTTGGCGCGAATCGG
