In [1]:
import os
import re
import subprocess
import tempfile
import pybedtools
import argparse

from collections import defaultdict
from itertools import groupby
from Bio import SeqIO

class Args():
    def __init__(self):
        self.input_prots = 'Scler_orf_prediction/Scler_prot_pred'
        self.input_dna = 'Scler_orf_prediction/Scler_dna_pred'
        self.input_genome = 'data/Sclerotinia_sclerotiorum.ASM14694v1.dna.toplevel.fa'
        self.min_orf = 300
        self.output = 'output'
        self.interval_length = 300

args = Args()

fasta_prot_dict = {}
d = defaultdict(list)

for record in SeqIO.parse(args.input_prots, 'fasta'):
    s = record.id.split('_')
    s1 = re.split(' |\[|\]|_', record.description)
    fasta_prot_dict[f'{s[0]}_ORF{s[1]}'] = str(record.seq)
    d[s1[0]].append((s1[3], s1[5], 'ORF'+s1[1]))

fasta_dna_dict = {}

for record in SeqIO.parse(args.input_dna, 'fasta'):
    s = record.id.split('_')
    s1 = re.split(' |\[|\]|_', record.description)
    fasta_dna_dict[f'{s[0]}_ORF{s[1]}'] = str(record.seq)

In [201]:
def find_intersection(chr_name, list_of_orfs, start_const, end_const, orf_const, min_orf):
    '''
    Find frameshifts intersections between 2 ORF.
    They should not be divisible by 3.

    Keyword arguments:
        val -- list of ORFs of one chromosome.
        start -- first coordinate.
        finish -- second coordinate.
        min_orf -- ORF length in bases.

    return:
        <generator begin, end, orf>

        begin -- First coordinate of intersected ORF.
        end -- Second coordinate of intersected ORF.
        orf -- Intersected ORF name.
    '''
    
    start_const, end_const = int(start_const)-1, int(end_const)
    diff_const = end_const - start_const

    for start_current, end_current, orf_current in list_of_orfs:
        start_current, end_current = int(start_current)-1, int(end_current)
        diff_current = end_current - start_current
        
        if all([abs(diff_current)>args.min_orf, abs(diff_const)>args.min_orf]):
            su, eu, so, eo = f"su:{start_current}", f"eu:{end_current}", f"so:{start_const}", f"eo:{end_const}"
            
            if all([diff_current>0, diff_const>0, abs(start_current-start_const)%3!=0,
                    start_current<end_const, start_const<end_current]):
                
                strand_const, strand_current = '5\'->3\'', '5\'->3\''
                
                if (start_current<start_const and end_current>end_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {su:-<25}-{eu:->25} 3\'')
                    print(f'+ current_orf = global_orf:    5\' {su:-<12}>{so:-^12}>{eo:-^12}>{eu:->12} 3\'')
                    print(f'OUTSIDE:{orf_current}\nINSIDE:{orf_const}')
                    print(f'intersection: 5\' {so:-<12}{eo:->12} 3\'')
                    print('===')
                    
                    glob_orf = (start_current, end_current)
                    intersection = (start_const, end_const)
                    position_const, position_current = 'INSIDE', 'OUTSIDE'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
                elif (start_current>start_const and end_current<end_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {so:-<25}-{eo:->25} 3\'')
                    print(f'+ const_orf = global_orf:      5\' {so:-<12}>{su:-^12}>{eu:-^12}>{eo:->12} 3\'')
                    print(f'OUTSIDE:{orf_const}\nINSIDE:{orf_current}')
                    print(f'intersection: 5\' {su:-<12}{eu:->12} 3\'')
                    print('===')
                    
                    glob_orf = (start_const, end_const)
                    intersection = (start_current, end_current)
                    position_const, position_current = 'OUTSIDE', 'INSIDE'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
                elif (start_current>start_const and end_current>end_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {so:-<25}-{eu:->25} 3\'')
                    print(f'+ const_orf = first_orf:       5\' {so:-<12}>{su:-^12}>{eo:->12}>{eu:->12} 3\'')
                    print(f'LEFT_INTERSECTING:{orf_current}\nRIGHT_INTERSECTING:{orf_const}')
                    print(f'intersection: 5\' {su:-<12}{eo:->12} 3\'')
                    print('===')
                    
                    glob_orf = (start_const, end_current)
                    intersection = (start_current, end_const)
                    position_const, position_current = 'RIGHT_INTERSECTING', 'LEFT_INTERSECTING'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
                elif (start_current<start_const and end_current<end_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {su:-<25}-{eo:->25} 3\'')
                    print(f'+ current_orf = first_orf:     5\' {su:-<12}>{so:-^12}>{eu:-^12}>{eo:->12} 3\'')
                    print(f'LEFT_INTERSECTING:{orf_const}\nRIGHT_INTERSECTING:{orf_current}')
                    print(f'intersection: 5\' {so:-<12}{eu:->12} 3\'')
                    print('===') 
                    
                    glob_orf = (start_current, end_const)
                    intersection = (start_const, end_current)
                    position_const, position_current = 'LEFT_INTERSECTING', 'RIGHT_INTERSECTING'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
            elif all([diff_current<0, diff_const>0, abs(end_current-start_const)%3!=0,
                     start_current>start_const, end_current<end_const]):
                
                strand_const, strand_current = '5\'->3\'', '3\'->5\''
                start_current, end_current = start_current+1, end_current-1
                    
                if (start_current>end_const and end_current<start_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {so:-<12}-{su:->12} 3\'')
                    print(f'+ current_orf = first_orf:     5\' {so:-<12}>{eo:-<12} 3\'')
                    print(f'-                              3\' {eu:->12}<{su:->12} 5\'')
                    print(f'LEFT_INTERSECTING:{orf_current}\nRIGHT_INTERSECTING:{orf_const}')
                    print(f'intersection: 5\' {eu:-<12}{eo:->12} 3\'')
                    print('===')
                    
                    glob_orf = (start_const, start_current)
                    intersection = (end_current, end_const)
                    position_const, position_current = 'RIGHT_INTERSECTING', 'LEFT_INTERSECTING'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
                elif (start_current<end_const and end_current<start_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {eu:-<12}-{eo:->12} 3\'')
                    print(f'+ const_orf = first_orf:       5\' {so:->12}>{eo:->12} 3\'')
                    print(f'-                              3\' {eu:-<12}<{su:-<12} 5\'')
                    print(f'LEFT_INTERSECTING:{orf_const}\nRIGHT_INTERSECTING:{orf_current}')
                    print(f'intersection: 5\' {so:-<12}{su:->12} 3\'')
                    print('===')
                    
                    glob_orf = (end_current, end_const)
                    intersection = (start_const, start_current)
                    position_const, position_current = 'LEFT_INTERSECTING', 'RIGHT_INTERSECTING'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
                elif (start_current<end_const and end_current>start_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {so:-<12}-{eo:->12} 3\'')
                    print(f'+ const_orf = global_orf:      5\' {so:-<12}>{eo:->12} 3\'')
                    print(f'-                              3\' {eu:->12}<{su:-<12} 5\'')
                    print(f'OUTSIDE:{orf_const}\nINSIDE:{orf_current}')
                    print(f'intersection: 5\' {eu:-<12}{su:->12} 3\'')
                    print('===')
                    
                    glob_orf = (start_const, end_const)
                    intersection = (end_current, start_current)
                    position_const, position_current = 'OUTSIDE', 'INSIDE'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
                elif (start_current>end_const and end_current<start_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {eu:-<12}-{su:->12} 3\'')
                    print(f'+ const_orf = global_orf:      5\' {so:->12}>{eo:-<12} 3\'')
                    print(f'-                              3\' {eu:-<12}<{su:->12} 5\'')
                    print(f'OUTSIDE:{orf_current}\nINSIDE:{orf_const}')
                    print(f'intersection: 5\' {so:-<12}{eo:->12} 3\'')
                    print('===')
                    
                    glob_orf = (end_current, start_current)
                    intersection = (start_const, end_const)
                    position_const, position_current = 'INSIDE', 'OUTSIDE'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current

            elif all([diff_current>0, diff_const<0, abs(start_current-end_const)%3!=0,
                     start_current<start_const, end_current>end_const]):
                
                strand_const, strand_current = '3\'->5\'', '5\'->3\''
                start_const, end_const = start_const+1, end_const-1
                
                if (start_current>end_const and end_current>start_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {eo:-<12}-{eu:->12} 3\'')
                    print(f'+ const_orf = first_orf:       3\' {eo:-<12}<{so:-<12} 5\'')
                    print(f'-                              5\' {su:->12}>{eu:->12} 3\'')
                    print(f'LEFT_INTERSECTING:{orf_current}\nRIGHT_INTERSECTING:{orf_const}')
                    print(f'intersection: 5\' {su:-<12}{so:->12} 3\'')
                    print('===')
                    
                    glob_orf = (end_const, end_current)
                    intersection = (start_current, start_const)
                    position_const, position_current = 'RIGHT_INTERSECTING', 'LEFT_INTERSECTING'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
                elif (start_current<end_const and end_current<start_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {su:-<12}-{so:->12} 3\'')
                    print(f'+ current_orf = first_orf:     3\' {eo:->12}<{so:->12} 5\'')
                    print(f'-                              5\' {su:-<12}>{eu:-<12} 3\'')
                    print(f'LEFT_INTERSECTING:{orf_const}\nRIGHT_INTERSECTING:{orf_current}')
                    print(f'intersection: 5\' {eo:-<12}{eu:->12} 3\'')
                    print('===')
                    
                    glob_orf = (start_current, start_const)
                    intersection = (end_const, end_current)
                    position_const, position_current = 'LEFT_INTERSECTING', 'RIGHT_INTERSECTING'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
                elif (start_current>end_const and end_current<start_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {eo:-<12}-{so:->12} 3\'')
                    print(f'+ const_orf = global_orf:      3\' {eo:-<12}<{so:->12} 5\'')
                    print(f'-                              5\' {su:->12}>{eu:-<12} 3\'')
                    print(f'OUTSIDE:{orf_const}\nINSIDE:{orf_current}')
                    print(f'intersection: 5\' {su:-<12}{eu:->12} 3\'')
                    print('===')
                    
                    glob_orf = (end_const, start_const)
                    intersection = (start_current, end_current)
                    position_const, position_current = 'OUTSIDE', 'INSIDE'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
                elif (start_current<end_const and end_current>start_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {su:-<12}-{eu:->12} 3\'')
                    print(f'+ const_orf = global_orf:      3\' {eo:->12}<{so:-<12} 5\'')
                    print(f'-                              5\' {su:-<12}>{eu:->12} 3\'')
                    print(f'OUTSIDE:{orf_current}\nINSIDE:{orf_const}')
                    print(f'intersection: 5\' {eo:-<12}{so:->12} 3\'')
                    print('===')
                    
                    glob_orf = (start_current, end_current)
                    intersection = (end_const, start_const)
                    position_const, position_current = 'INSIDE', 'OUTSIDE'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
            elif all([diff_current<0, diff_const<0, abs(end_current-end_const)%3!=0,
                     start_current>end_const, end_current<start_const]):
                
                strand_const, strand_current = '3\'->5\'', '3\'->5\''
                start_const, end_const, start_current, end_current =\
                start_const+1, end_const-1, start_current+1, end_current-1
                
                if (start_current>start_const and end_current>end_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {eo:-<25}-{su:->25} 3\'')
                    print(f'- const_orf = first_orf:       3\' {eo:-<12}<{eu:-^12}<{so:-^12}<{su:->12} 5\'')
                    print(f'LEFT_INTERSECTING:{orf_current}\nRIGHT_INTERSECTING:{orf_const}')
                    print(f'intersection: 5\' {eu:-<12}{so:->12} 3\'')
                    print('===')
                    
                    glob_orf = (end_const, start_current)
                    intersection = (end_current, start_const)
                    position_const, position_current = 'RIGHT_INTERSECTING', 'LEFT_INTERSECTING'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
                elif (start_current<start_const and end_current<end_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {eu:-<25}-{so:->25} 3\'')
                    print(f'- current_orf = first_orf:     3\' {eu:-<12}<{eo:-^12}<{su:-^12}<{so:->12} 5\'')
                    print(f'LEFT_INTERSECTING:{orf_const}\nRIGHT_INTERSECTING:{orf_current}')
                    print(f'intersection: 5\' {eo:-<12}{su:->12} 3\'')
                    print('===')
                    
                    glob_orf = (end_current, start_const)
                    intersection = (end_const, start_current)
                    position_const, position_current = 'LEFT_INTERSECTING', 'RIGHT_INTERSECTING'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current
                    
                elif (start_current<start_const and end_current>end_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {eo:-<25}-{so:->25} 3\'')
                    print(f'- const_orf = first_orf:       3\' {eo:-<12}<{eu:-^12}<{su:-^12}<{so:->12} 5\'')
                    print(f'OUTSIDE:{orf_const}\nINSIDE:{orf_current}')
                    print(f'intersection: 5\' {eu:-<12}{su:->12} 3\'')
                    print('===')
                    
                    glob_orf = (end_const, start_const)
                    intersection = (end_current, start_current)
                    position_const, position_current = 'OUTSIDE', 'INSIDE'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current

                    
                elif (start_current>start_const and end_current<end_const):
                    print(f'{chr_name}: {orf_const} vs {orf_current}')
                    print(f'  global_orf:                  5\' {eu:-<25}-{su:->25} 3\'')
                    print(f'-                              3\' {eu:-<12}<{eo:-^12}<{so:-^12}<{su:->12} 5\'')
                    print(f'OUTSIDE:{orf_current}\nINSIDE:{orf_const}')
                    print(f'intersection: 5\' {eo:-<12}{so:->12} 3\'')
                    print('===')
                    
                    glob_orf = (end_current, start_current)
                    intersection = (end_const, start_const)
                    position_const, position_current = 'INSIDE', 'OUTSIDE'
                    
                    yield start_current, end_current, orf_current, glob_orf, intersection,\
                    strand_const, strand_current, position_const, position_current

In [None]:
dict_of_intersections = defaultdict(lambda: defaultdict(list))

for k, v in d.items():

    for start_const, end_const, orf_const in v:
        
        start_const, end_const = int(start_const), int(end_const)
        diff = abs(end_const - start_const) 
        for start_current, end_current, orf_current,\
        glob_orf, intersection, strand_const, strand_current,\
        position_const, position_current\
        in find_intersection(k, v, start_const, end_const, orf_const, args.min_orf):
                dict_of_intersections[k][(start_const, end_const, orf_const, strand_const)].\
                    append((start_current, end_current, orf_current, glob_orf, intersection, strand_current,\
                           position_const, position_current))

In [11]:
import tempfile
from pybedtools import bedtool
from Bio import SeqIO

In [204]:
def get_intervals(tmp, fasta, thrhld_interval):
    
    seq_dict = dict()
    with open(tmp.name) as f:
        f = f.read()
        a = bedtool.BedTool(f, from_string=True)
        a = a.sequence(fi=fasta, name=True)
        for record in SeqIO.parse(a.seqfn, 'fasta'):
            if len(str(record.seq)) >= thrhld_interval:
                seq_dict[record.id] = str(record.seq)
            
    return seq_dict

In [205]:
tmp = tempfile.NamedTemporaryFile()

with open(tmp.name, 'w') as w:
    for k, v in dict_of_intersections.items():
        for (_, _, orf_const, _), vv in v.items():
            if len(vv) > 1:
                for i in vv:
                    start_intrsct, end_intrsct = int(i[4][0]), int(i[4][1])
                    w.write(f'{k}\t{start_intrsct}\t{end_intrsct}\t{k}:{orf_const}:{i[2]}:'\
                            f'{start_intrsct}:{end_intrsct}\n')
seq_intrsctd = get_intervals(tmp, args.input_genome, args.interval_length)

with open(tmp.name, 'w') as w:
    for k, v in dict_of_intersections.items():
        for (_, _, orf_const, _), vv in v.items():
            if len(vv) > 1:
                for i in vv:
                    start_glob, end_glob = int(i[3][0]), int(i[3][1])
    #                 print(f"{k}\t{start_glob}\t{end_glob}\t{k}:{orf_const}:{i[2]}:{start_glob}:{end_glob}\n")
                    w.write(f"{k}\t{start_glob}\t{end_glob}\t{k}:{orf_const}:{i[2]}:"\
                            f"{start_glob}:{end_glob}\n")
glob_seq = get_intervals(tmp, args.input_genome, args.interval_length)

In [273]:
for k, v in dict_of_intersections.items():
    for (start_const, end_const, orf_const, strand_const), vv in v.items():
        
        fasta_const_dna = fasta_dna_dict[f'{k}_{orf_const}']
        fasta_const_prot = fasta_prot_dict[f'{k}_{orf_const}']
        
        if len(vv)>1:

            for (start_current, end_current, orf_current,\
                 glob, intersect, strand_current, pos_const, pos_current) in vv:
                
                if orf_const == orf_current:
                    continue
                    
                start_glob, end_glob = int(glob[0]), int(glob[1])
                start_int, end_int = int(intersect[0]), int(intersect[1])
                glob_dna = glob_seq[f'{k}:{orf_const}:{orf_current}:{start_glob}:{end_glob}']
                
                try:
                    intersection_dna = seq_intrsctd[f'{k}:{orf_const}:{orf_current}:{start_int}:{end_int}']
                    int_dna_name = f'>{k}:{orf_const}:{orf_current}:{start_int}:{end_int}'
                except:
                    continue
                    
                fasta_cur_dna = fasta_dna_dict[f'{k}_{orf_current}']
                fasta_cur_prot = fasta_prot_dict[f'{k}_{orf_current}']
                
                if (strand_const == '5\'->3\'' and strand_current == '5\'->3\''):

                    Xdnao, Ydnao = start_const, end_const
                    Xdnau, Ydnau = start_current, end_current

                elif (strand_const == '5\'->3\'' and strand_current == '3\'->5\''):

                    Xdnao, Ydnao = start_const, end_const
                    Xdnau, Ydnau = end_current, start_current

                elif (strand_const == '3\'->5\'' and strand_current == '5\'->3\''):

                    Xdnao, Ydnao = end_const, start_const
                    Xdnau, Ydnau = start_current, end_current             

                elif (strand_const == '3\'->5\'' and strand_current == '3\'->5\''):

                    Xdnao, Ydnao = end_const, start_const
                    Xdnau, Ydnau = end_current, start_current
                            
                if pos_const == 'INSIDE':
                    
                    prot_const_intersected = fasta_const_prot
                    prot_cur_intersected = fasta_cur_prot[(Xdnao-Ydnau)//3:(Ydnao-Xdnau)//3]
                    
                if pos_const == 'OUTSIDE':
                    
                    prot_const_intersected = fasta_const_prot[(Xdnau-Ydnao)//3:(Ydnau-Xdnao)//3]
                    prot_cur_intersected = fasta_cur_prot
                    
                if pos_const == 'LEFT_INTERSECTING':
                     
                    prot_const_intersected = fasta_const_prot[0:(Ydnau-Xdnao)//3]
                    prot_cur_intersected = fasta_cur_prot[(Xdnao-Xdnau)//3:(Ydnau-Xdnau)//3]
                    
                if pos_const == 'RIGHT_INTERSECTING':
                            
                    prot_const_intersected = fasta_const_prot[(Xdnau-Xdnao)//3:(Ydnao-Xdnao)//3]
                    prot_cur_intersected = fasta_cur_prot[0:(Ydnao-Xdnau)//3]
                    
                    
#                 break
                print(f'>GLOBAL_ORF:{k}:{orf_const}:{orf_current}:{start_glob}:{end_glob}\n'
                      f'{glob_dna}\n'
                      f'>DNA_MAIN:{k}:{orf_const}:{start_const}:{end_const}:{strand_const}:{pos_const}\n'
                      f'{fasta_const_dna}\n'
                      f'>DNA_ALT:{k}:{orf_current}:{start_current}:{end_current}:{strand_current}:{pos_current}\n'
                      f'{fasta_cur_dna}\n'
                      f'>INTERSECTION_DNA:{int_dna_name}\n'
                      f'{intersection_dna}\n'
                      f'>PROT_MAIN:{k}:{orf_const}:{start_const}:{end_const}\n'
                      f'{fasta_const_prot}\n'
                      f'>PROT_ALT:{k}:{orf_current}:{start_current}:{end_current}\n'
                      f'{fasta_cur_prot}\n'
                      f'>PROT_MAIN_INERSECTED:{k}:{orf_const}:{start_int}:{end_int}\n'
                      f'{prot_const_intersected}\n'
                      f'>PROT_ALT_INTERSECTED:{k}:{orf_current}:{start_int}:{end_int}\n'
                      f'{prot_cur_intersected}\n'
                      f'=========================================================\n')

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




>GLOBAL_ORF:CH476637:ORF314:ORF313:987452:988331
CTTAGTACAATCAGCACACCCATTAAGGATAATCGAACTCAACTTCTTCTTAGTTCCACAATTTGGGAGACAAATGTTATGGTAAATATGTTGACTCCCCAACTTTCCGTTTGTACTATGCGGTACACAAGTAGTCTCTGTCCTAGAACATAGTATGCCTGCTCTGGGACATTTACTCAAACACTGGGATACAGCACCGCCACCATGGCAGGATGGAACATTCACACAAAGTTTCCCACCTCGACAATCTGCATCCTTGGTACATTTAATGACTTTCGCGGATGCGGGGGCATTTCGAGAAGCTCCTTCATTCGCATCTGGGAGACCATTGTTAAGAGCTTGGAGGATGAAATTCGCAATGGGGGAGGTGGTTAGGGAAGAAAGATAATCGGTAATGTTGGTGGGAGTCTTGGGGCTTCCGGATGGACCAATGAAATTCGGTAAGTTCTGCGATGTGGAAGAATAGCCGGTTGAATCCTCGACGCACAGACTAGAGTTATTCCAGATGGCATCGGCTGCATACTGCCAAAGATTGGTATAGTTATATATGGTTTGTGGAGTATAGTAGATGCGACAATCTGCGGCTTCATAGGCAAATTGTAGTGGAATTTCCGCATTTTTGCGGACTTGATCACGAAGATTAATATCCGCATAATACACAAACACGCTGAGTTGCTCCGTCCTATTTGGAAGGAATGTCGCATCTGGTGAGCTCTGATTTTGTAAAAGTTGTTGAGTGATTTCGATGTTAGCATCAAGGGTAAAAGTATCGTAGCCGCGTGCACCGCGAGATCCAGAAGGGGCTTGCATCGGTCCTGCTGTTGGGGTTCCACCCACCGCAACAACTTTCACACCTGCTTCATGGCGGAAAAATTCCAT
>DNA_MAIN:CH476637:ORF314:988331:987453:3'->5':OUTSIDE
ATGGAATTTTTCCGC

In [71]:
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        
    parser.add_argument('-input_prots', type=str,
                        help='The address to the input predicted proteins from the EMBOSS getorf find 1.')
    parser.add_argument('-input_dna', type=str,
                        help='The address to the input predicted dna from the EMBOSS getorf find 3.')
    parser.add_argument('-min_orf', type=int, default=300, help='Minimum ORF length.')
    parser.add_argument('-output', type=str, default='output.fasta', help='The address to the output.')
    
    args = parser.parse_args()

In [39]:
import os
import re
import tempfile
import pybedtools
import argparse
from pybedtools import bedtool

from collections import defaultdict
from Bio import SeqIO


def main(args):

    fasta_prot_dict = {}
    d = defaultdict(list)

    for record in SeqIO.parse(args.input_prots, 'fasta'):
        s = record.id.split('_')
        s1 = re.split(' |\[|\]|_', record.description)
        fasta_prot_dict[f'{s[0]}_ORF{s[1]}'] = str(record.seq)
        d[s1[0]].append((s1[3], s1[5], 'ORF'+s1[1]))

    fasta_dna_dict = {}

    for record in SeqIO.parse(args.input_dna, 'fasta'):
        s = record.id.split('_')
        s1 = re.split(' |\[|\]|_', record.description)
        fasta_dna_dict[f'{s[0]}_ORF{s[1]}'] = str(record.seq)


    def find_intersection(chr_name, list_of_orfs, start_const, end_const, orf_const, min_orf):
        '''
        Find frameshifts intersections between 2 ORF.
        They should not be divisible by 3.

        Keyword arguments:
            val -- list of ORFs of one chromosome.
            start -- first coordinate.
            finish -- second coordinate.
            min_orf -- ORF length in bases.

        return:
            <generator begin, end, orf>

            begin -- First coordinate of intersected ORF.
            end -- Second coordinate of intersected ORF.
            orf -- Intersected ORF name.
        '''

        start_const, end_const = int(start_const)-1, int(end_const)
        diff_const = end_const - start_const

        for start_current, end_current, orf_current in list_of_orfs:
            start_current, end_current = int(start_current)-1, int(end_current)
            diff_current = end_current - start_current

            if all([abs(diff_current)>args.min_orf, abs(diff_const)>args.min_orf]):
                su, eu, so, eo = f"su:{start_current}", f"eu:{end_current}", f"so:{start_const}", f"eo:{end_const}"

                if all([diff_current>0, diff_const>0, abs(start_current-start_const)%3!=0,
                        start_current<end_const, start_const<end_current]):

                    strand_const, strand_current = '5\'->3\'', '5\'->3\''

                    if (start_current<start_const and end_current>end_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {su:-<25}-{eu:->25} 3\'')
                        print(f'+ current_orf = global_orf:    5\' {su:-<12}>{so:-^12}>{eo:-^12}>{eu:->12} 3\'')
                        print(f'OUTSIDE:{orf_current}\nINSIDE:{orf_const}')
                        print(f'intersection: 5\' {so:-<12}{eo:->12} 3\'')
                        print('===')

                        glob_orf = (start_current, end_current)
                        intersection = (start_const, end_const)
                        position_const, position_current = 'INSIDE', 'OUTSIDE'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                    elif (start_current>start_const and end_current<end_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {so:-<25}-{eo:->25} 3\'')
                        print(f'+ const_orf = global_orf:      5\' {so:-<12}>{su:-^12}>{eu:-^12}>{eo:->12} 3\'')
                        print(f'OUTSIDE:{orf_const}\nINSIDE:{orf_current}')
                        print(f'intersection: 5\' {su:-<12}{eu:->12} 3\'')
                        print('===')

                        glob_orf = (start_const, end_const)
                        intersection = (start_current, end_current)
                        position_const, position_current = 'OUTSIDE', 'INSIDE'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                    elif (start_current>start_const and end_current>end_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {so:-<25}-{eu:->25} 3\'')
                        print(f'+ const_orf = first_orf:       5\' {so:-<12}>{su:-^12}>{eo:->12}>{eu:->12} 3\'')
                        print(f'LEFT_INTERSECTING:{orf_current}\nRIGHT_INTERSECTING:{orf_const}')
                        print(f'intersection: 5\' {su:-<12}{eo:->12} 3\'')
                        print('===')

                        glob_orf = (start_const, end_current)
                        intersection = (start_current, end_const)
                        position_const, position_current = 'RIGHT_INTERSECTING', 'LEFT_INTERSECTING'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                    elif (start_current<start_const and end_current<end_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {su:-<25}-{eo:->25} 3\'')
                        print(f'+ current_orf = first_orf:     5\' {su:-<12}>{so:-^12}>{eu:-^12}>{eo:->12} 3\'')
                        print(f'LEFT_INTERSECTING:{orf_const}\nRIGHT_INTERSECTING:{orf_current}')
                        print(f'intersection: 5\' {so:-<12}{eu:->12} 3\'')
                        print('===') 

                        glob_orf = (start_current, end_const)
                        intersection = (start_const, end_current)
                        position_const, position_current = 'LEFT_INTERSECTING', 'RIGHT_INTERSECTING'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                elif all([diff_current<0, diff_const>0, abs(end_current-start_const)%3!=0,
                         start_current>start_const, end_current<end_const]):

                    strand_const, strand_current = '5\'->3\'', '3\'->5\''
                    start_current, end_current = start_current+1, end_current-1

                    if (start_current<end_const and end_current<start_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {eu:-<12}-{eo:->12} 3\'')
                        print(f'+ current_orf = first_orf:     5\' {so:->12}>{eo:->12} 3\'')
                        print(f'-                              3\' {eu:-<12}<{su:-<12} 5\'')
                        print(f'LEFT_BACKWARD_INTERSECTING:{orf_current}\nRIGHT_BACKWARD_INTERSECTING:{orf_const}')
                        print(f'intersection: 5\' {so:-<12}{su:->12} 3\'')
                        print('===')

                        glob_orf = (start_const, start_current)
                        intersection = (end_current, end_const)
                        position_const, position_current = 'RIGHT_BACKWARD_INTERSECTING', 'LEFT_BACKWARD_INTERSECTING'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                    elif (start_current<end_const and end_current>start_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {so:-<12}-{eo:->12} 3\'')
                        print(f'+ const_orf = global_orf:      5\' {so:-<12}>{eo:->12} 3\'')
                        print(f'-                              3\' {eu:->12}<{su:-<12} 5\'')
                        print(f'OUTSIDE:{orf_const}\nINSIDE:{orf_current}')
                        print(f'intersection: 5\' {eu:-<12}{su:->12} 3\'')
                        print('===')

                        glob_orf = (start_const, end_const)
                        intersection = (end_current, start_current)
                        position_const, position_current = 'OUTSIDE', 'INSIDE'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                    elif (start_current>end_const and end_current<start_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {eu:-<12}-{su:->12} 3\'')
                        print(f'+ const_orf = global_orf:      5\' {so:->12}>{eo:-<12} 3\'')
                        print(f'-                              3\' {eu:-<12}<{su:->12} 5\'')
                        print(f'OUTSIDE:{orf_current}\nINSIDE:{orf_const}')
                        print(f'intersection: 5\' {so:-<12}{eo:->12} 3\'')
                        print('===')

                        glob_orf = (end_current, start_current)
                        intersection = (start_const, end_const)
                        position_const, position_current = 'INSIDE', 'OUTSIDE'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                    else:
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {eu:-<12}-{eo:->12} 3\'')
                        print(f'+ const_orf = first_orf:       5\' {so:->12}>{eo:->12} 3\'')
                        print(f'-                              3\' {eu:-<12}<{su:-<12} 5\'')
                        print(f'LEFT_FORWARD_INTERSECTING:{orf_const}\nRIGHT_FORWARD_INTERSECTING:{orf_current}')
                        print(f'intersection: 5\' {so:-<12}{su:->12} 3\'')
                        print('===')

                        glob_orf = (end_current, end_const)
                        intersection = (start_const, start_current)
                        position_const, position_current = 'LEFT_FORWARD_INTERSECTING', 'RIGHT_FORWARD_INTERSECTING'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                elif all([diff_current>0, diff_const<0, abs(start_current-end_const)%3!=0,
                         start_current<start_const, end_current>end_const]):

                    strand_const, strand_current = '3\'->5\'', '5\'->3\''
                    start_const, end_const = start_const+1, end_const-1

                    if (start_current>end_const and end_current>start_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {eo:-<12}-{eu:->12} 3\'')
                        print(f'+ const_orf = first_orf:       3\' {eo:-<12}<{so:-<12} 5\'')
                        print(f'-                              5\' {su:->12}>{eu:->12} 3\'')
                        print(f'LEFT_BACKWARD_INTERSECTING:{orf_current}\nRIGHT_BACKWARD_INTERSECTING:{orf_const}')
                        print(f'intersection: 5\' {su:-<12}{so:->12} 3\'')
                        print('===')

                        glob_orf = (end_const, end_current)
                        intersection = (start_current, start_const)
                        position_const, position_current = 'RIGHT_BACKWARD_INTERSECTING', 'LEFT_BACKWARD_INTERSECTING'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                    elif (start_current<end_const and end_current<start_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {su:-<12}-{so:->12} 3\'')
                        print(f'+ current_orf = first_orf:     3\' {eo:->12}<{so:->12} 5\'')
                        print(f'-                              5\' {su:-<12}>{eu:-<12} 3\'')
                        print(f'LEFT_FORWARD_INTERSECTING:{orf_const}\nRIGHT_FORWARD_INTERSECTING:{orf_current}')
                        print(f'intersection: 5\' {eo:-<12}{eu:->12} 3\'')
                        print('===')

                        glob_orf = (start_current, start_const)
                        intersection = (end_const, end_current)
                        position_const, position_current = 'LEFT_FORWARD_INTERSECTING', 'RIGHT_FORWARD_INTERSECTING'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                    elif (start_current>end_const and end_current<start_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {eo:-<12}-{so:->12} 3\'')
                        print(f'+ const_orf = global_orf:      3\' {eo:-<12}<{so:->12} 5\'')
                        print(f'-                              5\' {su:->12}>{eu:-<12} 3\'')
                        print(f'OUTSIDE:{orf_const}\nINSIDE:{orf_current}')
                        print(f'intersection: 5\' {su:-<12}{eu:->12} 3\'')
                        print('===')

                        glob_orf = (end_const, start_const)
                        intersection = (start_current, end_current)
                        position_const, position_current = 'OUTSIDE', 'INSIDE'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                    elif (start_current<end_const and end_current>start_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {su:-<12}-{eu:->12} 3\'')
                        print(f'+ current_orf = global_orf:    3\' {eo:->12}<{so:-<12} 5\'')
                        print(f'-                              5\' {su:-<12}>{eu:->12} 3\'')
                        print(f'OUTSIDE:{orf_current}\nINSIDE:{orf_const}')
                        print(f'intersection: 5\' {eo:-<12}{so:->12} 3\'')
                        print('===')

                        glob_orf = (start_current, end_current)
                        intersection = (end_const, start_const)
                        position_const, position_current = 'INSIDE', 'OUTSIDE'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                elif all([diff_current<0, diff_const<0, abs(end_current-end_const)%3!=0,
                         start_current>end_const, end_current<start_const]):

                    strand_const, strand_current = '3\'->5\'', '3\'->5\''
                    start_const, end_const, start_current, end_current =\
                    start_const+1, end_const-1, start_current+1, end_current-1

                    if (start_current>start_const and end_current>end_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {eo:-<25}-{su:->25} 3\'')
                        print(f'- const_orf = first_orf:       3\' {eo:-<12}<{eu:-^12}<{so:-^12}<{su:->12} 5\'')
                        print(f'LEFT_INTERSECTING:{orf_current}\nRIGHT_INTERSECTING:{orf_const}')
                        print(f'intersection: 5\' {eu:-<12}{so:->12} 3\'')
                        print('===')

                        glob_orf = (end_const, start_current)
                        intersection = (end_current, start_const)
                        position_const, position_current = 'RIGHT_INTERSECTING', 'LEFT_INTERSECTING'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                    elif (start_current<start_const and end_current<end_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {eu:-<25}-{so:->25} 3\'')
                        print(f'- current_orf = first_orf:     3\' {eu:-<12}<{eo:-^12}<{su:-^12}<{so:->12} 5\'')
                        print(f'LEFT_INTERSECTING:{orf_const}\nRIGHT_INTERSECTING:{orf_current}')
                        print(f'intersection: 5\' {eo:-<12}{su:->12} 3\'')
                        print('===')

                        glob_orf = (end_current, start_const)
                        intersection = (end_const, start_current)
                        position_const, position_current = 'LEFT_INTERSECTING', 'RIGHT_INTERSECTING'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

                    elif (start_current<start_const and end_current>end_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {eo:-<25}-{so:->25} 3\'')
                        print(f'- const_orf = first_orf:       3\' {eo:-<12}<{eu:-^12}<{su:-^12}<{so:->12} 5\'')
                        print(f'OUTSIDE:{orf_const}\nINSIDE:{orf_current}')
                        print(f'intersection: 5\' {eu:-<12}{su:->12} 3\'')
                        print('===')

                        glob_orf = (end_const, start_const)
                        intersection = (end_current, start_current)
                        position_const, position_current = 'OUTSIDE', 'INSIDE'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current


                    elif (start_current>start_const and end_current<end_const):
                        print(f'{chr_name}: {orf_const} vs {orf_current}')
                        print(f'  global_orf:                  5\' {eu:-<25}-{su:->25} 3\'')
                        print(f'-                              3\' {eu:-<12}<{eo:-^12}<{so:-^12}<{su:->12} 5\'')
                        print(f'OUTSIDE:{orf_current}\nINSIDE:{orf_const}')
                        print(f'intersection: 5\' {eo:-<12}{so:->12} 3\'')
                        print('===')

                        glob_orf = (end_current, start_current)
                        intersection = (end_const, start_const)
                        position_const, position_current = 'INSIDE', 'OUTSIDE'

                        yield start_current, end_current, orf_current, glob_orf, intersection,\
                        strand_const, strand_current, position_const, position_current

    dict_of_intersections = defaultdict(lambda: defaultdict(list))

    for k, v in d.items():

        for start_const, end_const, orf_const in v:

            start_const, end_const = int(start_const), int(end_const)
            diff = abs(end_const - start_const) 
            for start_current, end_current, orf_current,\
            glob_orf, intersection, strand_const, strand_current,\
            position_const, position_current\
            in find_intersection(k, v, start_const, end_const, orf_const, args.min_orf):
                    dict_of_intersections[k][(start_const, end_const, orf_const, strand_const)].\
                        append((start_current, end_current, orf_current, glob_orf, intersection, strand_current,\
                               position_const, position_current))


    def get_intervals(tmp, fasta, thrhld_interval):

        seq_dict = dict()
        with open(tmp.name) as f:
            f = f.read()
            a = bedtool.BedTool(f, from_string=True)
            a = a.sequence(fi=fasta, name=True)
            for record in SeqIO.parse(a.seqfn, 'fasta'):
                if len(str(record.seq)) >= thrhld_interval:
                    seq_dict[record.id] = str(record.seq)

        return seq_dict

    tmp = tempfile.NamedTemporaryFile()

    with open(tmp.name, 'w') as w:
        for k, v in dict_of_intersections.items():
            for (_, _, orf_const, _), vv in v.items():
                if len(vv) > 1:
                    for i in vv:
                        start_intrsct, end_intrsct = int(i[4][0]), int(i[4][1])
                        w.write(f'{k}\t{start_intrsct}\t{end_intrsct}\t{k}:{orf_const}:{i[2]}:'\
                                f'{start_intrsct}:{end_intrsct}\n')
    seq_intrsctd = get_intervals(tmp, args.input_genome, args.interval_length)

    with open(tmp.name, 'w') as w:
        for k, v in dict_of_intersections.items():
            for (_, _, orf_const, _), vv in v.items():
                if len(vv) > 1:
                    for i in vv:
                        start_glob, end_glob = int(i[3][0]), int(i[3][1])
                        w.write(f"{k}\t{start_glob}\t{end_glob}\t{k}:{orf_const}:{i[2]}:"\
                                f"{start_glob}:{end_glob}\n")

    glob_seq = get_intervals(tmp, args.input_genome, args.interval_length)

    with open(args.output, 'w') as w:
        for k, v in dict_of_intersections.items():
            for (start_const, end_const, orf_const, strand_const), vv in v.items():

                fasta_const_dna = fasta_dna_dict[f'{k}_{orf_const}']
                fasta_const_prot = fasta_prot_dict[f'{k}_{orf_const}']

                if len(vv)>1:

                    for (start_current, end_current, orf_current,\
                         glob, intersect, strand_current, pos_const, pos_current) in vv:

                        if orf_const == orf_current:
                            continue

                        start_glob, end_glob = int(glob[0]), int(glob[1])
                        start_int, end_int = int(intersect[0]), int(intersect[1])
                        glob_dna = glob_seq[f'{k}:{orf_const}:{orf_current}:{start_glob}:{end_glob}']

                        try:
                            intersection_dna = seq_intrsctd[f'{k}:{orf_const}:{orf_current}:{start_int}:{end_int}']
                            int_dna_name = f'{k}:{orf_const}:{orf_current}:{start_int}:{end_int}'
                        except:
                            continue

                        fasta_cur_dna = fasta_dna_dict[f'{k}_{orf_current}']
                        fasta_cur_prot = fasta_prot_dict[f'{k}_{orf_current}']

                        Xdnao, Ydnao = start_const, end_const
                        Xdnau, Ydnau = start_current, end_current

#                         if (strand_const == '5\'->3\'' and strand_current == '5\'->3\''):

#                             Xdnao, Ydnao = start_const, end_const
#                             Xdnau, Ydnau = start_current, end_current

#                         elif (strand_const == '5\'->3\'' and strand_current == '3\'->5\''):

#                             Xdnao, Ydnao = start_const, end_const
#                             Xdnau, Ydnau = end_current, start_current

#                         elif (strand_const == '3\'->5\'' and strand_current == '5\'->3\''):

#                             Xdnao, Ydnao = end_const, start_const
#                             Xdnau, Ydnau = start_current, end_current             

#                         elif (strand_const == '3\'->5\'' and strand_current == '3\'->5\''):

#                             Xdnao, Ydnao = end_const, start_const
#                             Xdnau, Ydnau = end_current, start_current

                        if any([all([pos_const == 'INSIDE', strand_const == '5\'->3\'', strand_current == '5\'->3\'']),\
                                all([pos_const == 'INSIDE', strand_const == '3\'->5\'', strand_current == '3\'->5\''])]):

                            prot_const_intersected = fasta_const_prot
                            prot_cur_intersected = fasta_cur_prot[abs(Xdnao-Xdnau)//3:abs(Xdnao-Xdnau)//3+abs(Ydnao-Xdnau)//3]
                
                        elif any([all([pos_const == 'INSIDE', strand_const == '3\'->5\'', strand_current == '5\'->3\'']),\
                                  all([pos_const == 'INSIDE', strand_const == '5\'->3\'', strand_current == '3\'->5\''])]):

                            prot_const_intersected = fasta_const_prot
                            prot_cur_intersected = fasta_cur_prot[abs(Ydnao-Xdnau)//3:abs(Ydnao-Xdnau)//3+abs(Xdnao-Ydnao)//3]


                        elif any([all([pos_const == 'OUTSIDE', strand_const == '5\'->3\'', strand_current == '5\'->3\'']),\
                                 all([pos_const == 'OUTSIDE', strand_const == '3\'->5\'', strand_current == '3\'->5\''])]):
                            
                            prot_const_intersected = fasta_const_prot[abs(Xdnau-Xdnao)//3:abs(Xdnau-Xdnao)//3+abs(Ydnau-Xdnau)//3]
                            prot_cur_intersected = fasta_cur_prot

                        elif any([all([pos_const == 'OUTSIDE', strand_const == '3\'->5\'', strand_current == '5\'->3\'']),\
                                 all([pos_const == 'OUTSIDE', strand_const == '5\'->3\'', strand_current == '3\'->5\''])]):

                            prot_const_intersected = fasta_const_prot[abs(Ydnau-Xdnao)//3:abs(Ydnau-Xdnao)//3+abs(Xdnau-Ydnau)//3]
                            prot_cur_intersected = fasta_cur_prot

                        elif any([all([pos_const == 'LEFT_INTERSECTING', strand_const == '5\'->3\'', strand_current == '5\'->3\'']),\
                                  all([pos_const ==' RIGHT_INTERSECTING', strand_const == '3\'->5\'', strand_current == '3\'->5\''])]):
                            
                            prot_const_intersected = fasta_const_prot[0:abs(Ydnau-Xdnao)//3]
                            prot_cur_intersected = fasta_cur_prot[abs(Xdnao-Xdnau)//3:+abs(Xdnao-Xdnau)//3+abs(Ydnau-Xdnao)//3]
                            
                        elif any([all([pos_const == 'LEFT_INTERSECTING', strand_const == '3\'->5\'', strand_current == '3\'->5\'']),\
                                  all([pos_const ==' RIGHT_INTERSECTING', strand_const == '5\'->3\'', strand_current == '5\'->3\''])]):
                            
                            prot_const_intersected = fasta_const_prot[abs(Xdnau-Xdnao)//3:abs(Xdnau-Xdnao)//3+abs(Ydnao-Xdnau)//3]
                            prot_cur_intersected = fasta_cur_prot[0:+abs(Xdnao-Xdnau)//3]                           
                            

                        w.write(f'>GLOBAL_ORF:{k}:{orf_const}:{orf_current}:{start_glob}:{end_glob}\n'
                              f'{glob_dna}\n'
                              f'>DNA_MAIN:{k}:{orf_const}:{start_const}:{end_const}:{strand_const}:{pos_const}\n'
                              f'{fasta_const_dna}\n'
                              f'>DNA_ALT:{k}:{orf_current}:{start_current}:{end_current}:{strand_current}:{pos_current}\n'
                              f'{fasta_cur_dna}\n'
                              f'>INTERSECTION_DNA:{int_dna_name}\n'
                              f'{intersection_dna}\n'
                              f'>PROT_MAIN:{k}:{orf_const}:{start_const}:{end_const}\n'
                              f'{fasta_const_prot}\n'
                              f'>PROT_ALT:{k}:{orf_current}:{start_current}:{end_current}\n'
                              f'{fasta_cur_prot}\n'
                              f'>PROT_MAIN_INTERSECTED:{k}:{orf_const}:{start_int}:{end_int}\n'
                              f'{prot_const_intersected}\n'
                              f'>PROT_ALT_INTERSECTED:{k}:{orf_current}:{start_int}:{end_int}\n'
                              f'{prot_cur_intersected}\n'
                              f'=========================================================\n')

if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-input_prots', type=str,
                        help='The address to the input predicted proteins from the EMBOSS getorf find 1.')
    parser.add_argument('-input_dna', type=str,
                        help='The address to the input predicted dna from the EMBOSS getorf find 3.')
    parser.add_argument('-input_genome', type=str, help='A Path to the input genome.')
    parser.add_argument('-min_orf', type=int, default=300, help='Minimum ORF length.')
    parser.add_argument('-interval_length', type=int, default=100, help='Minimum length of interval of intersection between 2 ORF.')
    parser.add_argument('-output', type=str, default='output.fasta', help='The address to the output.')

    args = parser.parse_args()

#     class Args():
#         def __init__(self):
#             self.input_prots = 'Scler_orf_prediction/Scler_prot_pred'
#             self.input_dna = 'Scler_orf_prediction/Scler_dna_pred'
#             self.input_genome = 'data/Sclerotinia_sclerotiorum.ASM14694v1.dna.toplevel.fa'
#             self.min_orf = 300
#             self.output = 'output'
#             self.interval_length = 300

#     args = Args()

    main(args)

usage: ipykernel_launcher.py [-h] [-input_prots INPUT_PROTS]
                             [-input_dna INPUT_DNA]
                             [-input_genome INPUT_GENOME] [-min_orf MIN_ORF]
                             [-interval_length INTERVAL_LENGTH]
                             [-output OUTPUT]
ipykernel_launcher.py: error: unrecognized arguments: -f /run/user/1000/jupyter/kernel-97678f97-1655-4665-8b44-1976c07f7ced.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [18]:
a1=
b1=2037460
a2=2038300
b2=2037600

In [19]:
a1b1

1636

In [20]:
a2b2

700

In [30]:
(abs(a2-a1)//3):((abs(a2-a1)+abs(b2-a1))//3)

SyntaxError: illegal target for annotation (<ipython-input-30-10bb09c2adeb>, line 1)

In [23]:
796//3+700//3

498

In [45]:
a1=290817
b1=290017
a2=290052
b2=290835

In [46]:
seq = 'MRRVTVIISHIHWILKRRAYKWWDILKAVNITILRECSLVSQEKSDFNNSRKSSSHQSISKNAVNHRTEWKMLWVTRHSPSGDEDDKSRNKISLWGTISLSAQPNSCQTSSPPDDSHSSVLNIICNPSTTPSMFGESVDTTPSCNNCRIEEFLGSSGFSQPNLSDEHDNREQNTITDKCTAHDEVCKTLAKMIITAETQSCNTTEKHLYPSYDWHRLSNNSMTMNRNFSNLSVKTFGDVKLKINAENDLNGKHKHQDIGES'

In [51]:
seq[0:(a1-a2)//3]

'MRRVTVIISHIHWILKRRAYKWWDILKAVNITILRECSLVSQEKSDFNNSRKSSSHQSISKNAVNHRTEWKMLWVTRHSPSGDEDDKSRNKISLWGTISLSAQPNSCQTSSPPDDSHSSVLNIICNPSTTPSMFGESVDTTPSCNNCRIEEFLGSSGFSQPNLSDEHDNREQNTITDKCTAHDEVCKTLAKMIITAETQSCNTTEKHLYPSYDWHRLSNNSMTMNRNFSNLSVKTFGDVKLKINAENDLNGKHKH'

In [56]:
seq = 'MLMLAVQVVFGIYLKFHITKGFHGKIRKVAVHGHGIVGKAMPVVAWVQMLFGGIAALGFCRDDHLGQCLAHFIMGSAFIGYGILLTIVMLVGQVWLRKTGRSQEFFDSAVIAAWGCVNTFTEHRWGGAWVANDIQHTTMGVIWWAAGLAGIWLSRKRDGSPKRNFIPGFVILITGWAMSGHPQHLPLSTMVHSVFGYTLMAAGLSRIIEIGFLLRDKATLSEDGDVNSFQYVPPFVSSSFQDPMNMANNYSYSTHQVSCSWELPRNK'

In [60]:
seq[0:abs(a1-a2)//3]

'MLMLAVQVVFGIYLKFHITKGFHGKIRKVAVHGHGIVGKAMPVVAWVQMLFGGIAALGFCRDDHLGQCLAHFIMGSAFIGYGILLTIVMLVGQVWLRKTGRSQEFFDSAVIAAWGCVNTFTEHRWGGAWVANDIQHTTMGVIWWAAGLAGIWLSRKRDGSPKRNFIPGFVILITGWAMSGHPQHLPLSTMVHSVFGYTLMAAGLSRIIEIGFLLRDKATLSEDGDVNSFQYVPPFVSSSFQDPMNMANNYSYSTH'

In [12]:
import os
import re
import subprocess
import tempfile
from pybedtools import bedtool
import argparse

from collections import defaultdict
from itertools import groupby
from Bio import SeqIO, Seq


In [246]:
fasta_dna_dict = dict()
orfs_dict = defaultdict(list)

for record in SeqIO.parse('input_predicted_prots_and_dna/Scler_dna_pred', 'fasta'):
    
    spl = re.split(' |_|\[|\]', record.description)
    
    if int(spl[3]) < int(spl[5]):
        fasta_dna_dict[f'{spl[0]}:ORF{spl[1]}:{spl[3]}:{spl[5]}:+'] = record.seq
        orfs_dict[spl[0]].append(f"ORF{spl[1]}:{spl[3]}:{spl[5]}:+")
    else:
        fasta_dna_dict[f'{spl[0]}:ORF{spl[1]}:{spl[3]}:{spl[5]}:-'] = record.seq
        orfs_dict[spl[0]].append(f"ORF{spl[1]}:{spl[3]}:{spl[5]}:-")

In [312]:
# def get_data(args):
    
    
# fasta_prot_dict = {}
# fasta_dna_dict = {}
# d = defaultdict(list)

#     for record in SeqIO.parse(args.input_prots, 'fasta'):
#         s = record.id.split('_')
#         s1 = re.split(' |\[|\]|_', record.description)
#         fasta_prot_dict[f'{s[0]}_ORF{s[1]}'] = str(record.seq)
#         d[s1[0]].append((s1[3], s1[5], 'ORF'+s1[1]))



# for record in SeqIO.parse(args.input_dna, 'fasta'):
#     s = record.id.split('_')
#     s1 = re.split(' |\[|\]|_', record.description)
#     fasta_dna_dict[f'{s[0]}_ORF{s[1]}'] = str(record.seq)

class Intersection:
    
    def __init__(self, args, dna_1_name, dna_1, orfs2):
        
        self.chrom = dna_1_name.split(':')[0]
        self.genome = args.input_genome
        self.min_orf, self.min_dna_inter = args.min_orf, args.interval_length
        self.dna_1_name, self.dna_1 = dna_1_name, dna_1
        self.prot_1_name, self.prot_1 = dna_1_name, Seq.translate(dna_1)
        self.orfs2 = orfs2 
        
        self.global_orf = {}
        self.dna_2 = {}
        self.inter_dna_1 = {}
        self.inter_dna_2 = {}
        self.inter_prot_1 = {}
        self.prot_2 = {}
        self.inter_prot_2 = {}
        
        self.full_batch = ''
        self.status = False
    def get_interval(self, x, y, strand):
            
            if strand == "+":
                a = bedtool.BedTool(f'{self.chrom}\t{x-1}\t{y}\t.\t0\t{strand}',
                                    from_string=True)
                a = a.sequence(fi=args.input_genome, name=True, s=True)
                a = open(a.seqfn).read().lstrip('>').split('\n')
                
            elif strand == '-':
                a = bedtool.BedTool(f'{self.chrom}\t{y-1}\t{x}\t.\t0\t{strand}',
                                    from_string=True)
                a = a.sequence(fi=args.input_genome, s=True)
                a = open(a.seqfn).read().lstrip('>').split('\n')
            
            return a[1]
        
    def _process_intersection(self, dna1_name, dna2_name, fasta_dna_dict):
        
        d1, d2 = dna1_name.split(':'), dna2_name.split(':')
        orf1, orf2 = d1[1], d2[1]
        x1, y1, x2, y2 = int(d1[2]), int(d1[3]), int(d2[2]), int(d2[3])
        strand1, strand2 = d1[4], d2[4]
        
        if all([strand1=='+', strand2=='+', x1<y2, y1>x2,\
                all([abs(x1-y2)>=self.min_dna_inter, abs(x2-y1)>=self.min_dna_inter])]):
            
            if all([x1>x2, y1>y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:+:LEFT_INTERSECTION'] =\
                                    self.get_interval(x1, y2, '+')
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:+:RIGHT_INTERSECTION'] =\
                                    self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:+:LEFT_INTERSECTION']
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:+'] =\
                                    self.prot_1[0:abs(y2-x1)//3]
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:+'] =\
                                    self.prot_2[dna2_name][abs(x1-x2)//3:(abs(x1-x2)+abs(y2-x1))//3]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:+'] =\
                                    self.get_interval(x2, y1, '+')
                self.status = True
                    
            elif all([x1<x2, y1<y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:+:RIGHT_INTERSECTION'] =\
                                    self.get_interval(x2, y1, '+')
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:+:LEFT_INTERSECTION'] =\
                                    self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:+:RIGHT_INTERSECTION']
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:+'] =\
                                    self.prot_1[abs(x2-x1)//3:(abs(x2-x1)+abs(y1-x2))//3]
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:+'] =\
                                    self.prot_2[dna2_name][0:(y1-x2)//3]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:+'] =\
                                    self.get_interval(x1, y2, '+')
                self.status = True
                    
            elif all([x1<x2, y1>y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+:OUTSIDE'] =\
                                    self.dna_2[dna2_name]
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+:INSIDE'] =\
                                    self.dna_2[dna2_name]
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'] =\
                                    self.prot_1[abs(x2-x1)//3:(abs(x2-x1)+abs(y2-x2))//3]
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'] =\
                                    self.prot_2[dna2_name]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+'] =\
                                    self.dna_1
                self.status = True
            elif all([x1<x2, y1<y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+:INSIDE'] =\
                                    self.dna_1
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+:OUTSIDE'] =\
                                    self.dna_1
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{x1}:+'] =\
                                    self.prot_1
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+'] =\
                                    self.prot_2[dna2_name][abs(x1-x2)//3:(abs(x1-x2)+abs(y1-x1))//3]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'] =\
                                    self.dna_2[dna2_name]
                self.status = True

        elif all([strand1=='-', strand2=='-', x1>y2, y1<x2,\
                all([abs(x1-y2)>=self.min_dna_inter, abs(x2-y1)>=self.min_dna_inter])]):
            
            if all([x1>x2, y1>y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:-:LEFT_INTERSECTION'] =\
                                    self.get_interval(x2, y1, '-')
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:-:RIGHT_INTERSECTION'] =\
                                    self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:-:LEFT_INTERSECTION']
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:-'] =\
                                    self.prot_1[abs(x2-x1)//3:(abs(x2-x1)+abs(y1-x2))//3]
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{y1}:-'] =\
                                    self.prot_2[dna2_name][0:abs(y1-x2)//3]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{x1}:+'] =\
                                    self.get_interval(y2, x1, '+')
                self.status = True
                    
            elif all([x1<x2, y1<y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:-:RIGHT_INTERSECTION'] =\
                                    self.get_interval(x1, y2, '-')
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:-:LEFT_INTERSECTION'] =\
                                    self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:-:RIGHT_INTERSECTION']
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:-'] =\
                                    self.prot_1[0:abs(y2-x1)//3]
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:-'] =\
                                    self.prot_2[dna2_name][abs(x1-x2)//3:(abs(x1-x2)+abs(y2-x1))//3]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{x2}:+'] =\
                                    self.get_interval(y1, x2, '+')
                self.status = True    
            elif all([x1<x2, y1>y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-:INSIDE'] =\
                                    self.dna_1
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-:OUTSIDE'] =\
                                    self.dna_1
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'] =\
                                    self.prot_1
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'] =\
                                    self.prot_2[dna2_name][abs(x1-x2)//3:(abs(x1-x2)+abs(y1-x1))//3]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'] =\
                                    self.get_interval(x2, y2, '-')
                self.status = True    
            elif all([x1>x2, y1<y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-:OUTSIDE'] =\
                                    self.dna_2[dna2_name]
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-:INSIDE'] =\
                                    self.dna_2[dna2_name]
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'] =\
                                    self.prot_1[abs(x2-x1)//3:(abs(x2-x1)+abs(y2-x2))//3]
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'] =\
                                    self.prot_2[dna2_name]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'] =\
                                    self.dna_1
                self.status = True
        elif all([strand1=='+', strand2=='-', x1<x2, y1>y2,\
                all([abs(x1-x2)>=self.min_dna_inter, abs(y1-y2)>=self.min_dna_inter])]):
            
            if all([x1>y2, y1>x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{x2}:+:LEFT_INTERSECTION'] =\
                                    self.get_interval(x1, x2, '+')
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{x1}:-:RIGHT_INTERSECTION'] =\
                                    self.get_interval(x2, x1, '-')
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{x2}:+'] =\
                                    self.prot_1[0:abs(x2-x1)//3]
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{x1}:-'] =\
                                    self.prot_2[dna2_name][0:abs(x1-x2)//3]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{y1}:+'] =\
                                    self.get_interval(y2, y1, "+")
                self.status = True    
            elif all([x1<y2, y1<x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{y1}:+:RIGHT_INTERSECTION'] =\
                                    self.get_interval(y2, y1, '+')
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{y2}:-:LEFT_INTERSECTION'] =\
                                    self.get_interval(y1, y2, '-')
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{y1}:+'] =\
                                    self.prot_1[abs(y2-x1)//3:(abs(y2-x1)+abs(y1-y2))//3]
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{y2}:-'] =\
                                    self.prot_2[dna2_name][abs(y1-x2)//3:(abs(y1-x2)+abs(y2-y1))//3]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{x2}:+'] =\
                                    self.get_interval(x1, x2, "+")
                self.status = True
            elif all([x1<y2, y1>x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{x2}:+:OUTSIDE'] =\
                                    self.get_interval(y2, x2, '+')
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-:INSIDE'] =\
                                    self.dna_2[dna2_name]
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{x2}:+'] =\
                                    self.prot_1[abs(y2-x1)//3:(abs(y2-x1)+abs(x2-y2))//3]
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'] =\
                                    self.prot_2[dna2_name]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+'] =\
                                    self.get_interval(x1, y1, "+")
                self.status = True    
            elif all([x1>y2, y1<x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+:INSIDE'] =\
                                    self.dna_1
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{x1}:-:OUTSIDE'] =\
                                    self.get_interval(y1, x1, '-')
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+'] =\
                                    self.prot_1
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{x1}:-'] =\
                                    self.prot_2[dna2_name][(y1-x2)//3:(y1-x2+x1-y1)//3]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'] =\
                                    self.get_interval(x2 ,y2, "-")
                self.status = True
        elif all([strand1=='-', strand2=='+', x1>x2, y1<y2,\
                all([abs(x1-x2)>=self.min_dna_inter, abs(y1-y2)>=self.min_dna_inter])]):
            
            if all([x1<y2, y1<x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{x2}:-:RIGHT_INTERSECTION'] =\
                                    self.get_interval(x1, x2, "-")
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{x1}:+:LEFT_INTERSECTION'] =\
                                    self.get_interval(x2, x1, '+')
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{x2}:-'] =\
                                    self.prot_1[0:(x2-x1)//3]
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{x1}:+'] =\
                                    self.prot_2[dna2_name][0:(x1-x2)//3]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{y2}:+'] =\
                                    self.get_interval(y1 ,y2, "+") 
                self.status = True
            elif all([x1>y2, y1>x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{y1}:-:LEFT_INTERSECTION'] =\
                                    self.get_interval(y2, y1, "-")
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{y2}:+:RIGHT_INTERSECTION'] =\
                                    self.get_interval(y1, y2, '+')
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{y1}:-'] =\
                                    self.prot_1[(y2-x1)//3:(y2-x1+x2-y2)//3]
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{y2}:+'] =\
                                    self.prot_2[dna2_name][(y1-x2)//3:(y1-x2+y2-y1)//3]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{x1}:+'] =\
                                    self.get_interval(x2 ,x1, "+")
                self.status = True
            elif all([x1>y2, y1<x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{x2}:-:OUTSIDE'] =\
                                    self.get_interval(y2, x2, "-")
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+:INSIDE'] =\
                                    self.dna_2[dna2_name]
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{x2}:-'] =\
                                    self.prot_1[(y2-x1)//3:(y2-x1+x2-y2)//3]
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'] =\
                                    self.prot_2[dna2_name]
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'] =\
                                    self.get_interval(x1 ,y1, "-")
                self.status = True
            elif all([x1<y2, y1>x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                self.dna_2[dna2_name] = fasta_dna_dict[dna2_name]
                self.prot_2[dna2_name] = Seq.translate(self.dna_2[dna2_name])
                
                self.inter_dna_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-:INSIDE'] =\
                                    self.dna_1
                self.inter_dna_2[f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{x1}:+:OUTSIDE'] =\
                                    self.get_interval(y1, x1, "+")
                self.inter_prot_1[f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'] =\
                                    self.prot_1
                self.inter_prot_2[f'{self.chrom}:{orf1}_vs_{orf2}:{y1-x2}:{y1-x2+x1-y1}:+'] =\
                                    self.prot_2[dna2_name][(y1-x2)//3:(y1-x2+x1-y1)//3]
                    
                self.global_orf[f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'] = self.dna_2[dna2_name]
                self.status = True
        
        
    def get_dna_intersection(self, fasta_dna_dict):
        
        spl1 = self.dna_1_name.split(':')
        
        for i in self.orfs2:
            dna_2_name = self.chrom+':'+i
            
            if len(str(fasta_dna_dict[self.chrom+':'+i])) >= self.min_orf:
                spl2 = dna_2_name.split(':')
                if spl1[1] == spl2[1]:
                    continue
                
                self._process_intersection(self.dna_1_name, dna_2_name, fasta_dna_dict)
        
    def get_full_batch(self):
        
        self.full_batch = self.full_batch + f">{self.global_orf_name}\n{self.global_orf}\n"\
                                            f">{self.dna_1_name}\n{self.dna_1}\n"\
                                            f">{self.dna_2_name}\n{self.dna_2}\n"\
                                            f">{self.inter_dna_name_1}\n{self_inter_dna_1}\n"\
                                            f">{self.inter_dna_name_2}\n{self_inter_dna_2}\n"\
                                            f">{self.prot_1_name}\n{self.prot_1}\n"\
                                            f">{self.prot_2_name}\n{self.prot_2}\n"\
                                            f">{self.inter_prot_name_1}\n{self.inter_prot_1}\n"\
                                            f">{self.inter_prot_name_2}\n{self.inter_prot_2}\n"\
                                            f"=========================================================\n"
     
        
    

In [313]:
Inter.dna_2

{'CH476625:ORF610:1781798:1782235:+': Seq('ATGAGAGACGCATCATCATATAATGCCATATTCCACTGTGCATGCCATGCCGGT...GTC', SingleLetterAlphabet())}

In [314]:
fasta_dna_dict['CH476621:ORF2:9676:10020:+']

Seq('ATGGTTGCACTTCTTTCTCCTATCAATACTGACTCAATGGCTGCTTCTCAGCAG...ATC', SingleLetterAlphabet())

In [315]:
object_list = []
for chrom, orfs in orfs_dict.items():

    for dna1_name in orfs:
        if len(str(fasta_dna_dict[chrom+':'+dna1_name])) >= args.min_orf:
            Inter = Intersection(args, chrom+':'+dna1_name,\
                                fasta_dna_dict[chrom+':'+dna1_name],
                                orfs)
            Inter.get_dna_intersection(fasta_dna_dict)  
            if Inter.status:
                object_list.append(Inter)

process: CH476621 ORF5 vs ORF6
process: CH476621 ORF6 vs ORF5
process: CH476621 ORF8 vs ORF1805
process: CH476621 ORF10 vs ORF9
process: CH476621 ORF16 vs ORF1792
process: CH476621 ORF18 vs ORF1789
process: CH476621 ORF21 vs ORF20
process: CH476621 ORF29 vs ORF1776
process: CH476621 ORF30 vs ORF29
process: CH476621 ORF30 vs ORF1776
process: CH476621 ORF41 vs ORF1770
process: CH476621 ORF42 vs ORF1767
process: CH476621 ORF45 vs ORF1766
process: CH476621 ORF47 vs ORF1761
process: CH476621 ORF48 vs ORF1758
process: CH476621 ORF54 vs ORF1754
process: CH476621 ORF56 vs ORF54
process: CH476621 ORF56 vs ORF55
process: CH476621 ORF56 vs ORF1752
process: CH476621 ORF56 vs ORF1753
process: CH476621 ORF56 vs ORF1754
process: CH476621 ORF65 vs ORF64
process: CH476621 ORF72 vs ORF1734
process: CH476621 ORF74 vs ORF1733
process: CH476621 ORF80 vs ORF1728
process: CH476621 ORF86 vs ORF1724
process: CH476621 ORF87 vs ORF1722
process: CH476621 ORF88 vs ORF1722
process: CH476621 ORF89 vs ORF1722
process

process: CH476621 ORF677 vs ORF1143
process: CH476621 ORF679 vs ORF678
process: CH476621 ORF682 vs ORF1130
process: CH476621 ORF684 vs ORF1128
process: CH476621 ORF685 vs ORF1127
process: CH476621 ORF687 vs ORF1124
process: CH476621 ORF687 vs ORF1125
process: CH476621 ORF688 vs ORF1122
process: CH476621 ORF689 vs ORF1117
process: CH476621 ORF690 vs ORF689
process: CH476621 ORF690 vs ORF1115
process: CH476621 ORF690 vs ORF1116
process: CH476621 ORF690 vs ORF1117
process: CH476621 ORF700 vs ORF1097
process: CH476621 ORF702 vs ORF701
process: CH476621 ORF702 vs ORF1096
process: CH476621 ORF703 vs ORF1094
process: CH476621 ORF703 vs ORF1095
process: CH476621 ORF710 vs ORF1080
process: CH476621 ORF710 vs ORF1081
process: CH476621 ORF713 vs ORF1076
process: CH476621 ORF717 vs ORF1069
process: CH476621 ORF725 vs ORF1061
process: CH476621 ORF726 vs ORF1058
process: CH476621 ORF726 vs ORF1059
process: CH476621 ORF726 vs ORF1060
process: CH476621 ORF731 vs ORF1057
process: CH476621 ORF732 vs ORF

process: CH476621 ORF1382 vs ORF1381
process: CH476621 ORF1385 vs ORF1386
process: CH476621 ORF1386 vs ORF1385
process: CH476621 ORF1388 vs ORF414
process: CH476621 ORF1389 vs ORF411
process: CH476621 ORF1389 vs ORF412
process: CH476621 ORF1391 vs ORF400
process: CH476621 ORF1391 vs ORF401
process: CH476621 ORF1391 vs ORF402
process: CH476621 ORF1395 vs ORF396
process: CH476621 ORF1396 vs ORF396
process: CH476621 ORF1399 vs ORF1400
process: CH476621 ORF1400 vs ORF1399
process: CH476621 ORF1402 vs ORF393
process: CH476621 ORF1407 vs ORF388
process: CH476621 ORF1424 vs ORF363
process: CH476621 ORF1424 vs ORF364
process: CH476621 ORF1426 vs ORF357
process: CH476621 ORF1426 vs ORF358
process: CH476621 ORF1427 vs ORF1428
process: CH476621 ORF1428 vs ORF1427
process: CH476621 ORF1436 vs ORF351
process: CH476621 ORF1437 vs ORF351
process: CH476621 ORF1438 vs ORF347
process: CH476621 ORF1443 vs ORF1444
process: CH476621 ORF1444 vs ORF1443
process: CH476621 ORF1445 vs ORF336
process: CH476621 O

process: CH476622 ORF120 vs ORF1649
process: CH476622 ORF123 vs ORF1644
process: CH476622 ORF124 vs ORF1642
process: CH476622 ORF128 vs ORF1638
process: CH476622 ORF131 vs ORF129
process: CH476622 ORF131 vs ORF130
process: CH476622 ORF136 vs ORF1629
process: CH476622 ORF138 vs ORF1627
process: CH476622 ORF151 vs ORF1615
process: CH476622 ORF152 vs ORF1612
process: CH476622 ORF156 vs ORF1610
process: CH476622 ORF161 vs ORF160
process: CH476622 ORF161 vs ORF1604
process: CH476622 ORF161 vs ORF1605
process: CH476622 ORF161 vs ORF1606
process: CH476622 ORF162 vs ORF1603
process: CH476622 ORF164 vs ORF1602
process: CH476622 ORF167 vs ORF1596
process: CH476622 ORF169 vs ORF168
process: CH476622 ORF172 vs ORF171
process: CH476622 ORF172 vs ORF1594
process: CH476622 ORF172 vs ORF1595
process: CH476622 ORF173 vs ORF1593
process: CH476622 ORF178 vs ORF1586
process: CH476622 ORF179 vs ORF1585
process: CH476622 ORF189 vs ORF1562
process: CH476622 ORF192 vs ORF1556
process: CH476622 ORF193 vs ORF15

process: CH476622 ORF738 vs ORF1027
process: CH476622 ORF739 vs ORF1027
process: CH476622 ORF740 vs ORF1024
process: CH476622 ORF741 vs ORF1024
process: CH476622 ORF747 vs ORF748
process: CH476622 ORF748 vs ORF747
process: CH476622 ORF749 vs ORF1021
process: CH476622 ORF750 vs ORF1018
process: CH476622 ORF750 vs ORF1019
process: CH476622 ORF751 vs ORF750
process: CH476622 ORF751 vs ORF1018
process: CH476622 ORF751 vs ORF1019
process: CH476622 ORF753 vs ORF1016
process: CH476622 ORF754 vs ORF1016
process: CH476622 ORF759 vs ORF1010
process: CH476622 ORF769 vs ORF1001
process: CH476622 ORF770 vs ORF998
process: CH476622 ORF774 vs ORF996
process: CH476622 ORF775 vs ORF774
process: CH476622 ORF775 vs ORF996
process: CH476622 ORF778 vs ORF776
process: CH476622 ORF778 vs ORF777
process: CH476622 ORF782 vs ORF994
process: CH476622 ORF783 vs ORF991
process: CH476622 ORF785 vs ORF990
process: CH476622 ORF787 vs ORF786
process: CH476622 ORF793 vs ORF988
process: CH476622 ORF795 vs ORF986
process

process: CH476622 ORF1316 vs ORF446
process: CH476622 ORF1320 vs ORF444
process: CH476622 ORF1321 vs ORF442
process: CH476622 ORF1322 vs ORF440
process: CH476622 ORF1322 vs ORF1323
process: CH476622 ORF1323 vs ORF1322
process: CH476622 ORF1325 vs ORF1326
process: CH476622 ORF1326 vs ORF439
process: CH476622 ORF1326 vs ORF1325
process: CH476622 ORF1332 vs ORF434
process: CH476622 ORF1332 vs ORF1333
process: CH476622 ORF1333 vs ORF434
process: CH476622 ORF1333 vs ORF1332
process: CH476622 ORF1337 vs ORF426
process: CH476622 ORF1348 vs ORF414
process: CH476622 ORF1348 vs ORF415
process: CH476622 ORF1353 vs ORF413
process: CH476622 ORF1356 vs ORF412
process: CH476622 ORF1357 vs ORF409
process: CH476622 ORF1360 vs ORF404
process: CH476622 ORF1360 vs ORF405
process: CH476622 ORF1363 vs ORF1364
process: CH476622 ORF1364 vs ORF1363
process: CH476622 ORF1365 vs ORF401
process: CH476622 ORF1373 vs ORF396
process: CH476622 ORF1389 vs ORF383
process: CH476622 ORF1392 vs ORF380
process: CH476622 OR

process: CH476623 ORF242 vs ORF241
process: CH476623 ORF242 vs ORF1419
process: CH476623 ORF242 vs ORF1420
process: CH476623 ORF252 vs ORF1409
process: CH476623 ORF254 vs ORF1405
process: CH476623 ORF257 vs ORF256
process: CH476623 ORF258 vs ORF1401
process: CH476623 ORF268 vs ORF1396
process: CH476623 ORF269 vs ORF1393
process: CH476623 ORF287 vs ORF1374
process: CH476623 ORF296 vs ORF297
process: CH476623 ORF297 vs ORF296
process: CH476623 ORF300 vs ORF1367
process: CH476623 ORF309 vs ORF1358
process: CH476623 ORF311 vs ORF1357
process: CH476623 ORF317 vs ORF1352
process: CH476623 ORF318 vs ORF1351
process: CH476623 ORF319 vs ORF1350
process: CH476623 ORF322 vs ORF321
process: CH476623 ORF322 vs ORF1348
process: CH476623 ORF324 vs ORF1345
process: CH476623 ORF329 vs ORF1342
process: CH476623 ORF330 vs ORF1338
process: CH476623 ORF335 vs ORF334
process: CH476623 ORF335 vs ORF1336
process: CH476623 ORF338 vs ORF1332
process: CH476623 ORF341 vs ORF1328
process: CH476623 ORF342 vs ORF132

process: CH476623 ORF1039 vs ORF634
process: CH476623 ORF1040 vs ORF630
process: CH476623 ORF1044 vs ORF627
process: CH476623 ORF1045 vs ORF627
process: CH476623 ORF1051 vs ORF619
process: CH476623 ORF1054 vs ORF1055
process: CH476623 ORF1055 vs ORF1054
process: CH476623 ORF1064 vs ORF604
process: CH476623 ORF1065 vs ORF603
process: CH476623 ORF1069 vs ORF596
process: CH476623 ORF1072 vs ORF594
process: CH476623 ORF1073 vs ORF593
process: CH476623 ORF1079 vs ORF588
process: CH476623 ORF1080 vs ORF588
process: CH476623 ORF1083 vs ORF586
process: CH476623 ORF1089 vs ORF581
process: CH476623 ORF1102 vs ORF561
process: CH476623 ORF1107 vs ORF556
process: CH476623 ORF1112 vs ORF550
process: CH476623 ORF1113 vs ORF546
process: CH476623 ORF1113 vs ORF547
process: CH476623 ORF1115 vs ORF1116
process: CH476623 ORF1116 vs ORF1115
process: CH476623 ORF1123 vs ORF531
process: CH476623 ORF1134 vs ORF507
process: CH476623 ORF1134 vs ORF508
process: CH476623 ORF1135 vs ORF1137
process: CH476623 ORF11

process: CH476624 ORF21 vs ORF1429
process: CH476624 ORF23 vs ORF1425
process: CH476624 ORF24 vs ORF1422
process: CH476624 ORF26 vs ORF1418
process: CH476624 ORF27 vs ORF1417
process: CH476624 ORF30 vs ORF1415
process: CH476624 ORF33 vs ORF32
process: CH476624 ORF33 vs ORF1412
process: CH476624 ORF37 vs ORF1407
process: CH476624 ORF39 vs ORF1405
process: CH476624 ORF39 vs ORF1406
process: CH476624 ORF41 vs ORF1397
process: CH476624 ORF43 vs ORF1395
process: CH476624 ORF50 vs ORF1389
process: CH476624 ORF53 vs ORF1387
process: CH476624 ORF54 vs ORF1385
process: CH476624 ORF57 vs ORF1383
process: CH476624 ORF58 vs ORF1382
process: CH476624 ORF68 vs ORF67
process: CH476624 ORF78 vs ORF1370
process: CH476624 ORF81 vs ORF1362
process: CH476624 ORF82 vs ORF1362
process: CH476624 ORF87 vs ORF1360
process: CH476624 ORF89 vs ORF1357
process: CH476624 ORF92 vs ORF1355
process: CH476624 ORF94 vs ORF1350
process: CH476624 ORF101 vs ORF1345
process: CH476624 ORF103 vs ORF102
process: CH476624 ORF10

process: CH476624 ORF666 vs ORF796
process: CH476624 ORF667 vs ORF666
process: CH476624 ORF667 vs ORF796
process: CH476624 ORF667 vs ORF797
process: CH476624 ORF670 vs ORF787
process: CH476624 ORF672 vs ORF670
process: CH476624 ORF672 vs ORF671
process: CH476624 ORF672 vs ORF787
process: CH476624 ORF672 vs ORF788
process: CH476624 ORF675 vs ORF780
process: CH476624 ORF686 vs ORF685
process: CH476624 ORF687 vs ORF775
process: CH476624 ORF709 vs ORF756
process: CH476624 ORF710 vs ORF755
process: CH476624 ORF713 vs ORF750
process: CH476624 ORF716 vs ORF715
process: CH476624 ORF721 vs ORF742
process: CH476624 ORF730 vs ORF731
process: CH476624 ORF731 vs ORF730
process: CH476624 ORF742 vs ORF721
process: CH476624 ORF750 vs ORF713
process: CH476624 ORF753 vs ORF755
process: CH476624 ORF754 vs ORF755
process: CH476624 ORF755 vs ORF710
process: CH476624 ORF755 vs ORF753
process: CH476624 ORF755 vs ORF754
process: CH476624 ORF756 vs ORF709
process: CH476624 ORF757 vs ORF758
process: CH476624 OR

process: CH476624 ORF1251 vs ORF190
process: CH476624 ORF1252 vs ORF188
process: CH476624 ORF1256 vs ORF180
process: CH476624 ORF1257 vs ORF176
process: CH476624 ORF1263 vs ORF1265
process: CH476624 ORF1264 vs ORF1265
process: CH476624 ORF1265 vs ORF1263
process: CH476624 ORF1265 vs ORF1264
process: CH476624 ORF1273 vs ORF1274
process: CH476624 ORF1274 vs ORF1273
process: CH476624 ORF1282 vs ORF157
process: CH476624 ORF1286 vs ORF1287
process: CH476624 ORF1287 vs ORF1286
process: CH476624 ORF1290 vs ORF145
process: CH476624 ORF1292 vs ORF142
process: CH476624 ORF1292 vs ORF143
process: CH476624 ORF1295 vs ORF138
process: CH476624 ORF1296 vs ORF138
process: CH476624 ORF1297 vs ORF1298
process: CH476624 ORF1298 vs ORF1297
process: CH476624 ORF1301 vs ORF135
process: CH476624 ORF1307 vs ORF1308
process: CH476624 ORF1308 vs ORF1307
process: CH476624 ORF1310 vs ORF1311
process: CH476624 ORF1311 vs ORF1310
process: CH476624 ORF1312 vs ORF127
process: CH476624 ORF1317 vs ORF125
process: CH476

process: CH476625 ORF466 vs ORF1065
process: CH476625 ORF467 vs ORF1065
process: CH476625 ORF468 vs ORF1063
process: CH476625 ORF469 vs ORF1063
process: CH476625 ORF470 vs ORF1063
process: CH476625 ORF471 vs ORF1059
process: CH476625 ORF472 vs ORF1057
process: CH476625 ORF476 vs ORF1049
process: CH476625 ORF478 vs ORF1045
process: CH476625 ORF479 vs ORF478
process: CH476625 ORF479 vs ORF1045
process: CH476625 ORF483 vs ORF1043
process: CH476625 ORF484 vs ORF483
process: CH476625 ORF484 vs ORF1042
process: CH476625 ORF484 vs ORF1043
process: CH476625 ORF485 vs ORF486
process: CH476625 ORF486 vs ORF485
process: CH476625 ORF487 vs ORF1040
process: CH476625 ORF489 vs ORF1038
process: CH476625 ORF491 vs ORF1036
process: CH476625 ORF504 vs ORF1029
process: CH476625 ORF505 vs ORF1028
process: CH476625 ORF508 vs ORF1026
process: CH476625 ORF510 vs ORF1022
process: CH476625 ORF523 vs ORF1008
process: CH476625 ORF536 vs ORF994
process: CH476625 ORF537 vs ORF990
process: CH476625 ORF538 vs ORF988

process: CH476625 ORF1026 vs ORF508
process: CH476625 ORF1028 vs ORF505
process: CH476625 ORF1029 vs ORF504
process: CH476625 ORF1036 vs ORF491
process: CH476625 ORF1038 vs ORF489
process: CH476625 ORF1040 vs ORF487
process: CH476625 ORF1042 vs ORF484
process: CH476625 ORF1043 vs ORF483
process: CH476625 ORF1043 vs ORF484
process: CH476625 ORF1045 vs ORF478
process: CH476625 ORF1045 vs ORF479
process: CH476625 ORF1049 vs ORF476
process: CH476625 ORF1053 vs ORF1054
process: CH476625 ORF1054 vs ORF1053
process: CH476625 ORF1056 vs ORF1057
process: CH476625 ORF1057 vs ORF472
process: CH476625 ORF1057 vs ORF1056
process: CH476625 ORF1059 vs ORF471
process: CH476625 ORF1062 vs ORF1063
process: CH476625 ORF1063 vs ORF468
process: CH476625 ORF1063 vs ORF469
process: CH476625 ORF1063 vs ORF470
process: CH476625 ORF1063 vs ORF1062
process: CH476625 ORF1064 vs ORF1065
process: CH476625 ORF1065 vs ORF466
process: CH476625 ORF1065 vs ORF467
process: CH476625 ORF1065 vs ORF1064
process: CH476625 OR

process: CH476626 ORF102 vs ORF101
process: CH476626 ORF102 vs ORF1130
process: CH476626 ORF102 vs ORF1131
process: CH476626 ORF104 vs ORF103
process: CH476626 ORF105 vs ORF1129
process: CH476626 ORF106 vs ORF1126
process: CH476626 ORF108 vs ORF107
process: CH476626 ORF110 vs ORF109
process: CH476626 ORF119 vs ORF1111
process: CH476626 ORF119 vs ORF1112
process: CH476626 ORF119 vs ORF1113
process: CH476626 ORF120 vs ORF121
process: CH476626 ORF121 vs ORF120
process: CH476626 ORF128 vs ORF1103
process: CH476626 ORF131 vs ORF1099
process: CH476626 ORF132 vs ORF1097
process: CH476626 ORF134 vs ORF1094
process: CH476626 ORF135 vs ORF1094
process: CH476626 ORF136 vs ORF137
process: CH476626 ORF136 vs ORF1093
process: CH476626 ORF137 vs ORF136
process: CH476626 ORF137 vs ORF1093
process: CH476626 ORF142 vs ORF1091
process: CH476626 ORF146 vs ORF1083
process: CH476626 ORF148 vs ORF1080
process: CH476626 ORF154 vs ORF1077
process: CH476626 ORF156 vs ORF1074
process: CH476626 ORF163 vs ORF1067


process: CH476626 ORF768 vs ORF769
process: CH476626 ORF769 vs ORF768
process: CH476626 ORF779 vs ORF410
process: CH476626 ORF781 vs ORF782
process: CH476626 ORF782 vs ORF407
process: CH476626 ORF782 vs ORF408
process: CH476626 ORF782 vs ORF781
process: CH476626 ORF789 vs ORF399
process: CH476626 ORF790 vs ORF396
process: CH476626 ORF792 vs ORF395
process: CH476626 ORF795 vs ORF390
process: CH476626 ORF796 vs ORF389
process: CH476626 ORF797 vs ORF388
process: CH476626 ORF803 vs ORF804
process: CH476626 ORF804 vs ORF386
process: CH476626 ORF804 vs ORF803
process: CH476626 ORF811 vs ORF377
process: CH476626 ORF813 vs ORF814
process: CH476626 ORF814 vs ORF813
process: CH476626 ORF816 vs ORF373
process: CH476626 ORF821 vs ORF370
process: CH476626 ORF822 vs ORF368
process: CH476626 ORF822 vs ORF369
process: CH476626 ORF829 vs ORF830
process: CH476626 ORF830 vs ORF829
process: CH476626 ORF833 vs ORF365
process: CH476626 ORF837 vs ORF838
process: CH476626 ORF838 vs ORF837
process: CH476626 OR

process: CH476627 ORF103 vs ORF1089
process: CH476627 ORF104 vs ORF1087
process: CH476627 ORF105 vs ORF1087
process: CH476627 ORF106 vs ORF1087
process: CH476627 ORF107 vs ORF1087
process: CH476627 ORF113 vs ORF112
process: CH476627 ORF114 vs ORF1081
process: CH476627 ORF117 vs ORF1070
process: CH476627 ORF120 vs ORF1069
process: CH476627 ORF121 vs ORF119
process: CH476627 ORF121 vs ORF120
process: CH476627 ORF121 vs ORF1069
process: CH476627 ORF129 vs ORF130
process: CH476627 ORF130 vs ORF129
process: CH476627 ORF140 vs ORF139
process: CH476627 ORF140 vs ORF1064
process: CH476627 ORF142 vs ORF1056
process: CH476627 ORF150 vs ORF1054
process: CH476627 ORF150 vs ORF1055
process: CH476627 ORF156 vs ORF1043
process: CH476627 ORF156 vs ORF1044
process: CH476627 ORF157 vs ORF156
process: CH476627 ORF157 vs ORF1043
process: CH476627 ORF157 vs ORF1044
process: CH476627 ORF158 vs ORF1044
process: CH476627 ORF160 vs ORF1039
process: CH476627 ORF182 vs ORF180
process: CH476627 ORF182 vs ORF181
p

process: CH476627 ORF819 vs ORF368
process: CH476627 ORF820 vs ORF366
process: CH476627 ORF821 vs ORF364
process: CH476627 ORF825 vs ORF360
process: CH476627 ORF826 vs ORF827
process: CH476627 ORF827 vs ORF359
process: CH476627 ORF827 vs ORF826
process: CH476627 ORF832 vs ORF834
process: CH476627 ORF833 vs ORF834
process: CH476627 ORF834 vs ORF832
process: CH476627 ORF834 vs ORF833
process: CH476627 ORF841 vs ORF356
process: CH476627 ORF843 vs ORF355
process: CH476627 ORF845 vs ORF352
process: CH476627 ORF847 vs ORF848
process: CH476627 ORF848 vs ORF351
process: CH476627 ORF848 vs ORF847
process: CH476627 ORF849 vs ORF346
process: CH476627 ORF857 vs ORF858
process: CH476627 ORF858 vs ORF857
process: CH476627 ORF864 vs ORF334
process: CH476627 ORF865 vs ORF329
process: CH476627 ORF867 vs ORF321
process: CH476627 ORF867 vs ORF322
process: CH476627 ORF867 vs ORF868
process: CH476627 ORF868 vs ORF321
process: CH476627 ORF868 vs ORF322
process: CH476627 ORF868 vs ORF867
process: CH476627 OR

process: CH476629 ORF280 vs ORF825
process: CH476629 ORF282 vs ORF821
process: CH476629 ORF282 vs ORF823
process: CH476629 ORF288 vs ORF287
process: CH476629 ORF289 vs ORF799
process: CH476629 ORF290 vs ORF794
process: CH476629 ORF295 vs ORF294
process: CH476629 ORF303 vs ORF783
process: CH476629 ORF304 vs ORF782
process: CH476629 ORF309 vs ORF777
process: CH476629 ORF315 vs ORF775
process: CH476629 ORF316 vs ORF774
process: CH476629 ORF320 vs ORF770
process: CH476629 ORF336 vs ORF335
process: CH476629 ORF347 vs ORF750
process: CH476629 ORF349 vs ORF348
process: CH476629 ORF353 vs ORF352
process: CH476629 ORF353 vs ORF743
process: CH476629 ORF355 vs ORF738
process: CH476629 ORF356 vs ORF357
process: CH476629 ORF356 vs ORF736
process: CH476629 ORF357 vs ORF356
process: CH476629 ORF358 vs ORF733
process: CH476629 ORF358 vs ORF734
process: CH476629 ORF359 vs ORF732
process: CH476629 ORF361 vs ORF360
process: CH476629 ORF362 vs ORF363
process: CH476629 ORF362 vs ORF731
process: CH476629 OR

process: CH476629 ORF954 vs ORF953
process: CH476629 ORF957 vs ORF96
process: CH476629 ORF958 vs ORF96
process: CH476629 ORF963 vs ORF87
process: CH476629 ORF964 vs ORF965
process: CH476629 ORF965 vs ORF964
process: CH476629 ORF965 vs ORF966
process: CH476629 ORF966 vs ORF965
process: CH476629 ORF970 vs ORF80
process: CH476629 ORF977 vs ORF978
process: CH476629 ORF978 vs ORF66
process: CH476629 ORF978 vs ORF977
process: CH476629 ORF979 vs ORF58
process: CH476629 ORF987 vs ORF50
process: CH476629 ORF988 vs ORF50
process: CH476629 ORF993 vs ORF46
process: CH476629 ORF996 vs ORF43
process: CH476629 ORF1001 vs ORF37
process: CH476629 ORF1004 vs ORF36
process: CH476629 ORF1005 vs ORF36
process: CH476629 ORF1007 vs ORF33
process: CH476629 ORF1013 vs ORF15
process: CH476629 ORF1013 vs ORF16
process: CH476629 ORF1020 vs ORF1021
process: CH476629 ORF1021 vs ORF6
process: CH476629 ORF1021 vs ORF1020
process: CH476629 ORF1024 vs ORF1025
process: CH476629 ORF1025 vs ORF1024
process: CH476629 ORF10

process: CH476631 ORF641 vs ORF198
process: CH476631 ORF642 vs ORF197
process: CH476631 ORF655 vs ORF188
process: CH476631 ORF655 vs ORF656
process: CH476631 ORF656 vs ORF188
process: CH476631 ORF656 vs ORF655
process: CH476631 ORF657 vs ORF187
process: CH476631 ORF660 vs ORF186
process: CH476631 ORF666 vs ORF174
process: CH476631 ORF679 vs ORF166
process: CH476631 ORF685 vs ORF687
process: CH476631 ORF686 vs ORF687
process: CH476631 ORF687 vs ORF685
process: CH476631 ORF687 vs ORF686
process: CH476631 ORF698 vs ORF699
process: CH476631 ORF699 vs ORF698
process: CH476631 ORF699 vs ORF700
process: CH476631 ORF700 vs ORF699
process: CH476631 ORF702 vs ORF703
process: CH476631 ORF703 vs ORF150
process: CH476631 ORF703 vs ORF702
process: CH476631 ORF713 vs ORF129
process: CH476631 ORF717 vs ORF123
process: CH476631 ORF720 vs ORF119
process: CH476631 ORF722 vs ORF114
process: CH476631 ORF729 vs ORF730
process: CH476631 ORF730 vs ORF729
process: CH476631 ORF733 vs ORF734
process: CH476631 OR

process: CH476633 ORF539 vs ORF540
process: CH476633 ORF540 vs ORF539
process: CH476633 ORF544 vs ORF223
process: CH476633 ORF551 vs ORF212
process: CH476633 ORF554 vs ORF203
process: CH476633 ORF555 vs ORF202
process: CH476633 ORF556 vs ORF200
process: CH476633 ORF560 vs ORF198
process: CH476633 ORF562 vs ORF195
process: CH476633 ORF562 vs ORF196
process: CH476633 ORF562 vs ORF197
process: CH476633 ORF565 vs ORF193
process: CH476633 ORF565 vs ORF194
process: CH476633 ORF566 vs ORF192
process: CH476633 ORF567 vs ORF568
process: CH476633 ORF568 vs ORF567
process: CH476633 ORF572 vs ORF189
process: CH476633 ORF573 vs ORF189
process: CH476633 ORF580 vs ORF184
process: CH476633 ORF581 vs ORF582
process: CH476633 ORF582 vs ORF581
process: CH476633 ORF594 vs ORF170
process: CH476633 ORF610 vs ORF157
process: CH476633 ORF611 vs ORF612
process: CH476633 ORF612 vs ORF156
process: CH476633 ORF612 vs ORF611
process: CH476633 ORF612 vs ORF613
process: CH476633 ORF613 vs ORF156
process: CH476633 OR

process: CH476635 ORF497 vs ORF498
process: CH476635 ORF498 vs ORF497
process: CH476635 ORF504 vs ORF505
process: CH476635 ORF505 vs ORF200
process: CH476635 ORF505 vs ORF504
process: CH476635 ORF506 vs ORF199
process: CH476635 ORF507 vs ORF508
process: CH476635 ORF508 vs ORF507
process: CH476635 ORF510 vs ORF196
process: CH476635 ORF516 vs ORF184
process: CH476635 ORF522 vs ORF177
process: CH476635 ORF523 vs ORF176
process: CH476635 ORF529 vs ORF167
process: CH476635 ORF535 vs ORF164
process: CH476635 ORF542 vs ORF159
process: CH476635 ORF547 vs ORF153
process: CH476635 ORF555 vs ORF146
process: CH476635 ORF555 vs ORF147
process: CH476635 ORF557 vs ORF144
process: CH476635 ORF558 vs ORF559
process: CH476635 ORF559 vs ORF558
process: CH476635 ORF576 vs ORF577
process: CH476635 ORF577 vs ORF576
process: CH476635 ORF582 vs ORF583
process: CH476635 ORF583 vs ORF124
process: CH476635 ORF583 vs ORF125
process: CH476635 ORF583 vs ORF582
process: CH476635 ORF595 vs ORF596
process: CH476635 OR

process: CH476637 ORF473 vs ORF145
process: CH476637 ORF473 vs ORF146
process: CH476637 ORF477 vs ORF142
process: CH476637 ORF482 vs ORF483
process: CH476637 ORF483 vs ORF482
process: CH476637 ORF488 vs ORF132
process: CH476637 ORF489 vs ORF131
process: CH476637 ORF497 vs ORF498
process: CH476637 ORF498 vs ORF497
process: CH476637 ORF502 vs ORF503
process: CH476637 ORF503 vs ORF502
process: CH476637 ORF509 vs ORF118
process: CH476637 ORF516 vs ORF108
process: CH476637 ORF518 vs ORF107
process: CH476637 ORF520 vs ORF104
process: CH476637 ORF520 vs ORF105
process: CH476637 ORF532 vs ORF96
process: CH476637 ORF532 vs ORF97
process: CH476637 ORF533 vs ORF94
process: CH476637 ORF536 vs ORF91
process: CH476637 ORF545 vs ORF78
process: CH476637 ORF550 vs ORF72
process: CH476637 ORF551 vs ORF70
process: CH476637 ORF554 vs ORF68
process: CH476637 ORF559 vs ORF67
process: CH476637 ORF573 vs ORF58
process: CH476637 ORF582 vs ORF46
process: CH476637 ORF584 vs ORF43
process: CH476637 ORF584 vs ORF4

process: CH476639 ORF529 vs ORF530
process: CH476639 ORF530 vs ORF40
process: CH476639 ORF530 vs ORF529
process: CH476639 ORF531 vs ORF37
process: CH476639 ORF537 vs ORF26
process: CH476639 ORF537 vs ORF27
process: CH476639 ORF537 vs ORF28
process: CH476639 ORF540 vs ORF14
process: CH476639 ORF547 vs ORF548
process: CH476639 ORF548 vs ORF547
process: CH476639 ORF549 vs ORF550
process: CH476639 ORF550 vs ORF10
process: CH476639 ORF550 vs ORF549
process: CH476639 ORF553 vs ORF1
process: CH476641 ORF6 vs ORF511
process: CH476641 ORF7 vs ORF507
process: CH476641 ORF11 vs ORF506
process: CH476641 ORF15 vs ORF503
process: CH476641 ORF17 vs ORF501
process: CH476641 ORF19 vs ORF499
process: CH476641 ORF22 vs ORF21
process: CH476641 ORF35 vs ORF490
process: CH476641 ORF36 vs ORF490
process: CH476641 ORF38 vs ORF488
process: CH476641 ORF41 vs ORF40
process: CH476641 ORF42 vs ORF484
process: CH476641 ORF45 vs ORF482
process: CH476641 ORF50 vs ORF478
process: CH476641 ORF51 vs ORF478
process: CH47

process: CH476643 ORF137 vs ORF259
process: CH476643 ORF138 vs ORF257
process: CH476643 ORF142 vs ORF251
process: CH476643 ORF143 vs ORF251
process: CH476643 ORF145 vs ORF248
process: CH476643 ORF145 vs ORF249
process: CH476643 ORF148 vs ORF247
process: CH476643 ORF156 vs ORF155
process: CH476643 ORF157 vs ORF245
process: CH476643 ORF162 vs ORF240
process: CH476643 ORF169 vs ORF233
process: CH476643 ORF172 vs ORF229
process: CH476643 ORF179 vs ORF223
process: CH476643 ORF181 vs ORF218
process: CH476643 ORF183 vs ORF217
process: CH476643 ORF187 vs ORF186
process: CH476643 ORF188 vs ORF213
process: CH476643 ORF198 vs ORF204
process: CH476643 ORF204 vs ORF198
process: CH476643 ORF213 vs ORF188
process: CH476643 ORF217 vs ORF183
process: CH476643 ORF218 vs ORF181
process: CH476643 ORF221 vs ORF222
process: CH476643 ORF222 vs ORF221
process: CH476643 ORF223 vs ORF179
process: CH476643 ORF224 vs ORF225
process: CH476643 ORF225 vs ORF224
process: CH476643 ORF229 vs ORF172
process: CH476643 OR

process: CH476647 ORF166 vs ORF168
process: CH476647 ORF167 vs ORF168
process: CH476647 ORF168 vs ORF82
process: CH476647 ORF168 vs ORF166
process: CH476647 ORF168 vs ORF167
process: CH476647 ORF170 vs ORF172
process: CH476647 ORF171 vs ORF172
process: CH476647 ORF172 vs ORF81
process: CH476647 ORF172 vs ORF170
process: CH476647 ORF172 vs ORF171
process: CH476647 ORF178 vs ORF180
process: CH476647 ORF179 vs ORF180
process: CH476647 ORF180 vs ORF178
process: CH476647 ORF180 vs ORF179
process: CH476647 ORF183 vs ORF74
process: CH476647 ORF185 vs ORF70
process: CH476647 ORF187 vs ORF65
process: CH476647 ORF191 vs ORF192
process: CH476647 ORF192 vs ORF191
process: CH476647 ORF197 vs ORF53
process: CH476647 ORF197 vs ORF54
process: CH476647 ORF211 vs ORF42
process: CH476647 ORF219 vs ORF32
process: CH476647 ORF224 vs ORF25
process: CH476647 ORF229 vs ORF19
process: CH476647 ORF232 vs ORF15
process: CH476647 ORF232 vs ORF16
process: CH476647 ORF233 vs ORF12
process: CH476647 ORF239 vs ORF8
p

In [317]:
for o in object_list:
    print(len(o.dna_2.keys()))

1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
5
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
2
2
1
2
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
2
1
1
1
1
2
1
1
1
2
1
1
1
3
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
2
1
2
1
1
1
3
3
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
3
2
2
1
2
1
1
1
1
2
1
1
1
2
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
2
1
1
2
2
1
1
1
1
2
1
1
1
1
2
1
3
1
3
1
1
1
2
1
1
1
1
1
2
1
1
4
1
2
2
2
1
1
1
3
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
2
2
1
1
1
1
1
1
1
1
1
2
1
2
1
1
1
1
1
3
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
1
1
1
1
1
1
2
1
2
2
1
1
1
1
1
1
1
1
2
2
1
1
1
1
2
1
1
1
1
1
1
1
1
2
2
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
2
1
1
1
2
1
1
1
2
1
1
1
1
2
1
1
2
1
1
1
2
3
1
1
1
1
1
1
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
3
3
1
1
1
1
1
2
1
2
3
1
2
1
1
2
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
3
1
1
1
2
1
2
1
1
1
1
1
1
2
3
1
1
1
1
1
1
1
1
2
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
2
2
1
1
2
1
1


In [291]:
Inter.__dict__

{'chrom': 'DS267916',
 'genome': './Sclerotinia_sclerotiorum_orgin_data/Sclerotinia_sclerotiorum.ASM14694v1.dna.toplevel.fa',
 'min_orf': 300,
 'min_dna_inter': 300,
 'dna_1_name': 'DS267916:ORF1:1844:2422:+',
 'dna_1': Seq('ATGCCACCTGTACGATCAAAGAAATATAAAGAACGAGTCGAACTTGCTTTAATA...AGC', SingleLetterAlphabet()),
 'prot_1_name': 'DS267916:ORF1:1844:2422:+',
 'prot_1': Seq('MPPVRSKKYKERVELALIIESEGFEMPPCTYCEKHSRRCIVAKENSNRCSECAR...QGS', ExtendedIUPACProtein()),
 'orfs2': ['ORF1:1844:2422:+',
  'ORF2:5169:5507:+',
  'ORF3:10781:11242:+',
  'ORF4:12330:12701:+',
  'ORF5:12774:12307:-',
  'ORF6:7947:7630:-',
  'ORF7:7290:4213:-',
  'ORF8:3990:2728:-',
  'ORF9:2642:2310:-',
  'ORF10:985:680:-'],
 'global_orf': {'DS267916:ORF1_vs_ORF9:1844:2642:+': 'ATGCCACCTGTACGATCAAAGAAATATAAAGAACGAGTCGAACTTGCTTTAATAATTGAATCTGAAGGGTTCGAGATGCCTCCATGTACTTATTGTGAAAAACATTCACGTCGTTGTATTGTTGCTAAGGAAAATTCCAATCGTTGTAGTGAATGCGCTCGTCGGGGCCAGCGATGTGATATATCGGGTCCTTCTTCCGGAGATATGACGAGCATTATTCAGGAGCAGGAGCGTTTGGATCGCGAGCGC

1 2
1 3
1 4
2 3
2 4
3 4


In [354]:
import itertools
import os
import re
import subprocess
import tempfile
from pybedtools import bedtool
import argparse

from collections import defaultdict
from Bio import SeqIO, Seq


def get_data(args):
    
    fasta_dna_dict = dict()
    orfs_dict = defaultdict(list)

    for record in SeqIO.parse(args.input_dna, 'fasta'):

        spl = re.split(' |_|\[|\]', record.description)

        if int(spl[3]) < int(spl[5]):
            fasta_dna_dict[f'{spl[0]}:ORF{spl[1]}:{spl[3]}:{spl[5]}:+'] = record.seq
            orfs_dict[spl[0]].append(f"ORF{spl[1]}:{spl[3]}:{spl[5]}:+")
        else:
            fasta_dna_dict[f'{spl[0]}:ORF{spl[1]}:{spl[3]}:{spl[5]}:-'] = record.seq
            orfs_dict[spl[0]].append(f"ORF{spl[1]}:{spl[3]}:{spl[5]}:-")
    
    return fasta_dna_dict, orfs_dict

class Intersection:
    
    def __init__(self, args, dna_1_name, dna_1, dna_2_name, dna_2):
        
        self.genome = args.input_genome
        self.min_orf, self.min_dna_inter = args.min_orf, args.interval_length
        
        self.chrom = dna_1_name.split(':')[0]
        
        self.global_orf_name, self.global_orf = '', ''
        
        self.dna_1_name, self.dna_1 = dna_1_name, dna_1
        self.dna_2_name, self.dna_2 = dna_2_name, dna_2  
        
        self.prot_1_name, self.prot_1 = dna_1_name, Seq.translate(dna_1)
        self.prot_2_name, self.prot_2 = dna_2_name, Seq.translate(dna_2)
        
        
        self.inter_dna_1_name, self.inter_dna_1 = '', ''
        self.inter_dna_2_name, self.inter_dna_2 = '', ''
        
        self.inter_prot_1_name, self.inter_prot_1 = '', ''
        self.inter_prot_2_name, self.inter_prot_2 = '', ''
        
        self.status = False
        
    def _get_interval(self, x, y, strand):
            
            if strand == "+":
                a = bedtool.BedTool(f'{self.chrom}\t{x-1}\t{y}\t.\t0\t{strand}',
                                    from_string=True)
                a = a.sequence(fi=args.input_genome, name=True, s=True)
                a = open(a.seqfn).read().lstrip('>').split('\n')
                
            elif strand == '-':
                a = bedtool.BedTool(f'{self.chrom}\t{y-1}\t{x}\t.\t0\t{strand}',
                                    from_string=True)
                a = a.sequence(fi=args.input_genome, s=True)
                a = open(a.seqfn).read().lstrip('>').split('\n')
            
            return a[1]
        
    def process_intersection(self):
        
        d1, d2 = self.dna_1_name.split(':'), self.dna_2_name.split(':')
        orf1, orf2 = d1[1], d2[1]
        x1, y1, x2, y2 = int(d1[2]), int(d1[3]), int(d2[2]), int(d2[3])
        strand1, strand2 = d1[4], d2[4]
        
        if all([strand1=='+', strand2=='+', x1<y2, y1>x2,\
                all([abs(x1-y2)>=self.min_dna_inter, abs(x2-y1)>=self.min_dna_inter])]):
            
            if all([x1>x2, y1>y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':LEFT_INTERSECTION'
                self.dna_2_name = self.dna_2_name + ':RIGHT_INTERSECTION'
                
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:+'
                self.inter_dna_1 = self._get_interval(x1, y2, '+')
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:+'
                self.inter_dna_2 = self.inter_dna_1
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:+'
                self.inter_prot_1 = self.prot_1[0:abs(y2-x1)//3]
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:+'
                self.inter_prot_2 = self.prot_2[abs(x1-x2)//3:(abs(x1-x2)+abs(y2-x1))//3]
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:+'
                self.global_orf = self._get_interval(x2, y1, '+')
                
                self.status = True
                    
            elif all([x1<x2, y1<y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':RIGHT_INTERSECTION'
                self.dna_2_name = self.dna_2_name + ':LEFT_INTERSECTION'
                
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:+'
                self.inter_dna_1 = self._get_interval(x2, y1, '+')
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:+'
                self.inter_dna_2 = self.inter_dna_1
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:+'
                self.inter_prot_1 = self.prot_1[abs(x2-x1)//3:(abs(x2-x1)+abs(y1-x2))//3]
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:+'
                self.inter_prot_2 = self.prot_2[0:abs(y1-x2)//3]
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:+'
                self.global_orf = self._get_interval(x1, y2, '+')
                
                self.status = True
                    
            elif all([x1<x2, y1>y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':OUTSIDE'
                self.dna_2_name = self.dna_2_name + ':INSIDE'
                
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'
                self.inter_dna_1 = self.dna_2
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'
                self.inter_dna_2 = self.dna_2
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'
                self.inter_prot_1 = self.prot_1[abs(x2-x1)//3:(abs(x2-x1)+abs(y2-x2))//3]
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'
                self.inter_prot_2 = self.prot_2
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+'
                self.global_orf = self.dna_1
                
                self.status = True
                
            elif all([x1<x2, y1<y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':INSIDE'
                self.dna_2_name = self.dna_2_name + ':OUTSIDE'

                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+'
                self.inter_dna_1 = self.dna_1
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+'
                self.inter_dna_2 = self.dna_1
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{x1}:+'
                self.inter_prot_1 = self.prot_1
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+'
                self.inter_prot_2 = self.prot_2[abs(x1-x2)//3:(abs(x1-x2)+abs(y1-x1))//3]
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'
                self.global_orf = self.dna_2
                
                self.status = True

        elif all([strand1=='-', strand2=='-', x1>y2, y1<x2,\
                all([abs(x1-y2)>=self.min_dna_inter, abs(x2-y1)>=self.min_dna_inter])]):
            
            if all([x1>x2, y1>y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':LEFT_INTERSECTION'
                self.dna_2_name = self.dna_2_name + ':RIGHT_INTERSECTION'

                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:-'
                self.inter_dna_1 = self._get_interval(x2, y1, '-')
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:-'
                self.inter_dna_2 = self.inter_dna_1
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y1}:-'
                self.inter_prot_1 = self.prot_1[abs(x2-x1)//3:(abs(x2-x1)+abs(y1-x2))//3]
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{y1}:-'
                self.inter_prot_2 = self.prot_2[0:abs(y1-x2)//3]
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{x1}:+'
                self.global_orf = self._get_interval(y2, x1, '+')
                
                self.status = True
                    
            elif all([x1<x2, y1<y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')

                self.dna_1_name = self.dna_1_name + ':RIGHT_INTERSECTION'
                self.dna_2_name = self.dna_2_name + ':LEFT_INTERSECTION'
                
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:-'
                self.inter_dna_1 = self._get_interval(x1, y2, '-')
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:-'
                self.inter_dna_2 = self.inter_dna_1
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:-'
                self.inter_prot_1 = self.prot_1[0:abs(y2-x1)//3]
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y2}:-'
                self.inter_prot_2 = self.prot_2[abs(x1-x2)//3:(abs(x1-x2)+abs(y2-x1))//3]
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{x2}:+'
                self.global_orf = self._get_interval(y1, x2, '+')
                
                self.status = True   
                
            elif all([x1<x2, y1>y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':INSIDE'
                self.dna_2_name = self.dna_2_name + ':OUTSIDE'
                
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'
                self.inter_dna_1 = self.dna_1
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'
                self.inter_dna_2 = self.dna_1
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'
                self.inter_prot_1 = self.prot_1
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'
                self.inter_prot_2 = self.prot_2[abs(x1-x2)//3:(abs(x1-x2)+abs(y1-x1))//3]
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'
                self.global_orf = self._get_interval(x2, y2, '-')
                
                self.status = True    
                
            elif all([x1>x2, y1<y2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':OUTSIDE'
                self.dna_2_name = self.dna_2_name + ':INSIDE'
                    
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'
                self.inter_dna_1 = self.dna_2
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'
                self.inter_dna_2 = self.dna_2
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'
                self.inter_prot_1 = self.prot_1[abs(x2-x1)//3:(abs(x2-x1)+abs(y2-x2))//3]
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'
                self.inter_prot_2 = self.prot_2 
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'
                self.global_orf = self.dna_1
                
                self.status = True
                
        elif all([strand1=='+', strand2=='-', x1<x2, y1>y2,\
                all([abs(x1-x2)>=self.min_dna_inter, abs(y1-y2)>=self.min_dna_inter])]):
            
            if all([x1>y2, y1>x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':LEFT_INTERSECTION'
                self.dna_2_name = self.dna_2_name + ':RIGHT_INTERSECTION'
                
                
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{x2}:+'
                self.inter_dna_1 = self._get_interval(x1, x2, '+')
                
                self.inter_dna_2 = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{x1}:-'
                self.inter_dna_2 = self._get_interval(x2, x1, '-')
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{x2}:+'
                self.inter_prot_1 = self.prot_1[0:abs(x2-x1)//3]
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{x1}:-'
                self.inter_prot_2 = self.prot_2[0:abs(x1-x2)//3]
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{y1}:+'
                self.global_orf = self._get_interval(y2, y1, "+")
                
                self.status = True  
                
            elif all([x1<y2, y1<x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')

                self.dna_1_name = self.dna_1_name + ':RIGHT_INTERSECTION'
                self.dna_2_name = self.dna_2_name + ':LEFT_INTERSECTION'
                
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{y1}:+'
                self.inter_dna_1 = self._get_interval(y2, y1, '+')
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{y2}:-'
                self.inter_dna_2 = self._get_interval(y1, y2, '-')
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{y1}:+'
                self.inter_prot_1 = self.prot_1[abs(y2-x1)//3:(abs(y2-x1)+abs(y1-y2))//3]
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{y2}:-'
                self.inter_prot_2 = self.prot_2[abs(y1-x2)//3:(abs(y1-x2)+abs(y2-y1))//3]
                             
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{x2}:+'
                self.global_orf = self._get_interval(x1, x2, "+")
                
                self.status = True
                
            elif all([x1<y2, y1>x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':OUTSIDE'
                self.dna_2_name = self.dna_2_name + ':INSIDE'
                
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{x2}:+'
                self.inter_dna_1 = self._get_interval(y2, x2, '+')
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'
                self.inter_dna_2 = self.dna_2
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{x2}:+'
                self.inter_prot_1 = self.prot_1[abs(y2-x1)//3:(abs(y2-x1)+abs(x2-y2))//3]
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'
                self.inter_prot_2 = self.prot_2
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+'
                self.global_orf = self._get_interval(x1, y1, "+")
                
                self.status = True    
                
            elif all([x1>y2, y1<x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':INSIDE'
                self.dna_2_name = self.dna_2_name + ':OUTSIDE'

                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+'
                self.inter_dna_1 = self.dna_1
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{x1}:-'
                self.inter_dna_2 = self._get_interval(y1, x1, '-')
                
                self.inter_prot_1_name =f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:+'
                self.inter_prot_1 = self.prot_1
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{x1}:-'
                self.inter_prot_2 = self.prot_2[abs(y1-x2)//3:(abs(y1-x2)+abs(x1-y1))//3]
                    
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:-'
                self.global_orf = self._get_interval(x2 ,y2, "-")
                
                self.status = True
                
        elif all([strand1=='-', strand2=='+', x1>x2, y1<y2,\
                all([abs(x1-x2)>=self.min_dna_inter, abs(y1-y2)>=self.min_dna_inter])]):
            
            if all([x1<y2, y1<x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':RIGHT_INTERSECTION'
                self.dna_2_name = self.dna_2_name + ':LEFT_INTERSECTION'
                
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{x2}:-'
                self.inter_dna_1 = self._get_interval(x1, x2, "-")
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{x1}:+'
                self.inter_dna_2 = self._get_interval(x2, x1, '+')
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{x2}:-'
                self.inter_prot_1 = self.prot_1[0:abs(x2-x1)//3]
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{x1}:+'
                self.inter_prot_2 = self.prot_2[0:abs(x1-x2)//3]
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{y2}:+'
                self.global_orf = self._get_interval(y1 ,y2, "+") 
                
                self.status = True
                
            elif all([x1>y2, y1>x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':LEFT_INTERSECTION'
                self.dna_2_name = self.dna_2_name + ':RIGHT_INTERSECTION'
                
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{y1}:-'
                self.inter_dna_1 = self._get_interval(y2, y1, "-")
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{y2}:+'
                self.inter_dna_2 = self._get_interval(y1, y2, '+')
                
                self.inter_prot_1 = f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{y1}:-'
                self.inter_prot_1 = self.prot_1[abs(y2-x1)//3:(abs(y2-x1)+abs(x2-y2))//3]
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{y2}:+'
                self.inter_prot_2 = self.prot_2[abs(y1-x2)//3:(abs(y1-x2)+abs(y2-y1))//3]
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{x1}:+'
                self.global_orf = self._get_interval(x2 ,x1, "+")
                
                self.status = True
                
            elif all([x1>y2, y1<x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':OUTSIDE'
                self.dna_2_name = self.dna_2_name + ':INSIDE'
                
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{x2}:-'
                self.inter_dna_1 = self._get_interval(y2, x2, "-")
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'
                self.inter_dna_2 = self.dna_2
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y2}:{x2}:-'
                self.inter_prot_1 = self.prot_1[(y2-x1)//3:(y2-x1+x2-y2)//3]
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'
                self.inter_prot_2 = self.prot_2
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'
                self.global_orf = self._get_interval(x1 ,y1, "-")
                
                self.status = True
                
            elif all([x1<y2, y1>x2]):
                
                print(f'process: {self.chrom} {orf1} vs {orf2}')
                
                self.dna_1_name = self.dna_1_name + ':INSIDE'
                self.dna_2_name = self.dna_2_name + ':OUTSIDE'
                
                self.inter_dna_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'
                self.inter_dna_1 = self.dna_1
                
                self.inter_dna_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y1}:{x1}:+'
                self.inter_dna_2 = self._get_interval(y1, x1, "+")
                
                self.inter_prot_1_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x1}:{y1}:-'
                self.inter_prot_1 = self.prot_1
                
                self.inter_prot_2_name = f'{self.chrom}:{orf1}_vs_{orf2}:{y1-x2}:{y1-x2+x1-y1}:+'
                self.inter_prot_2 = self.prot_2[abs(y1-x2)//3:(abs(y1-x2)+abs(x1-y1))//3]
                
                self.global_orf_name = f'{self.chrom}:{orf1}_vs_{orf2}:{x2}:{y2}:+'
                self.global_orf = self.dna_2
                
                self.status = True
        
    def get_full_batch(self):
        
        return f">GLOBAL_ORF:{self.global_orf_name}\n{self.global_orf}\n"\
                f">DNA_MAIN:{self.dna_1_name}\n{self.dna_1}\n"\
                f">DNA_ALT:{self.dna_2_name}\n{self.dna_2}\n"\
                f">INTERSECTION_DNA_MAIN:{self.inter_dna_1_name}\n{self.inter_dna_1}\n"\
                f">INTERSECTION_DNA_ALT:{self.inter_dna_2_name}\n{self.inter_dna_2}\n"\
                f">PROT_MAIN:{self.prot_1_name}\n{self.prot_1}\n"\
                f">PROT_ALT:{self.prot_2_name}\n{self.prot_2}\n"\
                f">PROT_INTERSECTED_MAIN:{self.inter_prot_1_name}\n{self.inter_prot_1}\n"\
                f">PROT_INTERSECTED_ALT:{self.inter_prot_2_name}\n{self.inter_prot_2}\n"\
                f"=========================================================\n"
     
        
def main(args):
    
    fasta_dna_dict, orfs_dict = get_data(args)
    
    with open(args.output, 'w') as w:
        for chrom, orfs in orfs_dict.items():
            for dna1_name, dna2_name in itertools.combinations(orfs, 2):
                if all([len(fasta_dna_dict[f"{chrom}:{dna1_name}"])>=args.min_orf,
                        len(fasta_dna_dict[f"{chrom}:{dna2_name}"])>=args.min_orf]):

                    Inter = Intersection(args,
                                         f"{chrom}:{dna1_name}", fasta_dna_dict[f"{chrom}:{dna1_name}"],\
                                         f"{chrom}:{dna2_name}", fasta_dna_dict[f"{chrom}:{dna2_name}"])
                    Inter.process_intersection()
                    if Inter.status:
                        w.write(Inter.get_full_batch())

if __name__ == '__main__':

#     parser = argparse.ArgumentParser(
#         formatter_class=argparse.ArgumentDefaultsHelpFormatter)

#     parser.add_argument('-input_dna', type=str,
#                         help='The address to the input predicted dna from the EMBOSS getorf find 3.')
#     parser.add_argument('-input_genome', type=str, help='A Path to the input genome.')
#     parser.add_argument('-min_orf', type=int, default=300, help='Minimum ORF length.')
#     parser.add_argument('-interval_length', type=int, default=100, help='Minimum length of interval of intersection between 2 ORF.')
#     parser.add_argument('-output', type=str, default='output.fasta', help='The address to the output.')

#     args = parser.parse_args()

    class Args():
        def __init__(self):
            self.input_dna = './input_predicted_prots_and_dna/Scler_dna_pred'
            self.input_genome = './Sclerotinia_sclerotiorum_orgin_data/Sclerotinia_sclerotiorum.ASM14694v1.dna.toplevel.fa'
            self.min_orf = 600
            self.output = 'output'
            self.interval_length = 300

    args = Args()

    main(args)

process: CH476621 ORF5 vs ORF6
process: CH476621 ORF29 vs ORF1776
process: CH476621 ORF30 vs ORF1776
process: CH476621 ORF56 vs ORF1753
process: CH476621 ORF91 vs ORF1716
process: CH476621 ORF135 vs ORF1662
process: CH476621 ORF136 vs ORF137
process: CH476621 ORF159 vs ORF1634
process: CH476621 ORF268 vs ORF1515
process: CH476621 ORF270 vs ORF1512
process: CH476621 ORF285 vs ORF286
process: CH476621 ORF285 vs ORF1500
process: CH476621 ORF285 vs ORF1501
process: CH476621 ORF286 vs ORF1500
process: CH476621 ORF286 vs ORF1501
process: CH476621 ORF288 vs ORF1499
process: CH476621 ORF296 vs ORF1491
process: CH476621 ORF444 vs ORF1360


KeyboardInterrupt: 

In [350]:
if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-input_dna', type=str,
                        help='The address to the input predicted dna from the EMBOSS getorf find 3.')
    parser.add_argument('-input_genome', type=str, help='A Path to the input genome.')
    parser.add_argument('-min_orf', type=int, default=300, help='Minimum ORF length.')
    parser.add_argument('-interval_length', type=int, default=100, help='Minimum length of interval of intersection between 2 ORF.')
    parser.add_argument('-output', type=str, default='output.fasta', help='The address to the output.')

    args = parser.parse_args()

#     class Args():
#         def __init__(self):
#             self.input_dna = 'Scler_orf_prediction/Scler_dna_pred'
#             self.input_genome = 'data/Sclerotinia_sclerotiorum.ASM14694v1.dna.toplevel.fa'
#             self.min_orf = 300
#             self.output = 'output'
#             self.interval_length = 300

#     args = Args()

    main(args)


process: CH476621 ORF5 vs ORF6
process: CH476621 ORF8 vs ORF1805
process: CH476621 ORF16 vs ORF1792
process: CH476621 ORF18 vs ORF1789
process: CH476621 ORF29 vs ORF1776
process: CH476621 ORF30 vs ORF1776
process: CH476621 ORF41 vs ORF1770
process: CH476621 ORF42 vs ORF1767
process: CH476621 ORF45 vs ORF1766
process: CH476621 ORF47 vs ORF1761
process: CH476621 ORF48 vs ORF1758
process: CH476621 ORF54 vs ORF1754
process: CH476621 ORF56 vs ORF1752
process: CH476621 ORF56 vs ORF1753
process: CH476621 ORF56 vs ORF1754
process: CH476621 ORF72 vs ORF1734
process: CH476621 ORF74 vs ORF1733
process: CH476621 ORF80 vs ORF1728
process: CH476621 ORF86 vs ORF1724
process: CH476621 ORF87 vs ORF1722
process: CH476621 ORF88 vs ORF1722
process: CH476621 ORF89 vs ORF1722
process: CH476621 ORF91 vs ORF1716
process: CH476621 ORF96 vs ORF1710
process: CH476621 ORF97 vs ORF1708
process: CH476621 ORF101 vs ORF1702
process: CH476621 ORF101 vs ORF1703
process: CH476621 ORF103 vs ORF104
process: CH476621 ORF11

process: CH476621 ORF821 vs ORF954
process: CH476621 ORF836 vs ORF942
process: CH476621 ORF849 vs ORF926
process: CH476621 ORF850 vs ORF925
process: CH476621 ORF856 vs ORF917
process: CH476621 ORF874 vs ORF875
process: CH476621 ORF874 vs ORF899
process: CH476621 ORF875 vs ORF899
process: CH476621 ORF877 vs ORF896
process: CH476621 ORF878 vs ORF879
process: CH476621 ORF882 vs ORF893
process: CH476621 ORF897 vs ORF898
process: CH476621 ORF916 vs ORF917
process: CH476621 ORF930 vs ORF933
process: CH476621 ORF931 vs ORF933
process: CH476621 ORF932 vs ORF933
process: CH476621 ORF979 vs ORF980
process: CH476621 ORF992 vs ORF993
process: CH476621 ORF1031 vs ORF1032
process: CH476621 ORF1042 vs ORF1043
process: CH476621 ORF1045 vs ORF1046
process: CH476621 ORF1053 vs ORF1054
process: CH476621 ORF1063 vs ORF1064
process: CH476621 ORF1067 vs ORF1068
process: CH476621 ORF1073 vs ORF1074
process: CH476621 ORF1080 vs ORF1081
process: CH476621 ORF1094 vs ORF1095
process: CH476621 ORF1110 vs ORF1111


process: CH476622 ORF571 vs ORF1201
process: CH476622 ORF571 vs ORF1202
process: CH476622 ORF575 vs ORF1195
process: CH476622 ORF579 vs ORF1193
process: CH476622 ORF583 vs ORF1187
process: CH476622 ORF585 vs ORF1185
process: CH476622 ORF586 vs ORF1185
process: CH476622 ORF587 vs ORF1183
process: CH476622 ORF591 vs ORF1178
process: CH476622 ORF596 vs ORF1175
process: CH476622 ORF597 vs ORF598
process: CH476622 ORF599 vs ORF1172
process: CH476622 ORF605 vs ORF1165
process: CH476622 ORF607 vs ORF1163
process: CH476622 ORF612 vs ORF1152
process: CH476622 ORF613 vs ORF1150
process: CH476622 ORF616 vs ORF1145
process: CH476622 ORF617 vs ORF1144
process: CH476622 ORF618 vs ORF1144
process: CH476622 ORF620 vs ORF1139
process: CH476622 ORF621 vs ORF1134
process: CH476622 ORF628 vs ORF1132
process: CH476622 ORF630 vs ORF1130
process: CH476622 ORF631 vs ORF1127
process: CH476622 ORF634 vs ORF1125
process: CH476622 ORF634 vs ORF1126
process: CH476622 ORF635 vs ORF1124
process: CH476622 ORF636 vs O

process: CH476623 ORF338 vs ORF1332
process: CH476623 ORF341 vs ORF1328
process: CH476623 ORF342 vs ORF1328
process: CH476623 ORF350 vs ORF1326
process: CH476623 ORF350 vs ORF1327
process: CH476623 ORF351 vs ORF1324
process: CH476623 ORF351 vs ORF1325
process: CH476623 ORF353 vs ORF1321
process: CH476623 ORF353 vs ORF1322
process: CH476623 ORF360 vs ORF1313
process: CH476623 ORF362 vs ORF1308
process: CH476623 ORF363 vs ORF1308
process: CH476623 ORF364 vs ORF1306
process: CH476623 ORF366 vs ORF1304
process: CH476623 ORF366 vs ORF1305
process: CH476623 ORF367 vs ORF1302
process: CH476623 ORF367 vs ORF1303
process: CH476623 ORF368 vs ORF1301
process: CH476623 ORF369 vs ORF1299
process: CH476623 ORF372 vs ORF1297
process: CH476623 ORF373 vs ORF1295
process: CH476623 ORF380 vs ORF1278
process: CH476623 ORF386 vs ORF1272
process: CH476623 ORF387 vs ORF1269
process: CH476623 ORF393 vs ORF1264
process: CH476623 ORF395 vs ORF1258
process: CH476623 ORF396 vs ORF1256
process: CH476623 ORF397 vs 

KeyboardInterrupt: 

In [346]:
x.__dict__

{'genome': './Sclerotinia_sclerotiorum_orgin_data/Sclerotinia_sclerotiorum.ASM14694v1.dna.toplevel.fa',
 'min_orf': 300,
 'min_dna_inter': 300,
 'chrom': 'CH476621',
 'global_orf_name': 'CH476621:ORF5_vs_ORF6:21535:22390:+',
 'global_orf': 'ATGGCGACCCGATGTCTTACACGCAGATGTGCTGGGCTTACCTCCGCTCCTCCTACATCAGTTGTCCAAGGATTTATAAAGACAGCCACGACAGCTCGACCATTCCATCAAAACACCCCGCGTCCGATCTTCGACTTCCTCGCTCCACGTTTGGGTGCTCCCTTCCAGTCATATGTTCTCAAGCGGAATAGGATATCAAGGGACTCTATTAGGTATTTCTCGAATACTGTGGTACGGAAAGCTACTGTCACTACATTCAATCCGCGAAAAAACGATGATGGGACAGAGCAGAAGATTGAAATTACACCCCGGGCTGCAGATGTACGTATTTTTGTCTTCCCCTCCTACCGCCCTCTTCAAGCACCGCAAATCCGATTATATACTAATTGTTTGTTCCCAGCGTCTACAACAACTGCGCAAAAAAGAGAACAACCCCAATCTCGCTCTCCGGGTCGAAGTCCAATCGGGGGGCTGCCACGGCTTCCAATATGTAATGTCTTTATGCGATCTCCCAGCCAATATCTCTCGAGATCTTTTCTCCAAAGACCCTGTCCAATCCGAATCTTTTTCCTCTTCCAGTACCTCATCCACAGATTCAAATGCCATGCCTACTTCATCTAGCTCTTCGAAGTCAAATACCTCAGCCACAATGGATCTTCGCGAAGATGATACAGTATTCAGCTACATTAGCGATCATTTACATGCCAACGTCGTTATGGATGTCTTTAGTCTTGACGCCTTGAAAGGTAGTAAAATAGA

In [329]:
import itertools
mylist = [1,2,3,4]
for a, b in itertools.combinations(mylist, 2):
    print(a, b)

1 2
1 3
1 4
2 3
2 4
3 4


In [1]:
import numpy as np
import pandas as pd

In [2]:
values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

In [7]:
df = pd.DataFrame({'A':[i for i in range(11)], 'B':[i for i in range(11)]})
df['B'] = np.nan
df

Unnamed: 0,A,B
0,0,
1,1,
2,2,
3,3,
4,4,
5,5,
6,6,
7,7,
8,8,
9,9,
