In [3]:
import pandas as pd
import json
from pathlib import Path
from Bio import SeqIO
from Bio.Seq import Seq

In [None]:
# -- global --

# JSON file that stores every available substitution matrix.
SUB_MATRICES_PATH = '../utils/substitution_matrices.json'

# Read the file inside a context manager so the handle is closed right away
# instead of staying open for the lifetime of the kernel.
with open(SUB_MATRICES_PATH) as matrices_file:
    data = json.load(matrices_file)

# Maps each matrix name to the value of its 'type' field; the validation code
# compares that value against the sequence type.
SUB_MATRICES = {name: entry['type'] for name, entry in data.items()}

# -- user defined variables: --

# mandatory
FASTA_1 = '../../example/protein_seq1.fasta'
FASTA_2 = '../../example/protein_seq2.fasta'

# optional
SEQ_TYPE = 'protein'
WINDOW_SIZE = 10
THRESHOLD = 23
SCORE_MATRIX = 'blosum62'


code

In [None]:
# ----------------------- mandatory -----------------------

In [None]:
def validate_path(path_str):
    """Report whether ``path_str`` refers to something that exists on disk.

    Parameters
    ----------
    path_str : str or path-like
        Location to check.

    Returns
    -------
    bool
        ``True`` when the path exists. Otherwise a warning is printed and
        ``False`` is returned.
    """
    candidate = Path(path_str)
    if candidate.exists():
        return True
    print(f'<!> Path {candidate} does not exist.')
    return False

# Validate both input files. A list (not a generator) is built on purpose so
# that every missing path is reported, not only the first one.
all([validate_path(fasta_path) for fasta_path in (FASTA_1, FASTA_2)])


In [None]:
def is_protein(fasta_file):
    """Tell whether a single-record FASTA file holds a protein sequence.

    The record is read with ``SeqIO.read`` and counts as protein when it is
    non-empty and every residue, compared case-insensitively, is one of the
    20 letters ``ACDEFGHIKLMNPQRSTVWY``.

    Note that A, C, G and T are all in that alphabet too, so a plain DNA
    sequence also passes this check.

    Parameters
    ----------
    fasta_file : str or path-like
        FASTA file to inspect.

    Returns
    -------
    bool
        ``True`` for a protein sequence, ``False`` otherwise. When the file
        cannot be read or parsed a warning is printed and ``False`` is
        returned.
    """
    try:
        record = SeqIO.read(fasta_file, 'fasta')
        residues = str(record.seq).upper()
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C and
        # SystemExit are not swallowed.
        print(f'<!> File {fasta_file} is not a valid fasta file.')
        return False
    # An empty record is rejected explicitly; a subset test alone would accept it.
    return bool(residues) and set(residues) <= set('ACDEFGHIKLMNPQRSTVWY')
    
def is_dna(fasta_file):
    """Tell whether a single-record FASTA file holds a DNA sequence.

    The record is read with ``SeqIO.read`` and counts as DNA when it is
    non-empty and every base, compared case-insensitively, is one of
    ``A``, ``C``, ``G`` or ``T``.

    Parameters
    ----------
    fasta_file : str or path-like
        FASTA file to inspect.

    Returns
    -------
    bool
        ``True`` for a DNA sequence, ``False`` otherwise. When the file
        cannot be read or parsed a warning is printed and ``False`` is
        returned.
    """
    try:
        record = SeqIO.read(fasta_file, 'fasta')
        bases = str(record.seq).upper()
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C and
        # SystemExit are not swallowed.
        print(f'<!> File {fasta_file} is not a valid fasta file.')
        return False
    # An empty record is rejected explicitly; a subset test alone would accept it.
    return bool(bases) and set(bases) <= set('ACGT')
    
def get_type(fasta_file):
    """Classify the sequence stored in ``fasta_file``.

    DNA is tested first. The letters A, C, G and T are also valid amino-acid
    letters, so testing for protein first would label every DNA sequence as
    protein and the ``'dna'`` result could never be produced. The flip side
    is that a peptide made only of A/C/G/T is reported as DNA.

    Parameters
    ----------
    fasta_file : str or path-like
        FASTA file to classify.

    Returns
    -------
    str or None
        ``'dna'``, ``'protein'``, or ``None`` when the sequence matches
        neither alphabet (or the file could not be read).
    """
    if is_dna(fasta_file):
        return 'dna'
    if is_protein(fasta_file):
        return 'protein'
    return None
    
# Auto-detect the sequence type from the first FASTA file. When detection
# fails (get_type returns None) keep the SEQ_TYPE configured earlier instead
# of overwriting it with None, which the later validation cannot handle.
SEQ_TYPE = get_type(FASTA_1) or SEQ_TYPE

In [None]:
def get_seq_from_fasta(fasta_file):
    """Read the single record of a FASTA file and tag its id with its type.

    The type reported by ``get_type`` (``'protein'``, ``'dna'`` or ``None``)
    is appended to the record id after a single space.

    Parameters
    ----------
    fasta_file : str or path-like
        FASTA file to load.

    Returns
    -------
    The record object returned by ``SeqIO.read``, with its ``id`` modified.
    """
    detected_type = get_type(fasta_file)
    record = SeqIO.read(fasta_file, 'fasta')
    id_suffix = ' {}'.format(detected_type)
    record.id = record.id + id_suffix
    return record

# Load the single record from each input file; get_seq_from_fasta also appends
# the detected sequence type to the record id.
SEQ_1 = get_seq_from_fasta(FASTA_1)
SEQ_2 = get_seq_from_fasta(FASTA_2)


In [None]:
# ------- optional parameters -------

In [None]:
def validate_substitution_matrix(matrix_name='blosum62', sequence_type='protein'):
    """Normalise a (matrix name, sequence type) pair against ``SUB_MATRICES``.

    Both inputs are compared case-insensitively. An unrecognised sequence
    type falls back to ``'protein'``. A matrix name that is not registered in
    ``SUB_MATRICES`` for that type falls back to ``'blosum62'`` (protein) or
    ``'DNAFull'`` (dna). A message is printed for each fallback and for the
    final choice.

    Parameters
    ----------
    matrix_name : str
        Requested substitution matrix. Non-string values such as ``None``
        are treated as an invalid name.
    sequence_type : str
        ``'protein'`` or ``'dna'``. Non-string values such as ``None`` are
        treated as an invalid type.

    Returns
    -------
    tuple
        ``(matrix_name, sequence_type)``. A matched matrix name is returned
        exactly as it is spelled in ``SUB_MATRICES``.
    """
    types = ['protein', 'dna']
    # str() first so that None (e.g. an undetected sequence type) goes through
    # the regular "invalid value" path instead of raising AttributeError.
    matrix_name = str(matrix_name).lower()
    sequence_type = str(sequence_type).lower()

    if sequence_type not in types:
        print('<!> Invalid sequence type. Must be one of: {}\n\t-- Taking protein as default--'.format(types))
        sequence_type = 'protein'

    keys = [k for k, v in SUB_MATRICES.items() if v == sequence_type]
    # Lookup table from lowercased name to the key as stored, so mixed-case
    # keys such as 'DNAFull' can still be matched.
    canonical = {k.lower(): k for k in keys}

    if matrix_name in canonical:
        matrix_name = canonical[matrix_name]
    else:
        if sequence_type == 'protein':
            print('<!> Invalid substitution matrix. Must be one of: {}\n\t'.format(keys))
            default_name = 'blosum62'
        else:
            print('<!> Invalid substitution matrix. Must be one of: {}'.format(keys))
            default_name = 'DNAFull'
        # Prefer the stored spelling of the default when it is registered.
        matrix_name = canonical.get(default_name.lower(), default_name)

    print(f'\n\t -- Taking {matrix_name} as substitution matrix for {sequence_type} sequences --')
    return matrix_name, sequence_type


In [None]:
def read_submat_from_json(matrix_name, json_file=SUB_MATRICES_PATH):
    """Load one substitution matrix from a JSON file into a DataFrame.

    The file is expected to map matrix names to objects carrying a
    ``'matrix'`` entry; that entry is handed to ``pd.DataFrame`` and the row
    labels are then set equal to the column labels.

    Parameters
    ----------
    matrix_name : str
        Key of the matrix inside the JSON file.
    json_file : str or path-like
        Path of the JSON file (defaults to ``SUB_MATRICES_PATH``).

    Returns
    -------
    pandas.DataFrame
        The matrix, indexed by its own column labels.

    Raises
    ------
    KeyError
        If ``matrix_name`` is not present in the file.
    """
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(json_file) as handle:
        matrices = json.load(handle)
    df = pd.DataFrame(matrices[matrix_name]['matrix'])
    df.index = df.columns
    return df

In [None]:
# def validate_threshold(w=WINDOW_SIZE,t=THRESHOLD):
#     # not using it for now 
#     if t>w:
#         print(f'<!> Threshold ({t}) must be less than window size ({w}).\n\t-- Taking threshold as half of window size --')
#         t = w//2
#     return t

def validate_positive_integers(w=WINDOW_SIZE, t=THRESHOLD):
    """Ensure window size and threshold are both positive integers.

    If either value is not an ``int``, is a ``bool``, or is not greater than
    zero, a warning is printed and *both* values are replaced by the defaults
    (window size 10, threshold 23).

    Parameters
    ----------
    w : int
        Window size.
    t : int
        Threshold.

    Returns
    -------
    tuple
        ``(w, t)`` after validation.
    """
    def _is_positive_int(value):
        # bool is a subclass of int, so it has to be excluded explicitly.
        return isinstance(value, int) and not isinstance(value, bool) and value > 0

    if not (_is_positive_int(w) and _is_positive_int(t)):
        print('<!> Window size and threshold must be positive integers. \n\t-- Taking default values')
        w = 10
        t = 23
    return w, t

In [None]:
def get_parameters(w=WINDOW_SIZE, t=THRESHOLD, sm=SCORE_MATRIX, stype=SEQ_TYPE, sub_mat_path=SUB_MATRICES_PATH):
    """Validate the run parameters and load the chosen substitution matrix.

    Parameters
    ----------
    w : int
        Window size.
    t : int
        Threshold.
    sm : str
        Requested substitution matrix name.
    stype : str
        Sequence type.
    sub_mat_path : str or path-like
        JSON file the matrix is read from.

    Returns
    -------
    tuple
        ``(window_size, threshold, score_matrix, seq_type, substitution_matrix)``
        where the first four are the validated values and the last one is the
        matrix as returned by ``read_submat_from_json``.
    """
    score_matrix, seq_type = validate_substitution_matrix(sm, stype)
    # THRESHOLD=validate_threshold(w, t)
    window_size, threshold = validate_positive_integers(w, t)
    # Load with the *validated* name: the raw ``sm`` may be invalid, in which
    # case validation has already substituted a usable default.
    substitution_matrix = read_submat_from_json(score_matrix, sub_mat_path)

    return window_size, threshold, score_matrix, seq_type, substitution_matrix

# Validate the configured values and load the matrix; the validated values
# replace the raw configuration constants.
(WINDOW_SIZE, THRESHOLD, SCORE_MATRIX,
 SEQ_TYPE, SUBSTITUTION_MATRIX) = get_parameters(
    w=WINDOW_SIZE,
    t=THRESHOLD,
    sm=SCORE_MATRIX,
    stype=SEQ_TYPE,
)

In [None]:
def get_parameters_description(seq_type=None, window_size=None, threshold=None, score_matrix=None):
    """Return a multi-line, human-readable summary of the run parameters.

    Any argument left as ``None`` is filled from the matching notebook global
    (``SEQ_TYPE``, ``WINDOW_SIZE``, ``THRESHOLD``, ``SCORE_MATRIX``) at call
    time. Using the globals directly as default values would freeze them at
    the moment the function is defined, so later changes would not show up.

    Returns
    -------
    str
        The formatted summary.
    """
    if seq_type is None:
        seq_type = SEQ_TYPE
    if window_size is None:
        window_size = WINDOW_SIZE
    if threshold is None:
        threshold = THRESHOLD
    if score_matrix is None:
        score_matrix = SCORE_MATRIX
    return f'''
    -- Parameters --
    Sequence type: {seq_type}
    Window size: {window_size}
    Threshold: {threshold}
    Score matrix: {score_matrix}
    '''

# Show the parameter summary as plain text.
print(get_parameters_description())

In [None]:
# Display the loaded substitution matrix (a pandas DataFrame) as the cell output.
SUBSTITUTION_MATRIX

In [None]:

# def is_fasta(fasta_file):
#     try:
#         SeqIO.read(fasta_file, 'fasta')
#         return True
#     except:
#         print(f'<!> File {fasta_file} is not a valid fasta file.')
#         return False
    
# is_fasta(FASTA_2)

Knit:

In [24]:
def validate_path(path_str):
    """Report whether ``path_str`` refers to something that exists on disk.

    Parameters
    ----------
    path_str : str or path-like
        Location to check.

    Returns
    -------
    bool
        ``True`` when the path exists. Otherwise a warning is printed and
        ``False`` is returned.
    """
    candidate = Path(path_str)
    if candidate.exists():
        return True
    print(f'<!> Path {candidate} does not exist.')
    return False

def is_protein(fasta_file):
    """Tell whether a single-record FASTA file holds a protein sequence.

    The record is read with ``SeqIO.read`` and counts as protein when it is
    non-empty and every residue, compared case-insensitively, is one of the
    20 letters ``ACDEFGHIKLMNPQRSTVWY``.

    Note that A, C, G and T are all in that alphabet too, so a plain DNA
    sequence also passes this check.

    Parameters
    ----------
    fasta_file : str or path-like
        FASTA file to inspect.

    Returns
    -------
    bool
        ``True`` for a protein sequence, ``False`` otherwise. When the file
        cannot be read or parsed a warning is printed and ``False`` is
        returned.
    """
    try:
        record = SeqIO.read(fasta_file, 'fasta')
        residues = str(record.seq).upper()
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C and
        # SystemExit are not swallowed.
        print(f'<!> File {fasta_file} is not a valid fasta file.')
        return False
    # An empty record is rejected explicitly; a subset test alone would accept it.
    return bool(residues) and set(residues) <= set('ACDEFGHIKLMNPQRSTVWY')
    
def is_dna(fasta_file):
    """Tell whether a single-record FASTA file holds a DNA sequence.

    The record is read with ``SeqIO.read`` and counts as DNA when it is
    non-empty and every base, compared case-insensitively, is one of
    ``A``, ``C``, ``G`` or ``T``.

    Parameters
    ----------
    fasta_file : str or path-like
        FASTA file to inspect.

    Returns
    -------
    bool
        ``True`` for a DNA sequence, ``False`` otherwise. When the file
        cannot be read or parsed a warning is printed and ``False`` is
        returned.
    """
    try:
        record = SeqIO.read(fasta_file, 'fasta')
        bases = str(record.seq).upper()
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C and
        # SystemExit are not swallowed.
        print(f'<!> File {fasta_file} is not a valid fasta file.')
        return False
    # An empty record is rejected explicitly; a subset test alone would accept it.
    return bool(bases) and set(bases) <= set('ACGT')
    
def get_type(fasta_file):
    """Classify the sequence stored in ``fasta_file``.

    DNA is tested first. The letters A, C, G and T are also valid amino-acid
    letters, so testing for protein first would label every DNA sequence as
    protein and the ``'dna'`` result could never be produced. The flip side
    is that a peptide made only of A/C/G/T is reported as DNA.

    Parameters
    ----------
    fasta_file : str or path-like
        FASTA file to classify.

    Returns
    -------
    str or None
        ``'dna'``, ``'protein'``, or ``None`` when the sequence matches
        neither alphabet (or the file could not be read).
    """
    if is_dna(fasta_file):
        return 'dna'
    if is_protein(fasta_file):
        return 'protein'
    return None
    
def get_seq_from_fasta(fasta_file):
    """Read the single record of a FASTA file and tag its id with its type.

    The type reported by ``get_type`` (``'protein'``, ``'dna'`` or ``None``)
    is appended to the record id after a single space.

    Parameters
    ----------
    fasta_file : str or path-like
        FASTA file to load.

    Returns
    -------
    The record object returned by ``SeqIO.read``, with its ``id`` modified.
    """
    detected_type = get_type(fasta_file)
    record = SeqIO.read(fasta_file, 'fasta')
    id_suffix = ' {}'.format(detected_type)
    record.id = record.id + id_suffix
    return record

def validate_substitution_matrix(matrix_name='blosum62', sequence_type='protein'):
    """Normalise a (matrix name, sequence type) pair against ``SUB_MATRICES``.

    Both inputs are compared case-insensitively. An unrecognised sequence
    type falls back to ``'protein'``. A matrix name that is not registered in
    ``SUB_MATRICES`` for that type falls back to ``'blosum62'`` (protein) or
    ``'DNAFull'`` (dna). A message is printed for each fallback and for the
    final choice.

    Parameters
    ----------
    matrix_name : str
        Requested substitution matrix. Non-string values such as ``None``
        are treated as an invalid name.
    sequence_type : str
        ``'protein'`` or ``'dna'``. Non-string values such as ``None`` are
        treated as an invalid type.

    Returns
    -------
    tuple
        ``(matrix_name, sequence_type)``. A matched matrix name is returned
        exactly as it is spelled in ``SUB_MATRICES``.
    """
    types = ['protein', 'dna']
    # str() first so that None (e.g. an undetected sequence type) goes through
    # the regular "invalid value" path instead of raising AttributeError.
    matrix_name = str(matrix_name).lower()
    sequence_type = str(sequence_type).lower()

    if sequence_type not in types:
        print('<!> Invalid sequence type. Must be one of: {}\n\t-- Taking protein as default--'.format(types))
        sequence_type = 'protein'

    keys = [k for k, v in SUB_MATRICES.items() if v == sequence_type]
    # Lookup table from lowercased name to the key as stored, so mixed-case
    # keys such as 'DNAFull' can still be matched.
    canonical = {k.lower(): k for k in keys}

    if matrix_name in canonical:
        matrix_name = canonical[matrix_name]
    else:
        if sequence_type == 'protein':
            print('<!> Invalid substitution matrix. Must be one of: {}\n\t'.format(keys))
            default_name = 'blosum62'
        else:
            print('<!> Invalid substitution matrix. Must be one of: {}'.format(keys))
            default_name = 'DNAFull'
        # Prefer the stored spelling of the default when it is registered.
        matrix_name = canonical.get(default_name.lower(), default_name)

    print(f'\n\t -- Taking {matrix_name} as substitution matrix for {sequence_type} sequences --')
    return matrix_name, sequence_type

def read_submat_from_json(matrix_name, json_file=SUB_MATRICES_PATH):
    """Load one substitution matrix from a JSON file into a DataFrame.

    The file is expected to map matrix names to objects carrying a
    ``'matrix'`` entry; that entry is handed to ``pd.DataFrame`` and the row
    labels are then set equal to the column labels.

    Parameters
    ----------
    matrix_name : str
        Key of the matrix inside the JSON file.
    json_file : str or path-like
        Path of the JSON file (defaults to ``SUB_MATRICES_PATH``).

    Returns
    -------
    pandas.DataFrame
        The matrix, indexed by its own column labels.

    Raises
    ------
    KeyError
        If ``matrix_name`` is not present in the file.
    """
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(json_file) as handle:
        matrices = json.load(handle)
    df = pd.DataFrame(matrices[matrix_name]['matrix'])
    df.index = df.columns
    return df

def validate_positive_integers(w=WINDOW_SIZE, t=THRESHOLD):
    """Ensure window size and threshold are both positive integers.

    If either value is not an ``int``, is a ``bool``, or is not greater than
    zero, a warning is printed and *both* values are replaced by the defaults
    (window size 10, threshold 23).

    Parameters
    ----------
    w : int
        Window size.
    t : int
        Threshold.

    Returns
    -------
    tuple
        ``(w, t)`` after validation.
    """
    def _is_positive_int(value):
        # bool is a subclass of int, so it has to be excluded explicitly.
        return isinstance(value, int) and not isinstance(value, bool) and value > 0

    if not (_is_positive_int(w) and _is_positive_int(t)):
        print('<!> Window size and threshold must be positive integers. \n\t-- Taking default values')
        w = 10
        t = 23
    return w, t

def get_parameters(w=WINDOW_SIZE, t=THRESHOLD, sm=SCORE_MATRIX, stype=SEQ_TYPE, sub_mat_path=SUB_MATRICES_PATH):
    """Validate the run parameters and load the chosen substitution matrix.

    Parameters
    ----------
    w : int
        Window size.
    t : int
        Threshold.
    sm : str
        Requested substitution matrix name.
    stype : str
        Sequence type.
    sub_mat_path : str or path-like
        JSON file the matrix is read from.

    Returns
    -------
    tuple
        ``(window_size, threshold, score_matrix, seq_type, substitution_matrix)``
        where the first four are the validated values and the last one is the
        matrix as returned by ``read_submat_from_json``.
    """
    score_matrix, seq_type = validate_substitution_matrix(sm, stype)
    # THRESHOLD=validate_threshold(w, t)
    window_size, threshold = validate_positive_integers(w, t)
    # Load with the *validated* name: the raw ``sm`` may be invalid, in which
    # case validation has already substituted a usable default.
    substitution_matrix = read_submat_from_json(score_matrix, sub_mat_path)

    return window_size, threshold, score_matrix, seq_type, substitution_matrix



def get_parameters_description(seq_type=None, window_size=None, threshold=None, score_matrix=None):
    """Return a multi-line, human-readable summary of the run parameters.

    Any argument left as ``None`` is filled from the matching notebook global
    (``SEQ_TYPE``, ``WINDOW_SIZE``, ``THRESHOLD``, ``SCORE_MATRIX``) at call
    time. Using the globals directly as default values would freeze them at
    the moment the function is defined, so later changes would not show up.

    Returns
    -------
    str
        The formatted summary.
    """
    if seq_type is None:
        seq_type = SEQ_TYPE
    if window_size is None:
        window_size = WINDOW_SIZE
    if threshold is None:
        threshold = THRESHOLD
    if score_matrix is None:
        score_matrix = SCORE_MATRIX
    return f'''
    -- Parameters --
    Sequence type: {seq_type}
    Window size: {window_size}
    Threshold: {threshold}
    Score matrix: {score_matrix}
    '''

def get_parameters_description_df(seq_type=None, window_size=None, threshold=None, score_matrix=None, seq1=None, seq2=None):
    """Return the run parameters and both sequence lengths as a DataFrame.

    Any argument left as ``None`` is filled from the matching notebook global
    (``SEQ_TYPE``, ``WINDOW_SIZE``, ``THRESHOLD``, ``SCORE_MATRIX``,
    ``SEQ_1``, ``SEQ_2``) at call time. Using the globals directly as default
    values would freeze them at the moment the function is defined, so
    sequences or parameters loaded afterwards would not show up.

    Parameters
    ----------
    seq_type, window_size, threshold, score_matrix
        Values shown as-is in the table.
    seq1, seq2
        Objects exposing a ``.seq`` attribute that supports ``len()``.

    Returns
    -------
    pandas.DataFrame
        One ``Value`` column, indexed by ``Parameter``.
    """
    if seq_type is None:
        seq_type = SEQ_TYPE
    if window_size is None:
        window_size = WINDOW_SIZE
    if threshold is None:
        threshold = THRESHOLD
    if score_matrix is None:
        score_matrix = SCORE_MATRIX
    if seq1 is None:
        seq1 = SEQ_1
    if seq2 is None:
        seq2 = SEQ_2

    data = {
        'Parameter': ['Sequence type', 'Window size', 'Threshold', 'Score matrix', 'Seq1 length', 'Seq2 length'],
        'Value': [seq_type, window_size, threshold, score_matrix, len(seq1.seq), len(seq2.seq)]
    }
    df = pd.DataFrame(data, columns=['Parameter', 'Value'])
    return df.set_index('Parameter')





In [25]:
# Validate the configuration, load the substitution matrix and both sequences.
WINDOW_SIZE, THRESHOLD, SCORE_MATRIX, SEQ_TYPE, SUBSTITUTION_MATRIX = get_parameters(WINDOW_SIZE, THRESHOLD, SCORE_MATRIX, SEQ_TYPE)
SEQ_1 = get_seq_from_fasta(FASTA_1)
SEQ_2 = get_seq_from_fasta(FASTA_2)

# Pass the current values explicitly so the summaries always reflect what was
# just validated and loaded in this cell.
print(get_parameters_description(SEQ_TYPE, WINDOW_SIZE, THRESHOLD, SCORE_MATRIX))
get_parameters_description_df(SEQ_TYPE, WINDOW_SIZE, THRESHOLD, SCORE_MATRIX, SEQ_1, SEQ_2)


	 -- Taking blosum62 as substitution matrix for protein sequences --

    -- Parameters --
    Sequence type: protein
    Window size: 10
    Threshold: 23
    Score matrix: blosum62
    


Unnamed: 0_level_0,Value
Parameter,Unnamed: 1_level_1
Sequence type,protein
Window size,10
Threshold,23
Score matrix,blosum62
Seq1 length,680
Seq2 length,319
