In [25]:
import pandas as pd
import re

In [37]:
def get_blosum_dictionary(path='blosum.txt'):
    """Load a ';'-separated score table and flatten it into a pair lookup.

    The file is read with ``pd.read_table(path, sep=';')`` and converted with
    ``DataFrame.to_dict()``, which yields ``{column_label: {row_label: value}}``.
    That nested mapping is flattened so a single tuple key addresses a cell.

    Args:
        path: Location of the score table. Defaults to ``'blosum.txt'`` in the
            current working directory.

    Returns:
        dict mapping ``(row_label, column_label)`` to the cell value.
        NOTE(review): callers index this with pairs of amino-acid letters, so
        the file is presumably laid out so that pandas uses the first column
        as the row index - confirm against the actual file.
    """
    blosum_df = pd.read_table(path, sep=';')

    # Build the flat dictionary directly instead of running dict.update()
    # inside a throw-away set comprehension.
    return {
        (row_label, column_label): score
        for column_label, column in blosum_df.to_dict().items()
        for row_label, score in column.items()
    }

In [None]:
# Build the score lookup once. calculate_blosom_score() and
# extend_search_result() read it as the global `blosum_dictionary`, so this
# cell has to be executed before any cell that calls them.
blosum_dictionary = get_blosum_dictionary()

In [5]:
def preprocess(sequence):
    """Return every overlapping window of length 3 from ``sequence``.

    Windows are produced left to right, one per start position. An input
    shorter than 3 elements yields an empty list.
    """
    windows = []
    for start in range(len(sequence) - 2):
        windows.append(sequence[start:start + 3])
    return windows

In [6]:
# Query sequence that is searched for in the translated chromosome.
searched_sequence = 'YANCLEHKMGS'

# Derive the 3-letter windows from the variable rather than from a second
# copy of the literal, so the two can never drift apart.
searched_sequence_preprocessed = preprocess(searched_sequence)
searched_sequence_preprocessed

['YAN', 'ANC', 'NCL', 'CLE', 'LEH', 'EHK', 'HKM', 'KMG', 'MGS']

In [19]:
# Codon lookup used by get_aminoacids(): maps each of the 64 DNA triplets over
# A/C/G/T to a one-letter amino-acid code. TAA, TAG and TGA map to the empty
# string, so they contribute nothing to the translated output.
aminoacid_replacements = {
    "ATG":"M",
    "TTT":"F",
    "TTC":"F",
    "TTA":"L",
    "TTG":"L",
    "TCT":"S",
    "TCC":"S",
    "TCA":"S",
    "TCG":"S",
    "TAT":"Y",
    "TAC":"Y",
    "TGT":"C",
    "TGC":"C",
    "TGG":"W",
    "CTT":"L",
    "CTC":"L",
    "CTA":"L",
    "CTG":"L",
    "CCT":"P",
    "CCC":"P",
    "CCA":"P",
    "CCG":"P",
    "CAT":"H",
    "CAC":"H",
    "CAA":"Q",
    "CAG":"Q",
    "CGT":"R",
    "CGC":"R",
    "CGA":"R",
    "CGG":"R",
    "ATT":"I",
    "ATC":"I",
    "ATA":"I",
    "ACT":"T",
    "ACC":"T",
    "ACA":"T",
    "ACG":"T",    
    "AAT":"N",
    "AAC":"N",
    "AAA":"K",
    "AAG":"K",
    "AGT":"S",
    "AGC":"S",
    "AGA":"R",
    "AGG":"R",
    "GTT":"V",
    "GTC":"V",
    "GTA":"V",
    "GTG":"V",
    "GCT":"A",
    "GCC":"A",
    "GCA":"A",
    "GCG":"A",
    "GAT":"D",
    "GAC":"D",
    "GAA":"E",
    "GAG":"E",
    "GGT":"G",
    "GGC":"G",
    "GGA":"G",
    "GGG":"G",
    # Triplets below translate to nothing (empty string).
    "TAA":"",
    "TAG":"",
    "TGA":"",
}

def get_chromozon_aminoacids(path='./data/chr17/chr17.fa', preview_lines=5000):
    """Read the chromosome file and translate a prefix of it to amino acids.

    The first line of the file is skipped (presumably the FASTA header).
    Every remaining line that contains no upper-case 'N' is upper-cased,
    stripped and kept. The kept lines are concatenated, characters 1..99999
    of that string are taken, and the result is passed to get_aminoacids().

    Args:
        path: Location of the sequence file. Written with forward slashes so
            the literal has no invalid escape sequences and resolves on both
            Windows and POSIX systems.
        preview_lines: How many of the kept lines to print before
            translating. The default reproduces the previous output; pass 0
            to keep the cell output quiet.

    Returns:
        The string produced by get_aminoacids() for the selected prefix.
    """
    genome = []

    with open(path) as f:
        next(f, None)  # skip the first line of the file
        for line in f:
            # The check runs before upper-casing, so only a literal upper-case
            # 'N' excludes a line.
            if 'N' not in line:
                genome.append(line.upper().strip())

    if preview_lines:
        print(genome[0:preview_lines])

    # NOTE(review): the slice starts at index 1, so the first base is dropped
    # and the reading frame is shifted by one - confirm this is intended.
    return get_aminoacids(''.join(genome)[1:100000])

def get_aminoacids(gene):
    """Translate ``gene`` three characters at a time via aminoacid_replacements.

    The string is cut into consecutive, non-overlapping 3-character chunks
    (a trailing remainder shorter than 3 is ignored). Each chunk is replaced
    by its table entry; a chunk missing from the table is kept unchanged.
    """
    translated = []
    for codon in re.findall('...', gene):
        translated.append(aminoacid_replacements.get(codon, codon))
    return "".join(translated)

In [20]:
def calculate_blosom_score(sequences, individual_scores=False):
    """Score two sequences position by position using blosum_dictionary.

    Args:
        sequences: Pair whose first two items are the sequences to compare.
            They are walked in parallel; comparison stops at the end of the
            shorter one.
        individual_scores: When true, return the list of per-position scores
            instead of their sum.

    Returns:
        The summed score, or the list of per-position scores.
    """
    first, second = sequences[0], sequences[1]

    pair_scores = []
    for residue_a, residue_b in zip(first, second):
        pair_scores.append(blosum_dictionary[(residue_a, residue_b)])

    if individual_scores:
        return pair_scores
    return sum(pair_scores)

In [24]:
# Quick check of the scorer on two 3-letter strings. It relies on the global
# `blosum_dictionary` having been built already.
calculate_blosom_score(('TAA', 'TGA'))

NameError: name 'blosum_dictionary' is not defined

In [22]:
# Translate the chromosome file once and keep the result; the search cells
# use it as the data to scan.
chromozon_aminoacids = get_chromozon_aminoacids()

['GATCATGCAGCTCTTCCAGGCCCACTGCTTCTTCCTGTCCACTAGGCCAC', 'AGCCGCCCTCCAGGCCCACTATGCACACATCTTCCCCTCCAAGGTTTGTT', 'CTGCCCCTGCCCTGACTCCCAGCCCTGTGGGGGTCCTGACCGCACCTCAC', 'CTGGCTCAGACTCTTGACGCTGCCCTGGCTGCCCCACCACTGCTTCTGCC', 'CGAGAGTCACGTGAGGCTGAGAGTAGGGGCAGGGGCAGCAGTGGTGCCAG', 'TTGGGGGGCGGTCCAGTGGGAGGAGCCTCAGCCTCGCGGGCTGCTCCGTG', 'GGACTGATGACTGCATGATCTTCTGGGCACCTCACGGATCTTCAACTGCA', 'GGTGAAACGGATGCTGGTGGTGGGTGCAGGGCCGCTGGGAGCTGCTGCAT', 'GGTTCCCAGAGGCTGGACTGGGGCAGGTGCCAACTGAAGCTGCTGGGGCA', 'GCATGGGCAGGATGTTCTGCACACAAACCTTGGAGAAGAAGATGTGTGCA', 'TAGCAGGTCCACTGCTGCTGCCCCTGCCCTGACTCCCAGCCCTGCCTGAC', 'CCCACCTCAACCTGCTCAGGCTCTGGCGCAACCCTGGCTGCCCTGCCACT', 'GCCTCTGCCCCAGAGTTGGGGCCTTGACAACCTGGTTGGAAGGGGACACC', 'CCAGCCCTGCCTCAACACCTGGGGGTCTCCATAACTACCACAGGCAGGTG', 'GGCAACCCCAAAGATCCCAGGACTCACAGTACCCCCTGAGAACATGGACA', 'GTATGTGGGGGTAGAAATGGAGGGCAGGATGGTTATCTTCTCCCAGGTGA', 'AGCCATTTAATCCTTTCAGTTTGGGACGGAGTAAGGCCTTCCTCTTTTTT', 'TTTTTTTTTTTTTTTTTGAGACCGAGTCTTGCTCTGTCACCCAGGCTGGA', 'GTGCAGTGGTGCGATCTTGGCTCACT

In [23]:
# Seed threshold: get_search_results() reports a pair of 3-letter windows only
# when their summed score is >= t_value.
t_value = 11
# Extension tolerance: extend_search_result() keeps extending a hit while each
# newly aligned pair scores at least -max_drop_off.
max_drop_off = 2

In [112]:
def extend_search_result(sequence_index, data_index, sequence, sequence_fraction, data):
    """Grow a 3-letter seed hit in both directions and return the query part covered.

    Starting from the seed, one position at a time is added to the left and
    then to the right. A position is added while the score of the newly
    aligned pair (looked up in the global ``blosum_dictionary``) is at least
    ``-max_drop_off``. Extension in a direction stops at the first pair below
    that limit or when either sequence runs out.

    Args:
        sequence_index: Start of the seed inside ``sequence``.
        data_index: Start of the seed inside ``data``.
        sequence: The query sequence.
        sequence_fraction: The seed itself (3 letters of ``sequence``).
        data: The sequence that is being searched.

    Returns:
        The seed extended with the accepted letters of ``sequence``.
    """
    result = sequence_fraction

    # --- extend to the left ---
    drop_off = 0
    left_sequence_index = sequence_index
    left_data_index = data_index

    while left_data_index > 0 \
        and left_sequence_index > 0 \
        and drop_off >= -max_drop_off:
        left_data_index -= 1
        left_sequence_index -= 1

        drop_off = blosum_dictionary[(sequence[left_sequence_index], data[left_data_index])]

        if drop_off >= -max_drop_off:
            result = sequence[left_sequence_index] + result

    # --- extend to the right ---
    # Reset the score first: a bad pair that ended the left extension must not
    # prevent the right extension from starting.
    drop_off = 0
    # The seed is 3 letters long, so its last letter sits at index + 2.
    right_sequence_index = sequence_index + 2
    right_data_index = data_index + 2

    while right_data_index < len(data)-1 \
        and right_sequence_index < len(sequence)-1 \
        and drop_off >= -max_drop_off:
        right_data_index += 1
        right_sequence_index += 1

        drop_off = blosum_dictionary[(sequence[right_sequence_index], data[right_data_index])]

        if drop_off >= -max_drop_off:
            result = result + sequence[right_sequence_index]

    return result

In [113]:
def get_search_results(data, sequence, t_value):
    """Yield every seed hit between ``sequence`` and ``data``.

    Each 3-letter window of ``sequence`` is compared with each 3-letter window
    of ``data`` using calculate_blosom_score(). A pair whose score is at least
    ``t_value`` is reported.

    Args:
        data: The sequence that is being searched.
        sequence: The query sequence.
        t_value: Minimum window score for a hit.

    Yields:
        ``(sequence_index, data_index, sequence, sequence_fraction, data)``,
        ordered by ``sequence_index`` and then by ``data_index``.
    """
    # The data windows do not depend on the query position, so build them once
    # instead of once per query window.
    data_fractions = [(i, data[i:i+3]) for i in range(len(data)-2)]

    for sequence_index in range(len(sequence)-2):
        sequence_fraction = sequence[sequence_index:sequence_index+3]
        for data_index, data_fraction in data_fractions:
            if calculate_blosom_score((data_fraction, sequence_fraction)) >= t_value:
                yield (sequence_index, data_index, sequence, sequence_fraction, data)

In [118]:
# Find every seed hit and extend it; each entry pairs the hit's position in
# the data (item 1 of the hit tuple) with the extended query fragment.
search_results = [
    (hit[1], extend_search_result(*hit))
    for hit in get_search_results(chromozon_aminoacids, searched_sequence, t_value)
]

In [119]:
# Each entry is (data_index, extended query fragment).
search_results

[(2065, 'YANC'),
 (2204, 'YANCLEHK'),
 (2448, 'YANC'),
 (2454, 'YANCL'),
 (2553, 'YAN'),
 (2571, 'YANCL'),
 (3148, 'YANC'),
 (3285, 'YANCLEHK'),
 (3529, 'YANC'),
 (3535, 'YANCL'),
 (4094, 'YANCL'),
 (4547, 'YANCL'),
 (5296, 'YANCL'),
 (5567, 'YANCLE'),
 (5841, 'YANCL'),
 (6427, 'YANC'),
 (6747, 'YANC'),
 (6892, 'YAN'),
 (8625, 'YANCL'),
 (9604, 'YANC'),
 (10002, 'YANCLEHKM'),
 (10476, 'YAN'),
 (10515, 'YANCLEHKMGS'),
 (10627, 'YAN'),
 (10878, 'YANCLE'),
 (11464, 'YAN'),
 (11681, 'YANCL'),
 (12010, 'YAN'),
 (12116, 'YANC'),
 (12303, 'YANC'),
 (12437, 'YANC'),
 (12594, 'YANCLEH'),
 (13308, 'YAN'),
 (13733, 'YAN'),
 (14615, 'YANCL'),
 (14709, 'YANCL'),
 (15000, 'YANC'),
 (15232, 'YAN'),
 (15658, 'YAN'),
 (16656, 'YAN'),
 (16865, 'YAN'),
 (17039, 'YANCL'),
 (17106, 'YAN'),
 (17921, 'YAN'),
 (18480, 'YANC'),
 (18637, 'YAN'),
 (18744, 'YAN'),
 (19137, 'YANC'),
 (19191, 'YANCLEHKM'),
 (21007, 'YANCLE'),
 (21357, 'YANCLE'),
 (22025, 'YANCLE'),
 (22080, 'YAN'),
 (23878, 'YAN'),
 (24084, 'YAN'),

In [110]:
# Look at the 10 letters of the translated chromosome that start at index 10515.
chromozon_aminoacids[10515:10525]

'YKNWSEGKAE'

In [111]:
# Per-position scores between the data slice at index 10515 and the query.
# NOTE(review): the second literal is 'YANLEHKMGS', which lacks the 'C' of
# searched_sequence ('YANCLEHKMGS') - confirm whether this is a typo.
calculate_blosom_score(('YKNWSEGKAE','YANLEHKMGS'), individual_scores=True)

[7, -1, 6, -2, 0, 0, -2, -1, 0, 0]