In [1]:
import pandas as pd
import numpy as np
import os
from uuid import uuid4
from itertools import product
import concurrent.futures
from datetime import datetime
import re
from collections import namedtuple, Counter
from copy import deepcopy

from tgsts.sequtils.trimming import trim_intersection
from utils import build_full_length_alignments, add_pipes
from tgsts.sequtils import str_to_fasta_seqrecord
from tgsts.sequtils.kmers import calculate_kmer_distance
from tgsts.sequtils.rra import identify_rrs_ali, mask_rrs
from tgsts.utils.parallel import run_concurrently
from tgsts.typeclasses import MismatchList
from tgsts.libs import ANTypingLibs
from tgsts.align import mm_profile_from_seqs
from tgsts.typeclasses import ExonDict
from tgsts.align import exonic_alignment


from sfat import Annotator 

In [2]:
path = './erap_tile_test_new.xlsx'

df = pd.read_excel(path).replace({np.nan: None})
df.head()


Unnamed: 0,Samples,Library ID,Method,Fragment,NumReads,Fragment Name,cds_mismatch_list,gDNA_mismatch_list,analysis_code,Sequence
0,AMAI,ERAP1_BCAv2_11,pbAA,1,103,1a,No_MM,"5utr:-296delACACACACACACAC>, intron1:2595delT>...",1,GAAAGTCCTGGGGGCCACCTCTAACCCACCTTCCTCCTCTACAGCA...
1,AMAI,ERAP1_BCAv2_11,pbAA,1,122,1a,No_MM,"5utr:-331delA>, 5utr:-296delACACACACACACACACAC...",1,GAAAGTCCTGGGGGCCACCTCTAACCCACCTTCCTCCTCTACAGCA...
2,AMAI,ERAP1_BCA_22,laa,2,179,2a,No_MM,"intron5:13741delA, intron11:20063C>A, intron13...",1,TTTGTGTTCCTCTAGTGCTGAGATTTCTTATTCTTGTGGAGGTATC...
3,AMAI,ERAP1_BCA_22,laa,2,213,2a,No_MM,"intron5:13741delA, intron13:22270insT, intron1...",1,TTTGTGTTCCTCTAGTGCTGAGATTTCTTATTCTTGTGGAGGTATC...
4,AMAI,ERAP1_BCA_28,laa,3,171,3f,exon15:2285C>G,"intron11:20063C>A, intron13:22270insT, intron1...",1,AATGAAACTATAGATAACAATTATTTCTATTATCTTTTCAGGGAGC...


In [3]:
df[df['Samples']=='AMALA']

Unnamed: 0,Samples,Library ID,Method,Fragment,NumReads,Fragment Name,cds_mismatch_list,gDNA_mismatch_list,analysis_code,Sequence
10,AMALA,ERAP1_BCA_12,laa,1,393,1a,No_MM,"5utr:-357delA, 5utr:-323insACAC, intron1:1293d...",2,GAAAGTCCTGGGGGCCACCTCTAACCCACCTTCCTCCTCTACAGCA...
11,AMALA,ERAP1_BCA_22,pbAA,2,77,2a,No_MM,intron11:20096delTG,2,TTTGTGTTCCTCTAGTGCTGAGATTTCTTATTCTTGTGGAGGTATC...
12,AMALA,ERAP1_BCA_28,pbAA,3,715,3a,No_MM,"intron11:20096delTG, intron14:24183insA, intro...",2,AATGAAACTATAGATAACAATTATTTCTATTATCTTTTCAGGGAGC...
13,AMALA,ERAP1_BCA_25_reload,pbAA,4,103,4a,No_MM,"intron14:24484insA, intron18:27923G>A, intron1...",2,GCTTGGCAAAATGTCCTGAAGTCTTGTTGCATAATTTGCTCTCAAA...
14,AMALA,ERAP1_BCA_26,pbAA,5,68,5a,No_MM,"intron19:33709A>C, intron19:35500delT, intron1...",2,TCAAGTCAGTTAATACCCTAAGAATTAGATTTTATTTCTTATTCTG...


In [4]:
#Fill rows
rows = []

prev_filled_row = None
for i, row in enumerate(df.to_dict(orient="records")):
    
    if row['Samples'] is not None:
        prev_filled_row = row
        
    for column, value in row.items():
        if 'Typing' not in column and value is None:
            row[column] = prev_filled_row[column]

        
    rows.append(row)
    

#Filter rows

#Get nonzero analysis codes and remove missing seqs
rows = [
    row 
    for row
    in rows
    if row['analysis_code'] != 0
    and row['Sequence'] is not None
]    

In [5]:
erap_reference = 'GTACAGTGGCCCTTGGTAGTGCAGGAAAGTCCTGGGGGCCACCTCTAACCCACCTTCCTCCTCTACAGCATCTCCCACTGTAGTCATTCTCTACCGAAGCCCCAGAAGGTGCGGCACTTTGCCACGACAGAGTACTGGGTTCATGTTTCTTTCCGAGGCGGGCCAAGAGCTCTCAGCCCACTGGCAGTGGCGAGATGACGGACACCCAGCGAGTCCAATGGGCGTCGAACGCGTCTAGGCTTGGTGGACTTGTCAGCGCCTGCCTGGCTTCGGTCCCCAACTTGAGCACCGGCCCTTTCCTGCATGCCCCTAACCCTCGCAACGCTAAACAGTGAAAAAAAAAAAAAGACAAAAACAAAAAGCATCTCAACACACACACACACACACACACACACACACACACACACACACACACACACACACACACACACACACACACACACGGATCCGCGTTCAGAAAGGCGTGCACTTCCTACGCCTGATCCCCCGCATCGCAACCTCGCAGCTTCCCCGGCGTGCAGCGCTCATTTACCAATTCCCTTCCTGGGAGTTGCGGCTTCCCTCGCTCGGCCCCACTCCCGTTTACCCTTTCCCCAGCTCCCGCCTTAGCCAGGGGCTTCCCCGCCTGCCGCTAGGGCTCGGGCCGAAGCGCCGCTCAGCGCCAGCCTGCCGCTCCCCGGGCTCCACTTTC|ACTTTCGGTCCTGGGGGAGCTAGGCCGGCGGCAGTGGTGGTGGCGGCGGCGCAAGGGTGAGGGCGGCCCCAGAACCCCAG|GTACAGCGCGCTCGAGCCGCGGGTAGGGGACTGCGGGCCGGGAGGAGAGCGCGGCACCCGCCCCTTCCCTGCGCCCGTCAAGTGGGGGGCTGAGGGCCTGGGGCACGGGAGGAGGGAGACGGGGCACGGGAGGAGGGAGACGGGGCGCGGGAGGAAGGCGACGGGGCGCGGGAGGAAGGCGACGGGGCGCGAGACAGGGCGCGGGAAGGGCGGGGGGAGTCGCTGGCTAGGCCCGAGTCCGCGGGGTGCCCGGCGGGTTGGCGGCGGGCCCACCCCTGCCGGTCCCTGTCCCTGTCCCTCCGGGCGCGTGGCCGGTGCGCCTGCTTCACGGGTCTCCCCGCTGTTCGGCCGGCGGGAGCCTCCCTCAGCGCTCCGCCTGGCGCCTGGATGCCTGCCAGTCCTGCAGGCCACCGACGCCCGCGCGAGGCCAAAAGGCGGGGTGGGGCGGGCAGCTGGCTCGGGCTGAGGAGGGCACCTGCCCATAGCTGCTAGAGAAACCCAGAGGCTTTGGGTTAAAGACTCTGGTGGGGTGGGATGCGCGGGCCGTGTGTGTTCTTAAGGTCACTTCCCTCCCTGCTTCTCCTGTTCTTCTGGTCAGCAATTCTCTCTCTCCCCTTCGCTCTGGCTCTGGCTGGGTTTTATTCAGATAAAGCACCTCTGTTGACGCAAATTAAAAGTTTCCTATCTGGGTGCCTCACTGGCCAGGTGGTCCTACAAAGTTAATTCCATGAGGGGAAGGGGGAGAGCACACACTTCCTCACGCTTTTGGATTTCTTTGTGTAGGCTAGGTTCAGAAAGAAATTATCTGTTTCCTATTAAACACCCAGAGGATTCGCTCTGAACTCAGGACGTGGTCAACAATTAACAAAACAACAAAACAAAACAAAACAAAACAAAAAACTTGAAAATTGGGCACAGTTGTCTCTTGCCTGAGGATTTTTAATTAGTATAAGTAGCACATTTTCAGGTGCGGCCTGAATAGAAACATTCTAGTACTTTTTTTTTTTTCAAATTAATCCAGCATTTTTATTATTTACCAACAGTGCTTGTTAATTTCATTGTTCAGGAAATTCTGGAAGAACCTCAATTACTTCTTGATGATCTATTTCATATACTATAGTGCCCCAATAAAAGGAAGGGAGGCAGAGGTTGCAGTGAGCCAAGATCGCACCACTGCACTCCAGCCTGGGAGACAGAGCGAGACTCCGTCTCAAAAAAAAAAAAAAAAAAAAAAAAGAGGGATCTTGGTGAAGCTGAAAAAGCAGGGAGTTTATACTCACACAGATGTGGATTGCATTCCAACAAGTTGTGTGAACTTAGCAAAGTTACATGAATGGTTCTTTGCTTCAGTTACCCCACCTTGAATGAGAATAATAGGCTATTAGAGAGAGTGAGCATGTGTAAAGTGCCTGGCACTTTGGAGGAGCTTAGTAAATATTAGTTCTTTTTCCTCCTTGGCTTCTATTCCTTTAAGTGTTAGTGGAGTGTAACTTCAAAGAGAATTGTACTTCTCTTGGGAAGAGCTGCTTATATTGAGTAGACTACTGTTTTTGAGAGCTTTCTTTTTTTTTAACTGCCTCCCTTTCAGAAAATTGTTAGTAAAACCAGACCCTAGGAGACCAGCCAGAACCATGAAATGCCATGTTTCAAACTGGAACACTTACTATTGGAAGCAAAGCCAAACAAGAGCTAATTTTTCAGGGGAAGAGCCAAACACACGAACATGATCACAGAATCTTGGATGTAGCCTACAGTTTGGGATTAATAAGAGAATTTATTAGTGAAGCCCTTTATTCACTACATGGAGTTTTTACCAAGCCCCACTCATGCACTGCATCCTCGTTGAGACATAACTGTTTCTCTTTGGACCCCTCATGGACCCAACCCTGCAAAGCCTCTGATCCAAGGTCCCGGTACCAACCCCTTCCGCAGCACATCAGCCTTTCTGTCAGCTCATAACGAGTTGGAATTTCTAGATCTTCTCTGGGGCTGTTGGAGAGGTCTCGGGGACTTTCAGAGTCCTTACATGCTTGAACCTGCCACCTTCACAGAGTCCTCTGGATCCCGTCTTGGGGCAGGGGCGATGCTCATTAAGCTGTTGCTGCCAGTAATTCCATATGGAAAGCAAAACACAAGTTCCATTTACTCTCTAGTTCCCCAACTTCAAGGGCAAAAAAATGTTCTCCCTGTTCACACTTCCTGTCTCACCTGGGTGGTGCCTTTTGAACTGGGATTATGAGATTTCCAAGACTCTCTCTAATGTGTAGGTATCCTTTCTGTTTAGCCTCCAGATTGCTCCAGAGGTGAGGAGAAGGGAATTCCCTTGAGCTGTGCATTTGGGAAGGGAGCAAGGAAGTCAGGGGTTAGGGAAGGCACTTCAGCCATTGCCTTGAATTAGTATCCTATCACATAGAGTTGAAGGGGGAAAGCCAGGATTTGGCAAGGATGAGCTTTTCAACCTTGGCTTCTCAGTAAAATCTCTGGACAGTTTTTTTTTTTTTTAAAAAAAAAAACCAAAAACCAGAAATTAAACAATCCGTAAAACCATACCCCGATCTTATCACCATAGGTTCCAGTTTAATTGTTGTCAATAAGGACCCAGGCATCGAAAATTTTAAAAGCTTCCCAGGTTACATTAATATGCAGCCAGAGTTAGGAAACTTGACATCTCAGAGAGGGAGAATTCCATGTACTGTGAACACTTTGAGGGATCCACATTGTAAGCTTGCTACTTTTCCCAACTGGAACACGAGGAGTTTGGGCCAATCACTTGCATTCACCTGGTATGGGCCCCTCTTGGCAAACACCCATTAGAAAGGTGCGTTTGTATAAAAGAAATAAAAACTTATGTTTGATGCTTGGGGCATGGTTTGCCAACTTCCTTAAAATTCACATTGCCTTTTTTTTTTTTTTTTAAAGACAGAGTCTTGCTCTGTTGCCAGGCTGGAGTGCAGTGGTGCAATCTCAGCTCACTGCAACTCTGCCTCCCAGGTTCAAGCGATTCTCCTGCCTCAGCCTCCCAAGTAATTAGGACTACAGGTGCGCACCACCACACCCAGCTAATTTTTGTATTTTTAGTAGAGACGGGATTTCGCCATGTTGGCCAGGATGGTCTCAATCTGTTGACCTTGTGATCTGCCCACCTCATCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCATGCCTGGCCTGAAATTCACATTGCTTTTATGTCTTTAAAATCAATCCAGGTGATTGATACGTTTGCCACAAACTACTGGGAAAATTAAGCTCTTTAAGCTCTTCCTGGCTAAATAAATAGGTAATTAACTTTTGGCATGACAATTTGAGGAAGACACTGATGTTATTAAAGGTTACCATTACATATACTTACAGGGGAAGTGGAGACCTCGTTCTAGTGGTGGCTGCCACTGTGTCAGCATTTGATTTCCAGGGTACTAGGGTGACTTTCTCCAAGGTCCAGTGTCAGTGGGGAGTGGTGCTTGATCAGATGTTCCTCTGATATGGTTCTGGTTTTGCTTCTTTCATGAGCCTGGTTTCCCAAATGTCCTGCAACTCTGTGACCCGTGTAGTGAGCCACTCAGGATCCCCTAATGATTCCTTTTCTGCCTATATCAGCCAGAGCTTGTTCGGTTGCTTTCAACCAAGAACCCTGACAGGTAGAGTTAATTTAAACTTTGAACATCAAATGATGCTTTCTAGTACGTGTTAATGATTGTTGCTAACTGTAAACATCTTTCTTATATGAAACCATAACATAGGGAAGGTCCTTTTACTTTCAGGAAAAGACCTAGTACTTTTGGAAGTTTATGCCTATTTCTGTGAATGCTGGGTGGATACATTCTGAAATTATGCTGTGTCAATAACATTTTAATGACATATATTTTTGCTTTTGTACATTTGTGCCGCTAG|GTAGGTAGAGCAAGAAGATGGTGTTTCTGCCCCTCAAATGGTCCCTTGCAACCATGTCATTTCTACTTTCCTCACTGTTGGCTCTCTTAACTGTGTCCACTCCTTCATGGTGTCAGAGCACTGAAGCATCTCCAAAACGTAGTGATGGGACACCATTTCCTTGGAATAAAATACGACTTCCTGAGTACGTCATCCCAGTTCATTATGATCTCTTGATCCATGCAAACCTTACCACGCTGACCTTCTGGGGAACCACGAAAGTAGAAATCACAGCCAGTCAGCCCACCAGCACCATCATCCTGCATAGTCACCACCTGCAGATATCTAGGGCCACCCTCAGGAAGGGAGCTGGAGAGAGGCTATCGGAAGAACCCCTGCAGGTCCTGGAACACCCCCGTCAGGAGCAAATTGCACTGCTGGCTCCCGAGCCCCTCCTTGTCGGGCTCCCGTACACAGTTGTCATTCACTATGCTGGCAATCTTTCGGAGACTTTCCACGGATTTTACAAAAGCACCTACAGAACCAAGGAAGGGGAACTGAG|GTATTTTTTTTTCTCTTTTTCTTTTAAACTGCAAGTGCTGCCCACGCTAAATTCATTATTTCAGATTGATTGTCTTTTAAAATTCCCTTTGCTGTTGAACTTTTTCTTCAGTTTTGCTTTTGCATCTTCTTTATAGTGTTAAAAATGGCTTTTTCCCTTGCTTTTTAAATCTCATTTTAAAATTCTATTTTAACCAATTTTCTTTCCCCCAGCTCTATCAGAGTAAATATCTATTTGTTTATTTGGTTCGATTTCTGAGACATAATAAACATGTTTAATTTTCCTGAACTGTGTATTAGTTTCCTAGGGCTGTTGTAACAAAGTACCACAGACTGGGTAGCTATAAACAACAAAATGTATTCTCTCTCAGGTCTGGAGGCTAGAAGTCTGAAATCAAGGTGTCAGCAGGCCCGTGCTCCCTCCAGACTCTGGGTAGAATCCTTCCTTATATCTTCCTAGCATCTAGTGGTGGCCGTGGATCCCTGGTACTCCATGCCTAGCACCTGCGTCATTCTAGTTTCTCCCTCTGTTAGTCGCATGGCCATTCTATTTTCCTATGTCCCAGTCTCCATCTTCTTATAAGGAAACCAGACATACCAGATTAGGGCCCAGCCTGGTGGCCTCTTCTTCACTCCATGATACTTGTAAAGACCCTATGTCCAAATAAGGTCACATGCACAGATACTACAGGTTAGGACTTCAGCAAATCCTACTAGCAACATATTAGAGGAAATACTTTTATCTCAGTTAAAACTTTTTTAGAGATCTCTTCTCACCTTGCTTTGGTTCTGTTTTTAAGGAGGAGACTTATTTGGGGGAGATTTTATGCTCAGTTTTAAAATGGAATTTTATTTGTTGGTAGATTATACTAATTTATTTTTCAAATTCCATATTATTTTATCAAGGTAAGAAAGTAAAATTTATTTCACTCATAGCCCCCTGAACTGACCACTACTTCTATTTCACTGGAATATTCTGCCAGACTTCTTTCTGGATATGCATATATATTATTTTATATAAATAGAATTCTTATATTTTCTCTTTTACAAGGAAATTTTTTAACTTAATATGTTGGGAAGATGTTTTATATAAATGACTATTACTAGACATCTTTTTAATGATTTTATTAAAATACATAGTTGTACATTAATATCTTTAATCCCTCGGGAATGGAAATGAAATTGTTTCCAATTTTTCAGTATCAACAACACTTTGATGAGCATTTTTGTAAATTATTTCTTTAGAATAAATGCCTAGAAGTAAAATTGGTAAGCCAAAGAGCCTATAAATTTTTGATAAACTTTGTCATTTTTTCTTACCCCTCACCAATAGTGTATATTGGCAGTCTTTTTAGAGCAAAAAAGGTCCCCAAATAAATGATCTCTTTTTAATTTACGTAACTTTGATTAAGGAAGTTGAGCAGCTTTTGATATGTTTGTTAATGATTTATAGGTTCTTCTTTTATGAATTGCCTGTTAATGACCTTTGACTTTGCCTGAGATTCCCTTGGTTGTTGGGGATTTTTTTTTTTTTTTTTTTTTTTTTTTTGTCGAGACGGAGTCTTGCTCTGTCACCCAGGCTGGAGTGCAGTGGCGCGAACTCGGCTCACCGCAACCTCCGCCTCCCGGGTTCAAGCAATTCTCCTGCCTCAGCCTCCTGAGTAGCTGGGATTACAGACGCGCACCACCACACCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAGGGTGGTCTCGAACTCCTGTCCTCGTGATCCGCCCGCCTAGGCCTCTCAAAGTGCCAAGATTACAGGTATGAGCCATTTTTAGGATTACATATTTTTAGGATCTCACTATCTGTTAAGGCTATTAAGTTTTCCTCACATACTTCTTAAATGAATTTTCCCTGTTTTTTACTTTTTCTTTTTAACTTTTTTTTTTTTCTTCCCGAGACAGGGTCTGGTTCTGTCGCCCAGGCTGGAGTGCAATGGCGCAATCTCAGGTCACTGAAACCTCTGCCTCCTGGGCTCAAACCATCGTCTCACCTCTGCCTCCCAAGTAGCTGGGACTACAGGCCTGCACCACCATGCCTGGCTAATTTTTGTATTTTTGGTAGAGATGGGGTTTTACCATGTTGCCCAGGTAGGTCTCACACTCCTGGGCTCAAGTAATCCTCCCACCTCAACCTCCCAAAAATGTGTTAGGATTACAGGCATGAGACACCATGACCATGCCCAGTGTAACTCGTCTTTTAACTTAGTTTATGATAGCTTTTGTCAGATGAAATATTTTTTAGTAGCAAAATCCATCAATCTTATAGTAACTTTAGGAATTAAAAAGCAAACACCTGTTTTAGAGTTCCTGGTGCTTCATGCTCAATTTTTATTTCACTTGCCTTAATTTTAG|GATACTAGCATCAACACAATTTGAACCCACTGCAGCTAGAATGGCCTTTCCCTGCTTTGATGAACCTGCCTTCAAAGCAAGTTTCTCAATCAAAATTAGAAGAGAGCCAAGGCACCTAGCCATCTCCAATATGCCATTG|GTGAGTCTGCACTCCTGTGTATTTTCTATAGGAAAATCTACTGATTCTCTGTGACTTGCACTAGCCCAGTGACAGTCAACATTGGGTCACCTGTTTTGTTTTATTGCCTGGCAGATCGTTACTAACTTTTCATTTATAACCTATGCTTTTGTTTCAAGCCATAGTTATATGTAATCAAAGTAAAAATTGCACCTAAAAATGCAAGATTTCAGTAACAGTGCCATTCCAGGTTATACATGCTGATAGGAGGGAAGTGGTATAAGAAATTCAGGTCAGGTTTAAATATTAGTGCCCTTCACAAAGCACTTTCACCCTCATTTTCTCATATGATCTTTTAAAATGATTTTAAAGGTAATGCTTTGTCAATAAGGCCATATTTTTAGCATACAGTTATTTTCTCTAAGTTACATATACTATATAGTAATATTCATTATATAATTTGTAACTGAATGTACAAAATTGGGCAGACAGAAAAAGAGAATAAAAGTAATCTTTTCCAAATATTATGGTGCTGAAGGTAAGTCATGATAGATAGCTTAGCTTCCAGAGGGAAACTATTATTCCCCAATCTCAATGCGGATGTGGACAGCATTCCCTCTGATTTTTAAAAGTGACTAGAAGATGACCATGCCAAATGAATAAAACTGTTCAGTAAGTGCCATCATCCTTTGATTCTGGTAGTTTAGAAAAGCATCAGCTGGGCCGTCATTCTGCAGCTGGTATATAACACCTCCTGGAAGCACATCCTTTGTTCAGAGAAACTCACTGGGGATCAGAGTCAGAGTAGAATAGGCTTTGCCTAGAGTCCTGAGGGAAGAACAGCTTTGTCCCTGTGCTGACCGGGGAAGCAATATCATAACATGGAGAGATACTGAGAGCCACAGACCAACCTCTAGTGTGGTGCTTCTCAACCTTAAAACTCTATACCTACCTGCCTTCATACCATAAGGATGCCTTCCTCAAACAAGATTCGGATCTCCCACCCCACGCCAGAGGGCTATCCTCTGTAAAAATCACTCTTCTGCAAATTCCTACCAGCCAAGAACTTCTGTCCCCACTCCCACCCTTGTATATGAAAAGACAAGAAACAATTATGCTATTTTCCTAATATAAATTTAATATACAGGATGTGCTTTTTCAAAATATAGTCCTCTCCCCCAGTATTGTGGTATTACTCCCTGGGACAGAAGTGAGTTCTTTAATTGGTGAATCAAAGTTCTAGGAGAATAAAGGACCAGGGATGGGAAGGAGACAGGAGAGAGACTGAAGGACCAAACAGGATTAGTGGAGAAATTTGTAGGCTTTCAGAAGGGAGGCCTGGAGCTTTGGAAGCGCCACAAAGATGCTACAGTCTAAATCCATGGATATCCAGGATCCACTTAGTGAAGATAGGAAAACTTCTTTTTTTTTTTTGAGGATAGGAAAACTTCTAATGCAATGTTGTCCCTTTGGACGGGAATACCTCTCACTAACAGAAATCTAATACGAGTAGCCTGACCTCAGGCTGCAGATATTGAGCTGAGGGGAGAACAATGGGGTCTCAAAAGATCTTTTTGGAGACCAGAAAAACACAATATATACCATTGGAACATTGAAGCTTTTGGGCATGGGGCAGAAATTAATCACATTTAAATTTGAATTAATTTAATCAGGTTATTTTCCTAATAATTAACACAACTCGAGAATGGAAATTTTTGGCCAGGTGTGGTGGCTCATGACTGTAATCTCGGCACTTTGGGAGGCTGAGGCAGGTGGATAACCTGAGGTCAGGAGTTCAAGACCAGCCTGGCCAACATGGTAAAACCCTGTCTCTACAAAAATACAAAATTAGCTGGGCGTGGTGGCACATGTCTGTAATCTTAGCTACTTGGGGGGCTGAGGCAGGAGAGTCGCTTGAACTCTGGAGGTGGAGGTTGCAGTGAGTCAAGATTGTGCCATTGCACTCTAGCCTGGGTGACAGAGTGAGACTCCATCTCAAAAAAAAAAAAAAGGAAATTTTTGTTGTAGGTAGGCAGAAGCAGAATGCATTTAAAAAGAAAAGATGATTTGGGATCCTTTATGAGTAATCCTAGGCTGGGTAGCAGAGTTGGTTTGAATGACCAAATAGTGACCAGAAGTTGGTGGCTGATGGGTATTAAGAAGGATGAGGGCCAGGTGAGGTGGCTTATGGTTGTAATCCTAACACTTTGGGAGGCAGAAGAAGAGGATTTCTTGAGGTCAGGAGTCCAAGACCAGCCAGGGCAACATAGCAAGACCCTATCTCTCAAAACAAAAAAAAAAGATGAGGTCAGAGCAATAGAGGTAAGTATTGGATTACAGGAAAAATGCCCGTGACCATGGTTTCACCCAGCTAATTCTGGCTGGTTCTTTTTCCATCTCCGTGCTTTTTATTGCTGACGTGTTAGACTTTCTTCTTTAGGGGCAGACCTCTAAGACTGTACCTCCATCAACTATACCCCACCCTTACTCTCTGATTGCACTTAAAAAGGTGATTCCAATGAAGCAAATGAAGCAAATCTTTTTTTTTTTTTTTTTTTGAGATGGAGTCTCGCTCTGTCATCCAGGCTGGAGTGCAGTGGCGTGATCTCGGCTCACTGCAAGCTCTGCCTTCCGGGTTCATGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCGGCTAATTTTTCATATTTTTAGTAGAGATGGGGTTCCACCGTGTTAGCCAGGATGGTCTCAATCTCCTTACCTTGTGATCCACCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGTGTGAGCCACTGTGTCCGGCCCAGGTTACTTTCAGTTATACAGCAGAACAGAAGCTCTTTTAGGTACTACAGGGTTATATATTTTCCCCGTTGCGTATATGCTCAACAGCTCGACATTGCATTGCCAGATAATTCTCAAACCTGTATTTAAGGAAAAGTGGATCAGCCACATCTTGGCAAAACTCACAATTTCAGTTTTGCTTTGTCTCATCCGTGTTATCAATCCACATATGCCAAATGTGGATTTACAGTGTATTGTAAACTTTAAAATGGTAAGTTGTATGGTATATGAATTATATCTCAATAAAAAAGAAATTGAGTAGAACTGTTTGACGTTAATGTCTAAATTATAATTAGACATTGGAAAGATAACTTTTAAAGTAACTATAGAAGCGTCATTAGACAGGGTCTGGCTCTGTCATCCAGGCAGGAGTGCAGTGGCTCAATCTTGGCTCACTGCAACCTCCACCTCCCAGGCTCAAGCCATCCTTCCACCTCAGCCTCCCCAGTAGCTATGACTACAGGCACGCACCACCAGCAGGACTAATTTTTGTATTTTTTTTGTAGAGATAGGGTTTCACCATGTTGCTTAGGCTGGTCGCAAACTCCTGAGCTCAAGCATTCTGCCTACCTCGGACTCCCCAAGTGCTGGGATTGATACACTTTTAATATTATGTCTGATAATTAGGAAATTTATCATGTTCACTGTATTGGATAATTGGATTACTTGATAATTTGAATTATTCTGATTTTAG|GTGAAATCTGTGACTGTTGCTGAAGGACTCATAGAAGACCATTTTGATGTCACTGTGAAGATGAGCACCTATCTGGTGGCCTTCATCATTTCAGATTTTGAGTCTGTCAGCAAGATAACCAAGAGTGGAGTCAAG|GTGAGCCTATGACTGTCACATATGGTGACCAGCTTGTTCTGGTTTGCTTGGAACTGGTTTTAAAACTGGAAGTCTGCCTGAGCGCAGTGGGTCGTGCGTGTAAACCCAACATAAACCCAACAGTTTGGGAGGCTGAGGTGGAAGAATCACTTGAGGCCAGGGGTTTGAGACCAGCCTGGACAAAATAGTGAGAACCTGTCTCTGCAAAAAATAAAATAAAAAAATTAGCCAGGCATGGTTCCTTGTGCCTGCTACTAGTCCTAGCTACTAGGGAGGATCCCTTGAGCCCAGGAGTTTGAGGCTTCAATGAGGTATGATTGTGCACTCCAGCCTGGGCAACAGAGCAAAACCATGTCTCTAAACAAACAAACAAAGACAAAACCAAATACCAAAATCCTGGAAGTCCTGCATCCTGGGAACCTTCTCAATCTCAGGCAAACTGGGATGGTTGGCCAGCCTGTTGTCACGGATGCTCATTTGTATAGTGAGGTTCTAATAACAACAACGTGGAGAGAGTGTGGCCTGGCCTGAGTCATGATCCTGGCTTCACTGCAGTCACTTCACTGACTCTCTGACCTTGGCCCTATTCCCTCTGAAACTTAGTATTTACTTCTTTGGAAGGTATAACTTGGACTAGATCCTGCAATGGTCTCTAAGGTTGCTTCTGGTTATGGCTTTCTGCAGTTTGGAAGTAAATGTTACTATCTGGCAGGGGATTTCTGGCTATGGTAAGGAAGATAGAGCAACCTGCTTGGAATACCCAAAGGCTTTGGGCCAGGTACACTGGAATGCTGGAGAGAAAAATCTTGTTTCAAGGCACACTTGTTCCTCATTTGGGTACTGTTGCATAGTGGGCAACCTATTCAACTGTGTGCCGTAGCTCAGAATGCAAACAGGTTTTTCTGAGGGGAGGAAGGGATGCTTTGTTTGAAGATACCTTATGTGTTTGTGCTGGTTTTCACTGAGGCCTGAATAGATGGGGATTCCCTGCTGAATTGCTTTGTGTTCCTCTAGTGCTGAGATTTCTTATTCTTGTGGAGGTATCTTTACTTAACTGGGGATTTGAAGGTGACGCTTGAGACTCGGATGAAGGGAACATTCTTAATTCAGCAGTGAAACTATCAGCAAAAACACCCGCCCATTGCTTTGCCACTTATCTGAATCTCTTAGAAATGATTATTTTAGTAATGTCTAATCTATATTAATATTTTTAATTCTTTCATTTCTTTAAACACATTAAGCATACAATTATATATCTGTGTCTGGTAATTGTTTTATCTGAATTCTTTGTGTATCTGATTTTGTGGTTCGTTGTTTCTGCTGGCTCTTGCTTATGGTATCTTGTTTCCTTGTTTGTATTATGAATTATGTTTGTGAGCTTACGTTGCCTGAGTCTAAAGTGGATTATTCCAGAGAGAAATTGTATTTGCTCTTACAGAGTGTCTGGGAGTACTTACTGGTCCAGGGATCACTTTACTTGTAGTTTCCTTGAGAAAGGGTAGTTATTTCTAGTTTACCTTTACATTAAAGGCCTGGCCTTTGGGTACTAGCTTTATGCAGGGATTGTATGTCCTGTTAGACTTTCTACTTTGGGCAGGCCCTGGACTTGGTCTCTTAACTCCTGAGTCCTTCAATGACATAAGAACCAAAGCTCAAGTCCAGCTGTGTTGGGCTAGTGCCGGCAGGGTTAAAGCTGGCTGCAGTGCTCTCCTGACATCAGAGGGTCTAACTGTCATTTCACTTTGGCTTCTAAATCTTTCTTTCTCATTTGCCATCTTATAAACACATTTAAGAACACTTTATACATGTTATCCAGCATTTGTTGTTGTTTTCAGAAGGGGGATTAATCAGGAACAGTCAGTATTAATGCAAGAAATGGAATTCCCAATTATTTTCTTTAATATTGGCAACCATATCCCACAATATGAAGACATTAATGTCAGTCTTCTACACAATGTGGGGAGAGAAGCCAGTTAAGATATTTGAATTCCTTTCTGTGCCTTTCTCTTTAG|GTTTCTGTTTATGCTGTGCCAGACAAGATAAATCAAGCAGATTATGCACTGGATGCTGCGGTGACTCTTCTAGAATTTTATGAGGATTATTTCAGCATACCGTATCCCCTACCCAAACAAG|GTAGAGATTTTGCACAGATATTACACATGACATTTGATGAACACAGTCATAGATTTGTCATTATAATTGGCACATCCCTGTAGTTGCCTCAGCAGCCCCTCAAGCCACAAAAACCCCAGCAAGTGACAAACCTGCGGTTGATCTTTCTGAGCATCTCCTCACCCTTGATGAGTACAGTAACTTCTAGTGATAGTGAAGAAAGCAGATCTTCATAGAGTTCTTGAGGCATATGGCATGGGGACTCTTTTGCCTTCTGATTTTATTAGTGGGCAGACAGCAGAGGGAAGAGGCTACATTTTTTCTTTACTGGCACCTGCTTGGCAGGAACCCAGAGGATGCTCAACAAACTGTTTTGAATGAATAAATTTATATAGTGTAAGACAATCTGAATTTTCTTTCTTACACAAGCCCTTAAAACTATTGTATGTTTGATTTTTTAGGTATATGTGGGTCTTGGGCATCCAAAATAGAATGGATTATTATAATTGTTTAGTTTTTCAATTTCCAAACTCCTCATCAGAAGGTTAGAAATGGAGTCAAGAGGCCTGAAAAGGCGGGCACGGTGGCTCATGTCTTGTAATCTCAGCACTTTGGGAGGCCCAGGCAGGTGGATTACTTGAGCCCAGGAGTTTGAGGCCAGCCTGGGCAACATGGCAAAACCCTGTTTCCACAAAAGTACAAAAATATTAGCTGAGTGTGGAGGTGCACTCTTGTAGTCCCAGCTACTTGCAAGGCTGAGATGGGAGGATCACCTGAGCTTGGGAGGTTGAGGCTGCAGTGAGGTGTAATTATACCACTGCAAATGCACTCCAGTATGGGTGACAGAGTGGGACCTTGTTTCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGCCTGAGATGCAAGTCTGACTTTGCCACATTTATAGCAAAGTGATGCTGAGTCACTAAGCCTCTTTTTCTCCATTGTGAAAGGTTCTTCCAGTCTAGAGCTCCATGATTATGCACGGGCAGCTGGTTCAAACACCTATCCATTCTGAAGATTAGTGTTTGGGAGAATGTATAGCTTAGAGACTGGTAATGTATTTTATTACTTCCTTCCCAAG|ATCTTGCTGCTATTCCCGACTTTCAGTCTGGTGCTATGGAAAACTGGGGACTGACAACATATAGAGAATCTGCTCTGTTGTTTGATGCAGAAAAGTCTTCTGCATCAAGTAAGCTTGGCATCACAATGACTGTGGCCCATGAACTGGCTCACCAG|GTATAAGCTCATTCACACTTTTAATAAAGTATAAACTACATTTATATTGCTTCTATGGGACATATAAGGCTATTTATATAATTTTTACTTTGTCTTTTTTTAATAGGAAAAATTGTTTCTCCAAAGCATTCGTTTTTATGTCTTATAATGCATGTTGAACTTTTTTTATTTTTACCTTGATTAAATATTGGTCCTGTAAATATATGTTAACATTCATAAACTTATATTGGACATCTAAAATATACTTCTTTCTGAGTGTCTTTGTTTATGGCTTATGTTGTGCTTTTAG|TGGTTTGGGAACCTGGTCACTATGGAATGGTGGAATGATCTTTGGCTAAATGAAGGATTTGCCAAATTTATGGAGTTTGTGTCTGTCAGTGTGACCCATCCTGAACTGAAAGTT|GTAAGTAGTTATTTATCCTTCACATTTGAGGTTAATTTGTTGTTTTGTTCAATATTGCTGGAAAATATTCACTAATCTTTGATTATAGAACTTATAAAAATATTTCACTGATAACTTCCTTGACAGTTTAGATATGAATCGTGTTGCAAAAATGCTAGTGAACTTTCAAGATATACCACAAAGACTTATTTACAAACTCAGTTGTGAATCTATGATTGTTTGTAAATTGTCCGATTTTATATTTCTTAATATCAAAGAAATAGAAATAATGGCAGCCTAAATGTTCCATTTCATTTTCCCAACCTTCAGGTTGCTCATTGCAGAATTATTAGGTACAGATTACTGATATCTCAATAGGACCCTACTTGCCAAATCAAATGAGTTATACTTAAGGTAACTGCACATTTGATTGTATAACAACCTAGGCTTCTGGATCACAAATTTATTGCTTTGGAACTGCTATATGGTTTATTTTTTAAATCACCACATTTAACTTAAGAAAATCACATATAGCAAAATTAAACTTATGTAAAAATTTTTTTATCCTACTCCAATGATTCTTTTCATTTGTAAATATTTATTCCTATTTTCATCTGCCTGTATACATTATTTAAAATATACATTTGTGAATATTTATTACTGCCTATTTTCATCTGCCTGTGTACATAATTAGAAAGTACACATCAATATTGCATTATAATCCTAAATATTTTCTTATGTTTCTACATGATCTTTAGTAATAAAAATGATTATGGATACATATTGTCCTGTTGAGCTAATGTGCTCTAATAAAGCTGGTTAAGGTTCTCCATTTATTTTCATTATGTTTTTAAAGGTAAGTATCTTTGCTCTAAAACAGTACAGACAATGATTGGAAATGTTGAAATTACTATACAGTTAATTTCTTGTTGTGTTGCTGTTTGGCTATAGGCATAATTGTTTCGTTTTTAGATTAATACGAAATTTTCTTTATTCTAAAGGACTTAAGATGAACAAAATCTATAATGTTTAACTAAAATCATTGTTTCTTGGGTAGCTTTTAGAATATATTAATTCTATTTTGTTTCATAAATATTAGGAAATATGGAATAGGTTGCCCCCAAAATGTGAAGTATGGACTTCTTGCCTCAGATAAAATGTCCCACCTTTGACATTTTTTATCTAAATGTAAATCATAGGTGATGTTTTCTTTTTTCTATCTCAATAG|GGAGATTATTTCTTTGGCAAATGTTTTGACGCAATGGAGGTAGATGCTTTAAATTCCTCACACCCTGTGTCTACACCTGTGGAAAATCCTGCTCAGATCCGGGAGATGTTTGATGATGTTTCTTATGATAAG|GTAAAAGTAGATTGAGTATAAGGATACAGTTTAGATACTAAAGTTATACATACTGGGGTGGAGAAGTTATAGGCAAGGTTGTGGGGTTAAACCCAGATTGAATGCCTTCTCTCTTGACACGTGCTGGCTGGAGTCAACTCTTTTAGGACTAACTTGCAGTTTGGCTCATAACCCTAAAGATTATTTTATGGGAAATTCTTCATATATTCTTTCTTGGGTTGAAAATTCATGGCTTACAGAAACTCTGCTTTTATTCATCATTCAACAAATAGGTATTGATTGAGCATCAACTTTGTACTAGACAAAAATTCCTGCCCTCGTGTAGCTTACTTTTCAAGGCCTTCAGCAATGGTTAATATTGTTGAGACGCAAATAATTGTCTTGCACAGTGTGCTTAGTAACAGAGTTGGGAATTGTTGACAACAGTTTAATGATTGGGAGATTTTATGTAAAATCCAGATTTCTAGCTTCTCTTGGGGAAGAAAAAGGAGGATTTGGCCACTGTATGTTCTTTGCAGCATAAGCTGGAGCTAAGTTGTTGTTCTCTCTTTGTAAGATCAAGGCTCTGCTTTTCCACTTTCCCTGCCATTCCCAACTGTTCTATTGTCTTCTCACCGAGGCTGAGAGTGTGTTGCCAGTTACCATTGTGCTTGGCTGTTGTTTTACCGGTAGCAAACAGAAAAGTCTTTCTTGCTTGCATGTTTCCATTGAAAGTCCAGGGGAAAAAGAATGTAAAAGAGCATTCTTCTTATCCTTGGACTACTTCCCTTATTTATATGCCCTGTCATGTGCCACTGGAGGCATTTGAGTTTGTGACTCACCATCCGTGGTAATGGGAGTGGAGGGGAAAAGAGCCCTTTACCAAGGAATACAGGGTGTCTGGGAAGACTCTTGTTCCCTTTCTCATTGTGACTCCAGCTCCATCAGCCCTCCATGCTCAAGGCTGCCTGGGCTCCCTGGACATATCCACTTTTCCTTCCCTGGCATCTACCTCTGCCTCCATCTCTAGTGCTCCACCCCTTGTTGTACTGGCCTCTCCTTAGTCCTGCCCTGGAATGGCAGTGGGAGAGCCAGGTAGTAGCTCAAGGTCCAATGTTTAATCTGCACCATTATCCCCACTCACATGTGAACAAAGGGAGTTGGCAGATGATGCTAATTTGCCCCATCGGGAGGTCTGGCTACTGATAGAAAATAAGGGCCTCAGTGGGCTCAGAGCATAAGCAATCACATTAGACAAATCTCCTGCCTAAACAGGTCCAGGTTTAACCTGCTTACTCTGTTTCACAAATTGCCAGACATTAACAGTGTTCCTGCAGTTGCGTTTTCAAAGAAATGTGTTTTATTGCAAAAGAATATGTGATTTCAGATGAGACTGCAATGAAACTATAGATAACAATTATTTCTATTATCTTTTCAG|GGAGCTTGTATTCTGAATATGCTAAGGGAGTATCTTAGTGCTGACGCATTTAAAAGTGGTATTGTACAGTATCTCCAGAAGCATAGCTATAAAAATACAAAAAACGAGGACCTGTGGGATAGTATGGCAAGT|GTGAGTATGTTTTTGAATATCTCTGCATTTGGGATTGACAGGCTTATCATCTTGTTTTGTTTTCCCTGCATTATGTTAATCCCTCTGAGGAGAATCATTGTTTTCTATAGAAATAAGAGTGATGTGTTTATTTTTGGTTTTTAG|ATTTGCCCTACAGATGGTGTAAAAGGGATGGATGGCTTTTGCTCTAGAAGTCAACATTCATCTTCATCCTCA|GTAAGTTTCTATATCTGTACATGTTCCCCCAAGCACATTCTTTTACTGCATATTCTTTGAAAGGCAGCTCTGTGCCAAACTTTCTGAGGTCCTTGATTATATCACCCTCATTCCAGATAAGACTGCATTTAAACTATTCCATACTCATAATCTTTTTCAATTTTTCTTAAAGTGTATCTATACTGGAGGGTTGCAGAGCTTTCCTTGGTAATGCTTCTCACTGATACTAATTTCTCTAGCTTCCCTTTTAAAGCAGTGGATTTATGACATGTTTCTATAGCAGATTACAGCTGCATTGTAGCAGTCAAAAGGATATGTCAGTCATTTACAGAGCTCTGCATTTGTACAAAGACAATGGCACTGAGCATTCTTGAATACTTGTCATGTGTCAGGCACATGTTAAGCACTTATATGTATTATCTCCTTTACTCTTCATAATAAACCTGTGAGCTGGGTACTATTACTATCCCTATTTTTAAAGTTGAGAACATGAAATACAAAGACATCTGATTGGTAAATTGCAGAGCAGAGATCTGAATTTAGATCTAACTCATGTTTTTAACTGCTAAGCTATAAAGTATTCATAACATCAAGTCACAAAACAGCCTGAGTCTCTGTTCATCTGGACTTGTGGGATGTTTTCAGAGGAAGCTGAGGGTGAGTCTGGAGATACAACAGAATTTTTTGTTTCTTTTTTCTTTTTATATGTTTACTTTCTTGGCTTTTTTGTTCCTAGAGACTGCTTTATTCAGTAGTTTCTAGATTTGTCCTGTGGCACATTTTATCTTTTTACTAATTCTTTTTTTTTTGAGACAGAGTCTCACTCTGTTGCCCAGGCTGGAGTGCAGTGGCGTGATCTCGGCTCACTGTAGCCTCTGCCTCCCCGGTTCCAGCGATTCTCTTGCCTCAGCCTCCCAGGTAGCCGGGATTACAGGCACACGCCACTACACCCGGTTAATTTTTGTATTCTTAGTAGAGATGAGGTTTCACCATGTTCGCCAGGCTGGTCTTGAACTCCTGACCTCAGGTGATCCACCCAACTCAGACTCCGAAAGTGCTAGGATTACAGGCATGAGCCACCGCGCCCAGCCTTTACTACTTCTTAAATCATTTCTTAAAAGCTTTTTTAGTATCAAAAAACAGCTCCTTTTGAGTTCCCACTATTTGTTGAGTGTGGGTCATCTCTGTGTTCTCACTTTAGAAACACAGGCTCTTGACTGAACATTGTTTCCACCTTGCTTGTCCAAAACCAGCATAGTTAGGTAGGTATTGAAAACCTGGCACTTTCTCTCCCTTCTCTTCCTTCATTCATTCACATGCCTGCTTTGTGCCCAGTGTTATTGCCAGCCCCAAAGTGTGCCCGGCAGAACTAGGTATGCTACCTGTCCTCCAGACACGTATGTGTAGTGGAGGAAAATGACAAGCAAACAGCTGGTGCTCTAATGGAGGTGTGATGGCTGGGGACATCACACTGAGAAAGGGATCAGAAAGTGCTCCTAAGAGGGGAGGTTGGCAACTGGTAGCGGCTGTTTGAGCATAAGCTGGTGTTCTCTCTTTTCTTAGAGTTGGGTTAAATGGGTGATGTGTCTGCCTTTTTGTGTACACACCAG|CATTGGCATCAGGAAGGGGTGGATGTGAAAACCATGATGAACACTTGGACACTGCAGAAGGGTTTTCCCCTAATAACCATCACAGTGAGGGGGAGGAATGTACACATGAAGCAAGAGCACTACATGAAGGGCTCTGACGGCGCCCCGGACACTGG|GTAATGCTCCTAGAGTAAAATTTGTTTTGTTGTCTAGGTAACATCTGCCTTGTAGGATGGAACCTTGCTTTTGAAATAATGCCCTTACCACTATTGCTAAAATATTTCAGCTGCATCTGTGTATCCTATGAAGTTGACTTATACTCCCTGCCCCCATCTTCCCAGTAGGATTAAGGAGGCTTTAAACCTTGGTTATTCTCAGTAAAGGTGACGATGTAATTACTTTAACATTCTCATATTTTGTAATTTGATATGATGGTAATTTCTGGTTACTGGCTTGAAATCAACTCCAACCTAAGCAACTGCTACTAGATTACAATAGTGCCTAGCATTTGGTTGGAGCTGAGGACAAAAGAATTTAGGTGATTTCCGAGAATGATGAGAGACTCAGTTGTCTTCTTCTGAGTTAGATTTGGAACCTGTTTGTCAGCTTAATGCTATAGAAGAATATTATTAGAAACAAGATGCTGCAACTTGATTGACCCTGGATGGATACCGTTAAAAAATTCTTTTATCTTGAAACAATTTCAGACTTATAGAGAAGTTACAAGAATAACACAACAAATTCCTATATATCCTTTACCTAGATACATGTTAACTCTTTACTCCTTTACTTTCTTCACCTCCCTCTCTCTCTTGCTCTCTCTCTCTCTCTATATATATATGTATATGTTTGTGTGGATATATGTGTGTGTGTATATATACATATATATGTGTATATATATACACACACATATATATATATAATATATACACCTCCCTCCCCCATCTTCCCAAAAGGATTAAGGAGGTTTTAAATCTTGATTAATCGATATCCATCTATCTACATACATACCTATATACATATGTCTGTATGTATATACTTGGCATATGTTTTTCTGAACTGTTTGACAGTAAATTGGTGACATGATTCCCCTTCACACCTAAATGTTTCAGCATAGTATTTTCTACAAAACAACATTTATGGCCGGTCACAGTGGCTCATGCCTTTAATACCAGGACTTTGGGAAGCCGAGGCAGGAGGATCACCTGAGGTTAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACACCATCTCTACTAAAAATACAAAAATTAGCCGGGCATGGTGGCGGGCGCCTGTAGTCCCAGCTACTCAGGAGGGTGAGGCAGGAGAATCGCTTGTACCCAGGAGGTGGAGGTTGCAGTGAGCCGAGATCGCACCACTGCACTCCAGCCTGGACAACAAGAGCGAAACTCCATCTCAAAACAAACAAACAAAAAACAAGAACATTCAGTTATATAACCATAGCCACAGTACAATTATCAAAATTGGGAAATAAACTTTATATAATACTATTATCTATAAATCCCATTCAACTTTTTCCAATTGTCCTAAAAATATCCTTATAGCAAAATAGAAAATAAATTCTGACTTAGGATTTAATACAGGATCACTTATTTTATTTAGTTGTTGCATCATTTTGGTCTCCATTTTTTTCTGAAACAACTCTTCAGTCTTTTTTGGGCTTCTCGATATTGACATTTTTTGAAGAGTACTGGCCAGTTATTTTATAGAATGCCTTTCAGTTTGGATTTTCTGCTGTTTCTCCATGATCAGGTTCAGCTTACATATCATTGACAGGAATCCCAAGGAGGTGATATTGTGTCCTTCCTAATATACTATTCAGGAGGCACATGAGTCCTGAAGTTAGGCCTCTGAGATGCAATCAGCCAATCACTGTAAAAAGGCTTTCAGGGTTCACGTGAGTCCAAATTCTTAGCTCTACTCAAGGAGTCCAAGAAAAGGAAATAGTGCCCGAGCTGACACTCCCTACAAAGCTGGAAAACAGACTGGGGATGTTTTGAGAGCTTGGCTGTAGCAGTTTTGGTACTATCTTGTTCTAAACTAACCAGTGCCTGGCACATGATAGGTGATTTAATAACTGCTTGTTGAATTGAATCAACAAATGAACAATCCACCTCTCCTTTATTTTAG|GTACCTGTGGCATGTTCCATTGACATTCATCACCAGCAAATCCGACATGGTCCATCGATTTTTGCTAAAAACAAAAACAG|GTAATTTATTTTGGAAACTACTAGTTAATTCAAGGAGGAATGTGAAAATGTGTAGGATTAAAGTGCTGTTTCTTTTGCTATTTATCTTGGTTAAAGCAAAAGGGATCAGATTGAAAATGCTTTCTGGTTTCAAAGAAGAATGGCCAATATCTTGCCAGATAGAGATTATAGCTGAATTATATAATGCTAAAAAGGAGGGATTACCAGTGAACATTCACTATTACTTTGTTAACATTACTGGGTTATATCATATGTATGGCCAGAGATGTAGTTGCCATTCTTTTCATAGATCTGTAAATCTCACAAAATTGATGTCTAAAGCTAGTGTTAGCTTTTGTCTATTGAGTTGCTGTTTTCTGGTTTCTGAGAGAAATAAGTGATGTTTTCAAATTTTTCAGTATTAAACTCTTCTCTAATTTCCTTGCCCAATCTTGGATGAGAAAAGCTTTCTTCAGGCAGGGGAGCAAATGGAACTTTGATTTATTTTATTTACTCTAG|ATGTGCTCATCCTCCCAGAAGAGGTGGAATGGATCAAATTTAATGTGGGCATGAATGGCTATTACATTGTGCATTACGAGGATGATGGATGGGACTCTTTGACTGGCCTTTTAAAAGGAACACACACAGCAGTCAGCAGTAATGATCGGGCGAGTCTCATTAACAATGCATTTCAGCTCGTCAG|GTAATACACGCTGCACAAAGTCGCGGTTTATTTCTGAAAGCAGCTGTTATTGTTCAAATTCTTGATTTCTAAAGACAAAAATGATTGATTGATAACAAGAAGTTGAAAGGTGTTTTCCTTTAAGTTCTTCTAACAACCCAAGATTGCTTTTAGTCTTATAATTAACCTCCTGTCTGTGACTACTAAGGCACTTGAAGGAAGGAATCTGTATCTTAATCTTTCTCATTAATAATTCAGATCATACTGAGGCGGAAGTGTAGATAACCAGGAGATCAACATCCTGGCGAAACTCCTTGCCTTGTCCCCGCGGCATGGTCCCGCAGCTTCTTCTGGCCCCGCTATACCGGATCAGGTTTCTCCCACCACTGGGTCTTTATAACAGCCTTGAAGTCTTCTTAGAAGACTAAGTGACACAGGCCCAAGTTGTTTGCATTACATTTCTTGGGTTAAAGAAGGGATTTTTTTTTCTTTCCTAAACCAGATACCTGAACTTTGCAGCATCTTTATGGAATATAGCTCATAAAATGTAGCCCAACCAAATGATCTCTGAGTGTGTGAAGACAGAATTAACACTACCCTTTTTTTTTCCTCCCCCATGACAATGGTTTTTAAGGAAATGCTCCCAAAGCCTAAAACTCAAACTTCTTCAACATGTAGGCAGACCTAAAGTCCTAAGAGGAAGCATGCAGGTGGGAAGGATTATCTCTTTCTCCTCATACCCGTATCCTTCTGGCCTAAATTTTGAGTGCCTTCTGGTTCTTCTCACCACCATACTGGGCCTCCTGAAGTGAGAAAAAGCAATGGGGGAGAAAGTTATGGGTTGCTTACCTAGCTGTTTCTTTCTGCCTAAAAAATTCCCCTCTGTAGAATATTTCTGCTTGAGCCTTAGAGGGCTTCTTCTTTTTTATTTTTTAACTTATATAATATATACCCTACCACTGAGGTTTTTAACACTAACCCTCTGTAATGAAGGGTTTCTAAGGATATGCAGTGTCTTTGAAATGGAGAAGAAAATGTGTTTCCTCCAGGAACTCATGGTTTTGACCAGATGATAGTCTGTTTGACCAGGCTTCCTAAAGGGTCTTTGATGGAAATCCCTTGCTTACTCTAGAAGTTCACCACTCAATCCAGTGTTTCACAGGGATAGATGCTGATGTGCTTCCTTTCCGGCTTATCCACGTTAGCCCTAGACGTTACTGTTTGAGAAACTTCCTCCCTAGTGTAAACATGCCGGATGTTGTGTTATATGTGATAGAAAAGAGACTTATATAGTAGTTATCTATCTGTTTGTTTATTTATTTTTTGGAGATGGAGTCTCACTCTGTCGCCCAGGCTGGAGTGCAGTGGTGCCATCTCAGCTCACCGCCTCTGCCTCCCGGGTTCAAGTGATTCTCCGGCCTCAGCCTCCCACATACCAAGTACCTGCCACCATGCCTGCCTATTTTATGTATTTTTAGTAGAGACGGGGTTTCACCATGTTGGCCAGGCTGGTCTTGAACTCCCGACCTCAGGTGATCCACCTGCCTTGGCCTCCTAAAGTGCTGGGATTACAGGCATGAGCCACCGCGCCCGGCCGTAGTTATCTATTTATAAGGGACATTTATATCTGATTGCTTGTTTTGCTAGAAGATTAAAATATATTTTGAAGCAAAAAGTTGACTTCCCGGATTTACTTTTAAACCACTAACCACAGTGTCTCTTGGTCAG|CATTGGGAAGCTGTCCATTGAAAAGGCCTTGGATTTATCCCTGTACTTGAAACATGAAACTGAAATTATGCCCGTGTTTCAAGGTTTGAATGAGCTGATTCCTATGTATAAGTTAATGGAGAAAAGAGATATGAATGAAGTGGAAACTCAATTCAAG|GTAAAAGCCTGAAATAAAAGTTATGTAATTATTATTTGTGTTAAAAAGTGTTAATCATTGTGTGTGTATGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTATCTTTATATACATATTAAGGAAAAGCAAATAAATATTAGTTCAGTATTTAGATGGACTAGCAAGATTTTGGTTTTATTTGGCACTAAAAATAGGAATGGCATTCTTAGACTCTATTTAATTTGCATGGATTTGTTATTTCCTTCCTTTCACAAACCTTTTATTTCATTTTCCTTTCCTATTACAATGGCTGACGTTCAGATTTTCTGGGACTCTTCATGGTACTTGAGGAAGAGGCACAAAATTGTTATGCTTGGCAAAATGTCCTGAAGTCTTGTTGCATAATTTGCTCTCAAAATTTAGTCATGTAAACTCTCTTTGAGTTTGTTTTAAAAGTCGCAATTCATCCTGACTTTAAGTAGATGGATATCTTATAAAAGTTTGTTATGAAAATATGAAAGTCATTCATCATTATTTATTGTTTTTGGCCCTTTTATAATTAAACAGTGCTGCATGATTTATACAGTAAAAGTTAGATTATGCTTTAAAAATTAGCCCCCATCATCTGAGACCATAATGGATTATATTAACATGAAATGACCTGTGACAATACTGGTCCCTGTTTCCCTGTACAACGCCCTCAG|GCCTTCCTCATCAGGCTGCTAAGGGACCTCATTGATAAGCAGACATGGACAGACGAGGGCTCAGTCTCAGAGCGAATGCTGCGGAGTCAACTACTACTCCTCGCCTGTGTGCACAACTATCAGCCGTGCGTACAGAGGGCAGAAGGCTATTTCAGAAAGTGGAAGGAATCCAATGGAAACTTGAG|GTCAGTCCTTACTAAATAACCAATTTGTTGATGTGAAGGGCATCTTTTCTGTTTTCCATCATTGGTACTAAACATTAGGGAAAACAAAAAGTTTAAGTGTCTCCCCTGCTGCCCTTTTCTGGAAAATAAATTGCTTTTTAAGATTTATCTATGTATCTCGTAACTTTAAGAAATGCTAGGAGGGAACTTCTATGCATAAAAGTCAAATATCTGGGGAGTTAGGATGGTATGGGAATAATTCTCATTTTGTATAGGCAATGCAAAATCTTATTAAACGGTGACAGCCATGCCATAGAGAAACACATGCATTGTATTTTATAGTAGTTTCTTTGGGCTTCCAAATTCCCTGACAGGGATGCATAGTAATTGCTGACACTGTCTAAGCAAGGTAGGATTTTACCGTATCTGGAAATCCCTATTCTTGCTACACTTCAGTCCTTTTTACATTTGGAGCTTAAACCCCACCCAGGAAACATTTATGCCAACAACCTACATGCTTCAGCTTAAAGAAATCCAAACAGTGGTCCTACTGCCGTCCTGACTTGTGATCATGGTGTAAATTTTGAATATAGTTTGAATTTCTTTTGGCTTGAAAGGTGATCCCCTATGGAGTCACCCAGCATCATTAAAGTATTTAAATATGTAGGTATTTATAAAAATGGCATTTCACATTTTTGAGAAGGCTACTTAAAACCTAATTTTAGATATTTTTCTCTTGCCTTTTTTTTATAAAGTGTAACTAGGCCAGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGTGGGTGGATCACGAGGTCAGGGGATCAAGACCATCCTGGCTAAGACGGTGAAACCCCGTCTCTACTAAAATACAAAAAATGAGCCGGGCATGGTGGCGGGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGTAGGAGAATGGCGTGAACCCGGGAGGGCGGAGCTTGCAGGAACCGATATCGAGCCACGGCACTCCAGCCTGGGTGACAGAGCAAGACTCCGTCCTAAAAAAAAAAAAAAAAAAAATTGTAACTGAAGGAAATCAGTAGTTTTTGTTTATATTTCCAGGCATGCTTCTCTGATCATGGGGAAAGTTCAGCAGTTCATAGTTAGTCACCATTATGCCTCTAAACATACTTACTGCACCTCAGATTCTCACCGAGTGTGTTCCTTTCTATAG|CCTGCCTGTCGACGTGACCTTGGCAGTGTTTGCTGTGGGGGCCCAGAGCACAGAAGGCTGGGATTTTCTTTATAGTAAATATCAGTTTTCTTTGTCCAGTACTGAGAAAAGCCAAATTGAATTTGCCCTCTGCAGAACCCAAAATAAGGAAAAGCTTCAATG|GTGAGTCCGTCATTCATTCATGTTCATGTGGCCAAGGGAAATTAGATTAGATTAGATCATTCTGGCATCTATTTTTGTTTTCTTGGCCAGGAATTGCCTATCCTGCTGGGAACATACTGCAAGTCAGCACACATCTAATGAGAAAGGCAAATAATTAGGGAGAACCAGGTGTTAGGAAAAATATTTAAAGGTGTATACATTTGTGCATCTTTAAAATGACAACATATTAAAAATATTTACTAGAGCTACTCTTCCTTAACTATTCTTTTGAGCAAATGAGAGGGTATATGAGAATGGGTGTTATGAACACTATATAAATGTTAGTTTTTATTATGTAGCTATAGTGGGTTTATCAGGATTGTGGTTACCCATTTATCTTTTTAACCTTTGTTACTGGCAGACTTTTTTTTCTTTTACAAGTAATTTTATTGCAGTGATTCATAACTGTTTCATCACCTATTCATTGCTAACATTTCTTTTCTTAATCCTTCTAG|GCTACTAGATGAAAGCTTTAAGGGAGATAAAATAAAAACTCAGGAGTTTCCACAAATTCTTACACTCATTGGCAGGAACCCAGTAGGATACCCACTGGCCTGGCAATTTCTGAGGAAAAACTGGAACAAACTTGTACAAAA|GTAAGTGGTGCCAAAAATTGTGCTGTGACTGGATAAGTTCATAACCTTACTGTGTTTTAGCCTTGCTGTTTGTAAAAGAACAGTAACAGTCTAAAGGTACTTTTTGATTGAAGATAGGCAGTAGAAATACCTAAAATATTTGTAGAAAACATAAAACTGGACTTCAGTGCTAACTAGTGAATCTGGACAGGGATGTTTTCCATTCCATCTGGCATAACCCCTTCCTGAGCCCATGGACATATCTGAAGCCTTCCTCCTCACAGTTCAGCCCAGGCCTTCCATGAACACATTTGCTTGTTCACATCTGTCTTTGTCTAACTCTTATAGCATTTCCTGCTTCTGTCATTTTCTGTTGGATACTTAACCTTTTATTAGGCTGTTGGTGTGTATTATTCTTTACAGCTAGATCTTAACCCATTGGATAGACATCATATTTTGTATTTTTCACACCGATCAGTTTTTAGCTGAAAGCTATTATATATAGGAGGCCCTTAAAATATATGTTAAATGAATAAGTATTTCACAACCCGTTTTTGAATATTTCCCTCTCTAG|GTTTGAACTTGGCTCATCTTCCATAGCCCACATGGTAATGGGTACAACAAATCAATTCTCCACAAGAACACGGCTTGAAGAG|GTAAAAAAAAAAAATCTATATATATATTTTTAAACATAAATGAAAATTAGCTAATTAATATGGGGTAGACAAAATACTTTGAGGGTGTGGTGAGTTAGAAATGGATTGTCATTTAGAAGTTATTTTTGGATGCTATGGTGTTGACAGCAGCATAAATCAGTTGCAATTAAACTAGTGAAAACTGTGCCTCTTCTCAGACACGTTAAGAGGTCTTAGCTCTGCCAGTAAAAACCTTAGGACTTGAAGAAAATTACTTGAGACAATTAGTCCTTGTTTAAGGATGTTAAAAGTGGGCACTGAGGTATAAATGACTGAAGTGGTCATCCAGCTGTTGCTAAAGAGGCAGGTCCGGAATCTAGGTTCTTTATAAACCATCCAAGTTCATGGTTCCCTGTCACTTATTATGTAACCATCTACCTTCAGAAAGAATTTCAGGCAGCTTTAAGATCCATGTATAAAAGAAATGTTAAAATGAAGGACAAAAAGATACATAATGGACAGGTGATAACCATGTGAGGGACTTATGGCTGAGGATAATTCTCGCAATTGTCCCCTGAATTTATCACAGACATTCCTGGAAGTCAAGGAAAAATGAGGAATCAGTATGAGTTATTCTCATTGTCTGGTAAAAGAGAACATGAAGCATACACATTTTCTACAAAAGCAGACTTTTCTTCAGTCCTAAACTCGAGGACTTTGATGTGTGGGCCATTGAGTTCTGTGGTGTCCCTTCTATAATAAGGTTTCCTATAGTAAAGTTGTGGAACAAATTTTACAAGGCTCTTAAACTGTGGTACATTTAGATAAAAGCTGAGAGGCTAATATTAATTTTTTTAGGAGTGGTTCCATTGTGTAGTCTGGTTACACTATTTTCTAGCTTGAGATGGGGAGAGAGCTTTGGAAATGAAAAAGAATGAAGGGTTGTTACAGTCTCGTAGCTTTTTTCCAGTTTTCAGTAGCCTCATCCAGGCTTGTAAAATTAACTTGCATATAATAATTCAAGCTTGAACTCCAGCGAGGTCTAGAGCAGAGATACTCTGTCTGATGACTGAAGAGATGTCCAAAGCCTTGACCAGAAAGGTAGTCATCTATAGACAAGATCTATAGCAACAAAGCATTTCATTCATTCACTTGTATTCTCATTAATTCATTCAAGACATATATGCCACACATGGTTTTAGGTTATGGGGCTGTAACAGTGAACAGACAGACAAAACCTTTGCTGTCACGGAGCTGAAATTTCTGCTGTGGGAATCAGACAATTACAGACAAATCAACCAATGTCAAGTAGTTACATGCTCTGAAGATGAATAAGGCAAGGAGAGTGATTGGGATAGAGGGTTGGTGCTGTTTTATAGGATGATTAGGAGACATTTGAGAAGATATCGAAGGAAATGCCAACATCTAGAGGAGTATTGTGTGCAGAGGAAATATCAGGTGCGAAGATCCTGAAGCAGGGCATGAAAAGTAAAGAGCCCGAGATGTTATCATAATGGAGAGCAAACTACGGACATATAAAATACGGGAGTCATCCTGAATCCTTTCTTCCCCAGACATCTAACTACGCAAGTCCTGCCGGTCTTCCCTCAGCACATGCACTGGATCTTGCGCACTTCTCTCTGCTCTTACCCCCATTCAATCCTTTCTCATCTCTTGCCTTGCCTATTGCAATAGTGTCCTAACTGGCCTCCCCCTTGCCACTTTGCCAATAGCAAAACATTTTCCACATTGAAGACTGCAAGTCAGCCCATAACTGACCCTGGCTTCTTGAAACCCTGCCTATTGTTCTTGCAGTAAATTCCAAATCCCTCCTTCCCATGTGTCCATGACCTTGAAGACCTGGCTTCTGAGGACCTCTCTGGTGCCAATTCCCACAGGCGCTCCTGGCTTACTCTCTCATAACACTGTAGCCACTCAGCCTCCTTCTGCTCCTGGGCTACCACATGTTTCTCCTGTCTTTAAGCCTCTGCACTTTTTATTCTCTCTGCCTGGCTTGCCCATTCCCCAACTTTTCACTAGAAAGGGGCATTTTATCTTTGAGAGGTCTCGGCTCAGATCACCTTCCCCAAAACCATGCTGTTCAATGCAGGCCACATCTCCTCAGTTACTAGCATACCCCTGGCATATTTTCTTCATAGCAGTTTATTTACAGTAATCATATTTACTGCTTGTATACCCCAAAAGACTGTAAGCTCCATGAGGGGAGGATGCTTGCCTGATTCACTGCTGCATCCTCAGTGTCTAGCACAGTAATTACAGCAGAGGGATGAATAAATATCAGAGTAAATATTGGTGAACAAATGGATGAGGTTAAGTCCCTGATACATGCTGGGTTTGGGGCTTTGCACTTTACCTGCATTATATAACTACATCCTTCACCACTGCTCTTCATTTTACAAATGAGGAGACTGGCTTCCATAGAAGTTAAATCATTTGCTCAAATCAAGTTAGTAGGATAAACCTGTTTTCCTATCCCTGTATTGTGCTACTTATTGCACAGAAATTGTTCTTAAAGAGCCAAGTCTGAATCAAATATTCAGTGGAGATGTTGCATTTCCAAGGCAGGTGAAGACAGAAGAGATGATTTTGGGTCAGGACAAGGGTAAGAGTAATGGTTAACAGCTGGCTTCGCTGTTGTAGAAGGTCTGTTTTTAGTGGAGCTATCATGATGAGCTCCTTTAGGATTAATGATTTGAACAATTAGTAAAATTATTGATGTGCTTTGTTGAGTAGTGCTCAGTGATTTTTAATCGACTGTGAATTAATCTTGCATTCTGAGAGCGTATGAAACAGTAGGTTGAGCTACAACTTAAAGTAGAGAGTGAAGTGTTTTCCAGGTTGCTACAGCAGGAGGCCATGCCTTCTGCTCATTGTAAAGTTGCACTGTATTGACATATAATTTTAAAACAACTCTGCATCATTTAAAATTGAATTCTGATCTTTCTAAAACCCATTCCTACTCCCCTCTTTATTCCCAAACTAATAATATGGTATTATGACATGGATTTCTAAGGAACTGGCTGTCTGGAATCTATGCTGAATAAATAATACATCATGGTCTACATTCGCTTCCTGAATACCAAATCAAAAATTGATGGATTAATGCTGTGAAAATTTATGGGAAAAGGATAATAACCCTTTAAGGTGAAACAGAATCGCACAATGGTCAATCTTGTGGCAAAGCCAGCCTATCTGTCATGTGAGCTCAAGGGGTGAATTAAGATACCCACAAATGAAACAAGAACAAACATATTTTTCAGGAGGTAGCCAAGAGTTTCTTATCTCTTTAAAATCATGGGGAACTTTATGTGGATTTTATCTTGAGACAACAATACATGAATTGTAGATTAGGATAAAAAAAATGGCAAGGTTTGGGTCTTACCGCCATAACTTGCTAAGAATCCCATTCCCCACAGTGTTTCTATAATAACATACAAGGCCAGGCCCATGGAGAAAGACAGCACTTACGTGGTGGAAACTGTTTTGCTTGGCAAAGAAAAGACTCTGCACATTCTGCTTTTTAGATATCATGTTTTTAAAAAACGAAGTTTGCATCTGTGAGAACAGAACAGAATAACACATTAAATAGACACAATTAAACCTTAATTATAAAATGGTAAACAGTGAAGTCTTTTAAAGTCTGGCAACTGAGAATAAACAAAAGAACTGCAGACAAAAAACAGAACTTGACATCATGAGGCATGAGCTCATTTCATACAGCTTATGTGTACATAATCCTATTCAGACAGCTGGGACTGCCTTCTATATAGAATTTTGACAAATGCTGGAATTTTGGCTTCAGTTTTAACTAAAGTTACATCTGATTAATGTGATAAAATTAATTTTTTAAAACCCACTTTTTCCTCACAAG|GTAAAAGGATTCTTCAGCTCTTTGAAAGAAAATGGTTCTCAGCTCCGTTGTGTCCAACAGACAATTGAAACCATTGAAGAAAACATCGGTTGGATGGATAAGAATTTTGATAAAATCAGAGTGTGGCTGCAAAGTGAAAAGCTTGAAC|GTATGTAAAAATTCCTCCCTTGCCAGGTTCCTGTTATCTCTAATCACCAACATTTTGTTGAGTGTATTTTCAAACTAGAGATGGCTGTTTTGGCTCCAACTGGAGATACTTTTTTCCCTTCAACTCATTTTTTGACTATCCCTGTGAAAAGAATAGCTGTTAGTTTTTCATGAATGGGCTATCGCTACCATGTGTTTTGTTCATCACAGGTGTTGCCCTGCAACGTAAACCCAAGTGTTGGGTTCCCTGCCACAGAAGAATAAAGTACCTTATTCTTCTCATTTTATAGTTTATGCTTAAGCACCCGTGTCCAAAACCCTGTACCCCATGTTTATCATTCATAAACTGTTTCATCAGTCTCCTCGAAAGACTCTGAATAGTCGACTACTGAACAATGAACACCTGGATCTGAGACTAAGCCGGACGATGACTGGGTTAAAGCTCTCCCGGCTCACCCCTCCAGACCCGCTGCCCATCCCTCTTCCTTGCTCCATGCCCAGGGGCTGACTTGTAAAGGCCAAGTCATCAAGCTTTCTTGCCCTTTGGATGTTGGTCAGTGGGGAGCCGGAGAGCTGGAGCTGGGGTCGGAGGAGGTAGTAGGTGGAGGTGTTCTTCCCTGATTCCCTTGCGGGATGCCTCGGGCTGGCCTCCCCTGAGGGTCTTAGCTCCGAGAGGGGACCCTCTTTTCCACACAGCCTTCTCCACCTCTGGATTTTGGTAACTGCTCCCTCCTCATCCCTTCAGGATTAGTGGCCTCAGTGGGAGTCTGGCTTTTACTAGTCCTGGCGGACTTGTGGTTTCTACATAATGTGCTCGCACTTTTGCAAAAAATCTTTTTATAGAACCCTCCTCAGATAATTCTGAGTGAGTGTCATCTATTTCCCTGACTGGTACAGTATCTCTTCTGAAAAAGCAGAGTGCATTCAAGTCTGTAGGAAAACCCTTTTCTTAGGGAGGTGATTTTTTTTCTCTCTCTGCTTCTTATTTGGCCTACTTTACAATTTCTAACTAACTAGTTATTGGCATTTACTGACAGTAAATTATTGCAGTCACCAATAAATGATAGTACATTGTGAAACAAAATATTTGCTCATATTAGCAAATAGGACATTCTTTGGCTTTGAAGTCTTTCTTTCTTTTGTGAAGACTTCACACACGGTTGCTTCAGCACACAGTTGCTGCTCAGGTTTTATGTATAGATGATAATAATAGAAAGCACAGTTTACTAACATGGTAAACCAACGGAGTTCAAGTCAAGTCAGTTAATACCCTAAGAATTAGATTTTATTTCTTATTCTGAAAACTTGCTACACAGGGACTTATCTAACCCATAGTGTGCTCTGTTGCTGACTTGATTCAAGTTGCAGCGTGTTTTGCGCTGACTCTAAGGTGCGGAAATCCTCACACCTGGCAAAGGAGAATTCAAACTGAACTTTTTGAATATAAGGCAAAAACTTCAAGATAAGGGAATATGATTGATGATTGGTACGAAAAATGTCAAAATGTGTTCCCCTAATACACGACAAAATAGAGTGACTTCTGGACATAAATCTGCCATTTATTAAACCATTCACTACAACAAATAAATAGGTATAAAAGTGGAATTGGAATTTTTATACTTATTTGTTGTAGTGAATGGTTTAATAAAAATAGAAATCACTGGTAATTTCCACCCCAAACTAAACTATTTCCCTTCTTTTAAAAAAATACACAACCAAGATTTTAATGTAAAATATTTTGCTTTAATTGTATTTTATGCCTTGATTAATGAAACATGGAAATATTGATTTTCAGTTTTGGTCACCTGAGGAACCTATCTTTGTTTGCTTTTGGAAAAGCCCATTTTCTAAACAGATACAATATTGCCACAACAATGTGCAGAAACCTTTTTGATAATAAAAAATTGTTCTTTGCCTCTAAGTGGATATTTGCAATTATTTTCTCTCTCCTAACTAGACTGTAAAAAGGGCTGCTTTAGATCCTGTAGCTTACTCCAGTTATTAGTTATTAACAAACACCCAAGTCTCGAAGATATTTCTAATTAAAAAAGAAGGCATATTCAGAGTTCTTTTTAAATAAATGTTGTTTACTTTTATAGGCATCTTTAAACTTCTGGATTTTGGTATGCCATTTAAAAATACTTCCAGATACACATGGAAATTAGTAATACTGAAGCCGTATCCTTGCAAACACATCTGTCAGTGTCAAAGGTTTCAAGGTTTTTCTTAAAAAAAGAAAACAAAAAAGCAAACACCTATACTGCCCAAATGGGAGGATTAGATACATGGTTAGAAATCCCTCAGGAAAAGTGTTTTTTCTTTTCTTGTTGCTGCCTTAAAAATAGAATAATGACTATTTCTGATGGATAGAGACATAGCATTTTAAGCTGGTGGTGTGTAAAATCCCATAGGTATGTGCATGACTTTCAGAGAGTATTTGGGGGGGAGGTTAACAAGATGTGGTGCCATTTATAAGCAGTGTTATTGTTTTTGCTTGCCCCGCTGCCACAAGTCAGCTAAGTCATAACAAAAGCTTCAAACTGATGCTAAGGAAGGCCATGCCCTTTGGAAACAATAAATTCCCAATCTGTTTTATGTTATGTACCTGACATCTTTTCCTGCATTCTCTACCAGGAAATAAAGATGAAATTAAATATCAAAATTCTAATCGATGATATCAGTGCAATGTTCAGGAACTATTCATTAAGATATTAGAAAACCATTCAAAGTGGTAGGACATCAGAGCCTACTTCTTACATTGCTGTGGGAGAAATGCAGGTTTCAAATTTAATATATATATATAATTTTTTTAAAAGCAGAAGTTTCTTTTTATATTTGGTAAACTTAAGTTCCATAAAGCCAGACGCTATACAGTGCAAAGGCTAATGTGGCAATAGCTCTAAAGACACAGTTGCTGCTCAGGTTTTATGTATAGATAATAGAAAGCACAGTTTACTAACACAGTAAACCAACAGAGTTCAAGTCAGATGAGTACCCTAAGAATTAATTAGATTTTATTTCTTATGCTCGAAATTTGCTACACAAGGACTTATCTAACCTTTATTTTGCTCTGTTGCTGACTTGATTCAAGTCTCAGTGTGTTCTGTGCTGACTGTAAGATGCAGAAGTCCTCACACCTGGCAAAGAAGAGTTCAAACTGAAAAGGGGTTTGGTGCTTCCTGGTTTGTCCAGGTTACCTGTTATTAATTTATTATTAGCAGCCCAAGAGGAGATATGTGCCCAATGTACAATATCTTATGTTTGACTTATAAACATTATCCCAAAGCAACATCAAATACAGTTCAAAAGCCCAAGAGGAAAGGGGGTAATAAGAAATCAGAACACTGAAGAATGTTTAAAACATTGTTTTCTAAACACTAACAAAAAAAATTAAGGGCAAACTGAAAATACAAATGAGATTTACAGGCACTGTGTGTAGAATGTGCAAAAATTCACTTAGCTTTTCTTTTGTTTTTTTGGTGTTGCTTTAAGAAACTTTATCAAATATATTTCTTACAAATATAAAGCTTTCTCTCCCAATTGAAGGCAATTAAAAAAATTCAAAGTTTATCAATACTCAGTACACAGGTGAACCAGTCAAATTCATTTTCTTTCTGGAAAAGAATAACAAACCAATATTTAGGATGTTCAGAGACTCAACAAAAACCATTCTAGAAATCACCCAGAACAATTGTTTTCTGTTGCCAAAGCCTTTTGTTCTTCAAAAGTCACCATCCACCAGCTGAAGATTTTACATGCAGATACCTTAAAAATTTCAAATAAAAAATGCAGTGAATCATTTAATATGTAATTTTCTTGTTACAGACATAGTAAATATACCACTTAGCAAAAGCATTGTATAACAGACAGAAGGAATTTCCTATAAGTAAACATGAAAGTGGATTAATAGAATTTTTTTTAATTTTGGGAAAAATGTTAGAGCAGTTCTACCTAATATAGCTCCTTTTTTCCTAGGAAATAAACATGGATCATGGTGAGAGAGCTGAACCCGATTTAACCTATACTTTGATTTCTTTTAGGCTTTGGTCAGTAAGTGCTTGTATGCTTTTAAGGCTTACATTAAGCCCTCTCCTTTCTGAAGATTAAGATAAGGGCCCAGTTCTGAAGATCTCAGAAATCCCTTCAATCTGTCCAGTTTCTTCAGCAATTATAAATTAGAATCAAACACAGTACTTTACTTTCCAAAATAATGACAAATAAAAATGGCCAATCTTTTCCTTTGCCCTTGTTCCCAAAACCCTGTAATTTCCACCAGACTCTAAAGGGTACTTCTCCCCACCCCACCATGTATTGGTATTCTGGGGAATGCCAATGTCTTCAGGACATTTCGCTAAGTTGTAACATGCTAATTTTGCTTTGCCACAATAACCTGCAGTAATGAGCATTTGGATTTCAAAGATTCAACTAGCCTCAGATGGTCATTCTAAGTGCCTGGCCTAATATTTTTAAAAGCTTTTAATTATAAGATTAAGGTTTTTAAACATTTCAATAGCAAACTCATCCCATTTAGTGCTTTCAGAGAATGACCAATTACTCTGTAGATCTTGCAGTATGCATTTCATGCCAATACTGTAAAGTGAGCATGAATTACTCAAGAGGTGGACTTCACTTCTCTCATCTATAACACATAAATTGGCAGAAGATACAGTTGTCTTCATTTACAAATAAACACCCAACTTACCAGATACCTTAACTTGTATTTCTTTAGTCATCTTTTGGCTTGGAAGTTTCCTCTGTTGTCTAAAAGGGAAAGCAAAACCATCCGTGAGCTTTCTTTTCTGTATTAAGTATGAGGAGATGGCCTTCTCAGAATTAGGGGACAAAAGATGGCAGTCAGTGGGGAAAAATAAGATGGTCCTTTCAGTTTCTCTTCTTCATCTGGCCACAATATTGTGAGTTCTTCCTCCTCAAGCTTATAAGCTAAAAATAACCTTAAGTGATCCTGATCCTAAATGTATCACTCTCAGCTTTATTTTTTTAAGGCTAGGGTAGGTTATAAAATATGCATAGGTGTTCTTAAATGGCAATGTTTCATTCTGTGGTGATCCTTCCTTTCCTGTACATAGGGTCATGGCTGTCCAGTAAATCCACTCATCATAAAGGGTTATTATGCTTTCTATTATTTGTTAAAGGGCTGATTAAGTACTTCGTATAACTGAAATTAAATAATAGGCAAATAAGTATTTAAACTGAGATATAATGGCATAACTGCCAATTATCAATGCTGTTTCCTATATCCCTTAAGGGAATCTACTACAGGAAGATTTTAATATATTGTTTTAAAAGCTTTGGTGTAACTGGACTATTGTCTTTAAAGCTACACCTTTAACTCCTCCTTATAGCCAGGGGATCATAGTAATAATCATTTAAATCATATGTTCTTGGAATTGGAAGGGACCTAGAAGTTGTCTAATCCAATTTTCATTTCTCATCACAGCTTGAGTAATTTTAATAATAGGAAGTTTCCAACTTCCATGTTCTAAAATCTATATAAACCACATACTATGGTGGTATTTTAATTAGGGCAAGAAAGACAGGCAAAACACAGGCAAATTGGGTTGTGTCAAGAACATTCATTTAGGATTTTAAAGCACCAGTTACTTAAAAATATATGTATTTATACAAATTCAATTACTTTACCAAGCGATGCTATGGGATAGATAGTACTTTAAGAAATTTTATTCTAAATAGATTCCAGTAGGAAACTAAATGACTGAAATGATAAACCCTACTCTGTGTAACTGCTAAACTAATTTGTAGTATATTTACTGCTCCATTTACCTTTGCTGAATCCTTCGCTTTACCTCCATTCTTAGGTGCTTTGGAGCTGGAAGCAGCCTTCTTGCACTTATCCTAAAGCAAAGGAAATGTAATGAGGCTATGGCTATCACCAATCCATCTAGCAATTAACTAGGCTGCAGTTAAATTAATCCAGACCATTCTGAGATCTCCAATTTAATTAAATAATGGAAGACTTTGGGTGTTTTTTCATGATTTTTTTTGAACAACGGTGTCAGAGAATTGTTTAAAGCAGGAGGGAAAAAGGATAGAAATAAGGAGGGAAATGTGGGTGACATTGATGCTCTAATTCCCGTGGTGCCTGACTCACCCAGACCTTATTTTGCTAATCAAAACAGAGCTTGTCAATAGATAATAAATGTCGGCAAGGGTGTGGAGAAAAGGGAACCCTAGTACACTGTCAGTGGGGATGTAGATTGGTATGGGCTATGATGGAAAACAGTATATAGGTTCCTAAAGAAATTAAAAGTAGAACTACTGTATGACCCAGTAATCCCTCTTCTGGGTATATACCCAAAGGAGATGAAATTATCACCTTATAAAGATATCTGCACTCCCATATTCACTGCAACATTACTCACAATAGCCAAGATATGGAACAACCTAGTTGTTGATGAATGGATAAAGAAAATGTCATGTATATATACATAATGGAATATTATTCAGCCTTAAAAAACGATATCCTACTATTTGTCACAACATGGATGGACCTGGAAGACCTTATACTAGATGATATAAGCCAGACACAGAAAGAAAAGTGATTTCACTTATATGTAGAATATATATAAAAGAAAAAGCTCAAAAACACAGAGAATAAAACATGGCGACCATGGTAGGGAACAGGAGGAGGAAACAGAGATATAGGTCAAAGGATACAAAATAGCAGATATGCAGAATGAACAAGTGTAGAGAGTTAATGTATAACATGAGGACTAAGGTTAATAAAATTGTATTAGGGATTTTGTTAACTAAGTAGATTTTAGCTGCTTTTGTCACAAAAAGTAGTTGTGTGAGAATGATAGATATGTAAATCTGCTTCCCTACAGTAACCATTTTATTATTTCTATGCATCCCAAACTACCATGTTGTAAACCTCAAATATACACAATAAAATGTATTTAAAAAACAAAATAGAGCTTGTCTCGATCAGGACTGGCTTTTGTGTACCAAAAGGCAAAAAAAAAAAAACAAAAAAAAACCCTGTTTTCAGTGTTATGGGAGAGAAATGAACAATGGGAAACAACCGAGGAAAGCTGGAGCAGGTTACGTATAAAAATAAAGTCCATTCACCAAAAAAGGCATTACTTACGAGTTACCAGGGGTGAGAGATAGGATGCTGAAGTGGTCTAGAAATTAAGCTACCCAGTATGGAAGGGCTGACAATTCAGTGATCGAGAGCAGTGCCTTAGAACAGCCAAAACAATAGCAAACTGAGATCTGCAGAATTAACTCTCCTGAAAATAACAAGGAGGTACTCATTTCACGTTTCCTTCTATTTGATTTACAAGAGGGTGTAGCTTGAGGGAAAATGCCTCACACTTGTTGAATTACACAGTTGTTTCTCATTCACTTTTAATCACGTTTTGAGCACCTGCTAAGTACCAGGCATTTTGCTAATGAGGAGCACAGAGGTAAAAGACACATCACTACTGTATGAAATGCGTAGCTCAGTGGTGTGATACACAAGCACAGAGAGGTAACAGAGAGCAAGGAGGGCATGGAAGAGAGGCCTCTAACTTTGGACTGGGAAGGGAGAAGATGTAAGACAAGAAAGTCTTCCCTAAGGAGCTGATGCTTGAGCTGTGCCCTGAGGAATGAAGAGTAGACCAGTTGGGCTAAGCAGACAGAAAAGGGGAGGAAGCTCCAGAGAGCAAATGAGCATGAGAGTGCCTGATGTAGTTAGGGCCTGCTCTCACTTTAAATGAACACAGACATAGCATTATTGTGGCACAACCATATAGTGTGGAGATAAAAAATGGTGGCTATGGAAATTACAAAGTAGCAGTTAAGAAATAACGTTAAGCAGTGTTTTATAAGTGGACTGTAAGTATAATTATGTAAAATATACATATAGAAAAAAAAGGAAATCCACAAAATAATACTGTTTTGGGGGCAGTGGAATTATAGGCATTTTTTCTTTTCTTCATTTTCAGGTTCTCTATATCATCGTTTGATTCATTCTACAGTTTAAAAATTGTAAGGGCCAGGCGCAGTGGCTCAAGCCTGTAATCCCAGTACTTTGGGAGGTAGAGGTTGGCAGATCACTTGAGCCCAAGAGCTCGAGACCAGCCTGGCAACATGGCGAAACCCCCTCTCTACAAAAAAATACAAAAATTAGCTGGGTGTGGTGGTGCACGCCTGTAGTCCCAGCTACTCAGGAAGCTAAGGTAGGAGGATTGCTTGAGCCCAGGAAGCAGAGATCGCAGTAAGCTGAGATCACACCACTGCACTCCAGCCTGGGCTATAGAGTAAGACCCTGTTACAAGACAGACCGATAGATAGATCAATCAATAAATAAAACTTATATGTATGTACACATACACACACACACATTTCAAAGAGTGAAATGTGAAAAAGCACAGTACCTTTGCTGTGTTCTGTGAGGTTTCTGTAGTGGAGGGACAGCTGTCCAGATCTCCTGAGAGAGCATCAATGGGGTCTTGGTCATCTGCAGGTTTCTGAGATATGAGTAGAAATAACCATCAGTGAAGAAGCAGAAGGCAAAATGCAATATGGGGTCTTTTCCCACATCACTTACAAATAAAAGATGTTTCTATAAGAAAAAAACTGACTGACATTCTTTATTAATAATAATGTATTGTAAAGGAGATAGAAAAACAAGAAAATCTTGATGGCTTTTTTTCCATCTACTCTTTAATACACGTTGCTTAGCATTCTCTGAGCCTCAGTTTTCCACCCTAAAGGGCTTTTGTGAAGACTAAAAGAGAGGTAAACAATAGTCACACACATTTATATGCATGCTTTGACAAAGTACCAGGCACAGAGTAGGCATTCAATATGTTTTAGTTTTCTAAAATGCCAAATACCCCTATGGCTAGAATAAAACAAAATTTAATGGAAATATGTTCCTATGGTCTTTACCTTTGAATCCTCTGATTTCTTTGTAGGTGGCTTCACTGGTTTGTCCTTAAAAAGAAGGCAGACTATTGGTTAAGCATAGATATCTGTAAAGGTTTACTTAGGTTTAGGCAGTAGAGAATCTATTGTCCCATGACTTGACTTGGATAAAATGGAGTGTAGACTTGCAATAACTAATAAACCTGAGTCCCCACTATTTCTTTACCAGCTTTGCTTAATTACTATTCTTTATAATCATTTGTGTAGACTTGGGGGAAACATATGAAAGGGTCTGGTCCTGGAAGTGAAAGAAACACTATTTGTGAACCTTCTGCCATCATGTATTATCACTTATAATTCTACTTAAAGTGTTATAAAAAGTTACTTGTGTTTTTACATCTTATTTACTATAGCTTATTACACTTAGAAAGTCATCAGGGATTTTTCCCAGCTTCAAAGGCAAGGGCCTTAAATAATAAATTTCCCAAAGAACGCAAGCAGGGTGAGTTGGTACTATCAAAGTGGGAAGGGCTCTAGCAGGATGTGGAATTGCTATCTTGAGGAATACTGAATGCAAGCAAGGAGAATTGGTTCCTGTAATAGGAGACCCTGAAGCTGACACTGCTATTTAAACCAGGAATCATCTTACTCACCAAAATGAATCAGTACAGAAAGAGAAGGGTGTAATTTGGTCTTCCTTGTTTCTTATTTGATGAACAGGCATGGAAAGGGTTATTTATCATCTAATTGACTCAAAAATTAGAAACTAAATAGGAAAGAGGTGGGGGACAGGCCTGGTTGACCTACGAAAGACTGGCCTCTATCATGTGGGAGACAAAAGGCCAGGGACTTTTTGGCAGAGAAACAGGATTTGTGATTGGGAATATTGCTTTGCCTGTCTTCACTTGCAATAGTGCTGATGATGATGCAGGAGAAGATAGGGAGACCCCAGGTCTTGGAGCTGCCTTATTAATTTTCCCTATTAATTATCCCAACACCAGCTCCTTTTCTGCTCTCCTTAAACAGAGATTGCTTCCTTTGAAATCCTATCACCTTAGTCATTTTTATAGCCTTTTCTTGAAAGAGAGAATCACATTTCTTGGATCACTCTTAAGCATCTGTGAGTGACTGAACAGTATTCAACCCTGTTTATACCACAGAGCCTAGCATATGCATATATAATGATATTGGCTCAAATAAATATTTACCTGCTGGTCGGCTTTGGTGACACATTAGAAGCAGTCAGTCATGGTATGTTTTATGCTATTGTAAAGGAGTATAGCTACTATTTATTTCAAGGGAGGTTTGAATTTAAAGATCTTGGTAGCATAAATCCGATCTAGCAATTTGCCTCAGTTTACCTGTCCATTATCATCCAGGAGATGTCTGTATTCAGGTGGGATAGTGTCATCTCTTTCTCCAAGCTTGTCTCTATGTTCAGCTTTAGCTTTTTCCTATATCAACAGTGAGCAGATAGAATTAATATTCATTTCCTCTTTCACTTAGAAAATACATTGTCAAATGCAGCATTCAGCATTTGTTGTACTTTCATGAGGCAAAACATATGGTCTGTTTTTATTTTTTAAGCAACAGAACAACACAGATGACTTCAACAGACTGCTAGTTGGCTAAAATATAAAATCCCGTATGCTTCTGTATGTCAATTCATATCTGTGAATTTTCTAGCTATATTTTAAATGGAAATAAATGATTAAATAATTATCTTCAGAAACCATGTTAGGAGATTAGAACCCAAAGGTATAAAATATCTTTTTCTTTTTTTCTGTATGGTTTCACTTTTCTAATACAAAATCAGGCCACTGTACCTTGCCATTTAGAGAGGTCACATTTACAACTTTGCCTATTTATAAGCAACCCTGAAGGACAACAGCTAATTTGAATGGATGTGCTGACTGCTTGTGTTGCATGGGAGGAAGCCATGCATACCCCACACCTACCTCCCAGAATCCCCTCAGGGAGGCTCCGCTGGTATCTCTGTCACAGATTCAGTGACCTTCCCCTCCAGTGGAGTACAGATTGATTTTTCTATTTCATTATAATTTCATGTTTAAAATATAGGTTAAGTTCACATCAATATTCCTATGACAATGACAGAGTCAAGACCCAAGGATTAAAATTTCACTATTAGTGAATTTTTTTTTTTTTTTTTTTTTTTTTTACCTTTACTTTATCTTCCATTGGTTTGTTCTCATCTGGGTCAGGCTGCCTTTGTCCTAGACTGTCAGAGAGTTTATCCAAGGCATCATCGAGGTCTTTGTCACTCTGCTGAAAAGTAAATAATGCTGAATTAGTCACTCATTAGCCAAACTGTATCAGAGGCAAATTAGCCCAGGAACTCCATCTTTTCAGGAGGGAACCTGTTTCCTTGGAAGAAAAAGACATCTGGGGCTGGGGCAGGGGAGTAGACAGGGTCAGAGAAGAGAAGCCTAGGAATGGAAGATCAGGAGAAGAGCGGCAGTATCTGTCACCCTACTGGGGTTGGAGGGCCAGCACCTTCCACCCAACCCTGCCCATCTCTTGGTGAAGATCCCACCAGGTTAAGGAGGTCTTAGTGGCAGCCTCAGGAGCCATATCCAGTGGGTGACCTGGAGGTCACATAAAGGGTCCAAAAGCAAATGAACCAATCATGTGTGCCTTTCATTTAGAAATTAAACACCATTAGAAAACTGGATATGAAAACAAACATCTACTAATGTTGTCAGATGGTTAGGAAGCAAGATTCTGCAACTATAGAGGGTAAGTGTTTCTTTGGTTCTGTGGGTCCTCTCTAAAACTCTAAGATCTTGAGGGGTGCATTTCAGAAAGTGCAGCGTGACCCGCAGTTTGTGGGAAGCCATGGAGCTCGGCACTGCCATCCTAATACTTCCTAAAGCACAAAACCCCAGAGACAATCTGGGGTCAGGAGAGTGGAAGGGGCTTGTCTGCCACACTGGTGATGAGTGCCCTGAAAGACTTCAGAGAATTTCTGAACTGGTGGGGAAACCTCTCTTTTCATCTTCAGGAAGCTCATGGAAGTGAAATTGCAGAAATGGGAGCTGGTATTCTAGAGGAAAAAAATTATGGACAACAATATCACTGTAACTAAGATAGCTTATTTCCTCTAACATTTATTTACTGTATGATTCAGGCAGCTTATTTAACCCCTTTCAGCTTCAGTTTCCTTGGCCGTGAAATGTGAATAATAGTAGTACTTATACTCCTAAGTTGCTGAGAAAAGTAAATGATTGAAAAGGCATTTAAAACAATACTAGTTGTATGTTAGGACCCAACAAATGGTAAATTATTATTAGTATTATTATAGCAAAATCCATATTTTTCAACACATTGCATTCAAAATTCCACCTCTAAATGAATTCAATTAAAATGTGTTTAATATCTACATTGTATAAGACACCATGCTGAACTCTGTCCAATACTGTATAGAACTTTCCAGTTGATTTTCAAAATGTTTTCACATACACTATCCAGTTTTATTTGATGCCCACAATGGTTCTCAGTGAACTAAGCAGGCTTTTTTTTTTTGAGGTAAAAAGCTCAGGGAAGCTAAGTCAGTTGCTTGAAAACAAATTGTTAAAAAGTGAATAGAACCCAGGTCTTGGGACTGATAAAGCTTTTCCCTGTCATGCTTAGTCACATCCATGATCTTCTATTTTCTTTGAAGCAGTTTTCCTGTTGGAGTGATTTTATTACACAGATCTTTGAAATCATGTCTTCAAATGCTTTCAGTGTATGTAACACTGTTAGTAACAATCTAATAATCACAGCAAAGAAAGCTCCCGTGAATTATCATGGTTTATTTGACTCTTCGATTTCCTAATATTTTTATCTAAATAAAGCTTTATACCTTGTTTTAGTGACATCTTCAAATAAAATGTTAACTAAAAACAAGCCTCTCTGATGGGAAATGTGATCAGAGAAGTGTCATTGTAAAACCTACTTCTTAAAGGCAAAAAAGTTTTTGATTGCAAATGTTTACTGATAGCCTTCATCAGGGCAGAATCTCTGGCCTGAATATTAAGAACTGAAGTGTAAACGGCAGCCTAGGCTATTAATGATTCTTCCTTTCTGTTGCATGGGGACTTTCTTCATTGTGGGTGTGTTTACATACACACATGCACATGCACATGCACATACACACACACGGGGCATTTTACTGGTTTTAAGTGCTTTATTATAATCCAGGATTATAGCTGCTAATGGTAGAGCTGCCCGGGGCCAGGTCTGGGCTTTGTCATTTGTGCCTCTGGATATTTTCAG|ATGATCCTGAAGCTGACGCAACAGGATGAAAATCCATCAGAATCTCAGACTACAGCACTAAATATGCTTTGATGCTACATCAAACGGAATGGAAGCATAGCTGACTTCGCTAAAGTTACTTCATCTCCATCTAGCAAATGAGGCACTGTTCTCAACCAAAGGAGATGGGGATCTGGTTTAGGGCAATCCCTTTATAATTTGATGTGCTGTGGTCTCCTTGGTAATGTATAATTTGGTATTGCACAGGTGATTAGTCAAGGAAGTCTGGAAAAGCTTTGGTCCCACAGCCTTGCCTCACAGCATGTAAATAATTAAAACAATATTGATGCTGAGGTTCTTCTACTGCTAGTATGAAAGTGACAAATTTTTACTGGTGTGAATTGGGAAGAAAACAATGCTATTCCATGACGTTTGTAAAATGTTTGTAAAAGCTCAAACATGACGATTCCATAAAATAAACTTGAGGTTAAATAATGGGTAGTAAATTATAGAATGTAT|AAGAAAAAATATAAAGGAGAAAATCAATTATCAGGAAAGCTAAAGAACTTTTCAAATCTAGTAATTTGAATATAGACACAATGCACTTTATTGCACTTTCAATTCTTATAAAGCAACAATAATATTAAGGTCCTTGACTATGTGTACAATGTTTTCACATATATAGTTTCATTTAATCATTTCAAAGTTAATCTCTGCCATCTCGCTAAATCATCAGTCTCGGCTCTTCTGAAATAGAAGGTGCCTGATCTTCCTAATAATTCTGCCTATTTTCATTTGCTTTAAACAGGCGCCCTATTTTCTTTCTAGTTGTGGCTGCGCAAAAACATTTATCTCCCAAATAAGATGTGCTGCTTACCGAGGTATCACGGGGTGGGGCTCCAGCTTGGGTCGTTGAAGCTGGGGTTTGGGAAACCACTTCAGAGATGGCAGCAGCAAGTTTAGCATCTTCAAATTTCTTTTATTGAAAAAAATTTTATTAGTAACATGTTGTATATAAAATTATGAGCACAATGCCATCACTTAACTATAACTCTTAAAGATAGCTTAATGACTGTTTATTCTCTTGACCAAATAGACTCATAATAACATATAATTTTAAAAGAAATTTAAATTCTTTCTTCTCTATTGTATTATTTTATACAATTTGCTATTTCTATTTCCTTCTCATATTGATTATTCTAAATACTATGCAATAATATAACTTAGAGTTCCACGGTTTGTTTACACATTTCCTGTTGTACATTTAGGTTATTCAAAGTTTTCAGCTCTTTTAAAATTGCTCTGAATAAGTTCTAGTGAGTGAGTTATGGTGCTGGCTATATTTTGCTAAACTGCCCTCTCAAATGTTGCTAGGAATTCATACTGCGAAAAGCAATGAATAAGCATGCCTGTTTTCCCATGGCCTTGCTTGCCAGAATTTGACTTTTATTATGATAATCAGTGTAAAATGATATACTACTATTGCTTGTATATTGTGGTATACGGTGTCAGGTTTCAGGGTTTTTTTTCAACGTTAAATATTCTAGAAACTTTCTGAAATAATTTCTGTTTAAAAATATTGAATATTTGCTTCATTTCAAATACTCCCTTTTGACAAAAAAACTTAGGTATAACTGTTGATGAAAAACCAGAAAAAAGTCCAGAACTCTTTGGTGACTCCAACTATGGATAGCTTATTTTGAAAAAGGAGAATTGCAAATTTTACCAAAAGATGGAGAAAAGCACATTAAAAAGATACCAACATTCAGAAATTCATTTCAGCATGTTATTATTGGAAATTATTTAAACTAATTTAGATAACTATAAGATACTTATTGTCCATTTATACCCTGTAAAGCCGTTTTAGAATGTAATATTTTAGGTAATCCAAAATGTACTAAATTAAATTCATTTTTAGTTATGAGAAATCTTTGCTTATATGACAAATGAAAAGAATAACAAGTTGTCAAATGAAAAGAATGACATTGAAACATTTGTATTGTCTCTTCTTAAACTATCTTATTGACTTATTATTTAAGCCTTTTAATACTAAGTATGAAACAACCTATGGTCTGGAAATTTGTATCGCAAAGCTATATGTGCATATGTTATTTAATTCATCTAATGCTACACAAAAGCATAAAATAATGATTTTTCACTCTCTTTAAAAATACTAAATCATTTATGTCCATTTCTCAATTTTTTCATTGATCTATGCTTTGAGTTTGCTTTCTCAACATTATTGTATTTTCCACTTATTATTACTGTATAACATATGCTAGTGTTTAGTTGGATTAATCTTACCTAAAAGTACTGAAAAATGCTTTTTAGTACTTTTTCATATTTTATACATTTATTTTCCGAATGTATCATTGAATAATTTTATTGAGTTATAAAAGTATCTTATTGCTATTTAATAAAAAATTAACACATAAAATGACTTGAATTGTCATCATTCTTTTTAAGATATTTAGTTAAACTGACTTAATGTATGGCCTTCAATTTTTTTGTGTCCTTATTTTTCTGATCATTTCTCCTTTTATAGTTTACATTAAGTCTGATCTCATATTAATTACATTTTCTCATCTGTTGTTACTAATAAACATGGCATAATGTTACTTACAAATGTATTATCTACAAGTAGTGCTATCCACAAATATATTCAAATGTTCCCTTTTAATGTTTGTCATTTTTTTCATGTGTTGTTAATGATTCTTCCATGTG'

In [6]:
offset = -len(erap_reference.split('|')[0])
offset

-691

In [7]:
def find_occurrences(s, ch):
    return [i for i, letter in enumerate(s) if letter == ch]

def replace_char(s, i, c):
    return s[:i] + c + s[i+1:]

find_occurrences('AATAT', 'T')

[2, 4]

In [8]:
sequences_unique = {row['Sequence'].strip() for row in rows}
sequences2i = {sequence: str(i) for i, sequence in enumerate(sequences_unique)}

seqs_to_align = [
    str_to_fasta_seqrecord(
        str(i),
        sequence 
    )
    for sequence, i
    in sequences2i.items()
]



In [9]:
from tgsts.sequtils.fastio import write_fasta
from tgsts.sequtils import str_to_fasta_seqrecord
from tgsts.sequtils import clean_sequence

write_fasta(
    'erap_seqs.fasta',
    seqs_to_align,
    force=True
)

write_fasta(
    'erap_reference.fasta',
    str_to_fasta_seqrecord('ref', clean_sequence(erap_reference)),
    force=True
)



In [10]:
#Run minimap
import subprocess

subprocess.run(
    [
        "minimap2",
        "-a",
        "erap_reference.fasta",
        "erap_seqs.fasta"
    ],
    stdout=open("erap_frags_aligned.sam", "w"),
    check=True
)


[M::mm_idx_gen::0.005*2.16] collected minimizers
[M::mm_idx_gen::0.008*2.39] sorted minimizers
[M::main::0.008*2.38] loaded/built the index for 1 target sequence(s)
[M::mm_mapopt_update::0.009*2.26] mid_occ = 43
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 1
[M::mm_idx_stat::0.009*2.17] distinct minimizers: 8894 (98.97% are singletons); average occurrences: 1.025; average spacing: 5.278; total length: 48105
[M::worker_pipeline::2.059*2.95] mapped 437 sequences
[M::main] Version: 2.22-r1101
[M::main] CMD: minimap2 -a erap_reference.fasta erap_seqs.fasta
[M::main] Real time: 2.066 sec; CPU: 6.079 sec; Peak RSS: 0.166 GB


CompletedProcess(args=['minimap2', '-a', 'erap_reference.fasta', 'erap_seqs.fasta'], returncode=0)

In [11]:
#Load aligned

from utils import build_full_length_alignments

ref_aln, subread_strings = build_full_length_alignments('erap_frags_aligned.sam', 'erap_reference.fasta')

piped_ref_aln, piped_read_alns_strs = add_pipes(erap_reference, ref_aln, subread_strings)



In [12]:
class Alignment:
    def __init__(
        self,
        ref: str,
        ali: str
    ):
        self.ref = ref
        self.ali = ali

        self.rrs = identify_rrs_ali(ali)
        self.seq_rr_masked = mask_rrs(clean_sequence(ali))




In [13]:
#Generate alignments
def generate_alignment(key, ali):
    return Alignment(piped_ref_aln, ali)

args = [
    (k, ali)
    for k, ali
    in piped_read_alns_strs.items()
]

res = run_concurrently(
    generate_alignment,
    args,
    mode='process',
    batch_size=10
)




Processing: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 437/437 [01:38<00:00,  4.45it/s]


In [14]:
assert not res.failed

In [15]:

piped_read_alns = {
    r.args[0]: r.result
    for r
    in res.successful
}

In [16]:
next(iter(piped_read_alns.keys()))

'0'

In [17]:
def get_erap_exonic(gen_seq: str) -> str:
    ea = exonic_alignment(
        str_to_fasta_seqrecord('cons', gen_seq),
        'ERAP1',
        'erap_full'
    )
    return ExonDict(*next(iter(ea.items()))).seq

def calculate_mms(seq: str, mode: str = 'gen') -> str:
    seq = str_to_fasta_seqrecord('cons', clean_sequence(seq))
    locus_lib = ANTypingLibs().get_locus_lib('ERAP1')
    locus_lib.load_db()

    if mode == 'gen':
        mms, _ = mm_profile_from_seqs(
            seq,
            locus_lib.get_gen_ref_seq(), 
            locus_lib.get_gen_ref_seq(), 
            locus_lib,
            'gen'
        )
    elif mode == 'cds':
        mms, _ = mm_profile_from_seqs(
            seq,
            locus_lib.get_cds_ref_seq(),
            locus_lib.get_cds_ref_seq(),
            locus_lib,
            'cds'
        )
        
    else:
        raise ValueError('mode must be gen or cds')
    mms = ', '.join([str(mm) for mm in mms])
    return mms
        
    
def get_erap_cds_prot(gen_str: str) -> str:
    annotator = Annotator('gen')
    res = annotator('ace78ee52a044de361c280b47d9e8', gen_str, fast_alignment=True)
    return res['sequence']['coding'], res['sequence']['protein']

    

In [18]:
class OverlapError(Exception):
    def __init__(
        self,
        message,
        sample,
        frags_left,
        frags_right,
        overlaps
    ):
        super().__init__(message)
        self.message = message
        self.sample = sample
        self.frags_left = frags_left
        self.frags_right = frags_right
        self.overlaps = overlaps

        self.write_output()

    def write_output(self):
        out_path = f'./out/fails/{self.sample}.txt'

        if os.path.exists(out_path):
            os.remove(out_path)

        with open(out_path, 'w') as f:
            f.write(self.sample+ '\n')
            f.write(self.message + '\n\n')

        if_left = self.frags_left[0].fragment
        if_right = self.frags_right[0].fragment

        overlap_lines = []
        overlap_lines.append(f'         \tCoding\tNonCod\tRR    \tKmer')
        for (i, j), overlap in self.overlaps.items():
            overlap_lines.append('\t'.join([
                f'F{if_left}:{i}/F{if_right}:{j}',
                *[
                    str(int(_)).ljust(6)
                    for _
                    in overlap.match_tuple
                ]
            ]))

        with open(out_path, 'a') as f:
            f.write('\n')
            f.write('\n'.join(overlap_lines) + '\n\n')
                    

        ali_lines = []
        msa = MSA(*self.frags_left, *self.frags_right)

        #Get overlapping boundary idx
        i_start = max([
            len(re.match(r'[-|]*', ali).group())
            for ali
            in msa.alis 
        ])
        i_end = len(msa.ref) - max([
            len(re.match(r'[-|]*', ali[::-1]).group())
            for ali
            in msa.alis 
        ])

        ref = msa.ref[i_start:i_end]
        alis = [ref] + [ali[i_start:i_end] for ali in msa.alis]

        #Lose all columns which are just -s
        ref, *(alis) = [
            ''.join(ali)
            for ali
            in zip(*[
                bases
                for bases
                in zip(*alis)
                if set(bases) != {'-'}
            ])
        ]
        
        labels = [
            f'F{if_left}:1', 
            f'F{if_left}:2', 
            f'F{if_right}:1', 
            f'F{if_right}:2', 
        ]
        batch_width = 80

        for i in range(0, len(ref), batch_width):
            s = slice(i,i+batch_width)

            ref_snippet = ref[s]
            alis_snippets = [ali[s] for ali in alis]

            label_ljust = 20
            
            mm_line = ''.join([' '] * label_ljust) + ''.join([
                'v' if len(set(bases)) != 1 else ' '
                for bases
                in zip(*alis_snippets)
            ])

            ali_lines.append(mm_line)
            ali_lines.append('ref'.ljust(label_ljust) + ref_snippet)
            ali_lines.extend([
                label.ljust(label_ljust) + ali
                for label, ali
                in zip(labels, alis_snippets)
            ])

            

        with open(out_path, 'a') as f:
            f.write('\n'.join(ali_lines))
        


In [19]:
class AssemblyError(Exception):
    def __init__(self, msg: str, sample: str, overlap_lls: list['OverlapLinkedList']):
        super().__init__(msg)
        self.sample = sample

        self.write_output(msg, overlap_lls)

    def write_output(self, msg: str, overlap_lls: list['OverlapLinkedList']):
        
        out_path = f'./out/fails/{self.sample}.txt'

        lines = []

        lines.append('SAMPLE:')
        lines.append(self.sample)
        lines.append('REASON:')
        lines.append(msg)

        #Left to right overlaps, i.e 4 frag_1/2
        l2r_overlaps = {}
        
        for overlap_ll in overlap_lls:
            for overlap in overlap_ll:
                l2r_overlaps.setdefault(
                    (overlap.frag_left.fragment, overlap.frag_right.fragment),
                    set()
                ).add(overlap)

        lines.append('\n')

        for (i_frag_left, i_frag_right), overlaps in l2r_overlaps.items():

            
            frags_left = sorted({
                overlap.frag_left
                for overlap
                in overlaps
            }, key = lambda x: id(x))
            frags_right = sorted({
                overlap.frag_right
                for overlap
                in overlaps
            }, key = lambda x: id(x))

            #Get common mismatches across all frags
            assert len(frags_left) == 2

            left_mms_common = set(frags_left[0].gen_mms) & set(frags_left[1].gen_mms)

            lines.append('▓'*79)
            lines.append(f'░░ FRAG{i_frag_left} ░░'.ljust(48, '▒').rjust(79, '▒'))
            lines.append('░'*79)
            lines.append('')
            
            for i, frag in enumerate(frags_left):
                lines.append(f'ALLELE {i+1} UNIQUE MISMATCHES:')
                lines.append('\n'.join(sorted(set(frag.gen_mms) - left_mms_common, key=mm2pos)) or 'NONE')
                lines.append('')
            lines.append('')

            
            lines.append('▓'*79)
            lines.append(f'░░ OVLP{i_frag_left}/{i_frag_right} ░░'.ljust(50, '▒').rjust(79, '▒'))
            lines.append('░'*79)
            lines.append('')

            lines.append('OVERLAP STATUS:')
            if len(overlaps) == 2:
                lines.append('UNAMBIGUOUS OVERLAP')
                lines.append('')
            else:
                overlap_alignments = [
                    overlap.get_alignment()
                    for overlap
                    in overlaps
                ]
                unique_overlap_count = len(set(overlap_alignments))
                if unique_overlap_count == 1:
                    lines.append('AMBIGUOUS HOMOZYGOUS OVERLAP')
                    lines.append('')
                else:
                    lines.append('AMBIGUOUS HETEROZYGOUS OVERLAP')
                    lines.append('')
                    
            lines.append(' '.join(['Overlap'.ljust(20), 'C', 'G', 'R', 'D']))
            for i_allele_left, frag_left in enumerate(frags_left):
                for i_allele_right, frag_right in enumerate(frags_right):

                    
                    for overlap in overlaps:
                        if (
                            overlap.frag_left is frag_left and
                            overlap.frag_right is frag_right 
                        ):
                            label_left = f'F{i_frag_left}A{i_allele_left}'
                            label_right = f'F{i_frag_right}A{i_allele_right}'
                            
                            #print(f'{label_left} / {label_right} Overlap Alignment')
                            #alignment = overlap.get_alignment(padding=20)
                            #ref, alis = alignment[0], alignment[1:]
                            #for i_ali_start in range(0, len(ref), 69):
                            #    i_ali_end  = i_ali_start + 69
                            #    print('ref'.ljust(10), ref[i_ali_start:i_ali_end], sep='')
                            #    for label, ali in zip((label_left, label_right), alis):
                            #        print(label.ljust(10), ali[i_ali_start:i_ali_end], sep='')
                            #    print()

                            lines.append(' '.join(
                                map(
                                    str,
                                    [
                                        f'{label_left}/{label_right}'.ljust(20), 
                                        *map(int, overlap.match_tuple)
                                    ]    
                                )
                            ))
            lines.append('')
            lines.append('')


        #Now do the final right frag
        right_mms_common = set(frags_right[0].gen_mms) & set(frags_right[1].gen_mms)

        lines.append('▓'*79)
        lines.append(f'░░ FRAG{i_frag_right} ░░'.ljust(48, '▒').rjust(79, '▒'))
        lines.append('░'*79)
        lines.append('')
        
        for i, frag in enumerate(frags_right):
            lines.append(f'ALLELE {i+1} UNIQUE MISMATCHES:')
            lines.append('\n'.join(sorted(set(frag.gen_mms) - right_mms_common, key=mm2pos)) or 'NONE')
            lines.append('')
        lines.append('')

        with open(out_path, 'w') as f:
            for line in lines:
                f.write(line + '\n')


In [20]:
class Fragment:
    def __init__(
        self, 
        seq: str,
        ali: Alignment,
        fragment: int,
        read_count: int,
        fragment_name: str,
        gen_mms: list[str]
    ):
        self.seq = seq
        self.ali = ali
        self.fragment = fragment
        self.read_count = read_count
        self.fragment_name = fragment_name
        self.gen_mms = gen_mms

        try:
            self.mm_pos_dict = {
                mm2pos(mm): mm
                for mm
                in gen_mms
            }
        except:
            print(gen_mms)
            raise
    
    def merge_aln(self, other: 'Fragment'):
        return MSA(self, other)

def mm2pos(mm: str):
    return int(re.search(r':(-?\d+)', mm).group(1))

mm2pos('5utr:-296delACACACACACACAC>')

-296

In [21]:
class Overlap:
    def __init__(
        self,
        frag_left: Fragment,
        frag_right: Fragment,
    ):
        self.frag_left = frag_left
        self.frag_right = frag_right

        self.match_tuple = self.assess_overlap()

    def __bool__(self):
        return self.match_tuple[0] and self.match_tuple[2]

    def __str__(self):
        return str(self.match_tuple)

    def __repr__(self):
        return f'Overlap{self.match_tuple}'

    def get_alignment(self, padding: int = 0):
        """
        Return alignment tuple of the overlap between frag left and right
        """
        overlap_aln = self.frag_left.merge_aln(self.frag_right)
        return overlap_aln.get_trimmed_alignment(padding=padding)


    def assess_overlap(self):
        overlap_aln = self.frag_left.merge_aln(self.frag_right)

        #overlap_aln.print()

        ref_ali = overlap_aln.ref
        alis = overlap_aln.alis

        i_start, i_end = overlap_aln.get_alignment_overlap_idx()

        #Get union of rr bases
        rrs1, rrs2 = overlap_aln.rrs
        rr_bases_1 = {
            i
            for rr
            in rrs1
            for i
            in range(rr['ali_start'], rr['ali_end'])
        }
        rr_bases_2 = {
            i
            for rr
            in rrs2
            for i
            in range(rr['ali_start'], rr['ali_end'])
        }
        rr_bases_both = rr_bases_1 | rr_bases_2

        #Set to false when mismatch found
        coding_match, noncoding_match, rr_match = True, True, True
        
        for i, (ref_base, *(ali_bases)) in enumerate(zip(ref_ali, zip(*alis))):
            if not i_start <= i < i_end:
                continue
            if len(set(*ali_bases)) == 1:
                continue
            #Mismtch found
            #print(i, ref_base, ali_bases, overlap_aln.get_feature(i), end='\t')
            feature = overlap_aln.get_feature(i)
            if feature == 'coding':
                coding_match = False
            elif feature == 'noncoding':
                noncoding_match = False
            else:
                raise ValueError('unexpected feature found?')

        left_tail_masked, right_head_masked = trim_intersection(
            self.frag_left.ali.seq_rr_masked,
            self.frag_right.ali.seq_rr_masked,
        )
        if len({
            str(left_tail_masked.seq),
            str(right_head_masked.seq)
        }) != 1:
            rr_match = False

        kmer_similarity = - calculate_kmer_distance(
            clean_sequence(alis[0][i_start:i_end]),
            clean_sequence(alis[1][i_start:i_end]),
            7
        )

        return coding_match, noncoding_match, rr_match, kmer_similarity

In [22]:
class MSA:
    def __init__(
        self,
        *fragments: Fragment
    ):

        if len({
            f.ali.ref
            for f
            in fragments
        }) != 1:
            raise ValueError('ref must be identical')

        self.fragments = fragments
    
        self.ref = fragments[0].ali.ref
        self.alis = [f.ali.ali for f in fragments]
        self.rrs = [f.ali.rrs for f in fragments]

    @classmethod
    def from_fragments(cls, *fragments: Fragment):
        return cls.from_alis(*[
            frag.ali
            for frag
            in fragments
        ])
            

    def get_feature(self, i):
        upstream = self.ref[:i]
        pipe_count = upstream.count('|')
        if pipe_count % 2 == 0:
            return 'noncoding'
        else:
            return 'coding'
            
    def get_alignment_overlap_idx(self):
        #Get overlapping boundary idx
        i_start = max([
            len(re.match(r'[-|]*', ali).group())
            for ali
            in self.alis
        ])
        i_end = len(self.ref) - max([
            len(re.match(r'[-|]*', ali[::-1]).group())
            for ali
            in self.alis
        ])
        return i_start, i_end

    def get_trimmed_alignment(self, padding: int = 0):
        i_start, i_end = self.get_alignment_overlap_idx()

        i_start -= padding
        i_end += padding

        return (
            self.ref[i_start:i_end],
            *[
                ali[i_start:i_end]
                for ali
                in self.alis
            ]
        )

    def consolidate(self) -> str:

        #Sort by i_fragment
        msa_sorted = MSA(*sorted(
            self.fragments,
            key = lambda x: x.fragment
        ))

        #Counts of each fragments
        fragment_counts = Counter([
            f.fragment
            for f
            in msa_sorted.fragments
        ])

        #Can only align if there is max 1 seq per fragment
        #Frags must also be contiguous
        i_fragment_prev = None
        for i_fragment, count in fragment_counts.items():
            if count > 1:
                raise ValueError(
                    'Cannot consolidate unless only 1 poss seq per fragment'
                )
            if (
                i_fragment_prev 
                and (i_fragment - i_fragment_prev) > 1
            ):
                raise ValueError(
                    'Fragments must be contiguous and overlapping'
                )
            i_fragment_prev = i_fragment
                    
        #Always choose fragment with higher read count

        #Sort fragment indexes, higher read count first
        prioritised_fragments: list[Fragment] = sorted(
            msa_sorted.fragments,
            key = lambda f: f.read_count,
            reverse=True
        )
        prioritised_alignments = [
            fragment.ali.ali
            for fragment
            in prioritised_fragments
        ]
        
        consolidated = []

        for col in range(len(prioritised_alignments[0])):
            base = '-'
            for i, aln in enumerate(prioritised_alignments):
                if aln[col] != '-':
                    base = aln[col]
                    prev_base_frag = i
                    break
            consolidated.append(base)

        return ''.join(consolidated)

    def print(self):
        ali_len = len(self.ref)
        for i in range(0, ali_len, 80):
            print(
                self.ref[i:i+80],
                *[
                    ali[i:i+80]
                    for ali
                    in self.alis
                ],
                sep='\n',
                end='\n\n'
            )




class AssembledSequence:
    def __init__(self, *overlaps: Overlap):
        """
        Assemble all relevant information from list of frags
        """
        frags = [
            overlaps[0].frag_left, 
            *[
                overlap.frag_right
                for overlap
                in overlaps
            ]
        ]
    
        ali = MSA(*frags).consolidate()
        self.seq = clean_sequence(ali)
    

        self.overlaps = overlaps
        self.frags = frags
        self.mms = calculate_mms(self.seq, mode='gen')
        self.exonic = get_erap_exonic(self.seq)
        self.cds, self.prot = get_erap_cds_prot(self.seq)
        self.cds_mms = calculate_mms(self.exonic, mode='cds')
        self.name = ''.join(
            frag.fragment_name
            for frag
            in sorted(
                frags,
                key = lambda x: x.fragment
            )
        )

    def __str__(self):
        return self.seq
        

In [23]:
class FragmentPair(list):
    def __init__(
        self,
        frag1: Fragment,
        frag2: Fragment,
        fragment: int
    ):
        self.frag1 = frag1
        self.frag2 = frag2
        self.fragment = fragment

        super().__init__([frag1, frag2])

    def product(self, other: 'FragmentPair', enum=False):
        if not enumerate:
            for a, b in product(self, other):
                yield a, b 
        else:
            for (i, a), (j, b) in product(
                enumerate(self), enumerate(other)
            ):
                yield (i, a), (j, b)

class TilingRow:

    header = (
        'sample',
        'desc',
        'allele',
        'allele_name',
        'exon_count',
        'cds_mms',
        'cds_mms_trunc',
        'gen_mms',
        'gen_mms_trunc',
        'cds_homoz',
        'gen_homoz',
        '12CDS',
        '12RRA',
        '12GEN',
        '23CDS',
        '23RRA',
        '23GEN',
        '34CDS',
        '34RRA',
        '34GEN',
        '45CDS',
        '45RRA',
        '45GEN',
        'prot_seq',
        'cds_seq',
        'gen_seq',
    )
    
    def __init__(
        self,
        sample: str,
        desc: str,
        i_allele: int,
        allele_name: str = '',
        cds_mms: str = '',
        gen_mms: str = '',
        homozygous_cds: str = '',
        homozygous_gen: str = '',
        prot_seq: str = '',
        cds_seq: str = '',
        seq: str = '',
        overlaps: list[Overlap] = None
    ):
        overlap_dict = {}
        if overlaps:
            #Sorted overlaps for increasing frags
            overlaps = sorted(
                overlaps,
                key = lambda x: x.frag_left.fragment
            )
            for overlap in overlaps:
                label = f'{overlap.frag_left.fragment}{overlap.frag_right.fragment}'
                overlap_dict[f'{label}CDS'] = 'Y' if overlap.match_tuple[0] else 'N'
                overlap_dict[f'{label}RRA'] = 'Y' if overlap.match_tuple[2] else 'N'
                overlap_dict[f'{label}GEN'] = 'Y' if overlap.match_tuple[1] else 'N'

        exon_count = (
            ''
            if not seq
            else '20'
            if 'intron19:33433T>A'
            in gen_mms
            else '19'
        )

        cds_mms_trunc = ', '.join([
            mm
            for mm
            in cds_mms.split(', ')
            if mm and 'exon1:' not in mm
        ])
        try:
            gen_mms_trunc = ', '.join([
                mm
                for mm
                in gen_mms.split(', ')
                if mm and mm2pos(mm) >= 3768
            ])
        except:
            print(type(gen_mms))
            print(gen_mms)
            raise
    
        
        self.tuple = tuple([
            sample,
            desc,
            str(i_allele),
            allele_name,
            exon_count,
            cds_mms,
            cds_mms_trunc,
            gen_mms,
            gen_mms_trunc,
            homozygous_cds,
            homozygous_gen,
            overlap_dict.get('12CDS', ''),
            overlap_dict.get('12RRA', ''),
            overlap_dict.get('12GEN', ''),
            overlap_dict.get('23CDS', ''),
            overlap_dict.get('23RRA', ''),
            overlap_dict.get('23GEN', ''),
            overlap_dict.get('34CDS', ''),
            overlap_dict.get('34CDS', ''),
            overlap_dict.get('34RRA', ''),
            overlap_dict.get('45CDS', ''),
            overlap_dict.get('45RRA', ''),
            overlap_dict.get('45GEN', ''),
            prot_seq,
            cds_seq,
            seq
        ])
            
            

class FailedTiling:
    def __init__(
        self,
        sample: str,
        reason: str
    ):
        self.sample = sample
        self.reason = reason

    @property
    def rows(self):
        return [
            TilingRow(
                self.sample,
                self.reason,
                i+1,
            )
            for i 
            in range(2)
        ]
            

class Tiling:
    def __init__(
        self,
        sample: str,
        *fragment_pairs: FragmentPair
    ):
        self.sample = sample
        self.fragment_pairs = fragment_pairs
        self.assemblies = self.assemble()

    @classmethod
    def try_init(
        cls,
        sample: str,
        *args,
        **kwargs
    ):
        try:
            return cls(sample, *args, **kwargs)
        except (OverlapError, AssemblyError) as e:
            print(f'{type(e)}: {str(e)}', end='\n\n')
            return FailedTiling(sample, str(e)) 

    @property
    def rows(self):

        homozygous_cds = len({
            assembly.cds
            for assembly
            in self.assemblies
        }) == 1
        homozygous_gen = len({
            str(assembly)
            for assembly
            in self.assemblies
        }) == 1
        
        rows = [
            TilingRow(
                self.sample,
                '',
                i+1,
                assembly.name,
                assembly.cds_mms,
                assembly.mms,
                'Y' if homozygous_cds else 'N',
                'Y' if homozygous_gen else 'N',
                assembly.prot,
                assembly.cds,
                str(assembly),
                assembly.overlaps
            )
            for i, assembly
            in enumerate(self.assemblies)
        ]
        return rows

    def assemble(self):

        possible_assemblies = PossibleAssemblies(self.sample)
        
        for frags_left, frags_right in zip(
            self.fragment_pairs[:-1],
            self.fragment_pairs[1:],
        ):
            i_f_left: int = frags_left.fragment 
            i_f_right: int = frags_right.fragment 
            overlaps = {}
            for (i, frag_left), (j, frag_right) in frags_left.product(
                frags_right, 
                enum=True
            ):
                overlap = Overlap(frag_left, frag_right)
                #print(f'- A{i}/A{j}', overlap.match_tuple, bool(overlap))
                overlaps[(i,j)] = overlap

            acceptable_overlaps = {
                k:v
                for k, v
                in overlaps.items()
                if v
            }

            if len(acceptable_overlaps) == 0:
                raise OverlapError(
                    f'No overlaps found between F{i_f_left} and F{i_f_right}',
                    self.sample,
                    frags_left,
                    frags_right,
                    overlaps
                )
            if {_[0] for _ in acceptable_overlaps.keys()} != {0, 1}:
                raise OverlapError(
                    f'F{i_f_left} could not overlap both frags to F{i_f_right}',
                    self.sample,
                    frags_left,
                    frags_right,
                    overlaps
                )
            if {_[1] for _ in acceptable_overlaps.keys()} != {0, 1}:
                raise OverlapError(
                    f'F{i_f_left} could not overlap to both F{i_f_right} frags',
                    self.sample,
                    frags_left,
                    frags_right,
                    overlaps
                )

            #If there are three possible overlaps, one is always
            #removeable as one fragment has an unambiguous choicfe
            if len(acceptable_overlaps) == 3:
                f_left_counts = {0: [], 1:[]}
                for _i, _j in acceptable_overlaps.keys():
                    f_left_counts[_i].append(_j)
                #Find the fragment which has two options
                greedy_frag = 0 if len(f_left_counts[0]) == 2 else 1
                modest_frag = int(not greedy_frag)

                del acceptable_overlaps[(
                    greedy_frag, 
                    f_left_counts[modest_frag][0]
                )]
            #If there are 2 possible overlaps, perfect and unambig
            #If there are 4 ambigs, this is am ambiguous overlap.
            #This may or may not be a problem, depending on whether there
            #are polymorphisms both upstream and downstream that become
            #ambiguous due to the overlap ambiguity in this frag
            #Either way, here we add all possibilities
            

            possible_assemblies.update(list(acceptable_overlaps.values()))
            
            #print()
        #print(len(possible_assemblies), 'poss assemblies')

        overlaps1, overlaps2 = possible_assemblies.get_final_assemblies()
        
        #Consolidate final sequence from the frags 

        assembled1 = AssembledSequence(*overlaps1)
        assembled2 = AssembledSequence(*overlaps2)

        return assembled1, assembled2

            
class OverlapLinkedList:
    def __init__(
        self,
        head: Overlap,
        tail: 'OverlapLinkedList' = None
    ):
        self.head = head
        self.tail = tail

    def update(
        self,
        new_head: Overlap
    ) -> 'OverlapLinkedList':
        if self.head.frag_right is not new_head.frag_left:
            raise ValueError('New head doesnt match list')
        return OverlapLinkedList(
            new_head,
            self
        )

    def __iter__(self) -> list[Overlap]:
        '''
        return list of overlaps corresponding to the tree 
        '''
        
        frag_list = [self.head]
        
        if self.tail is not None:
            frag_list.extend(list(self.tail)[::-1])
            
        return iter(frag_list[::-1])

class PossibleAssemblies(list):
    def __init__(self, sample: str):
        self.sample = sample
        self.overlap_lls = []
        super().__init__(self.overlap_lls)

    def update(self, new_overlaps: list[Overlap]):

        #If no present linked lists, create them
        if not self.overlap_lls:
            for overlap in new_overlaps:
                self.overlap_lls.append(OverlapLinkedList(overlap))
            super().__init__(self.overlap_lls)
        #Else, assign them
        else:
            new_overlap_lls = []
            for ll, overlap in product(self.overlap_lls, new_overlaps): 
                if ll.head.frag_right is overlap.frag_left:
                    new_overlap_lls.append(ll.update(overlap))

            self.overlap_lls = new_overlap_lls 
            super().__init__(self.overlap_lls)

    def get_final_assemblies(self):

        unique_masked_sequences = {}
        
        for ll in self.overlap_lls:
            overlaps = list(ll)

            frags = [
                overlaps[0].frag_left, 
                *[
                    overlap.frag_right
                    for overlap
                    in overlaps
                ]
            ]
            unique_masked_sequences.setdefault(tuple(
                f.ali.seq_rr_masked
                for f
                in frags
            ), []).append(overlaps)

        #print(len(unique_masked_sequences), 'final assembly')
        #print('\n\n')

        if len(unique_masked_sequences) > 2:
            raise AssemblyError(
                'More than 2 possible sequences - unable to tile across frags',
                self.sample,
                self.overlap_lls
            )
        elif len(unique_masked_sequences) == 1:
            possibilities = next(iter(unique_masked_sequences.values()))
            chosen_overlaps1, chosen_overlaps2 = self.select_two_alleles(
                possibilities,
                possibilities
            )
        else:
            possibilities = [
                ps
                for ps
                in unique_masked_sequences.values()
            ]
            chosen_overlaps1, chosen_overlaps2 = self.select_two_alleles(
                *possibilities
            )

        return chosen_overlaps1, chosen_overlaps2

    def select_two_alleles(
        self,
        poss1: list[list[Overlap]], 
        poss2: list[list[Overlap]]
    ) -> tuple[list[Overlap], list[Overlap]]:

        acceptable_pair_scores = {}
        #Find best combination of overlaps that only consumes each frag once
        for overlaps1, overlaps2 in product(poss1, poss2):
            #Skip if any overlapping fragments
            fragment_ids_1 = {
                id(frag)
                for overlap
                in overlaps1
                for frag
                in (overlap.frag_left, overlap.frag_right)
            }
            fragment_ids_2 = {
                id(frag)
                for overlap
                in overlaps2
                for frag
                in (overlap.frag_left, overlap.frag_right)
            }
            if fragment_ids_1 & fragment_ids_2:
                continue
            acceptable_pair_scores[(tuple(overlaps1), tuple(overlaps2))] = sum([
                overlap.match_tuple[3] #Sort on kmer distance
                for overlaps
                in (overlaps1, overlaps2)
                for overlap
                in overlaps
            ])

        #Get selected overlaos
        overlaps1, overlaps2 = sorted(
            acceptable_pair_scores.items(),
            key=lambda x: x[1],
            reverse=True
        )[0][0]

        return overlaps1, overlaps2

        frags1 = [
            overlaps1[0].frag_left, 
            *[
                overlap.frag_right
                for overlap
                in overlaps1
            ]
        ]
        frags2 = [
            overlaps2[0].frag_left, 
            *[
                overlap.frag_right
                for overlap
                in overlaps2
            ]
        ]
        #for frags in [frags1, frags2]:
        #    for frag in frags:
        #        print(id(frag), end='\t')
        #    print()
        #print(frags1)
        #print(frags2)
        #print('\n\n\n')
        
        


        

        return frags1, frags2


                    

            
        
        
        

In [24]:
directory = "out/fails"

# Creates directory and intermediate folders if they don't exist
os.makedirs(directory, exist_ok=True)

In [25]:
# Load into sample dict
samples = {}

for row in rows:
    sample = samples.setdefault(row['Samples'], {})

    sequence = row['Sequence'].strip()
    alignment = piped_read_alns[sequences2i[sequence]]
    i_fragment = row['Fragment']
    read_count = int(row['NumReads'])
    fragment_name = row['Fragment Name']
    gen_mms = [
        mm
        for mm
        in (row['gDNA_mismatch_list'].split(', ') if row['gDNA_mismatch_list'] else [])
        if mm != 'No_MM'
    ]

    fragment = Fragment(
        seq=sequence,
        ali=alignment,
        fragment=i_fragment,
        read_count=read_count,
        fragment_name=fragment_name,
        gen_mms=gen_mms
    )

    if row['analysis_code'] == 1:
        sample.setdefault(i_fragment, []).append(fragment)
    elif row['analysis_code'] == 2:
        sample.setdefault(i_fragment, []).extend([fragment, deepcopy(fragment)])
    elif row['analysis_code'] == None:
        continue
    else:
        continue
        raise Exception('Other code found:', row['Typing Results::analysis_code'])
        
    #print(len(sample.get(row['Typing Results::Fragment'], None)))
    
#Lose weird numbered samples
for sample_id, fragment_info in [*samples.items()]:
    for i in range(1, 6):
    #for fragment, sequences in fragment_info.items():
        if len(fragment_info.get(i, [])) != 2:
            print('Deleting', sample_id)
            print(sample_id.ljust(20), f'fragment {i}', f'{len(fragment_info.get(i, []))} sequences', sep='\t')
            del samples[sample_id]
            break

    
#Lose None sequence
for sample_id, fragment_info in [*samples.items()]:
    for fragment, sequences in fragment_info.items():
        if None in sequences:
            print('Deleting', sample_id)
            print(sample_id.ljust(20), f'fragment {fragment}', f'{len(sequences)} sequences', sep='\t')
            del samples[sample_id]
            break

In [26]:
sample_fragment_pairs = {}

for sample_id in samples:
    sample_fragment_pairs[sample_id] = []
    for i_fragment, fragments in samples[sample_id].items():
        fragment_pair = FragmentPair(*fragments, i_fragment)
        sample_fragment_pairs[sample_id].append(fragment_pair)
    sample_fragment_pairs[sample_id].sort(
        key = lambda x: x.fragment
    )

In [27]:
tilings = []
for sample_id, fragment_pairs in sample_fragment_pairs.items():
    continue
    if sample_id != 'DEU':
        continue
    print(sample_id)
    tiling = Tiling.try_init(sample_id, *fragment_pairs)
    tilings.append(tiling)
    print(tiling.rows)
    print('\n\n\n')

In [28]:
for tiling in tilings:
    for row in tiling.rows:
        print(row.tuple)

In [29]:
from tgsts.utils.parallel import run_concurrently

concurrent_results = run_concurrently(
    Tiling.try_init,
    [
        tuple([sample_id, *fragment_pairs])
        for sample_id, fragment_pairs
        in sample_fragment_pairs.items()
    ],
    mode = 'process',
    raise_exceptions = False,
    max_workers = 30,
)

Processing:   0%|                                                                                                                | 0/66 [00:00<?, ?it/s]

<class '__main__.OverlapError'>: No overlaps found between F1 and F2



Processing:   2%|█▌                                                                                                      | 1/66 [00:00<00:21,  3.07it/s]

<class '__main__.OverlapError'>: F1 could not overlap both frags to F2

<class '__main__.OverlapError'>: F3 could not overlap both frags to F4

<class '__main__.OverlapError'>: No overlaps found between F4 and F5

Processing:   5%|████▋                                                                                                   | 3/66 [00:00<00:14,  4.25it/s]



<class '__main__.OverlapError'>: F2 could not overlap both frags to F3



Processing:   8%|███████▉                                                                                                | 5/66 [00:01<00:11,  5.39it/s]

<class '__main__.AssemblyError'>: More than 2 possible sequences - unable to tile across frags

<class '__main__.AssemblyError'>: More than 2 possible sequences - unable to tile across frags<class '__main__.OverlapError'>: F2 could not overlap to both F3 frags





Processing:  11%|███████████                                                                                             | 7/66 [00:01<00:08,  6.94it/s]

<class '__main__.AssemblyError'>: More than 2 possible sequences - unable to tile across frags



Processing:  14%|██████████████▏                                                                                         | 9/66 [00:01<00:06,  8.92it/s]

<class '__main__.AssemblyError'>: More than 2 possible sequences - unable to tile across frags

<class '__main__.AssemblyError'>: More than 2 possible sequences - unable to tile across frags



Processing:  26%|██████████████████████████▌                                                                            | 17/66 [02:41<06:37,  8.12s/it]

<class '__main__.OverlapError'>: F4 could not overlap both frags to F5



Processing:  33%|██████████████████████████████████▎                                                                    | 22/66 [02:44<01:19,  1.80s/it]

<class '__main__.OverlapError'>: F2 could not overlap both frags to F3



Processing:  35%|███████████████████████████████████▉                                                                   | 23/66 [02:44<00:59,  1.37s/it]

<class '__main__.AssemblyError'>: More than 2 possible sequences - unable to tile across frags



Processing:  38%|███████████████████████████████████████                                                                | 25/66 [02:45<00:39,  1.05it/s]

<class '__main__.OverlapError'>: F4 could not overlap both frags to F5



Processing:  41%|██████████████████████████████████████████▏                                                            | 27/66 [02:46<00:24,  1.56it/s]

<class '__main__.OverlapError'>: F1 could not overlap both frags to F2



Processing:  47%|████████████████████████████████████████████████▍                                                      | 31/66 [02:48<00:18,  1.88it/s]

<class '__main__.AssemblyError'>: More than 2 possible sequences - unable to tile across frags



Processing:  53%|██████████████████████████████████████████████████████▌                                                | 35/66 [02:51<00:19,  1.63it/s]

<class '__main__.OverlapError'>: F1 could not overlap both frags to F2



Processing: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 66/66 [05:15<00:00,  4.79s/it]


In [30]:
len(concurrent_results.successful), len(concurrent_results.failed)

(66, 0)

In [31]:
concurrent_results.successful[0].result

<__main__.FailedTiling at 0x7f4e591a3410>

In [32]:
tilings = [
    result.result
    for result
    in concurrent_results.successful
]
    

In [34]:
output_path = 'out/assembled.csv'

import csv

with open(output_path, 'w', newline='\n') as csvfile:
    writer = csv.writer(
        csvfile, 
        delimiter=',',
        quotechar='"', 
        quoting=csv.QUOTE_MINIMAL
    )
    writer.writerow(TilingRow.header)
    for tiling in tilings:
        if isinstance(tiling, Tiling):
            for row in tiling.rows:
                writer.writerow(row.tuple)






In [35]:
output_path = 'out/unassembled.csv'

import csv

with open(output_path, 'w', newline='\n') as csvfile:
    writer = csv.writer(
        csvfile, 
        delimiter=',',
        quotechar='"', 
        quoting=csv.QUOTE_MINIMAL
    )
    writer.writerow(TilingRow.header)
    for tiling in tilings:
        if not isinstance(tiling, Tiling):
            for row in tiling.rows:
                writer.writerow(row.tuple)



