__Biopython's SearchIO module allows me to load/read XML files__

In [2]:
from Bio import SearchIO
alignment = SearchIO.read('Forrest - Williams 82 alignment.xml', 'blast-xml')
print(alignment)

Program: blastn (2.13.0+)
  Query: [Glycine (94336)
         max] Forrest B100B10 BAC sequence, Rhg4
 Target: n/a
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0     19  Query_13349  gi|148372212|gb|EF623856.1| Glycine max cu...


- Then I need to go through the hits in the results of the files to get the HSPs.

In [2]:
from Bio import SearchIO

blast_qresult = SearchIO.read('Forrest - Williams 82 Alignment.xml', 'blast-xml')
print(len(blast_qresult))
blast_hit = blast_qresult[0]
print(blast_hit)

1
Query: [Glycine
       max] Forrest B100B10 BAC sequence, Rhg4
  Hit: Query_1863 (141163)
       gi|148372212|gb|EF623856.1| Glycine max cultivar Williams 82 clone BAC...
 HSPs: ----  --------  ---------  ------  ---------------  ---------------------
          #   E-value  Bit score    Span      Query range              Hit range
       ----  --------  ---------  ------  ---------------  ---------------------
          0         0   70705.10   39844     [1036:40810]          [53556:93370]
          1         0   25012.20   14437    [78415:92809]              [0:14376]
          2         0   24562.30   14249    [40296:54495]          [39608:53796]
          3         0   21258.50   12286    [59690:71892]          [21436:33703]
          4         0   11573.50    6581    [71898:78455]          [14650:21213]
          5         0    8983.87    5032    [54491:59522]          [33686:38711]
          6         0     696.49     658    [85532:86182]            [5924:6565]
          7      

In [6]:
for hsp in blast_hit:
    print(hsp)

      Query: [Glycine max] Forrest B100B10 BAC sequence, Rhg4
        Hit: Query_1863 gi|148372212|gb|EF623856.1| Glycine max cultivar Will...
Query range: [1036:40810] (1)
  Hit range: [53556:93370] (-1)
Quick stats: evalue 0; bitscore 70705.10
  Fragments: 1 (39844 columns)
     Query - AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAA~~~GAATT
             |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||~~~|||||
       Hit - AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAA~~~GAATT
      Query: [Glycine max] Forrest B100B10 BAC sequence, Rhg4
        Hit: Query_1863 gi|148372212|gb|EF623856.1| Glycine max cultivar Will...
Query range: [78415:92809] (1)
  Hit range: [0:14376] (-1)
Quick stats: evalue 0; bitscore 25012.20
  Fragments: 1 (14437 columns)
     Query - ATATATATATATATATATATATATGTATATATGTATATATGTGTGTCGTACATGTTTAT~~~AGCTT
             |||||||||||||||||||||||| ||| ||||||||||||||||||||||||||||||~~~|||||
       Hit - ATATATATATATATATATATATATATATGTA

In [3]:
for i, hsp in enumerate(blast_hit):
    print(i, hsp)

0       Query: [Glycine max] Forrest B100B10 BAC sequence, Rhg4
        Hit: Query_1863 gi|148372212|gb|EF623856.1| Glycine max cultivar Will...
Query range: [1036:40810] (1)
  Hit range: [53556:93370] (-1)
Quick stats: evalue 0; bitscore 70705.10
  Fragments: 1 (39844 columns)
     Query - AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAA~~~GAATT
             |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||~~~|||||
       Hit - AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAA~~~GAATT
1       Query: [Glycine max] Forrest B100B10 BAC sequence, Rhg4
        Hit: Query_1863 gi|148372212|gb|EF623856.1| Glycine max cultivar Will...
Query range: [78415:92809] (1)
  Hit range: [0:14376] (-1)
Quick stats: evalue 0; bitscore 25012.20
  Fragments: 1 (14437 columns)
     Query - ATATATATATATATATATATATATGTATATATGTATATATGTGTGTCGTACATGTTTAT~~~AGCTT
             |||||||||||||||||||||||| ||| ||||||||||||||||||||||||||||||~~~|||||
       Hit - ATATATATATATATATATATATATATA

- I can use enumerate to retrieve indices of HSPs or use HSP Fragments to see each HSP individually

In [1]:
from Bio import SearchIO

blast_qresult = SearchIO.read('Forrest - Williams 82 alignment.xml', 'blast-xml')
blast_frag = blast_qresult[0][0][0]
print(f'type of blast_frag: {type(blast_frag)}')
print(blast_frag)

type of blast_frag: <class 'Bio.SearchIO._model.hsp.HSPFragment'>
      Query: [Glycine max] Forrest B100B10 BAC sequence, Rhg4
        Hit: Query_1863 gi|148372212|gb|EF623856.1| Glycine max cultivar Will...
Query range: [1036:40810] (1)
  Hit range: [53556:93370] (-1)
  Fragments: 1 (39844 columns)
     Query - AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAA~~~GAATT
             |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||~~~|||||
       Hit - AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAA~~~GAATT


### Using sliding window to see the alignment in chunks

In [1]:
from Bio import SearchIO

blast_qresult = SearchIO.read('Forrest - Williams 82 Alignment.xml', 'blast-xml')
print(len(blast_qresult))
blast_hit = blast_qresult[0]

1


In [2]:
hsp_list = []
for hsp in blast_hit:
    hsp_list.append(hsp)
print(hsp_list)

[HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Glycine', 1 fragments), HSP(hit_id='Query_1863', query_id='[Gly

In [4]:
from Bio.Blast import NCBIXML

result = open('Forrest - Williams 82 Alignment.xml', 'r')
records = NCBIXML.parse(result)
item = next(records)
for alignment in item.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < 0.01:
            print('****Alignment****')
            print(hsp.query[91:179] + '...')
            print(hsp.match[91:179] + '...')
            print(hsp.sbjct[91:179] + '...')

****Alignment****
TTACTAAGATACCTTGTACAAAATACAAATACATTAAAAAATCCAAAAAACTTAGATGCAATCACTTTGGTGTTTTGGACATTGTTCT...
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
TTACTAAGATACCTTGTACAAAATACAAATACATTAAAAAATCCAAAAAACTTAGATGCAATCACTTTGGTGTTTTGGACATTGTTCT...
****Alignment****
TTTTATTTTTTTAAATTAATTATAAATTATTTATTTCAAACAATATTTAAAAATAATATATCTATTTTGGTAAATGAAAAATAAAATG...
| |||||||||| |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
TATTATTTTTTTTAATTAATTATAAATTATTTATTTCAAACAATATTTAAAAATAATATATCTATTTTGGTAAATGAAAAATAAAATG...
****Alignment****
ATATCTGAAATGCTACATTTATTATCTACAGAGTCCTGAATCTGTTTATGTTTTCAATGTTGATGTAACATGTTAGACTGATTCTGCC...
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
ATATCTGAAATGCTACATTTATTATCTACAGAGTCCTGAATCTGTTTATGTTTTCAATGTTGATGTAACATGTTAGACTGATTCTGCC...
****Alignment****
TGAAAGTTGTGTTAAATGGAGTGCTTCTTCCTCTTCTTTTTTTCCCATCAGTTTATGTGCTTATTTCTTTTGTTTGTTTATTATCATG...
||||||||

#### Display alignment without moving through the alignment

In [3]:
from Bio.Blast import NCBIXML

def read_XML_alignment(input_file):
    result = open(input_file, 'r')
    records = NCBIXML.parse(result)
    # item = next(records)
    # return item
    return records

def display_alignment(input_file, window_size):
    """
    Takes parsed NCBI XML alignment file and using sliding window displays it in sections/ranges.
    window_size is the length of the sliding window.
    """
    result_file = read_XML_alignment(input_file)
    item = next(records)
    start = 0
    for alignment in item.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < 0.01:
                print('Alignment')
                print(hsp.query[start:start+window_size])
                print(hsp.match[start:start+window_size])
                print(hsp.sbjct[start:start+window_size])
    pass

input_file = 'Forrest - Williams 82 Alignment.xml'
window_size = 90
display_alignment(input_file, window_size)

NameError: name 'records' is not defined

In [1]:
from Bio.Blast import NCBIXML

def read_XML_alignment(input_file):
    result = open(input_file, 'r')
    records = NCBIXML.parse(result)
    # item = next(records)
    # return item
    return records

def display_alignment(records, window_size):
    """
    Takes parsed NCBI XML alignment file and using sliding window displays it in sections/ranges.
    window_size is the length of the sliding window.
    """
    result_file = read_XML_alignment(input_file)
    print(f'len(list(records)): {len(list(records))}')
    item = next(records)
    start = 0
    for alignment in item.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < 0.01:
                print('Alignment')
                print(hsp.query[start:start+window_size])
                print(hsp.match[start:start+window_size])
                print(hsp.sbjct[start:start+window_size])
    pass

input_file = 'Forrest - Williams 82 Alignment.xml'
records = read_XML_alignment(input_file)
window_size = 90
display_alignment(records, window_size)

len(list(records)): 1


StopIteration: 

In [29]:
# using read() instead of parse()

from Bio.Blast import NCBIXML

def read_XML_alignment(input_file):
    file_handle = open(input_file)
    record = NCBIXML.read(file_handle)  # read returns a single record. Use parse for multiple records
    return record

def display_alignment(input_fname, window_size):
    """
    Takes parsed NCBI XML alignment file and using sliding window displays it in sections/ranges.
    window_size is the length of the sliding window.
    """
    blast_record = read_XML_alignment(input_fname)
    start = 0
    print(f'number of alignments: {len(blast_record.alignments)}')
    for alignment in blast_record.alignments:
        print(f'number of HSPs: {len(alignment.hsps)}')
        for hsp in alignment.hsps:
            if hsp.expect < 0.01:
                print('Alignment')
                print(hsp.query[start:start+window_size])
                print(hsp.match[start:start+window_size])
                print(hsp.sbjct[start:start+window_size])

input_fname = 'Forrest - Williams 82 Alignment.xml'
# records = read_XML_alignment(input_fname)
window_size = 90
display_alignment(input_fname, window_size)

number of alignments: 1
number of HSPs: 19
Alignment
AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAATGAAATCTCTTTCAATACTAATCTACAACAA
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAATGAAATCTCTTTCAATACTAATCTACAACAA
Alignment
ATATATATATATATATATATATATGTATATATGTATATATGTGTGTCGTACATGTTTATTTCAATGTGAAAAATATGTTACGAAATGAAA
|||||||||||||||||||||||| ||| |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ATATATATATATATATATATATATATATGTATGTATATATGTGTGTCGTACATGTTTATTTCAATGTGAAAAATATGTTACGAAATGAAA
Alignment
TTTTGTGAAGTTAAAAGAGAGTATTACATAACATTTAAAAATTATGATTCTATACTAATTTCTAGATTAAGGTTTATTCTCTGTTCTGAG
|||| |||  ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
TTTTATGACATTAAAAGAGAGTATTACATAACATTTAAAAATTATGATTCTATACTAATTTCTAGATTAAGGTTTATTCTCTGTTCTGAG
Alignment
TATGCTACTTTTTTGTTCTGTGAATGTGGCAATAATGTATTTGATAATCAAAAGGAAATGCATAGTCTCTTCCTCATAAGATAGTTGGAC
||||| |

In [1]:
# WORKING CELL
# using read() instead of parse()

from Bio.Blast import NCBIXML

def display_alignment(input_fname, window_size):
    """
    Reads input XML alignment file and using sliding window displays it in sections/ranges.
    window_size is the length of the sliding window.
    """
    file_handle = open(input_fname)
    blast_record = NCBIXML.read(file_handle)  # read returns a single record. Use parse for multiple records
    start = 0
    print(f'number of alignments: {len(blast_record.alignments)}')
    for alignment in blast_record.alignments:
        print(f'number of HSPs: {len(alignment.hsps)}')
        for hsp in alignment.hsps:
            if hsp.expect < 0.01:
                print('Alignment')
                print(hsp.query[start:start+window_size])
                print(hsp.match[start:start+window_size])
                print(hsp.sbjct[start:start+window_size])

input_fname = 'Forrest - Williams 82 Alignment.xml'
window_size = 90
display_alignment(input_fname, window_size)

number of alignments: 1
number of HSPs: 19
Alignment
AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAATGAAATCTCTTTCAATACTAATCTACAACAA
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAATGAAATCTCTTTCAATACTAATCTACAACAA
Alignment
ATATATATATATATATATATATATGTATATATGTATATATGTGTGTCGTACATGTTTATTTCAATGTGAAAAATATGTTACGAAATGAAA
|||||||||||||||||||||||| ||| |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ATATATATATATATATATATATATATATGTATGTATATATGTGTGTCGTACATGTTTATTTCAATGTGAAAAATATGTTACGAAATGAAA
Alignment
TTTTGTGAAGTTAAAAGAGAGTATTACATAACATTTAAAAATTATGATTCTATACTAATTTCTAGATTAAGGTTTATTCTCTGTTCTGAG
|||| |||  ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
TTTTATGACATTAAAAGAGAGTATTACATAACATTTAAAAATTATGATTCTATACTAATTTCTAGATTAAGGTTTATTCTCTGTTCTGAG
Alignment
TATGCTACTTTTTTGTTCTGTGAATGTGGCAATAATGTATTTGATAATCAAAAGGAAATGCATAGTCTCTTCCTCATAAGATAGTTGGAC
||||| |

In [1]:
# WORKING CELL
# using read() instead of parse()

from Bio.Blast import NCBIXML

def display_alignment(input_fname, window_size):
    """
    Reads input XML alignment file and using sliding window displays it in sections/ranges.
    window_size is the length of the sliding window.
    """
    file_handle = open(input_fname)
    blast_record = NCBIXML.read(file_handle)  # read returns a single record. Use parse for multiple records
    start = 0
    print(f'number of alignments: {len(blast_record.alignments)}')
    for alignment in blast_record.alignments:
        print(f'number of HSPs: {len(alignment.hsps)}')
        for hsp in alignment.hsps:
            if hsp.expect < 0.01:
                print('Alignment')
                print(hsp.query[start:start+window_size])
                print(hsp.match[start:start+window_size])
                print(hsp.sbjct[start:start+window_size])

input_fname = 'Forrest - Williams 82 Alignment.xml'
window_size = 90
display_alignment(input_fname, window_size)

number of alignments: 1
number of HSPs: 19
Alignment
AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAATGAAATCTCTTTCAATACTAATCTACAACAA
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAATGAAATCTCTTTCAATACTAATCTACAACAA
Alignment
ATATATATATATATATATATATATGTATATATGTATATATGTGTGTCGTACATGTTTATTTCAATGTGAAAAATATGTTACGAAATGAAA
|||||||||||||||||||||||| ||| |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ATATATATATATATATATATATATATATGTATGTATATATGTGTGTCGTACATGTTTATTTCAATGTGAAAAATATGTTACGAAATGAAA
Alignment
TTTTGTGAAGTTAAAAGAGAGTATTACATAACATTTAAAAATTATGATTCTATACTAATTTCTAGATTAAGGTTTATTCTCTGTTCTGAG
|||| |||  ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
TTTTATGACATTAAAAGAGAGTATTACATAACATTTAAAAATTATGATTCTATACTAATTTCTAGATTAAGGTTTATTCTCTGTTCTGAG
Alignment
TATGCTACTTTTTTGTTCTGTGAATGTGGCAATAATGTATTTGATAATCAAAAGGAAATGCATAGTCTCTTCCTCATAAGATAGTTGGAC
||||| |

### Next steps: assign sequences to variables/container for insertions/deletions analysis

#### Testing another way of extracting just the alignment

In [2]:
from Bio import SearchIO

blast_qresult = SearchIO.read('Forrest - Williams 82 Alignment.xml', 'blast-xml')
blast_frag = blast_qresult[0][0][0]  # looking at first hit, first hsp, first fragment
print(blast_frag)

      Query: [Glycine max] Forrest B100B10 BAC sequence, Rhg4
        Hit: Query_1863 gi|148372212|gb|EF623856.1| Glycine max cultivar Will...
Query range: [1036:40810] (1)
  Hit range: [53556:93370] (-1)
  Fragments: 1 (39844 columns)
     Query - AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAA~~~GAATT
             |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||~~~|||||
       Hit - AAGCTTTAGCCCGAACTAACATACCTCCTTTGTTAAAGCACTAAAAAACACATCCTCAA~~~GAATT


In [4]:
for i in range(len(blast_qresult)):
    blast_frag = blast_qresult[i]
print(f'blast_frag using for loop: {blast_frag}')

blast_frag using for loop: Query: [Glycine
       max] Forrest B100B10 BAC sequence, Rhg4
  Hit: Query_1863 (141163)
       gi|148372212|gb|EF623856.1| Glycine max cultivar Williams 82 clone BAC...
 HSPs: ----  --------  ---------  ------  ---------------  ---------------------
          #   E-value  Bit score    Span      Query range              Hit range
       ----  --------  ---------  ------  ---------------  ---------------------
          0         0   70705.10   39844     [1036:40810]          [53556:93370]
          1         0   25012.20   14437    [78415:92809]              [0:14376]
          2         0   24562.30   14249    [40296:54495]          [39608:53796]
          3         0   21258.50   12286    [59690:71892]          [21436:33703]
          4         0   11573.50    6581    [71898:78455]          [14650:21213]
          5         0    8983.87    5032    [54491:59522]          [33686:38711]
          6         0     696.49     658    [85532:86182]            [592

In [5]:
from Bio import SearchIO

blast_qresult = SearchIO.read('Forrest - Williams 82 Alignment.xml', 'blast-xml')
for blast_hit in blast_qresult:
    for hsp in blast_hit:
        blast_hsp = blast_qresult[blast_hit][hsp]
print(blast_hsp)

KeyError: Hit(id='Query_1863', query_id='[Glycine', 19 hsps)