# BLAST search of the longest protein of COVID-19
* BLAST search for finding the best-aligned sequence
* 

In [1]:
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio import SearchIO # For parsing BLAST results


In [2]:
# Read the protein record back from the FASTA file saved earlier
protein_seq = SeqIO.read(handle="covid_protein_seq.fasta", format="fasta")
# Show the raw sequence string of that record
print(protein_seq.seq)

CTIVFKRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFAKFLKTNCCRFQEKDEDDNLIDSYFVVKRHTFSNYQHEETIYNLLKDCPAVAKHDFFKFRIDGDMVPHISRQRLTKYTMADLVYALRHFDEGNCDTLKEILVTYNCCDDDYFNKKDWYDFVENPDILRVYANLGERVRQALLKTVQFCDAMRNAGIVGVLTLDNQDLNGNWYDFGDFIQTTPGSGVPVVDSYYSLLMPILTLTRALTAESHVDTDLTKPYIKWDLLKYDFTEERLKLFDRYFKYWDQTYHPNCVNCLDDRCILHCANFNVLFSTVFPPTSFGPLVRKIFVDGVPFVVSTGYHFRELGVVHNQDVNLHSSRLSFKELLVYAADPAMHAASGNLLLDKRTTCFSVAALTNNVAFQTVKPGNFNKDFYDFAVSKGFFKEGSSVELKHFFFAQDGNAAISDYDYYRYNLPTMCDIRQLLFVVEVVDKYFDCYDGGCINANQVIVNNLDKSAGFPFNKWGKARLYYDSMSYEDQDALFAYTKRNVIPTITQMNLKYAISAKNRARTVAGVSICSTMTNRQFHQKLLKSIAATRGATVVIGTSKFYGGWHNMLKTVYSDVENPHLMGWDYPKCDRAMPNMLRIMASLVLARKHTTCCSLSHRFYRLANECAQVLSEMVMCGGSLYVKPGGTSSGDATTAYANSVFNICQAVTANVNALLSTDGNKIADKYVRNLQHRLYECLYRNRDVDTDFVNEFYAYLRKHFSMMILSDDAVVCFNSTYASQGLVASIKNFKSVLYYQNNVFMSEAKCWTETDLTKGPHEFCSQHTMLVKQGDDYVYLPYPDPSRILGAGCFVDDIVKTDGTLMIERFVSLAIDAYPLTKHPNQEYADVFHLYLQYIRKLHDELTGHMLDMYSVMLTNDNTSRYWEPEFYEAMYTPHTVLQAVGACVLCNSQTSLRCGACIRRPFLCCKCCYDHVISTSHKLVLSVNPYVCNAPGCDVTDVTQLYLGGMSYY

In [3]:
# Run a blastp query for the protein against the Protein Data Bank ("pdb") database.
# The returned handle holds the raw BLAST output that later cells parse.
result_handle = NCBIWWW.qblast(
    program="blastp",
    database="pdb",
    sequence=protein_seq.seq,
)

In [None]:
# Parse the BLAST XML the conventional way: SearchIO.parse keeps the full
# QueryResult -> Hit -> HSP hierarchy and yields one QueryResult per query.
# Alternative (changes the hierarchy to start at QueryResult/Hit -> HSP):
#   blast_records = SearchIO.read(result_handle, 'blast-xml')
blast_records = SearchIO.parse(
    result_handle,
    format='blast-xml',
)

In [None]:
# Understand the data type returned by SearchIO.parse.
#
# SearchIO.parse gives back a lazy, one-shot iterator: it has no len(), and
# pulling an item with next() would use that item up for every later cell.
# To inspect it safely, the handle is rewound and the records are materialized
# into a throwaway list; `blast_records` itself is left as a fresh iterator.
print('BLAST records read via SearchIO')
print(f'Output type = {type(blast_records)}') # Check the type of the object

# Rewind so the list below always starts from the first record
result_handle.seek(0)
query_results = list(SearchIO.parse(result_handle, 'blast-xml'))
print(f'Number of records = {len(query_results)}')
if query_results:
    # Summary of the first QueryResult (guarded so an empty result does not raise)
    print(query_results[0])

# Hand later cells a fresh, unconsumed iterator over the same results
result_handle.seek(0)
blast_records = SearchIO.parse(result_handle, 'blast-xml')

## BLAST SearchIO's output
* [Official document](https://biopython.org/docs/1.76/api/Bio.SearchIO.BlastIO.html#submodules)
* The output is hierarchical; 3-level containers
    * (1) QueryResult (`SearchIO.read` returns this object directly, so this level is skipped; `SearchIO.parse` yields one per query)
    * (2) Hit 
    * (3) HSP (= High-scoring Segment Pair)
* Source code on [Hit Class](https://github.com/biopython/biopython/blob/master/Bio/SearchIO/_model/hit.py)
* 

In [9]:
# Walk the three-level BLAST result hierarchy: QueryResult -> Hit -> HSP.
#
# Rewind and re-parse BEFORE iterating: `blast_records` is a one-shot iterator,
# so if an earlier cell (or a previous run of this cell) already consumed it,
# the loop below would silently print nothing.
result_handle.seek(0)
blast_records = SearchIO.parse(result_handle, 'blast-xml')

for query_result in blast_records:
    print(f"\nQuery ID: {query_result.id} (Description: {query_result.description})")
    print(f"Found {len(query_result)} hits")

    # Loop through all hits for this query
    for hit in query_result:
        print(f"\nHit ID: {hit.id}")
        print(f"Hit description: {hit.description}")
        print(f"Number of HSPs: {len(hit)}")

        # Now loop through HSPs for this hit
        for hsp in hit:
            print(f"\n  HSP E-value: {hsp.evalue}")
            print(f"  Bit score: {hsp.bitscore}")
            print(f"  Query range: {hsp.query_start}-{hsp.query_end}")
            print(f"  Hit range: {hsp.hit_start}-{hsp.hit_end}")

# The loop exhausted the iterator; rewind the result_handle to the beginning
result_handle.seek(0)

# Re-parse the BLAST results so later cells get a fresh iterator
blast_records = SearchIO.parse(result_handle, 'blast-xml')



Query ID: unnamed (Description: protein product)
Found 50 hits

Hit ID: pdb|7D4F|A
Hit description: Chain A, RNA-directed RNA polymerase [Severe acute respiratory syndrome coronavirus 2]
Number of HSPs: 1

  HSP E-value: 0.0
  Bit score: 1938.7
  Query range: 4-930
  Hit range: 8-934

Hit ID: pdb|6YYT|A
Hit description: Chain A, nsp12 [Severe acute respiratory syndrome coronavirus 2]
Number of HSPs: 1

  HSP E-value: 0.0
  Bit score: 1938.31
  Query range: 4-929
  Hit range: 10-935

Hit ID: pdb|6XEZ|A
Hit description: Chain A, RNA-directed RNA polymerase [Severe acute respiratory syndrome coronavirus 2]
Number of HSPs: 1

  HSP E-value: 0.0
  Bit score: 1937.92
  Query range: 4-929
  Hit range: 7-932

Hit ID: pdb|9CGV|A
Hit description: Chain A, RNA-directed RNA polymerase nsp12 [Severe acute respiratory syndrome coronavirus 2]
Number of HSPs: 1

  HSP E-value: 0.0
  Bit score: 1937.54
  Query range: 4-929
  Hit range: 23-948

Hit ID: pdb|7BW4|A
Hit description: Chain A, RNA-directed 