<a href="https://colab.research.google.com/github/rororourb0at/RohanA-UTD-CompBioProject/blob/main/RohanAluruGenomicsProjectUTD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook setup: install Biopython with pip (the leading "!" is notebook shell
# syntax, not Python), then import the package and print its version to confirm
# which release was installed.
! pip install biopython
import Bio
print(Bio.__version__)

Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81
1.81


In [None]:
from Bio import SeqIO
from Bio import AlignIO
# In the genomics track, my project focuses on noroviruses. I start by comparing
# the Norwalk virus (the first norovirus discovered, and a member of genogroup I)
# with another norovirus strain from genogroup II, genotype 4. Genogroups I and II
# are the most common noroviruses to infect humans, and genogroup II, genotype 4
# viruses cause the majority of gastroenteritis outbreaks in adults. Comparing the
# genogroups will help me identify which differences in nucleotide sequence make
# genogroup II, genotype 4 more widespread in humans than other strains of norovirus.


In [None]:
# Program 1 - Comparing the Norwalk Virus Strain to the Norovirus Turramurra Strain
from Bio import Entrez
from Bio.pairwise2 import align

# Contact e-mail registered on Bio.Entrez before the efetch requests made below
# (NCBI asks Entrez clients to identify themselves; see the Bio.Entrez docs).
Entrez.email = "rohanaluru@gmail.com"

def retrieve_sequence(accession):
    """Download one nucleotide record from NCBI Entrez and parse it as FASTA.

    Args:
        accession: Identifier passed to ``Entrez.efetch`` as ``id``
            (for example ``"AF093797.1"``).

    Returns:
        The single record parsed by ``SeqIO.read`` from the FASTA response.

    Raises:
        ValueError: Propagated from ``SeqIO.read`` when the response does not
            contain exactly one record (per the Biopython documentation).
    """
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    try:
        return SeqIO.read(handle, "fasta")
    finally:
        # Always release the network handle, even when parsing fails.
        handle.close()

# Accession IDs for the two genomes compared in Program 1: the Norwalk virus
# and the Norovirus Turramurra strain.
accession_number_NorwalkVirus = "AF093797.1"
accession_number_NorovirusG2 = "HM748973.2"

# Fetch both records; each call performs one Entrez request via retrieve_sequence.
sequence_recordG1 = retrieve_sequence(accession_number_NorwalkVirus)
sequence_recordG2 = retrieve_sequence(accession_number_NorovirusG2)

def nucleotide_matches(seq1, seq2):
    """Print two aligned sequences, a match track, and their percent similarity.

    The match track has "|" under every column where both sequences hold the
    same character and a space elsewhere. The similarity percentage is the
    number of matching columns divided by the length of the longer sequence
    after its "-" gap characters are removed.

    Args:
        seq1: First aligned sequence (printed with the "Norwalk Virus" label);
            may contain "-" gap characters.
        seq2: Second aligned sequence (printed with the "Norovirus Turramurra"
            label); may contain "-" gap characters.

    Returns:
        None. All results are written to standard output.
    """
    # Derive the ungapped lengths from the arguments themselves so the result
    # depends only on what was passed in, not on module-level records.
    ungapped_len1 = len(seq1) - str(seq1).count("-")
    ungapped_len2 = len(seq2) - str(seq2).count("-")
    total_positions = max(ungapped_len1, ungapped_len2)

    # zip stops at the shorter input, so trailing unpaired columns are ignored.
    column_matches = [s1 == s2 for s1, s2 in zip(seq1, seq2)]
    visual = "".join("|" if same else " " for same in column_matches)
    match_count = sum(column_matches)

    print("Norwalk Virus:        ", seq1)
    print("                      ", visual)
    print("Norovirus Turramurra: ", seq2)
    # Two empty sequences have no positions to compare; report 0 % instead of
    # raising ZeroDivisionError.
    if total_positions:
        similarity_percentage = (match_count / total_positions) * 100
    else:
        similarity_percentage = 0.0
    print("Similarity Percentage: ", similarity_percentage, "%")

# Globally align the two fetched genomes with pairwise2's globalxx and keep
# the first alignment it returns.
alignments = align.globalxx(sequence_recordG1.seq, sequence_recordG2.seq)
best_alignment = alignments[0]

# seqA / seqB are the two aligned sequences, in the order they were passed in.
Norwalk_Virus, Norovirus_G2 = best_alignment.seqA, best_alignment.seqB

# Print the alignment, its match track, and the similarity percentage.
nucleotide_matches(Norwalk_Virus, Norovirus_G2)

Norwalk Virus:         GTGAATGAT-GATGGCGTCGA-AA-GACG-T-C-G-T-----TG---C-AACTAACGCTT--CAA-G-CACTG-GTGACAATAGTAACA--CAA-TGAA-GGA-TCGCTTTT-TAGCGA-GA-CTCAAAAGT---TTGGGT-AACACCA-AGCCCATAAAAATT-GAAAA---CA-CA--C--AAATG--GCTCTCGGGCTTCTAAGCCG-GGAGCCATCACCATCG-CC-CAACCGT-GACCCTCCTAAACA-ACAACGGGACAGGGC-AC-CACGGAGTG---TGGCTGAAACCCAACAGGCAATGGGATGGACAGACCCACCCGTTGATCAAAACCTGCCAAC--AT--GG-GAGGAGTTGAGCCAGTCTGAGAA-GCAGAAAATCATGG-CTGAG-A-A-T-TCAAAG-TGGTTTGATGCTGGTGGTCTAGGTCCTGCCACCTTGCCATCGAATTATTGTCGTGTTCAAGACAACG-GTG-ATGGTGACCAGCA--GGTTAAATGGAG-TGCTAAAGAT--GG-TGTCAATCTGGGG-GTTGATGCCCTAACA-ACTGTC---CA-AGGCCCCCCATG-GAATTTATGCCCACTTCCCCCAGTCGATCAACGCAATAGTGG-GGCAGCCA---A--GGAACCCCTGA-TT--GGAGATATGATTGAGTTCTATGAA-GG-GCACATATAC---CATTATG-CCATGTACAT--AG-GTC-AGG----G-GAAAACAATT-GGTGTGCATTCC---CCTAGGGCC-GCTTTCT-CCAT-A-CCT----AGGAT-TACC-ATAC-A-C-CCAATCG-CTGCCTG-GT---GGAG----G-GTATG-CT----ATGTC-CC-CA-C-CC--AGGAACAAAGGCTGAGT-T-AT-GACCAACTTAAAG-AGCTTGAAA-ATGA-ACC-TTGGCCA--TATGCA--GC-TGTT-ACTAACAACTGCTATGAG--TTTT

In [None]:
# Program 2 - Listing Strains that are genetically similar to the Norovirus Turramurra Strain
from Bio.Blast import NCBIWWW

from Bio.Blast import NCBIXML

# Contact e-mail registered on Bio.Entrez for the efetch request made in this
# cell (same value as the one assigned in Program 1).
Entrez.email = "rohanaluru@gmail.com"

def retrieve_sequence(accession):
    """Download one nucleotide record from NCBI Entrez and parse it as FASTA.

    Args:
        accession: Identifier passed to ``Entrez.efetch`` as ``id``
            (for example ``"HM748973.2"``).

    Returns:
        The single record parsed by ``SeqIO.read`` from the FASTA response.

    Raises:
        ValueError: Propagated from ``SeqIO.read`` when the response does not
            contain exactly one record (per the Biopython documentation).
    """
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    try:
        return SeqIO.read(handle, "fasta")
    finally:
        # Always release the network handle, even when parsing fails.
        handle.close()

def perform_blast(sequence, program="blastn", database="nt"):
    """Submit *sequence* to NCBI BLAST through ``NCBIWWW.qblast``.

    Args:
        sequence: Query forwarded to ``qblast`` unchanged (the caller in this
            notebook passes FASTA-formatted text).
        program: BLAST program name; defaults to ``"blastn"``.
        database: Database to search; defaults to ``"nt"``.

    Returns:
        Whatever ``NCBIWWW.qblast`` returns: the handle holding the results.
    """
    return NCBIWWW.qblast(program, database, sequence)


# Accession ID of the Norovirus Turramurra genome used as the BLAST query.
accession_number = "HM748973.2"

# Fetch the query record from Entrez via retrieve_sequence.
sequence_record = retrieve_sequence(accession_number)


def analyze_blast_results(result_handle):
    """Print a three-line summary for every HSP in a BLAST XML result.

    For each HSP of each alignment in each parsed record, prints the
    alignment title, the alignment's ``length`` attribute, and
    ``hsp.identities / alignment.length * 100`` as a percentage.

    Args:
        result_handle: Handle with BLAST XML output, consumed by
            ``NCBIXML.parse``.

    NOTE(review): the percentage divides by ``alignment.length`` rather than
    the HSP's own aligned length, so partial hits score low. Confirm this is
    the intended metric.
    """
    # Lazily flatten records -> alignments -> HSPs; iteration order is the
    # same as three nested for-loops.
    hit_segments = (
        (hit, segment)
        for blast_record in NCBIXML.parse(result_handle)
        for hit in blast_record.alignments
        for segment in hit.hsps
    )
    for hit, segment in hit_segments:
        print(f"Sequence ID: {hit.title}")
        print(f"Sequence Length: {hit.length}")
        print(f"Sequence Similarity: {segment.identities / hit.length * 100}%")


# Submit the query record as FASTA text with perform_blast's default program and database.
blast_results = perform_blast(sequence_record.format("fasta"))

# Parse the returned results and print one summary per HSP.
analyze_blast_results(blast_results)

Sequence ID: gi|374674602|gb|HM748973.2| Norovirus Hu/GII.4/Turramurra/NSW892U/2009/AUS, complete genome
Sequence Length: 7560
Sequence Similarity: 100.0%
Sequence ID: gi|305415031|gb|HQ009513.1| Norovirus Hu/GII.4/JB-15/KOR/2008, complete genome
Sequence Length: 7558
Sequence Similarity: 98.21381317808944%
Sequence ID: gi|302128698|dbj|AB541322.1| Norovirus Hu/GII-4/Osaka2/2008/JP genomic RNA, complete genome
Sequence Length: 7509
Sequence Similarity: 98.45518710880276%
Sequence ID: gi|302128690|dbj|AB541320.1| Norovirus Hu/GII-4/Osaka1/2008/JP genomic RNA, complete genome
Sequence Length: 7509
Sequence Similarity: 98.36196564123053%
Sequence ID: gi|302128218|dbj|AB541202.1| Norovirus Hu/GII-4/Aichi1/2008/JP genomic RNA, complete genome
Sequence Length: 7509
Sequence Similarity: 98.24210946863764%
Sequence ID: gi|507309631|gb|JX439815.1| Norovirus Hu/GII/Seoul1055/KOR/2010, complete genome
Sequence Length: 7538
Sequence Similarity: 98.02334836826745%
Sequence ID: gi|507309635|gb|JX439

In [None]:
# Program 3 - Comparing the Norovirus Turramurra Strain and the Norovirus Aomori2 Strain
# Contact e-mail registered on Bio.Entrez for the efetch requests made in this
# cell (same value as the one assigned in the earlier programs).
Entrez.email = "rohanaluru@gmail.com"

def retrieve_sequence(accession):
    """Download one nucleotide record from NCBI Entrez and parse it as FASTA.

    Args:
        accession: Identifier passed to ``Entrez.efetch`` as ``id``
            (for example ``"AB541221.1"``).

    Returns:
        The single record parsed by ``SeqIO.read`` from the FASTA response.

    Raises:
        ValueError: Propagated from ``SeqIO.read`` when the response does not
            contain exactly one record (per the Biopython documentation).
    """
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    try:
        return SeqIO.read(handle, "fasta")
    finally:
        # Always release the network handle, even when parsing fails.
        handle.close()

# Accession IDs for the two genomes compared in Program 3: the Norovirus
# Aomori2 strain and the Norovirus Turramurra strain.
accession_number_NorovirusAomori2 = "AB541221.1"
accession_number_NorovirusTurramurra = "HM748973.2"

# Fetch both records; each call performs one Entrez request via retrieve_sequence.
sequence_record1 = retrieve_sequence(accession_number_NorovirusAomori2)
sequence_record2 = retrieve_sequence(accession_number_NorovirusTurramurra)

def nucleotide_matches(seq3, seq4):
    """Print two aligned sequences, a match track, and their percent similarity.

    The match track has "|" under every column where both sequences hold the
    same character and a space elsewhere. The similarity percentage is the
    number of matching columns divided by the length of the longer sequence
    after its "-" gap characters are removed.

    Args:
        seq3: First aligned sequence (printed with the "Norovirus Aomori2"
            label); may contain "-" gap characters.
        seq4: Second aligned sequence (printed with the "Norovirus Turramurra"
            label); may contain "-" gap characters.

    Returns:
        None. All results are written to standard output.
    """
    # Derive the ungapped lengths from the arguments themselves so the result
    # depends only on what was passed in, not on module-level records.
    ungapped_len3 = len(seq3) - str(seq3).count("-")
    ungapped_len4 = len(seq4) - str(seq4).count("-")
    total_positions = max(ungapped_len3, ungapped_len4)

    # zip stops at the shorter input, so trailing unpaired columns are ignored.
    column_matches = [s1 == s2 for s1, s2 in zip(seq3, seq4)]
    visual = "".join("|" if same else " " for same in column_matches)
    match_count = sum(column_matches)

    print("Norovirus Aomori2:   ", seq3)
    print("                     ", visual)
    print("Norovirus Turramurra:", seq4)
    # Two empty sequences have no positions to compare; report 0 % instead of
    # raising ZeroDivisionError.
    if total_positions:
        similarity_percentage = (match_count / total_positions) * 100
    else:
        similarity_percentage = 0.0
    print("Similarity Percentage: ", similarity_percentage, "%")

# Globally align the two fetched genomes with pairwise2's globalxx and keep
# the first alignment it returns.
alignments2 = align.globalxx(sequence_record1.seq, sequence_record2.seq)
best_alignment2 = alignments2[0]

# seqA / seqB are the two aligned sequences, in the order they were passed in.
NorovirusAomori2, NorovirusTurramurra = best_alignment2.seqA, best_alignment2.seqB

# Print the alignment, its match track, and the similarity percentage.
nucleotide_matches(NorovirusAomori2, NorovirusTurramurra)

Norovirus Aomori2:    -T---T-----T-------AA--ACG--------------------A---GC--C--C-------T-----C-----G-G--G-A---C------GC--GGC---C--T---AAAC-AGCC-TC----C-C--CC------G---------AGGGAAATACCACAAAGACCCCCACGACCACCC-ACTCCAGAACTAA--TCAAAAAC-ATCCCCCCTCCCCC-ACCCAACGGAGAGGATGACATAGTGGTTTCTTATAGTGTT-AAAGATGGTGTCTCTGGTTTGCCTGATCTTTCCACCGTCAGGCAACCGGAAGAATCTAAC-ACGGCCTT-CAGTGTCCCTCCACTCAATCAGAGGGAGAATAGAGATGCTAAGGAA-CCACTGACTGGAACAATTCT-GGAAATGTGGGATGGAGAAATCTAC-CATTATGGCCTGTATGTG-GAGCGAGGTCTTGTA-CTAGGTGTGCACAAACCACCAGCTGCCATCAGCCTCGCTAGGGTCGAACTAACACCACTCTCCTTGTACTGGAGACCTGTGTATACTCCCCAGTACCTCATCTCTCC-AGACGCTCTCAAGAAACTACACGGAGAGACGTTCCCCTACACAGCCTTTGACAACAACTGCTATGCCTTTTGT-TGTTGGGTCCTGGACCT-AAACGACTCGTGGCTGAGTAGGAGAATGATCCAGAGAACAACTGGCTTCTTCAGACCCTACCAAGATTGGAATAGGAAACCCCTCCCCACTATGGATGACTCCAAGTTAAAGAAGGTAGCTAACATATTCCTGTGTGCACTGTCTTCG-CTATTCACCAGGCCTATAAAAGAC-ATAATAGGGAAGCTAAGGCCTCTTAACATCCTCAACATCTTGGCCTCATGTGATTGGACTTTTGCG-GGCATAGTGGAGTCCTTGATACTCTTGGCAGAACTCTTTGGAGTTTTCTGGACACCCCCAGATGTGTCTGCGA