# **ONLINE**

In [None]:
!pip install biopython
!apt-get update
!apt-get install ncbi-blast+

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m2.3/3.2 MB[0m [31m68.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 https://r

In [None]:
# Original viral TP dataset: label -> protein accession
TP_Dataset = dict(
    [
        ("PRD1", "YP_009639955.1"),
        ("Bam35", "AAP83474.1"),
        ("Sato", "QWE49625.1"),
        ("ABV", "A4ZU93.1"),
    ]
)
# Accessions only, in the insertion order of the mapping above
TP_list = [accession for accession in TP_Dataset.values()]
TP_list

['YP_009639955.1', 'AAP83474.1', 'QWE49625.1', 'A4ZU93.1']

In [None]:
from Bio import Entrez

# Contact address attached to the Entrez requests made through Biopython
Entrez.email = "juancarlos.ramirezm@estudiante.uam.es"
# Accessions to download; TP_list must already exist (defined in an earlier cell)
accession_ids = TP_list
# accession -> sequence string; filled in by the retrieval loop further down
sequences = {}

# Function to fetch sequences
def fetch_sequence_by_id(accession_id):
    """Download one protein record as FASTA text.

    Returns the raw text of the response, or None (after printing the
    error) when anything raises while requesting or reading it.
    """
    request = dict(db="protein", id=accession_id, rettype="fasta", retmode="text")
    try:
        with Entrez.efetch(**request) as handle:
            fasta_text = handle.read()
    except Exception as e:
        print(f"Error fetching sequence for {accession_id}: {e}")
        return None
    return fasta_text

# Retrieve sequences for each accession ID
for acc_id in accession_ids:
    print(f"Fetching sequence for {acc_id}...")
    fasta_sequence = fetch_sequence_by_id(acc_id)
    if not fasta_sequence:
        # Nothing usable came back (the helper prints the error itself)
        continue
    # Separate the ">header" line from the residues. partition() never raises,
    # unlike unpacking split("\n", 1), when the response contains no newline.
    header, _, body = fasta_sequence.partition("\n")
    # Remove every kind of whitespace (line breaks, "\r", trailing blanks)
    sequence = "".join(body.split())
    if not header.startswith(">") or not sequence:
        print(f"Unexpected FASTA response for {acc_id}; skipping")
        continue
    sequences[acc_id] = sequence
    print(f"Fetched sequence for {acc_id}")

# Print fetched sequences
print("Fetched sequences:", sequences)

Fetching sequence for YP_009639955.1...
Fetched sequence for YP_009639955.1
Fetching sequence for AAP83474.1...
Fetched sequence for AAP83474.1
Fetching sequence for QWE49625.1...
Fetched sequence for QWE49625.1
Fetching sequence for A4ZU93.1...
Fetched sequence for A4ZU93.1
Fetched sequences: {'YP_009639955.1': 'MAKKKPVEKNGLVYKEFQKQVSNLKKAGLIPKTLDVRKVKPTKHYKGLVSKYKDVATGGAKLAAIPNPAVIETLEARGESIIKKGGKAYLKARQQINQRGQIVNPFTVRVTKRGEVVRRYRKTTPEGKPVYITQRELPIKFENMEQWLTELKAAGFQLQPGEQIYFTFNGNYSRRTYTSFDEAFNKFMTYDIIIDAVAGKLKVEDEADLVKSVGFQRISGPEAKAYNRNRIVLPEMQFSQAAKKKYKRRQKRGYGSKGV', 'AAP83474.1': 'MANKRLKKKLETKRKKSLLVSEGYSKKETKKLKGRELETVYKKKAHNRKNRERAREIANLAKQWGLSPSKYNSWKKLLPEIERIKKEQDREAPFLLIYYQDFTGETDSKFIYDFKKRNNTRSRSQITESIIGWLQNAHNKLFLGRVAIRIVPKRDVSKTNTLWRNHGYVKIYEGQGKELSKLLTAIETIMVGVYDVKERDKYLKELVAKLRSLPYEKAKKNAKEIQKIYDTKSYKKESWDNDDYY', 'QWE49625.1': 'MAKKRIKKKLEKKKKISLLLSDSSVSKKETKRLKGRELDVVYKQVNQRVKNRERARAISAEAKRWGLSPTKFNSWKKLLPEIERRKKEIAKEEKREEQRRKRAERNKGKALYVFWTDTQGHSLEEWDRQRDQVEHIYSVHGEEGL

In [None]:
import requests
import time

# EBI Job Dispatcher PSI-BLAST endpoint
BASE_URL = "https://www.ebi.ac.uk/Tools/services/rest/psiblast"

# Submit one PSI-BLAST job per sequence; job_ids maps accession -> job ID
job_ids = {}
for name, seq in sequences.items():
    # NOTE(review): verify the field names "iterations" and "exp" against the
    # service's /parameters listing - fields the service does not know may be
    # ignored silently, in which case these two settings have no effect.
    payload = {
        "email": Entrez.email,    # Required by the service
        "sequence": seq,          # Input protein sequence
        "database": "uniprotkb",  # Search against UniProtKB
        "iterations": 3,          # Number of iterations
        "exp": 1e-5,              # E-value threshold
    }
    try:
        # A timeout keeps the cell from hanging forever on a stalled connection
        response = requests.post(f"{BASE_URL}/run", data=payload, timeout=60)
    except requests.RequestException as exc:
        print(f"Failed to submit job for {name}: {exc}")
        continue
    if response.status_code == 200:
        # The body is the job ID; strip stray whitespace before using it in URLs
        job_ids[name] = response.text.strip()
        print(f"Job submitted for {name}, ID: {job_ids[name]}")
    else:
        print(f"Failed to submit job for {name}: {response.text}")

# Monitor job status and retrieve results
# Statuses that mean "keep waiting". Any other answer ends the polling, so an
# unexpected status can never cause an endless loop.
pending_statuses = {"QUEUED", "PENDING", "RUNNING"}
for name, job_id in job_ids.items():
    status = "RUNNING"
    while status in pending_statuses:
        time.sleep(10)  # Wait before checking status
        try:
            status_response = requests.get(f"{BASE_URL}/status/{job_id}", timeout=60)
        except requests.RequestException as exc:
            print(f"Job {job_id} ({name}): status check failed: {exc}")
            status = "ERROR"
            break
        if not status_response.ok:
            # An HTTP error page is not a job status; stop instead of parsing it
            print(f"Job {job_id} ({name}): status check returned HTTP {status_response.status_code}")
            status = "ERROR"
            break
        status = status_response.text.strip()
        print(f"Job {job_id} ({name}): Status = {status}")

    if status == "FINISHED":
        result_url = f"{BASE_URL}/result/{job_id}/out"
        try:
            result = requests.get(result_url, timeout=60)
        except requests.RequestException as exc:
            print(f"Could not download results for {name}: {exc}")
            continue
        if not result.ok:
            # Do not write an error page to disk as if it were a result
            print(f"Could not download results for {name}: HTTP {result.status_code}")
            continue
        with open(f"{name}_psiblast.txt", "w") as result_file:
            result_file.write(result.text)
        print(f"Results saved for {name}")
    else:
        print(f"Job {job_id} for {name} did not complete successfully")


Job submitted for YP_009639955.1, ID: psiblast-R20241204-013210-0058-40247448-p1m
Job submitted for AAP83474.1, ID: psiblast-R20241204-013212-0832-87899189-p1m
Job submitted for QWE49625.1, ID: psiblast-R20241204-013215-0857-52884783-p1m
Job submitted for A4ZU93.1, ID: psiblast-R20241204-013218-0423-45620439-p1m
Job psiblast-R20241204-013210-0058-40247448-p1m (YP_009639955.1): Status = RUNNING
Job psiblast-R20241204-013210-0058-40247448-p1m (YP_009639955.1): Status = RUNNING
Job psiblast-R20241204-013210-0058-40247448-p1m (YP_009639955.1): Status = RUNNING
Job psiblast-R20241204-013210-0058-40247448-p1m (YP_009639955.1): Status = RUNNING
Job psiblast-R20241204-013210-0058-40247448-p1m (YP_009639955.1): Status = RUNNING
Job psiblast-R20241204-013210-0058-40247448-p1m (YP_009639955.1): Status = RUNNING
Job psiblast-R20241204-013210-0058-40247448-p1m (YP_009639955.1): Status = RUNNING
Job psiblast-R20241204-013210-0058-40247448-p1m (YP_009639955.1): Status = RUNNING
Job psiblast-R20241204

# **LOCALLY**

In [None]:
!pip install biopython
!apt-get update
!apt-get install ncbi-blast+
!update_blastdb.pl nr

In [None]:
from Bio import Entrez, SeqIO
from Bio.Blast.Applications import NcbipsiblastCommandline
import os

# Fetching sequences
# Contact address for the Entrez requests. The original spelling here
# ("ramierzm") differed from the address used by every other Entrez cell.
Entrez.email = "juancarlos.ramirezm@estudiante.uam.es"
# Accessions to process; TP_list must already exist (defined in an earlier cell)
protein_ids = TP_list

# Directory to store intermediate and output files
output_dir = "./psi_blast_results"
os.makedirs(output_dir, exist_ok=True)

# Function to fetch a sequence from GenBank and save as a FASTA file
def fetch_sequence(protein_id, output_fasta):
    """Download a single protein record and write it to output_fasta.

    The record is requested from the Entrez "protein" database as FASTA
    text, parsed with SeqIO.read and written back out in FASTA format.
    Errors are not caught here; they propagate to the caller.
    """
    query = {"db": "protein", "id": protein_id, "rettype": "fasta", "retmode": "text"}
    with Entrez.efetch(**query) as handle:
        SeqIO.write(SeqIO.read(handle, "fasta"), output_fasta, "fasta")
    print(f"Fetched and saved sequence for {protein_id} to {output_fasta}")

# Perform PSI-BLAST for each protein ID
# Database searched by psiblast. "nr" is the database this notebook downloads
# with update_blastdb.pl; the previous value ("nr_cluster_seq") is never
# downloaded here. Change this if a different local database is installed.
BLAST_DB = "nr"

for protein_id in protein_ids:
    try:
        # Step 1: Fetch the protein sequence
        fasta_file = os.path.join(output_dir, f"{protein_id}.fasta")
        fetch_sequence(protein_id, fasta_file)

        # Step 2: Run PSI-BLAST
        psi_output = os.path.join(output_dir, f"{protein_id}_psi_blast.txt")
        psi_blast_cline = NcbipsiblastCommandline(
            query=fasta_file,
            db=BLAST_DB,
            evalue=1e-5,  # Set e-value threshold
            num_iterations=3,  # Number of iterations
            outfmt=7,  # Tabular output
            out=psi_output,
        )
        print(f"Running PSI-BLAST for {protein_id}...")
        stdout, stderr = psi_blast_cline()  # Execute the command
        print(f"PSI-BLAST completed for {protein_id}. Results saved to {psi_output}")
    except Exception as e:
        # Best effort: report the failure and continue with the next protein
        print(f"Error with {protein_id}: {e}")


# **PROCESSING**

In [None]:
import pandas as pd

# Column layouts: `fields` names every column of the final table, `field_order`
# is the order used for display/export, and `fields_lesser` holds the columns
# that are filled directly from the per-query hit files.
fields = {"Hit": [], "aln_hit": [], "%I": [], "P(H)": [], "E-value": [], "Bit-Score": [], "len(Qry)":[], "len(aln)":[], "%aln":[], "Taxonomy":[], "TaxID":[], "Database":[], "Method":[], "TP":[]}
field_order = ["Hit", "GenBankID", "aln_hit", "%I", "P(H)", "E-value", "Bit-Score", "len(Qry)", "len(aln)", "%aln", "Taxonomy", "TaxID", "Database", "Method", "TP"]
fields_lesser = {"Hit": [], "%I": [], "aln_hit": [], "E-value": [], "Bit-Score": [], "len(Qry)":[], "len(aln)":[], "%aln":[], "Database":[], "Method":[], "TP":[]}

# Query metadata; the columns Acronym, GenBankID and Length are read below.
dataset = pd.read_csv("TP_Dataset.csv")
TP_list = dataset.Acronym.to_list()
ID_list = dataset.GenBankID.to_list()
TP_ID = dict(zip(TP_list, ID_list))                    # acronym -> GenBank ID
query_lengths = dict(zip(TP_list, dataset["Length"]))  # acronym -> query length

# Read one hit file per query and collect the frames; they are concatenated
# once at the end instead of growing the table inside the loop.
hit_frames = []
for i in range(len(dataset)):
    # NOTE(review): the numeric column labels assume the default 12-column
    # BLAST tabular layout (1 = subject id, 2 = % identity, 3 = alignment
    # length, 6/7 = query start/end, 10 = e-value, 11 = bit score) - confirm
    # against the files actually used.
    hitlist = pd.read_csv(f"{ID_list[i]}.tsv", sep="\t", comment="#", header=None)
    hitlist["len(Qry)"] = dataset["Length"][i]
    hitlist["len(aln)"] = hitlist[7] - hitlist[6] + 1
    hitlist = hitlist.drop(columns=[0, 4, 5, 6, 7, 8, 9])
    hitlist = hitlist.rename(columns={1: "Hit", 2: "%I", 3: "aln_hit", 10: "E-value", 11: "Bit-Score"})
    # Stored as a fraction (0-1) of the query length, not as a percentage
    hitlist["%aln"] = hitlist["len(aln)"] / hitlist["len(Qry)"]
    hitlist["Database"] = "nr"
    hitlist["Method"] = "PSI-BLAST"
    hitlist["TP"] = TP_list[i]
    hit_frames.append(hitlist)

if hit_frames:
    bigtable = pd.concat(hit_frames, ignore_index=True)
    # Known columns first, in the `fields_lesser` order; anything else after
    extra_columns = [col for col in bigtable.columns if col not in fields_lesser]
    bigtable = bigtable[list(fields_lesser) + extra_columns]
else:
    bigtable = pd.DataFrame(columns=list(fields_lesser))

# index=False matches the other exports and avoids an "Unnamed: 0" column
# when the file is read back.
bigtable.to_csv("PSI-BLAST_results.csv", index=False)

# Columns not provided by the hit files: constant / empty placeholders
bigtable["P(H)"] = 1
bigtable["Taxonomy"] = ""
bigtable["TaxID"] = ""
bigtable["GenBankID"] = bigtable["Hit"]
bigtable = bigtable[field_order]
bigtable

In [None]:
# ALTERNATIVE FORMAT

import pandas as pd

fields = ["GenBankID", "Bit-Score", "E-value"]

# Collect the one-line hit summaries of the report. A line is taken when it
# does not mention "Query", contains ".1", contains "[" and has a double space.
# NOTE(review): the ".1" test matches version-1 accessions (and any line that
# merely contains ".1"); hits with other accession versions are skipped -
# confirm this is acceptable for the files being parsed.
hitlist = []
with open("XGU07810.1.tsv", "r") as report:
    for line in report:
        if "Query" not in line and ".1" in line and "[" in line and "  " in line:
            # Remove only the line terminator; slicing off the last character
            # would eat real data on a final line that has no newline.
            hitlist.append(line.rstrip("\n"))

# First token is the accession; the last two are bit score and e-value.
# Values are kept as the strings found in the report.
records = []
for hit in hitlist:
    elements = hit.split()
    print(elements)
    records.append({"GenBankID": elements[0], "Bit-Score": elements[-2], "E-value": elements[-1]})

count = len(records)
# Build the frame once instead of growing it row by row
results = pd.DataFrame(records, columns=fields)
results

['XGU07810.1', 'DNA', 'terminal', 'protein', '[Microbacterium', 'phage', 'Evcara]', '521', '0.0']
['WVX89719.1', 'DNA', 'terminal', 'protein', '[Microbacterium', 'phage', 'Curie]', '315', '4e-105']
['YP_010755012.1', 'DNA', 'terminal', 'protein', '[Microbacterium', 'phage', 'Pineapp...', '176', '1e-50']
['YP_010755995.1', 'hypothetical', 'protein', 'QEJ62_gp02', '[Curtobacterium', 'ph...', '124', '3e-30']
['URG18047.1', 'terminal', 'protein', '[Curtobacterium', 'phage', 'Ayka]', '110', '5e-25']
['YP_010755973.1', 'hypothetical', 'protein', 'QEJ61_gp02', '[Curtobacterium', 'ph...', '97.8', '3e-20']
['YP_010756039.1', 'Putative', 'terminal', 'protein', '[Glutamicibacter', 'phage', 'V...', '88.2', '1e-16']
['YP_009842182.1', 'hypothetical', 'protein', 'HWB96_gp07', '[Arthrobacter', 'phag...', '85.5', '1e-15']
['YP_010756068.1', 'hypothetical', 'protein', 'QEJ65_gp08', '[Rhizobium', 'phage', 'R...', '80.5', '7e-14']
['DAR80938.1', 'TPA:', 'MAG', 'TPA:', 'hypothetical', 'protein', '[Caudovi

Unnamed: 0,GenBankID,Bit-Score,E-value
0,XGU07810.1,521,0.0
1,WVX89719.1,315,4e-105
2,YP_010755012.1,176,1e-50
3,YP_010755995.1,124,3e-30
4,URG18047.1,110,5e-25
...,...,...,...
231,DAW98648.1,53.0,3e-04
232,DAJ64125.1,52.6,6e-04
233,MBQ8168678.1,51.9,9e-04
234,MBO7712828.1,51.9,9e-04


In [None]:
# ONLY TO REMOVE HAdV-12
import pandas as pd
bigtable = pd.read_csv("PSI-BLAST_results.csv")
# If the CSV was saved together with its row index, that index comes back as
# an "Unnamed: 0" column; drop it so it is not written into the file again.
bigtable = bigtable.loc[:, ~bigtable.columns.str.startswith("Unnamed")]
bigtable = bigtable[bigtable["TP"] != "HAdV-12"].reset_index(drop=True)
bigtable.to_csv("PSI-BLAST_results.csv", index=False)


# Columns not stored in the CSV: constant / empty placeholders.
# `field_order` must already exist (it is defined in the processing cell).
bigtable["P(H)"] = 1
bigtable["Taxonomy"] = ""
bigtable["TaxID"] = ""
bigtable["GenBankID"] = bigtable["Hit"]
bigtable = bigtable[field_order]
bigtable

Unnamed: 0,Hit,GenBankID,aln_hit,%I,P(H),E-value,Bit-Score,len(Qry),len(aln),%aln,Taxonomy,TaxID,Database,Method,TP
0,NP_040718.1,NP_040718.1,266.0,97.744,1,0.000000e+00,534.0,266,266.0,1.000000,,,nr,PSI-BLAST,phi29
1,2EX3_B,2EX3_B,196.0,100.000,1,3.110000e-140,403.0,266,196.0,0.736842,,,nr,PSI-BLAST,phi29
2,NP_690636.1,NP_690636.1,266.0,62.030,1,2.250000e-115,342.0,266,266.0,1.000000,,,nr,PSI-BLAST,phi29
3,UOX39806.1,UOX39806.1,266.0,59.023,1,3.130000e-113,336.0,266,266.0,1.000000,,,nr,PSI-BLAST,phi29
4,AYJ76434.1,AYJ76434.1,222.0,58.559,1,1.660000e-87,269.0,266,222.0,0.834586,,,nr,PSI-BLAST,phi29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12850,WP_206886507.1,WP_206886507.1,131.0,12.214,1,4.070000e-05,54.3,360,131.0,0.363889,,,nr,PSI-BLAST,Av-1
12851,DAV80483.1,DAV80483.1,225.0,13.333,1,4.890000e-05,56.6,360,211.0,0.586111,,,nr,PSI-BLAST,Av-1
12852,DAQ18774.1,DAQ18774.1,224.0,10.714,1,2.420000e-04,54.3,360,215.0,0.597222,,,nr,PSI-BLAST,Av-1
12853,DAK11225.1,DAK11225.1,253.0,14.229,1,5.840000e-04,53.1,360,233.0,0.647222,,,nr,PSI-BLAST,Av-1


In [None]:
!pip install biopython
!apt-get update
!apt-get install ncbi-blast+

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m1.7/3.2 MB[0m [31m50.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:5 https://developer.download.nvidia.com/c

In [None]:
# Keep only rows that carry a hit identifier; renumber the remaining rows
bigtable_prefilt = bigtable.dropna(subset=["Hit"]).reset_index(drop=True)

# Prefiltering. The E-value threshold was already applied while querying, but
# it is checked again here for validation. P(H) is not used because PSI-BLAST
# does not provide it.
# A row is rejected only when one of the comparisons is true, so rows with
# missing values in these columns are kept.
rejected = (bigtable_prefilt["E-value"] > 0.01) | (bigtable_prefilt["%aln"] < 0.3)
bigtable_prefilt = bigtable_prefilt[~rejected]

# Sorting
# NOTE(review): "%aln" is sorted ascending, so among hits with equal E-value
# and P(H) the one with the LOWEST coverage comes first and is the one kept by
# the de-duplication below - confirm this is the intended ranking.
bigtable_prefilt = bigtable_prefilt.sort_values(
    by=["E-value", "P(H)", "%aln"],
    ascending=[True, False, True]  # Specify sorting order
)

bigtable_prefilt.to_csv("Prefilt_PSI-BLAST_results.csv", index=False)

# Deduplication
# Avoid deleting ID-less entries
mask = bigtable_prefilt['GenBankID'] == ""
# Remove duplicate rows based on 'GenBankID' column
# Keep only the first occurrence of each duplicate (the first one in the sort order above)
unique_entries = bigtable_prefilt[~mask].drop_duplicates(subset="GenBankID", keep="first")
# ID-less rows first, then the de-duplicated rows, renumbered from 0
bigtable_dedup = pd.concat([bigtable_prefilt[mask], unique_entries], ignore_index=True)

bigtable_dedup.to_csv("Dedup_PSI-BLAST_results.csv", index=False)

bigtable_dedup

Unnamed: 0,Hit,GenBankID,aln_hit,%I,P(H),E-value,Bit-Score,len(Qry),len(aln),%aln,Taxonomy,TaxID,Database,Method,TP
0,AOC84064.1,AOC84064.1,352.0,99.148,1,0.000000,716.0,679,352.0,0.518409,,,nr,PSI-BLAST,FAdV-8
1,ANA50312.1,ANA50312.1,354.0,98.023,1,0.000000,711.0,679,353.0,0.519882,,,nr,PSI-BLAST,FAdV-8
2,XEQ86939.1,XEQ86939.1,374.0,99.465,1,0.000000,752.0,671,374.0,0.557377,,,nr,PSI-BLAST,hAd2
3,QOV03173.1,QOV03173.1,378.0,72.487,1,0.000000,549.0,671,376.0,0.560358,,,nr,PSI-BLAST,hAd2
4,AGT76236.1,AGT76236.1,442.0,74.661,1,0.000000,573.0,671,430.0,0.640835,,,nr,PSI-BLAST,hAd2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,HUB10295.1,HUB10295.1,457.0,14.661,1,0.000826,55.2,559,456.0,0.815742,,,nr,PSI-BLAST,GC1
1391,KAJ7837014.1,KAJ7837014.1,251.0,18.327,1,0.001000,54.8,559,249.0,0.445438,,,nr,PSI-BLAST,GC1
1392,EAK6855417.1,EAK6855417.1,56.0,23.214,1,0.001000,46.4,114,56.0,0.491228,,,nr,PSI-BLAST,S-CREM2
1393,WP_110855965.1,WP_110855965.1,436.0,13.073,1,0.001000,54.8,559,408.0,0.729875,,,nr,PSI-BLAST,GC1


In [None]:
from Bio import Entrez

# Contact address attached to the Entrez requests made through Biopython
Entrez.email = "juancarlos.ramirezm@estudiante.uam.es"

# List of GenBankID (one entry per row of bigtable_dedup, in row order)
genbank_ids = bigtable_dedup["GenBankID"].to_list()
# Bare expression: not the last statement of the cell, so nothing is displayed
genbank_ids

def fetch_taxonomy_id(genbank_id):
    """Return the NCBI taxon ID recorded in a protein's GenBank entry.

    The record is requested from the Entrez "protein" database in GenBank
    text format and scanned for the first '/db_xref="taxon:<id>"' qualifier.

    Returns the ID as a string, or None when the record has no such
    qualifier or when anything raises (the error is printed).
    """
    try:
        # Query Entrez
        print(f"Fetching TaxID for: {genbank_id}")
        # The context manager closes the handle even if read() raises
        with Entrez.efetch(db="protein", id=genbank_id, rettype="gb", retmode="text") as handle:
            record = handle.read()

        # Find the Taxonomy ID
        marker = '/db_xref="taxon:'
        for line in record.split("\n"):
            if marker in line:
                # Text between 'taxon:' and the closing quote
                return line.split("taxon:")[1].split('"')[0]
        return None
    except Exception as e:
        print(f"Error fetching taxonomy for {genbank_id}: {e}")
        return None

# Process all GenBank IDs
# taxonomy_ids maps accession -> TaxID (None when the lookup failed). The
# position in genbank_ids is used as the row label of bigtable_dedup.
taxonomy_ids = {}
for row_label, accession in enumerate(genbank_ids):
    fetched = fetch_taxonomy_id(accession)
    taxonomy_ids[accession] = fetched
    bigtable_dedup.at[row_label, "TaxID"] = fetched


# Summary of every lookup, then persist the annotated table
for gb_id, tax_id in taxonomy_ids.items():
    print(f"{gb_id}: {tax_id}")
bigtable_dedup.to_csv("Taxonomy_PSI-BLAST_results.csv", index=False)
bigtable_dedup

Fetching TaxID for: AOC84064.1
Fetching TaxID for: ANA50312.1
Fetching TaxID for: XEQ86939.1
Fetching TaxID for: QOV03173.1
Fetching TaxID for: AGT76236.1
Fetching TaxID for: QWK52450.1
Fetching TaxID for: WJJ54608.1
Fetching TaxID for: DAW83362.1
Fetching TaxID for: BCZ16696.1
Fetching TaxID for: YP_010790674.1
Fetching TaxID for: NP_043879.1
Fetching TaxID for: WZB38155.1
Fetching TaxID for: UPO25000.1
Fetching TaxID for: DBA48924.1
Fetching TaxID for: WXG22698.1
Fetching TaxID for: WXG22478.1
Fetching TaxID for: YP_009388312.1
Fetching TaxID for: WXG22732.1
Fetching TaxID for: XBH23610.1
Fetching TaxID for: WRQ19845.1
Fetching TaxID for: AYC35462.1
Fetching TaxID for: NP_050282.1
Fetching TaxID for: AEK79911.1
Fetching TaxID for: QRV11644.1
Fetching TaxID for: NP_108659.1
Fetching TaxID for: XCK17000.1
Fetching TaxID for: YP_010796273.1
Fetching TaxID for: WXG22927.1
Fetching TaxID for: QQD36934.1
Fetching TaxID for: UKS51808.1
Fetching TaxID for: WP_202259074.1
Fetching TaxID for: 

Unnamed: 0,Hit,GenBankID,aln_hit,%I,P(H),E-value,Bit-Score,len(Qry),len(aln),%aln,Taxonomy,TaxID,Database,Method,TP
0,AOC84064.1,AOC84064.1,352.0,99.148,1,0.000000,716.0,679,352.0,0.518409,,190065,nr,PSI-BLAST,FAdV-8
1,ANA50312.1,ANA50312.1,354.0,98.023,1,0.000000,711.0,679,353.0,0.519882,,586029,nr,PSI-BLAST,FAdV-8
2,XEQ86939.1,XEQ86939.1,374.0,99.465,1,0.000000,752.0,671,374.0,0.557377,,1907210,nr,PSI-BLAST,hAd2
3,QOV03173.1,QOV03173.1,378.0,72.487,1,0.000000,549.0,671,376.0,0.560358,,10524,nr,PSI-BLAST,hAd2
4,AGT76236.1,AGT76236.1,442.0,74.661,1,0.000000,573.0,671,430.0,0.640835,,108098,nr,PSI-BLAST,hAd2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,HUB10295.1,HUB10295.1,457.0,14.661,1,0.000826,55.2,559,456.0,0.815742,,2026763,nr,PSI-BLAST,GC1
1391,KAJ7837014.1,KAJ7837014.1,251.0,18.327,1,0.001000,54.8,559,249.0,0.445438,,230810,nr,PSI-BLAST,GC1
1392,EAK6855417.1,EAK6855417.1,56.0,23.214,1,0.001000,46.4,114,56.0,0.491228,,195,nr,PSI-BLAST,S-CREM2
1393,WP_110855965.1,WP_110855965.1,436.0,13.073,1,0.001000,54.8,559,408.0,0.729875,,321895,nr,PSI-BLAST,GC1


In [None]:
from Bio import Entrez
import pandas as pd

# Resume from the table saved after the TaxID annotation step
bigtable_dedup = pd.read_csv("Taxonomy_PSI-BLAST_results.csv")
# Contact address attached to the Entrez requests made through Biopython
Entrez.email = "juancarlos.ramirezm@estudiante.uam.es"

genbank_ids = bigtable_dedup["GenBankID"].to_list()  # Accessions to annotate, in row order

# Function to fetch species for a GenBank ID
def get_species_from_genbank(genbank_id):
    """Return the organism name recorded in a protein's GenBank entry.

    The record is requested from the Entrez "protein" database in GenBank
    text format; the text after the first "  ORGANISM" line prefix is
    printed and returned.

    Returns "Species not found" when the record has no such line, and an
    "Error retrieving <id>: <message>" string when anything raises. Callers
    therefore always receive a string.
    """
    try:
        # Fetch the GenBank record
        print(f"Fetching Species for: {genbank_id}")
        # The context manager closes the handle even if read() raises
        with Entrez.efetch(db="protein", id=genbank_id, rettype="gb", retmode="text") as handle:
            record = handle.read()

        # Parse the species information
        for line in record.split("\n"):
            if line.startswith("  ORGANISM"):
                species = line.replace("  ORGANISM", "").strip()
                print(species)
                return species
        return "Species not found"
    except Exception as e:
        return f"Error retrieving {genbank_id}: {e}"

# Fetch and print species for each GenBank ID
species_dict = {}
# Make sure the column can hold text. When the table is loaded from a CSV in
# which "Taxonomy" is empty, pandas reads it as float (all NaN), and writing a
# string into such a column cell by cell raises the incompatible-dtype
# FutureWarning.
bigtable_dedup["Taxonomy"] = bigtable_dedup["Taxonomy"].astype(object)
# The position in genbank_ids is used as the row label of bigtable_dedup
for row_label, accession in enumerate(genbank_ids):
    species = get_species_from_genbank(accession)
    species_dict[accession] = species
    print(f"{species}")
    bigtable_dedup.at[row_label, "Taxonomy"] = species

# Optionally, save results to a file
with open("species_results.txt", "w") as f:
    for genbank_id, species in species_dict.items():
        f.write(f"{genbank_id}: {species}\n")

bigtable_dedup.to_csv("Species_PSI-BLAST_results.csv", index=False)
bigtable_dedup

Fetching Species for: AOC84064.1
Fowl aviadenovirus E
Fowl aviadenovirus E
Fetching Species for: ANA50312.1


  bigtable_dedup.at[genbank_id, "Taxonomy"] = species


Fowl adenovirus 8b
Fowl adenovirus 8b
Fetching Species for: XEQ86939.1
Human adenovirus sp.
Human adenovirus sp.
Fetching Species for: QOV03173.1
Human adenovirus 41
Human adenovirus 41
Fetching Species for: AGT76236.1
Human mastadenovirus B
Human mastadenovirus B
Fetching Species for: QWK52450.1
Duck adenovirus 3
Duck adenovirus 3
Fetching Species for: WJJ54608.1
Otus scops adenovirus
Otus scops adenovirus
Fetching Species for: DAW83362.1
Caudoviricetes sp.
Caudoviricetes sp.
Fetching Species for: BCZ16696.1
Owl adenovirus
Owl adenovirus
Fetching Species for: YP_010790674.1
Psittacine adenovirus 1
Psittacine adenovirus 1
Fetching Species for: NP_043879.1
Fowl aviadenovirus A
Fowl aviadenovirus A
Fetching Species for: WZB38155.1
Bat mastadenovirus
Bat mastadenovirus
Fetching Species for: UPO25000.1
Fowl aviadenovirus D
Fowl aviadenovirus D
Fetching Species for: DBA48924.1
Adenovirus bat33390
Adenovirus bat33390
Fetching Species for: WXG22698.1
Bat mastadenovirus
Bat mastadenovirus
Fetc

Unnamed: 0,Hit,GenBankID,aln_hit,%I,P(H),E-value,Bit-Score,len(Qry),len(aln),%aln,Taxonomy,TaxID,Database,Method,TP
0,AOC84064.1,AOC84064.1,352.0,99.148,1,0.000000,716.0,679,352.0,0.518409,Fowl aviadenovirus E,190065,nr,PSI-BLAST,FAdV-8
1,ANA50312.1,ANA50312.1,354.0,98.023,1,0.000000,711.0,679,353.0,0.519882,Fowl adenovirus 8b,586029,nr,PSI-BLAST,FAdV-8
2,XEQ86939.1,XEQ86939.1,374.0,99.465,1,0.000000,752.0,671,374.0,0.557377,Human adenovirus sp.,1907210,nr,PSI-BLAST,hAd2
3,QOV03173.1,QOV03173.1,378.0,72.487,1,0.000000,549.0,671,376.0,0.560358,Human adenovirus 41,10524,nr,PSI-BLAST,hAd2
4,AGT76236.1,AGT76236.1,442.0,74.661,1,0.000000,573.0,671,430.0,0.640835,Human mastadenovirus B,108098,nr,PSI-BLAST,hAd2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,HUB10295.1,HUB10295.1,457.0,14.661,1,0.000826,55.2,559,456.0,0.815742,Myxococcales bacterium,2026763,nr,PSI-BLAST,GC1
1391,KAJ7837014.1,KAJ7837014.1,251.0,18.327,1,0.001000,54.8,559,249.0,0.445438,Mycena olivaceomarginata,230810,nr,PSI-BLAST,GC1
1392,EAK6855417.1,EAK6855417.1,56.0,23.214,1,0.001000,46.4,114,56.0,0.491228,Campylobacter coli,195,nr,PSI-BLAST,S-CREM2
1393,WP_110855965.1,WP_110855965.1,436.0,13.073,1,0.001000,54.8,559,408.0,0.729875,Paraburkholderia silvatlantica,321895,nr,PSI-BLAST,GC1


In [None]:
# Independent working copy of the annotated table, written to disk again
bigtable_safe = bigtable_dedup.copy(deep=True)
bigtable_safe.to_csv(path_or_buf="Species_PSI-BLAST_results.csv", index=False)
bigtable_safe

Unnamed: 0,Hit,GenBankID,aln_hit,%I,P(H),E-value,Bit-Score,len(Qry),len(aln),%aln,Taxonomy,TaxID,Database,Method,TP
0,AOC84064.1,AOC84064.1,352.0,99.148,1,0.000000,716.0,679,352.0,0.518409,Fowl aviadenovirus E,190065,nr,PSI-BLAST,FAdV-8
1,ANA50312.1,ANA50312.1,354.0,98.023,1,0.000000,711.0,679,353.0,0.519882,Fowl adenovirus 8b,586029,nr,PSI-BLAST,FAdV-8
2,XEQ86939.1,XEQ86939.1,374.0,99.465,1,0.000000,752.0,671,374.0,0.557377,Human adenovirus sp.,1907210,nr,PSI-BLAST,hAd2
3,QOV03173.1,QOV03173.1,378.0,72.487,1,0.000000,549.0,671,376.0,0.560358,Human adenovirus 41,10524,nr,PSI-BLAST,hAd2
4,AGT76236.1,AGT76236.1,442.0,74.661,1,0.000000,573.0,671,430.0,0.640835,Human mastadenovirus B,108098,nr,PSI-BLAST,hAd2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,HUB10295.1,HUB10295.1,457.0,14.661,1,0.000826,55.2,559,456.0,0.815742,Myxococcales bacterium,2026763,nr,PSI-BLAST,GC1
1391,KAJ7837014.1,KAJ7837014.1,251.0,18.327,1,0.001000,54.8,559,249.0,0.445438,Mycena olivaceomarginata,230810,nr,PSI-BLAST,GC1
1392,EAK6855417.1,EAK6855417.1,56.0,23.214,1,0.001000,46.4,114,56.0,0.491228,Campylobacter coli,195,nr,PSI-BLAST,S-CREM2
1393,WP_110855965.1,WP_110855965.1,436.0,13.073,1,0.001000,54.8,559,408.0,0.729875,Paraburkholderia silvatlantica,321895,nr,PSI-BLAST,GC1


In [None]:
from Bio import Entrez
import pandas as pd
from collections import defaultdict

# Uncomment to resume from the saved table instead of the in-memory one:
#bigtable_safe = pd.read_csv("Species_PSI-BLAST_results.csv")

Entrez.email = "juancarlos.ramirezm@estudiante.uam.es"

# List of TaxID
tax_ids = bigtable_safe["TaxID"].unique() # Fetching GenBank-ID-containing hits

# Categories to track (category label -> lower-case lineage keyword)
categories = {"Viruses": "viruses", "Bacteria": "bacteria", "Archaea": "archaea", "Eukaryota": "eukaryota", "Other": "other", "Unclassified": "unclassified"}
classification_counts = defaultdict(int)
classification_results = []


def _taxid_query(tax_id):
    """Return tax_id as the text to send to Entrez, or None when it is missing."""
    if pd.isna(tax_id):
        return None
    text = str(tax_id).strip()
    # A column containing NaN is loaded as float, turning 190065 into "190065.0"
    if text.endswith(".0"):
        text = text[:-2]
    return text or None


def _classify_lineage(lineage):
    """Map a ';'-separated lineage string to one of the tracked categories."""
    # Compare whole lineage entries, not substrings: a plain substring test
    # would label archaeal lineages containing e.g. "Halobacteria" as Bacteria.
    ranks = [rank.strip().lower() for rank in lineage.split(";")]
    for category, keyword in categories.items():
        if any(rank == keyword or rank.startswith(keyword + " ") for rank in ranks):
            return category
    return "Other"


# Query NCBI Taxonomy database for each Taxonomy ID
for tax_id in tax_ids:
    query_id = _taxid_query(tax_id)
    if query_id is None:
        # No TaxID was found for these hits; nothing to look up
        continue
    print(f"Querying Taxonomy ID: {query_id}")
    try:
        # The context manager closes the handle even if parsing raises
        with Entrez.efetch(db="taxonomy", id=query_id, retmode="xml") as handle:
            records = Entrez.read(handle)

        # Extract lineage
        lineage = records[0]["Lineage"]
        print(f"Lineage for {query_id}: {lineage}")

        # Check category membership; the fallback "Other" is counted as well
        category = _classify_lineage(lineage)
        classification_counts[category] += 1
        classification_results.append({"TaxID": tax_id, "Lineage": lineage, "Category": category})
    except Exception as e:
        print(f"Error fetching Taxonomy ID {tax_id}: {e}")

# Convert results to a DataFrame
results_df = pd.DataFrame(classification_results)

# One row per category with the number of distinct TaxIDs assigned to it
summary_df = pd.DataFrame.from_dict(classification_counts, orient="index", columns=["Count"])
print("\nClassification Summary:")
print(summary_df)
results_df.to_csv("taxonomy_classification_results.csv", index=False)
print("\nClassification results saved to 'taxonomy_classification_results.csv'.")

Querying Taxonomy ID: 190065
Lineage for 190065: Viruses; Varidnaviria; Bamfordvirae; Preplasmiviricota; Tectiliviricetes; Rowavirales; Adenoviridae; Aviadenovirus
Querying Taxonomy ID: 586029
Lineage for 586029: Viruses; Varidnaviria; Bamfordvirae; Preplasmiviricota; Tectiliviricetes; Rowavirales; Adenoviridae; Aviadenovirus; Fowl aviadenovirus E
Querying Taxonomy ID: 1907210
Lineage for 1907210: Viruses; Varidnaviria; Bamfordvirae; Preplasmiviricota; Tectiliviricetes; Rowavirales; Adenoviridae; Mastadenovirus; unclassified Human adenoviruses
Querying Taxonomy ID: 10524
Lineage for 10524: Viruses; Varidnaviria; Bamfordvirae; Preplasmiviricota; Tectiliviricetes; Rowavirales; Adenoviridae; Mastadenovirus; Human mastadenovirus F
Querying Taxonomy ID: 108098
Lineage for 108098: Viruses; Varidnaviria; Bamfordvirae; Preplasmiviricota; Tectiliviricetes; Rowavirales; Adenoviridae; Mastadenovirus
Querying Taxonomy ID: 2233538
Lineage for 2233538: Viruses; Varidnaviria; Bamfordvirae; Preplasmiv

In [None]:
# Reload the saved lineage classification (columns TaxID, Lineage, Category)
results_df = pd.read_csv(filepath_or_buffer="taxonomy_classification_results.csv")
results_df

Unnamed: 0,TaxID,Lineage,Category
0,1907210,Viruses; Varidnaviria; Bamfordvirae; Preplasmi...,Viruses
1,130308,Viruses; Varidnaviria; Bamfordvirae; Preplasmi...,Viruses
2,129951,Viruses; Varidnaviria; Bamfordvirae; Preplasmi...,Viruses
3,2050578,Viruses; Varidnaviria; Bamfordvirae; Preplasmi...,Viruses
4,38432,Viruses; Varidnaviria; Bamfordvirae; Preplasmi...,Viruses
...,...,...,...
1009,2026763,cellular organisms; Bacteria; Pseudomonadati; ...,Bacteria
1010,230810,cellular organisms; Eukaryota; Opisthokonta; F...,Eukaryota
1011,195,cellular organisms; Bacteria; Pseudomonadati; ...,Bacteria
1012,321895,cellular organisms; Bacteria; Pseudomonadati; ...,Bacteria


In [None]:
import pandas as pd

# Rank columns of the output table, in display order.
tax_levels = {"Empire": [], "Realm": [], "Kingdom": [], "Phylum": [], "Class": [], "Order": [], "Family": [], "Genus": [], "Species": []}

# Raw lineage strings, one per row of results_df.
OTU_list = results_df["Lineage"].tolist()

# Placeholder for ranks that no rule assigned; NaN never compares equal to a
# name, so "name != row[rank]" is True for every unassigned rank.
_MISSING = float("nan")

# (suffix, rank) pairs used for viral lineages.
_VIRAL_RANK_SUFFIXES = (
    ("viria", "Realm"),
    ("virae", "Kingdom"),
    ("viricota", "Phylum"),
    ("viricetes", "Class"),
    ("virales", "Order"),
)

# Candidatus names that must never be stored as a genus.
_CANDIDATUS_PHYLA = (
    "Candidatus Hydrothermarchaeota",
    "Candidatus Kryptoniota",
    "Candidatus Aminicenantota",
    "Candidatus Eiseniibacteriota",
    "Candidatus Zixiibacteriota",
)

# Names that match the prokaryotic class suffixes but must not be stored as Class.
# "Ehrlichia" is listed next to the historical misspelling "Ehrlicia".
_NOT_PROKARYOTE_CLASSES = [
    "Massilia", "Escherichia", "Ehrlicia", "Ehrlichia", "Rickettsia", "Pregia",
    "Wolbachia", "Orientia", "Chlamydia", "Mannheimia", "Neorickettsia", "Hafnia",
    "Hafkinia", "Gortzia", "Bealeaia", "Seliberia",
]

# Names stored as Phylum for eukaryotic lineages.
# NOTE(review): a blanket "-zoa"/"-ta" suffix rule is deliberately not applied:
# it would also capture supra-phylum levels such as Eukaryota or Opisthokonta.
_METAZOAN_PHYLA = [
    "Porifera", "Cnidaria", "Ctenophora", "Placozoa", "Chordata", "Echinodermata",
    "Arthropoda", "Nematoda", "Mollusca", "Annelida", "Platyhelminthes", "Nemertea",
    "Rotifera", "Bryozoa", "Tardigrada", "Onychophora", "Brachiopoda", "Chaetognatha",
    "Hemichordata", "Xenacoelomorpha", "Priapulida", "Loricifera", "Kinorhyncha",
    "Gastrotricha", "Cycliophora", "Micrognathozoa", "Phoronida", "Entoprocta",
    "Ectoprocta", "Acanthocephala", "Gnathostomulida",
]


def _differs_from(name, row, ranks):
    """Return True when `name` is not the value currently stored for any of `ranks`."""
    return all(name != row[rank] for rank in ranks)


def _assign_viral_ranks(row, lineage):
    """Fill Realm..Order and Genus of `row` (in place) from viral name suffixes."""
    for level in lineage:
        if "unclassified" in level:
            continue
        name = level[1:]
        for suffix, rank in _VIRAL_RANK_SUFFIXES:
            if level.endswith(suffix):
                row[rank] = name
        # Genus-like names are single words; multi-word names are left to Species.
        if level.endswith(("virus", "viroid", "satellite", "viriform")) and " " not in name:
            row["Genus"] = name


def _assign_prokaryote_ranks(row, lineage):
    """Fill Kingdom..Genus of `row` (in place) for Bacteria/Archaea lineages.

    The rules run in order for every level and read the ranks already stored
    in `row`, so the statement order below is significant; the last matching
    level wins for each rank.
    """
    for level in lineage:
        name = level[1:]
        classified = "unclassified" not in level
        candidatus = "Candidatus " in level

        if level.endswith("ati") and classified:
            row["Kingdom"] = name
        if level.endswith("ota") and classified and not candidatus:
            row["Phylum"] = name
        # NOTE(review): this stores every non-species Candidatus level as Phylum,
        # so the Candidatus genus rule at the end of this loop (which requires the
        # name to differ from Phylum) cannot fire for the same level. Confirm the
        # intended precedence before changing it.
        if (classified and candidatus and level != "cellular organisms"
                and _differs_from(name, row, ("Realm", "Kingdom", "Empire", "Species"))):
            row["Phylum"] = name
        if ((level.endswith(("ia", "etes", "ei", "neae", "bi", "cocci")) or name == "Bacilli")
                and classified and name != "Bacteria" and name not in _NOT_PROKARYOTE_CLASSES):
            row["Class"] = name
        if level.endswith("ales") and classified:
            row["Order"] = name
        if level.endswith("aceae") and classified:
            row["Family"] = name
        # Single-word level (the only space is the leading separator) that has
        # not been used for any higher rank.
        if (level.count(" ") <= 1 and (classified or candidatus)
                and level != "cellular organisms"
                and _differs_from(name, row, ("Realm", "Kingdom", "Phylum", "Class", "Order", "Family", "Empire"))):
            row["Genus"] = name
        if (classified and candidatus and level != "cellular organisms"
                and name not in _CANDIDATUS_PHYLA
                and _differs_from(name, row, ("Realm", "Kingdom", "Phylum", "Class", "Order", "Family", "Empire", "Species"))):
            row["Genus"] = name


def _assign_eukaryote_ranks(row, lineage):
    """Fill Kingdom..Genus of `row` (in place) for Eukaryota lineages.

    As for prokaryotes, rules are order-dependent and the last matching level
    wins for each rank.
    """
    for level in lineage:
        name = level[1:]
        classified = "unclassified" not in level

        if name == "Metazoa":
            row["Kingdom"] = name
        if name in _METAZOAN_PHYLA:
            row["Phylum"] = name
        if level.endswith(("ophyta", "mycota")) and classified:
            row["Phylum"] = name
        if classified and (
            level.endswith(("mycetes", "phyceae", "ata"))
            or (row["Kingdom"] != "Metazoa" and level.endswith("opsida"))
        ):
            row["Class"] = name
        if classified and (
            level.endswith(("ales", "iformes", "ea"))
            or (row["Kingdom"] == "Metazoa" and level.endswith("ida"))
        ):
            row["Order"] = name
        # The root level and "unclassified ..." nodes are never genera; the
        # prokaryote and viral rules apply the same exclusions.
        if (level != "cellular organisms" and classified and name.strip()
                and _differs_from(name, row, ("Realm", "Kingdom", "Phylum", "Empire", "Class", "Order", "Family"))):
            row["Genus"] = name
        # "Amoebozoa" is the NCBI spelling; the older "Amebozoa" is still accepted.
        if name in ("Amoebozoa", "Amebozoa"):
            row["Phylum"] = name
            row["Kingdom"] = "Protozoa"
        if name == "Viridiplantae":
            row["Kingdom"] = "Viridiplantae"
        if level.endswith("mycota") and classified:
            row["Kingdom"] = "Fungi"


def _classify_lineage(raw_lineage):
    """Map one lineage string to a dict with one entry per key of `tax_levels`.

    `raw_lineage` is expected to be a ";"-separated string in which every level
    after the first carries a leading space (e.g. "Viruses; Varidnaviria; ...").
    Missing (non-string) values are treated as an empty lineage, which ends up
    in the "unclassified entries" empire. Ranks that no rule assigns stay NaN.
    """
    lineage = raw_lineage.split(";") if isinstance(raw_lineage, str) else [""]
    row = dict.fromkeys(tax_levels, _MISSING)

    root = lineage[0]
    # One-level lineages have no second level; an empty string fails every test below.
    second = lineage[1] if len(lineage) > 1 else ""
    last = lineage[-1]

    # A multi-word, classified terminal level is taken as the species name.
    if len(lineage) > 1 and "unclassified" not in last and last[1:].count(" ") >= 1:
        row["Species"] = last[1:]

    if root == "Viruses":
        row["Empire"] = "Viruses"
    elif root == "cellular organisms":
        row["Empire"] = "Cytota"
    else:
        row["Empire"] = "unclassified entries"

    # Provisional realm from the second level; viral lineages may overwrite it below.
    if "unclassified" in second or not second.endswith("a"):
        row["Realm"] = ""
    else:
        row["Realm"] = second[1:]

    # Generic family rule: single-word "-ae" names, excluding the "-diae",
    # "-neae" and "-inae" endings.
    # NOTE(review): kingdom-like names such as "Bamfordvirae" or "Viridiplantae"
    # also satisfy this rule and remain as Family when no later level overrides
    # them. Confirm whether that is intended.
    for level in lineage:
        if (level.endswith("ae") and "unclassified" not in level and " " not in level[1:]
                and level[-4:] not in ("diae", "neae", "inae")):
            row["Family"] = level[1:]

    if root == "Viruses":
        _assign_viral_ranks(row, lineage)
    if second[1:] in ("Bacteria", "Archaea"):
        _assign_prokaryote_ranks(row, lineage)
    if second[1:] == "Eukaryota":
        _assign_eukaryote_ranks(row, lineage)
    return row


# Build the table once from per-row records instead of enlarging an empty
# DataFrame cell by cell, which is slow and triggers pandas dtype warnings.
lineage_list = pd.DataFrame(
    [_classify_lineage(raw_lineage) for raw_lineage in OTU_list],
    columns=list(tax_levels),
)

lineage_list.to_csv("lineage_classification_results.csv", index=False)
lineage_list

['Viruses', ' Varidnaviria', ' Bamfordvirae', ' Preplasmiviricota', ' Tectiliviricetes', ' Rowavirales', ' Adenoviridae', ' Aviadenovirus']
['Viruses', ' Varidnaviria', ' Bamfordvirae', ' Preplasmiviricota', ' Tectiliviricetes', ' Rowavirales', ' Adenoviridae', ' Aviadenovirus', ' Fowl aviadenovirus E']
['Viruses', ' Varidnaviria', ' Bamfordvirae', ' Preplasmiviricota', ' Tectiliviricetes', ' Rowavirales', ' Adenoviridae', ' Mastadenovirus', ' unclassified Human adenoviruses']
['Viruses', ' Varidnaviria', ' Bamfordvirae', ' Preplasmiviricota', ' Tectiliviricetes', ' Rowavirales', ' Adenoviridae', ' Mastadenovirus', ' Human mastadenovirus F']
['Viruses', ' Varidnaviria', ' Bamfordvirae', ' Preplasmiviricota', ' Tectiliviricetes', ' Rowavirales', ' Adenoviridae', ' Mastadenovirus']
['Viruses', ' Varidnaviria', ' Bamfordvirae', ' Preplasmiviricota', ' Tectiliviricetes', ' Rowavirales', ' Adenoviridae', ' Aviadenovirus', ' Duck aviadenovirus B']
['Viruses', ' Varidnaviria', ' Bamfordvirae'

  lineage_list.loc[i, "Empire"] = "Viruses"
  lineage_list.at[i, "Realm"] = lineage[1][1:]
  lineage_list.at[i, "Family"] = level[1:]
  lineage_list.at[i, "Kingdom"] = level[1:]
  lineage_list.at[i, "Phylum"] = level[1:]
  lineage_list.at[i, "Class"] = level[1:]
  lineage_list.at[i, "Order"] = level[1:]
  lineage_list.at[i, "Genus"] = level[1:]
  lineage_list.at[i, "Species"] = lineage[-1][1:]


['Viruses', ' Varidnaviria', ' Bamfordvirae', ' Preplasmiviricota', ' Tectiliviricetes', ' Rowavirales', ' Adenoviridae', ' Siadenovirus', ' Turkey siadenovirus A']
['Viruses', ' Varidnaviria', ' Bamfordvirae', ' Preplasmiviricota', ' Tectiliviricetes', ' Rowavirales', ' Adenoviridae', ' Atadenovirus', ' Lizard atadenovirus A']
['Viruses', ' Varidnaviria', ' Bamfordvirae', ' Preplasmiviricota', ' Tectiliviricetes', ' Rowavirales', ' Adenoviridae', ' unclassified Adenoviridae']
['Viruses', ' Duplodnaviria', ' Heunggongvirae', ' Uroviricota', ' Caudoviricetes', ' Salasmaviridae', ' Picovirinae', ' Salasvirus', ' Salasvirus PZA']
['Viruses', ' Duplodnaviria', ' Heunggongvirae', ' Uroviricota', ' Caudoviricetes', ' Salasmaviridae', ' Northropvirinae', ' unclassified Northropvirinae']
['Viruses', ' Duplodnaviria', ' Heunggongvirae', ' Uroviricota', ' Caudoviricetes', ' Salasmaviridae', ' Huangshavirus', ' Huangshavirus dlcuna']
['Viruses', ' Duplodnaviria', ' Heunggongvirae', ' Uroviricota'

Unnamed: 0,Empire,Realm,Kingdom,Phylum,Class,Order,Family,Genus,Species
0,Viruses,Varidnaviria,Bamfordvirae,Preplasmiviricota,Tectiliviricetes,Rowavirales,Adenoviridae,Aviadenovirus,
1,Viruses,Varidnaviria,Bamfordvirae,Preplasmiviricota,Tectiliviricetes,Rowavirales,Adenoviridae,Aviadenovirus,Fowl aviadenovirus E
2,Viruses,Varidnaviria,Bamfordvirae,Preplasmiviricota,Tectiliviricetes,Rowavirales,Adenoviridae,Mastadenovirus,
3,Viruses,Varidnaviria,Bamfordvirae,Preplasmiviricota,Tectiliviricetes,Rowavirales,Adenoviridae,Mastadenovirus,Human mastadenovirus F
4,Viruses,Varidnaviria,Bamfordvirae,Preplasmiviricota,Tectiliviricetes,Rowavirales,Adenoviridae,Mastadenovirus,
...,...,...,...,...,...,...,...,...,...
758,Cytota,Bacteria,Pseudomonadati,Myxococcota,Myxococcia,Myxococcales,,,
759,Cytota,Eukaryota,Fungi,Basidiomycota,Agaricomycetes,Agaricales,Mycenaceae,Mycena,
760,Cytota,Bacteria,Pseudomonadati,Campylobacterota,Epsilonproteobacteria,Campylobacterales,Campylobacteraceae,Campylobacter,
761,Cytota,Bacteria,Pseudomonadati,Pseudomonadota,Paraburkholderia,Burkholderiales,Burkholderiaceae,,
