# Extracting Sequences from BLAST Outputs

In [1]:
import pandas as pd
import glob

## Read tab-delimited BLAST output and concatenate

In [2]:
blast_results = pd.DataFrame()

for tab in glob.glob("BLAST-tabs/*.tab"):
    print("Found:", tab)
    blast_results = blast_results.append(pd.read_csv(tab, sep='\t'))

Found: BLAST-tabs/AFSV-tape-protien-blastp-against-virus-0.01evalue-1.tab
Found: BLAST-tabs/CroV-tape-protien-blastp-against-virus-0.01evalue-1.tab
Found: BLAST-tabs/Faustovirus-tape-protien-blastp-against-virus-0.01evalue-1.tab
Found: BLAST-tabs/Mimivirus-tape-protien-blastp-against-virus-0.01evalue-1.tab
Found: BLAST-tabs/PBCV-1-tape-protien-blastp-against-virus-0.01evalue-1.tab


In [3]:
blast_results.head()

Unnamed: 0,Entry,Info,Unnamed: 2,Unnamed: 3,Gene names,Organism,Organism ID,Protein names,Taxonomic lineage IDs,Sequence
P0CAB3,E-value: 0.0;,"Score: 6,490;",Ident.: 100.0%,,Ken-072,African swine fever virus (isolate Pig/Kenya/K...,561445,Uncharacterized protein M1249L (pM1249L),561445,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...
A0A0C5AWP7,E-value: 0.0;,"Score: 6,440;",Ident.: 99.0%,,BA71V-M1249L,African swine fever virus (ASFV),10497,BA71V-M1249L,10497,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...
A0A2Z5DFV5,E-value: 0.0;,"Score: 6,435;",Ident.: 98.9%,,M1249L,African swine fever virus (ASFV),10497,PM1249L,10497,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...
A0A2Z5DGI2,E-value: 0.0;,"Score: 6,434;",Ident.: 98.9%,,M1249L,African swine fever virus (ASFV),10497,PM1249L,10497,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...
A0A0C5AWA9,E-value: 0.0;,"Score: 6,403;",Ident.: 98.9%,,BA71V-M1249L,African swine fever virus (ASFV),10497,BA71V-M1249L,10497,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...


## Fix the table
There is an empty column, and the column some column names are wrong...

In [4]:
del blast_results['Unnamed: 3']
blast_results = blast_results.rename(columns = {'Entry': 'E-value',
                                                'Info': 'Score',
                                                'Unnamed: 2': 'Identity'})

In [5]:
blast_results['Accession'] = blast_results.index

In [6]:
blast_results.head()

Unnamed: 0,E-value,Score,Identity,Gene names,Organism,Organism ID,Protein names,Taxonomic lineage IDs,Sequence,Accession
P0CAB3,E-value: 0.0;,"Score: 6,490;",Ident.: 100.0%,Ken-072,African swine fever virus (isolate Pig/Kenya/K...,561445,Uncharacterized protein M1249L (pM1249L),561445,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...,P0CAB3
A0A0C5AWP7,E-value: 0.0;,"Score: 6,440;",Ident.: 99.0%,BA71V-M1249L,African swine fever virus (ASFV),10497,BA71V-M1249L,10497,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...,A0A0C5AWP7
A0A2Z5DFV5,E-value: 0.0;,"Score: 6,435;",Ident.: 98.9%,M1249L,African swine fever virus (ASFV),10497,PM1249L,10497,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...,A0A2Z5DFV5
A0A2Z5DGI2,E-value: 0.0;,"Score: 6,434;",Ident.: 98.9%,M1249L,African swine fever virus (ASFV),10497,PM1249L,10497,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...,A0A2Z5DGI2
A0A0C5AWA9,E-value: 0.0;,"Score: 6,403;",Ident.: 98.9%,BA71V-M1249L,African swine fever virus (ASFV),10497,BA71V-M1249L,10497,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...,A0A0C5AWA9


## Remove duplicates

In [7]:
print("Total rows:", len(blast_results))
print("Total unique ids:", len(blast_results.Accession.unique()))

Total rows: 284
Total unique ids: 173


In [8]:
seqs_with_ids = blast_results.loc[:, ["Accession", "Sequence", "Gene names", "Organism", "Organism ID",
                                      "Protein names", "Taxonomic lineage IDs"]]

In [9]:
seqs_with_ids = seqs_with_ids.drop_duplicates()

In [10]:
seqs_with_ids.head()

Unnamed: 0,Accession,Sequence,Gene names,Organism,Organism ID,Protein names,Taxonomic lineage IDs
P0CAB3,P0CAB3,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...,Ken-072,African swine fever virus (isolate Pig/Kenya/K...,561445,Uncharacterized protein M1249L (pM1249L),561445
A0A0C5AWP7,A0A0C5AWP7,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...,BA71V-M1249L,African swine fever virus (ASFV),10497,BA71V-M1249L,10497
A0A2Z5DFV5,A0A2Z5DFV5,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...,M1249L,African swine fever virus (ASFV),10497,PM1249L,10497
A0A2Z5DGI2,A0A2Z5DGI2,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...,M1249L,African swine fever virus (ASFV),10497,PM1249L,10497
A0A0C5AWA9,A0A0C5AWA9,MEEVITIAQIVHRGTDILSLNNEEIEALVDEIYSTLKGSNDIKNIR...,BA71V-M1249L,African swine fever virus (ASFV),10497,BA71V-M1249L,10497


In [11]:
unique_id = seqs_with_ids["Organism"] + "|" + seqs_with_ids["Gene names"] + "|" + seqs_with_ids["Accession"]

In [12]:
unique_id = unique_id.str.replace(" ", "_")

In [13]:
seqs_with_ids["unique_id"] = unique_id

## Write csv

In [14]:
seqs_with_ids.to_csv("unique_sequences.csv", index=False)

## Convert csv to fasta

In [12]:
%%bash
python csv_to_fasta.py --help

Usage: csv_to_fasta.py [OPTIONS] CSVFILE IDCOL SEQCOL

Options:
  --version          Show the version and exit.
  -f, --fafile TEXT
  -d, --descol TEXT
  --help             Show this message and exit.


In [51]:
%%bash
python csv_to_fasta.py -f unique-sequences.fa \
    unique_sequences.csv unique_id Sequence

In [15]:
%%bash
python csv_to_fasta.py -f unique-sequences-by-accession.fa \
    unique_sequences.csv Accession Sequence

Traceback (most recent call last):
  File "csv_to_fasta.py", line 4, in <module>
    import click
ModuleNotFoundError: No module named 'click'


CalledProcessError: Command 'b'python csv_to_fasta.py -f unique-sequences-by-accession.fa \\\n    unique_sequences.csv Accession Sequence\n'' returned non-zero exit status 1.