# Extracting Sequences from BLAST Outputs

In [1]:
import pandas as pd
import glob

## Read tab-delimited BLAST output and concatenate

In [2]:
# Read every tab-delimited BLAST export and stack them into a single table.
# The per-file frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and it also re-copied the
# accumulated table on every iteration.
blast_frames = []

for tab in glob.glob("BLAST-tabs/*.tab"):
    print("Found:", tab)
    blast_frames.append(pd.read_csv(tab, sep='\t'))

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no files matched the pattern.
blast_results = pd.concat(blast_frames) if blast_frames else pd.DataFrame()

Found: BLAST-tabs/PBCV-1-tape-protien-blastp-against-virus-0.01evalue-1.tab
Found: BLAST-tabs/CroV-tape-protien-blastp-against-virus-0.01evalue-1.tab
Found: BLAST-tabs/Faustovirus-tape-protien-blastp-against-virus-0.01evalue-1.tab
Found: BLAST-tabs/Mimivirus-tape-protien-blastp-against-virus-0.01evalue-1.tab
Found: BLAST-tabs/AFSV-tape-protien-blastp-against-virus-0.01evalue-1.tab


In [3]:
# Preview the first five rows of the concatenated table (5 is head()'s default).
blast_results.head(n=5)

Unnamed: 0,Entry,Info,Unnamed: 2,Unnamed: 3,Gene names,Organism,Organism ID,Protein names,Taxonomic lineage IDs,Sequence
Q84656,E-value: 0.0;,"Score: 3,060;",Ident.: 100.0%,,A342L,Paramecium bursaria Chlorella virus 1 (PBCV-1),10506,Uncharacterized protein,10506,MCNTYYKRVKFNLFLYTNSIEMELLAVASIIGYGLFSSQQGRETRP...
M1I204,E-value: 0.0;,"Score: 2,923;",Ident.: 99.1%,,CvsA1_388L PBCVCvsA1_388L,Paramecium bursaria Chlorella virus CvsA1,1278254,Uncharacterized protein,1278254,MELLAVASIIGYGLFSSQQGRETRPDRNRYAEALGSGQGLDEDYDV...
M1H8N0,E-value: 0.0;,"Score: 2,923;",Ident.: 99.1%,,MA-1E_415L PBCVMA1E_415L,Paramecium bursaria Chlorella virus MA-1E,1278260,Uncharacterized protein,1278260,MELLAVASIIGYGLFSSQQGRETRPDRNRYAEALGSGQGLDEDYDV...
M1HYV5,E-value: 0.0;,"Score: 2,923;",Ident.: 99.1%,,CviKI_378L PBCVCviKI_378L,Paramecium bursaria Chlorella virus CviKI,1278253,Uncharacterized protein,1278253,MELLAVASIIGYGLFSSQQGRETRPDRNRYAEALGSGQGLDEDYDV...
M1HWS0,E-value: 0.0;,"Score: 2,919;",Ident.: 98.9%,,KS1B_324L PBCVKS1B_324L,Paramecium bursaria Chlorella virus KS1B,1278258,Uncharacterized protein,1278258,MELLAVASIIGYGLFSSQQGRETRPDRNRYAEALGSGQGLDEDYDV...


## Fix the table
There is an empty column (`Unnamed: 3`), and some of the column names are wrong: `Entry`, `Info`, and `Unnamed: 2` actually hold the E-value, score, and identity.

In [4]:
# Tidy the columns. Written as one drop/rename chain instead of `del`, so
# re-running this cell is a no-op rather than a KeyError on the column that
# was already removed (rename silently skips labels that are not present).
blast_results = (
    blast_results
    # 'Unnamed: 3' is the empty column.
    .drop(columns=['Unnamed: 3'], errors='ignore')
    # These three headers do not match their contents: the columns hold the
    # E-value, score, and identity strings respectively.
    .rename(columns={'Entry': 'E-value',
                     'Info': 'Score',
                     'Unnamed: 2': 'Identity'})
)

In [5]:
# The row index holds the accession IDs; copy it into a regular column so it
# can be selected and de-duplicated like any other field.
accession_ids = blast_results.index
blast_results['Accession'] = accession_ids

In [6]:
# Preview the first five rows after the column fixes (5 is head()'s default).
blast_results.head(n=5)

Unnamed: 0,E-value,Score,Identity,Gene names,Organism,Organism ID,Protein names,Taxonomic lineage IDs,Sequence,Accession
Q84656,E-value: 0.0;,"Score: 3,060;",Ident.: 100.0%,A342L,Paramecium bursaria Chlorella virus 1 (PBCV-1),10506,Uncharacterized protein,10506,MCNTYYKRVKFNLFLYTNSIEMELLAVASIIGYGLFSSQQGRETRP...,Q84656
M1I204,E-value: 0.0;,"Score: 2,923;",Ident.: 99.1%,CvsA1_388L PBCVCvsA1_388L,Paramecium bursaria Chlorella virus CvsA1,1278254,Uncharacterized protein,1278254,MELLAVASIIGYGLFSSQQGRETRPDRNRYAEALGSGQGLDEDYDV...,M1I204
M1H8N0,E-value: 0.0;,"Score: 2,923;",Ident.: 99.1%,MA-1E_415L PBCVMA1E_415L,Paramecium bursaria Chlorella virus MA-1E,1278260,Uncharacterized protein,1278260,MELLAVASIIGYGLFSSQQGRETRPDRNRYAEALGSGQGLDEDYDV...,M1H8N0
M1HYV5,E-value: 0.0;,"Score: 2,923;",Ident.: 99.1%,CviKI_378L PBCVCviKI_378L,Paramecium bursaria Chlorella virus CviKI,1278253,Uncharacterized protein,1278253,MELLAVASIIGYGLFSSQQGRETRPDRNRYAEALGSGQGLDEDYDV...,M1HYV5
M1HWS0,E-value: 0.0;,"Score: 2,919;",Ident.: 98.9%,KS1B_324L PBCVKS1B_324L,Paramecium bursaria Chlorella virus KS1B,1278258,Uncharacterized protein,1278258,MELLAVASIIGYGLFSSQQGRETRPDRNRYAEALGSGQGLDEDYDV...,M1HWS0


## Remove duplicates

In [7]:
# Compare the total number of hits with the number of distinct accessions to
# see how many rows are repeats.
total_rows = len(blast_results)
unique_ids = len(blast_results["Accession"].unique())
print(f"Total rows: {total_rows}")
print(f"Total unique ids: {unique_ids}")

Total rows: 284
Total unique ids: 173


In [8]:
# Keep the accession, the sequence, and the descriptive annotation columns;
# the E-value, Score, and Identity columns are left out.
kept_columns = [
    "Accession",
    "Sequence",
    "Gene names",
    "Organism",
    "Organism ID",
    "Protein names",
    "Taxonomic lineage IDs",
]
seqs_with_ids = blast_results[kept_columns]

In [9]:
# Drop rows that are identical across every selected column, keeping the
# first occurrence of each (both arguments are the pandas defaults).
seqs_with_ids = seqs_with_ids.drop_duplicates(subset=None, keep="first")

In [10]:
# Preview the first five de-duplicated rows (5 is head()'s default).
seqs_with_ids.head(n=5)

Unnamed: 0,Accession,Sequence,Gene names,Organism,Organism ID,Protein names,Taxonomic lineage IDs
Q84656,Q84656,MCNTYYKRVKFNLFLYTNSIEMELLAVASIIGYGLFSSQQGRETRP...,A342L,Paramecium bursaria Chlorella virus 1 (PBCV-1),10506,Uncharacterized protein,10506
M1I204,M1I204,MELLAVASIIGYGLFSSQQGRETRPDRNRYAEALGSGQGLDEDYDV...,CvsA1_388L PBCVCvsA1_388L,Paramecium bursaria Chlorella virus CvsA1,1278254,Uncharacterized protein,1278254
M1H8N0,M1H8N0,MELLAVASIIGYGLFSSQQGRETRPDRNRYAEALGSGQGLDEDYDV...,MA-1E_415L PBCVMA1E_415L,Paramecium bursaria Chlorella virus MA-1E,1278260,Uncharacterized protein,1278260
M1HYV5,M1HYV5,MELLAVASIIGYGLFSSQQGRETRPDRNRYAEALGSGQGLDEDYDV...,CviKI_378L PBCVCviKI_378L,Paramecium bursaria Chlorella virus CviKI,1278253,Uncharacterized protein,1278253
M1HWS0,M1HWS0,MELLAVASIIGYGLFSSQQGRETRPDRNRYAEALGSGQGLDEDYDV...,KS1B_324L PBCVKS1B_324L,Paramecium bursaria Chlorella virus KS1B,1278258,Uncharacterized protein,1278258


## Write csv

In [11]:
# Save the de-duplicated table. The index is not written; the accession is
# already available as the "Accession" column.
csv_path = "unique_sequences.csv"
seqs_with_ids.to_csv(csv_path, index=False)

## Convert csv to fasta

In [12]:
%%bash
# Show the converter's command-line usage before running it.
python csv_to_fasta.py --help

Usage: csv_to_fasta.py [OPTIONS] CSVFILE IDCOL SEQCOL

Options:
  --version          Show the version and exit.
  -f, --fafile TEXT
  -d, --descol TEXT
  --help             Show this message and exit.


In [15]:
%%bash
# Convert the CSV to FASTA: IDCOL is "Accession", SEQCOL is "Sequence", and
# the protein name and lineage IDs are passed as --descol values.
python csv_to_fasta.py \
    --fafile unique-sequences.fa \
    --descol "Protein names" \
    --descol "Taxonomic lineage IDs" \
    unique_sequences.csv Accession Sequence