In [None]:
# In the analysis folder, create a "queries" folder and put in it the protein sequences of the
# virulence genes to be tested.
# NB: a query file can be a multi-FASTA file (several versions of the same gene)
# or a FASTA file holding a single sequence.
# For example, the file eae.fasta.fasta contains different versions of intimin genes (beta, epsilon, etc.),
# bfp.fasta contains the bfp genes taken from a plasmid,
# ehxA contains the reference sequence for the hemolysin.
# Name these genes properly, because the names are used in the program's output.

In [2]:
import os
import glob
import re
from pathlib import Path
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio.Alphabet import IUPAC

In [3]:
def get_files(start_dir, file_type):
    """Recursively list the files under a directory that have a given extension.

    Args:
        start_dir: Directory to search; its sub-directories are searched too.
        file_type: File extension without the leading dot (e.g. "gbk").

    Returns:
        A list of path strings (each one prefixed by ``start_dir``), sorted so
        that the order is the same from one run to the next. The list is empty
        when nothing matches.
    """
    # Path.glob yields entries in a filesystem-dependent order; sorting keeps
    # the downstream loops and their printed output reproducible.
    return sorted(str(path) for path in Path(start_dir).glob('**/*.' + file_type))

In [None]:
# Preparation of a database of CDS from all strains!
# Every GenBank file found under ./MS_minION_all_gbk yields one protein FASTA
# file in ./database, holding one record per CDS feature.
cwd = os.getcwd()
for folder_name in ('database', 'xml'):
    # exist_ok makes the cell safe to run again
    os.makedirs(os.path.join(cwd, folder_name), exist_ok=True)

gbk_files = get_files("./MS_minION_all_gbk", "gbk")

# Doing it on a bunch of gbk files
for gbk_file in gbk_files:

    # Name the fasta file from the gbk file. Path.stem drops only the final
    # extension, whereas str.replace('.gbk', '') would also remove a '.gbk'
    # found in the middle of the file name.
    outfile_name = Path(gbk_file).stem
    print('Adding ', outfile_name, sep=' ')

    skipped_cds = 0

    # Prepare a fasta file for writing
    with open("database/" + outfile_name + '.fasta', "w") as output_handle:
        for record in SeqIO.parse(gbk_file, "genbank"):
            # print('  Dealing with ', record.description, record.name, sep=' ')
            for feature in record.features:
                if feature.type != 'CDS':
                    continue

                # A CDS without a /translation qualifier has no protein
                # sequence to export; skip it instead of raising KeyError.
                translation = feature.qualifiers.get('translation')
                if not translation:
                    skipped_cds += 1
                    continue

                # If a gene tag is present use it as the name, otherwise use the locus tag
                if 'gene' in feature.qualifiers:
                    feature_label = feature.qualifiers['gene'][0]
                else:
                    feature_label = feature.qualifiers['locus_tag'][0]
                feature_id = feature_label + '_' + record.name + '_' + outfile_name

                # Create the sequence object. No alphabet is passed: Bio.Alphabet no
                # longer exists in current Biopython, and the FASTA writer never used it.
                my_seq = Seq(translation[0])

                # Create a SeqRecord object with the protein sequence
                simple_seq_r = SeqRecord(my_seq, id=feature_id, description='')
                SeqIO.write(simple_seq_r, output_handle, "fasta")

    if skipped_cds:
        print('  Skipped', skipped_cds, 'CDS without a translation')

In [None]:
# In a terminal prepare a NCBI blast+ db
# Something like:
# makeblastdb -in t6ss.fasta -input_type fasta  -dbtype prot -title T6SS_stuff -out T6SS

In [4]:
# Run one blastp search per query file against the MS_strains database.
# The MS_strains NCBI BLAST+ database contains the protein sequences from the
# genomes of the 244 strains included in the manuscript.

# The XML reports are written to ./xml, so create that folder when it is missing
cwd = os.getcwd()
xml_dir = cwd + '/xml'
if not os.path.exists(xml_dir):
    os.mkdir(xml_dir)

# Collect every FASTA query file found under ./queries
virulence_factors = get_files("./queries", "fasta")

# Launch the searches through Biopython's blastp command-line wrapper
for query_fasta in virulence_factors:
    # The report is named after the query file, with '.fasta' removed
    outname = os.path.basename(query_fasta).replace('.fasta', '')
    blastp_cline = NcbiblastpCommandline(
        query=query_fasta,
        db="MS_strains",
        evalue=1e-6,
        outfmt=5,  # these reports are read back with NCBIXML in later cells
        out='xml/' + outname + '.xml',
    )
    stdout, stderr = blastp_cline()



In [5]:
# This code prints a readable report of the very high-quality alignments
# and records the names of the strains that contain the query sequence.
# Will work only if the query contains multiple versions of the same gene, e.g. many slightly related intimins
from Bio.Blast import NCBIXML  # imported once, not on every loop iteration

# First grab the blastp output files in the xml folder
xml_files = get_files ("./xml", "xml")

# Pattern for identifying our strains in alignment titles:
# group 1 is the strain name (starts with "Res13"), group 2 is the rest of the
# title after the next underscore (e.g. "chromosome" or "plasmid_1561").
# Raw string: '\_' in a normal string literal is an invalid escape sequence.
p = re.compile(r'.+_(Res13[^_]+)_(\S+)')
E_VALUE_THRESH = 1e-80
S = set()  # (virulence gene, strain, rest of title) tuples; read by the next cell
for f in xml_files:
    virulence_gene_name = os.path.basename(f).replace('.xml','')
    print ("Analyzing", virulence_gene_name, sep=' ')
    # The with-block closes the report once it has been parsed completely;
    # NCBIXML.parse is lazy, so all the iteration has to stay inside it.
    with open(f) as result_handle:
        blast_records = NCBIXML.parse(result_handle)
        for blast_record_i, blast_record in enumerate(blast_records, start=1):
            print('* Query #' + repr(blast_record_i) + ' is ' + blast_record.query + '*', end='')
            print('(length=' + repr(blast_record.query_letters) + ') ', end='\n')

            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    # restrict to near full-length alignments: the aligned
                    # region must cover more than 95% of the query length
                    if hsp.expect < E_VALUE_THRESH and hsp.align_length / blast_record.query_letters > 0.95:
                        print("****Alignment****")
                        print("sequence:", alignment.title)
                        m = p.match(alignment.title)  # look if the pattern can be found in the alignment title
                        if m:
                            S.add((virulence_gene_name, m.group(1), m.group(2)))  # add a tuple to our set S
                        print("length:", alignment.length)
                        print("e value:", hsp.expect)
                        print(hsp.query[0:75] + "...")
                        print(hsp.match[0:75] + "...")
                        print(hsp.sbjct[0:75] + "...")

Analyzing ehxA
* Query #1 is YP_308794.1 hemolysin A (plasmid) [Escherichia coli]*(length=998) 
****Alignment****
sequence: gnl|BL_ORD_ID|1054499 hlyA_p1561_tig141_Res13-Sevr-PEA27-14_plasmid_1561
length: 998
e value: 0.0
MTVNKIKNIFNNATSTTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINELVKAADELGIEIHRTERDDTAIA...
MTVNKIKNIFNNAT TTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINELVKAADELGIEIHRTERDDTAIA...
MTVNKIKNIFNNATLTTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINELVKAADELGIEIHRTERDDTAIA...
****Alignment****
sequence: gnl|BL_ORD_ID|1049471 hlyA_p1561_tig148_Res13-Sevr-PEA24-03_plasmid_1561
length: 998
e value: 0.0
MTVNKIKNIFNNATSTTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINELVKAADELGIEIHRTERDDTAIA...
MTVNKIKNIFNNAT TTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINELVKAADELGIEIHRTERDDTAIA...
MTVNKIKNIFNNATLTTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINELVKAADELGIEIHRTERDDTAIA...
****Alignment****
sequence: gnl|BL_ORD_ID|1044449 hlyA_p1561_tig129_Res13-Sevr-PEA20-33_plasmid_1561
length: 998
e value: 0.0
MTVNKIKNIFNNATSTTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINE

* Query #7 is bfpE*(length=352) 
****Alignment****
sequence: gnl|BL_ORD_ID|395499 pilR_chr_tig34_Res13-Lact-EA06-47_chromosome
length: 356
e value: 1.03565e-165
RLLFTSKTRMRVFSKLSRYLSNGVPVTFALAELYKFTSDEGRKKDNPDAFALQRWLIAVRNGKTLAEAMRGWVPF...
++LFTSK RMR++ KL+RYL+NGVP+TFAL ELYKFT+D G++   P A A+  W I++RNG +L +A++GWVP ...
KMLFTSKLRMRIYEKLARYLANGVPLTFALDELYKFTTDSGKRNKTPQAIAIHMWSISIRNGDSLTKALKGWVPE...
* Query #8 is bfpF*(length=331) 
****Alignment****
sequence: gnl|BL_ORD_ID|395500 PHBEAHDP_00714_chr_tig34_Res13-Lact-EA06-47_chromosome
length: 321
e value: 2.27588e-148
INELNFADLLISKDSYNFRFLEGQPYPICNVPDNYNAEVQEMVQELECIREAKGNEFFYLHLGVPYRVAVVQTIS...
+  + FADLLIS   Y+FR+L+G   PI  V + Y+AE++++V+EL    +   +EFFY +  +P+R ++V+T+ ...
LKSIYFADLLISNKGYHFRYLKGASNPITVVDEIYHAEIKKIVEELNIRIDNGEHEFFYSYQNIPFRASIVETVD...
* Query #9 is bfpP*(length=249) 
****Alignment****
sequence: gnl|BL_ORD_ID|395501 pppA_chr_tig34_Res13-Lact-EA06-47_chromosome
length: 243
e value: 9.48872e-81
FIY--AATITSFIWLAVERLPHQLKWVDNPVSDI

* Query #4 is KT591267.1_1 Escherichia coli strain EP255_alpha1 intimin (eae) gene, complete cds*(length=940) 
****Alignment****
sequence: gnl|BL_ORD_ID|1051219 eae_1_chr_tig47_Res13-Sevr-PEA27-14_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHNSYQNRLFYTLKTGETVAD...
MITHG Y RTRHKHKLKKT IMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTH+SYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
****Alignment****
sequence: gnl|BL_ORD_ID|1047646 eae_1_chr_tig102_Res13-Sevr-PEA24-03_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHNSYQNRLFYTLKTGETVAD...
MITHG Y RTRHKHKLKKT IMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTH+SYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
****Alignment****
sequence: gnl|BL_ORD_ID|1042182 eae_1_chr_tig88_Res13-Sevr-PEA20-33_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLG

* Query #6 is KT591295.1_1 Escherichia coli strain EP090_iota1 intimin (eae) gene, complete cds*(length=938) 
****Alignment****
sequence: gnl|BL_ORD_ID|1051219 eae_1_chr_tig47_Res13-Sevr-PEA27-14_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHNSYQNRLFYTLKTGETVAD...
MITHG Y RTRHKHKLKKT IMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTH+SYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
****Alignment****
sequence: gnl|BL_ORD_ID|1047646 eae_1_chr_tig102_Res13-Sevr-PEA24-03_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHNSYQNRLFYTLKTGETVAD...
MITHG Y RTRHKHKLKKT IMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTH+SYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
****Alignment****
sequence: gnl|BL_ORD_ID|1042182 eae_1_chr_tig88_Res13-Sevr-PEA20-33_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGS

* Query #9 is KT591313.1_1 Escherichia coli strain EP088_mu intimin (eae) gene, complete cds*(length=936) 
****Alignment****
sequence: gnl|BL_ORD_ID|1051219 eae_1_chr_tig47_Res13-Sevr-PEA27-14_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHNSYQNRLFYTLKTGETVAD...
MITHG Y RTRHKHKLKKT IMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTH+SYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
****Alignment****
sequence: gnl|BL_ORD_ID|1047646 eae_1_chr_tig102_Res13-Sevr-PEA24-03_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHNSYQNRLFYTLKTGETVAD...
MITHG Y RTRHKHKLKKT IMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTH+SYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
****Alignment****
sequence: gnl|BL_ORD_ID|1042182 eae_1_chr_tig88_Res13-Sevr-PEA20-33_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSK

****Alignment****
sequence: gnl|BL_ORD_ID|437660 eae_chr_tig38_Res13-Lact-EA22-14_chromosome
length: 935
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHNSYQNRLFYTLKTGETVAD...
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKL SDSKLLT N  Q+RLFYTLKTGETV+ ...
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLSSDSKLLTQNVAQDRLFYTLKTGETVSS...
****Alignment****
sequence: gnl|BL_ORD_ID|413194 eae_chr_tig211_Res13-Lact-EA10-01_chromosome
length: 935
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHNSYQNRLFYTLKTGETVAD...
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKL SDSKLLT N  Q+RLFYTLKTGETV+ ...
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLSSDSKLLTQNVAQDRLFYTLKTGETVSS...


In [6]:
# Now discover which strains have the complete query genes:
# build a list from the set of hits, ordered by strain name (second field of each tuple)
L = sorted(S, key=lambda hit: hit[1])
L

[('eae', 'Res13-Croi-PEA15-12', 'chromosome'),
 ('ehxA', 'Res13-Croi-PEA15-12', 'plasmid_1561'),
 ('paa', 'Res13-Croi-PEA15-12', 'chromosome'),
 ('paa', 'Res13-Croi-PEA21-28', 'chromosome'),
 ('eae', 'Res13-Croi-PEA21-28', 'chromosome'),
 ('ehxA', 'Res13-Croi-PEA21-28', 'plasmid_1561'),
 ('ehxA', 'Res13-Croi-PEA22-35', 'plasmid_1561'),
 ('eae', 'Res13-Croi-PEA22-35', 'chromosome'),
 ('paa', 'Res13-Croi-PEA22-35', 'chromosome'),
 ('eae', 'Res13-Croi-PEA24-40', 'chromosome'),
 ('ehxA', 'Res13-Croi-PEA24-40', 'plasmid_1561'),
 ('paa', 'Res13-Croi-PEA24-40', 'chromosome'),
 ('eae', 'Res13-Croi-PEA27-33', 'chromosome'),
 ('ehxA', 'Res13-Croi-PEA27-33', 'plasmid_1561'),
 ('paa', 'Res13-Croi-PEA27-33', 'chromosome'),
 ('paa', 'Res13-Croi-PER03-14', 'plasmid_377'),
 ('paa', 'Res13-Fini-PEA17-13', 'chromosome'),
 ('eae', 'Res13-Fini-PEA17-13', 'chromosome'),
 ('ehxA', 'Res13-Fini-PEA17-13', 'plasmid_1561'),
 ('bfp', 'Res13-Lact-EA06-47', 'chromosome'),
 ('eae', 'Res13-Lact-EA10-01', 'chromosome

In [7]:
# Shell escape: print the current working directory, against which the relative
# paths used in this notebook (queries/, xml/, database/) are resolved.
!pwd

/data/GTlab/MS_minION/virulence_gene_detection


In [189]:
'''Now we are looking for bundle forming pili genes (bfp) in our strains
We will use the Genbank Accession number AB024946 (S.abraham et al. )
Go get the bfp gene in this Genbank record of this plasmid
and then prepare a multifasta file with bfp genes present on a plasmid
'''
from Bio import Entrez
Entrez.email ="jean-simon.brouard@canada.ca"

# Download the GenBank record and keep a local copy of it
handle = Entrez.efetch(db="nucleotide", id='AB024946', rettype="gb", retmode="text")
try:
    with open('./AB024946.gb', "w") as fp:
        fp.write(handle.read())
finally:
    # Release the connection even when writing the local copy fails
    handle.close()

# Read only 1 record
record = SeqIO.read("AB024946.gb", "genbank")

# This cell can run before the queries folder has been created by hand
os.makedirs('queries', exist_ok=True)

# Prepare a fasta file for writing: one protein record per bfp CDS
with open('queries/bfp.fasta', "w") as output_handle:

    for feature in record.features:
        if feature.type != 'CDS':
            continue

        # Keep only the CDS whose gene tag mentions bfp
        gene_names = feature.qualifiers.get('gene')
        if not gene_names or 'bfp' not in gene_names[0]:
            continue

        # A CDS without a /translation qualifier has no protein to export
        translation = feature.qualifiers.get('translation')
        if not translation:
            continue

        # The gene name is used as the FASTA identifier. No alphabet is passed to
        # Seq: Bio.Alphabet no longer exists in current Biopython, and the FASTA
        # writer never used it.
        simple_seq_r = SeqRecord(Seq(translation[0]), id=gene_names[0], description='')
        SeqIO.write(simple_seq_r, output_handle, "fasta")

In [238]:
# Code for outputting the description lines of the blast output
# (alignment title, bit score and E-value).
# Note that there is 1 BLAST record object per query when many sequences are given as query.
# In addition, hits with good E-value are found in all strains, but one will want to look
# at the alignments to determine if the eae gene is present or not (see the alignment report cell)!
from Bio.Blast import NCBIXML  # imported once, not on every loop iteration

# NOTE: this threshold is assigned here but not applied below; every
# description present in the reports is printed.
E_VALUE_THRESH = 1e-80
for f in xml_files:
    print ("Analyzing", f, sep=' ')
    # The with-block closes the report once it has been parsed completely;
    # NCBIXML.parse is lazy, so the iteration has to stay inside it.
    with open(f) as result_handle:
        blast_records = NCBIXML.parse(result_handle)
        for blast_record in blast_records:
            for description in blast_record.descriptions:
                print(description)


Analyzing xml/ehxA/ehxA.xml
gnl|BL_ORD_ID|1054499 hlyA_p1561_tig141_Res13-Sevr-PEA27-14_plasmid_1561 5123.0  0.0
gnl|BL_ORD_ID|1049471 hlyA_p1561_tig148_Res13-Sevr-PEA24-03_plasmid_1561 5123.0  0.0
gnl|BL_ORD_ID|1044449 hlyA_p1561_tig129_Res13-Sevr-PEA20-33_plasmid_1561 5123.0  0.0
gnl|BL_ORD_ID|1034531 hlyA_p1561_tig127_Res13-Sevr-PEA15-37_plasmid_1561 5123.0  0.0
gnl|BL_ORD_ID|767049 hlyA_p1561_tig151_Res13-Lact-PEA28-10_plasmid_1561 5123.0  0.0
gnl|BL_ORD_ID|757633 hlyA_p1561_tig138_Res13-Lact-PEA25-11_plasmid_1561 5123.0  0.0
gnl|BL_ORD_ID|262772 hlyA_p1561_tig191_Res13-Fini-PEA17-13_plasmid_1561 5123.0  0.0
gnl|BL_ORD_ID|162168 hlyA_p1561_tig158_Res13-Croi-PEA27-33_plasmid_1561 5123.0  0.0
gnl|BL_ORD_ID|157151 hlyA_p1561_tig132_Res13-Croi-PEA24-40_plasmid_1561 5123.0  0.0
gnl|BL_ORD_ID|152127 hlyA_p1561_tig63_Res13-Croi-PEA22-35_plasmid_1561 5123.0  0.0
gnl|BL_ORD_ID|147109 hlyA_p1561_tig56_Res13-Croi-PEA21-28_plasmid_1561 5123.0  0.0
gnl|BL_ORD_ID|133077 hlyA_p1561_tig143_Res13-C

In [191]:
# Search the bfp queries against the MS_strains database; the XML report goes to xml/bfp/.
# Nothing else in the notebook creates that sub-folder, so make sure it exists first.
os.makedirs('xml/bfp', exist_ok=True)
blastp_cline = NcbiblastpCommandline(query = "queries/bfp.fasta" , db="MS_strains", evalue=1e-6, outfmt=5, out='xml/bfp/bfp.xml')
stdout, stderr = blastp_cline()

In [200]:
# Search the paa queries against the MS_strains database; the XML report goes to xml/paa/.
# Nothing else in the notebook creates that sub-folder, so make sure it exists first.
os.makedirs('xml/paa', exist_ok=True)
blastp_cline = NcbiblastpCommandline(query = "queries/paa.fasta" , db="MS_strains", evalue=1e-6, outfmt=5, out='xml/paa/paa.xml')
stdout, stderr = blastp_cline()

In [None]:
'''Now we are looking for bundle forming pili genes (bfp) in our strains
We will use the Genbank Accession number AB024946 (S.abraham et al. )
Go get the bfp gene in this Genbank record of this plasmid
and then prepare a multifasta file with bfp genes present on a plasmid
'''
# This cell only downloads the GenBank record to ./AB024946.gb;
# it does not write the multi-FASTA file itself.
from Bio import Entrez
Entrez.email ="jean-simon.brouard@canada.ca"
handle = Entrez.efetch(db="nucleotide", id='AB024946', rettype="gb", retmode="text")
try:
    with open('./AB024946.gb', "w") as fp:
        fp.write(handle.read())
finally:
    # Release the connection even when writing the local copy fails
    handle.close()

In [None]:
import os
import time
import sys

from Bio import Entrez
from Bio import SeqIO
# Import the Seq class, as the import cell at the top of the notebook does.
# `from Bio import Seq` would rebind the name `Seq` to the Bio.Seq module and
# break every later `Seq(...)` call made in the same kernel.
from Bio.Seq import Seq

# Download, as FASTA, every accession listed in ncbi_ids.txt.
# The file holds one accession number per line, for example:
#CP009861
#CP012928
#CP017726
#CP016042
#NC_011513
#JN983049


Entrez.email ="jean-simon.brouard@canada.ca"

# make a folder for output (exist_ok: safe to run the cell again)
os.makedirs('./retrieved_from_ncbi', exist_ok=True)

# Get the list of all accessions we want to fetch. A set removes duplicates and
# blank lines are ignored. The set is deliberately not called `S`, so that it
# does not overwrite the set of BLAST hits built by the alignment report cell.
with open("ncbi_ids.txt") as id_list:
    accession_ids = {line.strip() for line in id_list if line.strip()}

# Sorted so that the download order is the same on every run
for accession in sorted(accession_ids):
    print ('Fetching ' + accession + ' from NCBI')
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    try:
        with open(os.getcwd() + '/retrieved_from_ncbi/'+ accession + '.fasta', "w") as fp:
            fp.write(handle.read())
    finally:
        # Release the connection even when writing the file fails
        handle.close()
    # Wait one second between two requests
    time.sleep(1)

In [None]:
# Scratch cell: show the built-in help of the blastp command-line wrapper used in this notebook
help(NcbiblastpCommandline)

In [None]:
# Scratch cell: show the built-in help of the SeqRecord class used to build the FASTA records
help(SeqRecord)

In [254]:
# Scratch cell: rebinds `f`, the name the report loops use as their loop variable,
# to a sample file name (presumably a placeholder; "bidon" is French for "dummy")
f= "bidon.fasta"

In [257]:
# Scratch cell: a sample absolute path, passed to os.path.basename in the next cell
patou = "/data/ext4/riducle.fd"

In [258]:
 # Scratch cell: os.path.basename keeps only the last component of the path
 os.path.basename(patou)

'riducle.fd'