In [None]:
# Dans le dossier d'analyse, faire un dossier queries et y placer les sequences proteiques des genes
# de virulence que l'on souhaite tester.
# NB. Les sequences peuvent etre des fichiers multi-fasta (plusieurs versions d'un meme gene)
# ou encore des fichiers fasta avec une seule séquence
# Par exemple, le fichier eae.fasta.fasta contient différentes versions de gènes d'intimins (beta, epsilon, etc)
# bfp.fasta contient les genes bfp tires d'un plasmide
# ehxA contient la sequence de reference pour l'hemolysine
# nommer correctement ces genes, car ils seront utilises dans la sortie du programme

In [1]:
import os
import glob
import re
from pathlib import Path
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio.Alphabet import IUPAC

In [2]:
def get_files(start_dir, file_type):
    """ Given a directory, make a list of files
    This function will find all files recursively.
    """
    files = []
    for filename in Path(start_dir).glob('**/*.' + file_type):
        files.append(str(filename))
    return files

In [3]:
# Preparation of a database of CDS from all strains!
cwd = os.getcwd()
if not os.path.exists(cwd + '/database'):
    os.mkdir(cwd + '/database')
if not os.path.exists(cwd + '/xml'):
    os.mkdir(cwd + '/xml')
    
gbk_files = get_files("./MS_minION_all_gbk", "gbk")

# Doing it on a bunch of gbk files
for gbk_i, gbk_file in enumerate(gbk_files):
    
    # Work on UNIX filesystems, name the fasta file from the gbk file
    outfile_name = os.path.basename(gbk_file).replace('.gbk', '')
    print('Adding ', outfile_name, sep=' ')
    
    # Prepare a fasta file for writing
    with open("database/" + outfile_name + '.fasta', "w") as output_handle:
        records = SeqIO.parse(gbk_file, "genbank")

        for record in records:
            # print('  Dealing with ', record.description, flights_2 %>% filter(is.na(ARR_TIME)) %>% filter(!is.na(DEP_TIME)) %>% filter(FL_DATE == '2001-09-05record.name, sep=' ')
            for i_feature, feature in enumerate(record.features):
            
                if feature.type == 'CDS':
                    if 'gene' in feature.qualifiers: # if a gene tag is present use it the name
                        feature_id = feature.qualifiers['gene'][0] + '_' + record.name +  '_' + outfile_name
                    else:
                        feature_id = feature.qualifiers['locus_tag'][0] + '_' + record.name + '_' + outfile_name
                
                    # Create the sequence object
                    my_seq = Seq(feature.qualifiers['translation'][0],IUPAC.protein)
                
                    # Create a SeqRecord object with the protein sequence
                    simple_seq_r = SeqRecord(my_seq, id=feature_id, description = '')
                    SeqIO.write(simple_seq_r, output_handle, "fasta")

Adding  Res13-Fini-PEC01-29_plasmid_22
Adding  Res13-Fini-PEC01-29_chromosome
Adding  Res13-Fini-PEC01-29_plasmid_novel_0
Adding  Res13-Fini-PEC01-29_plasmid_3
Adding  Res13-Fini-PEC01-29_plasmid_2912
Adding  Res13-Fini-PEC01-29_plasmid_476
Adding  Res13-Fini-PEC01-29_plasmid_539
Adding  Res13-Lact-EA17-20_plasmid_509
Adding  Res13-Lact-EA17-20_plasmid_839
Adding  Res13-Lact-EA17-20_chromosome
Adding  Res13-Lact-EA17-20_plasmid_644
Adding  Res13-Lact-EA17-20_plasmid_29
Adding  Res13-Lact-EA17-20_plasmid_539
Adding  Res13-Lact-PER02-33_plasmid_novel_1
Adding  Res13-Lact-PER02-33_plasmid_34
Adding  Res13-Lact-PER02-33_chromosome
Adding  Res13-Lact-PER02-33_plasmid_novel_0
Adding  Res13-Lact-PER02-33_plasmid_novel_2
Adding  Res13-Lact-PER02-33_plasmid_1009
Adding  Res13-Lact-PER02-33_plasmid_1068
Adding  Res13-Lact-PER02-33_plasmid_14
Adding  Res13-Lact-PER02-33_plasmid_476
Adding  Res13-Sevr-LER1-34_plasmid_novel_1
Adding  Res13-Sevr-LER1-34_plasmid_34
Adding  Res13-Sevr-LER1-34_chromoso

Adding  Res13-Abat-PEA25-P4-01-A_plasmid_novel_0
Adding  Res13-Lact-LEB3-12_plasmid_259
Adding  Res13-Lact-LEB3-12_plasmid_973
Adding  Res13-Lact-LEB3-12_plasmid_2911
Adding  Res13-Lact-LEB3-12_chromosome
Adding  Res13-Lact-LEB3-12_plasmid_47
Adding  R13-CV2-pWea-04_plasmid_novel_1
Adding  R13-CV2-pWea-04_plasmid_5
Adding  R13-CV2-pWea-04_plasmid_1140
Adding  R13-CV2-pWea-04_plasmid_825
Adding  R13-CV2-pWea-04_chromosome
Adding  R13-CV2-pWea-04_plasmid_novel_0
Adding  R13-CV2-pWea-04_plasmid_1008
Adding  R13-CV2-pWea-04_plasmid_1068
Adding  R13-CV2-pWea-04_plasmid_2912
Adding  R13-CV2-pWea-04_plasmid_14
Adding  Res13-Fini-PEA19-38_chromosome
Adding  Res13-Fini-PEA19-38_plasmid_novel_0
Adding  Res13-Fini-PEA19-38_plasmid_540
Adding  Res13-Fini-PEA19-38_plasmid_1068
Adding  Res13-Lact-EB13-15_chromosome
Adding  Res13-Lact-EB13-15_plasmid_23
Adding  Res13-Lact-EB13-15_plasmid_972
Adding  Res13-Lact-EB13-15_plasmid_476
Adding  Res13-Lact-EB13-15_plasmid_539
Adding  Res13-Lact-EB13-15_plasm

Adding  R13-AF34-pFin-01_plasmid_novel_0
Adding  R13-AF34-pFin-01_plasmid_540
Adding  R13-AF34-pFin-01_plasmid_1068
Adding  Res13-Lact-ER10-16_plasmid_novel_1
Adding  Res13-Lact-ER10-16_plasmid_444
Adding  Res13-Lact-ER10-16_plasmid_2911
Adding  Res13-Lact-ER10-16_chromosome
Adding  Res13-Lact-ER10-16_plasmid_novel_0
Adding  Res13-Lact-ER10-16_plasmid_novel_2
Adding  Res13-Lact-ER10-16_plasmid_1009
Adding  Res13-Fini-PEC03-13_plasmid_22
Adding  Res13-Fini-PEC03-13_chromosome
Adding  Res13-Fini-PEC03-13_plasmid_novel_0
Adding  Res13-Fini-PEC03-13_plasmid_3
Adding  Res13-Fini-PEC03-13_plasmid_476
Adding  Res13-Fini-PEC03-13_plasmid_539
Adding  Res13-Lact-PER09-33_chromosome
Adding  Res13-Lact-PER09-33_plasmid_476
Adding  R13-AF33-pWea-01_plasmid_529
Adding  R13-AF33-pWea-01_plasmid_2911
Adding  R13-AF33-pWea-01_chromosome
Adding  R13-AF33-pWea-01_plasmid_novel_0
Adding  R13-AF33-pWea-01_plasmid_282
Adding  R13-AF33-pWea-01_plasmid_1353
Adding  R13-AF33-pWea-01_plasmid_46
Adding  R13-AF33

Adding  Res13-Abat-PEB01-P1-04-A_plasmid_novel_0
Adding  Res13-Abat-PEB01-P1-04-A_plasmid_novel_2
Adding  Res13-Lact-ER01-35_chromosome
Adding  Res13-Lact-ER01-35_plasmid_1009
Adding  Res13-Lact-ER01-35_plasmid_476
Adding  Res13-Lact-PER01-40_plasmid_novel_1
Adding  Res13-Lact-PER01-40_plasmid_34
Adding  Res13-Lact-PER01-40_chromosome
Adding  Res13-Lact-PER01-40_plasmid_novel_0
Adding  Res13-Lact-PER01-40_plasmid_1009
Adding  Res13-Lact-PER01-40_plasmid_1068
Adding  Res13-Lact-PER01-40_plasmid_14
Adding  Res13-Lact-PER01-40_plasmid_476
Adding  Res13-Abat-PEC13-P2-01_plasmid_novel_1
Adding  Res13-Abat-PEC13-P2-01_chromosome
Adding  Res13-Abat-PEC13-P2-01_plasmid_novel_0
Adding  Res13-Abat-PEC13-P2-01_plasmid_novel_2
Adding  Res13-Lact-EA06-22_plasmid_973
Adding  Res13-Lact-EA06-22_chromosome
Adding  Res13-Lact-EA06-22_plasmid_23
Adding  Res13-Lact-EA06-22_plasmid_920
Adding  Res13-Lact-EA06-22_plasmid_476
Adding  Res13-Lact-EA06-22_plasmid_32
Adding  Res13-Lact-LEB3-09_plasmid_973
Addin

Adding  Res13-Lact-EB12-33_plasmid_23
Adding  Res13-Lact-EB12-33_plasmid_972
Adding  Res13-Lact-EB12-33_plasmid_476
Adding  Res13-Lact-EB12-33_plasmid_539
Adding  Res13-Lact-EB12-33_plasmid_32
Adding  Res13-Sevr-PEA18-33_plasmid_509
Adding  Res13-Sevr-PEA18-33_plasmid_novel_1
Adding  Res13-Sevr-PEA18-33_plasmid_465
Adding  Res13-Sevr-PEA18-33_plasmid_973
Adding  Res13-Sevr-PEA18-33_chromosome
Adding  Res13-Sevr-PEA18-33_plasmid_novel_0
Adding  Res13-Sevr-PEA18-33_plasmid_546
Adding  Res13-Sevr-PEA18-33_plasmid_novel_2
Adding  Res13-Sevr-PEA18-33_plasmid_1068
Adding  Res13-Sevr-PEA18-33_plasmid_261
Adding  Res13-Sevr-PEA18-33_plasmid_23
Adding  Res13-Sevr-PEA18-33_plasmid_476
Adding  Res13-Sevr-PEA18-33_plasmid_novel_3
Adding  R13-AF11-pWea-03_plasmid_191
Adding  R13-AF11-pWea-03_plasmid_529
Adding  R13-AF11-pWea-03_plasmid_2070
Adding  R13-AF11-pWea-03_chromosome
Adding  R13-AF11-pWea-03_plasmid_32
Adding  Res13-Fini-PEB02-13_plasmid_960
Adding  Res13-Fini-PEB02-13_chromosome
Adding  R

Adding  R13-AF21-pWea-04_plasmid_novel_0
Adding  R13-AF21-pWea-04_plasmid_53
Adding  Res13-Fini-PEA17-13_plasmid_509
Adding  Res13-Fini-PEA17-13_plasmid_834
Adding  Res13-Fini-PEA17-13_plasmid_932
Adding  Res13-Fini-PEA17-13_chromosome
Adding  Res13-Fini-PEA17-13_plasmid_novel_0
Adding  Res13-Fini-PEA17-13_plasmid_1561
Adding  Res13-Fini-PEA17-13_plasmid_476
Adding  Res13-Fini-PEA17-13_plasmid_47
Adding  R13-AF12-pSow-02_plasmid_189
Adding  R13-AF12-pSow-02_plasmid_731
Adding  R13-AF12-pSow-02_chromosome
Adding  R13-AF12-pSow-02_plasmid_novel_0
Adding  R13-AF12-pSow-02_plasmid_690
Adding  Res13-Sevr-PEB02-10_plasmid_509
Adding  Res13-Sevr-PEB02-10_plasmid_novel_1
Adding  Res13-Sevr-PEB02-10_plasmid_465
Adding  Res13-Sevr-PEB02-10_plasmid_973
Adding  Res13-Sevr-PEB02-10_chromosome
Adding  Res13-Sevr-PEB02-10_plasmid_novel_0
Adding  Res13-Sevr-PEB02-10_plasmid_546
Adding  Res13-Sevr-PEB02-10_plasmid_novel_2
Adding  Res13-Sevr-PEB02-10_plasmid_1068
Adding  Res13-Sevr-PEB02-10_plasmid_261


Adding  Res13-Lact-ER02-39_plasmid_novel_0
Adding  Res13-Lact-ER02-39_plasmid_novel_2
Adding  Res13-Lact-ER02-39_plasmid_1009
Adding  Res13-Lact-ER02-39_plasmid_1068
Adding  Res13-Lact-ER02-39_plasmid_14
Adding  Res13-Lact-ER02-39_plasmid_476
Adding  Res13-Lact-EB03-01_plasmid_471
Adding  Res13-Lact-EB03-01_plasmid_887
Adding  Res13-Lact-EB03-01_plasmid_973
Adding  Res13-Lact-EB03-01_chromosome
Adding  Res13-Lact-EB03-01_plasmid_novel_0
Adding  Res13-Lact-EB03-01_plasmid_229
Adding  Res13-Lact-EB03-01_plasmid_1009
Adding  Res13-Lact-EB03-01_plasmid_476
Adding  Res13-Sevr-PEC09-46_plasmid_106
Adding  Res13-Sevr-PEC09-46_plasmid_708
Adding  Res13-Sevr-PEC09-46_chromosome
Adding  Res13-Sevr-PEC09-46_plasmid_1009
Adding  Res13-Abat-PEA25-P4-02_plasmid_98
Adding  Res13-Abat-PEA25-P4-02_chromosome
Adding  Res13-Abat-PEA25-P4-02_plasmid_novel_0
Adding  R13-AF31-pSow-03_plasmid_725
Adding  R13-AF31-pSow-03_plasmid_529
Adding  R13-AF31-pSow-03_plasmid_887
Adding  R13-AF31-pSow-03_chromosome
Add

Adding  Res13-Lact-PEA27-11_plasmid_novel_0
Adding  Res13-Lact-PEA27-11_plasmid_476
Adding  Res13-Lact-PEA02-11_plasmid_973
Adding  Res13-Lact-PEA02-11_chromosome
Adding  Res13-Sevr-LER1-36-a_chromosome
Adding  R13-CV2-pWea-03_plasmid_novel_1
Adding  R13-CV2-pWea-03_plasmid_958
Adding  R13-CV2-pWea-03_plasmid_1553
Adding  R13-CV2-pWea-03_chromosome
Adding  R13-CV2-pWea-03_plasmid_novel_0
Adding  R13-CV2-pWea-03_plasmid_novel_2
Adding  R13-CV2-pWea-03_plasmid_1009
Adding  R13-CV2-pWea-03_plasmid_476
Adding  R13-CV2-pWea-03_plasmid_35
Adding  Res13-Lact-PER12-33-A_plasmid_34
Adding  Res13-Lact-PER12-33-A_chromosome
Adding  Res13-Lact-PER12-33-A_plasmid_novel_0
Adding  Res13-Lact-PER12-33-A_plasmid_541
Adding  Res13-Lact-PER12-33-A_plasmid_1009
Adding  Res13-Lact-PER12-33-A_plasmid_1068
Adding  Res13-Lact-PER12-33-A_plasmid_14
Adding  Res13-Lact-PER12-33-A_plasmid_476
Adding  R13-AF12-pSow-04_plasmid_973
Adding  R13-AF12-pSow-04_chromosome
Adding  R13-AF12-pSow-04_plasmid_novel_0
Adding  

Adding  Res13-Fini-PEC02-13_plasmid_novel_0
Adding  Res13-Fini-PEC02-13_plasmid_3
Adding  Res13-Fini-PEC02-13_plasmid_2912
Adding  Res13-Fini-PEC02-13_plasmid_476
Adding  Res13-Fini-PEC02-13_plasmid_539
Adding  Res13-Abat-PEA25-P5-02_plasmid_novel_1
Adding  Res13-Abat-PEA25-P5-02_chromosome
Adding  Res13-Abat-PEA25-P5-02_plasmid_novel_0
Adding  Res13-Abat-PEA25-P5-02_plasmid_1836
Adding  Res13-Abat-PEA25-P5-02_plasmid_1774
Adding  R13-AF22-pSow-04_plasmid_novel_1
Adding  R13-AF22-pSow-04_plasmid_973
Adding  R13-AF22-pSow-04_chromosome
Adding  R13-AF22-pSow-04_plasmid_novel_0
Adding  R13-AF22-pSow-04_plasmid_novel_2
Adding  R13-AF22-pSow-04_plasmid_490
Adding  R13-AF22-pSow-04_plasmid_972
Adding  R13-AF22-pSow-04_plasmid_29
Adding  R13-AF22-pSow-04_plasmid_226
Adding  Res13-Croi-PEB21-10_plasmid_973
Adding  Res13-Croi-PEB21-10_chromosome
Adding  Res13-Croi-PEB21-10_plasmid_476
Adding  Res13-Abat-PEC12-P3-01_plasmid_novel_1
Adding  Res13-Abat-PEC12-P3-01_chromosome
Adding  Res13-Abat-PEC

### In a terminal prepare a NCBI blast+ db


In [None]:
### cat *.fasta > MS_strains_v2
### makeblastdb -in t6ss.fasta -input_type fasta  -dbtype prot -title T6SS_stuff -out T6SS

In [5]:
# Perform blastp searches against the MS_strains database
# The MS_strains NCBI+ database contains protein sequences from the genomes of the 244 strains included in the manuscript 
# Output will be in XML folders
# Create xml folder if it not exists
cwd = os.getcwd()
if not os.path.exists(cwd + '/xml'):
    os.mkdir(cwd + '/xml')

# First grab the query files in the queries folder
virulence_factors = get_files ("./queries", "fasta")

# Next lauch blastp searches from biopython
for f in virulence_factors:
    outname =  os.path.basename(f).replace('.fasta','')
    blastp_cline = NcbiblastpCommandline(query = f, db="MS_strains_v2", evalue=1e-6, outfmt=5, out='xml/' + outname + '.xml')
    stdout, stderr = blastp_cline()



In [6]:
# This code provide nice output of very high-quality aligments
# And give the names of the strains that contains the query sequence
# Will work only if the query contains multiples versions of the same gene, e.g. many slightly related intimins

# First grab the bastp output files in the xml folder
xml_files = get_files ("./xml", "xml")

p = re.compile('.+\_(Res13[^_]+)\_(\S+)')  # a pattern for identifying our strains in alignemnt titles
E_VALUE_THRESH = 1e-80
S = set()
for f in xml_files:
    virulence_gene_name = os.path.basename(f).replace('.xml','')
    print ("Analyzing", virulence_gene_name, sep=' ')
    result_handle = open(f)
    from Bio.Blast import NCBIXML
    blast_records = NCBIXML.parse(result_handle)
    for blast_record_i, blast_record in enumerate(blast_records, start=1):
        print('* Query #' + repr(blast_record_i) + ' is ' + blast_record.query + '*', end='')
        print('(length=' + repr(blast_record.query_letters) + ') ', end='\n')
        
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                # restrict to near full_length alignment
                if ((hsp.expect < E_VALUE_THRESH) & (hsp.align_length/blast_record.query_letters > 0.95)):
                    #print("Hit found in", f)
                    print("****Alignment****")
                    print("sequence:", alignment.title)
                    m = p.match(alignment.title) # look if the pattern can be found in the alignment title
                    if m:
                        S.add( (virulence_gene_name, m.group(1), m.group(2)) ) # add a tuple to our set S
                    print("length:", alignment.length)
                    print("e value:", hsp.expect)
                    print(hsp.query[0:75] + "...")
                    print(hsp.match[0:75] + "...")
                    print(hsp.sbjct[0:75] + "...")

Analyzing ehxA
* Query #1 is YP_308794.1 hemolysin A (plasmid) [Escherichia coli]*(length=998) 
****Alignment****
sequence: gnl|BL_ORD_ID|1334003 hlyA_p1561_tig141_Res13-Sevr-PEA27-14_plasmid_1561
length: 998
e value: 0.0
MTVNKIKNIFNNATSTTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINELVKAADELGIEIHRTERDDTAIA...
MTVNKIKNIFNNAT TTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINELVKAADELGIEIHRTERDDTAIA...
MTVNKIKNIFNNATLTTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINELVKAADELGIEIHRTERDDTAIA...
****Alignment****
sequence: gnl|BL_ORD_ID|1328975 hlyA_p1561_tig148_Res13-Sevr-PEA24-03_plasmid_1561
length: 998
e value: 0.0
MTVNKIKNIFNNATSTTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINELVKAADELGIEIHRTERDDTAIA...
MTVNKIKNIFNNAT TTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINELVKAADELGIEIHRTERDDTAIA...
MTVNKIKNIFNNATLTTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINELVKAADELGIEIHRTERDDTAIA...
****Alignment****
sequence: gnl|BL_ORD_ID|1323953 hlyA_p1561_tig129_Res13-Sevr-PEA20-33_plasmid_1561
length: 998
e value: 0.0
MTVNKIKNIFNNATSTTKSAFNTASSSVRSAGKKLILLIPDNYEAQGVGINE

* Query #6 is bfpD*(length=534) 
****Alignment****
sequence: gnl|BL_ORD_ID|675002 PHBEAHDP_00712_chr_tig34_Res13-Lact-EA06-47_chromosome
length: 544
e value: 0.0
MLNKTEKTSDLM-FERFKRNVSEIVTGDGGELELTVEQRKYFLIFKNGDFLVSSCHMKHHLVQMLREIATRKGYP...
M+ +  KT+D   F RF++N+S+IVT  GG +E T EQ+K  +I+KNGDFL+S+ H+++  ++ L+E+A RKG  ...
MIVEQLKTADDQDFLRFRKNISDIVTVKGGLIETTDEQKKICIIYKNGDFLISTEHLENPSIRFLKEVAIRKGIQ...
* Query #7 is bfpE*(length=352) 
****Alignment****
sequence: gnl|BL_ORD_ID|675003 pilR_chr_tig34_Res13-Lact-EA06-47_chromosome
length: 356
e value: 9.26739e-164
RLLFTSKTRMRVFSKLSRYLSNGVPVTFALAELYKFTSDEGRKKDNPDAFALQRWLIAVRNGKTLAEAMRGWVPF...
++LFTSK RMR++ KL+RYL+NGVP+TFAL ELYKFT+D G++   P A A+  W I++RNG +L +A++GWVP ...
KMLFTSKLRMRIYEKLARYLANGVPLTFALDELYKFTTDSGKRNKTPQAIAIHMWSISIRNGDSLTKALKGWVPE...
* Query #8 is bfpF*(length=331) 
****Alignment****
sequence: gnl|BL_ORD_ID|675004 PHBEAHDP_00714_chr_tig34_Res13-Lact-EA06-47_chromosome
length: 321
e value: 1.82039e-146
INELNFADLLISKDSYNFRFLEGQPYPICNVP

* Query #3 is KT591262.1_1 Escherichia coli strain EP064_lambda intimin (eae) gene, complete cds*(length=939) 
****Alignment****
sequence: gnl|BL_ORD_ID|1330723 eae_1_chr_tig47_Res13-Sevr-PEA27-14_chromosome
length: 934
e value: 0.0
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
****Alignment****
sequence: gnl|BL_ORD_ID|1327150 eae_1_chr_tig102_Res13-Sevr-PEA24-03_chromosome
length: 934
e value: 0.0
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
****Alignment****
sequence: gnl|BL_ORD_ID|1321686 eae_1_chr_tig88_Res13-Sevr-PEA20-33_chromosome
length: 934
e value: 0.0
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLG

* Query #5 is KT591272.1_1 Escherichia coli strain EP113_epsilon2 intimin (eae) gene, complete cds*(length=949) 
****Alignment****
sequence: gnl|BL_ORD_ID|1330723 eae_1_chr_tig47_Res13-Sevr-PEA27-14_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
MITHG Y RTRHKHKLKKT IMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
****Alignment****
sequence: gnl|BL_ORD_ID|1327150 eae_1_chr_tig102_Res13-Sevr-PEA24-03_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
MITHG Y RTRHKHKLKKT IMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
****Alignment****
sequence: gnl|BL_ORD_ID|1321686 eae_1_chr_tig88_Res13-Sevr-PEA20-33_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFK

* Query #7 is KT591297.1_1 Escherichia coli strain EP017_iota2 intimin (eae) gene, complete cds*(length=938) 
****Alignment****
sequence: gnl|BL_ORD_ID|1348419 eae_pnovel_0_tig35_Res13-Sevr-PEB04-11_plasmid_novel_0
length: 935
e value: 0.0
MITHGCYTRTRHKHKLKKTFVMLSAGLGLFFYVNQNSFANGENYFKLSSDSKLLTQNAAQNRLFYTLKTGETVAD...
MITHG Y RTRHKHKLKKTF+MLSAGLGLFFYVNQNSFANGENYFKLSSDSKLLTQN AQ+RLFYTLKTGETV+ ...
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLSSDSKLLTQNVAQDRLFYTLKTGETVSS...
****Alignment****
sequence: gnl|BL_ORD_ID|1013235 eae_chr_tig181_Res13-Lact-PEA14-20_chromosome
length: 935
e value: 0.0
MITHGCYTRTRHKHKLKKTFVMLSAGLGLFFYVNQNSFANGENYFKLSSDSKLLTQNAAQNRLFYTLKTGETVAD...
MITHG Y RTRHKHKLKKTF+MLSAGLGLFFYVNQNSFANGENYFKLSSDSKLLTQN AQ+RLFYTLKTGETV+ ...
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLSSDSKLLTQNVAQDRLFYTLKTGETVSS...
****Alignment****
sequence: gnl|BL_ORD_ID|975823 eae_chr_tig181_Res13-Lact-PEA10-41_chromosome
length: 935
e value: 0.0
MITHGCYTRTRHKHKLKKTFVMLSAGLGLFFYVNQNSFANGENYF

* Query #11 is KT591322.1_1 Escherichia coli strain EP084_zeta intimin (eae) gene, complete cds*(length=939) 
****Alignment****
sequence: gnl|BL_ORD_ID|1330723 eae_1_chr_tig47_Res13-Sevr-PEA27-14_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHNSYQNRLFYTLKTGETVAD...
MITHG Y RTRHKHKLKKT IMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTH+SYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
****Alignment****
sequence: gnl|BL_ORD_ID|1327150 eae_1_chr_tig102_Res13-Sevr-PEA24-03_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHNSYQNRLFYTLKTGETVAD...
MITHG Y RTRHKHKLKKT IMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTH+SYQNRLFYTLKTGETVAD...
MITHGCYTRTRHKHKLKKTLIMLSAGLGLFFYVNQNSFANGENYFKLGSDSKLLTHDSYQNRLFYTLKTGETVAD...
****Alignment****
sequence: gnl|BL_ORD_ID|1321686 eae_1_chr_tig88_Res13-Sevr-PEA20-33_chromosome
length: 934
e value: 0.0
MITHGFYARTRHKHKLKKTFIMLSAGLGLFFYVNQNSFANGENYFKLGS

In [7]:
# Now discover which strains have the complete query genes
L = list(S) # Change our Set to a mutable List
L.sort(key=lambda tup: tup[1]) # Sort by strain
L

[('paa', 'Res13-Croi-PEA15-12', 'chromosome'),
 ('ehxA', 'Res13-Croi-PEA15-12', 'plasmid_1561'),
 ('eae', 'Res13-Croi-PEA15-12', 'chromosome'),
 ('ehxA', 'Res13-Croi-PEA21-28', 'plasmid_1561'),
 ('eae', 'Res13-Croi-PEA21-28', 'chromosome'),
 ('paa', 'Res13-Croi-PEA21-28', 'chromosome'),
 ('paa', 'Res13-Croi-PEA22-35', 'chromosome'),
 ('ehxA', 'Res13-Croi-PEA22-35', 'plasmid_1561'),
 ('eae', 'Res13-Croi-PEA22-35', 'chromosome'),
 ('eae', 'Res13-Croi-PEA24-40', 'chromosome'),
 ('paa', 'Res13-Croi-PEA24-40', 'chromosome'),
 ('ehxA', 'Res13-Croi-PEA24-40', 'plasmid_1561'),
 ('eae', 'Res13-Croi-PEA27-33', 'chromosome'),
 ('paa', 'Res13-Croi-PEA27-33', 'chromosome'),
 ('ehxA', 'Res13-Croi-PEA27-33', 'plasmid_1561'),
 ('paa', 'Res13-Croi-PER03-14', 'plasmid_377'),
 ('eae', 'Res13-Fini-PEA17-13', 'chromosome'),
 ('paa', 'Res13-Fini-PEA17-13', 'chromosome'),
 ('ehxA', 'Res13-Fini-PEA17-13', 'plasmid_1561'),
 ('bfp', 'Res13-Lact-EA06-47', 'chromosome'),
 ('eae', 'Res13-Lact-EA10-01', 'chromosome

### End of analysis

### Extra pieces of code

#### Code below explain how the bfp query was created

In [8]:
'''Now we are looking for bundle forming pili genes (bfp) in our strains
We will use the Genbank Accession number AB024946 (S.abraham et al. )
Go get the bfp gene in this Genbank record of this plasmid
and then prepare a multifasta file with bfp genes present on a plasmid
'''
from Bio import Entrez
Entrez.email ="jean-simon.brouard@canada.ca"
handle = Entrez.efetch(db="nucleotide", id='AB024946', rettype="gb", retmode="text")
with open('./AB024946.gb', "w") as fp:
    fp.write(handle.read())
    
# Read only 1 record
record = SeqIO.read("AB024946.gb", "genbank")

# Prepare a fasta file for writing
with open('queries/bfp.fasta', "w") as output_handle:
        
    for i_feature, feature in enumerate(record.features):
        if feature.type == 'CDS':
            if 'gene' in feature.qualifiers: # if a gene tag is present use it the name
                if 'bfp' in feature.qualifiers['gene'][0]:
                    feature_id = feature.qualifiers['gene'][0]
                    # Create the sequence object
                    my_seq = Seq(feature.qualifiers['translation'][0],IUPAC.protein)
                
                    # Create a SeqRecord object with the protein sequence
                    simple_seq_r = SeqRecord(my_seq, id=feature_id, description = '')
                    SeqIO.write(simple_seq_r, output_handle, "fasta")

#### Just to show how we can iterate on blast record descriptions

In [None]:
# Code for outputing the description lines of blast output
# We have aligments, bit scores and E-value
# Note that there is 1 BLAST record object/query if many sequences are given as query
# In addition, hits with good E-value are found in all strains, but one will want to look
# at the aligments to determine if the eae gene is present or not (see the cell below)!
E_VALUE_THRESH = 1e-80
for f in xml_files:
    print ("Analyzing", f, sep=' ')
    result_handle = open(f)
    from Bio.Blast import NCBIXML
    blast_records = NCBIXML.parse(result_handle)
    for blast_record in blast_records:
        for description in blast_record.descriptions:
            print(description)
