## Phylogenetic analysis of 16S rRNA sequences

Sergio Álvarez-Pérez, 2020

In [None]:
import sys
import os
import re
import Bio
import pandas as pd
from Bio import Entrez, SeqIO

In [None]:
# Create the output directories for the reports (include here your preferred path).
# os.makedirs(..., exist_ok=True) also creates any missing parent directory
# ('reports/' included) and does not fail when the directory already exists,
# so no separate existence check is needed.
os.makedirs('/home/sergio/TFM1/reports/16SrRNA/', exist_ok=True)

In [None]:
path = '/home/sergio/TFM1/'

# Metadata table of the target sequences
# (download this .xlsx file from GitHub to your preferred path).
metadata_file = os.path.join(path, 'Enterobacterales_16SrRNA.xlsx')

# Fail early with a clear message instead of leaving `df` undefined
# when the spreadsheet is not where it is expected.
if not os.path.isfile(metadata_file):
    raise FileNotFoundError(
        "Metadata spreadsheet not found: " + metadata_file
    )

# Index the table by accession number so rows can be looked up by sequence ID.
df = pd.read_excel(metadata_file).set_index('Accession_no')

# Remove blanks from the fields: spaces become underscores in the species
# name and are dropped from the strain name. The replacements are literal,
# not regular expressions.
df['Species'] = df['Species'].str.replace(" ", "_", regex=False)
df['Strain'] = df['Strain'].str.replace(" ", "", regex=False)

df

In [None]:
# Accession numbers of the target sequences (the index of the table `df`).
ids = list(df.index)

# Remove the enclosing triple quotes to retrieve the target sequences from the
# NCBI database and write them into a .fasta file.
# The output file is managed by `with` and every Entrez handle is closed in a
# `finally`, so nothing is left open if a request fails midway.
"""
Entrez.email = "xxxxxx@xxxxx.xx" # Include here a valid e-mail address!!
with open('/home/sergio/TFM1/reports/16SrRNA/16SrRNA.fasta', 'w') as myfile:
    for seq_id in ids:
        handle = Entrez.efetch(db="nucleotide", id=seq_id, rettype="fasta", retmode="text")
        try:
            record = handle.read()
        finally:
            handle.close()
        print (record)
        myfile.write(record)
"""

In [None]:
# Rewrite the header of each record in the .fasta file as
# <accession>_<species>_<strain>, taking species and strain from the table `df`.

original_file = '/home/sergio/TFM1/reports/16SrRNA/16SrRNA.fasta'
corrected_file = '/home/sergio/TFM1/reports/16SrRNA/16SrRNA_new.fasta'

# Captures everything before the last ".<suffix>" of a record ID (presumably
# the sequence version); `df` is looked up with the captured part.
versioned_id = re.compile(r"(.+)\..+")

with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle that is already open instead of re-opening the path.
    for record in SeqIO.parse(original, 'fasta'):
        id_match = versioned_id.match(record.id)
        # Fall back to the whole ID when it carries no ".<suffix>" part.
        accession = id_match.group(1) if id_match else record.id

        # Selecting with a one-element list always yields a Series; .item()
        # then raises unless exactly one row matches the accession.
        species = df.loc[[accession], 'Species'].item()
        strain = df.loc[[accession], 'Strain'].item()

        # Put the new label in the ID and clear the description: an empty ID
        # with a non-empty description is written as "> label" (leading space,
        # empty identifier) by the FASTA writer.
        record.id = accession + "_" + species + "_" + strain
        record.description = ""
        SeqIO.write(record, corrected, 'fasta')

### For the following commands, use the bash terminal:

In [None]:
# Multiple sequence alignment using MUSCLE:
# muscle -in /home/sergio/TFM1/reports/16SrRNA/16SrRNA_new.fasta -out /home/sergio/TFM1/reports/16SrRNA/16SrRNA.aln

In [None]:
# Removal of gaps and poorly aligned regions using Gblocks:
# Gblocks /home/sergio/TFM1/reports/16SrRNA/16SrRNA.aln 16SrRNA.aln -t=d -b4=5 -b5=h
# cat /home/sergio/TFM1/reports/16SrRNA/16SrRNA.aln-gb > /home/sergio/TFM1/reports/16SrRNA/16SrRNA_alignment.fasta

# Remove the 5' and 3' overhangs (e.g. using MEGA X) and save the file as 16SrRNA_final_alignment.fas
# mkdir /home/sergio/TFM1/reports/16SrRNA/16SrRNA_tree_gblocks/
# cp /home/sergio/TFM1/reports/16SrRNA/16SrRNA_final_alignment.fas /home/sergio/TFM1/reports/16SrRNA/16SrRNA_tree_gblocks/

# Phylogenetic tree using IQtree:
# iqtree -s /home/sergio/TFM1/reports/16SrRNA/16SrRNA_tree_gblocks/16SrRNA_final_alignment.fas -m TEST -alrt 1000 -bb 1000 -nt 4 

In [None]:
# Repetition of the phylogenetic analysis using the original (i.e. non-trimmed) alignment
# mkdir /home/sergio/TFM1/reports/16SrRNA/16SrRNA_tree/
# cat /home/sergio/TFM1/reports/16SrRNA/16SrRNA.aln > /home/sergio/TFM1/reports/16SrRNA/16SrRNA_tree/16SrRNA.fas

# Remove the 5' and 3' overhangs (e.g. using MEGA X) and save the file as 16SrRNA_aln.fas

# iqtree -s /home/sergio/TFM1/reports/16SrRNA/16SrRNA_tree/16SrRNA_aln.fas -alrt 1000 -bb 1000 -nt 4