## Analysis of PHASTER results

Sergio Álvarez-Pérez, 2020

In [None]:
import os
import re
import pandas as pd
import numpy as np

In [None]:
# Create the output folders for the reports (include here your preferred path).
# os.makedirs also creates any missing parent folder ('reports/' included) and,
# with exist_ok=True, does nothing when the folder is already there.
os.makedirs('/home/sergio/TFM1/reports/phaster/', exist_ok=True)

In [None]:
path = '/home/sergio/TFM1/phaster/' # path to the folder containing the outputs of PHASTER (lists of putative prophage regions)

# Names given to the fields that follow the region number on each data row of
# summary.txt, in the order in which they are read from the row.
summary_columns = (
    'REGION_LENGTH',
    'COMPLETENESS(score)',
    'SPECIFIC_KEYWORD',
    'REGION_POSITION',
    'TRNA_NUM',
    'TOTAL_PROTEIN_NUM',
    'PHAGE_HIT_PROTEIN_NUM',
    'HYPOTHETICAL_PROTEIN_NUM',
    'PHAGE+HYPO_PROTEIN_PERCENTAGE',
    'BACTERIAL_PROTEIN_NUM',
    'ATT_SITE_SHOWUP',
    'PHAGE_SPECIES_NUM',
    'MOST_COMMON_PHAGE_NAME(hit_genes_count)',
    'FIRST_MOST_COMMON_PHAGE_NUM',
    'FIRST_MOST_COMMON_PHAGE_PERCENTAGE',
    'GC_PERCENTAGE',
)

# Maps '<genome>_REGION_<region>' to a dict of the region's fields (all kept as strings)
mydict = {}

for foldername in os.listdir(path):
    summary_file = os.path.join(path, foldername, 'summary.txt')
    # Skip stray files in `path` and genome folders that have no summary.txt
    if not os.path.isfile(summary_file):
        continue
    genome = foldername  # the folder name is used as the genome identifier
    with open(summary_file) as myFile:
        for num, line in enumerate(myFile, 1):
            # Data rows are read from line 35 onwards; blank lines carry no region
            if num < 35 or not line.strip():
                continue
            # Trailing whitespace is removed first so that the newline does not
            # end up inside the last field (GC_PERCENTAGE).
            record = re.split(" +", line.rstrip())
            # record[0] is whatever precedes the first run of spaces (empty for
            # an indented row), so the region number is record[1] and the
            # remaining fields start at record[2].
            # NOTE(review): this assumes every data row starts with spaces, as
            # the original indexing did - confirm against the PHASTER output.
            region = record[1]
            mydict[genome + '_REGION_' + region] = {
                column: record[i] for i, column in enumerate(summary_columns, 2)
            }


In [None]:
# Build the table of putative prophage regions found in all Rosenbergiella genomes
# (one row per entry of mydict) and save it as a tab-separated file.
path='/home/sergio/TFM1/reports/phaster/'
df = pd.DataFrame(mydict).transpose()
df.to_csv(os.path.join(path, 'regions_per_species.tsv'), sep="\t")
df

In [None]:
path = '/home/sergio/TFM1/phaster/' # path to the folder containing the outputs of PHASTER 

# Patterns are compiled once and written as raw strings, so that "\." reaches
# the regex engine without relying on an invalid string escape.
region_header_re = re.compile(r'#### .* ####')           # any '#### ... ####' header line
region_name_re = re.compile(r'#### (region [0-9]+),.+')  # captures e.g. 'region 3'
cds_line_re = re.compile(r'[0-9]+\.\.[0-9]+|complement.+')  # rows starting with 'N..M' or 'complement'

# Maps '<genome>_<region>_<n>' to the fields of the n-th CDS row of that region
mydict2 = {}

for foldername in os.listdir(path):
    detail_file = os.path.join(path, foldername, 'detail.txt')
    # Skip stray files in `path` and genome folders that have no detail.txt
    if not os.path.isfile(detail_file):
        continue
    genome = foldername  # the folder name is used as the genome identifier
    # Reset the parser state for every file so that nothing is carried over
    # from the previous genome.
    region = None
    n = 0
    with open(detail_file) as myFile:
        for num, line in enumerate(myFile, 1):
            # Only lines from the sixth one onwards are parsed
            if num < 6:
                continue
            if region_header_re.match(line):
                name_match = region_name_re.match(line)
                if name_match is None:
                    raise ValueError(
                        '{}, line {}: header without a region name: {!r}'.format(
                            detail_file, num, line))
                region = name_match.group(1)
                n = 0  # CDS rows are numbered from 1 within each region
            elif cds_line_re.match(line):
                if region is None:
                    raise ValueError(
                        '{}, line {}: CDS row found before any region header'.format(
                            detail_file, num))
                n += 1
                # Columns are separated by runs of two or more spaces
                record = re.split('  +', line)
                mydict2[genome + '_' + region + '_' + str(n)] = {
                    'CDS position': record[0],
                    'Blast hit': record[1],
                    'E-value': record[2],
                    'Sequence': record[3].strip('\n'),
                }


In [None]:
# Build the table of CDS found in all putative prophage regions of all
# Rosenbergiella genomes (one row per entry of mydict2) and save it as a
# tab-separated file.
path='/home/sergio/TFM1/reports/phaster/'
df2 = pd.DataFrame(mydict2).transpose()
df2.to_csv(os.path.join(path, 'cds_per_genome_and_region.tsv'), sep="\t")
df2