In [2]:
import pandas as pd
import numpy as np
import glob
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

In [36]:
# Load roary and metadata
roary = pd.read_csv('../../results/roary/gene_presence_absence.csv')
metadata = pd.read_csv('../../results/roary/opa_with_prokka.csv', index_col = 0)
opa_metadata = pd.read_csv('../../results/opa_metadata_locus.csv', index_col = 0)

  roary = pd.read_csv('../../results/roary/gene_presence_absence.csv')


In [4]:
# Figure out which roary clusters the identified opa genes are from
roary_clusters = []
for i, row in metadata.iterrows():
    if len(roary[roary.eq(row['locus_tag']).any(axis=1)])==1:
        roary_clusters.append(roary[roary.eq(row['locus_tag']).any(axis=1)]['Gene'].values[0])
        
unique_roary_clusters = np.unique(roary_clusters)
print(unique_roary_clusters)

['group_10' 'group_1130' 'group_12' 'group_13' 'group_16' 'group_17'
 'group_2' 'group_20' 'group_21' 'group_22' 'group_24' 'group_25'
 'group_3' 'group_31' 'group_33' 'group_34' 'group_36' 'group_4'
 'group_4940' 'group_5' 'group_7' 'group_9' 'piiC_1' 'piiC_10' 'piiC_11'
 'piiC_2' 'piiC_3' 'piiC_4' 'piiC_5' 'piiC_6' 'piiC_7' 'piiC_8' 'piiC_9']


In [5]:
# Get the information about these roary clusters
roary_opa = pd.DataFrame(unique_roary_clusters, columns = ['Gene']).merge(roary)
print('Number of genes in the roary clusters corresponding to identified opa genes = ' + str(np.sum(roary_opa['No. sequences'])))

Number of genes in the roary clusters corresponding to identified opa genes = 2433


In [6]:
# Reformat the results so that each row contains the locus tag (above, each row contained the entire gene presence absence table)
general_columns = ['Gene', 
                   'Non-unique Gene name',
                   'Annotation', 
                   'No. isolates',
                   'No. sequences',
                   'Avg sequences per isolate',
                   'Genome Fragment',
                   'Order within Fragment',
                   'Accessory Fragment',
                   'Accessory Order with Fragment',
                   'QC',
                   'Min group size nuc',
                   'Max group size nuc',
                   'Avg group size nuc']
strains = np.sort(list(set(roary_opa.columns) - set(general_columns)))

roary_opa_by_gene = pd.DataFrame()
for strain in strains:
    roary_opa_by_gene_strain = roary_opa[['Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', strain]].dropna(subset = [strain])
    roary_opa_by_gene_strain.rename({'Gene':'roary', 'Non-unique Gene name':'prokka_gene_name', strain:'locus_tag'}, axis = 'columns', inplace = True)
    roary_opa_by_gene = pd.concat([roary_opa_by_gene,roary_opa_by_gene_strain])

roary_opa_by_gene.reset_index(inplace = True, drop = True)

In [83]:
# Figure out which ones were already found by our script
merged = roary_opa_by_gene.merge(metadata, on = 'locus_tag', how = 'outer', indicator = True)

metadata_only = merged[merged['_merge'] == 'right_only']
metadata_only.reset_index(drop = True, inplace = True)

roary_only = merged[merged['_merge'] == 'left_only']
roary_only.reset_index(drop = True, inplace = True)

# If there were two locus tags associated with an entry (may have happened if the ORF was split), only keep the first locus tag
roary_only['locus_tag'] = roary_only['locus_tag'].str.split(expand = True)[0]

# Get the locations of the genes from Prokka for the genes that were only identified by roary but not our algorithm
prokka_path = '../../results/annotations/'

filenames = glob.glob(prokka_path + '*.csv')
filenames = np.sort(filenames)
prokka_all = pd.DataFrame()
for filename in filenames:
    strain = os.path.basename(filename)[:-4]
    prokka = pd.read_csv(filename, index_col = 0)
    prokka['strain'] = strain
    prokka_all = pd.concat([prokka_all, prokka])
prokka_all.reset_index(drop = True, inplace = True)

roary_only = roary_only.drop(['chromosome_metadata', 'strand', 'start_cr', 'stop_cr', 'start_term', 'stop_term', 'id', 'start_metadata', 'stop', 'n_terminus', 'in_frame', 'chromosome_prokka', 'source', '_merge', 'strain', 'gene', 'start_prokka', 'end', 'lcb_num', 'ref_lcb_start',
       'ref_lcb_stop', 'ref_lcb_strand', 'query_lcb_start', 'query_lcb_stop',
       'query_lcb_strand', 'length_ratio', 'start_reordered',
       'start_reordered_flipped', 'strand_flipped', 'locus'], axis = 'columns')
roary_only = roary_only.merge(prokka_all, on = 'locus_tag', how = 'left')

# Check if the roary only hits were actually just ORFs split into 2 ORFs - use very strict cutoff since I would rather have more false positives than false negatives
roary_only.sort_values('start', inplace = True, ignore_index = True)
opa_metadata.sort_values('start', inplace = True, ignore_index = True)

roary_only = roary_only[pd.merge_asof(roary_only, opa_metadata, on = 'start', by=['strain','strand'], direction = 'nearest', tolerance = 700)['id'].isnull()]
roary_only.to_csv('../../results/roary/roary_only.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  roary_only['locus_tag'] = roary_only['locus_tag'].str.split(expand = True)[0]


In [84]:
# Get the sequences
path = '../../results/assemblies_shifted/'
roary_only_records = []
i = 0
for i, row in roary_only.iterrows():
    strain = row['strain']
    start = int(row['start'])
    stop = int(row['end'])
    strand = row['strand']
    
    record = list(SeqIO.parse(path + strain + '.fa', "fasta"))[0]
    
    if strand == 1: 
        roary_only_records.append(SeqRecord(record.seq[start-100:start+900], id = strain + '_' + str(start) + '_' + str(stop)))
    elif strand == -1:
        roary_only_records.append(SeqRecord(record.seq[stop-900:stop+100].reverse_complement(), id = strain + '_' + str(start) + '_' + str(stop)))
        
    # Add FA1090 sequences, with extra sequence before and after to test for deletions
for i, row in metadata[metadata['strain']=='FA1090'].iterrows():
    strain = row['strain']
    start = int(row['start_metadata'])
    stop = int(row['stop'])
    strand = row['strand']
    
    record = list(SeqIO.parse(path + 'FA1090.fa', "fasta"))[0]
    if strand == 1: 
        roary_only_records.append(SeqRecord(record.seq[start:stop], id = row['id']))
    elif strand == -1:
        roary_only_records.append(SeqRecord(record.seq[start:stop].reverse_complement(), id = row['id']))
        
SeqIO.write(roary_only_records, '../../results/roary/opa_roary_only_with_FA1090.fa', 'fasta')

In [91]:
'Number of opa genes found by our script that were not found by Roary: ' + str(len(metadata_only))

'Number of opa genes found by our script that were not found by Roary: 7'

In [92]:
metadata_only['id']

0      10794_opa_10
1        FQ82_opa_5
2       9460_opa_10
3       9464_opa_10
4     FFF007_opa_11
5    GCGS0423_opa_2
6     JJJ010_opa_11
Name: id, dtype: object