In [13]:
import pandas as pd
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation and
# chained-assignment warnings raised by later cells, and the two calls below
# look redundant (both install a catch-all 'ignore' filter) — confirm and narrow.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [14]:
# Load the tables written by the previous analysis step.
# Both files are read as tab-delimited even though they carry a .csv extension.
yih_homologs_loci = pd.read_csv('yih_homologs_loci.csv', sep='\t')
target2info_data = pd.read_csv('target2info_data_altToGFFs.csv', sep='\t')

In [15]:
# Preview the first rows of the per-gene annotation table.
target2info_data.head()

Unnamed: 0,target,target_info,target_location,strand,start,end,locus_tag,chr_id
0,CP018976.1_cds_AQV50030.1_2,lcl|CP018976.1_cds_AQV50030.1_2 [locus_tag=BE9...,1..1497,+,1,1497,BE949_01660,CP018976.1
1,CP018976.1_cds_AQV50031.1_3,lcl|CP018976.1_cds_AQV50031.1_3 [locus_tag=BE9...,1497..1853,+,1497,1853,BE949_01665,CP018976.1
2,CP018976.1_cds_AQV50032.1_4,lcl|CP018976.1_cds_AQV50032.1_4 [locus_tag=BE9...,1853..2122,+,1853,2122,BE949_01670,CP018976.1
3,CP018976.1_cds_AQV50033.1_5,lcl|CP018976.1_cds_AQV50033.1_5 [locus_tag=BE9...,2089..2271,+,2089,2271,BE949_01675,CP018976.1
4,CP018976.1_cds_AQV50034.1_6,lcl|CP018976.1_cds_AQV50034.1_6 [locus_tag=BE9...,2264..4099,+,2264,4099,BE949_01680,CP018976.1


# Generate annotation file that schematically plots _yih_ gene cassettes

In [410]:
def get_domains(strand, start, mid, gene, color, end):
    """Return the iTOL domain string that draws one gene as a two-part arrow.

    The gene is split at ``mid`` into a labelled rectangle (``RE``) and an
    unlabelled triangle (``TL`` or ``TR``), both filled with ``color``.

    Parameters
    ----------
    strand : str
        ``"-"`` puts the ``TL`` triangle first (``start``..``mid``) followed by
        the rectangle (``mid``..``end``); any other value puts the rectangle
        first (``start``..``mid``) followed by a ``TR`` triangle (``mid``..``end``).
    start, mid, end
        Coordinates interpolated verbatim into the string.
    gene
        Label attached to the rectangle part.
    color
        Fill colour used for both parts.

    Returns
    -------
    str
        Two comma-separated ``SHAPE|START|END|COLOR|LABEL`` fields.
    """
    on_minus_strand = strand == "-"
    if on_minus_strand:
        tip = f"TL|{start}|{mid}|{color}|"
        box = f"RE|{mid}|{end}|{color}|{gene}"
        return tip + "," + box

    box = f"RE|{start}|{mid}|{color}|{gene}"
    tip = f"TR|{mid}|{end}|{color}|"
    return box + "," + tip


def _attach_domain_strings(locus_df, show_ID, mid_offset=400):
    """Add ``mid_coord`` and the per-gene iTOL ``domains`` string to ``locus_df`` in place.

    ``mid_coord`` is ``start + mid_offset``; it is the coordinate at which
    ``get_domains`` switches between the two shapes drawn for a gene.
    The strings are built with a plain comprehension rather than a row-wise
    ``DataFrame.apply`` so an empty frame simply yields an empty column.
    """
    locus_df["mid_coord"] = (round(mid_offset + locus_df['start'])).astype(int)

    if show_ID:
        # Label = "<query gene> <identity rounded to one decimal>".
        labels = [query_gene + ' ' + str(round(identity, 1))
                  for query_gene, identity in zip(locus_df['query_gene'],
                                                  locus_df['identity'])]
    else:
        labels = locus_df['gene']

    locus_df['domains'] = [
        get_domains(strand, start, mid, label, color, end)
        for strand, start, mid, label, color, end in zip(
            locus_df['strand'], locus_df['start'], locus_df['mid_coord'],
            labels, locus_df['color'], locus_df['end'])
    ]
    return locus_df


def flip_orientation_locus(tree_data, ref_gene, ref_strand, show_ID=True, mid_offset=400):
    """Put every locus in a common orientation and build its iTOL domain line.

    Loci in which ``ref_gene`` lies on ``ref_strand`` are shifted so that their
    first coordinate becomes 1.  Loci in which ``ref_gene`` lies on any other
    strand are mirrored (coordinates reflected about the locus end, '+' and '-'
    swapped) so that ``ref_gene`` ends up on ``ref_strand`` as well.  Loci that
    contain no row with ``gene == ref_gene`` are not part of the result.

    Parameters
    ----------
    tree_data : pandas.DataFrame
        One row per gene.  Columns read here: ``locusTag_locus``, ``start``,
        ``end``, ``strand``, ``gene``, ``color``, ``terminal_node`` and, when
        ``show_ID`` is true, ``query_gene`` and ``identity``.  The frame is
        copied; the caller's object is left untouched.
    ref_gene : str
        Gene whose strand decides whether a locus is mirrored.
    ref_strand : str
        Strand the reference gene should end up on.
    show_ID : bool, default True
        Label each gene with ``query_gene`` plus its rounded ``identity``
        instead of ``gene``.
        NOTE(review): this path concatenates ``query_gene`` with a string, so
        it presumably requires ``query_gene`` to be non-null on every row.
    mid_offset : int, default 400
        Distance from a gene's start at which its two shapes meet.

    Returns
    -------
    pandas.DataFrame
        Mirrored loci followed by forward loci, with the added columns
        ``length``, ``coord_2add`` / ``coord_2subtract``, ``mid_coord``,
        ``domains``, ``all_domains`` and ``line``
        (``terminal_node,length+1,all_domains``).
    """
    # Work on a copy so the caller's frame does not silently gain a 'length' column.
    tree_data = tree_data.copy()

    # Per-locus span: diff(axis=1) leaves (max end - min start) in the 'end' column.
    locus_bounds = tree_data.groupby('locusTag_locus').agg({'start': 'min', 'end': 'max'})
    tree_data['length'] = tree_data['locusTag_locus'].map(locus_bounds.diff(axis=1)['end'])

    is_ref_gene = tree_data['gene'] == ref_gene
    loci_forward = tree_data.loc[is_ref_gene & (tree_data['strand'] == ref_strand),
                                 'locusTag_locus'].unique()
    loci_reverse = tree_data.loc[is_ref_gene & (tree_data['strand'] != ref_strand),
                                 'locusTag_locus'].unique()

    # Explicit copies: the halves are modified below and must not alias tree_data.
    tree_data_F = tree_data[tree_data['locusTag_locus'].isin(loci_forward)].copy()
    tree_data_R = tree_data[tree_data['locusTag_locus'].isin(loci_reverse)].copy()

    # --- forward loci: shift so the locus starts at coordinate 1 -------------
    first_start = tree_data_F.groupby('locusTag_locus')['start'].min()
    tree_data_F['coord_2subtract'] = tree_data_F['locusTag_locus'].map(first_start) - 1
    tree_data_F['start'] = tree_data_F['start'] - tree_data_F['coord_2subtract']
    tree_data_F['end'] = tree_data_F['end'] - tree_data_F['coord_2subtract']
    _attach_domain_strings(tree_data_F, show_ID, mid_offset)

    # --- reverse loci: reflect coordinates about (locus end + 1) -------------
    last_end = tree_data_R.groupby('locusTag_locus')['end'].max()
    tree_data_R['coord_2add'] = tree_data_R['locusTag_locus'].map(last_end) + 1
    tree_data_R['start'] = -tree_data_R['start'] + tree_data_R['coord_2add']
    tree_data_R['end'] = -tree_data_R['end'] + tree_data_R['coord_2add']
    tree_data_R = tree_data_R.sort_values(['locusTag_locus', 'start'])
    tree_data_R["strand"] = tree_data_R['strand'].map({'-': "+", '+': '-'})
    # After reflection the old start is the larger coordinate, so swap the names.
    tree_data_R = tree_data_R.rename(columns={'start': 'end', 'end': 'start'})
    _attach_domain_strings(tree_data_R, show_ID, mid_offset)

    tree_data_FR = pd.concat([tree_data_R, tree_data_F])

    # One comma-joined domain string per locus, broadcast back to its rows.
    domains_per_locus = tree_data_FR.groupby('locusTag_locus')['domains'].agg(','.join)
    tree_data_FR['all_domains'] = tree_data_FR['locusTag_locus'].map(domains_per_locus)

    # Final iTOL data line: node id, total length, then all domains of the locus.
    tree_data_FR['line'] = (tree_data_FR['terminal_node'] + ','
                            + (tree_data_FR['length'] + 1).astype(str) + ','
                            + tree_data_FR['all_domains'])

    return tree_data_FR




def write_itol_domains(df_all, out_path, annotate_colors):
    """Write an iTOL ``DATASET_DOMAINS`` annotation file.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Must contain a ``line`` column holding ready-made data rows; missing
        values are skipped and each distinct row is written once, in order of
        first appearance.
    out_path : str or path-like
        Destination file; overwritten if it exists.
    annotate_colors : mapping
        Gene name -> colour.  Must contain every gene listed in the legend
        below (a missing gene raises ``KeyError``); extra keys are ignored.
    """
    # (gene key, legend label) in display order. The shape, colour and label
    # rows are all derived from this one list so they always have equal length.
    legend = [
        ('ompL', 'ompL (porin)'),
        ('yihO', 'yihO (MFS transporter)'),
        ('yihP', 'yihP (MFS transporter)'),
        ('yihQ', 'yihQ (SQase)'),
        ('yihR', 'yihR (mutarotase)'),
        ('yihS', 'yihS (isomerase)'),
        ('yihT', 'yihT (aldolase)'),
        ('yihU', 'yihU (reductase)'),
        ('yihV', 'yihV (kinase)'),
        ('yihW', 'yihW (csqR) (regulator)'),
        ('other', 'other'),
    ]
    legend_colors = [annotate_colors[gene] for gene, _ in legend]
    legend_labels = [label for _, label in legend]

    with open(out_path, 'w') as f:
        f.write("DATASET_DOMAINS\nSEPARATOR COMMA\n")
        f.write("DATASET_LABEL,loci\n")
        f.write("LEGEND_TITLE,genes\n")
        f.write("LEGEND_SHAPES," + ",".join(['RE'] * len(legend)) + "\n")
        f.write("LEGEND_COLORS," + ",".join(legend_colors) + "\n")
        f.write("LEGEND_LABELS," + ",".join(legend_labels) + "\n")
        f.write("BACKBONE_COLOR,#ffffff\nSHOW_DOMAIN_LABELS,1\nBORDER_WIDTH,0\nDATA\n")
        for line in df_all['line'].dropna().unique():
            f.write(line + '\n')


In [12]:
# Fill colour per gene for the locus drawings; 'other' is used for genes
# without a yih assignment.
annotate_colors = {
    # enzymes: blue gradient, darkest to lightest
    'yihU': '#00008B',
    'yihT': '#1A2C5F',
    'yihV': '#4343F7',
    'yihS': '#6683E9',
    'yihR': '#99CCFF',
    'yihQ': '#CCE5FF',
    # regulator: slate gray
    'yihW': '#708090',
    # porin and the two transporters: grays (yihO and yihP share one colour)
    'ompL': '#E5E4E2',
    'yihO': '#C0C0C0',
    'yihP': '#C0C0C0',
    # fallback: light cream
    'other': '#F9F6EE',
}


In [None]:
# Prepare data on yih loci: keep every annotated gene that lies inside the
# coordinate range of a yih locus, then attach the yih assignment (if any).

# One row per distinct locus range on each chromosome.
locus_columns = ['chr_id', 'start_min', 'end_max',
                 'set_locusTag_locus', 'locusTag_locus', 'set_queryGenes_locus']
chr_ranges = yih_homologs_loci.drop_duplicates(['chr_id', 'start_min', 'end_max'])[locus_columns]

# Pair each gene with every locus range on its chromosome, keep the pairs where
# the gene falls entirely inside the range, then drop the helper range columns
# and order by chromosome and position.
genes_in_loci = (
    pd.merge(target2info_data, chr_ranges, on='chr_id', how='inner')
    .query('start >= start_min and end <= end_max')
    .drop(['start_min', 'end_max'], axis=1)
    .sort_values(['chr_id', 'start'])
)

# Best-scoring hit for each (target, query) pair.
# NOTE(review): `yih_homologs` is not defined in any cell shown here — it must
# be loaded before this cell for the notebook to survive Restart & Run All.
# 'max' is passed as a string: handing the builtin `max` to transform() is deprecated.
is_best_hit = (
    yih_homologs.groupby(['target', 'query'])['bit_score'].transform('max')
    == yih_homologs['bit_score']
)
best_hits = yih_homologs.loc[is_best_hit, ['locus_tag', 'genome_id', 'query_gene']]

subset_target2info = (
    pd.merge(genes_in_loci, best_hits, on='locus_tag', how='left')
    .rename(columns={'genome_id': 'terminal_node'})
)

# Genes without a yih hit are labelled 'other' and get the fallback colour.
subset_target2info['gene'] = subset_target2info['query_gene'].fillna('other')
subset_target2info['color'] = subset_target2info.gene.map(annotate_colors)

# NOTE(review): a locus_tag hit by several queries keeps only its first row here,
# which is not necessarily the highest-scoring query — confirm this is intended.
subset_target2info = subset_target2info.drop_duplicates('locus_tag')


In [None]:
# Bring all loci into one orientation, using yihW on the '+' strand as the
# reference; genes are labelled by gene name only (show_ID=False).
df_prepared = flip_orientation_locus(
    subset_target2info,
    ref_gene='yihW',
    ref_strand='+',
    show_ID=False,
)

# Write the per-locus arrow coordinates as an iTOL domains annotation file.
write_itol_domains(df_prepared, 'Ecoli.domains.txt', annotate_colors)


# Generate annotation file describing _yih_ gene cassette content in a binary format (gene presence/absence)

In [2]:
# Read the locus table (tab-delimited) — the same file that is loaded at the
# top of the notebook.
yih_homologs_loci = pd.read_csv('yih_homologs_loci.csv', sep='\t')

In [4]:
# Preview the first rows of the homolog/locus table.
yih_homologs_loci.head()

Unnamed: 0,query,target,identity,alignment_length,mismatches,gap_openings,query_start,query_end,target_start,target_end,...,midpoint,queryGenes_locus,locusTag_locus,set_queryGenes_locus,start_min,end_max,set_locusTag_locus,variant,Info,Cluster
0,AYG21329.1|CP032667.1,AE005674.2_cds_AAN45382.1_3949,0.961,330,13,0,1,330,37,366,...,4076793.0,ompL/yihP/yihO/yihO/yihP/yihQ/yihR/yihS/yihT/y...,SF3947/SF3948/SF3948/SF3949/SF3949/SF3950/SF39...,ompL~yihO~yihP~yihQ~yihR~yihS~yihT~yihU~yihV~yihW,4076600,4088098,SF3947~SF3948~SF3949~SF3950~SF3951~SF3952~SF39...,long,shigellosis,7 (B1)
1,AYG21327.1|CP032667.1,AE005674.2_cds_AAN45383.2_3950,0.686,1365,425,0,10,1374,1,1356,...,4077719.5,ompL/yihP/yihO/yihO/yihP/yihQ/yihR/yihS/yihT/y...,SF3947/SF3948/SF3948/SF3949/SF3949/SF3950/SF39...,ompL~yihO~yihP~yihQ~yihR~yihS~yihT~yihU~yihV~yihW,4076600,4088098,SF3947~SF3948~SF3949~SF3950~SF3951~SF3952~SF39...,long,shigellosis,7 (B1)
2,AYG21328.1|CP032667.1,AE005674.2_cds_AAN45384.1_3951,0.659,1029,347,0,1,1020,43,1071,...,4079244.5,ompL/yihP/yihO/yihO/yihP/yihQ/yihR/yihS/yihT/y...,SF3947/SF3948/SF3948/SF3949/SF3949/SF3950/SF39...,ompL~yihO~yihP~yihQ~yihR~yihS~yihT~yihU~yihV~yihW,4076600,4088098,SF3947~SF3948~SF3949~SF3950~SF3951~SF3952~SF39...,long,shigellosis,7 (B1)
3,AYG21326.1|CP032667.1,AE005674.2_cds_AAN45385.1_3952,1.0,2034,0,0,1,2034,1,2034,...,4080902.0,ompL/yihP/yihO/yihO/yihP/yihQ/yihR/yihS/yihT/y...,SF3947/SF3948/SF3948/SF3949/SF3949/SF3950/SF39...,ompL~yihO~yihP~yihQ~yihR~yihS~yihT~yihU~yihV~yihW,4076600,4088098,SF3947~SF3948~SF3949~SF3950~SF3951~SF3952~SF39...,long,shigellosis,7 (B1)
4,AYG21325.1|CP032667.1,AE005674.2_cds_AAN45386.2_3953,0.986,900,13,0,25,924,1,900,...,4082570.0,ompL/yihP/yihO/yihO/yihP/yihQ/yihR/yihS/yihT/y...,SF3947/SF3948/SF3948/SF3949/SF3949/SF3950/SF39...,ompL~yihO~yihP~yihQ~yihR~yihS~yihT~yihU~yihV~yihW,4076600,4088098,SF3947~SF3948~SF3949~SF3950~SF3951~SF3952~SF39...,long,shigellosis,7 (B1)


In [8]:
# Count hits per genome (rows) and query gene (columns).
# These are raw counts; they are reduced to presence/absence flags in the next cell.
crosstab = pd.crosstab(index=yih_homologs_loci['genome_id'],
                       columns=yih_homologs_loci['query_gene'])

In [9]:
# yihO and yihP are both MFS transporters; a genome with several hits for one
# of them is presumably carrying both, so the other one is marked present too.
# `>= 2` (rather than `== 2`) extends the same rule to counts above two.
crosstab.loc[crosstab['yihP'] >= 2, 'yihO'] = 1
crosstab.loc[crosstab['yihO'] >= 2, 'yihP'] = 1

# Collapse every remaining count to a 0/1 flag. clip() also covers counts of 3
# or more, which replace(2, 1) left untouched.
crosstab = crosstab.clip(upper=1)

In [10]:
# Colour per gene for the presence/absence columns.
gene2color = {'yihU': '#00008B', 'yihT': '#1A2C5F',
              'yihV': '#4343f7', 'yihS': '#6683E9',
              'yihR': '#99CCFF', 'yihQ': '#CCE5FF',
              'ompL': '#E5E4E2', 'yihO': '#C0C0C0',
              'yihP': '#C0C0C0', 'yihW': '#708090'}
# Left-to-right order of the columns in the iTOL dataset.
genes_order = ['ompL', 'yihO', 'yihP', 'yihQ',
               'yihR', 'yihS', 'yihT', 'yihU',
               'yihV', 'yihW']

# Align the matrix columns with genes_order so that every data column sits
# under its own FIELD_LABELS entry; a gene with no hits at all becomes a
# column of zeros instead of shifting the remaining columns.
presence = crosstab.reindex(columns=genes_order, fill_value=0)

field_shapes = ",".join(["2"] * len(genes_order))
field_labels = ",".join(genes_order)
field_colors = ",".join(gene2color[gene] for gene in genes_order)

# Header of the DATASET_BINARY file; each key is written exactly once.
header_lines = [
    'DATASET_BINARY',
    'SEPARATOR COMMA',
    'DATASET_LABEL,genes',
    'COLOR,#ff0000',
    f'FIELD_SHAPES,{field_shapes}',
    'LEGEND_TITLE,genes',
    f'LEGEND_SHAPES,{field_shapes}',
    f'LEGEND_LABELS,{field_labels}',
    f'LEGEND_COLORS,{field_colors}',
    f'FIELD_LABELS,{field_labels}',
    f'FIELD_COLORS,{field_colors}',
    'DATA',
]

# One "genome_id,flag,flag,..." row per genome.
binary_data = presence.reset_index(drop=False).astype(str).\
    agg(','.join, axis=1).values

with open('yih_loci_presence.txt', 'w') as f:
    f.write('\n'.join(header_lines) + '\n')
    for row in binary_data:
        f.write(row + '\n')