In [1]:
import pandas as pd
import numpy as np
from PIL import Image, ImageColor
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

What I have:  
- GFF file containing location info: protein_id / contig / start / end / strand  
- Function prediction results: the predicted function is in column "top_function", the protein ID is in column "id"  
- Color mapping (function category → hex color)

What I want:  
The idea is to show function patterns in the bacterial genome.  
There are 245 contigs; my idea is to make one heatmap per contig.  
One contig can have more than 100 proteins, so it may be hard to show all the genes in one row.  
May reuse the code in the script; build the pipeline in a Python notebook.  

In [2]:
# Hex color used for each predicted function category in the heatmaps.
colors = dict(
    lysis="#f35f49",
    tail="#07e9a2",
    connector="#35d7ff",
    dna_rna_and_nucleotide_metabolism="#ffdf59",
    head_and_packaging="#3e83f6",
    other="#838383",
    transcription_regulation="#a861e3",
    moron_auxiliary_metabolic_gene_and_host_takeover="#ff59f5",
    unknown_function="#313131",  # TODO: maybe white instead, or do not show at all
    integration_and_excision="#fea328",
    no_hit="#F5F5F5",  # TODO: maybe do not show at all
)

In [3]:
def create_contig_heatmap(
    gff_df, predictions_df, contig_id, colors, max_proteins_per_row=100, gene_width=10
):
    """
    Create a heatmap image for a single contig, one colored cell per gene.

    Genes are laid out in genomic order (ascending "start"), left to right,
    wrapping onto a new pixel row every `max_proteins_per_row` genes. Each
    gene is `gene_width` pixels wide and one pixel tall.

    Strand is deliberately not used for placement: mirroring the column of
    "-" strand genes inside their row puts them on top of the slot that
    belongs to another gene (one of the two is overwritten) and leaves their
    own slot blank.

    Parameters:
    - gff_df: DataFrame with protein locations (protein_id, contig, start, end, strand)
    - predictions_df: DataFrame with predictions (id, top_function)
    - contig_id: ID of the contig to visualize
    - colors: Dictionary mapping functions to colors; must contain "no_hit",
      which is used for genes whose function is missing or not in the mapping
    - max_proteins_per_row: Maximum number of proteins to show in one row
    - gene_width: Width of each gene in pixels (default 10)

    Returns:
    - RGBA image of size (max_proteins_per_row * gene_width, n_rows); it has
      at least one row, even when the contig has no genes
    """
    # Genes of this contig in genomic order. A stable sort keeps the input
    # order for genes that share the same start coordinate.
    contig_genes = gff_df[gff_df["contig"] == contig_id].sort_values(
        "start", kind="stable"
    )

    # Keep one prediction per protein id: duplicated ids would multiply gene
    # rows in the left merge below and shift every following gene.
    predictions = predictions_df[["id", "top_function"]].drop_duplicates(subset="id")

    # Left merge keeps every gene (and the sorted order), with a missing
    # top_function for proteins that have no prediction.
    contig_genes = contig_genes.merge(
        predictions,
        left_on="protein_id",
        right_on="id",
        how="left",
    )

    # Number of pixel rows needed (ceiling division), never less than one so
    # that the resulting image can always be saved.
    n_proteins = len(contig_genes)
    n_rows = max(1, (n_proteins + max_proteins_per_row - 1) // max_proteins_per_row)

    width = max_proteins_per_row * gene_width
    im = Image.new("RGBA", (width, n_rows), "white")

    # Convert every hex color once instead of once per gene.
    palette = {
        name: ImageColor.getcolor(hex_code, "RGBA") for name, hex_code in colors.items()
    }
    fallback = palette["no_hit"]

    # enumerate() yields the ordinal position directly, so the layout does not
    # depend on which index labels the merged frame carries.
    for position, function in enumerate(contig_genes["top_function"]):
        row_num, col_num = divmod(position, max_proteins_per_row)
        color = palette.get(function, fallback)

        # Paint gene_width adjacent pixels for this gene.
        first_x = col_num * gene_width
        for x in range(first_x, first_x + gene_width):
            im.putpixel((x, row_num), color)

    return im

In [4]:
def create_all_contig_heatmaps(gff_df, predictions_df, output_dir, colors):
    """
    Render one heatmap PNG per contig found in `gff_df`.

    Parameters:
    - gff_df: DataFrame with protein locations
    - predictions_df: DataFrame with predictions
    - output_dir: Directory to save heatmaps (created when missing)
    - colors: Dictionary mapping functions to colors

    Each image is written to "<output_dir>/contig_<contig_id>_heatmap.png".
    Nothing is returned.
    """
    # Every distinct contig id present in the location table
    unique_contigs = gff_df["contig"].unique()

    # Make sure the destination folder exists before writing any image
    os.makedirs(output_dir, exist_ok=True)

    # tqdm shows a progress bar while the contigs are processed one by one
    for contig_id in tqdm(unique_contigs):
        heatmap = create_contig_heatmap(gff_df, predictions_df, contig_id, colors)
        heatmap.save(f"{output_dir}/contig_{contig_id}_heatmap.png")

In [6]:
# Protein location table for the FRIK2000 demonstration sample
gff_df = pd.read_csv(
    filepath_or_buffer="../dataset/demonstration_samples/Escherichia_coli_O157_H7_str_FRIK2000/gff_df.csv"
)
# Function prediction table for GCF_000175755.1
predictions_df = pd.read_csv(
    filepath_or_buffer="../results/demonstration/prediction_GCF_000175755.1.csv"
)

In [None]:
# Display the prediction table to check its columns and contents
predictions_df

In [None]:
# Display the protein location table to check its columns and contents
gff_df

In [29]:
# Trim every id to its first whitespace-separated token.
# NOTE(review): presumably so that ids match `protein_id` in gff_df for the merge — confirm.
predictions_df["id"] = predictions_df["id"].str.split().str.get(0)

Unnamed: 0,id,hit_number,hit_function,top_function,prob_lysis,prob_tail,prob_connector,prob_dna_rna_and_nucleotide_metabolism,prob_head_and_packaging,prob_other,prob_transcription_regulation,prob_moron_auxiliary_metabolic_gene_and_host_takeover,prob_unknown_function,prob_integration_and_excision
0,WP_000002304.1,0,,no_hit,0.000025,0.005049,3.965188e-03,0.033466,1.915140e-03,0.005226,1.346041e-05,0.003127,0.001871,0.000128
1,WP_000002542.1,0,,no_hit,0.000342,0.043206,1.180836e-05,0.001428,2.302157e-02,0.003058,4.371904e-07,0.001554,0.005889,0.000021
2,WP_000002701.1,2,"transcription_regulation,unknown_function",unknown_function,0.000387,0.082121,2.116558e-05,0.087142,4.601374e-05,0.000108,5.419045e-01,0.010056,0.837887,0.011937
3,WP_000002907.1,2,"lysis,moron_auxiliary_metabolic_gene_and_host_...",lysis,0.986116,0.000066,6.596111e-06,0.000036,8.304536e-04,0.004222,5.938268e-07,0.828335,0.248284,0.000674
4,WP_000002953.1,0,,no_hit,0.000838,0.001368,1.333162e-03,0.000877,3.418033e-04,0.049895,1.260442e-04,0.005843,0.028239,0.000076
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5090,WP_255087022.1,1,moron_auxiliary_metabolic_gene_and_host_takeover,moron_auxiliary_metabolic_gene_and_host_takeover,0.004632,0.003339,4.313661e-07,0.000293,2.798416e-07,0.003419,7.375170e-08,0.972719,0.026150,0.001025
5091,WP_272481064.1,2,moron_auxiliary_metabolic_gene_and_host_takeov...,unknown_function,0.000235,0.000060,4.939919e-07,0.000387,3.246010e-03,0.073867,8.136473e-07,0.603329,0.822039,0.000286
5092,WP_306256364.1,0,,no_hit,0.016409,0.108107,2.321029e-05,0.002118,6.534757e-05,0.000662,3.529115e-08,0.000082,0.003502,0.000030
5093,WP_323670517.1,1,unknown_function,unknown_function,0.000003,0.000031,3.391745e-05,0.038855,8.032553e-03,0.017652,5.358598e-02,0.001807,0.956895,0.003226


In [7]:
# Write one heatmap PNG per contig into the FRIK2000 results folder
create_all_contig_heatmaps(
    gff_df=gff_df,
    predictions_df=predictions_df,
    output_dir="../results/demonstration/FRIK2000_contig_heatmaps",
    colors=colors,
)

100%|██████████| 245/245 [00:00<00:00, 594.80it/s]


+ : gene A       gene C
- :        gene B       gene D


predictor 
true heatmap - pattern
10 pici/cf/p4 show heatmap
bacteria genome where we know there are pici - 1 for pici, 1 for cf, 1 for p4