In [1]:
import pandas as pd
import numpy as np
from PIL import Image, ImageColor
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

What I have:  
- A GFF file containing location info: `protein_id`, `contig`, `start`, `end`, `strand`  
- Function prediction results: the predicted function is in column `top_function`, the protein ID is in column `id`   
- A color mapping

What I want:  
The idea is to show function patterns in the bacterial genome.  
There are 245 contigs; my idea is to make a heatmap for each contig.  
One contig can have more than 100 proteins, so it may be hard to show all the genes in one row.  
I may reuse the code in the script and build a pipeline in a Python notebook.  

In [2]:
# Hex color assigned to each predicted function category.
colors = dict(
    lysis="#f35f49",
    tail="#07e9a2",
    connector="#35d7ff",
    dna_rna_and_nucleotide_metabolism="#ffdf59",
    head_and_packaging="#3e83f6",
    other="#838383",
    transcription_regulation="#a861e3",
    moron_auxiliary_metabolic_gene_and_host_takeover="#ff59f5",
    # Open question: should this be white, or not be shown at all?
    unknown_function="#313131",
    integration_and_excision="#fea328",
    # Open question: maybe this category should not be shown.
    no_hit="#F5F5F5",
)

In [3]:
def create_contig_heatmap(gff_df, predictions_df, contig_id, colors, block_size=10):
    """
    Create a one-row heatmap for a single contig: one colored square per gene,
    in genomic order, excluding 'no_hit' and 'unknown_function' genes.

    Parameters
    ----------
    gff_df : pandas.DataFrame
        Gene locations. The columns "contig", "start" and "protein_id" are used.
    predictions_df : pandas.DataFrame
        Function predictions. The columns "id" and "top_function" are used;
        "id" is matched against ``gff_df["protein_id"]``.
    contig_id
        Value of ``gff_df["contig"]`` selecting the contig to draw.
    colors : dict
        Maps a function name to a color string accepted by
        ``PIL.ImageColor.getcolor``. ``colors["no_hit"]`` is the fallback for
        genes whose function is not a key of ``colors``; this includes genes
        that have no matching row in ``predictions_df`` (their ``top_function``
        is missing after the left merge, so the exclusion filter keeps them).
    block_size : int, default 10
        Side length, in pixels, of the square drawn for each gene.

    Returns
    -------
    PIL.Image.Image or None
        RGBA image of size ``(n_genes * block_size, block_size)``, or None
        when the contig has no gene left after filtering.

    Raises
    ------
    ValueError
        If ``block_size`` is smaller than 1.
    KeyError
        If a gene needs the fallback color and ``colors`` has no "no_hit" key.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be a positive integer, got {block_size!r}")

    # Filter data for this contig and put the genes in genomic order
    contig_data = gff_df[gff_df["contig"] == contig_id].sort_values("start")

    # Attach the predicted function; a left merge keeps the sorted gene order
    contig_data = contig_data.merge(
        predictions_df[["id", "top_function"]],
        left_on="protein_id",
        right_on="id",
        how="left",
    )

    # Filter out 'no_hit' and 'unknown_function' genes
    contig_data = contig_data[
        ~contig_data["top_function"].isin(["no_hit", "unknown_function"])
    ]

    # If no genes left after filtering, return None
    if len(contig_data) == 0:
        return None

    # One square block per remaining gene, laid out left to right
    width = len(contig_data) * block_size
    height = block_size
    im = Image.new("RGBA", (width, height), "white")

    # Convert each distinct function's color only once
    rgba_by_function = {}
    for i, function in enumerate(contig_data["top_function"]):
        if function not in rgba_by_function:
            # Look up the fallback lazily so a palette without "no_hit" only
            # fails when the fallback is actually required.
            hex_color = colors[function] if function in colors else colors["no_hit"]
            rgba_by_function[function] = ImageColor.getcolor(hex_color, "RGBA")

        # Fill the whole block with one region paste instead of per-pixel writes
        left = i * block_size
        im.paste(rgba_by_function[function], (left, 0, left + block_size, height))

    return im

In [4]:
def create_all_contig_heatmaps(gff_df, predictions_df, output_dir, colors):
    """
    Create and save a heatmap PNG for every contig in ``gff_df``.

    Parameters
    ----------
    gff_df : pandas.DataFrame
        Gene locations; the distinct values of its "contig" column define the
        contigs to process. Passed through to ``create_contig_heatmap``.
    predictions_df : pandas.DataFrame
        Function predictions, passed through to ``create_contig_heatmap``.
    output_dir : str
        Directory receiving the images; created if it does not exist. Each
        image is saved as ``contig_<contig_id>_heatmap.png``.
    colors : dict
        Function-name to color mapping, passed through to
        ``create_contig_heatmap``.

    Returns
    -------
    None
        Images are written to disk. Contigs for which no image could be built
        are reported on stdout once the loop has finished.
    """
    # Get unique contigs
    contigs = gff_df["contig"].unique()

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Printing inside the tqdm loop splits the progress bar into fragments, so
    # skipped contigs are collected here and reported after the loop instead.
    skipped_contigs = []

    for contig_id in tqdm(contigs):
        im = create_contig_heatmap(gff_df, predictions_df, contig_id, colors)

        # No image means no valid genes were found for this contig
        if im is None:
            skipped_contigs.append(contig_id)
            continue

        output_path = os.path.join(output_dir, f"contig_{contig_id}_heatmap.png")
        im.save(output_path)

    for contig_id in skipped_contigs:
        print(f"Skipping contig {contig_id} - no valid genes found")

In [5]:
# Gene location table for the demonstration genome.
gff_df = pd.read_csv(
    filepath_or_buffer="../dataset/demonstration_samples/Escherichia_coli_O157_H7_str_FRIK2000/gff_df.csv"
)

# Function prediction results for the same genome.
predictions_df = pd.read_csv(
    filepath_or_buffer="../results/demonstration/prediction_GCF_000175755.1.csv"
)

In [6]:
# Keep only the first whitespace-separated token of each prediction id.
predictions_df["id"] = predictions_df["id"].str.split().str.get(0)

In [7]:
# Write one heatmap PNG per contig of the FRIK2000 demonstration genome.
create_all_contig_heatmaps(
    gff_df,
    predictions_df,
    output_dir="../results/demonstration/FRIK2000_contig_heatmaps",
    colors=colors,
)

 36%|███▌      | 88/245 [00:00<00:00, 436.96it/s]

Skipping contig NZ_ACXO01000007.1 - no valid genes found
Skipping contig NZ_ACXO01000009.1 - no valid genes found
Skipping contig NZ_ACXO01000034.1 - no valid genes found
Skipping contig NZ_ACXO01000059.1 - no valid genes found
Skipping contig NZ_ACXO01000076.1 - no valid genes found
Skipping contig NZ_ACXO01000080.1 - no valid genes found
Skipping contig NZ_ACXO01000085.1 - no valid genes found
Skipping contig NZ_ACXO01000086.1 - no valid genes found
Skipping contig NZ_ACXO01000094.1 - no valid genes found
Skipping contig NZ_ACXO01000095.1 - no valid genes found


 78%|███████▊  | 192/245 [00:00<00:00, 449.29it/s]

Skipping contig NZ_ACXO01000105.1 - no valid genes found
Skipping contig NZ_ACXO01000107.1 - no valid genes found
Skipping contig NZ_ACXO01000108.1 - no valid genes found
Skipping contig NZ_ACXO01000115.1 - no valid genes found
Skipping contig NZ_ACXO01000129.1 - no valid genes found
Skipping contig NZ_ACXO01000135.1 - no valid genes found
Skipping contig NZ_ACXO01000136.1 - no valid genes found
Skipping contig NZ_ACXO01000149.1 - no valid genes found
Skipping contig NZ_ACXO01000150.1 - no valid genes found
Skipping contig NZ_ACXO01000159.1 - no valid genes found
Skipping contig NZ_ACXO01000182.1 - no valid genes found
Skipping contig NZ_ACXO01000183.1 - no valid genes found
Skipping contig NZ_ACXO01000186.1 - no valid genes found
Skipping contig NZ_ACXO01000193.1 - no valid genes found


100%|██████████| 245/245 [00:00<00:00, 449.59it/s]

Skipping contig NZ_ACXO01000197.1 - no valid genes found
Skipping contig NZ_ACXO01000198.1 - no valid genes found
Skipping contig NZ_ACXO01000199.1 - no valid genes found
Skipping contig NZ_ACXO01000203.1 - no valid genes found
Skipping contig NZ_ACXO01000207.1 - no valid genes found
Skipping contig NZ_ACXO01000208.1 - no valid genes found
Skipping contig NZ_ACXO01000214.1 - no valid genes found
Skipping contig NZ_ACXO01000224.1 - no valid genes found
Skipping contig NZ_ACXO01000226.1 - no valid genes found
Skipping contig NZ_ACXO01000229.1 - no valid genes found
Skipping contig NZ_ACXO01000242.1 - no valid genes found
Skipping contig NZ_ACXO01000243.1 - no valid genes found



