In [1]:
import pandas as pd
import numpy as np
from PIL import Image, ImageColor
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

what I have:  
- gff file containing location info - has protein_id/contig/start/end/strand  
- function prediction results: predicted function in column "top_function", protein id in column "id"  
- color mapping

what I want:  
the idea is to show function patterns in the bacterial genome  
there are 245 contigs, my idea is to make a heatmap for each contig  
one contig can have more than 100 proteins, so maybe it's hard to show all the genes in one row  
may reuse the code from the existing script; the pipeline should live in a Python notebook  

In [2]:
# Hex colour for every functional category that can appear in "top_function".
# create_genome_heatmap removes "unknown_function" and "no_hit" genes before
# drawing and uses the "no_hit" colour for any label missing from this mapping.
colors = dict(
    [
        ("lysis", "#f35f49"),
        ("tail", "#07e9a2"),
        ("connector", "#35d7ff"),
        ("dna_rna_and_nucleotide_metabolism", "#ffdf59"),
        ("head_and_packaging", "#3e83f6"),
        ("other", "#838383"),
        ("transcription_regulation", "#a861e3"),
        ("moron_auxiliary_metabolic_gene_and_host_takeover", "#ff59f5"),
        # TODO: maybe this should be white, or not shown at all
        ("unknown_function", "#313131"),
        ("integration_and_excision", "#fea328"),
        # TODO: maybe not to be shown
        ("no_hit", "#F5F5F5"),
    ]
)

In [3]:
def _contig_function_rows(gff_df, predictions_df, hidden_labels):
    """Return one list of ``top_function`` labels per contig, genes ordered by ``start``."""
    # Merge once for the whole genome instead of once per contig.
    annotated = gff_df.merge(
        predictions_df[["id", "top_function"]],
        left_on="protein_id",
        right_on="id",
        how="left",
    )
    # Stable sort: genes that share a start coordinate keep their input order.
    annotated = annotated.sort_values("start", kind="mergesort")
    annotated = annotated[~annotated["top_function"].isin(hidden_labels)]
    labels_by_contig = {
        contig_id: rows["top_function"].tolist()
        for contig_id, rows in annotated.groupby("contig", sort=False)
    }
    # Walk the contigs in order of first appearance so that a contig whose genes
    # were all filtered out still gets its own (empty) row.
    return [
        labels_by_contig.get(contig_id, [])
        for contig_id in gff_df["contig"].unique()
    ]


def create_genome_heatmap(gff_df, predictions_df, colors, block_size=10):
    """
    Create a single heatmap image for all contigs, each contig is one row.

    Every gene is drawn as a ``block_size`` x ``block_size`` square. Within a
    row, genes are ordered by their ``start`` coordinate and packed from the
    left, so the x position is the gene's rank, not its genomic coordinate.
    Genes predicted as "no_hit" or "unknown_function" are left out; genes
    without a prediction, or with a label missing from ``colors``, are drawn
    in ``colors["no_hit"]``.

    Args:
        gff_df: DataFrame with at least "protein_id", "contig" and "start".
        predictions_df: DataFrame with at least "id" and "top_function";
            "id" is matched against ``gff_df["protein_id"]``.
        colors: Mapping from function label to a colour string understood by
            ``ImageColor.getcolor``; must contain "no_hit".
        block_size: Edge length of one gene square in pixels.

    Returns:
        An RGBA PIL image with one row of squares per contig, on a white
        background. The image is at least one block wide.

    Raises:
        ValueError: If ``gff_df`` contains no contigs.
    """
    contig_gene_lists = _contig_function_rows(
        gff_df, predictions_df, ["no_hit", "unknown_function"]
    )
    if not contig_gene_lists:
        raise ValueError("gff_df contains no contigs; nothing to draw")

    # Determine image size; keep the canvas at least one block wide when every
    # gene was filtered out, so the result is never a zero-width image.
    max_genes = max(len(genes) for genes in contig_gene_lists)
    width = max(max_genes, 1) * block_size
    height = len(contig_gene_lists) * block_size
    im = Image.new("RGBA", (width, height), "white")

    # Draw each contig as a row; one paste call fills a whole gene square.
    for row, gene_labels in enumerate(contig_gene_lists):
        top = row * block_size
        for i, label in enumerate(gene_labels):
            color = ImageColor.getcolor(colors.get(label, colors["no_hit"]), "RGBA")
            left = i * block_size
            im.paste(color, (left, top, left + block_size, top + block_size))
    return im

In [4]:
# Gene location table exported from the GFF file of the demonstration sample.
gff_df = pd.read_csv(
    filepath_or_buffer="../dataset/demonstration_samples/GCF_000175755.1/gff_df.csv"
)
# Function prediction results for the same sample.
predictions_df = pd.read_csv(
    filepath_or_buffer="../results/demonstration/prediction_GCF_000175755.1.csv"
)

FileNotFoundError: [Errno 2] No such file or directory: '../dataset/demonstration_samples/Escherichia_coli_O157_H7_str_FRIK2000/gff_df.csv'

In [6]:
# Keep only the first whitespace-delimited token of every prediction id.
# NOTE(review): presumably the ids carry a trailing description that must be
# removed before they can match gff_df["protein_id"] - confirm against the CSV.
first_token = predictions_df["id"].str.split().str.get(0)
predictions_df["id"] = first_token

In [7]:
# Render the genome-wide heatmap and write it to disk.
heatmap_path = "../results/demonstration/FRIK2000_contig_heatmaps/genome_heatmap.png"
# Create the output folder first so a fresh checkout can run this cell
# without a FileNotFoundError from im.save.
os.makedirs(os.path.dirname(heatmap_path), exist_ok=True)
im = create_genome_heatmap(gff_df, predictions_df, colors, block_size=10)
im.save(heatmap_path)

 36%|███▌      | 88/245 [00:00<00:00, 436.96it/s]

Skipping contig NZ_ACXO01000007.1 - no valid genes found
Skipping contig NZ_ACXO01000009.1 - no valid genes found
Skipping contig NZ_ACXO01000034.1 - no valid genes found
Skipping contig NZ_ACXO01000059.1 - no valid genes found
Skipping contig NZ_ACXO01000076.1 - no valid genes found
Skipping contig NZ_ACXO01000080.1 - no valid genes found
Skipping contig NZ_ACXO01000085.1 - no valid genes found
Skipping contig NZ_ACXO01000086.1 - no valid genes found
Skipping contig NZ_ACXO01000094.1 - no valid genes found
Skipping contig NZ_ACXO01000095.1 - no valid genes found


 78%|███████▊  | 192/245 [00:00<00:00, 449.29it/s]

Skipping contig NZ_ACXO01000105.1 - no valid genes found
Skipping contig NZ_ACXO01000107.1 - no valid genes found
Skipping contig NZ_ACXO01000108.1 - no valid genes found
Skipping contig NZ_ACXO01000115.1 - no valid genes found
Skipping contig NZ_ACXO01000129.1 - no valid genes found
Skipping contig NZ_ACXO01000135.1 - no valid genes found
Skipping contig NZ_ACXO01000136.1 - no valid genes found
Skipping contig NZ_ACXO01000149.1 - no valid genes found
Skipping contig NZ_ACXO01000150.1 - no valid genes found
Skipping contig NZ_ACXO01000159.1 - no valid genes found
Skipping contig NZ_ACXO01000182.1 - no valid genes found
Skipping contig NZ_ACXO01000183.1 - no valid genes found
Skipping contig NZ_ACXO01000186.1 - no valid genes found
Skipping contig NZ_ACXO01000193.1 - no valid genes found


100%|██████████| 245/245 [00:00<00:00, 449.59it/s]

Skipping contig NZ_ACXO01000197.1 - no valid genes found
Skipping contig NZ_ACXO01000198.1 - no valid genes found
Skipping contig NZ_ACXO01000199.1 - no valid genes found
Skipping contig NZ_ACXO01000203.1 - no valid genes found
Skipping contig NZ_ACXO01000207.1 - no valid genes found
Skipping contig NZ_ACXO01000208.1 - no valid genes found
Skipping contig NZ_ACXO01000214.1 - no valid genes found
Skipping contig NZ_ACXO01000224.1 - no valid genes found
Skipping contig NZ_ACXO01000226.1 - no valid genes found
Skipping contig NZ_ACXO01000229.1 - no valid genes found
Skipping contig NZ_ACXO01000242.1 - no valid genes found
Skipping contig NZ_ACXO01000243.1 - no valid genes found



