In [1]:
import altair as alt
import pandas as pd
import glob

In [2]:
def findHeaderIntPos(phyloFlashHeader, frontOffset=0) -> int:
    """Return the first run of consecutive decimal digits in a header as an int.

    Scanning starts at index ``frontOffset`` and stops at the first
    non-digit character that follows at least one digit, so only the first
    number is used (``"10_abc_3"`` -> ``10``).

    Args:
        phyloFlashHeader: Header string to scan.
        frontOffset: Index at which scanning begins (default 0).

    Returns:
        The integer value of the first digit run found.

    Raises:
        ValueError: If no decimal digit occurs at or after ``frontOffset``.
    """
    digitRun = []
    for char in phyloFlashHeader[frontOffset:]:
        # isdecimal() (not isdigit()) so that characters such as "²", which
        # int() cannot parse, are skipped instead of crashing the conversion.
        if char.isdecimal():
            digitRun.append(char)
        elif digitRun:
            # First non-digit after the number: the run is complete.
            break
    if not digitRun:
        # Same exception type int("") raised before, but with a usable message.
        raise ValueError(
            f"no integer found in header {phyloFlashHeader!r} "
            f"at or after offset {frontOffset}"
        )
    return int("".join(digitRun))

In [3]:
# Load the results CSV; `source` is kept as the untouched raw read.
source = pd.read_csv("./results/amanda/hd/hd_2022-07-21.csv")
# read_csv already returns a DataFrame, so re-wrapping it in pd.DataFrame() added
# nothing; take an explicit copy so later edits to `df` cannot leak into `source`.
df = source.copy()

In [4]:
# Order the rows numerically by the first integer found in each "PhyloFlash header"
# value (e.g. the 10 in "10_..."), rather than alphabetically by the header string.
# Intended to mirror the row order of the original heatmap's y axis.
dfSorted = df.sort_values(
    by="PhyloFlash header",
    key=lambda headers: headers.map(findHeaderIntPos),
)

In [5]:
# Hamming dissimilarity as a percentage of the target sequence length,
# rounded to one decimal place and stored in a new "Percent" column.
dfSorted["Percent"] = (
    dfSorted["HD for closest hit"]
    .mul(100)
    .div(dfSorted["Target sequence length"])
    .round(1)
)

In [6]:
# Shorten two reference-organism names per request. Work on a copy so that
# dfSorted keeps the names exactly as they appear in the CSV.
adjustedNamesSortedDF = dfSorted.copy()

# x-axis labels: exact-match replacement of the long names with their short forms
adjustedNamesSortedDF["Closest hit to reference 16S"] = adjustedNamesSortedDF[
    "Closest hit to reference 16S"
].replace(
    {
        "Pseudothermotoga hypogea DSM 11164 = NBRC 106472": "Pseudothermotoga hypogea DSM 11164",
        "Candidatus Thermochlorobacteriaceae bacterium GBChlB": "Candidatus Thermochlorobacteriaceae GBChlB",
    }
)

In [7]:
# The original heatmaps sometimes have a specific arrangement of the organisms.
# This list fixes that order; it is passed as `sort=` for the x encoding of the
# chart below. Entries must match the values in "Closest hit to reference 16S"
# exactly, including the shortened forms "Pseudothermotoga hypogea DSM 11164"
# and "Candidatus Thermochlorobacteriaceae GBChlB".

explicitNameOrder = [
  "Roseiflexus sp. RS-1",
  "Thermodesulfovibrio sp. N1",
  "Thermodesulfovibrio thiophilus DSM 17215",
  "Pseudothermotoga hypogea DSM 11164",
  "Synechococcus sp. JA-2-3B'a(2-13)",
  "Synechococcus sp. JA-3-3Ab",
  "Chloracidobacterium thermophilum B",
  "Pseudanabaena sp. PCC 7367",
  "Pseudanabaena sp. ABRG5-3",
  "Gloeomargarita lithophora Alchichica-D10",
  "Chloroflexus aggregans DSM 9485",
  "Chloroflexus islandicus strain isl-2",
  "Oscillochloris trichoides DG-6",
  "Roseiflexus castenholzii DSM 13941",
  "Candidatus Thermochlorobacteriaceae GBChlB",
  "Meiothermus silvanus DSM 9946",
  "Pseudothermotoga lettingae TMO",
  "Caldimicrobium thiodismutans strain TF1",
  "Elioraea tepidiphila DSM 17972",
  "Thermocrinis ruber strain DSM 23557",
  "Thermus brockianus strain GE-1",
  "Thermus aquaticus Y51MC23",
  "Meiothermus ruber DSM 1279",
  "Meiothermus taiwanensis WR-220",
  "Chloroflexus aurantiacus J-10-fl",
  "Chloroflexus aurantiacus Y-400-fl",
  "Chloroflexus sp. Y-396-1",
  "Chloroflexus sp. MS-G",
  "Ignavibacterium album JCM 16511",
  "Candidatus Solibacter usitatus Ellin6076",
  # not in data set or original heatmap
  # "Candidatus Caldatribacterium saccharofermentans isolate SpSt-82",
]

In [8]:
# Shared encodings for both layers: reference organisms on x (in the explicit
# order given by explicitNameOrder), assemblies on y. Chart title removed per request.
base = (
    alt.Chart(adjustedNamesSortedDF.copy())
    .encode(
        x=alt.X(
            "Closest hit to reference 16S",
            type="ordinal",
            scale=alt.Scale(paddingInner=0),  # no gap between cells
            sort=explicitNameOrder,
            axis=alt.Axis(labelAngle=-45, labelLimit=10000),  # slanted, untruncated labels
        ),
        y=alt.Y(
            "PhyloFlash header",
            type="ordinal",
            scale=alt.Scale(paddingInner=0),
            # NOTE(review): sort="x" appears to order this axis by the x field
            # rather than by the row order of the data frame -- confirm intended.
            sort="x",
            title="Assembly",
        ),
    )
    .properties(width=1050, height=850)
)

# Layer 1: coloured cells, one per (organism, assembly) pair, shaded by Percent.
heatmap = base.mark_rect().encode(
    color=alt.Color(
        "Percent",
        type="quantitative",
        scale=alt.Scale(scheme="magma"),
    )
)

# Layer 2: the Percent value printed in each cell; light grey text where
# Percent is below 20, black text otherwise.
text = base.mark_text(baseline="middle", fontSize=14).encode(
    text=alt.Text("Percent", type="quantitative"),
    color=alt.condition(
        alt.datum.Percent < 20,
        alt.value("lightgrey"),
        alt.value("black"),
    ),
)

heat_plot = alt.layer(heatmap, text)

# Last expression of the cell, so the configured chart is what gets displayed.
heat_plot.configure_axis(titleFontSize=23, labelFontSize=13).configure_title(fontSize=30)
