# Hamming distances of assembled 16S sequences to the true 16S sequences for the mock MBARC-26 heatmap

In [1]:
import altair as alt
import pandas as pd
import glob

In [2]:
def findHeaderIntPos(phyloFlashHeader: pd.DataFrame, frontOffset=12) -> int:
    """Return the first integer embedded in a table's PhyloFlash header.

    The header string is read from the row labelled ``0`` of the
    ``"PhyloFlash header"`` column.  The first ``frontOffset`` characters are
    skipped, then the first contiguous run of digits is parsed as an integer.

    Args:
        phyloFlashHeader: Table with a ``"PhyloFlash header"`` column and a
            row whose index label is ``0``.
        frontOffset: Number of leading header characters to ignore before
            looking for digits.

    Returns:
        The integer value of the first digit run found after the offset.

    Raises:
        ValueError: If no digit occurs after the offset.
    """
    header = phyloFlashHeader.loc[0, "PhyloFlash header"]
    digits = []
    for char in header[frontOffset:]:
        if char.isdigit():
            digits.append(char)
        elif digits:
            # The first digit run has ended; anything after it is ignored.
            break
    if not digits:
        raise ValueError(
            "no digits found in PhyloFlash header {!r} after offset {}".format(
                header, frontOffset
            )
        )
    return int("".join(digits))

In [3]:

# Collect every Hamming-distance CSV under ./results/mbarc26/hd.
files = glob.glob("./results/mbarc26/hd/*.csv")
df = []
for f in files:
    csv = pd.read_csv(f)
    # Percent = Hamming distance relative to the shorter of the target and
    # reference sequences, rounded to one decimal place.
    shorterLength = csv[["Target sequence length", "Reference sequence length"]].min(axis=1)
    csv["Percent"] = (100*csv["HD for closest hit"] / shorterLength).round(1)
    csv.sort_values("Percent", inplace=True)
    df.append(csv)

# Order the per-file tables by the integer found in their PhyloFlash header,
# then stack them into a single frame with a fresh 0..n-1 index.
sortedDF = sorted(df, key=findHeaderIntPos)
df = pd.concat(sortedDF, ignore_index=True, sort=False)

In [4]:
# Trim full species names to the short two-word form so they match the
# original heatmap names.
df2 = df.copy()

hitColumn = "Closest hit to reference 16S"

# Raw string: "\." and "\s" are regex escapes, not valid Python string escapes.
# expand=False makes the single capture group come back as a Series rather
# than a one-column DataFrame.
df2[hitColumn] = df2[hitColumn].str.extract(r"([a-zA-Z\.]+\s[a-zA-Z]+)", expand=False)

# Manually adjust names that do not match the substring extraction.
nameFixes = {
    "E.coli K": "E.coli",
    "Clostridium perfringensATCC": "Clostridium perfringens",
    "Clostridium thermocellumVPI": "Clostridium thermocellum",
}
for extractedName, shortName in nameFixes.items():
    df2.loc[df2[hitColumn] == extractedName, hitColumn] = shortName

In [5]:
# Reference only (kept commented out): the full strain names, listed in the
# same order as the short names in explicitNameOrder below.
# explicitFullNameOrder = [
#     "Streptococcus pyogenes M1 GAS",#
#     "Hirschia baltica ATCC 49814",#
#     "Frateuria aurantia DSM 6220",#
#     "Thermobacillus composti KWC4",#
#     "Coraliomargarita akajimensis DSM 45221",#
#     "Pseudomonas stutzeri RCH2",#
#     "Natronobacterium gregoryi SP2",#
#     "Spirochaeta smaragdinae DSM 11293",#
#     "Clostridium perfringensATCC 13124",#
#     "Salmonella enterica subsp. arizonae serovar 62 z4 z23 - strain RSK2980",#
#     "Segniliparus rotundus DSM 44985",#
#     "Olsenella uli DSM 7084",#
#     "Natronococcus occultus DSM 3396",#
#     "Terriglobus roseus DSM 18391",#
#     "Halovivax ruber XH-70",#
#     "Corynebacterium glutamicum ATCC 13032",#
#     "Desulfosporosinus acidiphilus SJ4 DSM 22704",#
#     "Clostridium thermocellumVPI 7372 ATCC 27405",#
#     "Echinicola vietnamensis DSM 17526",#
#     "Meiothermus silvanus DSM 9946",#
#     "Desulfosporosinus meridiei DSM 13257",#
#     "Desulfotomaculum gibsoniae DSM 7213",#
#     "E.coli K12 ATCC 700926",#
#     "Salmonella bongori NCTC 12419",#
#     "Fervidobacterium pennivorans DSM 9078",#
#     "Nocardiopsis dassonvillei DSM 43111",
# ]

# Explicit ordering of the trimmed species names; passed as the x-axis `sort`
# of the heatmap so columns appear in this order rather than alphabetically.
explicitNameOrder = [
    "Streptococcus pyogenes",
    "Hirschia baltica",
    "Frateuria aurantia",
    "Thermobacillus composti",
    "Coraliomargarita akajimensis",
    "Pseudomonas stutzeri",
    "Natronobacterium gregoryi",
    "Spirochaeta smaragdinae",
    "Clostridium perfringens",
    "Salmonella enterica",
    "Segniliparus rotundus",
    "Olsenella uli",
    "Natronococcus occultus",
    "Terriglobus roseus",
    "Halovivax ruber",
    "Corynebacterium glutamicum",
    "Desulfosporosinus acidiphilus",
    "Clostridium thermocellum",
    "Echinicola vietnamensis",
    "Meiothermus silvanus",
    "Desulfosporosinus meridiei",
    "Desulfotomaculum gibsoniae",
    "E.coli",
    "Salmonella bongori",
    "Fervidobacterium pennivorans",
    "Nocardiopsis dassonvillei",
]

In [6]:
# Shared encodings for both layers: the closest reference hit on x and the
# PhyloFlash header on y, both treated as ordinal with no gap between cells.
base = alt.Chart(
    df2.copy(),
    title="Hamming Distance Percentage"
    ).encode(
    x=alt.X(
        "Closest hit to reference 16S:O",
        scale=alt.Scale(paddingInner=0),
        sort=explicitNameOrder,
        title="Reference 16S",
        axis=alt.Axis(
            labelAngle=-45,
            labelLimit=10000
        )
    ),
    y=alt.Y(
        "PhyloFlash header:O",
        scale=alt.Scale(paddingInner=0),
        sort="x",
        # This axis plots the PhyloFlash header column, so it must not reuse
        # the x-axis title "Reference 16S".
        # NOTE(review): wording assumes the headers name the assembled 16S
        # sequences, as the notebook heading suggests -- confirm.
        title="Assembled 16S (PhyloFlash header)",
        axis=alt.Axis(
            labelLimit=10000,
        )
    )
).properties(
    width= 1050,
    height=850
)

# Layer 1: one rectangle per (reference, header) pair, coloured by Percent.
heatmap = base.mark_rect().encode(
    color=alt.Color(
        "Percent:Q",
#         scale=alt.Scale(scheme="blues", domain=[0,60,70]),
        scale=alt.Scale(scheme="blues", domain=[0,60,63,68,70])
    )
)

# Layer 2: the Percent value printed in each cell with no decimals; values
# above 69 are drawn in light grey instead of black.
text = base.mark_text(
    baseline="middle",
    fontSize=14
    ).encode(
        text=alt.Text("Percent:Q",format=",.0f"),
        color=alt.condition(
            alt.datum.Percent > 69,
            alt.value("lightgrey"),
            alt.value("black"),
        )
)

heat_plot = heatmap + text

# Final expression of the cell: the configured chart is what gets displayed.
heat_plot.configure_axis(
    titleFontSize = 23,
    labelFontSize = 13
).configure_title(
    fontSize=30
)


  for col_name, dtype in df.dtypes.iteritems():
