In [11]:
import altair as alt
import pandas as pd
import glob

In [12]:
# Load the BLAST percent-identity table used for the heatmap.
# NOTE(review): an earlier comment here said "sort by phyloFlash header", but
# nothing in this cell sorts -- rows keep whatever order the CSV provides.
source = pd.read_csv(
    filepath_or_buffer="./results/amanda/blast/blast_id_heatmap_amanda_real16s.vs.real16s.csv"
)
# `df` is a separate DataFrame object built from `source`; later cells copy
# `df` before editing names.
df = pd.DataFrame(data=source)

In [13]:
def splitReplace(series: pd.Series, replace: str, replaceBy: str = ' ', splitBy: str = ':') -> pd.Series:
    """Trim and tidy every name in ``series``.

    Each element is cut at the first occurrence of ``splitBy`` and only the
    text to its left is kept. Within that kept text, every occurrence of
    ``replace`` is then swapped for ``replaceBy`` (for example underscores to
    spaces, so the names read more naturally).

    Args:
        series: Series of strings to clean.
        replace: Substring to substitute in the kept part of each name.
        replaceBy: Replacement text; defaults to a single space.
        splitBy: Separator marking where each name is cut; defaults to ``:``.

    Returns:
        A new Series holding the cleaned names.
    """
    def _tidy(name):
        # Keep only what precedes the separator, then do the substitution.
        leftPart = name.split(splitBy)[0]
        return leftPart.replace(replace, replaceBy)

    return series.map(_tidy)


In [14]:
# replaces first name with its initial 
# returns None if string on name indexing error
def firstWordInitial(wholeName: str, delimiter: str = ' ') -> str | None:
    splitName = wholeName.split(delimiter)
    try:
        initial = f"{splitName[0][0]}."
        return wholeName.replace(splitName[0], initial, 1)
    except IndexError:
        return None


In [15]:
# Adjust specimen names per request.
# Long-form names mapped to the shortened labels that should appear instead.
_shortenedNames = {
    "Pseudothermotoga hypogea DSM 11164 = NBRC 106472": "Pseudothermotoga hypogea DSM 11164",
    "Candidatus Thermochlorobacteriaceae bacterium GBChlB": "Candidatus Thermochlorobacteriaceae GBChlB",
}

# Work on a copy so `df` keeps the names exactly as they were loaded.
adjustedNamesDF = df.copy()

# "Subject ID" is plotted on the y-axis and "Query ID" on the x-axis;
# both columns receive the same exact-match renames.
for _column in ("Subject ID", "Query ID"):
    for _longName, _shortName in _shortenedNames.items():
        _rowsToRename = adjustedNamesDF[_column] == _longName
        adjustedNamesDF.loc[_rowsToRename, _column] = _shortName

# Only the "Query ID" labels get their first word abbreviated to an initial.
adjustedNamesDF["Query ID"] = adjustedNamesDF["Query ID"].apply(firstWordInitial)

In [16]:
# Build the x-axis ordering: rows are sorted alphabetically on the full
# "Subject ID" first, and only afterwards is the first word of each name
# abbreviated, so the order reflects the unabbreviated species names.
sortedRefOneDF = (
    adjustedNamesDF
    .sort_values("Subject ID")
    .assign(nameAbbreviation=lambda frame: frame["Subject ID"].apply(firstWordInitial))
)

# Distinct abbreviated names in sorted-row order; used as the explicit x-axis sort.
explicitXAxisOrder = sortedRefOneDF["nameAbbreviation"].unique()

In [17]:
# The original heatmaps sometimes have a specific arrangement

# explicitFullNameOrder = [
#     "Streptococcus pyogenes M1 GAS",#
#     "Hirschia baltica ATCC 49814",#
#     "Frateuria aurantia DSM 6220",#
#     "Thermobacillus composti KWC4",#
#     "Coraliomargarita akajimensis DSM 45221",#
#     "Pseudomonas stutzeri RCH2",#
#     "Natronobacterium gregoryi SP2",#
#     "Spirochaeta smaragdinae DSM 11293",#
#     "Clostridium perfringensATCC 13124",#
#     "Salmonella enterica subsp. arizonae serovar 62 z4 z23 - strain RSK2980",#
#     "Segniliparus rotundus DSM 44985",#
#     "Olsenella uli DSM 7084",#
#     "Natronococcus occultus DSM 3396",#
#     "Terriglobus roseus DSM 18391",#
#     "Halovivax ruber XH-70",#
#     "Corynebacterium glutamicum ATCC 13032",#
#     "Desulfosporosinus acidiphilus SJ4 DSM 22704",#
#     "Clostridium thermocellumVPI 7372 ATCC 27405",#
#     "Echinicola vietnamensis DSM 17526",#
#     "Meiothermus silvanus DSM 9946",#
#     "Desulfosporosinus meridiei DSM 13257",#
#     "Desulfotomaculum gibsoniae DSM 7213",#
#     "E.coli K12 ATCC 700926",#
#     "Salmonella bongori NCTC 12419",#
#     "Fervidobacterium pennivorans DSM 9078",#
#     "Nocardiopsis dassonvillei DSM 43111",
# ]
# explicitNameOrder = [
#     "Streptococcus pyogenes",
#     "Hirschia baltica",
#     "Frateuria aurantia",
#     "Thermobacillus composti",
#     "Coraliomargarita akajimensis",
#     "Pseudomonas stutzeri",
#     "Natronobacterium gregoryi",
#     "Spirochaeta smaragdinae",
#     "Clostridium perfringens",
#     "Salmonella enterica",
#     "Segniliparus rotundus",
#     "Olsenella uli",
#     "Natronococcus occultus",
#     "Terriglobus roseus",
#     "Halovivax ruber",
#     "Corynebacterium glutamicum",
#     "Desulfosporosinus acidiphilus",
#     "Clostridium thermocellum",
#     "Echinicola vietnamensis",
#     "Meiothermus silvanus",
#     "Desulfosporosinus meridiei",
#     "Desulfotomaculum gibsoniae",
#     "E.coli",
#     "Salmonella bongori",
#     "Fervidobacterium pennivorans",
#     "Nocardiopsis dassonvillei",
# ]

In [18]:
# x-axis: abbreviated query names, ordered by the explicit list built earlier.
queryAxis = alt.X(
    "Query ID:O",
    title="Reference 16S Query",
    sort=explicitXAxisOrder,
    scale=alt.Scale(paddingInner=0),
    axis=alt.Axis(labelAngle=-45, labelLimit=10000, titleY=190),
)

# y-axis: full subject names.
subjectAxis = alt.Y(
    "Subject ID:O",
    title="Reference 16S Subject",
    scale=alt.Scale(paddingInner=0),
    axis=alt.Axis(labelLimit=10000, titleX=-300),
)

# Shared base chart for both layers.
# (The chart title "BLAST ID Percentage" was removed per request.)
base = (
    alt.Chart(adjustedNamesDF.copy())
    .encode(x=queryAxis, y=subjectAxis)
    .properties(width=1050, height=850)
)

# Layer 1: one rectangle per query/subject pair, coloured by percent identity.
identityColor = alt.Color("Percent identity:Q", scale=alt.Scale(scheme="blues"))
heatmap = base.mark_rect().encode(color=identityColor)

# Layer 2: the rounded percent identity as text; values above 90 use light grey
# text and all others black.
labelColor = alt.condition(
    alt.datum["Percent identity"] > 90,
    alt.value("lightgrey"),
    alt.value("black"),
)
text = base.mark_text(baseline="middle", fontSize=14).encode(
    text=alt.Text("Percent identity:Q", format=",.0f"),
    color=labelColor,
)

heat_plot = heatmap + text

# Last expression of the cell, so the configured chart is what gets displayed.
heat_plot.configure_axis(titleFontSize=23, labelFontSize=13).configure_title(fontSize=30)


  for col_name, dtype in df.dtypes.iteritems():


SchemaValidationError: Invalid specification

        altair.vegalite.v4.schema.channels.Color, validating 'additionalProperties'

        Additional properties are not allowed ('text' was unexpected)
        

alt.LayerChart(...)