In [1]:
import altair as alt
import pandas as pd
import glob

In [2]:
source = pd.read_csv("./results/amanda/hd_real_vs_real/hd_amanda_real_2022-07-20.csv")
df = pd.DataFrame(source)

In [3]:
# calculate hamming dissimilarity percentage
df["Percent"] = round(100*df["Hamming distance"] / df[["Reference 1 sequence length", "Reference 2 sequence length"]].min(axis=1), 1)

In [4]:
# splits the species names by : and returns the left hand side string from :
# replaces underscores with spaces to make name look more naturals to the common person
def splitReplace(series: pd.Series, replace: str, replaceBy: str = ' ', splitBy: str = ':') -> pd.Series:
    return series.map(
        lambda x: x.split(splitBy)[0].replace(replace, replaceBy)
    )


In [5]:
# replaces first name with its initial 
# returns None if string on name indexing error
def firstWordInitial(wholeName: str, delimiter: str = ' ') -> str | None:
    splitName = wholeName.split(delimiter)
    try:
        initial = f"{splitName[0][0]}."
        return wholeName.replace(splitName[0], initial, 1)
    except IndexError:
        return None


TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'

In [6]:
# Adjust specimen names per request
adjustedNamesDF = df.copy()
# splits the species names by : and returns the left hand side string from :
# replaces underscores with spaces to make name look more naturals to the common person
# adjustedNamesDF["Reference 1"] = adjustedNamesDF["Reference 1"].map(
#     lambda x: x.split(":")[0].replace("_", " ")
# )
adjustedNamesDF["Reference 1"] = splitReplace(adjustedNamesDF["Reference 1"], "_")
# y-axis
adjustedNamesDF.loc[adjustedNamesDF["Reference 1"] == "Pseudothermotoga hypogea DSM 11164 = NBRC 106472", "Reference 1"] = "Pseudothermotoga hypogea DSM 11164"
adjustedNamesDF.loc[adjustedNamesDF["Reference 1"] == "Candidatus Thermochlorobacteriaceae bacterium GBChlB", "Reference 1"] = "Candidatus Thermochlorobacteriaceae GBChlB"

# x-axis
adjustedNamesDF["Reference 2"] = splitReplace(adjustedNamesDF["Reference 2"], "_")
adjustedNamesDF.loc[adjustedNamesDF["Reference 2"] == "Pseudothermotoga hypogea DSM 11164 = NBRC 106472", "Reference 2"] = "Pseudothermotoga hypogea DSM 11164"
adjustedNamesDF.loc[adjustedNamesDF["Reference 2"] == "Candidatus Thermochlorobacteriaceae bacterium GBChlB", "Reference 2"] = "Candidatus Thermochlorobacteriaceae GBChlB"

adjustedNamesDF["Reference 2"] = adjustedNamesDF["Reference 2"].apply(firstWordInitial)

NameError: name 'firstWordInitial' is not defined

In [7]:
# Get alphabetically sorted x-axis by species name before first 'name' abbreviation occurs
sortedRefOneDF = adjustedNamesDF.sort_values("Reference 1")
sortedRefOneDF["nameAbbreviation"] = adjustedNamesDF["Reference 1"].apply(firstWordInitial)

explicitXAxisOrder = sortedRefOneDF["nameAbbreviation"].unique()

In [8]:
base = alt.Chart(
    adjustedNamesDF.copy(),
    ).encode(
    x=alt.X(
        "Reference 2:O",
        scale=alt.Scale(paddingInner=0),
        axis=alt.Axis(
            labelAngle=-45,
            labelLimit=10000,
            titleY=170
        ),
        title="Reference 16s",
        sort=explicitXAxisOrder
    ),
    y=alt.Y(
        "Reference 1:O",
        scale=alt.Scale(paddingInner=0),
        title="Reference 16s",
        axis=alt.Axis(
            labelLimit=10000,
            titleX=-275
        )
    )
).properties(
    width= 1050,
    height=850
)

heatmap = base.mark_rect().encode(
    color=alt.Color(
        "Percent:Q",
        scale=alt.Scale(scheme="viridis"),
    )
)

text = base.mark_text(
    baseline="middle",
    fontSize=14
    ).encode(
        text="Percent:Q",
        color=alt.condition(
            alt.datum.Percent < 20,
            alt.value("lightgrey"),
            alt.value("black"),
        )
)

heat_plot = heatmap + text

heat_plot.configure_axis(
    titleFontSize = 23,
    labelFontSize = 13
).configure_title(
    fontSize=30
)
