In [1]:
%run constants.ipynb

In [60]:
def pivot_and_clean_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Pivot a metric-per-row sheet so that we have a row for every day.

    The first column of ``df`` is renamed to "metric" and becomes the header
    of the pivoted frame; every other column label is treated as a candidate
    date.  Labels that cannot be parsed as dates are dropped.

    The caller's frame is left untouched; a new frame is returned, indexed by
    the parsed date ("date clean", sorted ascending) and still carrying the
    raw label in a "date" column.
    """
    # Work on a copy: renaming columns / setting the index in place would
    # silently alter the frame the caller passed in.
    sheet = df.copy()
    sheet.columns = ["metric"] + sheet.columns.tolist()[1:]

    pivoted = sheet.set_index("metric").T.reset_index()
    # Re-assigning the labels as a plain list drops the leftover "metric"
    # name from the columns axis.
    pivoted.columns = pivoted.columns.tolist()
    pivoted = pivoted.rename(columns={"index": "date"})

    # Unparseable labels (section headers, notes, ...) become NaT and are removed.
    pivoted["date clean"] = pd.to_datetime(pivoted["date"], errors="coerce")
    pivoted = pivoted[pivoted["date clean"].notnull()]
    return pivoted.set_index("date clean").sort_index()

In [61]:
def remove_empty_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without section-header columns and blank (NaN) column names."""
    header_labels = (
        np.nan,
        "MEDICATION",
        "OBSERVATIONS",
        "MENSTRUAL",
        "SLEEP",
        "COMMON",
        "DIET",
        np.nan,
    )

    # Only ask pandas to drop labels that are actually present.
    present = []
    for label in header_labels:
        if label in df.columns:
            present.append(label)

    return df.drop(columns=present, axis=1)

In [None]:
def make_metrics_numeric(df: pd.DataFrame, numeric_columns: list[str]) -> pd.DataFrame:
    """Convert the given columns to numbers, in place, and return ``df``.

    Commas (thousands separators) are stripped first; anything that still
    cannot be parsed becomes NaN.
    """
    for column in numeric_columns:
        without_commas = df[column].astype(str).str.replace(",", "")
        df[column] = pd.to_numeric(without_commas, errors="coerce")

    return df

In [62]:
# Regex fragments shared by the medication helpers below.
_DOSE_PATTERN = r"\d+(?:\.\d+)?"  # "5", "1000", "0.25"
_DRUG_NAME_PATTERN = r"([a-zA-Z ]+/?[a-zA-Z ]+)"  # "Ibuprofen", "CBD oil", "THC/CBD"


def _med_time_labels(labels) -> list[str]:
    """Keep only the labels that name an hourly medication slot (contain ":00").

    Update this if you track your medications differently; every helper in
    this cell goes through it.  Non-string labels (e.g. NaN from a blank
    header cell) are skipped instead of raising ``TypeError``.
    """
    return [label for label in labels if isinstance(label, str) and ":00" in label]


def get_med_times(df: pd.DataFrame) -> list[str]:
    """
    Columns that correspond to medication usage, excluding daily morning meds.
    """
    return _med_time_labels(df.columns)


def clean_drugs(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the medication cells in place and return ``df``.

    Replaces Nexium with its chemical name (Esomeprazole) and removes
    surrounding whitespace.  Cells are cast to ``str`` first, so missing
    values become the literal string "nan".
    """
    cols = get_med_times(df) + ["Daily in morning"]
    for col in cols:
        as_text = df[col].astype(str)
        df[col] = as_text.str.replace("Nexium", "Esomeprazole", regex=False).str.strip()

    return df


def get_potential_drugs(df: pd.DataFrame, units=None) -> set:
    """Get full list of hourly and daily drugs.

    df: frame with the hourly medication columns and "Daily in morning"
    units: dosage units to look for; defaults to the notebook-level DRUG_UNITS

    Returns the set of drug names mentioned anywhere in those columns,
    whether or not a dosage was recorded alongside them.
    """
    if units is None:
        units = DRUG_UNITS

    cols = get_med_times(df) + ["Daily in morning"]
    potential_drugs = set()

    for col in cols:
        entries = df[col].astype(str)

        # Drugs without recorded dosage: a name at the start of the cell or
        # right after a ", " separator.
        undosed = entries.str.extractall(rf"(?:^|, ){_DRUG_NAME_PATTERN}")[0]
        potential_drugs.update(undosed.unique().tolist())

        # Drugs with a known dosage: "<number> <unit> <name>".  The number
        # may be a single digit or a decimal ("5 mg", "0.25 ml").
        for unit in units:
            dosed = entries.str.extractall(
                rf"{_DOSE_PATTERN} ?{re.escape(unit)} {_DRUG_NAME_PATTERN}"
            )[0]
            potential_drugs.update(dosed.unique().tolist())

    # Missing cells were stringified to "nan", which looks like a drug name.
    potential_drugs.discard("nan")

    return potential_drugs


def get_daily_dosage(row, drug, unit):
    """
    Total dose of one drug, in one unit, across a day's hourly columns.

    row: row of df
    drug: string of drug name
    unit: ml, mg

    "500 Ibuprofen, 1000 mg Acetaminophen" -> 1000

    Only the first mention of the drug in each cell is counted.
    """
    dose_pattern = re.compile(
        rf"({_DOSE_PATTERN}) ?{re.escape(unit)} {re.escape(drug)}"
    )
    total_dose = 0
    # Take the column labels from the row itself rather than from a global frame.
    for med_time in _med_time_labels(row.keys()):
        observation = str(row[med_time])
        mentions = dose_pattern.findall(observation)
        if mentions:
            total_dose += float(mentions[0])
    return total_dose


def get_med_count(row: dict, value: str):
    """Count the hourly cells of ``row`` whose lower-cased text contains ``value``."""
    total = 0
    # Take the column labels from the row itself rather than from a global frame.
    for med_time in _med_time_labels(row.keys()):
        observation = str(row[med_time])
        if value in observation.lower():
            total += 1
    return total

In [63]:
def sum_hourly_medications(df: pd.DataFrame) -> pd.DataFrame:
    """Sum total medication usage per day, from hourly columns.

    Every drug/unit combination found by ``get_potential_drugs`` gets a
    "<drug> <unit>" column, but only when its total over the whole frame is
    positive.  "protab_pills" counts the hourly cells mentioning "protab".
    Finally "THC mg" and "CBD mg" are (re)computed so that they also include
    the oil and pill intake; inputs that were never recorded count as zero.

    The frame is modified in place and returned.
    """
    potential_drugs = get_potential_drugs(df)
    for unit in DRUG_UNITS:
        for drug in potential_drugs:
            daily_dosage = df.apply(get_daily_dosage, axis=1, args=(drug, unit))
            if daily_dosage.sum() > 0:
                df[f"{drug} {unit}"] = daily_dosage

    if "Protab pill" in potential_drugs:
        df["protab_pills"] = df.apply(get_med_count, axis=1, args=("protab",))

    def _recorded(column: str):
        # A column only exists when that drug was actually taken; treat a
        # missing one as "none taken" instead of raising KeyError.
        return df[column] if column in df.columns else 0.0

    # Calculate mg metrics
    # Assume 0.25 ml of 1:1 THC/CBD is equivalent to 3.75 mg THC and 3.75 mg CBD
    mg_per_ml = 3.75 / 0.25
    df["THC mg"] = _recorded("THC mg") + (mg_per_ml * _recorded("THC/CBD ml"))
    # Each counted Protab mention adds 2 to "THC mg" (presumably 2 mg THC per pill).
    df["THC mg"] = df["THC mg"] + (2 * _recorded("protab_pills"))
    df["CBD mg"] = _recorded("CBD mg") + (mg_per_ml * _recorded("THC/CBD ml"))
    df["CBD mg"] = df["CBD mg"] + (mg_per_ml * _recorded("CBD oil ml"))

    return df

In [64]:
def add_daily_medications(df: pd.DataFrame) -> pd.DataFrame:
    """Process the 'Daily in morning' column.

    For every drug returned by ``get_potential_drugs``:

    * "daily <drug>" flags whether the morning cell mentions the drug;
    * "<drug> <unit>" receives the morning dose, added on top of the hourly
      total when that column already exists.

    The frame is modified in place and returned.
    """
    import re  # local import: this function did not previously depend on ``re``

    morning = df["Daily in morning"]

    for drug in get_potential_drugs(df):
        # Create boolean for whether this drug was taken this day.
        # Plain substring match: the drug name is text, not a regex.
        df[f"daily {drug}"] = morning.str.contains(drug, regex=False)

        # Also create a metric for the exact quantity taken each day
        # daily in morning + throughout the day
        for unit in DRUG_UNITS:
            # Capture the whole number, including decimals, so that
            # "2.5 mg X" is read as 2.5 rather than 5.
            pattern = rf"(\d+(?:\.\d+)?) ?{re.escape(unit)} {re.escape(drug)}"
            amount = morning.str.extract(pattern)[0].astype(float)
            if amount.isnull().all():
                # This drug never appears with this unit in the morning column.
                continue
            amount = amount.fillna(0)

            column = f"{drug} {unit}"
            if column in df.columns:
                df[column] += amount
            else:
                df[column] = amount

    return df

In [None]:
def get_nth_cycle(df: pd.DataFrame) -> pd.DataFrame:
    """Number each charted day by the cycle it belongs to.

    "day of cycle" is coerced to numeric in place.  "nth_cycle" is the count
    of cycle starts (rows where "day of cycle" == 1) dated on or before the
    row's own date, or NaN when the row has no day of cycle.  The frame's
    index must be named "date clean", because rows are looked up by that
    name after ``reset_index``.
    """
    df["day of cycle"] = pd.to_numeric(df["day of cycle"], errors="coerce")

    cycle_starts = df.loc[df["day of cycle"] == 1].index.values

    def _count_started_cycles(record: dict[str, float]) -> float:
        if np.isnan(record["day of cycle"]):
            return np.nan

        # How many cycles had already begun by this date (this one included)?
        return sum(1 for start in cycle_starts if start <= record["date clean"])

    flattened = df.reset_index()
    per_row = flattened.apply(_count_started_cycles, axis=1)
    df["nth_cycle"] = per_row.tolist()

    return df


def get_last_bleeding_day(df: pd.DataFrame) -> pd.DataFrame:
    """Flag the final day of each period.

    A row is a last bleeding day when its symbol is a bleeding symbol, the
    next row's symbol is not, and it falls before day 10 of the cycle.
    Adds "symbol_tomorrow" and "last_bleeding_day" to ``df`` in place.
    """
    bleeding_symbols = ["L", "M", "H", "S", "B"]

    df["symbol_tomorrow"] = df["symbol"].shift(-1)

    bleeds_today = df["symbol"].isin(bleeding_symbols)
    bleeds_tomorrow = df["symbol_tomorrow"].isin(bleeding_symbols)
    # The day limit keeps random mid-cycle spotting from counting as a period end.
    early_in_cycle = df["day of cycle"] < 10

    df["last_bleeding_day"] = bleeds_today & ~bleeds_tomorrow & early_in_cycle
    return df


def get_valid_cycles(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate boolean for whether we correctly charted this cycle.

    A cycle is valid when every one of its charted days (rows with a
    "day of cycle") has a non-empty symbol.  This will exclude cycles where
    we omitted symbols, but doesn't exclude the last cycle, where we haven't
    reached the end yet.

    Adds "valid_cycle" to ``df`` in place (and may set the last row's symbol
    to "U") and returns ``df``.
    """
    # Fill in the last day's symbol as unknown if we haven't filled it out yet.
    # Positional access (.iloc) is used because the frame is indexed by date.
    if not df.empty:
        last_day_charted = pd.notnull(df["day of cycle"].iloc[-1])
        if last_day_charted and df["symbol"].iloc[-1] == "":
            df.iloc[-1, df.columns.get_loc("symbol")] = "U"

    # Independent copy, so blanking symbols below never touches ``df``.
    charted = df.loc[df["day of cycle"].notnull(), ["nth_cycle", "symbol"]].copy()
    # An empty string means "nothing recorded": treat it as missing.
    charted["symbol"] = charted["symbol"].mask(charted["symbol"] == "")

    symbols_by_cycle = charted.groupby("nth_cycle")["symbol"]
    days_count = symbols_by_cycle.size()  # charted days per cycle
    symbols_count = symbols_by_cycle.count()  # days with a recorded symbol
    fully_charted = (symbols_count == days_count).to_numpy()
    valid_cycles = days_count.index[fully_charted]

    df["valid_cycle"] = df["nth_cycle"].isin(valid_cycles)
    return df


def process_menstrual_cycles(df: pd.DataFrame) -> pd.DataFrame:
    """Derive the menstrual-cycle columns.

    Adds "peak_day" (True/False, or NaN where "cycle event" is missing), then
    runs ``get_nth_cycle``, ``get_last_bleeding_day`` and ``get_valid_cycles``
    in that order and returns the resulting frame.
    """
    events = df["cycle event"]
    peak_day = events == "peak day"
    unknown = events.isnull()
    if unknown.any():
        # A bool column cannot hold NaN; switch to object dtype explicitly
        # rather than relying on an implicit upcast during assignment.
        peak_day = peak_day.astype(object)
        peak_day.loc[unknown] = np.nan
    df["peak_day"] = peak_day

    df = get_nth_cycle(df)
    df = get_last_bleeding_day(df)
    df = get_valid_cycles(df)

    return df

### Custom to how I track metrics

- Inferring pelvic pain given medication
- Categorical metrics' mapping to quantitative scores
- Oura sleep score grouping

In [None]:
def get_pelvic_pain_inferred(df: pd.DataFrame) -> pd.DataFrame:
    """Add "pelvic_pain_inferred": reported pain plus points for medication.

    Starts from "pelvic pain" and adds one point for every full 1500 mg of
    Acetaminophen per day and one for every full 4 mg of THC.  A medication
    column that is absent from ``df`` contributes nothing.  The frame is
    modified in place and returned.
    """
    # Pain inferred from pain reported + meds
    mg_per_point = {
        "Acetaminophen mg": 1500,
        "THC mg": 4,
    }

    inferred = df["pelvic pain"].copy()
    for column, threshold in mg_per_point.items():
        # Dose columns only exist when the drug was actually recorded.
        if column in df.columns:
            inferred = inferred + df[column] // threshold

    df["pelvic_pain_inferred"] = inferred
    return df

In [None]:
def get_metric_change(
    df: pd.DataFrame,
    metric: str,
    column_prefix: str = "pelvic_pain",
    lags: tuple = (7, 28),
) -> pd.DataFrame:
    """Add columns holding the change in ``metric`` versus earlier rows.

    df: frame with one row per day
    metric: column to compare
    column_prefix: prefix of the output columns; the default keeps the
        historical "pelvic_pain_vs_<n>_days_ago" names, so pass another
        prefix when comparing a different metric
    lags: how many rows back to compare against

    The comparison shifts by rows, so "<n> days ago" is only accurate when
    the frame has exactly one row per calendar day.  The frame is modified
    in place and returned.
    """
    values = df[metric]
    for n_days_prior in lags:
        # Shift only the metric itself; the rest of the frame is not needed.
        df[f"{column_prefix}_vs_{n_days_prior}_days_ago"] = values - values.shift(n_days_prior)
    return df

In [11]:
def map_categorical_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Add "<metric>_numeric" score columns for the categorical metrics.

    Covers "headache", "fatigue", "uncomfortable stomach" and
    "visibly bloated stomach".  Values without a score below stay NaN.
    The frame is modified in place and returned.
    """

    def _score(source: str, target: str, scores: dict) -> None:
        # Write each category's score into ``target`` for the matching rows.
        for label, points in scores.items():
            df.loc[df[source] == label, target] = points

    # Headache: an empty or missing entry means "no", since that's how it is recorded.
    _score(
        "headache",
        "headache_numeric",
        {"no": 0, "a bit": 0.5, "yes": 1, "extreme": 2, "": 0},
    )
    df.loc[df["headache"].isnull(), "headache_numeric"] = 0

    # Fatigue: missing entries are labelled "unknown" and get no score,
    # rather than being assumed to mean no fatigue.
    df.loc[df["fatigue"].isnull(), "fatigue"] = "unknown"
    _score("fatigue", "fatigue_numeric", {"ok": 0, "yes": 1, "extreme": 2})

    # Stomach metrics: yes/no, with an empty entry counted as "no".
    for stomach_metric in ("uncomfortable stomach", "visibly bloated stomach"):
        _score(stomach_metric, f"{stomach_metric}_numeric", {"yes": 1, "no": 0, "": 0})

    return df

In [None]:
def get_oura_groups(df: pd.DataFrame) -> pd.DataFrame:
    """Bucket the previous night's Oura score.

    Coerces "previous night Oura score" to numeric in place, then adds:

    * "oura_group": categorical with the categories "< 80", "80 - 90", "90+";
    * "oura_quintile": the midpoint of the row's bin when the observed score
      range is cut into five equal-width bins.
    """
    score_column = "previous night Oura score"
    df[score_column] = pd.to_numeric(df[score_column], errors="coerce")
    score = df[score_column]

    # Oura sleep score as qualitative groups
    labelled_masks = (
        ("< 80", score < 80),
        ("80 - 90", (score >= 80) & (score < 90)),
        ("90+", score >= 90),
    )
    for label, mask in labelled_masks:
        df.loc[mask, "oura_group"] = label
    group_order = [label for label, _ in labelled_masks]
    df["oura_group"] = pd.Categorical(df["oura_group"], group_order)

    # quintiles
    def _midpoint(interval):
        return interval.mid

    df["oura_quintile"] = pd.cut(score, bins=5).apply(_midpoint)

    return df