In [1]:
%run constants.ipynb
%run read_sheet.ipynb

In [2]:
# Read the run arguments from the JSON file named by ARGUMENTS_FILENAME.
# When running this notebook directly (rather than from the parent NB),
# point ARGUMENTS_FILENAME at a suitable file first.
with open(ARGUMENTS_FILENAME, "r") as arguments_file:
    arguments = json.load(arguments_file)

In [59]:
# Fetch columns A:ZZ of the sheet identified by arguments["sheet_id"].
# read_sheet is presumably defined by read_sheet.ipynb (run above) - TODO confirm;
# the cells below treat its result as a pandas DataFrame.
df = read_sheet(arguments["sheet_id"], col_range="A:ZZ")

### Pivot, clean up dates

In [60]:
# Label the first sheet column "metric", use it as the index and transpose, so
# that each metric becomes a column and each former column label becomes a row.
df.columns = ["metric"] + df.columns.tolist()[1:]
df = df.set_index("metric").T.reset_index()
# Rebuild the column axis from a plain list: this drops the "metric" name that
# the transpose left on it.
df.columns = df.columns.tolist()
df = df.rename(columns={"index": "date"})

# Keep only the rows whose label parses as a date, then index and sort by it.
df["date clean"] = pd.to_datetime(df["date"], errors="coerce")
df = df[df["date clean"].notnull()].set_index("date clean").sort_index()

### Medication

In [61]:
# Drop the section-header columns and the columns with a blank (NaN) name,
# limited to the ones that are actually present in df.
bad_cols = [
    candidate
    for candidate in (
        np.nan, "MEDICATION", "OBSERVATIONS", "MENSTRUAL", "SLEEP", "COMMON", np.nan,
    )
    if candidate in df.columns
]
df = df.drop(columns=bad_cols)

In [62]:
# Names of the columns whose label contains ":00". get_daily_dosage and
# get_med_count scan exactly these columns for medication mentions
# (presumably one column per hour of the day - verify against the sheet).
med_times = [c for c in df.columns if ":00" in c]

def get_daily_dosage(row, drug, unit):
    """Sum the doses of `drug` recorded in the `med_times` columns of one row.

    row: row of df (anything indexable by the `med_times` column names)
    drug: string of drug name, matched literally and case-sensitively
    unit: ml, mg (matched literally)

    A dose is a number ("5", "0.25", ".5", "1000"), an optional space, the
    unit, one space and the drug name. Every such mention in every time slot
    is added to the total.

    "500 Ibuprofen, 1000 mg Acetaminophen" -> 1000 (drug Acetaminophen, unit mg)

    Returns the total dose; 0 when the drug is never mentioned with this unit.
    """
    # \d*\.?\d+ accepts single-digit doses such as "5 mg THC" as well as
    # decimals. re.escape keeps names like "THC/CBD" or "CBD oil" from being
    # interpreted as regex syntax.
    dose_pattern = re.compile(
        rf"(\d*\.?\d+) ?{re.escape(unit)} {re.escape(drug)}"
    )
    total_dose = 0
    for med_time in med_times:
        # str() turns empty / NaN cells into text that simply never matches
        observation = str(row[med_time])
        total_dose += sum(float(dose) for dose in dose_pattern.findall(observation))
    return total_dose


def get_med_count(row: dict, value: str):
    """Count the `med_times` slots of `row` whose text contains `value`.

    Each slot is converted to a string and lower-cased before the substring
    check, so `value` is expected to be given in lower case. A slot counts at
    most once, however often `value` occurs in it.
    """
    return sum(
        1
        for slot in med_times
        if value in str(row[slot]).lower()
    )

In [63]:
# One daily-total column per drug, named "<drug> <unit>".
mg_drugs = [
    "Acetaminophen",
    "Ibuprofen",
    "Duloxetine",
    "Naproxen",
    "THC",
    "CBD",  # ignore CBN and CBG for now
]
ml_drugs = [
    "THC/CBD",
    "CBD oil",
]
for unit, drugs in (("mg", mg_drugs), ("ml", ml_drugs)):
    for drug in drugs:
        df[f"{drug} {unit}"] = df.apply(get_daily_dosage, axis=1, args=(drug, unit))

# Number of time slots per day that mention "protab"
df["protab_pills"] = df.apply(get_med_count, axis=1, args=("protab",))

# Calculate mg metrics
# Assume 0.25 ml of 1:1 THC/CBD is equivalent to 3.75 mg THC and 3.75 mg CBD
mg_per_ml = 3.75 / 0.25
mixed_oil_mg = mg_per_ml * df["THC/CBD ml"]

# THC: recorded mg + 1:1 oil + 2 mg for every protab pill
df["THC mg"] = df["THC mg"] + mixed_oil_mg + 2 * df["protab_pills"]
# CBD: recorded mg + 1:1 oil + CBD oil, converted at the same mg-per-ml rate
df["CBD mg"] = df["CBD mg"] + mixed_oil_mg + mg_per_ml * df["CBD oil ml"]

In [64]:
units = ["mg", "mcg", "IU"]

# Collect the names of all meds/supplements mentioned in "Daily in morning"
potential_drugs = set()
for drugs_str in df["Daily in morning"].unique():
    if str(drugs_str) in ["", "nan"]:
        continue
    drugs = [drug.strip() for drug in drugs_str.split(", ")]
    for unit in units:
        # "500 mg Vitamin C" -> "Vitamin C"; entries without this unit are unchanged
        drugs = [drug.split(f" {unit} ")[-1] for drug in drugs]
    # Skip empty names (e.g. from a trailing ", "): they would match every row
    potential_drugs.update(drug for drug in drugs if drug)


# sorted() so the new columns are created in the same order on every run
# (iteration order of a set of strings changes between interpreter sessions).
# NOTE: the "+=" below makes this cell non-idempotent - re-running it without
# re-running the previous cell adds the morning amounts a second time.
for drug in sorted(potential_drugs):
    # Create boolean for whether this drug was taken this day.
    # regex=False: the name is plain text, not a pattern.
    df[f"daily {drug}"] = df["Daily in morning"].str.contains(drug, regex=False)

    # Also create a metric for the exact quantity taken each day
    # daily in morning + throughout the day
    for unit in units:
        # Whole-number amount directly in front of "<unit> <drug>"; the first
        # such mention in the cell is used.
        amount_pattern = rf"(\d+) {re.escape(unit)} {re.escape(drug)}"
        amount = (
            df["Daily in morning"]
            .str.extract(amount_pattern, expand=False)
            .astype(float)
        )
        if amount.isnull().all():
            # this drug is never recorded with this unit
            continue
        amount = amount.fillna(0)
        if f"{drug} {unit}" in df.columns:
            df[f"{drug} {unit}"] += amount
        else:
            df[f"{drug} {unit}"] = amount

## Numeric metrics

In [9]:
# Add a "<column> numeric" copy of every column whose name contains "pain";
# values that are not numbers become NaN.
pain_columns = [name for name in df.columns if "pain" in name]
for pain_metric in pain_columns:
    df[f"{pain_metric} numeric"] = pd.to_numeric(df[pain_metric], errors="coerce")

# Strip the commas from the step counts before parsing them as numbers
steps_without_commas = df["steps"].str.replace(",", "")
df["steps"] = pd.to_numeric(steps_without_commas)

# Pain inferred from pain reported + meds:
# add a point for every 1500 mg Acetaminophen / day and every 4 mg THC
df["pelvic_pain_inferred"] = (
    df["pelvic pain numeric"]
    + df["Acetaminophen mg"] // 1500
    + df["THC mg"] // 4
)

df["day of cycle"] = pd.to_numeric(df["day of cycle"])

In [11]:
# Numeric score for each recorded headache answer
headache_values = {"no": 0, "a bit": 0.5, "yes": 1, "extreme": 2}
for answer, score in headache_values.items():
    df.loc[df["headache"].eq(answer), "headache_numeric"] = score

# Note: we consider missing values to mean "no", since that's how I record it
headache_not_recorded = df["headache"].eq("") | df["headache"].isnull()
df.loc[headache_not_recorded, "headache_numeric"] = 0

# Numeric score for each recorded fatigue answer
fatigue_values = {"ok": 0, "yes": 1, "extreme": 2}

# Note: we ignore missing values, rather than assuming no fatigue.
# Missing entries are relabelled "unknown", which has no score, so they get
# no value assigned in "fatigue_numeric".
df["fatigue"] = df["fatigue"].fillna("unknown")
for answer, score in fatigue_values.items():
    df.loc[df["fatigue"].eq(answer), "fatigue_numeric"] = score


# Numeric score for the bloated-stomach answers; a blank entry counts as 0.
# Other values (including missing ones) are not assigned a score here.
stomach_values = {"yes": 1, "no": 0}
for answer, score in stomach_values.items():
    df.loc[df["bloated stomach"].eq(answer), "bloated_stomach_numeric"] = score

df.loc[df["bloated stomach"].eq(""), "bloated_stomach_numeric"] = 0


# Oura sleep score as qualitative groups
oura_score = pd.to_numeric(df["previous night Oura score"], errors="coerce")
df["previous night Oura score"] = oura_score

oura_bands = [
    ("< 80", oura_score < 80),
    ("80 - 90", (oura_score >= 80) & (oura_score < 90)),
    ("90+", oura_score >= 90),
]
for band_label, in_band in oura_bands:
    df.loc[in_band, "oura_group"] = band_label
# Fixed category order, lowest band first
df["oura_group"] = pd.Categorical(df["oura_group"], ["< 80", "80 - 90", "90+"])

# Midpoint of the score bin each night falls into.
# NOTE(review): pd.cut(bins=5) builds five equal-width bins over the observed
# score range, not five equally populated groups, so these are not quintiles
# in the statistical sense (that would be pd.qcut) - confirm which is intended.
df["oura_quintile"] = pd.cut(oura_score, bins=5).apply(lambda interval: interval.mid)

### Change in pain metrics

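Because the pain metric drifts over long periods, raw values from far-apart dates can't be compared directly; instead we look at the change relative to 7 and 28 days earlier.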

In [12]:
# Difference between the current pelvic pain value and the value N rows earlier.
# NOTE(review): shift() moves by rows, so "N days ago" only holds if df has
# exactly one row per consecutive date - confirm against the sheet.
metric = "pelvic pain numeric"
for n_days_prior in [7, 28]:
    earlier_value = df[metric].shift(n_days_prior)
    df[f"pelvic_pain_vs_{n_days_prior}_days_ago"] = df[metric] - earlier_value

## Which cycle is this?

In [13]:
# Index values ("date clean") of every row whose "day of cycle" equals 1;
# get_nth_cycle counts how many of them fall on or before a given row's date.
first_days = df.loc[df["day of cycle"] == 1].index.values

def get_nth_cycle(row):
    """Return the 1-based number of the cycle that `row` belongs to.

    row: needs a numeric "day of cycle" entry and a "date clean" entry.

    The result is the number of `first_days` entries that are on or before
    the row's "date clean". Rows whose "day of cycle" is NaN get NaN.
    """
    if np.isnan(row["day of cycle"]):
        return np.nan

    row_date = row["date clean"]
    # Every cycle start up to and including this date opens one more cycle
    return sum(1 for cycle_start in first_days if cycle_start <= row_date)

# reset_index() turns the "date clean" index into a regular column so that
# get_nth_cycle can read it from each row; .tolist() discards the resulting
# integer index so the values are assigned back to df by position.
df["nth_cycle"] = df.reset_index().apply(get_nth_cycle, axis=1).tolist()

In [14]:
# True / False where a cycle event was recorded, NaN where none was.
# The comparison result is converted to object dtype first so the column can
# hold NaN next to the booleans; writing NaN into a bool column relies on
# silent upcasting, which pandas has deprecated.
df["peak_day"] = (df["cycle event"] == "peak day").astype(object)
df.loc[df["cycle event"].isnull(), "peak_day"] = np.nan

# the last day of the period is a day of bleeding followed by no bleeding
bleeding_symbols = ["L", "M", "H", "S", "B"]

df["symbol_tomorrow"] = df["symbol"].shift(-1)

bleeding_today = df["symbol"].isin(bleeding_symbols)
bleeding_tomorrow = df["symbol_tomorrow"].isin(bleeding_symbols)
# make sure we're not including random days of spotting in the middle of the cycle
early_in_cycle = df["day of cycle"] < 10
df["last_bleeding_day"] = bleeding_today & ~bleeding_tomorrow & early_in_cycle

# last cycle day is the day before the next nth_cycle, aka before day of cycle is 1
df["day_of_cycle_tomorrow"] = df["day of cycle"].shift(-1)
df["last_cycle_day"] = df["day_of_cycle_tomorrow"] == 1

In [15]:
# marker for whether we fully charted this cycle
# this will exclude cycles where we omitted symbols
# but doesn't exclude the last cycle, where we haven't reached the end yet

# .copy() makes `subset` an independent frame, so the assignments below do not
# trigger SettingWithCopyWarning and can never write through to df.
subset = df[df["day of cycle"].notnull()].copy()
subset["day of cycle"] = subset["day of cycle"].astype(int)
# treat blank symbols as missing so notnull() below does not count them
subset.loc[subset["symbol"] == "", "symbol"] = np.nan

# one row per cycle, one column per day of cycle, cells hold the symbol
table = subset.pivot(
    index="nth_cycle",
    columns="day of cycle",
    values="symbol",
)

# per cycle: number of rows vs. number of rows that have a symbol
day_count = pd.DataFrame(subset["nth_cycle"].value_counts())
symbol_count = pd.DataFrame(table.notnull().sum(axis=1))

merged = day_count.merge(
    symbol_count,
    left_index=True,
    right_index=True,
)
merged.columns = ["days_count", "symbols_count"]

# a cycle is valid when every one of its rows has a symbol
valid_cycles = merged.loc[merged["symbols_count"] == merged["days_count"]].index.values

df["valid_cycle"] = df["nth_cycle"].isin(valid_cycles)

In [16]:
# Write the finished frame, including its "date clean" index, as CSV to the
# path given by arguments["dataframe_outfile"].
df.to_csv(arguments["dataframe_outfile"], index=True)