In [None]:
# Move the kernel's working directory up one level and display the result.
# NOTE(review): presumably this is what makes the project-level `utils` package
# importable in the next cell — confirm.
# NOTE(review): `%cd ..` is relative, so re-running this cell without restarting
# the kernel moves up one more directory each time.
%cd ..
%pwd

In [None]:
from pathlib import Path


import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns

from scipy import stats

# This project's imports
import utils

# Use seaborn's "darkgrid" style for the figures drawn later in the notebook.
sns.set_style("darkgrid")

In [None]:
# Base path taken from the project's `utils.paths` module; later cells join
# "csv/<file name>" onto it to locate the input tables.
# NOTE(review): presumably the repository root — confirm against `utils.paths`.
PROJECT_ROOT = utils.paths.P_ROOT
# Bare expression: shown as the cell output so the resolved path can be checked.
PROJECT_ROOT

## Data Sanity Checks & Descriptive Statistics

In this notebook we will

- Check for outliers
- Assess whether the data meets our prior assumptions about how the brain should look

In [None]:
# Load the age table through the project's I/O helper. Later cells require it to
# have `study_id`, `newborn_scan_age_weeks` and `sixmonth_scan_age_weeks` columns
# (these are asserted in `merge_in_gestational_df`).
df_ages = utils.io.get_gestational_age_df()
# Bare expression: display the table as the cell output.
df_ages

In [None]:
# Newborn session: locate the aseg brain-volume table and load it with zscore=True.
aseg_fname_newborn = PROJECT_ROOT / "csv" / "aseg_newborn_brainvol.table.txt"
df_newborn_zed = utils.io.get_brainvol_df(aseg_fname_newborn, zscore=True)

# Six-month session: same loader, same options.
aseg_fname_sixmonth = PROJECT_ROOT / "csv" / "aseg_sixmonth_brainvol.table.txt"
df_sixmonth_zed = utils.io.get_brainvol_df(aseg_fname_sixmonth, zscore=True)

# Display the six-month frame as the cell output.
df_sixmonth_zed

In [None]:
# One violin panel per session (newborn on top, six-month below), showing the
# distribution of the z-scored `volume` column for every `region`.
fig, axes = plt.subplots(figsize=(12, 6), nrows=2, constrained_layout=True)

iterators = zip(
    axes,
    [df_newborn_zed, df_sixmonth_zed],
    ["newborn", "sixmonth"],
)
for this_ax, this_df, this_label in iterators:
    sns.violinplot(data=this_df, x="region", y="volume", ax=this_ax)
    this_ax.set_title(this_label)
fig.suptitle("Z-score distributions of Brain region Volume", fontweight="bold")
# plt.show() instead of fig.show(): Figure.show() requires a GUI backend and only
# emits a "cannot show the figure" warning under the inline notebook backend.
plt.show()

In [None]:
def merge_in_gestational_df(df, gestational_age_df, session):
    """Attach the scan-age column of one session to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; must have a ``study_id`` column.
    gestational_age_df : pandas.DataFrame
        Age table; must have ``study_id``, ``newborn_scan_age_weeks`` and
        ``sixmonth_scan_age_weeks`` columns.
    session : str
        Either ``"newborn"`` or ``"sixmonth"``; selects which
        ``<session>_scan_age_weeks`` column is merged in.

    Returns
    -------
    pandas.DataFrame
        Inner merge of ``df`` with the selected age column on ``study_id``, so
        rows whose ``study_id`` is absent from the age table are dropped.

    Raises
    ------
    AssertionError
        If a required column is missing or ``session`` is not recognised.
    """
    # Validate the inputs up front so a bad call fails before any merging.
    assert "study_id" in df.columns
    for required_col in ("study_id", "newborn_scan_age_weeks", "sixmonth_scan_age_weeks"):
        assert required_col in gestational_age_df.columns
    assert session in ["newborn", "sixmonth"]

    # Keep only the key and the requested session's age column.
    wanted_cols = ["study_id", f"{session}_scan_age_weeks"]
    age_lookup = gestational_age_df[wanted_cols]
    merged = df.merge(age_lookup, on="study_id", how="inner")
    return merged


In [None]:
# When True, keep only the subjects that appear in BOTH sessions.
LONGITUDINAL_ONLY = True


def _load_session_volumes(aseg_fname, gestational_age_df, session):
    """Load the raw (non z-scored) volumes of one session, indexed by study_id.

    The session's ``<session>_scan_age_weeks`` column is merged in from
    ``gestational_age_df`` and renamed to ``gestational_age_weeks`` so both
    sessions share one column name; a ``session`` label column is added too.
    """
    df_session = utils.io.get_brainvol_df(aseg_fname, zscore=False).reset_index()
    # Add the age at scan from the separate age table
    df_session = merge_in_gestational_df(
        df_session, gestational_age_df=gestational_age_df, session=session
    )
    df_session = df_session.rename(
        columns={f"{session}_scan_age_weeks": "gestational_age_weeks"}
    )
    df_session["session"] = session
    return df_session.set_index("study_id")


# Both sessions go through exactly the same pipeline.
df_newborn = _load_session_volumes(aseg_fname_newborn, df_ages, session="newborn")
df_sixmonth = _load_session_volumes(aseg_fname_sixmonth, df_ages, session="sixmonth")

df_merged = pd.concat([df_newborn, df_sixmonth])
# limit to only the subjects that have both newborn and sixmonth data
if LONGITUDINAL_ONLY:
    in_newborn = df_merged.index.isin(df_newborn.index)
    in_sixmonth = df_merged.index.isin(df_sixmonth.index)
    df_merged = df_merged.loc[in_newborn & in_sixmonth]

In [None]:
# Display the combined newborn + six-month frame (indexed by study_id) for a visual check.
df_merged

In [None]:
# Raw regional volumes per session: the box plot summarises every region/session
# pair and the strip plot overlays the individual rows of `df_merged`.
fig, ax = plt.subplots(figsize=(12, 6), constrained_layout=True)

sns.boxplot(
    data=df_merged,
    x="region",
    y="volume",
    hue="session",
    ax=ax,
)

# dodge=True separates the points by session like the boxes; legend=False keeps
# the strip plot from adding a second set of legend entries.
sns.stripplot(
    data=df_merged,
    x="region",
    y="volume",
    hue="session",
    ax=ax,
    dodge=True,
    alpha=.5,
    legend=False,
)

# n = number of distinct study_id values in the index.
ax.set_title(
    r"Change in Volume ($mm^3$) between visits " + f"(n={df_merged.index.nunique()})",
    fontweight="bold",
)
ax.set_xlabel("Region", fontweight="bold")
# Raw string like the other labels; the previous f-string had no placeholders.
ax.set_ylabel(r"Volume ($mm^3$)", fontweight="bold")
# plt.show() instead of fig.show(): Figure.show() requires a GUI backend and only
# emits a "cannot show the figure" warning under the inline notebook backend.
plt.show()

In [None]:

# Per-region "spaghetti" plot: one line per subject joining its newborn volume
# to its six-month volume. Only drawn for the longitudinal subset.
if LONGITUDINAL_ONLY:
    # Explicit column order so the plotted points always match the tick labels.
    session_order = ["newborn", "sixmonth"]
    regions = df_merged["region"].unique()
    num_cols = len(regions)
    # n_colors makes the palette cycle when there are more regions than colors;
    # otherwise zip() below would silently drop the surplus regions.
    colors = sns.color_palette(n_colors=num_cols)
    # squeeze=False always returns a 2-D array of Axes, so a single region works too.
    fig, axes = plt.subplots(
        figsize=(15, 5), ncols=num_cols, squeeze=False, constrained_layout=True
    )

    for this_region, this_ax, this_color in zip(regions, axes.flat, colors):
        df_region = df_merged.loc[df_merged["region"] == this_region]
        # Rows: study_id (the index); columns: one volume column per session.
        df_region = df_region.pivot(columns="session", values="volume")[session_order]
        df_region.columns.name = None
        # Transposed to (2, n_subjects): matplotlib draws one line per column.
        this_ax.plot(
            df_region.to_numpy().T,
            color=this_color,
            alpha=.7,
            linewidth=.7,
        )
        this_ax.set_title(this_region)
        this_ax.set_xticks([0, 1])
        this_ax.set_xticklabels(session_order)
        this_ax.set_ylabel(r"Volume ($mm^3$)")
    # Count distinct subjects directly instead of reading the loop variable
    # left over from the last region.
    n_subjects = df_merged.index.nunique()
    fig.suptitle(f"Change in Brain volume from 1-6 months (n={n_subjects})", fontweight="bold")
    # plt.show() instead of fig.show(), which only warns under the inline backend.
    plt.show()

In [None]:
# Per-region plot of volume against age at scan: one line per subject joining
# its newborn point to its six-month point. Only drawn for the longitudinal subset.
if LONGITUDINAL_ONLY:
    session_order = ["newborn", "sixmonth"]
    regions = df_merged["region"].unique()
    num_cols = len(regions)
    # n_colors makes the palette cycle when there are more regions than colors;
    # otherwise zip() below would silently drop the surplus regions.
    colors = sns.color_palette(n_colors=num_cols)
    # squeeze=False always returns a 2-D array of Axes, so a single region works too.
    fig, axes = plt.subplots(
        figsize=(15, 5), ncols=num_cols, squeeze=False, constrained_layout=True
    )

    for this_region, this_ax, this_color in zip(regions, axes.flat, colors):
        df_region = df_merged.loc[df_merged["region"] == this_region]

        # Pivot only the column needed (rows: study_id, columns: session) rather
        # than pivoting the whole frame twice and slicing the result.
        ages = df_region.pivot(columns="session", values="gestational_age_weeks")[session_order]
        volumes = df_region.pivot(columns="session", values="volume")[session_order]
        assert len(ages) == len(volumes)

        # Both arrays are (2, n_subjects): matplotlib pairs them column by column,
        # drawing one two-point line per subject.
        this_ax.plot(
            ages.to_numpy().T,
            volumes.to_numpy().T,
            color=this_color,
            alpha=.7,
            linewidth=.7,
            marker="o",
        )
        this_ax.set_title(this_region)
        this_ax.set_xlabel("Gestational Age (Weeks)")
        this_ax.set_ylabel(r"Volume ($mm^3$)")
    # Count distinct subjects directly instead of reading the loop variable
    # left over from the last region.
    n_subjects = df_merged.index.nunique()
    fig.suptitle(f"Change in Brain volume from 1-6 months (n={n_subjects})", fontweight="bold")
    # plt.show() instead of fig.show(), which only warns under the inline backend.
    plt.show()

## Surface Outputs (aparc)

In [None]:
# Locations of the aparc (surface) tables: one CSV per hemisphere (lh / rh)
# and per session (newborn / sixmonth), all under PROJECT_ROOT / "csv".
aparc_lh_newborn = PROJECT_ROOT / "csv" / "aparc_newborn_lh.csv"
aparc_rh_newborn = PROJECT_ROOT / "csv" / "aparc_newborn_rh.csv"

aparc_lh_sixmonth = PROJECT_ROOT / "csv" / "aparc_sixmonth_lh.csv"
aparc_rh_sixmonth = PROJECT_ROOT / "csv" / "aparc_sixmonth_rh.csv"


def get_aparc_long_format(fname):
    """Read an aparc CSV and reshape it from wide to long format.

    ``study_id`` and ``StructName`` are kept as identifier columns. Every
    other column is unpivoted into a ``metric`` column (the former column
    name) and a ``value`` column (its content).
    """
    wide = pd.read_csv(fname)
    identifier_cols = ["study_id", "StructName"]
    long = pd.melt(wide, id_vars=identifier_cols, var_name="metric")
    return long


def get_aparc_all_hemisphere(fname_lh, fname_rh):
    """Load both hemispheres' aparc tables into one long-format frame.

    Each file is read with ``get_aparc_long_format``, tagged with a
    ``hemisphere`` column (``"lh"`` or ``"rh"``), and has the matching
    ``"lh-"`` / ``"rh-"`` prefix stripped from ``StructName`` so the same
    structure carries the same name in both hemispheres.

    Parameters
    ----------
    fname_lh, fname_rh
        Paths of the left- and right-hemisphere CSV files.

    Returns
    -------
    pandas.DataFrame
        Left-hemisphere rows followed by right-hemisphere rows. The original
        row labels are kept, so index values repeat across hemispheres.
    """
    per_hemisphere = []
    for hemisphere, fname in (("lh", fname_lh), ("rh", fname_rh)):
        df_hemi = get_aparc_long_format(fname)
        df_hemi["hemisphere"] = hemisphere
        # Anchor with ^ so only a leading "lh-"/"rh-" is removed, never the same
        # text inside a name. regex=True is explicit because the default of
        # Series.str.replace differs between pandas versions.
        df_hemi["StructName"] = df_hemi["StructName"].str.replace(
            f"^{hemisphere}-", "", regex=True
        )
        per_hemisphere.append(df_hemi)
    df_hemis = pd.concat(per_hemisphere)
    return df_hemis

# Newborn session: both hemispheres, labelled with the session name.
df_hemis = get_aparc_all_hemisphere(fname_lh=aparc_lh_newborn, fname_rh=aparc_rh_newborn)
df_hemis["session"] = "newborn"

# Six-month session: same treatment.
df_hemis_6mo = get_aparc_all_hemisphere(fname_lh=aparc_lh_sixmonth, fname_rh=aparc_rh_sixmonth)
df_hemis_6mo["session"] = "sixmonth"

# Stack the two sessions into one frame (newborn rows first).
df_all = pd.concat([df_hemis, df_hemis_6mo])

In [None]:
# Display only the rows whose metric is the surface area.
df_all.loc[df_all["metric"].eq("SurfArea")]

In [None]:
if LONGITUDINAL_ONLY:
    # Let's limit to infants at both time points just for sanity purposes
    in_newborn = df_all["study_id"].isin(df_hemis["study_id"])
    in_sixmonth = df_all["study_id"].isin(df_hemis_6mo["study_id"])
    df_all = df_all.loc[in_newborn & in_sixmonth]

# One facet per structure: surface-area distribution per session, with the two
# hemispheres drawn as the two halves of each split violin.
g = sns.FacetGrid(
    data=df_all.loc[df_all["metric"] == "SurfArea"],
    col="StructName",
    col_wrap=5,
)
# order / hue_order are given explicitly because the plotting function is called
# once per facet and would otherwise infer the category positions and hue levels
# from each facet's own subset. Fixing them keeps all facets consistent and
# ensures split=True always sees exactly two hue levels.
g.map_dataframe(
    sns.violinplot,
    x="session",
    y="value",
    hue="hemisphere",
    order=["newborn", "sixmonth"],
    hue_order=["lh", "rh"],
    split=True,
)

# Pair Plots between Anatomical data and Bx Measures

## Parenting Sensitivity
### TODO: Yanbin share sensitivity data with Scott

## 6-month Adult word Count (From LENA Data)

## 6-Month Temperament Assessment

## 6-month Cognitive Assessment

## Q: How do early life experiences affect brain development?

### I.e. language exposure

#### LENA data at 6 months as a proxy. 2 days of recordings at 6 months. LENA outputs adult word count (how often they spoke). Maybe we want to control for recording lengths.

## Things we need to account for (in order of importance)

### Notes: We covary/use corrected age (age from due date)

### Same for gestational age at birth

### Birth weight

### Biological Sex




## TODO: Scott. Get LENA data. 

## 