### Setup environment

In [None]:
# Import packages
import dxpy
import subprocess
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# Local imports
import phenofhy

# Logging: uncomment the disable() call below to silence all log output
import logging
# logging.disable(logging.CRITICAL)

### Initialize Spark

In [None]:
# Get or create the Spark session used by this notebook, with Arrow-based
# pandas conversion switched on and an explicit Kryo buffer ceiling.
spark = (
    SparkSession.builder
    .appName("Phenotype Analysis")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.kryoserializer.buffer.max", "128")
    .getOrCreate()
)

### Load and preprocess data

In [None]:
# Ask phenofhy to load the field list and write its output to the metadata CSV.
# The input is given as a platform file ID (looks like a DNAnexus ID — TODO confirm);
# the config key "HSE_COMPARISON_PHENOTYPES" was the previously used alternative.
field_list_source = "file-J4v66K92gbP5g09BF51g85B2"
field_metadata_csv = "outputs/intermediate/table_s4_hse_comparison_phenotypes_metadata.csv"

phenofhy.load.field_list(
    input_file=field_list_source,
    output_file=field_metadata_csv,
)

In [None]:
# Single source of truth for the generated query file: it is written by the
# first call and read back by the second.
raw_values_sql = "outputs/raw/table_s4_hse_comparison_phenotypes_raw_values_query.sql"

# Write the extraction query for the fields listed in the metadata CSV.
# sql_only=True is passed, so presumably only the SQL file is produced here.
phenofhy.extract.fields(
    input_file="outputs/intermediate/table_s4_hse_comparison_phenotypes_metadata.csv",
    output_file=raw_values_sql,
    cohort_key="FULL_SAMPLE_ID",  # alternative cohort key: "TEST_COHORT_ID"
    sql_only=True,
)

# Run the saved query and keep the result for the processing steps below.
raw_questionnaire_df = phenofhy.extract.sql_to_pandas(raw_values_sql)

In [None]:
# First processing stage: run phenofhy's participant-field processing on the
# raw extract. The result is the input to the questionnaire stage.
parc_df = phenofhy.process.participant_fields(raw_questionnaire_df)

In [None]:
# Second processing stage: questionnaire fields, with derive='auto' so the
# library chooses which derived variables to add (exact set not shown here).
questionnaire_df = phenofhy.process.questionnaire_fields(parc_df, derive="auto")

In [None]:
# Third processing stage: clinic measurements, explicitly requesting the two
# derived BMI variables that the summaries below rely on.
clinic_derivations = ["bmi", "bmi_status"]
pheno_df = phenofhy.process.clinic_measurements_fields(
    questionnaire_df,
    derive=clinic_derivations,
)

### Analysis

#### Compute sex-stratified sample sizes

In [None]:
# Sample size per value of the raw participant sex field
# (value_counts leaves out missing values by default).
pheno_df['participant.demog_sex_2_1'].value_counts()

In [None]:
# Sample size per value of the derived sex variable, for comparison with the
# raw field (value_counts leaves out missing values by default).
pheno_df['derived.sex'].value_counts()

#### Compute anthropometric measures and smoking initiation summary with smaller sample

- Remove outliers

In [None]:
# Blank out implausible clinic measurements (set to NaN) while keeping the row.
# A value counts as implausible when it is at or above the limit listed here;
# units are those of the source columns — TODO confirm against the data dictionary.
implausible_at_or_above = {
    'clinic_measurements.height': 240,
    'clinic_measurements.weight': 250,
    'clinic_measurements.waist': 180,
}

# Flag the outliers before masking so the flags can be reused below.
# Comparisons with NaN are False, so re-running this cell changes nothing.
implausible_flags = {
    column: pheno_df[column] >= limit
    for column, limit in implausible_at_or_above.items()
}

for column, is_implausible in implausible_flags.items():
    pheno_df[column] = pheno_df[column].mask(is_implausible)

# The BMI variables were derived before this cleaning step (presumably from
# height and weight), so they would otherwise still carry values computed from
# the implausible inputs removed above. Blank them for the same rows.
bmi_inputs_implausible = (
    implausible_flags['clinic_measurements.height']
    | implausible_flags['clinic_measurements.weight']
)
for column in ('derived.bmi', 'derived.bmi_status'):
    if column in pheno_df.columns:
        pheno_df[column] = pheno_df[column].mask(bmi_inputs_implausible)

- Compute sex-stratified summary

In [None]:
# List the distinct codes present in the derived sex variable (including NaN
# if present); these are the values the sex filter further down selects from.
pheno_df['derived.sex'].unique()

#### Filter

##### by clinic measurements availability

In [None]:
# Keep only participants with a recorded weight. Weights blanked as
# implausible in the outlier step are NaN, so those rows are dropped here too.
res_df = pheno_df.dropna(subset=['clinic_measurements.weight'])

##### by sex

In [None]:
# Restrict to the listed derived-sex codes. All four codes are currently kept;
# narrow the list to analyse one sex (per the original note 1 = males and
# 2 = females; the meaning of 3 and 4 is not shown here — TODO confirm).
sex_codes = [1, 2, 3, 4]
res_df = res_df[res_df['derived.sex'].isin(sex_codes)]

##### by ethnicity

In [None]:
# Ethnicity codes to keep. Only the White groups are active; uncomment other
# codes to include them.
ethnicity_codes = [
    # 10,  # Asian or Asian British – Indian
    # 11,  # Asian or Asian British – Pakistani
    # 12,  # Asian or Asian British – Bangladeshi
    # 13,  # Chinese
    # 14,  # Any other Asian/Asian British background

    # 15,  # Black or Black British – African
    # 16,  # Black or Black British – Caribbean
    # 17,  # Any other Black / African / Caribbean background

    1,   # White – English / Welsh / Scottish / Northern Irish / British
    2,   # White – Irish
    3,   # White – Gypsy or Irish Traveller
    4,   # White – Polish
    5,   # Any other white background
]

keep_ethnicity = res_df['participant.demog_ethnicity_1_1'].isin(ethnicity_codes)
res_df = res_df[keep_ethnicity]

#### Compute anthropometric measures

In [None]:
# Continuous traits to summarise, listed in the order they should appear.
traits = [
    "derived.bmi",
    "clinic_measurements.height",
    "clinic_measurements.weight",
    "clinic_measurements.waist",
    "derived.smoke_reg_first_age",  # toggle off/on if using clinic_res_df
]

# Whole-sample summary only (no stratification), one entry per variable.
out_whole = phenofhy.calculate.summary(
    res_df,
    traits=traits,
    stratify=None,
    granularity="variable",
)
num_df = out_whole["numeric"]

# Rank of each variable = its position in `traits`.
coding_order = traits
omap = dict(zip(coding_order, range(len(coding_order))))

# One row per (trait, coding_name) pair, taking the first value of each column.
deduped = (
    num_df
    .pivot_table(index=["trait", "coding_name"], aggfunc="first")
    .reset_index()
)

# Order rows by the rank of their coding_name; unlisted names go last.
ordered = deduped.sort_values(
    by="coding_name",
    key=lambda names: names.map(omap).fillna(len(omap)),
)

wide = ordered[["trait", "count", "mean", "std"]].set_index("trait")

# Put the statistics under a top-level "whole" column for consistency with
# the earlier wide format.
wide.columns = pd.MultiIndex.from_product([["whole"], wide.columns])

display(wide)

#### Compute lifestyle characteristics summary

In [None]:
# --- TRAITS / SETTINGS ---
# Categorical traits to summarise for the lifestyle-characteristics table.
traits = [
    "questionnaire.alcohol_curr_1_1",
    "derived.smoke_status",
    "derived.vape_status",
    "questionnaire.health_status_curr_1_1",
    "questionnaire.health_status_chronic_1_1",
    "derived.walk_16_10",
    "derived.bmi_status",
]

metrics = ["count", "percent"]

# --- COLLECT SUMMARIES (categorical), WHOLE SAMPLE ONLY ---
out_whole = phenofhy.calculate.summary(
    res_df,
    traits=traits,
    # stratify='derived.age_group',
    granularity="category",
    label_mode="labels",
)

cat_df = out_whole["categorical"].copy()

# --- normalise both key columns to strings ---
# In this frame `coding_name` holds the variable name and `trait` holds the
# category label, so the "nan" entries in order_keys are matched against
# `trait`. Normalising only `coding_name` would leave a real NaN label
# unmatched, and pivot_table would drop it as a missing index key.
cat_df["coding_name"] = cat_df["coding_name"].fillna("nan").astype(str)
# astype(object) first so fillna also works if the labels are categorical.
cat_df["trait"] = cat_df["trait"].astype(object).fillna("nan").astype(str)

# --- ORDERING PAIRS ---
# (variable name, category label) pairs in the order rows should be displayed.
order_keys = [
    ("questionnaire.alcohol_curr_1_1","Daily or almost daily"),
    ("questionnaire.alcohol_curr_1_1","Three or four times a week"),
    ("questionnaire.alcohol_curr_1_1","Once or twice a week"),
    ("questionnaire.alcohol_curr_1_1","One to three times a month"),
    ("questionnaire.alcohol_curr_1_1","Special occasions only"),
    ("questionnaire.alcohol_curr_1_1","Never"),
    ("questionnaire.alcohol_curr_1_1","Prefer not to answer"),
    ("questionnaire.alcohol_curr_1_1","nan"),

    ("derived.smoke_status","Current"),
    ("derived.smoke_status","Former"),
    ("derived.smoke_status","Never"),
    ("derived.smoke_status","Unknown"),
    ("derived.smoke_status","nan"),

    ("derived.vape_status","Ever used"),
    ("derived.vape_status","Never used"),
    ("derived.vape_status","Prefer not to answer"),
    ("derived.vape_status","nan"),

    ("questionnaire.health_status_curr_1_1","Excellent"),
    ("questionnaire.health_status_curr_1_1","Good"),
    ("questionnaire.health_status_curr_1_1","Fair"),
    ("questionnaire.health_status_curr_1_1","Poor"),
    ("questionnaire.health_status_curr_1_1","Prefer not to answer"),
    ("questionnaire.health_status_curr_1_1","Do not know"),
    ("questionnaire.health_status_curr_1_1","nan"),

    ("questionnaire.health_status_chronic_1_1","Yes"),
    ("questionnaire.health_status_chronic_1_1","No"),
    ("questionnaire.health_status_chronic_1_1","Prefer not to answer"),
    ("questionnaire.health_status_chronic_1_1","Do not know"),
    ("questionnaire.health_status_chronic_1_1","nan"),

    ("derived.walk_16_10","Meets threshold"),
    ("derived.walk_16_10","Below threshold"),
    ("derived.walk_16_10","nan"),

    ("derived.bmi_status","Underweight"), # toggle off/on if using clinic_res_df
    ("derived.bmi_status","Normal"),
    ("derived.bmi_status","Overweight"),
    ("derived.bmi_status","Obese"),
    ("derived.bmi_status","nan"),
]

In [None]:
# Collapse the two non-informative answers into one category for the two
# questionnaire health traits, then rebuild the display-order lookup.
collapse_traits = [
    "questionnaire.health_status_curr_1_1",
    "questionnaire.health_status_chronic_1_1"
]

collapse_values = ["Prefer not to answer", "Do not know"]
merged_label = "Prefer not to answer / Do not know"

collapsed_df_list = []

for trait_name in collapse_traits:
    # Variable names are stored in `coding_name` ...
    df_t = cat_df[cat_df["coding_name"] == trait_name].copy()

    # ... and the category labels to merge are stored in `trait`.
    mask = df_t["trait"].isin(collapse_values)

    # Nothing to merge on a re-run: the labels have already been replaced.
    if mask.any():
        merged_count = df_t.loc[mask, "count"].sum()
        total_count = df_t["count"].sum()
        # NOTE(review): the denominator is every row of this trait, including
        # any "nan" row — confirm this matches how the summary's own
        # `percent` column is computed.
        merged_percent = (merged_count / total_count) * 100

        # Create merged row (columns not listed here are left missing)
        merged_row = pd.DataFrame([{
            "trait": merged_label,
            "coding_name": trait_name,
            "count": merged_count,
            "percent": merged_percent,
        }])

        # Keep unmerged rows + merged row
        df_t = pd.concat([df_t[~mask], merged_row], ignore_index=True)

    collapsed_df_list.append(df_t)

# Keep other traits unchanged and append collapsed groups
cat_df = pd.concat(
    [cat_df[~cat_df["coding_name"].isin(collapse_traits)]] + collapsed_df_list,
    ignore_index=True
)

# --- rebuild the order mapping ---
# The merged label does not appear in order_keys, so without remapping it would
# get the fallback rank and sort to the very bottom of the final table. Give it
# the slot of the first label it replaces; dict.fromkeys drops the duplicate
# key this creates while preserving order.
remapped_keys = [
    (
        name,
        merged_label if (name in collapse_traits and label in collapse_values) else label,
    )
    for name, label in order_keys
]
omap = {key: rank for rank, key in enumerate(dict.fromkeys(remapped_keys))}

In [None]:
# Build the display table: one row per (category label, variable) pair, sorted
# by the rank stored in `omap`. Pairs missing from `omap` share the last rank.
unranked = len(omap)

deduped = (
    cat_df
    .pivot_table(index=["trait", "coding_name"], aggfunc="first")
    .reset_index()
)

ranked = deduped.assign(
    order=lambda frame: frame.apply(
        lambda row: omap.get((row["coding_name"], row["trait"]), unranked),
        axis=1,
    )
)

# Only the category label is kept as the index, so labels shared by several
# variables (e.g. "nan") appear more than once.
wide = (
    ranked
    .sort_values("order")
    .loc[:, ["trait", "count", "percent"]]
    .set_index(["trait"])
)

# Put the statistics under a top-level "whole" column.
wide.columns = pd.MultiIndex.from_product([["whole"], wide.columns])

wide

#### Error detection

In [None]:
# Sanity check: total percentage per variable, to be inspected by eye for
# totals that look wrong.
check = cat_df.groupby("coding_name", as_index=False)["percent"].sum()

display(check)

#### Upload results

In [None]:
# Upload whole folders; each tuple pairs a local folder with a destination path
# (presumably (local, remote) — TODO confirm the argument order).
# NOTE(review): this uploads the `phenofhy/` folder rather than anything under
# `outputs/`, although the section is titled "Upload results" — confirm intent.
phenofhy.utils.upload_folders([
    ("phenofhy/", "applets/phenofhy"),
])