# Extraction of participant counts by age, sex and ethnicity to construct population pyramids

## Purpose
This notebook extracts participant counts by age bin, sex and self-reported ethnic group from the baseline Our Future Health (OFH) questionnaire to construct ethnicity-specific population pyramids.

## Outputs
- `inputs/phenotype_files/population_pyramids/pop_pyramids_ethnicity_GB_OFH.xlsx`, sheet `Ethn_age_sex_OFH_GB`.
  Table reporting participant counts by age bin, sex and self-reported ethnic group (Asian, Black, Mixed, Other, White).
  
## Relationship to manuscript
Results from this notebook are used to create **Extended Data Figure 1** (*Population age–sex structure compared with the 2021/2022 UK Census by major self-reported ethnic group*).

## Data and access
Analyses use restricted Our Future Health data accessed within the OFH Trusted Research Environment under approved study permissions. Outputs are limited to aggregated, non-disclosive summary statistics in accordance with OFH Safe Output policies.

In [None]:
# Import libraries
import pandas as pd, pyspark, dxdata, dxpy, os, logging, json
import matplotlib.pyplot as plt
import numpy as np

# Configure logging
logging.basicConfig(level=logging.INFO)

# Download the configuration file and parse it as JSON into `config`.
# A failure is logged with its traceback and then re-raised unchanged, so the
# notebook still stops at this point but the log says which step failed.
try:
    dxpy.download_dxfile("file-J35yx8k2gbP4K6JQx7xfVQVx", "config.json")
    with open("config.json", "r", encoding="utf-8") as f:
        config = json.load(f)
except Exception:
    logging.exception("Could not download or parse config.json")
    raise

!dx download "profile_study_v3:/applets/ofh_tools/" -r
# NOTE: the --overwrite option was needed in order to download utils; without it the download stops at process.py

# Spark initialization (Done only once; do not rerun this cell unless you select Kernel -> Restart kernel).
# A SparkContext is created first and then wrapped in a SparkSession.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

# Package-style import kept for reference; `utils` is imported by path below instead.
#from ofh_tools import utils

# Add the ofh_tools directory to the module search path so that `utils`
# resolves as a top-level module.
# NOTE(review): presumably this is where the `dx download` of ofh_tools lands — confirm.
import sys
sys.path.append("/opt/notebooks/ofh_tools")

import utils

# Obtain the dataset handle from the helper module.
dataset = utils.connect_to_dataset()

# Fields requested from the dataset.
# Sex is read from demog_sex_1_1 and demog_sex_2_1; age is derived from the
# registration year/month minus the birth year/month.
field_names = [
    "registration_year", "registration_month",
    "birth_year", "birth_month",
    "demog_sex_1_1", "demog_sex_2_1",
    "housing_income_1_1",
    "demog_ethnicity_1_1",
]

# Retrieve the selected fields
df = dataset.retrieve_fields(
    names=field_names,
    engine=dxdata.connect(),
)

# Bring the retrieved table into pandas
pdf = df.toPandas()

pdf

# Preprocessing

## Data Cleaning and Exclusions

In [None]:
# Row-level exclusions, evaluated as separate masks and applied in one step.

# -999 marks an invalid birth year or birth month.
valid_birth = (
    (pdf['participant$birth_year'] != -999)
    & (pdf['participant$birth_month'] != -999)
)

# Sex codes 3 (Intersex) and -3 (Prefer not to answer) are excluded when they
# appear in either of the two sex fields.
excluded_sex_codes = [3, -3]
valid_sex = ~(
    pdf['participant$demog_sex_1_1'].isin(excluded_sex_codes)
    | pdf['participant$demog_sex_2_1'].isin(excluded_sex_codes)
)

# Ethnicity code -3 (Prefer not to answer) is excluded.
stated_ethnicity = pdf['participant$demog_ethnicity_1_1'] != -3

pdf = pdf[valid_birth & valid_sex & stated_ethnicity]

# Optional: Exclude income responses -1 (Don't know), -3 (Prefer not to answer), and NaN
# pdf = pdf[~pdf['questionnaire$housing_income_1_1'].isin([-1, -3, np.nan])]

## Age Calculation

# Age at registration, derived from year and month fields only.

# Exact number of months between birth and registration
pdf['age_months'] = (
    (pdf['participant$registration_year'] - pdf['participant$birth_year']) * 12
    + (pdf['participant$registration_month'] - pdf['participant$birth_month'])
)
# Age rounded to the nearest year
pdf['age'] = (pdf['age_months'] / 12).round()

# Fractional age from calendar dates; both dates are pinned to the first day
# of the month because only year and month are used here.
pdf['registration_date'] = pd.to_datetime(dict(
    year=pdf['participant$registration_year'],
    month=pdf['participant$registration_month'],
    day=1,
))
pdf['birth_date'] = pd.to_datetime(dict(
    year=pdf['participant$birth_year'],
    month=pdf['participant$birth_month'],
    day=1,
))
pdf['datetime_age'] = (pdf['registration_date'] - pdf['birth_date']).dt.days / 365.25

# Age in completed years, taken from the exact month difference.
# Truncating datetime_age is off by one whenever the span holds fewer leap days
# than years/4: 2001-01 -> 2019-01 is 6574 days = 17.998 "years", which would
# truncate to 17. That placed people registering in their birth month one year
# too low and could drop participants who are exactly 18.
pdf['age_group'] = (pdf['age_months'] // 12).astype(int)

# Exclude those aged <18
pdf = pdf[pdf['age_group'] >= 18]

## Sex variable transformation

# Use demog_sex_2_1 where it is present and fall back to demog_sex_1_1
# otherwise, then translate the numeric codes into labels. Codes that are not
# in the mapping become NaN.
sex_labels = {1: 'Male', 2: 'Female'}
second_sex_field = pdf['participant$demog_sex_2_1']
first_sex_field = pdf['participant$demog_sex_1_1']

pdf['sex'] = np.where(second_sex_field.notna(), second_sex_field, first_sex_field)
pdf['sex'] = pdf['sex'].map(sex_labels)

## Ethnic group variables

# Broad ethnic groups and the ethnicity codes assigned to each
ethn_grps = {
    'Asian': [10, 11, 12, 13, 14],
    'Black': [15, 16, 17],
    'Mixed': [6, 7, 8, 9],
    'White': [1, 2, 3, 4, 5],
    'Other': [18, 19],
}

# Invert the grouping into a code -> group-label lookup
group_map = {}
for label, codes in ethn_grps.items():
    group_map.update(dict.fromkeys(codes, label))

# Codes that are absent from the lookup map to NaN
pdf['ethnic_group'] = pdf['participant$demog_ethnicity_1_1'].map(group_map)

## Create age bins variables

# Largest observed age in completed years
max_age = pdf['age_group'].max()

# Define age bins and labels.
# An earlier version used a single '18-24' bin (edges 17, 24); it is now split
# into '18-19' and '20-24'.
# pd.cut intervals are right-inclusive by default: (17, 19] -> '18-19',
# (19, 24] -> '20-24', and so on. The final edge is open-ended rather than
# max_age, so the edges stay strictly increasing even when nobody is older
# than 84 (pd.cut rejects the edges otherwise).
bins = [17, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, np.inf]
labels = ['18-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49',
          '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85+']

pdf['age_bins'] = pd.cut(pdf['age_group'], bins=bins, labels=labels)

#remove age bin 18-19
#pdf = pdf[pdf['age_bins'] != '18-19']

## Inspect data

# Overview of the prepared participant table: column summary, descriptive
# statistics and the first rows.
# NOTE(review): by default a notebook cell renders only its last expression, so
# if these calls share one cell the describe() result is not shown — confirm.
pdf.info()
pdf.describe()
pdf.head()

## Data frames by ethnicity, sex and age bins

def _age_sex_counts(frame):
    """Count the rows of *frame* per age bin and sex.

    Returns a DataFrame with columns ``Age``, ``Sex`` and ``Count``, sorted by
    age bin and then sex, with a fresh 0..n-1 index. Only combinations that
    occur in *frame* are listed; rows with a missing age bin or sex are not
    counted.
    """
    # size() is used instead of value_counts() so that the pipeline matches the
    # combined table below and does not depend on how value_counts names its
    # result, which differs between pandas versions.
    counts = (
        frame.groupby(['age_bins', 'sex'], observed=True)
        .size()
        .reset_index(name='Count')
    )
    counts.columns = ['Age', 'Sex', 'Count']
    return counts.sort_values(by=['Age', 'Sex']).reset_index(drop=True)


# One dataframe per ethnicity
pdf_asian = _age_sex_counts(pdf[pdf['ethnic_group'] == 'Asian'])
pdf_black = _age_sex_counts(pdf[pdf['ethnic_group'] == 'Black'])
pdf_mixed = _age_sex_counts(pdf[pdf['ethnic_group'] == 'Mixed'])
pdf_white = _age_sex_counts(pdf[pdf['ethnic_group'] == 'White'])
pdf_other = _age_sex_counts(pdf[pdf['ethnic_group'] == 'Other'])

# All ethnic groups in one dataframe
age_sex_ethn = pdf.groupby(['age_bins', 'sex', 'ethnic_group'], observed=True).size().reset_index(name='Count')
age_sex_ethn.columns = ['Age', 'Sex', 'Ethnicity', 'Count']
age_sex_ethn = age_sex_ethn.sort_values(by=['Age', 'Sex', 'Ethnicity']).reset_index(drop=True)

# Inspect age_sex_ethn
age_sex_ethn.info()
age_sex_ethn.describe()
age_sex_ethn

## Export data

# Write every count table to its own CSV file, using the default to_csv
# settings. The combined table is written first, then one file per ethnic group.
csv_exports = [
    ("Ethn_age_sex_OFH_GB.csv", age_sex_ethn),
    ("asian_pop_OFH_GB.csv", pdf_asian),
    ("black_pop_OFH_GB.csv", pdf_black),
    ("mixed_pop_OFH_GB.csv", pdf_mixed),
    ("other_pop_OFH_GB.csv", pdf_other),
    ("white_pop_OFH_GB.csv", pdf_white),
]
for csv_name, table in csv_exports:
    table.to_csv(csv_name)

# Plotting (Old code)

# Legacy plot: horizontal bars of the 'Male' and 'Female' columns of
# age_sex_props against its index, on a 16x12 figure.
# NOTE(review): `age_sex_props` is not defined anywhere in the visible notebook,
# so this cell raises NameError unless it is created elsewhere — confirm or remove.
# NOTE(review): both series are drawn from the same baseline, so the bars overlap
# rather than mirror each other unless one column is negated upstream — confirm.
plt.figure(figsize=(16, 12))
plt.barh(age_sex_props.index, age_sex_props['Male'], label='Male', color='steelblue')
plt.barh(age_sex_props.index, age_sex_props['Female'], label='Female', color='salmon')
plt.xlabel('Proportion within sex')
plt.ylabel('Age')
plt.title('Population Pyramid (Proportions by Sex and Age)')
plt.legend()
# Vertical grid lines only
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

## Inspect long data frames

# Legacy inspection of the count and proportion tables.
# NOTE(review): neither `age_sex_counts` nor `age_sex_props` is defined in the
# visible notebook; these calls raise NameError unless both exist elsewhere — confirm.
age_sex_counts.info()
age_sex_counts.describe()
age_sex_counts.head()

age_sex_props.info()
age_sex_props.describe()
age_sex_props.head()