# Stats and visualisation for UKB data

### Environment setup

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import math, boto3, tempfile
import scipy.stats as sp
from sklearn import manifold
from sklearn.preprocessing import minmax_scale
from utils import *
from label_utils import *
import statsmodels as sm
from math import sqrt
from ukb_cox_proportional_hazards_utils import compute_is_cancer_at_recruitment
from outlier_methods import detect_outliers
from statsmodels.stats.multitest import fdrcorrection
import json

In [None]:
# Load the feature-name -> display-name mapping (used further down to label plot ticks)

with open('ukb_feature_rename_map.json') as mapping_file:
    rename_mapping = json.load(mapping_file)

### Prepare data

In [None]:
# Load the two source tables, apply the consent filters and derive the
# education / weight-change covariates kept in `df_add`.
# NOTE(review): `read_csv` is called unqualified here while every other load
# uses `pd.read_csv`; presumably it is provided by `from utils import *` — confirm.
df = read_csv('s3://file_path/file1.csv')
df_col = pd.read_csv('s3://file_path/additional_data/file2.csv')
# Column assignment aligns on the row index — assumes both files share the
# same index/row order (TODO confirm).
df_col['200-0.0'] = df['200-0.0']
df_col['190-0.0'] = df['190-0.0']
df_col = remove_not_consented_participants(df_col)
df_col = remove_consent_withdrawals(df_col)
df_add = df_col[['845-0.0', '6138-0.0', '2306-0.0']].copy(deep=True)

df_add.rename(
    columns={'845-0.0': 'yoe', '6138-0.0': 'edu_level', '2306-0.0': 'weight_change'},
    inplace=True,
)
# Negative codes are recoded as missing. `np.nan` is used instead of the
# `np.NaN` alias, which was removed in NumPy 2.0 and raises AttributeError there.
df_add['edu_level'] = df_add['edu_level'].replace([-3, -7], np.nan)
df_add['weight_change'] = df_add['weight_change'].replace([-1, -3], np.nan)
# Recode 0 -> 2, 2 -> 3, 3 -> 1 (list-to-list replace maps each value once, not sequentially)
df_add['weight_change'] = df_add['weight_change'].replace([0, 2, 3], [2, 3, 1])
# For years of education: -1 and -3 become missing, -2 becomes 0
df_add['yoe'] = df_add['yoe'].replace([-1, -3, -2], [np.nan, np.nan, 0])

In [None]:
# Load the main participant table, flag cancer at recruitment, and load the label table

df_merged = pd.read_csv('s3://file_path/file3.csv', low_memory=False)
# Row-wise flag computed by the project helper and stored under 'is_cancer-0'
df_merged['is_cancer-0'] = df_merged.apply(
    compute_is_cancer_at_recruitment, axis='columns'
)
df_label = pd.read_csv('s3://file_path/file4.csv')
# Class distribution before any filtering
df_merged['label_class'].value_counts()

In [None]:
# Copy the derived covariates from df_add into df_merged (values align on the row index)
added_covariates = ['edu_level', 'weight_change', 'yoe']
df_merged[added_covariates] = df_add[added_covariates]
df_merged.shape

In [None]:
# Drop participants labelled as incident cancer (label_class 2), then those
# labelled as other primary cancers (label_class 3); removal is by eid.

incident_cancer = df_merged.loc[df_merged['label_class'] == 2, 'eid']
print('Number of participants with incident cancer', len(incident_cancer))
is_incident = df_merged['eid'].isin(incident_cancer)
df_merged = df_merged.loc[~is_incident, :]

other_cancer = df_merged.loc[df_merged['label_class'] == 3, 'eid']
print('Number of participants with other primary cancers', len(other_cancer))
is_other = df_merged['eid'].isin(other_cancer)
df_merged = df_merged.loc[~is_other, :]

# Remaining class distribution
df_merged['label_class'].value_counts()

In [None]:
# Drop every column whose name ends in '-1', '-2' or '-3', then strip the
# remaining suffix so each column keeps only the text before its first '-'.

nonbaseline_cols = [
    name for name in df_merged.columns if name.endswith(('-1', '-2', '-3'))
]
df_merged = df_merged.drop(columns=nonbaseline_cols)
df_merged = df_merged.rename(columns=lambda name: name.partition('-')[0])
print(df_merged.shape)

In [None]:
# Restrict the analysis frame to the columns of interest

selected_cols = [
    'age', 'visit_centre','sex','ethnicity','townsend','alcohol','smoke','fasted',
    'redmeat_intake','oily_fish_intake','famhist_cancer','famhist_colorectal_cancer','edu_university',
    'regular_aspirin','regular_statin','health_rating','diseasehist_ibd','diseasehist_cardiovascular',
    'diseasehist_diabetes','diseasehist_anyliverbiliary','met_mins','hgrip','tlr','whr','bmi','height',
    'met_rate','impedance','sleep_dur','sbp','dbp','pulse','hgb','hct','wbc','rbc','plt','lym','mcv',
    'mono','neut','eos','baso','n_rbc','reti','u_sodium','u_potas', 'u_cr','apoa','apob','chol', 'hdl',
    'ldl','tgly','urea','crp','tprotein','glu','phos', 'alb', 'alp','alt','ast','ggt','urate','d_bil',
    't_bil','shbg','igf1', 'vitd','cysc','calc','hba1c','tst','edu_level','weight_change','yoe','crc_screening',
    'n_cancer_occurred','cancer_first_occurred_age','label_first_occurred_age','label_tumour_behaviour',
    'label_class',
]

# Keep df_merged's own column order; requested names that are absent are skipped silently
wanted = set(selected_cols)
present_cols = [name for name in df_merged.columns if name in wanted]
df = df_merged.loc[:, present_cols]

In [None]:
# Flag outlying rows for each continuous variable, then drop every flagged row

continuous_vars = [
    'hgrip','tlr','whr','height','met_rate','impedance','sleep_dur','sbp','dbp','pulse','bmi','hgb',
    'hct','wbc','rbc','plt','lym','mcv','mono','neut','eos','baso','n_rbc','reti','u_sodium','u_potas',
    'u_cr','apoa','apob','chol','hdl','ldl','tgly','urea','crp','tprotein','glu','phos','alb','alp',
    'alt','ast','ggt','urate','d_bil','t_bil','shbg','igf1','vitd','cysc','calc','hba1c','tst',
]
outliers = []
for col in continuous_vars:
    # presumably returns the index labels of the outlying rows — they are passed to df.drop below
    outliers.extend(
        detect_outliers(df, col, method='percentile', percentile_threshold=0.001)
    )

# A row flagged for several variables is only counted (and dropped) once
outliers = np.unique(outliers)
print(f'Number of outliers: {len(outliers)}')

df.drop(index=outliers, inplace=True)
# Recode booleans as 1/0
df.replace({True: 1, False: 0}, inplace=True)
print(df.shape)

In [None]:
# Class distribution after outlier removal
df['label_class'].value_counts()

# Chi-squared (χ²) tests

In [None]:
# Chi-squared tests of independence for categorical measures
#
# For every categorical column, the rows with label_class == False and the rows
# with label_class == True are cross-tabulated against the category codes and
# tested with scipy.stats.chi2_contingency on the raw counts.
#
# NOTE(review): the previous version ran scipy.stats.chisquare (a goodness-of-fit
# test) on the two groups' *percentages*. That discards the sample sizes (the
# statistic is computed as if N were 100), and its zero-count workaround
# overwrote the real count of the other group with 1e-6. Statistics and p-values
# produced by this cell will therefore differ from earlier runs.

perc_c = []
perc_p = []
chi_res = []
pval = []

c_idx = df['label_class'] == False
p_idx = df['label_class'] == True
# Unpacking requires exactly two classes to remain; value_counts orders them by frequency
c, p = df['label_class'].value_counts()

cols = [
    'sex','ethnicity','edu_level','visit_centre','weight_change','regular_aspirin','regular_statin',
    'famhist_cancer','famhist_colorectal_cancer','diseasehist_cardiovascular','diseasehist_diabetes',
    'diseasehist_ibd','diseasehist_anyliverbiliary','crc_screening','health_rating','redmeat_intake',
    'oily_fish_intake','smoke','alcohol','fasted',
]

for col in cols:
    codes = pd.unique(df[col])
    codes = codes[~np.isnan(codes)]  # remove nan from codes

    # Observed counts per category code within each group
    c_n = [int(((df[col] == code) & c_idx).sum()) for code in codes]
    p_n = [int(((df[col] == code) & p_idx).sum()) for code in codes]

    # Within-group percentages, reported alongside the test result
    perc_c.append(np.around(np.array(c_n) / np.sum(c_n) * 100, 1))
    perc_p.append(np.around(np.array(p_n) / np.sum(p_n) * 100, 1))

    # 2 x k contingency table; a category that is empty in both groups would
    # give a zero expected frequency, so it is left out of the test.
    observed = np.array([c_n, p_n])
    observed = observed[:, observed.sum(axis=0) > 0]

    # correction=False: plain Pearson chi-squared for every table size (the
    # Yates correction would otherwise be applied to 2 x 2 tables only).
    chi, ps, _dof, _expected = sp.chi2_contingency(observed, correction=False)
    chi_res.append(round(chi, 3))
    # Keep full precision: these p-values feed the FDR correction, and rounding
    # them first would distort the adjusted values.
    pval.append(ps)

In [None]:
# FDR-correct the chi-squared p-values, then tabulate and export the results
rej, pcor = fdrcorrection(pval, alpha=0.05, method='indep', is_sorted=False)

stats_df = pd.DataFrame().assign(
    biomarker=cols,
    perc_c=perc_c,
    perc_p=perc_p,
    chi2=chi_res,
    p=pval,
    pcor=pcor,
)
# NOTE(review): `s3_path` is not defined in this file — presumably provided by a star import; confirm
stats_df.to_csv(f'{s3_path}tables/chisquare_results.csv', index=False)
stats_df

# T-tests

In [None]:
# Continuous measures compared between the two groups with t-tests
ttest_cols = [
    'townsend','bmi','wbc','rbc','hgb','hct','plt','lym','u_cr','u_potas','u_sodium','apoa',
    'apob','urea','chol','crp','cysc','hdl','igf1','ldl','shbg','tst','tprotein','tgly',
    'vitd','pulse','dbp','sbp','age','height','sleep_dur','met_mins','met_rate','impedance',
    'mcv','mono','neut','eos','baso','n_rbc','reti','alb','alp','alt','ast','d_bil','calc',
    'ggt','glu','hba1c','phos','t_bil','urate','hgrip','whr','tlr','yoe',
]
df2 = df.loc[:, ttest_cols].copy()

# Boolean row masks for the two groups (label_class False / True)
c_idx = df['label_class'].eq(False)
p_idx = df['label_class'].eq(True)

In [None]:
# Per-variable descriptives and independent-samples t-tests between the
# label_class == False rows (suffix _c) and label_class == True rows (suffix _p).
mean_c = []
mean_p = []
sd_c = []
sd_p = []
count_c = []
count_p = []
deg_f = []
tval = []
tval_abs = []
t_pval = []

for col in df2:
    # Slice each group once per column instead of re-indexing for every statistic
    c_vals = df2[col][c_idx]
    p_vals = df2[col][p_idx]

    # Non-missing sample sizes
    cc = c_vals.count()
    count_c.append(cc)
    pc = p_vals.count()
    count_p.append(pc)
    deg_f.append(cc + pc - 2)

    mean_c.append(round(np.nanmean(c_vals), 2))
    mean_p.append(round(np.nanmean(p_vals), 2))
    # np.nanstd uses its default ddof=0 here
    sd_c.append(round(np.nanstd(c_vals), 2))
    sd_p.append(round(np.nanstd(p_vals), 2))

    res = sp.ttest_ind(c_vals, p_vals, nan_policy='omit')  # Between samples t-tests
    tval.append(round(res[0], 3))
    tval_abs.append(round(np.abs(res[0]), 2))
    # Keep full precision: these p-values feed the FDR correction, and rounding
    # them first (previously to 6 d.p.) collapses small values to exactly 0.
    t_pval.append(res[1])

In [None]:
# FDR-correct the t-test p-values, export the table, then show it ordered by |t|
rej, pcor = fdrcorrection(t_pval, alpha=0.05, method='indep', is_sorted=False)
pd.set_option('display.max_rows', None)  # show every row when the table is displayed

stats_df = pd.DataFrame().assign(
    biomarker=df2.columns,
    count_c=count_c,
    count_p=count_p,
    mean_c=mean_c,
    sd_c=sd_c,
    mean_p=mean_p,
    sd_p=sd_p,
    tval=tval,
    tval_abs=tval_abs,
    deg_f=deg_f,
    t_pval=t_pval,
    pcor=pcor,
)
# The CSV is written before sorting, i.e. in df2's column order.
# NOTE(review): `s3_path` is not defined in this file — presumably provided by a star import; confirm
stats_df.to_csv(f'{s3_path}tables/group_stats.csv', index=False)
stats_df = stats_df.sort_values(by='tval_abs', ascending=False, ignore_index=True)
stats_df

In [None]:
# Min-max scale the variables that are significant after FDR correction and
# split them by group for plotting.

is_significant = stats_df['pcor'] < 0.05
sig_cols = stats_df.loc[is_significant, 'biomarker'].to_list()
df_nm = df[sig_cols].copy(deep=True)

# Scale each variable to [0, 1] across all rows (both groups together)
for name in sig_cols:
    df_nm[name] = minmax_scale(df2[name], feature_range=(0, 1), axis=0)

# One Series per variable, missing values removed; same order as df_nm.columns
data_p = [df_nm[name][p_idx].dropna() for name in df_nm.columns]
data_c = [df_nm[name][c_idx].dropna() for name in df_nm.columns]

In [None]:
# Plot boxplots of the normalised distributions

# Display names for the x-axis ticks (raises KeyError if a column has no entry in the mapping)
ticklabels = [rename_mapping[feature] for feature in df_nm.columns]


def set_box_color(bp, color):
    """Colour the boxes and whiskers of a boxplot; draw caps and medians in white.

    bp is the dict returned by ``plt.boxplot``; color is applied to its
    'boxes' and 'whiskers' artists.
    """
    part_colors = (
        ('boxes', color),
        ('whiskers', color),
        ('caps', 'white'),
        ('medians', 'white'),
    )
    for part, part_color in part_colors:
        plt.setp(bp[part], color=part_color)


# Paired boxplots per significant variable: label_class == False in navy (left),
# label_class == True in tomato (right), with significance stars above each pair.
fig = plt.figure(figsize=(15, 7))

xt = np.arange(len(data_c)) * 0.45  # tick position = centre of each pair
cpos = xt - 0.1
ppos = xt + 0.1
shared_box_kwargs = dict(
    sym='',  # do not draw fliers
    widths=0.13,
    patch_artist=True,
    medianprops=dict(linewidth=2.5),
)

bpl = plt.boxplot(
    data_c, positions=cpos, boxprops=dict(facecolor='navy'), **shared_box_kwargs
)
bpr = plt.boxplot(
    data_p, positions=ppos, boxprops=dict(facecolor='tomato'), **shared_box_kwargs
)
set_box_color(bpl, 'navy')
set_box_color(bpr, 'tomato')

fig.autofmt_xdate(rotation=45)
plt.xticks(xt, ticklabels, fontsize=14)
plt.yticks(fontsize=14)
plt.xlim(-0.5, round(max(ppos)) + 0.3)
plt.ylim(-0.05, 1.3)
plt.ylabel('au', fontsize=14)
plt.tight_layout()

# FDR-corrected p-value of each plotted variable, looked up by name so the
# values line up with df_nm.columns. `p_fdr_sig` was previously read without
# ever being defined in this file, which raised a NameError.
p_fdr_sig = (
    stats_df.set_index('biomarker').loc[list(df_nm.columns), 'pcor'].to_list()
)

# (upper bound, marker) pairs, checked from most to least significant
star_levels = ((0.0001, '****'), (0.001, '***'), (0.01, '**'), (0.05, '*'))

for x_pos, p_adj in zip(xt, p_fdr_sig):
    # An empty marker is used when no threshold is met, so a stale marker from
    # the previous variable can never be drawn.
    text = next((stars for cutoff, stars in star_levels if p_adj < cutoff), '')
    plt.text(
        x_pos + 0.15,
        1.05,
        text,
        ha='left',
        rotation=90,
        wrap=True,
        fontsize=18,
        color='r',
    )

plt.savefig('./figures/paper_boxplots.jpg', dpi=150, bbox_inches='tight')
plt.show()

# Cancer-related descriptive stats

In [None]:
# Row mask for label_class == True and the number of those rows with a non-missing 'sex'
p_idx = df['label_class'].eq(True)
p_n = df.loc[p_idx, 'sex'].count()

In [None]:
# Number of rows for each (label_class, n_cancer_occurred) combination

group_keys = ['label_class', 'n_cancer_occurred']
df.groupby(group_keys)['n_cancer_occurred'].count()

In [None]:
# Percentage of the CRC group (p_idx rows) at each n_cancer_occurred value

df.loc[p_idx, 'n_cancer_occurred'].value_counts() / p_n * 100

In [None]:
# Tumour behaviour (3 = malignant in primary site), as a percentage of the p_idx rows

df.loc[p_idx, 'label_tumour_behaviour'].value_counts() / p_n * 100

In [None]:
# Descriptive statistics of cancer_first_occurred_age within the p_idx rows

age_first_cancer = df.loc[p_idx, 'cancer_first_occurred_age']
print('Mean:', age_first_cancer.mean())
print('SD:', age_first_cancer.std())
print('Max:', age_first_cancer.max())
print('Min:', age_first_cancer.min())

In [None]:
# Calculate the number of concurrent cancers by type

# Denominator: number of rows with label_class == True in the analysis frame
p_nm = (df['label_class'] == True).sum()

# Display name -> ICD-10 codes queried for it. Names and codes live in one
# mapping so they cannot drift apart: the previous parallel lists were shifted,
# so the C61 (prostate) counts were printed as 'breast' and the C67 (bladder)
# counts as 'prostate'.
# NOTE(review): 'breast' used to appear in the names, but no breast (C50) codes
# were ever queried — add an entry here if breast cancer should be reported.
cancer_icd_codes = {
    'liver': ['C787', 'C220', 'C221', 'C222', 'C223', 'C224', 'C227', 'C229'],
    'peritoneum': ['C786', 'C480', 'C481', 'C482', 'C488'],
    'lung': ['C780', 'C340', 'C341', 'C342', 'C343', 'C348', 'C349'],
    'prostate': ['C61'],
    'bladder': ['C67','C670','C671','C672','C673','C674','C675','C676','C677','C678','C679'],
    'kidney': ['C64'],
    'brain': ['C70','C700','C701','C709','C71','C710','C711','C712','C713','C714','C715','C716','C717','C718','C719'],
    # The duplicated 'C821' entry has been removed
    'nh lymphoma': ['C82','C820','C821','C822','C823','C824','C825','C826','C827','C828','C829','C83','C830','C831','C832','C833',
    'C834','C835','C836','C837','C838','C839','C84','C840','C841','C842','C843','C844','C845','C846','C847','C848','C849',
    'C85','C850','C851','C852','C857','C859','C86','C860','C861','C862','C863','C864','C865','C866'],
    'thyroid': ['C73'],
    'stomach': ['C16','C160','C161','C162','C163','C164','C165','C166','C167','C168','C169'],
    'pancreas': ['C25','C250','C251','C252','C253','C254','C255','C256','C257','C258','C259'],
    'cervical': ['C530', 'C531', 'C538', 'C539'],
}
# Kept as parallel lists under their original names for any code that still reads them
cancer_names = list(cancer_icd_codes)
icd_codes = list(cancer_icd_codes.values())

for cancer_name, codes in cancer_icd_codes.items():
    # NOTE(review): the lookup runs on df_merged while the denominator comes from
    # df (after outlier removal) — confirm the two are meant to differ.
    met_eid = get_othercancer_counts(df_merged, codes)
    print(
        'N of CRC and',
        cancer_name,
        'cancer:',
        len(met_eid),
        'and',
        round(len(met_eid) / p_nm * 100, 2),
        '%',
    )