# Overlapping tokens association with attribution scores

## Read mapping information

In [2]:
import os

import pandas as pd


demoTrainDf = pd.read_csv(os.environ['EHR_DATA_BASE'] + '/blood_pos_cohort_20240614/data/wb_365_wa_1/splits_v1/demographics/mortality_normal_train.csv', sep='\t')
demoTestDf = pd.read_csv(os.environ['EHR_DATA_BASE'] + '/blood_pos_cohort_20240614/data/wb_365_wa_1/splits_v1/demographics/mortality_normal_test.csv', sep='\t')
demoValidateDf = pd.read_csv(os.environ['EHR_DATA_BASE'] + '/blood_pos_cohort_20240614/data/wb_365_wa_1/splits_v1/demographics/mortality_normal_validate.csv', sep='\t')
demoDf = pd.concat([demoTrainDf, demoTestDf, demoValidateDf], ignore_index=True)
demoDf

Unnamed: 0,person_id,visit_occurrence_id,TYPE,COUNTRY_OF_BIRTH,SEX,Age_Bin,JOURNEY_ID
0,678487,3312694,Emergency,Australia,Male,"(60, 70]",3312694
1,2094045,9199249,Emergency,Australia,Male,"(20, 30]",9199249
2,2107813,9410987,Emergency,Australia,Male,"(50, 60]",9410987
3,2083212,602890,Inpatient,Australia,Female,"(40, 50]",9039437
4,2083212,9039437,Emergency,Australia,Female,"(40, 50]",9039437
...,...,...,...,...,...,...,...
7194,2603535,16588204,Inpatient,Australia,Male,"(60, 70]",16588204
7195,2603535,16709341,Inpatient,Australia,Male,"(60, 70]",16709341
7196,2603535,16651755,Inpatient,Australia,Male,"(60, 70]",16651755
7197,2098616,16872774,Emergency,Australia,Female,"(80, 90]",16872774


In [3]:
import os

mappingDf = pd.read_csv(os.environ['GENOMICS_DATA_BASE'] + '/patient_tube_id_mapping_full.tsv', sep='\t')
mappingDf

Unnamed: 0,tube_code,PATIENT_ID,db_ID,pt_age,pt_gender,EPISODE_ID,hospital_admission,hospital_discharge,hospital_in_last_year,genome_species,species_reported,contig_number,length,tube_code_duplicate_or_old,date_of_collection,ID_number,location_additional_02,collected_from_original
0,AH19J072,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Escherichia coli,Escherichia coli,92,5131021,,2019-10-30,19-303-0997,A-7EA;HAEM,Blood
1,AH19J074,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Streptococcus sp. D19,Streptococcus mitis group,84,2022991,,2019-10-30,19-303-1174,A-7EA;HAEM,Blood Peripheral
2,AH19B003,526238.0,567AE,85,Male,640261.0,2019-02-04,2019-02-07,no,Escherichia coli,Escherichia coli,93,4942915,,2019-02-03,19-034-1596,A-4WB;GMC,Blood
3,AH20A024,788941.0,3YBDS,58,Female,13194824.0,2020-01-16,2020-01-19,no,Escherichia coli,Escherichia coli,110,5149157,,2020-01-16,20-016-2112,S-ED;Emergency,Blood Venous
4,AH20L041,1023735.0,2VJ78,42,Male,14210147.0,2020-12-18,2021-01-05,yes,Staphylococcus aureus,Staphylococcus aureus,46,2735216,,2020-12-16,20-351-2856,Emergency Dept;nan,Blood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3112,ALF22L076,639020.0,3S6HL,84,Male,17219899.0,2022-12-15,2022-12-20,yes,Streptococcus vestibularis,Streptococcus salivarius group,107,1938627,,2022-12-15,22-349-2150,Ward 4WB,Blood Peripheral
3113,ALF22L089,655404.0,6EWBM,67,Female,,,,,Proteus columbae,Proteus vulgaris,152,3972219,,2022-12-17,22-351-1465,Comm/Amb Clinic,Blood
3114,ALF22L138,2572048.0,YM5PJ,61,Male,,,,,Enterobacter roggenkampii,Enterobacter cloacae complex,66,4876179,,2022-12-27,22-361-0202,Ward 7 East,Hick White Lum
3115,ALF23A102,2054964.0,5DYAH,35,Female,17294287.0,2023-01-12,2023-02-01,yes,Streptococcus constellatus,Streptococcus anginosus group,54,1869828,,2023-01-11,23-011-3305,ICU,Blood Venous


In [4]:
mappedJourneyTubeIdsDf = mappingDf[['tube_code', 'PATIENT_ID', 'EPISODE_ID']].drop_duplicates().merge(
    demoDf[['person_id', 'visit_occurrence_id', 'JOURNEY_ID']],
    how='inner',
    left_on=['PATIENT_ID', 'EPISODE_ID'],
    right_on=['person_id', 'visit_occurrence_id']
).drop(
    columns=['PATIENT_ID', 'EPISODE_ID', 'visit_occurrence_id']
)[['person_id', 'JOURNEY_ID', 'tube_code']].drop_duplicates()
mappedJourneyTubeIdsDf

Unnamed: 0,person_id,JOURNEY_ID,tube_code
0,50056,13224842,AH20B011
1,62567,12645629,AH19G065
2,138108,14737781,AH21E052
3,211314,13955099,AH20J009
4,211314,13955099,KPN2214
...,...,...,...
512,2084373,16158768,ALF22C004
513,1058066,16759482,ALF22H089
514,2552781,16966609,ALF22J054
515,2638484,16969999,ALF22K143


## Read bed files containing overlapping tokens and attribution scores

In [5]:
import os
import pandas as pd
from pathlib import Path


overlappingFilesDir = Path(os.environ['GENOMICS_DATA_BASE'], 'genome_nlp_tokens', 'overlapping_with_annotations')

overlappingDfList = []
for overlappingFile in os.listdir(overlappingFilesDir):
    df = pd.read_csv(Path(overlappingFilesDir, overlappingFile), sep='\t', names=['contig_id', 'start_position', 'end_position', 'tokens', 'score', 'id', 'name', 'gene', 'atributes'])
    df['tube_code'] = [overlappingFile.split('_')[0]]*df.shape[0]
    overlappingDfList.append(df)
overlappingDf = pd.concat(overlappingDfList, ignore_index=True)
overlappingDf = overlappingDf.merge(
    mappedJourneyTubeIdsDf,
    how='inner',
    on=['tube_code']
)
overlappingDf = overlappingDf[['tube_code', 'contig_id', 'start_position', 'end_position', 'tokens', 'score']].drop_duplicates()
overlappingDf

Unnamed: 0,tube_code,contig_id,start_position,end_position,tokens,score
0,AH19E065,1,191,192,GT,-0.001426
1,AH19E065,1,193,197,ATAAC,-0.005218
2,AH19E065,1,198,199,GC,-0.000682
3,AH19E065,1,200,203,GCCA,-0.010548
4,AH19E065,1,204,210,ATTGATT,-0.004120
...,...,...,...,...,...,...
23487,AH19C091,18,149,152,GCTC,0.012179
23488,AH19C091,18,153,158,TAACCA,0.012327
23489,AH19C091,18,159,164,GCTGAG,-0.020735
23490,AH19C091,18,165,170,CTATAG,-0.013398


## Perform chi-square test

### High attribution score

In [27]:
from scipy import stats

cutoff = 0.01
highScoreOverlappingDf = overlappingDf[(overlappingDf.score > cutoff)].tokens.value_counts().reset_index()
lowScoreOverlappingDf = overlappingDf[(overlappingDf.score < cutoff)].tokens.value_counts().reset_index()
mergedOverlappingDf = highScoreOverlappingDf.add_suffix('_hs').merge(
    lowScoreOverlappingDf.add_suffix('_ls'),
    how='inner',
    left_on=['tokens_hs'],
    right_on=['tokens_ls']
)[['tokens_hs', 'count_hs', 'count_ls']].rename(columns={'tokens_hs': 'tokens'})

mergedOverlappingDf['proportion_ls'] = mergedOverlappingDf.count_ls/mergedOverlappingDf.count_ls.sum()

mergedOverlappingDf['count_expected'] = mergedOverlappingDf.proportion_ls * mergedOverlappingDf.count_hs.sum()

filteredOverlappingDf = mergedOverlappingDf[(mergedOverlappingDf.count_hs >= 5) & (mergedOverlappingDf.count_ls >= 5)]

chi2, p, dof, expected = stats.chi2_contingency(pd.crosstab(filteredOverlappingDf.count_hs, filteredOverlappingDf.count_expected), correction=True)
significant = p < 0.05  # 5% significance level
print(chi2, p, significant)


320.2026936026936 0.006409491697989487 True


In [31]:
filteredOverlappingDf.loc[:, 'count_difference'] = filteredOverlappingDf.count_hs - filteredOverlappingDf.count_expected
filteredOverlappingDf.sort_values(by=['count_difference'])

Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,count_difference
6,CG,11,201,0.030298,37.963973,-26.963973
38,GCT,6,45,0.006783,8.499397,-2.499397
47,CTAAT,5,39,0.005879,7.366144,-2.366144
26,CCG,7,49,0.007386,9.254899,-2.254899
56,TACTT,5,38,0.005728,7.177269,-2.177269
57,TAC,5,37,0.005577,6.988393,-1.988393
34,GTTT,6,38,0.005728,7.177269,-1.177269
44,C,6,34,0.005125,6.421767,-0.421767
0,CGT,17,90,0.013566,16.998794,0.001206
50,ACAGG,5,26,0.003919,4.910763,0.089237


### Low attribution score

In [32]:
from scipy import stats

cutoff = -0.01
highScoreOverlappingDf = overlappingDf[(overlappingDf.score > cutoff)].tokens.value_counts().reset_index()
lowScoreOverlappingDf = overlappingDf[(overlappingDf.score < cutoff)].tokens.value_counts().reset_index()
mergedOverlappingDf = highScoreOverlappingDf.add_suffix('_hs').merge(
    lowScoreOverlappingDf.add_suffix('_ls'),
    how='inner',
    left_on=['tokens_hs'],
    right_on=['tokens_ls']
)[['tokens_hs', 'count_hs', 'count_ls']].rename(columns={'tokens_hs': 'tokens'})

mergedOverlappingDf['proportion_ls'] = mergedOverlappingDf.count_ls/mergedOverlappingDf.count_ls.sum()

mergedOverlappingDf['count_expected'] = mergedOverlappingDf.proportion_ls * mergedOverlappingDf.count_hs.sum()

filteredOverlappingDf = mergedOverlappingDf[(mergedOverlappingDf.count_hs >= 5) & (mergedOverlappingDf.count_ls >= 5)]

chi2, p, dof, expected = stats.chi2_contingency(pd.crosstab(filteredOverlappingDf.count_hs, filteredOverlappingDf.count_expected), correction=True)
significant = p < 0.05  # 5% significance level
print(chi2, p, significant)


741.1725170068028 2.7855090319426614e-06 True


In [34]:
filteredOverlappingDf.loc[:, 'count_difference'] = filteredOverlappingDf.count_hs - filteredOverlappingDf.count_expected

In [39]:
filteredOverlappingDf.sort_values(by=['count_difference'])[:25]

Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,count_difference
4,CGT,69,38,0.015218,156.230677,-87.230677
462,CTGAG,7,18,0.007209,74.004005,-67.004005
600,CTCAGG,5,12,0.004806,49.336003,-44.336003
221,CCC,15,14,0.005607,57.55867,-42.55867
237,CCAAC,14,13,0.005206,53.447337,-39.447337
27,TAACTT,38,18,0.007209,74.004005,-36.004005
58,CGAA,30,16,0.006408,65.781338,-35.781338
420,GGTAGAG,8,10,0.004005,41.113336,-33.113336
62,GACC,29,15,0.006007,61.670004,-32.670004
120,GAACCC,21,13,0.005206,53.447337,-32.447337


In [40]:
filteredOverlappingDf.sort_values(by=['count_difference'])[-10:]

Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,count_difference
42,GTAAT,33,5,0.002002,20.556668,12.443332
38,GAG,34,5,0.002002,20.556668,13.443332
12,ATTGG,48,7,0.002803,28.779335,19.220665
13,GAA,48,7,0.002803,28.779335,19.220665
8,CGAC,53,5,0.002002,20.556668,32.443332
7,GCAAT,55,5,0.002002,20.556668,34.443332
3,ACG,71,6,0.002403,24.668002,46.331998
1,TTC,103,13,0.005206,53.447337,49.552663
2,CGG,90,8,0.003204,32.890669,57.109331
0,CG,185,27,0.010813,111.006007,73.993993
