# Statistical analysis to get the significantly associated tokens for attribution score groupings

## Read mapping information

In [1]:
import os

import pandas as pd


demoTrainDf = pd.read_csv(os.environ['EHR_DATA_BASE'] + '/blood_pos_cohort_20240614/data/wb_365_wa_1/splits_v1/demographics/mortality_normal_train.csv', sep='\t')
demoTestDf = pd.read_csv(os.environ['EHR_DATA_BASE'] + '/blood_pos_cohort_20240614/data/wb_365_wa_1/splits_v1/demographics/mortality_normal_test.csv', sep='\t')
demoValidateDf = pd.read_csv(os.environ['EHR_DATA_BASE'] + '/blood_pos_cohort_20240614/data/wb_365_wa_1/splits_v1/demographics/mortality_normal_validate.csv', sep='\t')
demoDf = pd.concat([demoTrainDf, demoTestDf, demoValidateDf], ignore_index=True)
demoDf

Unnamed: 0,person_id,visit_occurrence_id,TYPE,COUNTRY_OF_BIRTH,SEX,Age_Bin,JOURNEY_ID
0,678487,3312694,Emergency,Australia,Male,"(60, 70]",3312694
1,2094045,9199249,Emergency,Australia,Male,"(20, 30]",9199249
2,2107813,9410987,Emergency,Australia,Male,"(50, 60]",9410987
3,2083212,602890,Inpatient,Australia,Female,"(40, 50]",9039437
4,2083212,9039437,Emergency,Australia,Female,"(40, 50]",9039437
...,...,...,...,...,...,...,...
7194,2603535,16588204,Inpatient,Australia,Male,"(60, 70]",16588204
7195,2603535,16709341,Inpatient,Australia,Male,"(60, 70]",16709341
7196,2603535,16651755,Inpatient,Australia,Male,"(60, 70]",16651755
7197,2098616,16872774,Emergency,Australia,Female,"(80, 90]",16872774


In [2]:
import os

mappingDf = pd.read_csv(os.environ['GENOMICS_DATA_BASE'] + '/patient_tube_id_mapping_full.tsv', sep='\t')
mappingDf

Unnamed: 0,tube_code,PATIENT_ID,db_ID,pt_age,pt_gender,EPISODE_ID,hospital_admission,hospital_discharge,hospital_in_last_year,genome_species,species_reported,contig_number,length,tube_code_duplicate_or_old,date_of_collection,ID_number,location_additional_02,collected_from_original
0,AH19J072,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Escherichia coli,Escherichia coli,92,5131021,,2019-10-30,19-303-0997,A-7EA;HAEM,Blood
1,AH19J074,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Streptococcus sp. D19,Streptococcus mitis group,84,2022991,,2019-10-30,19-303-1174,A-7EA;HAEM,Blood Peripheral
2,AH19B003,526238.0,567AE,85,Male,640261.0,2019-02-04,2019-02-07,no,Escherichia coli,Escherichia coli,93,4942915,,2019-02-03,19-034-1596,A-4WB;GMC,Blood
3,AH20A024,788941.0,3YBDS,58,Female,13194824.0,2020-01-16,2020-01-19,no,Escherichia coli,Escherichia coli,110,5149157,,2020-01-16,20-016-2112,S-ED;Emergency,Blood Venous
4,AH20L041,1023735.0,2VJ78,42,Male,14210147.0,2020-12-18,2021-01-05,yes,Staphylococcus aureus,Staphylococcus aureus,46,2735216,,2020-12-16,20-351-2856,Emergency Dept;nan,Blood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3112,ALF22L076,639020.0,3S6HL,84,Male,17219899.0,2022-12-15,2022-12-20,yes,Streptococcus vestibularis,Streptococcus salivarius group,107,1938627,,2022-12-15,22-349-2150,Ward 4WB,Blood Peripheral
3113,ALF22L089,655404.0,6EWBM,67,Female,,,,,Proteus columbae,Proteus vulgaris,152,3972219,,2022-12-17,22-351-1465,Comm/Amb Clinic,Blood
3114,ALF22L138,2572048.0,YM5PJ,61,Male,,,,,Enterobacter roggenkampii,Enterobacter cloacae complex,66,4876179,,2022-12-27,22-361-0202,Ward 7 East,Hick White Lum
3115,ALF23A102,2054964.0,5DYAH,35,Female,17294287.0,2023-01-12,2023-02-01,yes,Streptococcus constellatus,Streptococcus anginosus group,54,1869828,,2023-01-11,23-011-3305,ICU,Blood Venous


In [3]:
mappedJourneyTubeIdsDf = mappingDf[['tube_code', 'PATIENT_ID', 'EPISODE_ID']].drop_duplicates().merge(
    demoDf[['person_id', 'visit_occurrence_id', 'JOURNEY_ID']],
    how='inner',
    left_on=['PATIENT_ID', 'EPISODE_ID'],
    right_on=['person_id', 'visit_occurrence_id']
).drop(
    columns=['PATIENT_ID', 'EPISODE_ID', 'visit_occurrence_id']
)[['person_id', 'JOURNEY_ID', 'tube_code']].drop_duplicates()
mappedJourneyTubeIdsDf

Unnamed: 0,person_id,JOURNEY_ID,tube_code
0,50056,13224842,AH20B011
1,62567,12645629,AH19G065
2,138108,14737781,AH21E052
3,211314,13955099,AH20J009
4,211314,13955099,KPN2214
...,...,...,...
512,2084373,16158768,ALF22C004
513,1058066,16759482,ALF22H089
514,2552781,16966609,ALF22J054
515,2638484,16969999,ALF22K143


## Read bed files containing overlapping tokens and attribution scores

In [4]:
import os
import pandas as pd
from pathlib import Path


overlappingFilesDir = Path(os.environ['GENOMICS_DATA_BASE'], 'genome_nlp_tokens', 'overlapping_with_annotations')

overlappingDfList = []
for overlappingFile in os.listdir(overlappingFilesDir):
    df = pd.read_csv(Path(overlappingFilesDir, overlappingFile), sep='\t', names=['contig_id', 'start_position', 'end_position', 'tokens', 'score', 'feature_type', 'id', 'name', 'gene', 'atributes'])
    df['tube_code'] = [overlappingFile.split('_')[0]]*df.shape[0]
    overlappingDfList.append(df)
overlappingDf = pd.concat(overlappingDfList, ignore_index=True)
overlappingDf = overlappingDf.merge(
    mappedJourneyTubeIdsDf,
    how='inner',
    on=['tube_code']
)
overlappingDf = overlappingDf.drop_duplicates()
overlappingDf = overlappingDf[overlappingDf.tokens.apply(lambda x: (len(x) >= 5))].reset_index()
overlappingDf['gene'] = overlappingDf.gene.str.lower()
overlappingDf = overlappingDf[overlappingDf.feature_type.isin(['CDS', 'ncRNA', 'oriC', 'regulatory_region', 'oriT'])]
overlappingDf

Unnamed: 0,index,contig_id,start_position,end_position,tokens,score,feature_type,id,name,gene,atributes,tube_code,person_id,JOURNEY_ID
0,0,1,214,219,TGAGGG,-0.000951,CDS,CENNFK_00005,Transposase,,ID=CENNFK_00005;Name=Transposase;locus_tag=CEN...,AH21B039,908851,14408941
1,1,1,220,224,GTTAT,-0.000473,CDS,CENNFK_00005,Transposase,,ID=CENNFK_00005;Name=Transposase;locus_tag=CEN...,AH21B039,908851,14408941
2,3,1,228,233,GAAAGA,-0.000867,CDS,CENNFK_00005,Transposase,,ID=CENNFK_00005;Name=Transposase;locus_tag=CEN...,AH21B039,908851,14408941
3,4,1,234,239,TATTGT,-0.000915,CDS,CENNFK_00005,Transposase,,ID=CENNFK_00005;Name=Transposase;locus_tag=CEN...,AH21B039,908851,14408941
4,5,1,240,245,AGCTGG,-0.001374,CDS,CENNFK_00005,Transposase,,ID=CENNFK_00005;Name=Transposase;locus_tag=CEN...,AH21B039,908851,14408941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14878529,18368428,5,205763,205769,ATGATAT,0.002546,CDS,KKIOAG_10525,Prepilin signal peptidase PulO (type II secret...,pulo,ID=KKIOAG_10525;Name=Prepilin signal peptidase...,AH21K020,2035444,15586881
14878530,18368429,5,205770,205775,CCATCA,0.000022,CDS,KKIOAG_10525,Prepilin signal peptidase PulO (type II secret...,pulo,ID=KKIOAG_10525;Name=Prepilin signal peptidase...,AH21K020,2035444,15586881
14878531,18368430,5,205776,205781,ATTTTT,-0.002056,CDS,KKIOAG_10525,Prepilin signal peptidase PulO (type II secret...,pulo,ID=KKIOAG_10525;Name=Prepilin signal peptidase...,AH21K020,2035444,15586881
14878532,18368431,5,205782,205787,AGGAGG,0.000008,CDS,KKIOAG_10525,Prepilin signal peptidase PulO (type II secret...,pulo,ID=KKIOAG_10525;Name=Prepilin signal peptidase...,AH21K020,2035444,15586881


## Perform chi-square test

### High attribution score

In [28]:
cutoff = overlappingDf.score.mean() + 1 * overlappingDf.score.std()
cutoff

np.float64(0.003378765748551873)

In [29]:
from scipy import stats

highScoreOverlappingDf = overlappingDf[(overlappingDf.score > cutoff)].tokens.value_counts().reset_index()
lowScoreOverlappingDf = overlappingDf[(overlappingDf.score < cutoff)].tokens.value_counts().reset_index()
mergedOverlappingDf = highScoreOverlappingDf.add_suffix('_hs').merge(
    lowScoreOverlappingDf.add_suffix('_ls'),
    how='inner',
    left_on=['tokens_hs'],
    right_on=['tokens_ls']
)[['tokens_hs', 'count_hs', 'count_ls']].rename(columns={'tokens_hs': 'tokens'})

mergedOverlappingDf['proportion_ls'] = mergedOverlappingDf.count_ls/mergedOverlappingDf.count_ls.sum()

mergedOverlappingDf['count_expected'] = mergedOverlappingDf.proportion_ls * mergedOverlappingDf.count_hs.sum()

filteredOverlappingDf = mergedOverlappingDf[(mergedOverlappingDf.count_hs >= 5) & (mergedOverlappingDf.count_ls >= 5)]

chi2, p, dof, expected = stats.chi2_contingency(pd.crosstab(filteredOverlappingDf.count_hs, filteredOverlappingDf.count_expected), correction=True)
significant = p < 0.05  # 5% significance level
print(chi2, p, significant)


727805.0274963141 3.55127562151092e-20 True


`The test indicates that the tokens associated with the high score group are significantly different than the rest`

#### Calculate the ratio of count differences

In [30]:
filteredOverlappingDf.loc[:, 'ratio_difference'] = (filteredOverlappingDf.count_hs / filteredOverlappingDf.count_expected)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredOverlappingDf.loc[:, 'ratio_difference'] = (filteredOverlappingDf.count_hs / filteredOverlappingDf.count_expected)


#### Over represented tokens associated with the high attribution score

In [32]:
overrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference > 10]
overrepresentedTokensDf

  overrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference > 10]


Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,ratio_difference
12,AAAATGT,1249,7343,0.000567,123.669344,10.099512
595,CTGTGG,89,512,4e-05,8.623002,10.321231
124,TCAGTAT,410,2348,0.000181,39.544548,10.368054
132,AGATGG,389,2227,0.000172,37.506691,10.371483
14,TTGCAA,1203,6755,0.000521,113.766365,10.574303
2,CAATTT,1955,10968,0.000847,184.720872,10.583536
150,TTATATA,354,1983,0.000153,33.397291,10.599662
783,AAGAAAAAAA,55,307,2.4e-05,5.170433,10.637407
1030,TCCTTTCC,30,167,1.3e-05,2.812581,10.66636
131,CTGACT,393,2104,0.000162,35.435149,11.090683


#### Over represented genes associated with the high attribution score

In [33]:
tempDf = overrepresentedTokensDf.merge(overlappingDf[['tokens', 'feature_type', 'id', 'name', 'gene']], how='inner', on=['tokens'])

tempDf[['tokens', 'feature_type', 'gene', 'name']].groupby(by=['feature_type', 'gene', 'name']).agg('count').reset_index().sort_values(by=['tokens'], ascending=False)[:15]
# gene.value_counts().reset_index()[:5]

Unnamed: 0,feature_type,gene,name,tokens
1391,ncRNA,s35,Staphylococcus sRNA 35 (srn_0335),289
596,CDS,lys2b,Thioester reductase domain of alpha aminoadipa...,229
308,CDS,era,GTPase Era%2C involved in 16S rRNA processing,220
961,CDS,rlha,23S rRNA C2501 and tRNA U34 5'-hydroxylation p...,210
272,CDS,dnag,DNA primase,192
251,CDS,degq,Serine protease HtrA-like,182
27,CDS,adda,helicase-exonuclease AddAB subunit AddA,181
810,CDS,pepf,oligoendopeptidase F,173
1247,CDS,vals,valine--tRNA ligase,173
758,CDS,nrde,class 1b ribonucleoside-diphosphate reductase ...,172


#### Over represented annotations associated with the high attribution score

In [34]:
overrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).name.value_counts().reset_index()[:5]

Unnamed: 0,name,count
0,hyperosmolarity resistance protein Ebh,336
1,Staphylococcus sRNA 35 (srn_0335),289
2,Thioester reductase domain of alpha aminoadipa...,229
3,Lipoprotein,221
4,GTPase Era%2C involved in 16S rRNA processing,220


#### Under represented tokens associated with the high attribution score

In [35]:
underrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference < 1]
underrepresentedTokensDf

  underrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference < 1]


Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,ratio_difference
1642,ATGAT,8,67845,0.005237,1142.631981,0.007001
1747,ATCGC,6,43346,0.003346,730.024701,0.008219
1529,ACAAT,10,49758,0.003841,838.014328,0.011933
1836,TACCT,5,17409,0.001344,293.198912,0.017053
1801,ACGTT,6,20409,0.001575,343.724314,0.017456
...,...,...,...,...,...,...
1230,AAGTAGCT,20,1202,0.000093,20.243845,0.987955
893,TTTTTCT,41,2461,0.000190,41.447672,0.989199
861,AAGCTTA,44,2641,0.000204,44.479196,0.989227
1319,AAGTCCT,16,953,0.000074,16.050236,0.996870


#### Under represented genes associated with the high attribution score

In [36]:
underrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).gene.value_counts().reset_index()[:5]

Unnamed: 0,gene,count
0,lys2b,28136
1,araj,23981
2,pote,23001
3,ccma,22015
4,ebh,21359


#### Under represented annotations associated with the high attribution score

In [37]:
underrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).name.value_counts().reset_index()[:5]

Unnamed: 0,name,count
0,hyperosmolarity resistance protein Ebh,50615
1,Thioester reductase domain of alpha aminoadipa...,28136
2,Lipoprotein,27845
3,Extracellular matrix-binding protein ebh,27736
4,putative arabinose efflux permease AraJ%2C MFS...,24507


### Low attribution score

In [38]:
cutoff = overlappingDf.score.mean() - 1 * overlappingDf.score.std()
cutoff

np.float64(-0.005417008772367425)

In [39]:
from scipy import stats

highScoreOverlappingDf = overlappingDf[(overlappingDf.score > cutoff)].tokens.value_counts().reset_index()
lowScoreOverlappingDf = overlappingDf[(overlappingDf.score < cutoff)].tokens.value_counts().reset_index()
mergedOverlappingDf = highScoreOverlappingDf.add_suffix('_hs').merge(
    lowScoreOverlappingDf.add_suffix('_ls'),
    how='inner',
    left_on=['tokens_hs'],
    right_on=['tokens_ls']
)[['tokens_hs', 'count_hs', 'count_ls']].rename(columns={'tokens_hs': 'tokens'})

mergedOverlappingDf['proportion_ls'] = mergedOverlappingDf.count_ls/mergedOverlappingDf.count_ls.sum()

mergedOverlappingDf['count_expected'] = mergedOverlappingDf.proportion_ls * mergedOverlappingDf.count_hs.sum()

filteredOverlappingDf = mergedOverlappingDf[(mergedOverlappingDf.count_hs >= 5) & (mergedOverlappingDf.count_ls >= 5)]

chi2, p, dof, expected = stats.chi2_contingency(pd.crosstab(filteredOverlappingDf.count_hs, filteredOverlappingDf.count_expected), correction=True)
significant = p < 0.05  # 5% significance level
print(chi2, p, significant)


2222337.507815295 0.0 True


#### Calculate the ratio of count differences

In [40]:
filteredOverlappingDf.loc[:, 'ratio_difference'] = (filteredOverlappingDf.count_hs / filteredOverlappingDf.count_expected)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredOverlappingDf.loc[:, 'ratio_difference'] = (filteredOverlappingDf.count_hs / filteredOverlappingDf.count_expected)


#### Over represented tokens associated with the low attribution score

In [43]:
overrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference > 80]
overrepresentedTokensDf

  overrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference > 80]


Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,ratio_difference
499,TAACAA,6859,5,6e-06,84.532468,81.14042
486,CATTG,6964,5,6e-06,84.532468,82.382546
181,GTCTG,11695,8,1e-05,135.25195,86.468255
417,ATATTTG,7640,5,6e-06,84.532468,90.379474
140,AATTAAA,12563,8,1e-05,135.25195,92.885907
131,CACCAC,12850,8,1e-05,135.25195,95.007873
167,ACAGT,12057,7,9e-06,118.345456,101.879704
78,AATTT,15299,7,9e-06,118.345456,129.274081
13,TGATTT,31651,12,1.5e-05,202.877924,156.010074
49,TAATTT,19852,7,9e-06,118.345456,167.746196


#### Over represented genes associated with the low attribution score

In [44]:
overrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).gene.value_counts().reset_index()[:5]

Unnamed: 0,gene,count
0,lys2b,727
1,araj,559
2,ccma,554
3,caia,531
4,ftsk,482


#### Over represented annotations associated with the low attribution score

In [45]:
overrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).name.value_counts().reset_index()[:5]

Unnamed: 0,name,count
0,hyperosmolarity resistance protein Ebh,967
1,Thioester reductase domain of alpha aminoadipa...,727
2,Extracellular matrix-binding protein ebh,612
3,Lipoprotein,605
4,putative arabinose efflux permease AraJ%2C MFS...,573


#### Under represented tokens associated with the low attribution score

In [46]:
underrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference < 0.3]
underrepresentedTokensDf

  underrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference < 0.3]


Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,ratio_difference
3102,GCTCAG,465,101,0.000125,1707.555863,0.272319
3080,CTGTGG,496,105,0.00013,1775.181837,0.279408
2737,TTTCTTTTTT,1030,218,0.000271,3685.615624,0.279465
3370,GGAGATTTCA,57,12,1.5e-05,202.877924,0.280957
3261,GTGATCTG,227,47,5.8e-05,794.605203,0.285676


#### Under represented genes associated with the low attribution score

In [47]:
underrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).gene.value_counts().reset_index()[:5]

Unnamed: 0,gene,count
0,mnth,65
1,dnak,53
2,ybjt,43
3,tagb,35
4,amya,33


#### Under represented annotations associated with the low attribution score

In [48]:
underrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).name.value_counts().reset_index()[:5]

Unnamed: 0,name,count
0,Lipoprotein,72
1,molecular chaperone DnaK,53
2,Mn2+ or Fe2+ transporter%2C NRAMP family,52
3,DUF2867 domain-containing protein,43
4,CDP-glycerol glycerophosphotransferase%2C TagB...,35
