# Statistical analysis to get the significantly associated tokens for attribution score groupings

## Read mapping information

In [1]:
import os

import pandas as pd


demoTrainDf = pd.read_csv(os.environ['EHR_DATA_BASE'] + '/blood_pos_cohort_20240614/data/wb_365_wa_1/splits_v1/demographics/mortality_normal_train.csv', sep='\t')
demoTestDf = pd.read_csv(os.environ['EHR_DATA_BASE'] + '/blood_pos_cohort_20240614/data/wb_365_wa_1/splits_v1/demographics/mortality_normal_test.csv', sep='\t')
demoValidateDf = pd.read_csv(os.environ['EHR_DATA_BASE'] + '/blood_pos_cohort_20240614/data/wb_365_wa_1/splits_v1/demographics/mortality_normal_validate.csv', sep='\t')
demoDf = pd.concat([demoTrainDf, demoTestDf, demoValidateDf], ignore_index=True)
demoDf

Unnamed: 0,person_id,visit_occurrence_id,TYPE,COUNTRY_OF_BIRTH,SEX,Age_Bin,JOURNEY_ID
0,678487,3312694,Emergency,Australia,Male,"(60, 70]",3312694
1,2094045,9199249,Emergency,Australia,Male,"(20, 30]",9199249
2,2107813,9410987,Emergency,Australia,Male,"(50, 60]",9410987
3,2083212,602890,Inpatient,Australia,Female,"(40, 50]",9039437
4,2083212,9039437,Emergency,Australia,Female,"(40, 50]",9039437
...,...,...,...,...,...,...,...
7194,2603535,16588204,Inpatient,Australia,Male,"(60, 70]",16588204
7195,2603535,16709341,Inpatient,Australia,Male,"(60, 70]",16709341
7196,2603535,16651755,Inpatient,Australia,Male,"(60, 70]",16651755
7197,2098616,16872774,Emergency,Australia,Female,"(80, 90]",16872774


In [2]:
import os

mappingDf = pd.read_csv(os.environ['GENOMICS_DATA_BASE'] + '/patient_tube_id_mapping_full.tsv', sep='\t')
mappingDf

Unnamed: 0,tube_code,PATIENT_ID,db_ID,pt_age,pt_gender,EPISODE_ID,hospital_admission,hospital_discharge,hospital_in_last_year,genome_species,species_reported,contig_number,length,tube_code_duplicate_or_old,date_of_collection,ID_number,location_additional_02,collected_from_original
0,AH19J072,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Escherichia coli,Escherichia coli,92,5131021,,2019-10-30,19-303-0997,A-7EA;HAEM,Blood
1,AH19J074,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Streptococcus sp. D19,Streptococcus mitis group,84,2022991,,2019-10-30,19-303-1174,A-7EA;HAEM,Blood Peripheral
2,AH19B003,526238.0,567AE,85,Male,640261.0,2019-02-04,2019-02-07,no,Escherichia coli,Escherichia coli,93,4942915,,2019-02-03,19-034-1596,A-4WB;GMC,Blood
3,AH20A024,788941.0,3YBDS,58,Female,13194824.0,2020-01-16,2020-01-19,no,Escherichia coli,Escherichia coli,110,5149157,,2020-01-16,20-016-2112,S-ED;Emergency,Blood Venous
4,AH20L041,1023735.0,2VJ78,42,Male,14210147.0,2020-12-18,2021-01-05,yes,Staphylococcus aureus,Staphylococcus aureus,46,2735216,,2020-12-16,20-351-2856,Emergency Dept;nan,Blood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3112,ALF22L076,639020.0,3S6HL,84,Male,17219899.0,2022-12-15,2022-12-20,yes,Streptococcus vestibularis,Streptococcus salivarius group,107,1938627,,2022-12-15,22-349-2150,Ward 4WB,Blood Peripheral
3113,ALF22L089,655404.0,6EWBM,67,Female,,,,,Proteus columbae,Proteus vulgaris,152,3972219,,2022-12-17,22-351-1465,Comm/Amb Clinic,Blood
3114,ALF22L138,2572048.0,YM5PJ,61,Male,,,,,Enterobacter roggenkampii,Enterobacter cloacae complex,66,4876179,,2022-12-27,22-361-0202,Ward 7 East,Hick White Lum
3115,ALF23A102,2054964.0,5DYAH,35,Female,17294287.0,2023-01-12,2023-02-01,yes,Streptococcus constellatus,Streptococcus anginosus group,54,1869828,,2023-01-11,23-011-3305,ICU,Blood Venous


In [3]:
mappedJourneyTubeIdsDf = mappingDf[['tube_code', 'PATIENT_ID', 'EPISODE_ID']].drop_duplicates().merge(
    demoDf[['person_id', 'visit_occurrence_id', 'JOURNEY_ID']],
    how='inner',
    left_on=['PATIENT_ID', 'EPISODE_ID'],
    right_on=['person_id', 'visit_occurrence_id']
).drop(
    columns=['PATIENT_ID', 'EPISODE_ID', 'visit_occurrence_id']
)[['person_id', 'JOURNEY_ID', 'tube_code']].drop_duplicates()
mappedJourneyTubeIdsDf

Unnamed: 0,person_id,JOURNEY_ID,tube_code
0,50056,13224842,AH20B011
1,62567,12645629,AH19G065
2,138108,14737781,AH21E052
3,211314,13955099,AH20J009
4,211314,13955099,KPN2214
...,...,...,...
512,2084373,16158768,ALF22C004
513,1058066,16759482,ALF22H089
514,2552781,16966609,ALF22J054
515,2638484,16969999,ALF22K143


## Read bed files containing overlapping tokens and attribution scores

In [4]:
import os
import pandas as pd
from pathlib import Path


overlappingFilesDir = Path(os.environ['GENOMICS_DATA_BASE'], 'genome_nlp_tokens', 'overlapping_with_annotations')

overlappingDfList = []
for overlappingFile in os.listdir(overlappingFilesDir):
    df = pd.read_csv(Path(overlappingFilesDir, overlappingFile), sep='\t', names=['contig_id', 'start_position', 'end_position', 'tokens', 'score', 'id', 'name', 'gene', 'atributes'])
    df['tube_code'] = [overlappingFile.split('_')[0]]*df.shape[0]
    overlappingDfList.append(df)
overlappingDf = pd.concat(overlappingDfList, ignore_index=True)
overlappingDf = overlappingDf.merge(
    mappedJourneyTubeIdsDf,
    how='inner',
    on=['tube_code']
)
overlappingDf = overlappingDf.drop_duplicates()
overlappingDf

Unnamed: 0,contig_id,start_position,end_position,tokens,score,id,name,gene,atributes,tube_code,person_id,JOURNEY_ID
0,1,191,192,GT,-0.001426,CFBAKF_00005,DUF3969 domain-containing protein,,ID=CFBAKF_00005;Name=DUF3969 domain-containing...,AH19E065,2221447,681015
1,1,193,197,ATAAC,-0.005218,CFBAKF_00005,DUF3969 domain-containing protein,,ID=CFBAKF_00005;Name=DUF3969 domain-containing...,AH19E065,2221447,681015
2,1,198,199,GC,-0.000682,CFBAKF_00005,DUF3969 domain-containing protein,,ID=CFBAKF_00005;Name=DUF3969 domain-containing...,AH19E065,2221447,681015
3,1,200,203,GCCA,-0.010548,CFBAKF_00005,DUF3969 domain-containing protein,,ID=CFBAKF_00005;Name=DUF3969 domain-containing...,AH19E065,2221447,681015
4,1,204,210,ATTGATT,-0.004120,CFBAKF_00005,DUF3969 domain-containing protein,,ID=CFBAKF_00005;Name=DUF3969 domain-containing...,AH19E065,2221447,681015
...,...,...,...,...,...,...,...,...,...,...,...,...
121252,31,99,103,GGTGG,-0.252315,KKIOAG_13715,tRNA-Ile(gat),trnI,ID=KKIOAG_13715;Name=tRNA-Ile(gat);locus_tag=K...,AH21K020,2035444,15586881
121253,31,104,106,TTC,-0.134230,KKIOAG_13715,tRNA-Ile(gat),trnI,ID=KKIOAG_13715;Name=tRNA-Ile(gat);locus_tag=K...,AH21K020,2035444,15586881
121254,31,107,110,GAGT,-0.080241,KKIOAG_13715,tRNA-Ile(gat),trnI,ID=KKIOAG_13715;Name=tRNA-Ile(gat);locus_tag=K...,AH21K020,2035444,15586881
121255,31,111,116,CCACTT,0.215479,KKIOAG_13715,tRNA-Ile(gat),trnI,ID=KKIOAG_13715;Name=tRNA-Ile(gat);locus_tag=K...,AH21K020,2035444,15586881


## Perform chi-square test

### High attribution score

In [5]:
from scipy import stats

cutoff = 0.02
highScoreOverlappingDf = overlappingDf[(overlappingDf.score > cutoff)].tokens.value_counts().reset_index()
lowScoreOverlappingDf = overlappingDf[(overlappingDf.score < cutoff)].tokens.value_counts().reset_index()
mergedOverlappingDf = highScoreOverlappingDf.add_suffix('_hs').merge(
    lowScoreOverlappingDf.add_suffix('_ls'),
    how='inner',
    left_on=['tokens_hs'],
    right_on=['tokens_ls']
)[['tokens_hs', 'count_hs', 'count_ls']].rename(columns={'tokens_hs': 'tokens'})

mergedOverlappingDf['proportion_ls'] = mergedOverlappingDf.count_ls/mergedOverlappingDf.count_ls.sum()

mergedOverlappingDf['count_expected'] = mergedOverlappingDf.proportion_ls * mergedOverlappingDf.count_hs.sum()

filteredOverlappingDf = mergedOverlappingDf[(mergedOverlappingDf.count_hs >= 5) & (mergedOverlappingDf.count_ls >= 5)]

chi2, p, dof, expected = stats.chi2_contingency(pd.crosstab(filteredOverlappingDf.count_hs, filteredOverlappingDf.count_expected), correction=True)
significant = p < 0.05  # 5% significance level
print(chi2, p, significant)


1197.7722222222224 0.020681173280632906 True


`The test indicates that the tokens associated with the high score group are significantly different than the rest`

#### Calculate the ratio of count differences

In [6]:
filteredOverlappingDf.loc[:, 'ratio_difference'] = (filteredOverlappingDf.count_hs / filteredOverlappingDf.count_expected)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredOverlappingDf.loc[:, 'ratio_difference'] = (filteredOverlappingDf.count_hs / filteredOverlappingDf.count_expected)


#### Over represented tokens associated with the high attribution score

In [7]:
overrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference > 4]
overrepresentedTokensDf

  overrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference > 4]


Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,ratio_difference
54,AAAATGT,5,20,0.000877,1.241343,4.027895
57,CCTGAG,5,19,0.000833,1.179276,4.23989
45,TTGGTT,6,22,0.000964,1.365477,4.394068
25,TATAGT,9,31,0.001359,1.924082,4.677556
4,TAATTT,29,97,0.004252,6.020514,4.816865
5,CAC,27,89,0.003901,5.523977,4.887783
22,CTAA,12,37,0.001622,2.296485,5.225378
44,ATGTGCA,6,18,0.000789,1.117209,5.370527
50,CTGACT,6,18,0.000789,1.117209,5.370527
2,AGAGC,31,88,0.003857,5.461909,5.675671


#### Over represented genes associated with the high attribution score

In [25]:
overrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).gene.value_counts().reset_index()[:5]

Unnamed: 0,gene,count
0,trnI,133
1,tsr24,95
2,trnA,84
3,trnR,84
4,trnL,77


#### Over represented annotations associated with the high attribution score

In [8]:
overrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).name.value_counts().reset_index()[:5]

Unnamed: 0,name,count
0,tRNA-Ile(gat),133
1,tRNA-Ala(tgc),84
2,tRNA-Arg(acg),84
3,tRNA-Leu(taa),76
4,S. aureus tsr24 small RNA,56


#### Under represented tokens associated with the high attribution score

In [9]:
underrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference < 0.5]
underrepresentedTokensDf

  underrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference < 0.5]


Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,ratio_difference
63,CGAA,5,241,0.010564,14.958184,0.334265
9,CG,25,1112,0.048742,69.018673,0.362221


#### Under represented genes associated with the high attribution score

In [10]:
underrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).gene.value_counts().reset_index()[:5]

Unnamed: 0,gene,count
0,rrl,143
1,rrs,101
2,tnp,90
3,trnA,83
4,trnL,38


#### Under represented annotations associated with the high attribution score

In [11]:
underrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).name.value_counts().reset_index()[:5]

Unnamed: 0,name,count
0,23S ribosomal RNA,86
1,tRNA-Ala(tgc),83
2,16S ribosomal RNA,72
3,Phage protein,46
4,ISL3 family IS1181 transposase,44


### Low attribution score

In [12]:
from scipy import stats

cutoff = -0.02
highScoreOverlappingDf = overlappingDf[(overlappingDf.score > cutoff)].tokens.value_counts().reset_index()
lowScoreOverlappingDf = overlappingDf[(overlappingDf.score < cutoff)].tokens.value_counts().reset_index()
mergedOverlappingDf = highScoreOverlappingDf.add_suffix('_hs').merge(
    lowScoreOverlappingDf.add_suffix('_ls'),
    how='inner',
    left_on=['tokens_hs'],
    right_on=['tokens_ls']
)[['tokens_hs', 'count_hs', 'count_ls']].rename(columns={'tokens_hs': 'tokens'})

mergedOverlappingDf['proportion_ls'] = mergedOverlappingDf.count_ls/mergedOverlappingDf.count_ls.sum()

mergedOverlappingDf['count_expected'] = mergedOverlappingDf.proportion_ls * mergedOverlappingDf.count_hs.sum()

filteredOverlappingDf = mergedOverlappingDf[(mergedOverlappingDf.count_hs >= 5) & (mergedOverlappingDf.count_ls >= 5)]

chi2, p, dof, expected = stats.chi2_contingency(pd.crosstab(filteredOverlappingDf.count_hs, filteredOverlappingDf.count_expected), correction=True)
significant = p < 0.05  # 5% significance level
print(chi2, p, significant)


11124.0714839337 6.47032501278991e-161 True


#### Calculate the ratio of count differences

In [13]:
filteredOverlappingDf.loc[:, 'ratio_difference'] = (filteredOverlappingDf.count_hs / filteredOverlappingDf.count_expected)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredOverlappingDf.loc[:, 'ratio_difference'] = (filteredOverlappingDf.count_hs / filteredOverlappingDf.count_expected)


#### Over represented tokens associated with the low attribution score

In [14]:
overrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference > 4]
overrepresentedTokensDf

  overrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference > 4]


Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,ratio_difference
8,CCG,296,8,0.000925,71.089732,4.163752
10,CATC,263,7,0.000809,62.203515,4.228057
6,GCAAT,299,7,0.000809,62.203515,4.806802


#### Over represented genes associated with the low attribution score

In [15]:
overrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).gene.value_counts().reset_index()[:5]

Unnamed: 0,gene,count
0,rrs,84
1,trnK,84
2,rrl,78
3,mrp,45
4,trnN,45


#### Over represented annotations associated with the low attribution score

In [18]:
overrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).name.value_counts().reset_index()[:5]

Unnamed: 0,name,count
0,tRNA-Lys(ttt),84
1,16S ribosomal RNA,60
2,tRNA-Asn(gtt),45
3,Fe-S cluster carrier ATPase%2C Mrp/ApbC/NBP35 ...,45
4,23S ribosomal RNA,43


#### Under represented tokens associated with the low attribution score

In [16]:
underrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference < 0.1]
underrepresentedTokensDf

  underrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference < 0.1]


Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,ratio_difference
1528,CTCACC,9,29,0.003353,257.700278,0.034924
1089,GACCCC,21,58,0.006707,515.400555,0.040745
1401,CCGAG,12,29,0.003353,257.700278,0.046566
1369,TCCCAGC,13,29,0.003353,257.700278,0.050446
1371,CCACCCC,13,26,0.003006,231.041628,0.056267
1399,AGAGGA,12,24,0.002775,213.269195,0.056267
1607,ATGTACA,6,12,0.001388,106.634598,0.056267
1161,CCCAAA,19,29,0.003353,257.700278,0.073729
1633,TAGTTTTT,6,9,0.001041,79.975948,0.075023
1121,CTCTTG,20,29,0.003353,257.700278,0.07761


#### Under represented genes associated with the low attribution score

In [17]:
underrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).gene.value_counts().reset_index()[:5]

Unnamed: 0,gene,count
0,trnL,113
1,trnA,83
2,trnR,78
3,trnP,76
4,trnG,71


#### Under represented annotations associated with the low attribution score

In [20]:
underrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).name.value_counts().reset_index()[:5]

Unnamed: 0,name,count
0,tRNA-Leu(taa),113
1,tRNA-Ala(tgc),83
2,tRNA-Pro(tgg),76
3,tRNA-Arg(acg),76
4,tRNA-Gly(gcc),71
