# Statistical analysis to get the significantly associated tokens for attribution score groupings

## Read mapping information

In [1]:
import os

import pandas as pd


demoTrainDf = pd.read_csv(os.environ['EHR_DATA_BASE'] + '/blood_pos_cohort_20240614/data/wb_365_wa_1/splits_v1/demographics/mortality_normal_train.csv', sep='\t')
demoTestDf = pd.read_csv(os.environ['EHR_DATA_BASE'] + '/blood_pos_cohort_20240614/data/wb_365_wa_1/splits_v1/demographics/mortality_normal_test.csv', sep='\t')
demoValidateDf = pd.read_csv(os.environ['EHR_DATA_BASE'] + '/blood_pos_cohort_20240614/data/wb_365_wa_1/splits_v1/demographics/mortality_normal_validate.csv', sep='\t')
demoDf = pd.concat([demoTrainDf, demoTestDf, demoValidateDf], ignore_index=True)
demoDf

Unnamed: 0,person_id,visit_occurrence_id,TYPE,COUNTRY_OF_BIRTH,SEX,Age_Bin,JOURNEY_ID
0,678487,3312694,Emergency,Australia,Male,"(60, 70]",3312694
1,2094045,9199249,Emergency,Australia,Male,"(20, 30]",9199249
2,2107813,9410987,Emergency,Australia,Male,"(50, 60]",9410987
3,2083212,602890,Inpatient,Australia,Female,"(40, 50]",9039437
4,2083212,9039437,Emergency,Australia,Female,"(40, 50]",9039437
...,...,...,...,...,...,...,...
7194,2603535,16588204,Inpatient,Australia,Male,"(60, 70]",16588204
7195,2603535,16709341,Inpatient,Australia,Male,"(60, 70]",16709341
7196,2603535,16651755,Inpatient,Australia,Male,"(60, 70]",16651755
7197,2098616,16872774,Emergency,Australia,Female,"(80, 90]",16872774


In [2]:
import os

mappingDf = pd.read_csv(os.environ['GENOMICS_DATA_BASE'] + '/patient_tube_id_mapping_full.tsv', sep='\t')
mappingDf

Unnamed: 0,tube_code,PATIENT_ID,db_ID,pt_age,pt_gender,EPISODE_ID,hospital_admission,hospital_discharge,hospital_in_last_year,genome_species,species_reported,contig_number,length,tube_code_duplicate_or_old,date_of_collection,ID_number,location_additional_02,collected_from_original
0,AH19J072,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Escherichia coli,Escherichia coli,92,5131021,,2019-10-30,19-303-0997,A-7EA;HAEM,Blood
1,AH19J074,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Streptococcus sp. D19,Streptococcus mitis group,84,2022991,,2019-10-30,19-303-1174,A-7EA;HAEM,Blood Peripheral
2,AH19B003,526238.0,567AE,85,Male,640261.0,2019-02-04,2019-02-07,no,Escherichia coli,Escherichia coli,93,4942915,,2019-02-03,19-034-1596,A-4WB;GMC,Blood
3,AH20A024,788941.0,3YBDS,58,Female,13194824.0,2020-01-16,2020-01-19,no,Escherichia coli,Escherichia coli,110,5149157,,2020-01-16,20-016-2112,S-ED;Emergency,Blood Venous
4,AH20L041,1023735.0,2VJ78,42,Male,14210147.0,2020-12-18,2021-01-05,yes,Staphylococcus aureus,Staphylococcus aureus,46,2735216,,2020-12-16,20-351-2856,Emergency Dept;nan,Blood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3112,ALF22L076,639020.0,3S6HL,84,Male,17219899.0,2022-12-15,2022-12-20,yes,Streptococcus vestibularis,Streptococcus salivarius group,107,1938627,,2022-12-15,22-349-2150,Ward 4WB,Blood Peripheral
3113,ALF22L089,655404.0,6EWBM,67,Female,,,,,Proteus columbae,Proteus vulgaris,152,3972219,,2022-12-17,22-351-1465,Comm/Amb Clinic,Blood
3114,ALF22L138,2572048.0,YM5PJ,61,Male,,,,,Enterobacter roggenkampii,Enterobacter cloacae complex,66,4876179,,2022-12-27,22-361-0202,Ward 7 East,Hick White Lum
3115,ALF23A102,2054964.0,5DYAH,35,Female,17294287.0,2023-01-12,2023-02-01,yes,Streptococcus constellatus,Streptococcus anginosus group,54,1869828,,2023-01-11,23-011-3305,ICU,Blood Venous


In [3]:
mappedJourneyTubeIdsDf = mappingDf[['tube_code', 'PATIENT_ID', 'EPISODE_ID']].drop_duplicates().merge(
    demoDf[['person_id', 'visit_occurrence_id', 'JOURNEY_ID']],
    how='inner',
    left_on=['PATIENT_ID', 'EPISODE_ID'],
    right_on=['person_id', 'visit_occurrence_id']
).drop(
    columns=['PATIENT_ID', 'EPISODE_ID', 'visit_occurrence_id']
)[['person_id', 'JOURNEY_ID', 'tube_code']].drop_duplicates()
mappedJourneyTubeIdsDf

Unnamed: 0,person_id,JOURNEY_ID,tube_code
0,50056,13224842,AH20B011
1,62567,12645629,AH19G065
2,138108,14737781,AH21E052
3,211314,13955099,AH20J009
4,211314,13955099,KPN2214
...,...,...,...
512,2084373,16158768,ALF22C004
513,1058066,16759482,ALF22H089
514,2552781,16966609,ALF22J054
515,2638484,16969999,ALF22K143


## Read bed files containing overlapping tokens and attribution scores

In [4]:
import os
import pandas as pd
from pathlib import Path


overlappingFilesDir = Path(os.environ['GENOMICS_DATA_BASE'], 'genome_nlp_tokens', 'overlapping_with_annotations')

overlappingDfList = []
for overlappingFile in os.listdir(overlappingFilesDir):
    df = pd.read_csv(Path(overlappingFilesDir, overlappingFile), sep='\t', names=['contig_id', 'start_position', 'end_position', 'tokens', 'score', 'feature_type', 'id', 'name', 'gene', 'atributes'])
    df['tube_code'] = [overlappingFile.split('_')[0]]*df.shape[0]
    overlappingDfList.append(df)
overlappingDf = pd.concat(overlappingDfList, ignore_index=True)
overlappingDf = overlappingDf.merge(
    mappedJourneyTubeIdsDf,
    how='inner',
    on=['tube_code']
)
overlappingDf = overlappingDf.drop_duplicates()
overlappingDf = overlappingDf[overlappingDf.tokens.apply(lambda x: (len(x) >= 5))].reset_index()
overlappingDf['gene'] = overlappingDf.gene.str.lower()
overlappingDf = overlappingDf[overlappingDf.feature_type.isin(['CDS', 'ncRNA', 'oriC', 'regulatory_region', 'oriT'])]
overlappingDf

Unnamed: 0,index,contig_id,start_position,end_position,tokens,score,feature_type,id,name,gene,atributes,tube_code,person_id,JOURNEY_ID
0,1,1,193,197,ATAAC,-0.005218,CDS,CFBAKF_00005,DUF3969 domain-containing protein,,ID=CFBAKF_00005;Name=DUF3969 domain-containing...,AH19E065,2221447,681015
1,4,1,204,210,ATTGATT,-0.004120,CDS,CFBAKF_00005,DUF3969 domain-containing protein,,ID=CFBAKF_00005;Name=DUF3969 domain-containing...,AH19E065,2221447,681015
2,5,1,211,218,AATATTTT,0.003234,CDS,CFBAKF_00005,DUF3969 domain-containing protein,,ID=CFBAKF_00005;Name=DUF3969 domain-containing...,AH19E065,2221447,681015
3,6,1,219,225,GTCATTG,-0.009522,CDS,CFBAKF_00005,DUF3969 domain-containing protein,,ID=CFBAKF_00005;Name=DUF3969 domain-containing...,AH19E065,2221447,681015
4,7,1,226,233,AATTCTAC,-0.004358,CDS,CFBAKF_00005,DUF3969 domain-containing protein,,ID=CFBAKF_00005;Name=DUF3969 domain-containing...,AH19E065,2221447,681015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94483,121154,20,410,415,TAACTT,0.017504,CDS,KKIOAG_13685,YopX domain-containing protein,yopx,ID=KKIOAG_13685;Name=YopX domain-containing pr...,AH21K020,2035444,15586881
94484,121155,20,416,420,TTTTA,0.001789,CDS,KKIOAG_13685,YopX domain-containing protein,yopx,ID=KKIOAG_13685;Name=YopX domain-containing pr...,AH21K020,2035444,15586881
94485,121156,20,421,427,TCTTTAT,0.003276,CDS,KKIOAG_13685,YopX domain-containing protein,yopx,ID=KKIOAG_13685;Name=YopX domain-containing pr...,AH21K020,2035444,15586881
94486,121157,20,428,432,CCCAC,0.005503,CDS,KKIOAG_13685,YopX domain-containing protein,yopx,ID=KKIOAG_13685;Name=YopX domain-containing pr...,AH21K020,2035444,15586881


## Perform chi-square test

### High attribution score

In [5]:
cutoff = overlappingDf.score.mean() + 2 * overlappingDf.score.std()
cutoff

np.float64(0.034861431584150615)

In [6]:
from scipy import stats

highScoreOverlappingDf = overlappingDf[(overlappingDf.score > cutoff)].tokens.value_counts().reset_index()
lowScoreOverlappingDf = overlappingDf[(overlappingDf.score < cutoff)].tokens.value_counts().reset_index()
mergedOverlappingDf = highScoreOverlappingDf.add_suffix('_hs').merge(
    lowScoreOverlappingDf.add_suffix('_ls'),
    how='inner',
    left_on=['tokens_hs'],
    right_on=['tokens_ls']
)[['tokens_hs', 'count_hs', 'count_ls']].rename(columns={'tokens_hs': 'tokens'})

mergedOverlappingDf['proportion_ls'] = mergedOverlappingDf.count_ls/mergedOverlappingDf.count_ls.sum()

mergedOverlappingDf['count_expected'] = mergedOverlappingDf.proportion_ls * mergedOverlappingDf.count_hs.sum()

filteredOverlappingDf = mergedOverlappingDf[(mergedOverlappingDf.count_hs >= 5) & (mergedOverlappingDf.count_ls >= 5)]

chi2, p, dof, expected = stats.chi2_contingency(pd.crosstab(filteredOverlappingDf.count_hs, filteredOverlappingDf.count_expected), correction=True)
significant = p < 0.05  # 5% significance level
print(chi2, p, significant)


52.50000000000001 0.2061361964240253 False


`The test indicates that the tokens associated with the high score group are significantly different than the rest`

#### Calculate the ratio of count differences

In [7]:
filteredOverlappingDf.loc[:, 'ratio_difference'] = (filteredOverlappingDf.count_hs / filteredOverlappingDf.count_expected)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredOverlappingDf.loc[:, 'ratio_difference'] = (filteredOverlappingDf.count_hs / filteredOverlappingDf.count_expected)


#### Over represented tokens associated with the high attribution score

In [8]:
overrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference > 4]
overrepresentedTokensDf

  overrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference > 4]


Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,ratio_difference
7,CACCGT,6,22,0.004666,1.348462,4.449512
5,ATGAAA,7,22,0.004666,1.348462,5.191098
8,GAGCA,6,15,0.003181,0.919406,6.525952
0,CTTTG,10,21,0.004454,1.287169,7.76899
4,CACTAT,8,11,0.002333,0.674231,11.865366
2,ACAAGG,9,6,0.001273,0.367762,24.472318
3,TTCTTTGT,9,5,0.00106,0.306469,29.366782


#### Over represented genes associated with the high attribution score

In [22]:
tempDf = overrepresentedTokensDf.merge(overlappingDf[['tokens', 'feature_type', 'id', 'name', 'gene']], how='inner', on=['tokens'])

tempDf[['tokens', 'feature_type', 'gene', 'name']].groupby(by=['feature_type', 'gene', 'name']).agg('count').reset_index().sort_values(by=['tokens'], ascending=False)[:15]
# gene.value_counts().reset_index()[:5]

Unnamed: 0,feature_type,gene,name,tokens
3,CDS,esad,type VII secretion system secreted protein%2C ...,27
18,CDS,rplq,50S ribosomal protein L17,9
22,CDS,tnp,IS3 family transposase ORF B,8
4,CDS,fni,type 2 isopentenyl-diphosphate Delta-isomerase,4
12,CDS,noc,Chromosome segregation protein Spo0J%2C contai...,3
20,CDS,tagb,CDP-glycerol glycerophosphotransferase%2C TagB...,3
21,CDS,tnp,IS3 family ISSau2 transposase ORF A,3
26,ncRNA,rnaiii,(3' truncated) RNAIII,3
23,CDS,trka,Trk/Ktr K+ transport system regulatory compone...,2
11,CDS,muts,DNA mismatch repair ATPase MutS,2


#### Over represented annotations associated with the high attribution score

In [10]:
overrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).name.value_counts().reset_index()[:5]

Unnamed: 0,name,count
0,type VII secretion system secreted protein%2C ...,27
1,DUF600 domain-containing protein,12
2,50S ribosomal protein L17,9
3,IS3 family transposase ORF B,8
4,DUF5079 domain-containing protein,6


#### Under represented tokens associated with the high attribution score

In [16]:
underrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference < 1]
underrepresentedTokensDf

  underrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference < 1]


Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,ratio_difference
10,AAATAT,5,100,0.021209,6.129374,0.815744
11,ATGGT,5,89,0.018876,5.455143,0.916566


#### Under represented genes associated with the high attribution score

In [17]:
underrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).gene.value_counts().reset_index()[:5]

Unnamed: 0,gene,count
0,ilva,21
1,hly,10
2,esag,8
3,ara1,7
4,tnp,6


#### Under represented annotations associated with the high attribution score

In [18]:
underrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).name.value_counts().reset_index()[:5]

Unnamed: 0,name,count
0,threonine ammonia-lyase IlvA,21
1,Alpha-Hemolysin precursor,10
2,TIGR01741 family protein,9
3,Uncharacterized membrane-anchored protein,8
4,Aldo/keto reductase%2C related to diketogulona...,7


### Low attribution score

In [19]:
cutoff = overlappingDf.score.mean() - 2 * overlappingDf.score.std()
cutoff

np.float64(-0.04383813452765485)

In [20]:
from scipy import stats

highScoreOverlappingDf = overlappingDf[(overlappingDf.score > cutoff)].tokens.value_counts().reset_index()
lowScoreOverlappingDf = overlappingDf[(overlappingDf.score < cutoff)].tokens.value_counts().reset_index()
mergedOverlappingDf = highScoreOverlappingDf.add_suffix('_hs').merge(
    lowScoreOverlappingDf.add_suffix('_ls'),
    how='inner',
    left_on=['tokens_hs'],
    right_on=['tokens_ls']
)[['tokens_hs', 'count_hs', 'count_ls']].rename(columns={'tokens_hs': 'tokens'})

mergedOverlappingDf['proportion_ls'] = mergedOverlappingDf.count_ls/mergedOverlappingDf.count_ls.sum()

mergedOverlappingDf['count_expected'] = mergedOverlappingDf.proportion_ls * mergedOverlappingDf.count_hs.sum()

filteredOverlappingDf = mergedOverlappingDf[(mergedOverlappingDf.count_hs >= 5) & (mergedOverlappingDf.count_ls >= 5)]

chi2, p, dof, expected = stats.chi2_contingency(pd.crosstab(filteredOverlappingDf.count_hs, filteredOverlappingDf.count_expected), correction=True)
significant = p < 0.05  # 5% significance level
print(chi2, p, significant)


1282.0340721733464 5.533405826241168e-05 True


#### Calculate the ratio of count differences

In [21]:
filteredOverlappingDf.loc[:, 'ratio_difference'] = (filteredOverlappingDf.count_hs / filteredOverlappingDf.count_expected)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredOverlappingDf.loc[:, 'ratio_difference'] = (filteredOverlappingDf.count_hs / filteredOverlappingDf.count_expected)


#### Over represented tokens associated with the low attribution score

In [23]:
overrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference > 2]
overrepresentedTokensDf

  overrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference > 2]


Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,ratio_difference
11,ATCGC,151,5,0.00187,58.67988,2.573284
0,ATGAT,297,9,0.003366,105.623785,2.811867
1,ACGAT,254,6,0.002244,70.415856,3.607142


#### Over represented genes associated with the low attribution score

In [24]:
overrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).gene.value_counts().reset_index()[:5]

Unnamed: 0,gene,count
0,tnp,40
1,mrp,36
2,aro8,21
3,eap,12
4,esag,10


#### Over represented annotations associated with the low attribution score

In [25]:
overrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).name.value_counts().reset_index()[:5]

Unnamed: 0,name,count
0,Fe-S cluster carrier ATPase%2C Mrp/ApbC/NBP35 ...,36
1,Phage protein,29
2,DNA-binding transcriptional regulator%2C MocR ...,21
3,UPF0344 protein SAR0931,20
4,TIGR01741 family protein,17


#### Under represented tokens associated with the low attribution score

In [26]:
underrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference < 0.1]
underrepresentedTokensDf

  underrepresentedTokensDf = filteredOverlappingDf.sort_values(by=['ratio_difference'])[filteredOverlappingDf.ratio_difference < 0.1]


Unnamed: 0,tokens,count_hs,count_ls,proportion_ls,count_expected,ratio_difference
892,TAGTTTTT,6,9,0.003366,105.623785,0.056805
741,CTTGAT,12,15,0.00561,176.039641,0.068166
764,ATATATTT,11,11,0.004114,129.095737,0.085208
817,GACCTC,9,9,0.003366,105.623785,0.085208
891,CACTGC,6,6,0.002244,70.415856,0.085208
833,TGCCTT,9,9,0.003366,105.623785,0.085208
812,GGTTG,10,9,0.003366,105.623785,0.094676
869,CTAAGT,7,6,0.002244,70.415856,0.099409


#### Under represented genes associated with the low attribution score

In [27]:
underrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).gene.value_counts().reset_index()[:5]

Unnamed: 0,gene,count
0,esad,54
1,rplq,14
2,splc,5
3,aro8,4
4,meca,4


#### Under represented annotations associated with the low attribution score

In [28]:
underrepresentedTokensDf.merge(overlappingDf[['tokens', 'id', 'name', 'gene']], how='inner', on=['tokens']).name.value_counts().reset_index()[:5]

Unnamed: 0,name,count
0,type VII secretion system secreted protein%2C ...,54
1,50S ribosomal protein L17,14
2,serine protease SplC,5
3,Phage protein,4
4,DNA-binding transcriptional regulator%2C MocR ...,4
