In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

def normalized_harmonic_sum(x):
    """Rank-weighted sum of scores, scaled by a fixed constant.

    The values are ordered from largest to smallest, the value at 1-based
    rank ``k`` is divided by ``k ** 2``, and the weighted values are summed.
    The total is then divided by 1.644, the approximate value this weighted
    sum takes for a vector of 1,000 ones.

    Parameters
    ----------
    x : array-like of numbers
        Scores to aggregate (order does not matter).

    Returns
    -------
    float
        The weighted sum divided by 1.644.
    """
    descending = np.sort(x)[::-1]
    rank_weights = np.arange(1, len(descending) + 1) ** 2
    ceiling = 1.644  # approximate weighted sum for a vector of 1,000 ones
    return np.sum(descending / rank_weights) / ceiling
    
###

# Build `eva_map`: a lookup from disease identifiers (column `disease`) to the
# `Code` column of diseases.pkl, covering both the native `id` of each row and
# the prefixed cross-reference identifiers of four terminologies.
#
# NOTE: this frame used to be bound to the name `map`, which shadowed the
# Python builtin for the rest of the session; it is now `disease_map`.
disease_map = pd.read_pickle('./OT/Final/diseases.pkl')
disease_map['code'] = disease_map['code'].astype(str)

# Rows that carry a non-null `Code` for their own `id`.
direct_map = disease_map.loc[disease_map['Code'].notna(), ['id', 'Code']]


def _xref_map(disease_map, direct_map, terminology, prefix):
    """Return an (`id`, `Code`) frame for one cross-reference terminology.

    Rows of `disease_map` whose `terminology` equals `terminology` are joined
    to `direct_map` on `id` to pick up `Code`; `id` is then replaced by
    `prefix` + the row's `code`.
    """
    xrefs = disease_map.loc[disease_map['terminology'] == terminology, ['code', 'id']]
    # `id` is the only column the two frames share; name it explicitly so an
    # unexpected extra shared column cannot silently change the join.
    xrefs = xrefs.merge(direct_map, on='id')
    xrefs['id'] = prefix + xrefs['code'].astype(str)
    return xrefs[['id', 'Code']]


mondo_map = _xref_map(disease_map, direct_map, 'MONDO', 'MONDO_')
hp_map = _xref_map(disease_map, direct_map, 'HP', 'HP_')
orpha_map = _xref_map(disease_map, direct_map, 'Orphanet', 'Orphanet_')
efo_map = _xref_map(disease_map, direct_map, 'EFO', 'EFO_')

eva_map = pd.concat([direct_map, mondo_map, hp_map, orpha_map, efo_map]).drop_duplicates(['id', 'Code'])
eva_map = eva_map.rename(columns={'id': 'disease'})

# Record every `Code` that has at least one mapped disease identifier.
d = eva_map[['Code']].drop_duplicates().assign(OT=1)
d.to_csv('./Phenotyping/ot_present.csv', index=False)

# Target table merged into every evidence frame in the cells below.
targets = pd.read_pickle('./OT/Final/targets.pkl')


## Locus2gene

In [82]:
# Locus-to-gene evidence -> one activator score and one inhibitor score per
# (Code, gene), aggregated with normalized_harmonic_sum.
l2g = pd.read_pickle('./OT/Raw/ot_genetics_portal.pkl').rename({'diseaseId':'disease','targetId':'gene_id'},axis=1)
l2g = l2g.merge(eva_map).merge(targets)
l2g = l2g[['Code','gene','projectId','studyId','variantId','score','variantEffect','directionOnTrait']]
# Only rows with both a variant effect and a direction can be given a mode of action.
l2g = l2g.loc[l2g['variantEffect'].notna() & l2g['directionOnTrait'].notna()]
# Keep the highest-scoring row per (Code, gene, variant, effect, direction).
l2g = l2g.sort_values('score',ascending=False).drop_duplicates(['Code','gene','variantId','variantEffect','directionOnTrait'])

# (variantEffect, directionOnTrait) -> mode of action assigned to the row.
# Rows matching none of these combinations keep a missing `moa` and are
# dropped by the pivot below.
moa_rules = {
    ('GoF', 'risk'): 'inhibitor',
    ('GoF', 'protect'): 'activator',
    ('LoF', 'risk'): 'activator',
    ('LoF', 'protect'): 'inhibitor',
}
for (effect, direction), moa in moa_rules.items():
    l2g.loc[(l2g['variantEffect'] == effect) & (l2g['directionOnTrait'] == direction), 'moa'] = moa

l2g = pd.pivot_table(l2g, index=['Code','gene'], columns='moa', values='score', aggfunc=normalized_harmonic_sum).reset_index()
# Label the pivoted columns by name rather than by position: a positional
# set_axis raises when one mode of action is absent from the data. reindex
# guarantees both output columns exist (all-NaN if a mode of action is absent).
l2g = (
    l2g.rename(columns={'activator': 'l2g_activator', 'inhibitor': 'l2g_inhibitor'})
       .reindex(columns=['Code', 'gene', 'l2g_activator', 'l2g_inhibitor'])
       .rename_axis(columns=None)
)
l2g.to_pickle('./Features/l2g.pkl')


## Gene burden

In [70]:
# Gene-burden evidence -> one signed score per (Code, gene):
# positive = risk score, negative = protective score.
gb = (
    pd.read_pickle('./OT/Raw/gene_burden.pkl')
    .rename(columns={'diseaseId': 'disease', 'targetId': 'gene_id'})
    .merge(eva_map)
    .merge(targets)
)
gb = gb[['Code', 'gene', 'score', 'directionOnTrait', 'cohortId']]

# Collapse every cohort label that mentions UK Biobank into a single label so
# the de-duplication below treats them as one cohort.
is_ukb = gb['cohortId'].str.contains('UK Biobank', na=False)
gb.loc[is_ukb, 'cohortId'] = 'UK Biobank'

# Highest-scoring row per (Code, gene, cohort, direction).
gb = gb.sort_values('score', ascending=False)
gb = gb.drop_duplicates(['Code', 'gene', 'cohortId', 'directionOnTrait'])

gb = pd.pivot_table(
    gb,
    index=['Code', 'gene'],
    columns='directionOnTrait',
    values='score',
    aggfunc=normalized_harmonic_sum,
).reset_index()

# Use the risk score when there is no protective score or when risk is at
# least as large; otherwise use the negated protective score.
use_risk = gb['protect'].isna() | (gb['risk'] >= gb['protect'])
gb['ot_gene_burden'] = np.where(use_risk, gb['risk'], -gb['protect'])

gb = gb[['Code', 'gene', 'ot_gene_burden']]
gb.to_pickle('./Features/ot_gb.pkl')


## IMPC

In [51]:
# IMPC evidence -> one `impc_lof` score per (Code, gene).
impc = (
    pd.read_pickle('./OT/Raw/impc.pkl')
    .rename(columns={'diseaseId': 'disease', 'targetId': 'gene_id'})
    .merge(eva_map)
    .merge(targets)
    .loc[:, ['biologicalModelId', 'biologicalModelAllelicComposition',
             'biologicalModelGeneticBackground', 'Code', 'gene', 'score']]
    .drop_duplicates()
    # Constant column, so the pivot yields a single score column with this name.
    .assign(datasourceId='impc_lof')
)
impc = pd.pivot_table(
    impc,
    index=['Code', 'gene'],
    columns='datasourceId',
    values='score',
    aggfunc=normalized_harmonic_sum,
).reset_index()
impc.to_pickle('./Features/impc.pkl')


## Orphanet and Gene2phenotype (combining due to sparsity)

In [2]:
def _effect_flags(evidence):
    """Return one row per (Code, gene) with presence flags `gof` and `lof`.

    `evidence` needs the columns `Code`, `gene` and `variantEffect`. A flag is
    1 when at least one row with that effect exists for the pair, else NaN.
    Columns are labelled by effect name ('GoF' / 'LoF', the literals used for
    `variantEffect` elsewhere in this notebook) instead of by position, so a
    source containing only one effect still yields both columns.
    """
    flags = pd.pivot_table(
        evidence.assign(value=1),
        index=['Code', 'gene'],
        columns='variantEffect',
        values='value',
        aggfunc='max',
    )
    flags = flags.reindex(columns=['GoF', 'LoF']).rename_axis(columns=None)
    return flags.rename(columns={'GoF': 'gof', 'LoF': 'lof'}).reset_index()


orpha = pd.read_pickle('./OT/Raw/orphanet.pkl').rename({'diseaseId':'disease','targetId':'gene_id'},axis=1)
orpha = orpha.merge(eva_map).merge(targets)
# Rows missing any of these fields (including score) are discarded.
orpha = orpha[['Code','gene','variantEffect','score']].dropna()
orpha = _effect_flags(orpha)

g2p = pd.read_pickle('./OT/Raw/gene2phenotype.pkl').rename({'diseaseId':'disease','targetId':'gene_id'},axis=1)
g2p = g2p.merge(eva_map).merge(targets)
g2p = g2p.loc[g2p['score'] >= 0.5] # remove limited evidence
g2p = g2p[['Code','gene','variantEffect']].dropna()
g2p = _effect_flags(g2p)

# Combine the two sources into ONE row per (Code, gene). A plain concat left
# two rows for every pair present in both sources, which duplicates rows in
# any later merge on (Code, gene). max() ignores NaN, so a flag is set when
# either source sets it.
go = pd.concat([orpha, g2p]).groupby(['Code', 'gene'], as_index=False)[['gof', 'lof']].max()
go = go.rename(columns={'gof': 'orphanet_g2p_gof', 'lof': 'orphanet_g2p_lof'})
go.to_pickle('./Features/orphanet_g2p.pkl')


## ClinVar and UniProt variants

In [45]:
def _top_scored_variants(pickle_path, keep_columns):
    """Load one evidence pickle and keep the best row per (Code, variantId).

    The frame is renamed to the shared `disease` / `gene_id` keys, merged with
    `eva_map` and `targets`, cut down to `keep_columns`, and de-duplicated on
    (Code, variantId) after sorting by descending `score`.
    """
    frame = pd.read_pickle(pickle_path).rename(columns={'diseaseId': 'disease', 'targetId': 'gene_id'})
    frame = frame.merge(eva_map).merge(targets)
    frame = frame[keep_columns]
    return frame.sort_values('score', ascending=False).drop_duplicates(['Code', 'variantId'])


clinvar_columns = ['Code', 'gene', 'variantId', 'score', 'clinicalSignificances',
                   'confidence', 'variantEffect', 'directionOnTrait']

eva = _top_scored_variants('./OT/Raw/eva.pkl', clinvar_columns)
es = _top_scored_variants('./OT/Raw/eva_somatic.pkl', clinvar_columns)

# Stack the two ClinVar frames and persist them for the annotation step.
eva = pd.concat([eva, es])
eva.to_pickle('./OT/Processing/eva_cleaned.pkl')

up = _top_scored_variants('./OT/Raw/uniprot_variants.pkl', ['Code', 'gene', 'variantId', 'score'])


In [82]:
# Write ClinVar variants to VCF for VEP
# Collect the distinct, non-null variant IDs from all three evidence frames.
evep = pd.concat([frame[['variantId']].drop_duplicates() for frame in (eva, es, up)])
evep = evep.drop_duplicates().dropna()

# A variant ID is split on '_' into chromosome, position, ref and alt.
id_parts = evep['variantId'].str.split('_')
evep['chr'] = id_parts.str[0].astype(object)
evep['pos'] = id_parts.str[1].astype(int)
evep['ref'] = id_parts.str[2]
evep['alt'] = id_parts.str[3]
# QUAL, FILTER and INFO carry no information here; '.' is written for each.
for placeholder in ('qual', 'filter', 'info'):
    evep[placeholder] = '.'
evep['id'] = evep['variantId'].copy()

vcf_columns = ['chr', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info']
vcf_df = evep[vcf_columns].sort_values(['chr', 'pos', 'ref', 'alt'])

vcf_header = """##fileformat=VCFv4.2
##source=CustomGeneratedVCF
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
"""
# Header first, then the tab-separated records without index or column names.
with open('./OT/Processing/eva_unique.vcf', 'w') as vcf_file:
    vcf_file.write(vcf_header)
    vcf_df.to_csv(vcf_file, sep='\t', index=False, header=False)
    

In [116]:
# Parse the annotated VCF and reduce it to one annotation category per variant
# ID, saved as vep_annotations.pkl.
# NOTE(review): presumably this file is VEP + LOFTEE output for the
# eva_unique.vcf written earlier in this notebook — confirm.
# NOTE(review): skiprows=53 is a hard-coded number of leading lines to skip
# (presumably the VCF meta header); it must match the actual file — confirm.
vcf = pd.read_csv('./OT/Processing/eva_unique_loftee_filter.vcf', sep='\t', skiprows=53)
# Rename the eight columns positionally and discard QUAL / FILTER.
vcf = vcf.set_axis(['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO'],axis=1).drop(['QUAL','FILTER'],axis=1)
# One row per comma-separated INFO entry (explode repeats the index label of
# the source row), then split each entry into its '|'-separated fields.
vcf = vcf.assign(INFO=vcf['INFO'].str.split(',')).explode('INFO')
vcf['INFO'] = vcf['INFO'].str.split('|')

# Names given, by position, to the '|'-separated fields of each INFO entry.
column_names = [
    'Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Gene', 'Feature_type', 'Feature',
    'BIOTYPE', 'EXON', 'INTRON', 'HGVSc', 'HGVSp', 'cDNA_position', 'CDS_position',
    'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation', 'REF_ALLELE',
    'DISTANCE', 'STRAND', 'FLAGS', 'SYMBOL_SOURCE', 'HGNC_ID', 'CANONICAL', 'MANE_SELECT',
    'LoF','LoF_filter','LoF_flags','LoF_info'
]

# Expand the field lists into columns, built on the same (post-explode) index
# as `vcf` so the rows line up.
vcf[column_names] = pd.DataFrame(vcf['INFO'].tolist(), index=vcf.index)
vcf = vcf.drop(columns=['INFO'])
# Coarse variant class from the consequence string. The PTV assignment runs
# second, so a consequence matching both patterns ends up as 'PTV'.
vcf.loc[vcf['Consequence'].str.contains('stop_lost|start_lost|transcript_amplification|inframe_insertion|inframe_deletion|missense_variant|protein_altering_variant',na=False),'snp_type'] = 'Missense'
vcf.loc[vcf['Consequence'].str.contains('transcript_ablation|splice_acceptor_variant|splice_donor_variant|stop_gained|frameshift_variant',na=False),'snp_type'] = 'PTV'
# Discard entries whose IMPACT is 'MODIFIER'.
vcf = vcf.loc[vcf['IMPACT'] != 'MODIFIER']
vcf = vcf[['ID','Consequence','SYMBOL','LoF','snp_type']]

# Attach LoGoFunc GOF values; `ID` is the only column shared with `vcf` here.
gof = pd.read_pickle('./LoGoFunc/all_predictions.pkl')
gof = gof[['ID','LoGoFunc_GOF']]
vcf = vcf.merge(gof, how='left')

# Attach LoGoFunc LOF values the same way (tab-separated file despite .csv).
lof = pd.read_csv('./LoGoFunc/filtered_LOGO_LOF.csv', sep='\t')
lof = lof[['ID','LoGoFunc_LOF']]
vcf = vcf.merge(lof, how='left')

# Assign the annotation category. Each statement overwrites the previous ones
# for the rows it matches, so the order below defines the priority: the last
# matching rule wins and '1_lof_hc' takes precedence over everything else.
vcf['annotation'] = '5_all_other'
vcf.loc[vcf['snp_type'] == 'Missense', 'annotation'] = '4_other_missense'
vcf.loc[vcf['LoGoFunc_GOF'].notna(), 'annotation'] = '3_gof'
#vcf.loc[vcf['LoGoFunc_GOF'] >= 2/3, 'annotation'] = '4_gof_mc'
vcf.loc[vcf['LoGoFunc_LOF'].notna(), 'annotation'] = '2_lof_lc'
#vcf.loc[vcf['LoGoFunc_LOF'] >= 2/3, 'annotation'] = '2_lof_mc'
vcf.loc[vcf['LoF'] == 'LC', 'annotation'] = '2_lof_lc'
vcf.loc[vcf['LoF'] == 'HC', 'annotation'] = '1_lof_hc'

vcf = vcf[['ID','SYMBOL','Consequence','snp_type','annotation']]
# Sorting the labels ascending puts the lowest-numbered category first, so
# drop_duplicates keeps that category's row for each variant ID.
vcf = vcf.sort_values('annotation', ascending=True).drop_duplicates('ID')
vcf.to_pickle('./OT/Processing/vep_annotations.pkl')


  vcf = pd.read_csv('./OT/Processing/eva_unique_loftee_filter.vcf', sep='\t', skiprows=53)
  lof = pd.read_csv('./LoGoFunc/filtered_LOGO_LOF.csv', sep='\t')


In [3]:
# ClinVar evidence + VEP annotation -> one score per annotation category and
# (Code, gene), restricted to risk-direction variants.
vcf = pd.read_pickle('./OT/Processing/vep_annotations.pkl').rename({'ID':'variantId'},axis=1)[['variantId','annotation']]
combined = pd.read_pickle('./OT/Processing/eva_cleaned.pkl')
combined = combined.merge(vcf, how='left')

# Variants without a VEP annotation but reported as LoF are treated as
# high-confidence LoF. The comparison must be parenthesised: `&` binds tighter
# than `==`, so the unparenthesised form evaluated
# `(annotation.isna() & variantEffect) == 'LoF'` and never selected these rows
# as intended.
unannotated_lof = combined['annotation'].isna() & (combined['variantEffect'] == 'LoF')
combined.loc[unannotated_lof, 'annotation'] = '1_lof_hc'

combined = combined.loc[(combined['annotation'].notna()) & (combined['directionOnTrait'] == 'risk')]
combined = pd.pivot_table(combined, index=['Code','gene'], columns='annotation', values='score', aggfunc=normalized_harmonic_sum).reset_index()
# Drop the catch-all category, then any column with fewer than 3 non-null values.
combined = combined.drop('5_all_other', axis=1, errors='ignore').dropna(thresh=3, axis=1)
# Rename by label instead of by position, so the names stay correct (and no
# length-mismatch error is raised) if a category column was dropped or absent.
combined = combined.rename(columns={
    '1_lof_hc': 'eva_hclof',
    '2_lof_lc': 'eva_lclof',
    '3_gof': 'eva_gof',
    '4_other_missense': 'eva_other_missense',
}).rename_axis(columns=None)
combined.to_pickle('./Features/eva.pkl')


In [119]:
# UniProt variant evidence -> one score per annotation category and (Code, gene).
up = (
    pd.read_pickle('./OT/Raw/uniprot_variants.pkl')
    .rename(columns={'diseaseId': 'disease', 'targetId': 'gene_id'})
    .merge(eva_map)
    .merge(targets)
    .loc[:, ['Code', 'gene', 'variantId', 'score']]
    # Best-scoring row per (Code, variantId).
    .sort_values('score', ascending=False)
    .drop_duplicates(['Code', 'variantId'])
    # Inner merge with the annotation table currently bound to `vcf`.
    .merge(vcf)
)
up = pd.pivot_table(
    up,
    index=['Code', 'gene'],
    columns='annotation',
    values='score',
    aggfunc=normalized_harmonic_sum,
).reset_index()
up.to_pickle('./Features/uniprot_variants.pkl')


In [127]:
# Sanity check: non-null counts per column after joining the drug table with
# the UniProt feature table on their shared columns.
drugs = pd.read_pickle('./Drugs/combined.pkl')
temp = pd.merge(drugs, up)
temp.count()

Code                  96
gene                  96
Negative modulator    62
Other                 31
Positive modulator    24
1_lof_hc              20
2_lof_lc              61
3_gof                 59
4_other_missense      51
5_all_other           23
dtype: int64

## Direct target-disease association scores

In [None]:
# Direct association scores. Both outputs start from the same raw table joined
# to the target table, so that table is loaded and merged once instead of
# reading the same two pickles twice.
da_genes = (
    pd.read_pickle('./OT/Raw/associationByOverallDirect.pkl')
    .rename(columns={'diseaseId': 'disease', 'targetId': 'gene_id'})
    .merge(pd.read_pickle('./OT/Final/targets.pkl'))
)

# Maximum score per (Code, gene) over all disease identifiers mapped to the Code.
da = da_genes.merge(eva_map).groupby(['Code','gene'])['score'].max().reset_index()
da.to_pickle('./OT/Final/direct_associations.pkl')

# Maximum score per gene over every disease, mapped or not.
da = da_genes.groupby(['gene'])['score'].max().reset_index()
da = da.rename(columns={'score': 'max_ot'})
da.to_pickle('./OT/Final/max_da.pkl')


## Mantis-ML scores

In [None]:
# Build an HPO -> Code lookup from three sources (map1, map2a, map2b), then
# use it to reduce the Mantis-ML score table to one maximum score per
# (Code, gene), saved as mantis_cleaned.pkl.

# Source 1: identifiers in eva_map that contain 'HP_', relabelled as HPO.
map1 = eva_map.loc[eva_map['disease'].str.contains('HP_')]
map1 = map1.rename({'disease':'HPO'},axis=1)

# Pickled file of 2023AB_MRCONSO.RRF from UMLS
# Columns are addressed by position: column 11 is compared against vocabulary
# names, column 13 holds the code in that vocabulary, and column 0 is the key
# the ICD and HPO subsets are joined on.
# NOTE(review): presumably 0/11/13 are CUI/SAB/CODE in the MRCONSO layout — confirm.
umls = pd.read_pickle('./Other/umls.pkl')[[0,11,13]]
umls_icd = umls.loc[umls[11].isin(['ICD10CM','ICD10PCS','ICD10'])]
umls_icd = umls_icd[[0,13]].drop_duplicates().rename({13:'ICD10'},axis=1)
umls_hpo = umls.loc[umls[11] == 'HPO']
umls_hpo = umls_hpo[[0,13]].drop_duplicates().rename({13:'HPO'},axis=1)
# Pair every ICD10 code with every HPO code that shares a column-0 key.
map2 = umls_icd.merge(umls_hpo)[['ICD10','HPO']]
# Match the 'HP_' identifier style used in eva_map.
map2['HPO'] = map2['HPO'].str.replace(':','_')
# Source 2: ICD10 -> Code through the `custom` table.
# NOTE(review): `custom` is not defined anywhere in the visible notebook; this
# cell fails on a fresh kernel unless it is loaded elsewhere — confirm.
map2a = map2.merge(custom[['icd','custom_code']].dropna().drop_duplicates().set_axis(['ICD10','Code'],axis=1))
map2a = map2a[['HPO','Code']].drop_duplicates()
# Source 3: the first three characters of the ICD10 code used directly as Code,
# kept only when that value occurs in temp['Code'].
# NOTE(review): the only visible binding of `temp` is the drugs/UniProt merge
# from the earlier sanity-check cell — confirm that is the intended filter.
map2b = map2.copy()
map2b['ICD10'] = map2b['ICD10'].str[:3]
map2b = map2b.loc[map2b['ICD10'].isin(temp['Code'])]
map2b = map2b.rename({'ICD10':'Code'},axis=1)

hpo_code = pd.concat([map1,map2a,map2b]).drop_duplicates()

# HPO code <-> lower-cased label, used to attach an HPO code to each Mantis row.
hpo = pd.read_csv('./Other/hpo_terms.tsv', sep='\t')
hpo = hpo[['hpo_code','hpo_label']].set_axis(['code','disease'],axis=1)
hpo['disease'] = hpo['disease'].str.lower()

mantis = pd.read_csv('./Other/mantis_ml_scores_out-release.tsv', sep='\t')
mantis['disease'] = mantis['disease'].str.lower()
# Inner merge on the columns shared with `hpo` (the lower-cased `disease`, at least).
mantis = mantis.merge(hpo).drop(['resource','disease'],axis=1)
# Wide -> long: every remaining column becomes a (variable, value) pair per code.
mantis = pd.melt(mantis, id_vars=['code'])
mantis = mantis.rename({'code':'HPO'},axis=1).merge(hpo_code)
# Maximum value per (Code, variable); the three columns are then renamed by position.
mantis = mantis.groupby(['Code','variable'])['value'].max().reset_index()
mantis = mantis.set_axis(['Code','Gene','Mantis'],axis=1)
mantis.to_pickle('./Other/mantis_cleaned.pkl')

# Maximum Mantis-ML score per gene across all diseases in the release table.
mantis = pd.read_csv('./Other/mantis_ml_scores_out-release.tsv', sep='\t')
mantis = mantis.drop('resource', axis=1)
mantis['disease'] = mantis['disease'].str.lower()
# Wide -> long: one (disease, variable, value) row per score column.
mantis = pd.melt(mantis, id_vars=['disease'])
mantis = (
    mantis.groupby('variable')['value']
    .max()
    .reset_index()
    .rename(columns={'variable': 'gene', 'value': 'max_mantis'})
)
mantis.to_pickle('./Other/mantis_max.pkl')
