In [1]:
import pandas as pd

meddra_all_se.tsv.gz
-----------------------------

1 & 2: STITCH compound ids (flat/stereo, see above)
3: UMLS concept id as it was found on the label
4: MedDRA concept type (LLT = lowest level term, PT = preferred term; in a few cases the term is neither LLT nor PT)
5: UMLS concept id for MedDRA term
6: side effect name

All side effects found on the labels are given as LLT. Additionally, the PT is shown. There is at least one
PT for every LLT, but sometimes the PT is the same as the LLT. LLTs are sometimes too detailed, and therefore
you might want to filter for PT. E.g. for this term:

PT      C0235431        Blood creatinine increased

there are several LLT (leftmost number = count in SIDER 4.1)

149     C0151578        LLT     C0151578        Creatinine increased
100     C0235431        LLT     C0235431        Blood creatinine increased
93      C0700225        LLT     C0700225        Serum creatinine increased
2       C0858118        LLT     C0858118        Plasma creatinine increased

All of these LLTs are equivalent for most purposes and to the same PT. 

344     PT      C0235431        Blood creatinine increased

The mapping was performed by extracting the LLT-->PT relations from UMLS. 


meddra_freq.tsv.gz
-------------------------

This file contains the frequencies of side effects as extracted from the labels. Format:

1 & 2: STITCH compound ids (flat/stereo, see above)
3: UMLS concept id as it was found on the label
4: "placebo" if the info comes from placebo administration, "" otherwise
5: a description of the frequency: for example "postmarketing", "rare", "infrequent", "frequent", "common", or an exact
   percentage
6: a lower bound on the frequency
7: an upper bound on the frequency
8-10: MedDRA information as for meddra_all_se.tsv.gz

The bounds are ranges like 0.01 to 1 for "frequent". If the exact frequency is known, then the lower bound
matches the upper bound. Due to the nature of the data, there can be more than one frequency for the same label,
e.g. from different clinical trials or for different levels of severeness.


meddra_all_indications.tsv.gz
-----------------------------

1: STITCH compound id (flat, see above)
2: UMLS concept id as it was found on the label
3: method of detection: NLP_indication / NLP_precondition / text_mention
4: concept name
5: MedDRA concept type (LLT = lowest level term, PT = preferred term; in a few cases the term is neither LLT nor PT)
6: UMLS concept id for MedDRA term
7: MedDRA concept name

All side effects found on the labels are given as LLT. Additionally, the PT is shown. There is at least one
PT for every LLT, but sometimes the PT is the same as the LLT.


meddra_all_label_indications.tsv.gz and meddra_all_label_se.tsv.gz
---------------------------------------------------------------------------------------

These files contain the same data as the indications/se files, but with an additional first column showing the source label.


meddra.tsv
-----------------------------

1: UMLS concept id
2: MedDRA id
3: kind of term (from MedDRA e.g. PT = preferred term)
4: name of side effect

## 1. map all drug ATCs to drugbank ids

In [2]:
drugs_df = pd.read_csv("../data/side_effects/drug_atc.tsv", sep="\t", header=None)
drugs_df.columns = ['stitch_id', 'atc']
print(len(drugs_df))
drugbank_ids = pd.read_csv('temp/drug_nodes_identifiers.tsv', sep="\t")
drugs_df = pd.merge(left=drugs_df, right=drugbank_ids, on='atc', how='inner')
drugs_df

1560


Unnamed: 0,stitch_id,atc,name,drugbank-id,type,cas,rxcui,unii,uniprot_kb
0,CID100000085,A16AA01,Levocarnitine,DB00583,small molecule,541-15-1,2106.0,0G389FZZ9M,P01857
1,CID100000119,L03AA03,Molgramostim,DB12525,biotech,99283-10-0,70161.0,B321AL142J,
2,CID100000137,L01XD04,Aminolevulinic acid,DB00855,small molecule,106-60-5,155002.0,88755TAZ87,P01857
3,CID100000143,V03AF06,Leucovorin,DB00650,small molecule,58-05-9,6313.0,Q573I9DVLP,P01857
4,CID100000158,G02AD02,Dinoprostone,DB00917,small molecule,363-24-6,3478.0,K7Q1JQR04M,P01857
...,...,...,...,...,...,...,...,...,...
898,CID151601240,J01AA01,Demeclocycline,DB00618,small molecule,127-33-3,3154.0,5R5W9ICI6O,P01857
899,CID154687131,J01AA04,Lymecycline,DB00256,small molecule,992-21-2,6513.0,7D6EM3S13P,P01857
900,CID156603655,S01LA03,Pegaptanib,DB04895,biotech,222716-86-1,498509.0,3HP012Q0FH,
901,CID170695640,C10AC01,Cholestyramine,DB01432,small molecule,11041-12-6,2447.0,4B33BGI082,


## 2. Get a list of all indications

In [3]:
allind_df =  pd.read_csv("../data/side_effects/meddra.tsv", sep="\t", header=None)
allind_df.columns = ['umls_cid','kind','meddra_id','name']
allind_df = allind_df.drop(['kind'], axis=1)
allind_df = allind_df.drop_duplicates()
allind_df

Unnamed: 0,umls_cid,meddra_id,name
0,C0000727,10000647,Acute abdomen
2,C0000727,10042784,Syndrome abdominal acute
3,C0000727,10000096,Abdominal syndrome acute
4,C0000729,10000057,Abdominal cramps
5,C0000729,10000056,Abdominal cramp
...,...,...,...
95906,C3666014,10073984,Mixed ductal lobular breast carcinoma in situ
95907,C3666015,10074155,Device material degradation
95908,C3666016,10074210,Coarse breath sounds
95909,C3666017,10074226,Cholangiopathy


In [4]:
# DUMP TO DISK
allind_df.to_csv('./temp/side_effects.csv', sep='\t', index=False)

## 3. Find side effects of available drugs - edge table

In [5]:
allse_df = pd.read_csv("../data/side_effects/meddra_all_se.tsv", sep="\t", header=None)
allse_df.columns = ['stitch_id', 'stitch_id_stereo','umls_cid','meddra_concept_type','umls_cid_meddra','side_effect_name']
allse_df = allse_df.drop(['meddra_concept_type'], axis=1)
allse_df = allse_df.drop_duplicates()
allse_df

Unnamed: 0,stitch_id,stitch_id_stereo,umls_cid,umls_cid_meddra,side_effect_name
0,CID100000085,CID000010917,C0000729,C0000729,Abdominal cramps
1,CID100000085,CID000010917,C0000729,C0000737,Abdominal pain
2,CID100000085,CID000010917,C0000737,C0000737,Abdominal pain
3,CID100000085,CID000010917,C0000737,C0687713,Gastrointestinal pain
5,CID100000085,CID000010917,C0002418,C0002418,Amblyopia
...,...,...,...,...,...
309844,CID171306834,CID071306834,C3203358,C1145670,Respiratory failure
309845,CID171306834,CID071306834,C3665386,C3665386,Abnormal vision
309846,CID171306834,CID071306834,C3665386,C3665347,Visual impairment
309847,CID171306834,CID071306834,C3665596,C3665596,Warts


In [6]:
sefreq_df = pd.read_csv("../data/side_effects/meddra_freq.tsv", sep="\t", header=None)
sefreq_df.columns = ['stitch_id', 'stitch_id_stereo','umls_cid','placebo','freq', 'freq_lb', 'freq_ub', 'meddra_concept_type','umls_cid_meddra','side_effect_name']
sefreq_df = sefreq_df.drop(['meddra_concept_type'], axis=1)
sefreq_df = sefreq_df.drop_duplicates()
sefreq_df['placebo'] = [False if str(i)=='nan' else True for i in sefreq_df['placebo'].to_list()]
sefreq_df = sefreq_df.drop_duplicates()
sefreq_df

Unnamed: 0,stitch_id,stitch_id_stereo,umls_cid,placebo,freq,freq_lb,freq_ub,umls_cid_meddra,side_effect_name
0,CID100000085,CID000010917,C0000737,False,21%,0.21,0.21,C0000737,Abdominal pain
2,CID100000085,CID000010917,C0000737,False,21%,0.21,0.21,C0687713,Gastrointestinal pain
3,CID100000085,CID000010917,C0000737,False,5%,0.05,0.05,C0000737,Abdominal pain
5,CID100000085,CID000010917,C0000737,False,5%,0.05,0.05,C0687713,Gastrointestinal pain
6,CID100000085,CID000010917,C0000737,False,6%,0.06,0.06,C0000737,Abdominal pain
...,...,...,...,...,...,...,...,...,...
291622,CID171306834,CID071306834,C2830004,False,3%,0.03,0.03,C2830004,Somnolence
291624,CID171306834,CID071306834,C2830004,False,33%,0.33,0.33,C2830004,Somnolence
291626,CID171306834,CID071306834,C2830004,False,5%,0.00,0.05,C2830004,Somnolence
291628,CID171306834,CID071306834,C2830004,False,5%,0.05,0.05,C2830004,Somnolence


In [7]:
allse_df2 = pd.merge(left=allse_df, right=sefreq_df[['stitch_id', 'placebo', 'freq', 'freq_lb', 'freq_ub']], on='stitch_id', how='left')
allse_df2

Unnamed: 0,stitch_id,stitch_id_stereo,umls_cid,umls_cid_meddra,side_effect_name,placebo,freq,freq_lb,freq_ub
0,CID100000085,CID000010917,C0000729,C0000729,Abdominal cramps,False,21%,0.21,0.21
1,CID100000085,CID000010917,C0000729,C0000729,Abdominal cramps,False,21%,0.21,0.21
2,CID100000085,CID000010917,C0000729,C0000729,Abdominal cramps,False,5%,0.05,0.05
3,CID100000085,CID000010917,C0000729,C0000729,Abdominal cramps,False,5%,0.05,0.05
4,CID100000085,CID000010917,C0000729,C0000729,Abdominal cramps,False,6%,0.06,0.06
...,...,...,...,...,...,...,...,...,...
54586582,CID171306834,CID071306834,C3665596,C0347390,Skin papilloma,False,3%,0.03,0.03
54586583,CID171306834,CID071306834,C3665596,C0347390,Skin papilloma,False,33%,0.33,0.33
54586584,CID171306834,CID071306834,C3665596,C0347390,Skin papilloma,False,5%,0.00,0.05
54586585,CID171306834,CID071306834,C3665596,C0347390,Skin papilloma,False,5%,0.05,0.05


In [16]:
allsegrouped_df2 = allse_df2[['stitch_id', 'umls_cid', 'placebo','freq_lb', 'freq_ub']]

In [20]:
allsegrouped_df_max = allsegrouped_df2.groupby(['stitch_id', 'umls_cid']).max(numeric_only=True)
allsegrouped_df_max = pd.DataFrame(allsegrouped_df_max.to_records())
allsegrouped_df_min = allsegrouped_df2.groupby(['stitch_id', 'umls_cid']).min(numeric_only=True)
allsegrouped_df_min = pd.DataFrame(allsegrouped_df_min.to_records())


In [21]:
allsegrouped_df = pd.concat([allsegrouped_df_max[['stitch_id', 'umls_cid', 'freq_ub']], allsegrouped_df_min[['freq_lb']]], join = 'outer', axis = 1)
allsegrouped_df

Unnamed: 0,stitch_id,umls_cid,freq_ub,freq_lb
0,CID100000085,C0000729,0.093518,0.093518
1,CID100000085,C0000737,0.093518,0.093518
2,CID100000085,C0002418,0.093518,0.093518
3,CID100000085,C0002871,0.093518,0.093518
4,CID100000085,C0003123,0.093518,0.093518
...,...,...,...,...
139751,CID171306834,C2830004,0.166813,0.165618
139752,CID171306834,C2979982,0.166813,0.165618
139753,CID171306834,C3203358,0.166813,0.165618
139754,CID171306834,C3665386,0.166813,0.165618


In [25]:
allse_df3 = pd.merge(allse_df, allsegrouped_df, on=['stitch_id', 'umls_cid'], how='left')
allse_df3

Unnamed: 0,stitch_id,stitch_id_stereo,umls_cid,umls_cid_meddra,side_effect_name,freq_ub,freq_lb
0,CID100000085,CID000010917,C0000729,C0000729,Abdominal cramps,0.093518,0.093518
1,CID100000085,CID000010917,C0000729,C0000737,Abdominal pain,0.093518,0.093518
2,CID100000085,CID000010917,C0000737,C0000737,Abdominal pain,0.093518,0.093518
3,CID100000085,CID000010917,C0000737,C0687713,Gastrointestinal pain,0.093518,0.093518
4,CID100000085,CID000010917,C0002418,C0002418,Amblyopia,0.093518,0.093518
...,...,...,...,...,...,...,...
181824,CID171306834,CID071306834,C3203358,C1145670,Respiratory failure,0.166813,0.165618
181825,CID171306834,CID071306834,C3665386,C3665386,Abnormal vision,0.166813,0.165618
181826,CID171306834,CID071306834,C3665386,C3665347,Visual impairment,0.166813,0.165618
181827,CID171306834,CID071306834,C3665596,C3665596,Warts,0.166813,0.165618


In [30]:
edges_df = pd.merge(allse_df3, drugs_df[['stitch_id','atc']], on='stitch_id', how='left')

In [32]:
edges_df = edges_df[['atc','stitch_id','umls_cid','freq_ub','freq_lb']]
edges_df

Unnamed: 0,atc,stitch_id,umls_cid,freq_ub,freq_lb
0,A16AA01,CID100000085,C0000729,0.093518,0.093518
1,A16AA01,CID100000085,C0000729,0.093518,0.093518
2,A16AA01,CID100000085,C0000737,0.093518,0.093518
3,A16AA01,CID100000085,C0000737,0.093518,0.093518
4,A16AA01,CID100000085,C0002418,0.093518,0.093518
...,...,...,...,...,...
189887,L03AB05,CID171306834,C3203358,0.166813,0.165618
189888,L03AB05,CID171306834,C3665386,0.166813,0.165618
189889,L03AB05,CID171306834,C3665386,0.166813,0.165618
189890,L03AB05,CID171306834,C3665596,0.166813,0.165618


In [33]:
# DUMP TO DISK
edges_df.to_csv("./temp/side_effect_edges.csv", sep="\t", index=False)