In [169]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import pubchempy as pcp
import requests

In [170]:
pharmgkb_df = pd.read_csv('rawData/PharmGKB/clinicalVariants/clinicalVariants.tsv', sep='\t')
# print(pharmgkb_df)
pharmgkb_df.head()

chemical_df = pharmgkb_df.dropna(subset=['chemicals'])
chemical_df = chemical_df[~chemical_df['chemicals'].str.contains(',')]
chemical_df = chemical_df[~chemical_df['chemicals'].str.contains('/')]

# filtering missense
missense_df = chemical_df[chemical_df['variant'].str.contains('rs')]

level_df = missense_df[~missense_df['level of evidence'].str.contains('4')]
level_df

Unnamed: 0,variant,gene,type,level of evidence,chemicals,phenotypes
3,rs17376848,DPYD,Toxicity,1A,capecitabine,Neoplasms
4,rs2297595,DPYD,Toxicity,1A,capecitabine,Neoplasms
5,rs1801265,DPYD,Toxicity,1A,capecitabine,Neoplasms
8,rs1801160,DPYD,Toxicity,1A,capecitabine,Neoplasms
9,rs1801159,DPYD,Toxicity,1A,capecitabine,Neoplasms
...,...,...,...,...,...,...
4764,rs57098334,SLC6A4,Toxicity,3,antidepressants,Depression
4766,rs1065852,CYP2D6,Efficacy,3,escitalopram,"Depressive Disorder, Major"
4767,rs924607,CEP72,Toxicity,3,vincristine,Peripheral Nervous System Diseases
4773,rs2231142,ABCG2,Toxicity,3,atorvastatin,


In [171]:
variant_df = pd.read_csv('rawData/PharmGKB/variants/variants.tsv', sep='\t')
columns = ['Variant Name', 'Gene Symbols', 'Synonyms']
variant_df = pd.DataFrame(data=variant_df, columns=columns)
variant_df = variant_df.dropna(subset=['Gene Symbols'])
variant_df = variant_df[variant_df['Synonyms'].str.contains('NP')].reset_index(drop=True)
variant_df

Unnamed: 0,Variant Name,Gene Symbols,Synonyms
0,rs10012,CYP1B1,"rs3172104, NG_008386.2:g.5855C>G, 58311708, NG..."
1,rs1002012563,ABCG2,"1002012563, NC_000004.12:g.88097565G>C, NC_000..."
2,rs10036156,GABRP,"XM_005265874.1:c.19T>C, XP_005265928.1:p.Leu7=..."
3,rs1013940,SLC5A7,"NP_068587.1:p.Ile89Val, NG_042267.1:g.10679A>G..."
4,rs10187694,"UGT1A10,UGT1A13P,UGT1A8","NC_000002.12:g.233636937=, rs10187694, rs17866..."
...,...,...,...
1932,rs28399440,CYP2A6,"NG_008377.1:g.6693=, NC_000019.10:g.40848755=,..."
1933,rs370100007,CYP2C9,"NP_000762.2:p.Gln214His, NC_000010.11:g.949479..."
1934,rs375805362,CYP2C9,"NG_008385.2:g.9076=, NG_008385.2:g.9076C>T, NP..."
1935,rs568811809,CYP2A6,"NP_000753.3:p.Lys196fs, NC_000019.9:g.41354190..."


In [172]:
protein_dict = {
    'Gly':'G', 'Ala':'A', 'Val':'V', 'Leu':'L', 'Ile':'I',
    'Pro':'P', 'Phe':'F', 'Tyr':'Y', 'Trp':'W', 'Ser':'S', 
    'Thr':'T', 'Cys':'C', 'Met':'M', 'Asn':'N', 'Gln':'Q',
    'Asp':'D', 'Glu':'E', 'Lys':'K', 'Arg':'R', 'His':'H'
}

# split NPnumber
variant_table = pd.DataFrame(columns=['gene', 'rsid', 'variant', 'NPid'])
for i in range(variant_df.shape[0]): # 
    gene = variant_df['Gene Symbols'][i].split(',')[0]
    rsid = variant_df['Variant Name'][i]
    synonyms = variant_df['Synonyms'][i]
    # split and filtering
    syn_list = synonyms.split(',')
    np_list = [n for n in syn_list if 'NP' in n]
    missense_list = [n for n in np_list if '=' not in n]
    #print(missense_list)
    try: # for = in NPid: not missense
        NPid = missense_list[0].strip()
    except IndexError:
        continue
    string = NPid.split('p.')[-1]
    try:
        variant = protein_dict[string[0:3]] + string[3:-3] + protein_dict[string[-3:]]
    except KeyError:
        continue
    variant_table = variant_table.append([{'gene':gene, 'rsid':rsid, 'variant':variant, 'NPid':NPid}], ignore_index=True)
variant_table


Unnamed: 0,gene,rsid,variant,NPid
0,CYP1B1,rs10012,R48G,NP_000095.2:p.Arg48Gly
1,ABCG2,rs1002012563,T512N,NP_004818.2:p.Thr512Asn
2,GABRP,rs10036156,L7V,NP_055026.1:p.Leu7Val
3,SLC5A7,rs1013940,I89V,NP_068587.1:p.Ile89Val
4,UGT1A10,rs10187694,E139K,NP_061948.1:p.Glu139Lys
...,...,...,...,...
1545,CYP2A6,rs199916117,K194E,NP_000753.3:p.Lys194Glu
1546,CYP3A4,rs201821708,Y319C,NP_059488.2:p.Tyr319Cys
1547,CYP2A6,rs28399440,F118L,NP_000753.3:p.Phe118Leu
1548,CYP2C9,rs370100007,Q214H,NP_000762.2:p.Gln214His


In [173]:
# filtering evidence variants
evidence_variant_list = level_df['variant'].unique()
table_variant_list = variant_table['rsid'].unique()
error_list = [n for n in evidence_variant_list if n not in table_variant_list]
print(len(error_list))

for item in error_list:
    level_df = level_df[~level_df['variant'].str.contains(item)]
level_df = level_df.reset_index(drop=True)
level_df

1683


Unnamed: 0,variant,gene,type,level of evidence,chemicals,phenotypes
0,rs2297595,DPYD,Toxicity,1A,capecitabine,Neoplasms
1,rs1801265,DPYD,Toxicity,1A,capecitabine,Neoplasms
2,rs1801160,DPYD,Toxicity,1A,capecitabine,Neoplasms
3,rs1801159,DPYD,Toxicity,1A,capecitabine,Neoplasms
4,rs1801158,DPYD,Toxicity,1A,fluorouracil,Neoplasms
...,...,...,...,...,...,...
1092,rs279858,GABRA2,Other,3,sevoflurane,
1093,rs1127354,ITPA,Efficacy,3,azathioprine,"liver transplantation,transplant rejection"
1094,rs1142345,TPMT,Toxicity,3,cisplatin,"Drug Toxicity,Neoplasms,Ototoxicity"
1095,rs1065852,CYP2D6,Efficacy,3,escitalopram,"Depressive Disorder, Major"


In [174]:
# fix variant_df
evidence_variant_list = level_df['variant'].unique()
table_variant_list = variant_table['rsid'].unique()
error_list = [n for n in table_variant_list if n not in evidence_variant_list]
for item in error_list:
    variant_table = variant_table[~variant_table['rsid'].str.contains(item)]
variant_table = variant_table.reset_index(drop=True)
variant_table

Unnamed: 0,gene,rsid,variant,NPid
0,CYP1B1,rs10012,R48G,NP_000095.2:p.Arg48Gly
1,GRK4,rs1024323,A142V,NP_001004057.1:p.Ala142Val
2,CYP2C9,rs1029359343,R307K,NP_000762.2:p.Arg307Lys
3,GLP1R,rs10305420,P7L,NP_002053.3:p.Pro7Leu
4,ZNF568,rs10405238,Y488D,NP_001191767.1:p.Tyr488Asp
...,...,...,...,...
514,ABCB1,rs9282564,N21H,NP_000918.2:p.Asn21His
515,RRP1B,rs9306160,L436P,NP_055871.1:p.Leu436Pro
516,CYP2C9,rs9332239,P489S,NP_000762.2:p.Pro489Ser
517,MYLIP,rs9370867,N342I,NP_037394.2:p.Asn342Ile


In [175]:
level_df.loc[level_df['variant']=='rs16947','gene'] = 'CYP2D6'

In [176]:
# fix evidence table
evidence_table = pd.DataFrame(columns=['gene', 'variant', 'chemicals', 'type','phenotypes'])
for i in range(level_df.shape[0]): # 
    try:
        gene = level_df['gene'][i].split(',')[0]
    except AttributeError:
        print(level_df['gene'][i])
        print(level_df['variant'][i])
    variant = level_df['variant'][i]
    chemicals = level_df['chemicals'][i]
    phenotypes = level_df['phenotypes'][i]
    type = level_df['type'][i]
    evidence_table = evidence_table.append([{'gene':gene, 'variant':variant, 'chemicals':chemicals, 'type':type, 'phenotypes': phenotypes}], ignore_index=True)
#evidence_table.loc[evidence_table['variant']=='rs16947','gene'] = 'CYP2D6'
evidence_table

Unnamed: 0,gene,variant,chemicals,type,phenotypes
0,DPYD,rs2297595,capecitabine,Toxicity,Neoplasms
1,DPYD,rs1801265,capecitabine,Toxicity,Neoplasms
2,DPYD,rs1801160,capecitabine,Toxicity,Neoplasms
3,DPYD,rs1801159,capecitabine,Toxicity,Neoplasms
4,DPYD,rs1801158,fluorouracil,Toxicity,Neoplasms
...,...,...,...,...,...
1092,GABRA2,rs279858,sevoflurane,Other,
1093,ITPA,rs1127354,azathioprine,Efficacy,"liver transplantation,transplant rejection"
1094,TPMT,rs1142345,cisplatin,Toxicity,"Drug Toxicity,Neoplasms,Ototoxicity"
1095,CYP2D6,rs1065852,escitalopram,Efficacy,"Depressive Disorder, Major"


In [177]:
# check uniprot mutation
import re
gene_list = evidence_table['gene'].unique()
gene_table = pd.DataFrame(columns=['gene', 'uniprotac', 'fasta'])
for i in tqdm(range(len(gene_list))):
    gene = gene_list[i]
    url = 'https://rest.uniprot.org/uniprotkb/stream?compressed=false&format=fasta&query=(reviewed:true)%20AND%20(organism_id:9606)%20AND%20' + gene_list[i]
    all_fastas = requests.get(url).text
    fasta_list = re.split(r'\n(?=>)', all_fastas)
    try:
        entry = [fasta for fasta in fasta_list if 'GN='+gene_list[i] in fasta][0]
    except IndexError:
        print(gene)
        continue
    except SSLError:
        print(gene)
        continue
    uniprotac = entry.split('|')[1]
    fasta = entry.split(' ')[-1].replace('\n','')[4:]
    gene_table = gene_table.append([{'gene':gene, 'uniprotac':uniprotac, 'fasta':fasta}], ignore_index=True)
print(gene_table)
gene_table.to_csv('middlefile/pharmgkb_gene_table.csv', index=None)


 14%|█▍        | 38/263 [01:04<06:18,  1.68s/it]

CD3EAP


 94%|█████████▍| 247/263 [06:48<00:31,  1.95s/it]

CYP2A7P1


100%|██████████| 263/263 [07:12<00:00,  1.64s/it]

        gene uniprotac                                              fasta
0       DPYD    Q12882  MAPVLSKDSADIESILALNPRTQTHATLCSTSAKKLDKKHWKRNPD...
1     CYP4F2    P78329  MSQLSLSWLGLWPVAASPWLLLLLVGASWLLAHVLAWTYAFYDNCR...
2       EGFR    P00533  MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...
3     CYP2B6    P20813  MELSVLLFLALLTGLLLLLVQRHPNTHDRLPPGPRPLPLLGNLLQM...
4     NUDT15    Q9NV35  MTASAQPRGRRPGVGVGVVVTSCKHPRCVLLGKRKGSVGAGSFQLP...
..       ...       ...                                                ...
256   SRD5A2    P31213  MQVQCQQSPVLAGSATLVALGALALYVAKPSGYGKHTESLKPAATR...
257     AHRR    A9YTQ3  MPRTMIPPGECTYAGRKRRRPLQKQRPAVGAEKSNPSKRHRDRLNA...
258    VEGFB    P49765  MSPLLRRLLLAALLQLAPAQAPVSQPDAPGHQRKVVSWIDVYTRAT...
259  FASTKD3    Q14CZ7  MALITLRKNLYRLSDFQMHRALAALKNKPLNHVHKVVKERLCPWLC...
260   GABRA2    P47869  MKTKLNIYNMQFLLFVFLVWDPARLVLANIQEDEAKNNITIFTRIL...

[261 rows x 3 columns]





In [178]:
evidence_table = evidence_table[~evidence_table['gene'].str.contains('CYP2A7P1')]
evidence_table = evidence_table[~evidence_table['gene'].str.contains('CD3EAP')]
evidence_table = evidence_table.reset_index(drop=True)
evidence_table

Unnamed: 0,gene,variant,chemicals,type,phenotypes
0,DPYD,rs2297595,capecitabine,Toxicity,Neoplasms
1,DPYD,rs1801265,capecitabine,Toxicity,Neoplasms
2,DPYD,rs1801160,capecitabine,Toxicity,Neoplasms
3,DPYD,rs1801159,capecitabine,Toxicity,Neoplasms
4,DPYD,rs1801158,fluorouracil,Toxicity,Neoplasms
...,...,...,...,...,...
1088,GABRA2,rs279858,sevoflurane,Other,
1089,ITPA,rs1127354,azathioprine,Efficacy,"liver transplantation,transplant rejection"
1090,TPMT,rs1142345,cisplatin,Toxicity,"Drug Toxicity,Neoplasms,Ototoxicity"
1091,CYP2D6,rs1065852,escitalopram,Efficacy,"Depressive Disorder, Major"


In [179]:
# manually fix
variant_table = variant_table.append([{'gene': 'UGT1A10', 'rsid': 'rs6759892', 'variant': 'S7A', 'NPid':'NP_001063.2'}], ignore_index=True)
variant_table = variant_table.append([{'gene': 'UGT1A4', 'rsid': 'rs6755571', 'variant': 'P24S', 'NPid':'NP_009051.1'}], ignore_index=True)
variant_table = variant_table.append([{'gene': 'UGT2B7', 'rsid': 'rs61361928', 'variant': 'L46P', 'NPid':'NP_001065.2'}], ignore_index=True)
variant_table.drop_duplicates(inplace=True)
variant_table = variant_table.reset_index(drop=True)
variant_table

Unnamed: 0,gene,rsid,variant,NPid
0,CYP1B1,rs10012,R48G,NP_000095.2:p.Arg48Gly
1,GRK4,rs1024323,A142V,NP_001004057.1:p.Ala142Val
2,CYP2C9,rs1029359343,R307K,NP_000762.2:p.Arg307Lys
3,GLP1R,rs10305420,P7L,NP_002053.3:p.Pro7Leu
4,ZNF568,rs10405238,Y488D,NP_001191767.1:p.Tyr488Asp
...,...,...,...,...
517,MYLIP,rs9370867,N342I,NP_037394.2:p.Asn342Ile
518,CD68,rs9901675,A350T,NP_001242.2:p.Ala350Thr
519,UGT1A10,rs6759892,S7A,NP_001063.2
520,UGT1A4,rs6755571,P24S,NP_009051.1


In [180]:
evidence_table.to_csv('middlefile/pharmgkb_evidence_table.csv', index=None)
variant_table.to_csv('middlefile/pharmgkb_variant_table.csv', index=None)

In [181]:
# change variant info to evidence table
evidence_table_new = pd.DataFrame(columns=['gene', 'variant', 'rsid', 'drugs', 'type', 'phenotypes'])
for i in range(len(evidence_table)):
    gene = evidence_table['gene'][i]
    #print(gene)
    rsvariant = evidence_table['variant'][i]
    variant = variant_table[variant_table['rsid'] == rsvariant]['variant'].values[0]
    drugs = evidence_table['chemicals'][i]
    type = evidence_table['type'][i]
    phenotypes = evidence_table['phenotypes'][i]
    evidence_table_new = evidence_table_new.append([{'gene':gene, 'variant':variant, 'rsid':rsvariant, 'drugs':drugs, 'type':type, 'phenotypes':phenotypes}], ignore_index=True)
print(evidence_table_new)
evidence_table_new.to_csv('middlefile/pharmgkb_evidence_table.csv', index=None)

        gene variant       rsid         drugs      type  \
0       DPYD   M166V  rs2297595  capecitabine  Toxicity   
1       DPYD    C29R  rs1801265  capecitabine  Toxicity   
2       DPYD   V732I  rs1801160  capecitabine  Toxicity   
3       DPYD   I543V  rs1801159  capecitabine  Toxicity   
4       DPYD   S534N  rs1801158  fluorouracil  Toxicity   
...      ...     ...        ...           ...       ...   
1088  GABRA2   K132N   rs279858   sevoflurane     Other   
1089    ITPA    P32T  rs1127354  azathioprine  Efficacy   
1090    TPMT   Y240C  rs1142345     cisplatin  Toxicity   
1091  CYP2D6    P34A  rs1065852  escitalopram  Efficacy   
1092   ABCG2   Q141K  rs2231142  atorvastatin  Toxicity   

                                      phenotypes  
0                                      Neoplasms  
1                                      Neoplasms  
2                                      Neoplasms  
3                                      Neoplasms  
4                                   

In [182]:
# fix variant table based on sequence
variant_table.loc[variant_table['rsid']=='rs1042713','variant'] = 'G16R'
variant_table.loc[variant_table['rsid']=='rs71647871','variant'] = 'G143E'
variant_table.loc[variant_table['rsid']=='rs671','variant'] = 'E504K'
variant_table.loc[variant_table['rsid']=='rs396991','variant'] = 'F281I'
variant_table.loc[variant_table['rsid']=='rs20455','variant'] = 'W719R'

variant_table.to_csv('middlefile/pharmgkb_variant_table.csv', index=None)

In [183]:
protein_dict = {'C':0, 'D':1, 'S':2, 'Q':3, 'K':4,
        'I':5, 'P':6, 'T':7, 'F':8, 'N':9,
        'G':10, 'H':11, 'L':12, 'R':13, 'W':14,
        'A':15, 'V':16, 'E':17, 'Y':18, 'M':19}

In [184]:
# check mutation sequence
gene_table_manu = pd.read_csv('middlefile/pharmgkb_gene_table.csv')
for i in range(len(evidence_table_new)):
    gene = evidence_table_new['gene'][i]
    # print(gene)
    variant = evidence_table_new['variant'][i]
    fasta = gene_table_manu[gene_table_manu['gene'] == gene]['fasta'].values[0]
    #print(fasta)
    pos_before = variant[0]
    pos = int(variant[1:-1])
    #print(pos)
    pos_after = variant[-1]
    
    # check pos_after
    if(pos_after not in protein_dict.keys()):
        print('pos_after', gene, variant)
    # check pos_before
    try:
        if(fasta[pos-1] != pos_before):
            print('pos_before', gene, variant)
    except IndexError:
        print('IndexError', gene, variant)

pos_before ADRB2 R16G
pos_before VKORC1 D36Y
pos_before CES1 G144E
pos_before CES1 G144E
pos_before ALDH2 E457K
pos_before FCGR3A F175V
pos_before KIF6 W170R
pos_before UGT1A10 S7A
pos_before ABCB1 I1145M
pos_before ABCB1 I1145M
pos_before ABCB1 I1145M
pos_before ABCB1 S893A
pos_before ABCB1 S893A
pos_before VKORC1 V29L
pos_before VKORC1 V45A
pos_before VKORC1 R58G
pos_before VKORC1 L128R
pos_before ABCB1 S400N
pos_before ABCB1 S893A
pos_before KCNH2 R444G
pos_before CYP2C8 K297R
pos_before CLCN6 A222V
pos_before ABCB1 I1145M
pos_before DDRGK1 P32T
pos_before KCNIP1 E65K
pos_before UGT1A10 T181A
pos_before CYP2C8 K297R
pos_before ABCB1 I1145M
pos_before KCNIP1 V110L
pos_before ABCB1 I1145M
pos_before ABCB1 I1145M
pos_before ABCB1 N21H
pos_before ABCB1 A999S
pos_before ABCB1 S893A
pos_before ERCC1 K261R
pos_before TLR4 D99G
pos_before ABCB1 I1145M
pos_before ABCB1 I1145M
pos_before AGTR1 A192T
pos_before ABCB1 S893A
pos_before POR A503V
pos_before ABCB1 I1145M
pos_before NOD2 R675W
pos_

In [185]:
# remove error evidence
gene_table_manu = pd.read_csv('middlefile/pharmgkb_gene_table.csv')
evidence_table_new = pd.read_csv('middlefile/pharmgkb_evidence_table.csv')
fix_evidence_df = evidence_table_new.copy()
for i in range(len(evidence_table_new)):
    gene = evidence_table_new['gene'][i]
    #print(gene)
    variant = evidence_table_new['variant'][i]
    fasta = gene_table_manu[gene_table_manu['gene'] == gene]['fasta'].values[0]
    #print(fasta)
    pos_before = variant[0]
    pos = int(variant[1:-1])
    #print(pos)
    pos_after = variant[-1]
    
    # check pos_after
    if(pos_after not in protein_dict.keys()):
        #print('pos_after', gene, variant)
        fix_evidence_df = fix_evidence_df[~((fix_evidence_df['gene'] == gene) & (fix_evidence_df['variant'] == variant))]
    # check pos_before
    try:
        if(fasta[pos-1] != pos_before):
            #print('pos_before', gene, variant)
            fix_evidence_df = fix_evidence_df[~((fix_evidence_df['gene'] == gene) & (fix_evidence_df['variant'] == variant))]
    except IndexError:
        #print('IndexError', gene, variant)
        fix_evidence_df = fix_evidence_df[~((fix_evidence_df['gene'] == gene) & (fix_evidence_df['variant'] == variant))]
fix_evidence_df = fix_evidence_df.reset_index(drop=True)


In [186]:
fix_evidence_df.to_csv('middlefile/pharmgkb_evidence_table.csv', index=None)
drug_unique_list = fix_evidence_df['drugs'].unique()
print(len(drug_unique_list))

230


In [187]:
error_list = []
for i in tqdm(range(len(drug_unique_list))):
    if(len(pcp.get_compounds(drug_unique_list[i],'name')) == 0):
        error_list.append(drug_unique_list[i])
error_list

100%|██████████| 230/230 [05:28<00:00,  1.43s/it]


['hormonal contraceptives for systemic use',
 'hmg coa reductase inhibitors',
 'Platinum compounds',
 'diuretics',
 'trastuzumab',
 'Measles vaccines',
 'antipsychotics',
 'pitrakinra',
 'rituximab',
 'egfr inhibitors',
 'Tumor necrosis factor alpha (TNF-alpha) inhibitors',
 'eculizumab',
 'bevacizumab',
 'catecholamines',
 'corticosteroids',
 'antidepressants',
 'Photodynamic therapy',
 'Opium alkaloids and derivatives',
 'Antibiotics',
 'cetuximab',
 'gemtuzumab ozogamicin',
 'Beta Blocking Agents',
 'Antithyroid Preparations',
 'ustekinumab',
 'Drugs Used In Diabetes',
 'etanercept',
 'glucocorticoids',
 'botulinum toxin type a',
 'anthracyclines and related substances']

In [188]:
# fix drugname for pubchem
fixdrug_df = fix_evidence_df[~fix_evidence_df['drugs'].str.contains('highly active antiretroviral therapy (haart)')]
for item in error_list:
    fixdrug_df = fixdrug_df[~(fixdrug_df['drugs'] == item)]
fixdrug_df = fixdrug_df.reset_index(drop=True)
fixdrug_df.to_csv('middlefile/pharmgkb_evidence_table.csv', index=None)
fixdrug_df

Unnamed: 0,gene,variant,rsid,drugs,type,phenotypes
0,DPYD,M166V,rs2297595,capecitabine,Toxicity,Neoplasms
1,DPYD,C29R,rs1801265,capecitabine,Toxicity,Neoplasms
2,DPYD,V732I,rs1801160,capecitabine,Toxicity,Neoplasms
3,DPYD,I543V,rs1801159,capecitabine,Toxicity,Neoplasms
4,DPYD,S534N,rs1801158,fluorouracil,Toxicity,Neoplasms
...,...,...,...,...,...,...
751,GABRA2,K132N,rs279858,sevoflurane,Other,
752,ITPA,P32T,rs1127354,azathioprine,Efficacy,"liver transplantation,transplant rejection"
753,TPMT,Y240C,rs1142345,cisplatin,Toxicity,"Drug Toxicity,Neoplasms,Ototoxicity"
754,CYP2D6,P34A,rs1065852,escitalopram,Efficacy,"Depressive Disorder, Major"


In [189]:
drug_list = fixdrug_df['drugs'].unique()
new_drug_table = pd.DataFrame(columns=['drugname', 'smile', 'molecular_weight', 'molecular_formula', 'atom', 'fingerprint', 'cactvs_fingerprint'])
for i in tqdm(range(len(drug_list))):
    drugname = drug_list[i]
    compound = pcp.get_compounds(drugname,'name')[0]
    try:
        smile = compound.isomeric_smiles
    except AttributeError:
        smile = np.nan
    try:
        molecular_weight = compound.molecular_weight
    except AttributeError:
        molecular_weight = np.nan   
    try:
        molecular_formula = compound.molecular_formula
    except AttributeError:
        molecular_formula = np.nan
    try: 
        atom = compound.atoms
    except AttributeError:
        atom = np.nan
    try:
        fingerprint = compound.fingerprint
    except AttributeError:
        fingerprint = np.nan
    try:
        cactvs_fingerprint = compound.cactvs_fingerprint
    except AttributeError:
        cactvs_fingerprint = np.nan
    new_drug_table = new_drug_table.append([{'drugname':drugname, 'smile':smile, 'molecular_weight':molecular_weight, 'molecular_formula':molecular_formula, 
                                    'atom':atom, 'fingerprint':fingerprint, 'cactvs_fingerprint':cactvs_fingerprint}], ignore_index=True)
print(new_drug_table)
new_drug_table.to_csv('middlefile/pharmgkb_drug_table_fpfixed.csv', index=None)

100%|██████████| 201/201 [04:54<00:00,  1.47s/it]

         drugname                                              smile  \
0    capecitabine  CCCCCOC(=O)NC1=NC(=O)N(C=C1F)[C@H]2[C@@H]([C@@...   
1    fluorouracil                               C1=C(C(=O)NC(=O)N1)F   
2        warfarin       CC(=O)CC(C1=CC=CC=C1)C2=C(C3=CC=CC=C3OC2=O)O   
3       gefitinib  COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OC...   
4       efavirenz   C1CC1C#C[C@]2(C3=C(C=CC(=C3)Cl)NC(=O)O2)C(F)(F)F   
..            ...                                                ...   
196    naltrexone  C1CC1CN2CC[C@]34[C@@H]5C(=O)CC[C@]3([C@H]2CC6=...   
197   abiraterone  C[C@]12CC[C@@H](CC1=CC[C@@H]3[C@@H]2CC[C@]4([C...   
198    folic acid  C1=CC(=CC=C1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCC2=...   
199  Glucarpidase                        [Zn+2].[Zn+2].[Zn+2].[Zn+2]   
200  escitalopram  CN(C)CCC[C@@]1(C2=C(CO1)C=C(C=C2)C#N)C3=CC=C(C...   

    molecular_weight molecular_formula  \
0             359.35       C15H22FN3O6   
1             130.08         C4H3FN2O2   
2        




In [196]:
evidence_table = pd.read_csv('middlefile/pharmgkb_evidence_table.csv')
variant_table = pd.read_csv('middlefile/pharmgkb_variant_table.csv')
gene_table = pd.read_csv('middlefile/pharmgkb_gene_table.csv')
drug_table = pd.read_csv('middlefile/pharmgkb_drug_table_fpfixed.csv')

pharmgkb_all_table = pd.DataFrame(columns=['data source', 'drug name', 'smile', 'Pubchem Fingerprint', 'disease name', 'gene symbol', 'mutation name', 'label'])
for i in tqdm(range(len(evidence_table))):
    data_source = 'pharmgkb'
    gene_symbol = evidence_table['gene'][i]
    mutation_name = evidence_table['variant'][i]
    drug_name = evidence_table['drugs'][i]
    label = evidence_table['type'][i]
    disease_name = evidence_table['phenotypes'][i]
    try:
        smile = drug_table[drug_table['drugname'] == drug_name]['smile'].values[0]
    except IndexError:
        print(drug_name)
        continue
    try:
        fingerprint = drug_table[drug_table['drugname'] == drug_name]['fingerprint'].values[0]
    except IndexError:
        print(drug_name)
        continue
    pharmgkb_all_table = pharmgkb_all_table.append([{'data source':data_source, 'drug name':drug_name, 'smile':smile, 'Pubchem Fingerprint':fingerprint, 'disease name':disease_name, 
                                    'gene symbol':gene_symbol, 'mutation name':mutation_name, 'label':label}], ignore_index=True)
    

pharmgkb_all_table.to_csv('middlefile/pharmgkb_all_table.csv', index=None)


100%|██████████| 756/756 [00:01<00:00, 424.26it/s]
