In [1]:
import pandas as pd
import numpy as np


In [43]:
#read in data
dat = pd.read_table('23andme_v5_hg19_ref.txt', names=['CHR','POS','dbSNP_ID','ALLELE_23andme'])
ann = pd.read_csv('var_drug_ann.tsv', sep = '\t')

In [45]:
#1. Map/merge the 23andme file and the variant-drug annotation file based on dbSNP_ID (also known as rsID).
ann = ann.rename(columns = {'Variant/Haplotypes':'dbSNP_ID',
                           'Gene':'GENE_SYMBOL',
                           'Drug(s)':'DRUG_NAME',
                           'Phenotype Category':'PHENOTYPE_CATEGORY',
                           'Significance':'SIGNIFICANCE',
                           'Notes':'NOTES',
                           'Sentence':'SENTENCE',
                           'Alleles':'ALLELE_PharmGKB'})

data = dat[['dbSNP_ID','ALLELE_23andme']].merge(ann[['dbSNP_ID','GENE_SYMBOL', 'DRUG_NAME',
       'PMID', 'PHENOTYPE_CATEGORY', 'SIGNIFICANCE', 'NOTES', 'SENTENCE',
       'ALLELE_PharmGKB']], how = 'inner', on = 'dbSNP_ID')

#data

In [46]:
#2. Filter the output so that it only contains significant associations (SIGNIFICANCE is yes) for variants that affect the drug efficacy (`PHENOTYPE_CATEGORY` is `efficacy`).
data = data[(data.SIGNIFICANCE == 'yes') & (data.PHENOTYPE_CATEGORY == 'Efficacy')]
data = data.reset_index(drop=True)
data

Unnamed: 0,dbSNP_ID,ALLELE_23andme,GENE_SYMBOL,DRUG_NAME,PMID,PHENOTYPE_CATEGORY,SIGNIFICANCE,NOTES,SENTENCE,ALLELE_PharmGKB
0,rs1801131,T,MTHFR,oxaliplatin,20385995,Efficacy,yes,Statistics given as trend for increased overal...,Genotypes GG + GT are associated with increase...,GG + GT
1,rs1801131,T,MTHFR,"Vitamin B-complex, Incl. Combinations",27035272,Efficacy,yes,Please note that allele has been complemented ...,Allele G is associated with increased response...,G
2,rs1801131,T,MTHFR,methotrexate,25618758,Efficacy,yes,Alleles given as C and A. Efficacy of treatmen...,Genotype GT is associated with decreased respo...,GT
3,rs1801131,T,MTHFR,methotrexate,23198157,Efficacy,yes,Patients with the GG genotype had better overa...,Genotype GG is associated with increased respo...,GG
4,rs1801131,T,MTHFR,methotrexate,24386571,Efficacy,yes,Patients with the TT genotype had significantl...,Genotype TT is associated with decreased respo...,TT
...,...,...,...,...,...,...,...,...,...,...
1298,rs502434,T,GRIA3,"Selective serotonin reuptake inhibitors, venla...",23394390,Efficacy,yes,Regression model looking at interaction betwee...,Genotype CC is associated with decreased respo...,CC
1299,rs3810651,T,GABRQ,venlafaxine,23394390,Efficacy,yes,Those with the AA/AT genotype are more likely ...,Genotypes AA + AT are associated with increase...,AA + AT
1300,rs17435,T,MECP2,"cisplatin, fluorouracil, mitoxantrone",21635146,Efficacy,yes,The study described this variant within the ME...,Allele A is associated with response to cispla...,A
1301,rs1734787,A,MECP2,"cisplatin, fluorouracil, mitoxantrone",21635146,Efficacy,yes,The study described this variant within the ME...,Allele A is associated with response to cispla...,A


In [47]:
#3. Save the output of the filtering step in a tab-separated file (23andme_PharmGKB_map.tsv) with the following columns:
#dbSNP_ID, GENE_SYMBOL, DRUG_NAME, NOTES, SENTENCE, ALLELE_PharmGKB, ALLELE_23andme

data.to_csv('23andme_PharmGKB_map.tsv',sep='\t', columns = ['dbSNP_ID', 'GENE_SYMBOL', 'DRUG_NAME', 'NOTES', 'SENTENCE', 'ALLELE_PharmGKB', 'ALLELE_23andme'])

In [48]:
#4. Create a tab separated file (23andme_PharmGKB_summary.tsv) with summarized data with the following columns:
#GENE_SYMBOL, DRUG_NAME, dbSNP_IDs (list of IDs separated by ";")

#There are rows with multiple drug names and multiple gene symbols so we split those and get list of unique ids with respective gene&drug
data1= pd.DataFrame({'dbSNP_ID': data.dbSNP_ID,
                     'GENE_SYMBOL':data.GENE_SYMBOL,
                     'idx': data.index})

data_drug_split = pd.DataFrame({'DRUG_NAME': [],
                                'idx': []})
data_drug_split.DRUG_NAME = data.DRUG_NAME.str.split(',').explode()
data_drug_split.idx = data_drug_split.index
data_drug_split = data_drug_split.merge(data1, how='left', on='idx')
data_drug_split.idx = data_drug_split.index

data_gene_split = pd.DataFrame({'GENE_SYMBOL': [],
                                'idx': []})
data_gene_split.GENE_SYMBOL = data_drug_split.GENE_SYMBOL.str.split(',').explode()
data_gene_split.idx = data_gene_split.index

data_drug_split = data_drug_split.drop(columns = ['GENE_SYMBOL'])
data_gene_split = data_gene_split.merge(data_drug_split, how='left', on='idx')
data_gene_split = data_gene_split.drop(columns = ['idx'])

data_gene_split

Unnamed: 0,GENE_SYMBOL,DRUG_NAME,dbSNP_ID
0,MTHFR,oxaliplatin,rs1801131
1,MTHFR,Vitamin B-complex,rs1801131
2,MTHFR,Incl. Combinations,rs1801131
3,MTHFR,methotrexate,rs1801131
4,MTHFR,methotrexate,rs1801131
...,...,...,...
2107,MECP2,fluorouracil,rs1734787
2108,MECP2,mitoxantrone,rs1734787
2109,MECP2,cisplatin,rs1734791
2110,MECP2,fluorouracil,rs1734791


In [49]:
data_gene_split.groupby(['GENE_SYMBOL','DRUG_NAME']).value_counts()

GENE_SYMBOL  DRUG_NAME                dbSNP_ID  
 BDNF-AS     methadone                rs11030118    1
                                      rs1967554     1
 CHRNA5      nicotine                 rs16969968    1
 CLCN6       methotrexate             rs1801131     1
 CYP3A4      diazepam                 rs35599367    1
                                                   ..
ZNF432        fluticasone/salmeterol  rs3752120     1
             budesonide               rs3752120     1
ZNF697        interferon beta-1b      rs10494227    1
             interferon beta-1a       rs10494227    1
ZNF804A      antipsychotics           rs1344706     1
Length: 1458, dtype: int64

In [50]:
data_summ = data_gene_split.groupby(['GENE_SYMBOL','DRUG_NAME'])['dbSNP_ID'].agg(lambda x: ';'.join(x)).reset_index()
data_summ = data_summ.rename(columns = {'dbSNP_ID':'dbSNP_IDs'})
data_summ

Unnamed: 0,GENE_SYMBOL,DRUG_NAME,dbSNP_IDs
0,BDNF-AS,methadone,rs1967554;rs11030118
1,CHRNA5,nicotine,rs16969968
2,CLCN6,methotrexate,rs1801131
3,CYP3A4,diazepam,rs35599367
4,DRD2,naltrexone,rs1800497
...,...,...,...
1123,ZNF432,fluticasone/salmeterol,rs3752120
1124,ZNF432,budesonide,rs3752120
1125,ZNF697,interferon beta-1b,rs10494227
1126,ZNF697,interferon beta-1a,rs10494227


In [51]:
data_summ.to_csv('23andme_PharmGKB_summary.tsv',sep='\t')