In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [3]:
def read_bed_file(bed_file):
    """Collect every genomic position covered by the intervals of a BED file.

    BED intervals are 0-based and half-open: ``chromStart`` is the 0-based
    index of the first covered base and ``chromEnd`` is exclusive. VCF ``POS``
    values are 1-based, so an interval ``(start, end)`` covers the 1-based
    positions ``start + 1`` through ``end`` inclusive. Those 1-based positions
    are what this function returns, so the result can be compared directly
    with VCF ``POS`` values.

    Lines starting with ``#``, lines with fewer than three tab-separated
    fields, and lines whose start or end is not an integer are skipped.

    Parameters
    ----------
    bed_file : str or path-like
        Path to a tab-separated BED file.

    Returns
    -------
    set of tuple(str, int)
        ``(chrom, pos)`` pairs with ``pos`` 1-based. ``chrom`` is taken
        verbatim from the first BED column.
    """
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            chrom = fields[0]
            try:
                start = int(fields[1])
                end = int(fields[2])
            except ValueError:
                continue  # Skip this line if start or end position is not an integer
            # 0-based half-open [start, end) == 1-based closed [start + 1, end].
            bed_positions.update((chrom, pos) for pos in range(start + 1, end + 1))
    return bed_positions

def normalize_chrom_name(chrom):
    """Return the part of ``chrom`` that precedes its first underscore.

    A name without an underscore is returned unchanged.
    """
    prefix, _, _ = chrom.partition('_')
    return prefix

def filter_vcf_file(vcf_file, bed_positions):
    """Return the lines of a VCF file whose position is in ``bed_positions``.

    Lines starting with ``#`` are always kept. A data line is kept when the
    pair (normalized first column, integer second column) is a member of
    ``bed_positions``. Data lines with fewer than two tab-separated fields or
    a non-integer second field are dropped.

    Parameters
    ----------
    vcf_file : str or path-like
        Path to the VCF file to read.
    bed_positions : container of tuple(str, int)
        ``(chrom, pos)`` pairs to keep.

    Returns
    -------
    list of str
        The kept lines, unmodified and in file order.
    """
    kept_lines = []
    with open(vcf_file, 'r') as handle:
        for raw_line in handle:
            # Header / meta-information lines pass through untouched.
            if raw_line.startswith('#'):
                kept_lines.append(raw_line)
                continue

            columns = raw_line.strip().split('\t')
            if len(columns) < 2:
                continue

            contig = normalize_chrom_name(columns[0])
            try:
                position = int(columns[1])
            except ValueError:
                # 'POS' is not an integer: ignore the record.
                continue

            if (contig, position) in bed_positions:
                kept_lines.append(raw_line)
    return kept_lines

def write_filtered_vcf(filtered_vcf_records, output_file):
    """Write the given records to ``output_file``, replacing any existing file.

    Each record is written exactly as given; no separator or newline is added
    between records.
    """
    with open(output_file, 'w') as out:
        out.writelines(filtered_vcf_records)

def main(
    bed_file=r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/srinivas_sir_covered.bed',
    vcf_file=r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/KHMBPRGPONC13_final.vcf',
    output_file=r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHMBPRGPONC13_final.vcf',
):
    """Restrict a VCF file to the positions listed in a BED file.

    Reads the BED positions, keeps the VCF header lines plus the records that
    fall on those positions, and writes the result to ``output_file``.

    Parameters
    ----------
    bed_file : str
        Path of the BED file describing the covered regions.
    vcf_file : str
        Path of the VCF file to filter.
    output_file : str
        Path the filtered VCF is written to (overwritten if present).

    The defaults are the paths this notebook was originally run with, so
    calling ``main()`` without arguments behaves as before; pass other paths
    to process a different sample.
    """
    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)
    write_filtered_vcf(filtered_vcf_records, output_file)

if __name__ == "__main__":
    main()

In [70]:
# Load the BED-filtered VCF as a headerless tab-separated table, with '#'
# treated as the comment character so header lines are not parsed as data.
vcf = pd.read_csv(
    r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHMBPRGPONC13_final.vcf',
    sep='\t',
    comment='#',
    header=None,
    low_memory=False,
)
vcf.columns = [
    'CHROM', 'POS', 'rsID', 'REF', 'ALT',
    'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE',
]

# Split the colon-delimited SAMPLE field into one column per sub-field.
sample_cols = vcf['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = [
    'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ',
    'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR',
]

# Place the sub-fields beside the fixed columns, then drop the columns that
# are not carried forward (FORMAT, the raw SAMPLE string, RBQ and ABQ).
vcf = pd.concat([vcf, sample_cols], axis='columns').drop(
    columns=['FORMAT', 'SAMPLE', 'RBQ', 'ABQ']
)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
0,chr1,942335,rs6605066,C,G,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,18,4
1,chr1,944858,rs3748592,A,G,.,PASS,"ADP=26;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.07907,0...",1/1,146,27,26,0,26,100%,2.0165E-15,0,0,18,8
2,chr1,946247,rs2272757,G,A,.,PASS,"ADP=90;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5581,0....",0/1,149,90,90,49,41,45.56%,1.1691E-15,41,8,35,6
3,chr1,948245,rs4970378,A,G,.,PASS,"ADP=34;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",1/1,194,37,34,0,34,100%,3.5146E-20,0,0,29,5
4,chr1,952180,rs3748595,A,C,.,PASS,"ADP=14;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1002,0....",1/1,70,14,14,0,13,92.86%,9.6148E-8,0,0,8,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32941,chrY,11986608,rs879016544,T,C,.,PASS,ADP=83;WT=0;HET=0;HOM=1;NC=0;ASP;RS=879016544;...,1/1,255,83,83,0,80,100%,1.0864E-47,0,0,27,53
32942,chrY,11986732,rs867343413,C,T,.,PASS,ADP=14;WT=0;HET=0;HOM=1;NC=0;ASP;RS=867343413;...,1/1,76,14,14,0,14,100%,2.4927E-8,0,0,13,1
32943,chrY,56961138,.,A,G,.,PASS,ADP=51;WT=0;HET=1;HOM=0;NC=0;CSQ=G|intergenic_...,0/1,133,51,51,18,33,64.71%,4.3121E-14,17,1,25,8
32944,chrY,56961295,.,G,T,.,PASS,ADP=19;WT=0;HET=0;HOM=1;NC=0;CSQ=T|intergenic_...,1/1,105,19,19,0,19,100%,2.8292E-11,0,0,14,5


In [71]:
# Derive a 'Zygosity' label from the HET= / HOM= flags in the INFO string and
# keep only the columns needed for matching against the annotation table.
#
# The frame is rebuilt in one chain instead of being written to column by
# column: `vcf` comes from a column selection, so in-place writes on it trigger
# pandas' SettingWithCopy warning (hidden here by the global warnings filter).
# `expand=False` makes `str.extract` return a Series rather than a one-column
# DataFrame.
vcf = (
    vcf
    .assign(
        HET=lambda df: df['INFO'].str.extract(r'HET=(\d)', expand=False),
        HOM=lambda df: df['INFO'].str.extract(r'HOM=(\d)', expand=False),
        # np.select takes the first matching condition, so HET is listed first
        # to win when both flags are '1'; rows with neither flag get ''.
        Zygosity=lambda df: np.select(
            [df['HET'] == '1', df['HOM'] == '1'],
            ['Heterozygous', 'Homozygous'],
            default='',
        ),
    )
    [['rsID', 'Zygosity', 'REF', 'ALT', 'CHROM']]
)
vcf

Unnamed: 0,rsID,Zygosity,REF,ALT,CHROM
0,rs6605066,Homozygous,C,G,chr1
1,rs3748592,Homozygous,A,G,chr1
2,rs2272757,Heterozygous,G,A,chr1
3,rs4970378,Homozygous,A,G,chr1
4,rs3748595,Homozygous,A,C,chr1
...,...,...,...,...,...
32941,rs879016544,Homozygous,T,C,chrY
32942,rs867343413,Homozygous,C,T,chrY
32943,.,Heterozygous,A,G,chrY
32944,.,Homozygous,G,T,chrY


In [68]:
# Unique rsIDs from the annotation workbook, each tagged with matched='yes'.
# The first occurrence of every rsID is kept, along with its original row index.
data = (
    pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Breast_cancer.xlsx')
    [['rsID']]
    .drop_duplicates(subset='rsID')
    .assign(matched='yes')
)
data

Unnamed: 0,rsID,matched
0,rs12248560,yes
2,rs3745274,yes
5,rs4244285,yes
8,rs2070096,yes
11,rs2284922,yes
14,rs2229046,yes
17,rs773123,yes
20,rs1136201,yes
23,rs1056836,yes
26,rs2231142,yes


In [74]:
# Left-join the rsID list onto the sample's variants. An rsID with no row in
# `vcf` has a missing Zygosity after the join and is labelled "Wild".
df_map = (
    data
    .merge(vcf, how='left', on='rsID', sort=False)
    .assign(Zygosity=lambda merged: merged['Zygosity'].fillna("Wild"))
    [['rsID', 'Zygosity', 'REF', 'ALT']]
)
df_map

Unnamed: 0,rsID,Zygosity,REF,ALT
0,rs12248560,Wild,,
1,rs3745274,Wild,,
2,rs4244285,Heterozygous,G,A
3,rs2070096,Wild,,
4,rs2284922,Wild,,
5,rs2229046,Wild,,
6,rs773123,Wild,,
7,rs1136201,Wild,,
8,rs1056836,Wild,,
9,rs2231142,Wild,,


In [85]:
# Reload the annotation workbook with all of its columns.
# (Deriving REF / ALT from the 'Allele' column is intentionally not done here.)
data = pd.read_excel(io=r'C:/Users/GenepoweRx_Madhu/Downloads/Breast_cancer.xlsx')
data

Unnamed: 0,Profile (Header),Category(Class of Drugs),CHROM,Gene,rsID,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function
0,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",CT,Heterozygous,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the CT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,
1,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",TT,Homozygous,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the TT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,
2,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GG,Homozygous,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GG genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Decreased Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,
3,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GT,Heterozygous,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,
4,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",TT,Wild,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the TT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,CT,Heterozygous,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the CT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,Intermediate SE,,,,3,1,1,0.00,,,,2021-03-24,
226,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,TT,Homozygous,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the TT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,High SE,,,,3,1,1,0.00,,,,2021-03-24,
227,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AA,Wild,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AA genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,
228,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AC,Heterozygous,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AC genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,


In [86]:
# Keep only the annotation rows whose rsID and zygosity both match the sample.
final = df_map.merge(data, how='inner', on=['rsID', 'Zygosity'], sort=False)
final

Unnamed: 0,rsID,Zygosity,REF,ALT,Profile (Header),Category(Class of Drugs),CHROM,Gene,is present,Haplotype,Allele,Genotype,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function
0,rs3745274,Wild,,,Breast Neoplasms,Neoplasms,chr19,CYP2B6,Covered,-,"G>A,T",TT,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the TT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,
1,rs4244285,Heterozygous,G,A,Breast Neoplasms,Neoplasms,chr10,CYP2C19,Covered,-,"G>A,C,T",AG,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202551,Patients with the AG genotype and Breast Cance...,Breast Neoplasms,Efficacy,Good,,,,,3,2,2,3.00,,Tier 1 VIP,,2021-03-24,
2,rs4244285,Heterozygous,G,A,Breast Neoplasms,Neoplasms,chr10,CYP2C19,Covered,-,"G>A,C,T",AG,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718271,Patients with AG genotype and breast cancer ma...,Breast Neoplasms,Toxicity,,High SE,,,,3,1,1,2.75,,Tier 1 VIP,,2021-03-24,
3,rs2070096,Wild,,,Breast Neoplasms,Neoplasms,chr2,BARD1,Covered,-,"C>A,G,T",CC,-,Breast Neoplasms,Carboplatin;Docetaxel;Trastuzumab,Carboplatin;Docetaxel;Trastuzumab,1449713649,Patients with the CC genotype and breast cance...,Breast Neoplasms,Efficacy,Good,,,,,3,1,1,2.50,,,,2021-03-24,
4,rs2284922,Wild,,,Breast Neoplasms,Neoplasms,chr6,RNF8,Covered,-,G>A,GG,-,Breast Neoplasms,Carboplatin;Docetaxel;Trastuzumab,Carboplatin;Docetaxel;Trastuzumab,1449713660,Patients with the GG genotype and breast cance...,Breast Neoplasms,Efficacy,Poor,,,,,3,1,1,2.50,,,,2021-03-24,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,rs351855,Wild,,,Breast Neoplasms,Neoplasms,chr5,FGFR4,Covered,-,G>A,GG,-,Breast Neoplasms,Cyclophosphamide;Fluorouracil;Methotrexate,Cyclophosphamide;Fluorouracil;Methotrexate,1447963611,Patients with the GG genotype and node-positiv...,Breast Neoplasms,Efficacy,Good,,,,,3,1,1,1.75,,,,2021-03-24,
73,rs1804645,Wild,,,Breast Neoplasms,Neoplasms,chr2,NCOA1,Covered,-,C>T,CC,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1448100644,Female patients with the CC genotype may have ...,Breast Neoplasms,Toxicity,,Low SE,,,Female,3,1,5,1.00,,,,2021-03-24,
74,rs7349683,Homozygous,C,T,Breast Neoplasms,Neoplasms,chr4,EPHA5,Covered,-,"C>A,T",TT,-,Breast Neoplasms,Paclitaxel,Paclitaxel,1448100826,Women with the TT genotype and breast or ovari...,Breast Neoplasms;Ovarian Neoplasms;Peripheral ...,Toxicity,,High SE,,,Female,3,4,4,2.00,,,,2021-03-24,
75,rs55633228,Wild,,,Breast Neoplasms,Neoplasms,chr19,PIK3R2,Covered,-,C>T,CC,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449271039,Patients with breast cancer and the CC genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,Low SE,,,,3,1,1,0.00,,Rare Variant,,2021-03-24,


In [94]:
# Frequency of each Gender value, with missing values counted as well.
final['Gender'].value_counts(dropna=False)

NaN       64
Female    13
Name: Gender, dtype: int64

In [116]:
# Give the drug and outcome columns identifier-friendly names, then keep only
# those five columns.
final_new = (
    final
    .rename(columns={
        'MODIFIED_Drugs(Molecules)': 'Molecules',
        'Drug response': 'Drug_response',
        'Side effects': 'Side_effects',
        'Metabolism status': 'Metabolism_status',
        'Dosage status': 'Dosage_status',
    })
    [['Molecules', 'Drug_response', 'Side_effects', 'Metabolism_status', 'Dosage_status']]
)
final_new

Unnamed: 0,Molecules,Drug_response,Side_effects,Metabolism_status,Dosage_status
0,Cyclophosphamide;Doxorubicin,,,,Intermediate Dose
1,Cyclophosphamide;Doxorubicin,Good,,,
2,Cyclophosphamide;Doxorubicin;Fluorouracil,,High SE,,
3,Carboplatin;Docetaxel;Trastuzumab,Good,,,
4,Carboplatin;Docetaxel;Trastuzumab,Poor,,,
...,...,...,...,...,...
72,Cyclophosphamide;Fluorouracil;Methotrexate,Good,,,
73,Tamoxifen,,Low SE,,
74,Paclitaxel,,High SE,,
75,Cyclophosphamide;Epirubicin;Fluorouracil,,Low SE,,


In [129]:
# Count rows per molecule combination for each 'Drug_response' value.
drug = (
    final_new
    .pivot_table(index='Molecules', columns='Drug_response', aggfunc='size', fill_value=0)
    .reset_index()
)
drug

Drug_response,Molecules,Good,Intermediate,Poor
0,Anthracyclines and related substances;Taxanes,0,0,1
1,Bevacizumab,1,0,0
2,Capecitabine;Docetaxel,0,0,1
3,Carboplatin;Docetaxel;Trastuzumab,3,0,2
4,Cyclophosphamide,1,1,0
5,Cyclophosphamide;Doxorubicin,2,0,3
6,Cyclophosphamide;Doxorubicin;Fluorouracil,1,0,0
7,Cyclophosphamide;Doxorubicin;Fluorouracil;Meth...,1,0,1
8,Cyclophosphamide;Epirubicin,1,0,0
9,Cyclophosphamide;Epirubicin;Fluorouracil,2,0,0


In [130]:
# Count rows per molecule combination for each 'Side_effects' value.
side = (
    final_new
    .pivot_table(index='Molecules', columns='Side_effects', aggfunc='size', fill_value=0)
    .reset_index()
)
side

Side_effects,Molecules,High SE,Intermediate SE,Low SE
0,Anastrozole,1,0,1
1,Cyclophosphamide;Doxorubicin,0,0,1
2,Cyclophosphamide;Doxorubicin;Fluorouracil,7,0,9
3,Cyclophosphamide;Epirubicin,1,0,1
4,Cyclophosphamide;Epirubicin;Fluorouracil,0,0,3
5,Cyclophosphamide;Epirubicin;Paclitaxel,1,0,0
6,Docetaxel,1,0,1
7,Doxorubicin,0,0,2
8,Everolimus,0,2,0
9,Letrozole,1,1,2


In [131]:
# Count rows per molecule combination for each 'Metabolism_status' value.
Metabolism = (
    final_new
    .pivot_table(index='Molecules', columns='Metabolism_status', aggfunc='size', fill_value=0)
    .reset_index()
)
Metabolism

Metabolism_status,Molecules,Good Metabolism,Poor Metabolism
0,Anastrozole,0,1
1,Doxorubicin,1,0
2,Tamoxifen,1,0


In [132]:
# Count rows per molecule combination for each 'Dosage_status' value.
Dosage = (
    final_new
    .pivot_table(index='Molecules', columns='Dosage_status', aggfunc='size', fill_value=0)
    .reset_index()
)
Dosage

Dosage_status,Molecules,Decreased Dose,Increased Dose,Intermediate Dose
0,Cyclophosphamide;Doxorubicin,0,0,1
1,Cyclophosphamide;Doxorubicin;Fluorouracil,1,0,0
2,Doxorubicin,0,1,0


In [146]:
# Outer join so a molecule present in only one of the two tables is retained;
# the columns of the table it is absent from are left missing.
mer1 = drug.merge(side, how='outer', on='Molecules', sort=False)
mer1

Unnamed: 0,Molecules,Good,Intermediate,Poor,High SE,Intermediate SE,Low SE
0,Anthracyclines and related substances;Taxanes,0.0,0.0,1.0,,,
1,Bevacizumab,1.0,0.0,0.0,,,
2,Capecitabine;Docetaxel,0.0,0.0,1.0,,,
3,Carboplatin;Docetaxel;Trastuzumab,3.0,0.0,2.0,,,
4,Cyclophosphamide,1.0,1.0,0.0,,,
5,Cyclophosphamide;Doxorubicin,2.0,0.0,3.0,0.0,0.0,1.0
6,Cyclophosphamide;Doxorubicin;Fluorouracil,1.0,0.0,0.0,7.0,0.0,9.0
7,Cyclophosphamide;Doxorubicin;Fluorouracil;Meth...,1.0,0.0,1.0,,,
8,Cyclophosphamide;Epirubicin,1.0,0.0,0.0,1.0,0.0,1.0
9,Cyclophosphamide;Epirubicin;Fluorouracil,2.0,0.0,0.0,0.0,0.0,3.0


In [147]:
# Add the metabolism-status counts, again keeping molecules from either side.
mer2 = mer1.merge(Metabolism, how='outer', on='Molecules', sort=False)
mer2

Unnamed: 0,Molecules,Good,Intermediate,Poor,High SE,Intermediate SE,Low SE,Good Metabolism,Poor Metabolism
0,Anthracyclines and related substances;Taxanes,0.0,0.0,1.0,,,,,
1,Bevacizumab,1.0,0.0,0.0,,,,,
2,Capecitabine;Docetaxel,0.0,0.0,1.0,,,,,
3,Carboplatin;Docetaxel;Trastuzumab,3.0,0.0,2.0,,,,,
4,Cyclophosphamide,1.0,1.0,0.0,,,,,
5,Cyclophosphamide;Doxorubicin,2.0,0.0,3.0,0.0,0.0,1.0,,
6,Cyclophosphamide;Doxorubicin;Fluorouracil,1.0,0.0,0.0,7.0,0.0,9.0,,
7,Cyclophosphamide;Doxorubicin;Fluorouracil;Meth...,1.0,0.0,1.0,,,,,
8,Cyclophosphamide;Epirubicin,1.0,0.0,0.0,1.0,0.0,1.0,,
9,Cyclophosphamide;Epirubicin;Fluorouracil,2.0,0.0,0.0,0.0,0.0,3.0,,


In [148]:
# Add the dosage-status counts, again keeping molecules from either side.
mer3 = mer2.merge(Dosage, how='outer', on='Molecules', sort=False)
mer3

Unnamed: 0,Molecules,Good,Intermediate,Poor,High SE,Intermediate SE,Low SE,Good Metabolism,Poor Metabolism,Decreased Dose,Increased Dose,Intermediate Dose
0,Anthracyclines and related substances;Taxanes,0.0,0.0,1.0,,,,,,,,
1,Bevacizumab,1.0,0.0,0.0,,,,,,,,
2,Capecitabine;Docetaxel,0.0,0.0,1.0,,,,,,,,
3,Carboplatin;Docetaxel;Trastuzumab,3.0,0.0,2.0,,,,,,,,
4,Cyclophosphamide,1.0,1.0,0.0,,,,,,,,
5,Cyclophosphamide;Doxorubicin,2.0,0.0,3.0,0.0,0.0,1.0,,,0.0,0.0,1.0
6,Cyclophosphamide;Doxorubicin;Fluorouracil,1.0,0.0,0.0,7.0,0.0,9.0,,,1.0,0.0,0.0
7,Cyclophosphamide;Doxorubicin;Fluorouracil;Meth...,1.0,0.0,1.0,,,,,,,,
8,Cyclophosphamide;Epirubicin,1.0,0.0,0.0,1.0,0.0,1.0,,,,,
9,Cyclophosphamide;Epirubicin;Fluorouracil,2.0,0.0,0.0,0.0,0.0,3.0,,,,,


In [149]:
# Save the combined per-molecule counts to Excel, without the row index.
mer3.to_excel(
    excel_writer=r'C:/Users/GenepoweRx_Madhu/Downloads/KHMBPRGPONC13_counts.xlsx',
    index=False,
)

In [142]:
# Number of rows per molecule combination, most frequent first.
final_new['Molecules'].value_counts()

Cyclophosphamide;Doxorubicin;Fluorouracil                 16
Paclitaxel                                                 8
Cyclophosphamide;Doxorubicin                               7
Tamoxifen                                                  6
Cyclophosphamide;Epirubicin;Fluorouracil                   5
Carboplatin;Docetaxel;Trastuzumab                          5
Letrozole                                                  4
Trastuzumab                                                3
Docetaxel                                                  3
Doxorubicin                                                3
Cyclophosphamide;Epirubicin                                3
Anastrozole                                                3
Cyclophosphamide;Doxorubicin;Fluorouracil;Methotrexate     2
Cyclophosphamide                                           2
Everolimus                                                 2
Capecitabine;Docetaxel                                     1
Cyclophosphamide;Epirubi

In [145]:
# Same counts, ordered alphabetically by molecule name. The frequency sort is
# skipped because the index sort determines the final order anyway.
alphabetical_counts = final_new['Molecules'].value_counts(sort=False).sort_index()
alphabetical_counts

Anastrozole                                                3
Anthracyclines and related substances;Taxanes              1
Bevacizumab                                                1
Capecitabine;Docetaxel                                     1
Carboplatin;Docetaxel;Trastuzumab                          5
Cyclophosphamide                                           2
Cyclophosphamide;Doxorubicin                               7
Cyclophosphamide;Doxorubicin;Fluorouracil                 16
Cyclophosphamide;Doxorubicin;Fluorouracil;Methotrexate     2
Cyclophosphamide;Epirubicin                                3
Cyclophosphamide;Epirubicin;Fluorouracil                   5
Cyclophosphamide;Epirubicin;Paclitaxel                     1
Cyclophosphamide;Fluorouracil;Methotrexate                 1
Docetaxel                                                  3
Doxorubicin                                                3
Everolimus                                                 2
Letrozole               

#  @##############################################################

In [93]:
# Save the matched annotation rows to Excel, without the row index.
final.to_excel(
    excel_writer=r'C:/Users/GenepoweRx_Madhu/Downloads/KHMBPRGPONC13_Breast_anticancer_pgx_main.xlsx',
    index=False,
)

In [99]:
# Number of rows with a non-missing molecule entry.
final['MODIFIED_Drugs(Molecules)'].notna().sum()

77

In [96]:
# Rows annotated for the Cyclophosphamide + Doxorubicin combination.
mol1 = final.loc[final['MODIFIED_Drugs(Molecules)'].eq('Cyclophosphamide;Doxorubicin')]
mol1

Unnamed: 0,rsID,Zygosity,REF,ALT,Profile (Header),Category(Class of Drugs),CHROM,Gene,is present,Haplotype,Allele,Genotype,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function
0,rs3745274,Wild,,,Breast Neoplasms,Neoplasms,chr19,CYP2B6,Covered,-,"G>A,T",TT,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the TT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.0,,Tier 1 VIP,,2021-03-24,
1,rs4244285,Heterozygous,G,A,Breast Neoplasms,Neoplasms,chr10,CYP2C19,Covered,-,"G>A,C,T",AG,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202551,Patients with the AG genotype and Breast Cance...,Breast Neoplasms,Efficacy,Good,,,,,3,2,2,3.0,,Tier 1 VIP,,2021-03-24,
15,rs1143684,Wild,,,Breast Neoplasms,Neoplasms,chr6,NQO2,Covered,-,"C>A,G,T",CC,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,1153928101,Patients with the CC genotype and Breast Neopl...,Breast Neoplasms,Efficacy,Poor,,,,,3,2,2,1.0,,,,2021-03-24,
16,rs3211371,Wild,,,Breast Neoplasms,Neoplasms,chr19,CYP2B6,Covered,-,"C>A,G,T",CC,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,1043880293,Patients withe the CC genotype may have decrea...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,0.0,,Tier 1 VIP,,2021-03-24,
28,rs12210538,Heterozygous,A,G,Breast Neoplasms,Neoplasms,chr6,SLC22A16,Covered,-,A>G,AG,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,1183680575,Patients with the AG genotype may have increas...,Breast Neoplasms,Toxicity,Poor,,,,Female,3,2,2,1.75,,,,2021-03-24,
29,rs12721655,Wild,,,Breast Neoplasms,Neoplasms,chr19,CYP2B6,Covered,-,A>G,AA,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,1043880298,Patients with the AA genotype may have increas...,Breast Neoplasms,Efficacy,Good,,,,,3,2,2,1.0,,Rare Variant; Tier 1 VIP,,2021-03-24,
47,rs2032582,Heterozygous,A,T,Breast Neoplasms,Neoplasms,chr7,ABCB1,Covered,-,"A>C,T",CT,-,Breast Neoplasms,Cyclophosphamide+Doxorubicin,Cyclophosphamide;Doxorubicin,981238025,Patients with the CT genotype may have decreas...,Breast Neoplasms,Efficacy,Poor,,,,,3,2,2,1.5,,Tier 1 VIP,,2021-03-25,


In [98]:
# Frequency of each non-missing drug response within that subset.
mol1.loc[:, 'Drug response'].value_counts()

Poor    3
Good    2
Name: Drug response, dtype: int64

In [83]:
# Look at the row(s) of a single rsID.
final.loc[final['rsID'].isin(['rs3212986'])]

Unnamed: 0,rsID,Zygosity,REF,ALT,Profile (Header),Category(Class of Drugs),CHROM,Gene,is present,Haplotype,Allele,Genotype,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function
31,rs3212986,Wild,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,


In [69]:
# Dimensions of `data` as a (rows, columns) tuple.
data.shape

(52, 2)

In [61]:
# Keep every variant of the sample; 'matched' is filled only for rsIDs that
# also appear in `data`.
merged_df = vcf.merge(data, how='left', on='rsID', sort=False)
merged_df

Unnamed: 0,rsID,Zygosity,REF,ALT,CHROM,matched
0,rs6605066,Homozygous,C,G,chr1,
1,rs3748592,Homozygous,A,G,chr1,
2,rs2272757,Heterozygous,G,A,chr1,
3,rs4970378,Homozygous,A,G,chr1,
4,rs3748595,Homozygous,A,C,chr1,
...,...,...,...,...,...,...
32941,rs879016544,Homozygous,T,C,chrY,
32942,rs867343413,Homozygous,C,T,chrY,
32943,.,Heterozygous,A,G,chrY,
32944,.,Homozygous,G,T,chrY,


In [63]:
# How many variants matched an rsID in `data` (missing values = no match).
merged_df['matched'].value_counts(dropna=False)

NaN    32931
yes       15
Name: matched, dtype: int64

In [64]:
# Label the unmatched variants "Wild"; only the 'matched' column is filled.
merged_df = merged_df.fillna({'matched': "Wild"})
merged_df

Unnamed: 0,rsID,Zygosity,REF,ALT,CHROM,matched
0,rs6605066,Homozygous,C,G,chr1,Wild
1,rs3748592,Homozygous,A,G,chr1,Wild
2,rs2272757,Heterozygous,G,A,chr1,Wild
3,rs4970378,Homozygous,A,G,chr1,Wild
4,rs3748595,Homozygous,A,C,chr1,Wild
...,...,...,...,...,...,...
32941,rs879016544,Homozygous,T,C,chrY,Wild
32942,rs867343413,Homozygous,C,T,chrY,Wild
32943,.,Heterozygous,A,G,chrY,Wild
32944,.,Homozygous,G,T,chrY,Wild


In [65]:
# For matched variants, replace the 'yes' marker with the variant's zygosity;
# every other row keeps its current 'matched' value.
# Vectorised with Series.where instead of a row-wise apply(axis=1): same result
# on this frame, no Python-level loop, and it still yields a Series when the
# frame is empty.
merged_df['matched'] = merged_df['Zygosity'].where(
    merged_df['matched'] == 'yes',
    merged_df['matched'],
)
merged_df

Unnamed: 0,rsID,Zygosity,REF,ALT,CHROM,matched
0,rs6605066,Homozygous,C,G,chr1,Wild
1,rs3748592,Homozygous,A,G,chr1,Wild
2,rs2272757,Heterozygous,G,A,chr1,Wild
3,rs4970378,Homozygous,A,G,chr1,Wild
4,rs3748595,Homozygous,A,C,chr1,Wild
...,...,...,...,...,...,...
32941,rs879016544,Homozygous,T,C,chrY,Wild
32942,rs867343413,Homozygous,C,T,chrY,Wild
32943,.,Heterozygous,A,G,chrY,Wild
32944,.,Homozygous,G,T,chrY,Wild


In [66]:
# Distribution of the final 'matched' labels.
merged_df['matched'].value_counts()

Wild            32931
Heterozygous       14
Homozygous          1
Name: matched, dtype: int64

In [32]:
# REF is the text before '>' in 'Allele' (e.g. 'C' from 'C>A,T').
data = data.assign(REF=data['Allele'].str.split('>').str.get(0))
data

Unnamed: 0,Profile (Header),Category(Class of Drugs),CHROM,Gene,rsID,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function,REF
0,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the CT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C
1,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the TT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C
2,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GG,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GG genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Decreased Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G
3,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G
4,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",TT,Wild Type,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the TT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the CT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,Intermediate SE,,,,3,1,1,0.00,,,,2021-03-24,,C
226,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the TT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,High SE,,,,3,1,1,0.00,,,,2021-03-24,,C
227,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AA,Wild Type,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AA genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,A
228,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AC,Heterozygous Mutant,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AC genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,A


In [33]:
# ALT is the second '>'-separated piece of 'Allele' (e.g. 'A,T' from 'C>A,T').
data = data.assign(ALT=data['Allele'].str.split('>').str.get(1))
data

Unnamed: 0,Profile (Header),Category(Class of Drugs),CHROM,Gene,rsID,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function,REF,ALT
0,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the CT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C,"A,T"
1,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the TT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C,"A,T"
2,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GG,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GG genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Decreased Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G,"A,T"
3,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G,"A,T"
4,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",TT,Wild Type,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the TT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G,"A,T"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the CT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,Intermediate SE,,,,3,1,1,0.00,,,,2021-03-24,,C,T
226,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the TT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,High SE,,,,3,1,1,0.00,,,,2021-03-24,,C,T
227,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AA,Wild Type,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AA genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,A,C
228,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AC,Heterozygous Mutant,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AC genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,A,C


In [34]:
# Turn the comma-separated ALT string (e.g. "A,T") into a list, then emit one
# row per alternate allele. explode() repeats the original index label for
# every row produced from the same source row.
data['ALT'] = data['ALT'].str.split(pat=',')
data = data.explode(column='ALT')
data

Unnamed: 0,Profile (Header),Category(Class of Drugs),CHROM,Gene,rsID,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function,REF,ALT
0,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the CT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C,A
0,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the CT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C,T
1,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the TT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C,A
1,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the TT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C,T
2,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GG,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GG genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Decreased Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the CT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,Intermediate SE,,,,3,1,1,0.00,,,,2021-03-24,,C,T
226,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the TT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,High SE,,,,3,1,1,0.00,,,,2021-03-24,,C,T
227,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AA,Wild Type,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AA genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,A,C
228,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AC,Heterozygous Mutant,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AC genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,A,C


In [37]:
# Inner join: keep only rows of `vcf` that agree with a row of `data` on every
# key column (variant id, zygosity, chromosome and both alleles).
merged_data = vcf.merge(
    data,
    how='inner',
    on=['rsID', 'Zygosity', 'CHROM', 'REF', 'ALT'],
    sort=False,
)
merged_data

Unnamed: 0,rsID,Zygosity,REF,ALT,CHROM,matched,Profile (Header),Category(Class of Drugs),Gene,is present,Haplotype,Allele,Genotype,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function
0,rs1801274,Heterozygous Mutant,A,G,chr1,yes,Breast Neoplasms,Neoplasms,FCGR2A,Covered,-,"A>C,G",AG,-,Breast Neoplasms,Trastuzumab,Trastuzumab,1185003571,Patients with the AG genotype may have decreas...,Breast Neoplasms,Efficacy,Poor,,,,,3,2,4,4.75,,,,2021-03-24,
1,rs4244285,Heterozygous Mutant,G,A,chr10,yes,Breast Neoplasms,Neoplasms,CYP2C19,Covered,-,"G>A,C,T",AG,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202551,Patients with the AG genotype and Breast Cance...,Breast Neoplasms,Efficacy,Good,,,,,3,2,2,3.0,,Tier 1 VIP,,2021-03-24,
2,rs4244285,Heterozygous Mutant,G,A,chr10,yes,Breast Neoplasms,Neoplasms,CYP2C19,Covered,-,"G>A,C,T",AG,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718271,Patients with AG genotype and breast cancer ma...,Breast Neoplasms,Toxicity,,High SE,,,,3,1,1,2.75,,Tier 1 VIP,,2021-03-24,
3,rs2273697,Heterozygous Mutant,G,A,chr10,yes,Breast Neoplasms,Neoplasms,ABCC2,Covered,-,G>A,AG,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718277,Patients with AG genotype and breast cancer ma...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.75,,,,2021-03-24,
4,rs700518,Heterozygous Mutant,T,C,chr15,yes,Breast Neoplasms,Neoplasms,CYP19A1,Covered,-,T>C,AC,-,Breast Neoplasms,Letrozole,Letrozole,1447682293,Post-menopausal women with breast cancer and t...,Breast Neoplasms;Menopause,Other,,Intermediate SE,,,Female,3,1,1,1.5,,,,2021-03-24,
5,rs2228100,Heterozygous Mutant,G,C,chr17,yes,Breast Neoplasms,Neoplasms,ALDH3A1,Covered,-,"G>A,C,T",CG,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718300,Patients with CG genotype and breast cancer ma...,Breast Neoplasms,Toxicity,,High SE,,,,3,1,1,2.75,,,,2021-03-24,
6,rs11615,Heterozygous Mutant,A,G,chr19,yes,Breast Neoplasms,Neoplasms,ERCC1,Covered,-,A>G,AG,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718317,Patients with AG genotype and breast cancer ma...,Breast Neoplasms,Toxicity,,High SE,,,,3,1,1,3.25,,,,2021-03-24,
7,rs11615,Heterozygous Mutant,A,G,chr19,yes,Breast Neoplasms,Neoplasms,ERCC1,Covered,-,A>G,AG,-,Breast Neoplasms,Docetaxel,Docetaxel,1445401853,Patients with the AG genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,1.75,,,,2021-03-24,
8,rs2011425,Heterozygous Mutant,T,G,chr2,yes,Breast Neoplasms,Neoplasms,UGT1A4,Covered,-,"T>A,G",GT,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1448112196,Patients with the GT genotype and breast cance...,Breast Neoplasms,Metabolism/PK,,,Good Metabolism,,,3,2,2,1.75,,,,2021-03-24,
9,rs7349683,Homozygous Mutant,C,T,chr4,yes,Breast Neoplasms,Neoplasms,EPHA5,Covered,-,"C>A,T",TT,-,Breast Neoplasms,Paclitaxel,Paclitaxel,1448100826,Women with the TT genotype and breast or ovari...,Breast Neoplasms;Ovarian Neoplasms;Peripheral ...,Toxicity,,High SE,,,Female,3,4,4,2.0,,,,2021-03-24,


In [23]:
# Rows with no value in 'matched' are labelled "Wild Type".
merged['matched'] = merged['matched'].fillna(value="Wild Type")
merged

Unnamed: 0,Profile (Header),Category(Class of Drugs),Chromosome,Gene,rsID,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function,matched
0,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the CT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,Wild Type
1,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the TT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,Wild Type
2,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GG,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GG genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Decreased Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,Wild Type
3,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,Wild Type
4,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",TT,Wild Type,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the TT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,Wild Type
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the CT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,Intermediate SE,,,,3,1,1,0.00,,,,2021-03-24,,Wild Type
226,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the TT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,High SE,,,,3,1,1,0.00,,,,2021-03-24,,Wild Type
227,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AA,Wild Type,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AA genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,Wild Type
228,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AC,Heterozygous Mutant,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AC genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,Wild Type


In [24]:
# Where 'matched' is the flag 'yes', replace it with that row's Zygosity;
# every other value (e.g. "Wild Type") is left untouched.
# A vectorised mask is used instead of a row-wise apply(axis=1): it avoids a
# Python-level loop over rows and still yields a Series when `merged` is
# empty (apply can hand back a DataFrame there, which breaks the assignment).
is_flagged = merged['matched'] == 'yes'
merged['matched'] = merged['matched'].mask(is_flagged, merged['Zygosity'])
merged

Unnamed: 0,Profile (Header),Category(Class of Drugs),Chromosome,Gene,rsID,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function,matched
0,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the CT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,Wild Type
1,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the TT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,Wild Type
2,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GG,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GG genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Decreased Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,Wild Type
3,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,Wild Type
4,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",TT,Wild Type,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the TT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,Wild Type
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the CT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,Intermediate SE,,,,3,1,1,0.00,,,,2021-03-24,,Wild Type
226,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the TT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,High SE,,,,3,1,1,0.00,,,,2021-03-24,,Wild Type
227,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AA,Wild Type,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AA genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,Wild Type
228,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AC,Heterozygous Mutant,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AC genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,Wild Type


In [25]:
# Frequency of each label in the 'matched' column.
merged['matched'].value_counts()

Wild Type              201
Heterozygous Mutant     28
Homozygous Mutant        1
Name: matched, dtype: int64

In [26]:
# Write the annotated table to an Excel workbook without the index column.
# NOTE(review): the destination is an absolute, machine-specific path; consider
# moving it to a configurable output directory.
merged.to_excel(
    excel_writer=r'C:/Users/GenepoweRx_Madhu/Downloads/KHMBPRGPONC13_Breast_anticancer_pgx.xlsx',
    index=False,
)

In [10]:
# Allele strings look like "C>A,T"; the part before '>' is taken as the
# reference allele.
data['REF'] = data['Allele'].str.split(pat='>').str.get(0)
data

Unnamed: 0,Profile (Header),Category(Class of Drugs),Chromosome,Gene,Variant,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drug(s),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function,REF
0,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the CT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C
1,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the TT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C
2,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GG,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GG genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Decreased Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G
3,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G
4,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",TT,Wild Type,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the TT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the CT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,Intermediate SE,,,,3,1,1,0.00,,,,2021-03-24,,C
226,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the TT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,High SE,,,,3,1,1,0.00,,,,2021-03-24,,C
227,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AA,Wild Type,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AA genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,A
228,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AC,Heterozygous Mutant,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AC genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,A


In [11]:
# Distribution of reference alleles before ALT is exploded.
data['REF'].value_counts()

A                      96
G                      54
C                      53
T                      24
TGGTCCCACTCTTCCCACA     3
Name: REF, dtype: int64

In [12]:
# The part of the Allele string after '>' holds the alternate allele(s),
# possibly several separated by commas (e.g. "A,T").
data['ALT'] = data['Allele'].str.split(pat='>').str.get(1)
data

Unnamed: 0,Profile (Header),Category(Class of Drugs),Chromosome,Gene,Variant,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drug(s),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function,REF,ALT
0,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the CT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C,"A,T"
1,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the TT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C,"A,T"
2,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GG,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GG genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Decreased Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G,"A,T"
3,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G,"A,T"
4,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",TT,Wild Type,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the TT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G,"A,T"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the CT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,Intermediate SE,,,,3,1,1,0.00,,,,2021-03-24,,C,T
226,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the TT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,High SE,,,,3,1,1,0.00,,,,2021-03-24,,C,T
227,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AA,Wild Type,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AA genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,A,C
228,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AC,Heterozygous Mutant,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AC genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,A,C


In [13]:
# Split the comma-separated ALT value into a list and expand it so that each
# alternate allele gets its own row; index labels are repeated for rows that
# originate from the same source row.
data['ALT'] = data['ALT'].str.split(pat=',')
data = data.explode(column='ALT')
data

Unnamed: 0,Profile (Header),Category(Class of Drugs),Chromosome,Gene,Variant,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drug(s),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function,REF,ALT
0,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the CT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C,A
0,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the CT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C,T
1,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the TT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C,A
1,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the TT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,,C,T
2,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GG,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GG genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Decreased Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,,G,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the CT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,Intermediate SE,,,,3,1,1,0.00,,,,2021-03-24,,C,T
226,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the TT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,High SE,,,,3,1,1,0.00,,,,2021-03-24,,C,T
227,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AA,Wild Type,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AA genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,A,C
228,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AC,Heterozygous Mutant,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AC genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,,A,C


In [15]:
# Distribution of reference alleles after exploding ALT (counts grow because
# multi-allelic rows are now repeated once per alternate allele).
data['REF'].value_counts()

A                      174
C                      106
G                       87
T                       39
TGGTCCCACTCTTCCCACA      3
Name: REF, dtype: int64

In [25]:
# Inspect every exploded row recorded for variant rs396991.
data.loc[data['Variant'].eq('rs396991')]

Unnamed: 0,Profile (Header),Category(Class of Drugs),Chromosome,Gene,Variant,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drug(s),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function,REF,ALT
173,Breast Neoplasms,Neoplasms,chr1,FCGR3A,rs396991,Covered,-,"A>C,G,T",AA,Wild Type,-,Breast Neoplasms,Trastuzumab,Trastuzumab,1185003565,Patients with the AA genotype may have decreas...,Breast Neoplasms,Efficacy,Poor,,,,,3,2,3,4.25,,,,2021-03-24,,A,C
173,Breast Neoplasms,Neoplasms,chr1,FCGR3A,rs396991,Covered,-,"A>C,G,T",AA,Wild Type,-,Breast Neoplasms,Trastuzumab,Trastuzumab,1185003565,Patients with the AA genotype may have decreas...,Breast Neoplasms,Efficacy,Poor,,,,,3,2,3,4.25,,,,2021-03-24,,A,G
173,Breast Neoplasms,Neoplasms,chr1,FCGR3A,rs396991,Covered,-,"A>C,G,T",AA,Wild Type,-,Breast Neoplasms,Trastuzumab,Trastuzumab,1185003565,Patients with the AA genotype may have decreas...,Breast Neoplasms,Efficacy,Poor,,,,,3,2,3,4.25,,,,2021-03-24,,A,T
174,Breast Neoplasms,Neoplasms,chr1,FCGR3A,rs396991,Covered,-,"A>C,G,T",AC,Heterozygous Mutant,-,Breast Neoplasms,Trastuzumab,Trastuzumab,1185003565,Patients with the AC genotype may have decreas...,Breast Neoplasms,Efficacy,Poor,,,,,3,2,3,4.25,,,,2021-03-24,,A,C
174,Breast Neoplasms,Neoplasms,chr1,FCGR3A,rs396991,Covered,-,"A>C,G,T",AC,Heterozygous Mutant,-,Breast Neoplasms,Trastuzumab,Trastuzumab,1185003565,Patients with the AC genotype may have decreas...,Breast Neoplasms,Efficacy,Poor,,,,,3,2,3,4.25,,,,2021-03-24,,A,G
174,Breast Neoplasms,Neoplasms,chr1,FCGR3A,rs396991,Covered,-,"A>C,G,T",AC,Heterozygous Mutant,-,Breast Neoplasms,Trastuzumab,Trastuzumab,1185003565,Patients with the AC genotype may have decreas...,Breast Neoplasms,Efficacy,Poor,,,,,3,2,3,4.25,,,,2021-03-24,,A,T
175,Breast Neoplasms,Neoplasms,chr1,FCGR3A,rs396991,Covered,-,"A>C,G,T",CC,Homozygous Mutant,-,Breast Neoplasms,Trastuzumab,Trastuzumab,1185003565,Patients with the CC genotype may have increas...,Breast Neoplasms,Efficacy,Good,,,,,3,2,3,4.25,,,,2021-03-24,,A,C
175,Breast Neoplasms,Neoplasms,chr1,FCGR3A,rs396991,Covered,-,"A>C,G,T",CC,Homozygous Mutant,-,Breast Neoplasms,Trastuzumab,Trastuzumab,1185003565,Patients with the CC genotype may have increas...,Breast Neoplasms,Efficacy,Good,,,,,3,2,3,4.25,,,,2021-03-24,,A,G
175,Breast Neoplasms,Neoplasms,chr1,FCGR3A,rs396991,Covered,-,"A>C,G,T",CC,Homozygous Mutant,-,Breast Neoplasms,Trastuzumab,Trastuzumab,1185003565,Patients with the CC genotype may have increas...,Breast Neoplasms,Efficacy,Good,,,,,3,2,3,4.25,,,,2021-03-24,,A,T


In [24]:
# Inner join: keep only rows of `vcf` that agree with a row of `data` on the
# variant id, zygosity and both alleles.
merged_data = vcf.merge(
    data,
    how='inner',
    on=['Variant', 'Zygosity', 'REF', 'ALT'],
    sort=False,
)
merged_data

Unnamed: 0,Variant,Zygosity,REF,ALT,Profile (Header),Category(Class of Drugs),Chromosome,Gene,is present,Haplotype,Allele,Genotype,Hap-zygosity,Given Cancer type as Input,Drug(s),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function
0,rs1801274,Heterozygous Mutant,A,G,Breast Neoplasms,Neoplasms,chr1,FCGR2A,Covered,-,"A>C,G",AG,-,Breast Neoplasms,Trastuzumab,Trastuzumab,1185003571,Patients with the AG genotype may have decreas...,Breast Neoplasms,Efficacy,Poor,,,,,3,2,4,4.75,,,,2021-03-24,
1,rs396991,Homozygous Mutant,A,C,Breast Neoplasms,Neoplasms,chr1,FCGR3A,Covered,-,"A>C,G,T",CC,-,Breast Neoplasms,Trastuzumab,Trastuzumab,1185003565,Patients with the CC genotype may have increas...,Breast Neoplasms,Efficacy,Good,,,,,3,2,3,4.25,,,,2021-03-24,
2,rs2273697,Homozygous Mutant,G,A,Breast Neoplasms,Neoplasms,chr10,ABCC2,Covered,-,G>A,AA,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718277,Patients with AA genotype and breast cancer ma...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.75,,,,2021-03-24,
3,rs1695,Heterozygous Mutant,A,G,Breast Neoplasms,Neoplasms,chr11,GSTP1,Covered,-,"A>G,T",AG,-,Breast Neoplasms,Cyclophosphamide;Epirubicin,Cyclophosphamide;Epirubicin,1451398140,Patients with breast cancer as the rs1695 AG g...,Breast Neoplasms,Efficacy,Good,,,,,3,1,1,2.0,,Tier 1 VIP,,2021-03-26,
4,rs1695,Heterozygous Mutant,A,G,Breast Neoplasms,Neoplasms,chr11,GSTP1,Covered,-,"A>G,T",AG,-,Breast Neoplasms,Cyclophosphamide;Epirubicin,Cyclophosphamide;Epirubicin,1451398160,Patients with breast cancer and the rs1695 AG ...,Breast Neoplasms;Drug Toxicity,Toxicity,,Low SE,,,,3,1,1,2.5,,Tier 1 VIP,,2021-03-26,
5,rs1695,Heterozygous Mutant,A,G,Breast Neoplasms,Neoplasms,chr11,GSTP1,Covered,-,"A>G,T",AG,-,Breast Neoplasms,Cyclophosphamide;Epirubicin,Cyclophosphamide;Epirubicin,1451398180,There is currently no available evidence regar...,Breast Neoplasms;Neutropenia,Toxicity,,,,,,3,1,1,2.5,,Tier 1 VIP,,2021-03-26,
6,rs1695,Heterozygous Mutant,A,G,Breast Neoplasms,Neoplasms,chr11,GSTP1,Covered,-,"A>G,T",AG,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718337,Patients with the AG genotype and breast cance...,Breast Neoplasms,Toxicity,,High SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,
7,rs4646,Heterozygous Mutant,A,C,Breast Neoplasms,Neoplasms,chr15,CYP19A1,Covered,-,A>C,AC,-,Breast Neoplasms,Letrozole,Letrozole,1447680068,Post-menopausal women with the AC genotype and...,Breast Neoplasms;Menopause,Other,,Intermediate SE,,,Female,3,1,1,1.5,,,,2021-03-24,
8,rs4646,Heterozygous Mutant,A,C,Breast Neoplasms,Neoplasms,chr15,CYP19A1,Covered,-,A>C,AC,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1446897383,Post-menopausal women with the AC genotype and...,Breast Neoplasms;Menopause,Efficacy,Good,,,,Female,3,2,2,5.75,,,,2021-03-24,
9,rs4646,Heterozygous Mutant,A,C,Breast Neoplasms,Neoplasms,chr15,CYP19A1,Covered,-,A>C,AC,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AC genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.0,,,,2021-03-24,


In [32]:
# Show the distinct variant IDs present after the merge.
# A bare set of strings iterates in an order that depends on per-process hash
# randomization, so the printed output would differ between kernel restarts.
# Sorting makes the cell output reproducible; key=str keeps every distinct value
# (including a possible NaN) comparable instead of raising on mixed types.
print(sorted(set(merged_data["Variant"]), key=str))

{'rs1128503', 'rs2284922', 'rs25487', 'rs2273697', 'rs1801274', 'rs396991', 'rs2228100', 'rs1799983', 'rs11615', 'rs2070744', 'rs4880', 'rs1045642', 'rs8133052', 'rs20572', 'rs2740574', 'rs7349683', 'rs2011425', 'rs1695', 'rs4646', 'rs351855', 'rs3745274'}


In [31]:
# Write the merged table to an Excel workbook; index=False omits the row index
# from the sheet.
# NOTE(review): the destination is an absolute, user-specific path — consider
# moving it to a configurable output directory so the notebook runs elsewhere.
merged_data.to_excel(
    excel_writer=r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/KHAPOLGPTTL17_sample_Breast_data.xlsx',
    index=False,
)