In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [5]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Remaining_variants_pharmgkb.vcf_', comment= '#', sep = '\t', header=None, low_memory=False)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
vcf = vcf[['CHROM', 'POS', 'rsID', 'REF', 'ALT']]
vcf.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/variants_pos_INFO.xlsx', index=False)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT
0,1,196673103,rs800292,G,A
1,10,94761900,rs12248560,C,"A,T"
2,10,95038992,rs10509681,T,C
3,10,95067273,rs11572080,C,"A,T"
4,10,99844450,rs3740066,C,"G,T"
5,10,122454932,rs10490924,G,"C,T"
6,11,83983729,rs2449598,T,"A,C,G"
7,15,51237068,rs3759811,T,"A,C"
8,15,51243825,rs4775936,C,T
9,15,51257402,rs1008805,G,A


In [6]:
variants = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/variants_pos_INFO.xlsx')
variants

Unnamed: 0,CHROM,POS,rsID,REF,ALT
0,chr1,196673103,rs800292,G,A
1,chr10,94761900,rs12248560,C,"A,T"
2,chr10,95038992,rs10509681,T,C
3,chr10,95067273,rs11572080,C,"A,T"
4,chr10,99844450,rs3740066,C,"G,T"
5,chr10,122454932,rs10490924,G,"C,T"
6,chr11,83983729,rs2449598,T,"A,C,G"
7,chr15,51237068,rs3759811,T,"A,C"
8,chr15,51243825,rs4775936,C,T
9,chr15,51257402,rs1008805,G,A


In [8]:
import pandas as pd
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KAPA HyperExome_hg38_capture_targets (1).bed', sep = '\t', header = None, error_bad_lines=False)
df.columns = ['chromosome', 'Start_pos', 'End_pos', 'INFO']
df['Extended_Start_pos'] = df['Start_pos'] - 20
df['Extended_End_pos'] = df['End_pos'] + 20
df['Gene'] = df['INFO'].str.extract(r'gene_symbol=([^;]+)')
df['Gene'] = df['Gene'].str.split(',').str[0]
df = df[['chromosome', 'Extended_Start_pos', 'Extended_End_pos', 'Gene']]
df

Unnamed: 0,chromosome,Extended_Start_pos,Extended_End_pos,Gene
0,chr1,65489,65649,OR4F5
1,chr1,69007,70037,OR4F5
2,chr1,450710,451706,OR4F29
3,chr1,685686,686682,OR4F16
4,chr1,924401,924977,SAMD11
...,...,...,...,...
208906,chrY,25038781,25038941,BPY2C
208907,chrY,25041746,25041906,BPY2C
208908,chrY,25043888,25044048,BPY2C
208909,chrY,25622413,25624093,CDY1


In [9]:
# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in data
variants['Covered/Not_Covered'] = variants.apply(check_coverage, axis=1)
variants

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Covered/Not_Covered
0,chr1,196673103,rs800292,G,A,Covered
1,chr10,94761900,rs12248560,C,"A,T",Covered
2,chr10,95038992,rs10509681,T,C,Not_Covered
3,chr10,95067273,rs11572080,C,"A,T",Covered
4,chr10,99844450,rs3740066,C,"G,T",Covered
5,chr10,122454932,rs10490924,G,"C,T",Covered
6,chr11,83983729,rs2449598,T,"A,C,G",Not_Covered
7,chr15,51237068,rs3759811,T,"A,C",Not_Covered
8,chr15,51243825,rs4775936,C,T,Not_Covered
9,chr15,51257402,rs1008805,G,A,Not_Covered


In [10]:
variants.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/covered_not_covered_variants_pos_INFO.xlsx', index=False)

In [14]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/covered_not_covered_file.xlsx')
df

Unnamed: 0,rsID,CHROM,POS,REF,ALT,Covered/Not_Covered
0,rs75527207,chr7,117587806.0,G,A,Covered
1,rs4149056,chr12,21178615.0,T,C,Covered
2,rs1799971,chr6,154039662.0,A,G,Covered
3,rs141033578,chr7,117606695.0,C,G,Covered
4,rs141033578,chr7,117606695.0,C,T,Covered
...,...,...,...,...,...,...
3645,rs1045642,chr7,87509329.0,A,"C,G,T",Covered
3646,rs6981827,chr8,3217516.0,C,"A,T",Not_Covered
3647,rs6990851,chr8,3269991.0,A,G,Not_Covered
3648,rs6151031,chr9,72953468.0,C,"CTGGTCAGGAGAGAACCC,CTGGTGAGGAGAGAACCC",Not_Covered


In [16]:
df[df['rsID'] == 'rs1000940']

Unnamed: 0,rsID,CHROM,POS,REF,ALT,Covered/Not_Covered
1677,rs1000940,chr17,5379957.0,A,G,Not_Covered
1678,rs1000940,chr17,5379957.0,A,T,Not_Covered
1679,rs1000940,chr17,5379957.0,A,C,Not_Covered


In [18]:
grouped = df.groupby(['rsID', 'CHROM', 'POS', 'REF'])['ALT', 'Covered/Not_Covered'].agg({
    'ALT': lambda x: ','.join(x),
    'Covered/Not_Covered': 'first'
}).reset_index()
grouped

Unnamed: 0,rsID,CHROM,POS,REF,ALT,Covered/Not_Covered
0,rs10007051,chr4,129244309.0,C,T,Not_Covered
1,rs1000940,chr17,5379957.0,A,"G,T,C",Not_Covered
2,rs10012,chr2,38075247.0,G,C,Covered
3,rs10028494,chr4,69105219.0,A,"T,C",Not_Covered
4,rs10030044,chr4,156090771.0,G,"C,T,A",Not_Covered
...,...,...,...,...,...,...
2329,rs9943291,chr1,119749667.0,T,G,Not_Covered
2330,rs9958628,chr18,6739625.0,A,T,Not_Covered
2331,rs9977268,chr21,45487373.0,C,T,Not_Covered
2332,rs997917,chr8,53239818.0,T,C,Not_Covered


In [19]:
grouped.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Covered_ALT_grouping_data.xlsx', index=False)

In [24]:
grouped = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Covered_ALT_grouping_data.xlsx')
grouped

Unnamed: 0,Variant,CHROM,POS,REF,ALT,Covered/Not_Covered
0,rs10007051,chr4,129244309,C,T,Not_Covered
1,rs1000940,chr17,5379957,A,"G,T,C",Not_Covered
2,rs10012,chr2,38075247,G,C,Covered
3,rs10028494,chr4,69105219,A,"T,C",Not_Covered
4,rs10030044,chr4,156090771,G,"C,T,A",Not_Covered
...,...,...,...,...,...,...
2329,rs9943291,chr1,119749667,T,G,Not_Covered
2330,rs9958628,chr18,6739625,A,T,Not_Covered
2331,rs9977268,chr21,45487373,C,T,Not_Covered
2332,rs997917,chr8,53239818,T,C,Not_Covered


In [20]:
df_1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/MAIN_mapped_rsid_covered_data.xlsx')
df_1

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,Variant,Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,981755803,AA,Patients with the rs75527207 AA genotype (two ...,,rs75527207,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
1,981755803,AG,Patients with the rs75527207 AG genotype (one ...,,rs75527207,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
2,981755803,GG,Patients with the rs75527207 GG genotype (do n...,,rs75527207,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
3,1449311190,CC,Patients with the CC genotype and Precursor Ce...,,rs4149056,,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
4,1449311190,CT,Patients with the CT genotype and Precursor Ce...,,rs4149056,,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,1449000354,*64,Patients with the CYP2D6*64 allele may have de...,Uncertain function,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",,CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15649,1449000354,*65,Patients with the CYP2D6*65 allele may have de...,Uncertain function,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",,CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15650,1449000354,*70,Patients with the CYP2D6*70 allele may have de...,Uncertain function,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",,CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15651,1449000354,*71,Patients with the CYP2D6*71 allele may have de...,Uncertain function,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",,CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [21]:
# Iterate through the DataFrame and shift values from 'Variant' to 'Haplotypes'
for index, row in df_1.iterrows():
    if not row['Variant'].startswith('rs'):
        df_1.at[index, 'Haplotypes'] = row['Variant']
        df_1.at[index, 'Variant'] = None
        
df_1

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,Variant,Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,981755803,AA,Patients with the rs75527207 AA genotype (two ...,,rs75527207,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
1,981755803,AG,Patients with the rs75527207 AG genotype (one ...,,rs75527207,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
2,981755803,GG,Patients with the rs75527207 GG genotype (do n...,,rs75527207,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
3,1449311190,CC,Patients with the CC genotype and Precursor Ce...,,rs4149056,,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
4,1449311190,CT,Patients with the CT genotype and Precursor Ce...,,rs4149056,,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,1449000354,*64,Patients with the CYP2D6*64 allele may have de...,Uncertain function,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15649,1449000354,*65,Patients with the CYP2D6*65 allele may have de...,Uncertain function,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15650,1449000354,*70,Patients with the CYP2D6*70 allele may have de...,Uncertain function,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15651,1449000354,*71,Patients with the CYP2D6*71 allele may have de...,Uncertain function,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [23]:
df_1.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Pharmgkb_mapped_data.xlsx', index=False)

In [25]:
main = pd.merge(df_1, grouped, on = 'Variant', how = 'left', sort=False)
main

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,Variant,Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population,CHROM,POS,REF,ALT,Covered/Not_Covered
0,981755803,AA,Patients with the rs75527207 AA genotype (two ...,,rs75527207,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,chr7,117587806.0,G,A,Covered
1,981755803,AG,Patients with the rs75527207 AG genotype (one ...,,rs75527207,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,chr7,117587806.0,G,A,Covered
2,981755803,GG,Patients with the rs75527207 GG genotype (do n...,,rs75527207,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,chr7,117587806.0,G,A,Covered
3,1449311190,CC,Patients with the CC genotype and Precursor Ce...,,rs4149056,,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,chr12,21178615.0,T,C,Covered
4,1449311190,CT,Patients with the CT genotype and Precursor Ce...,,rs4149056,,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,chr12,21178615.0,T,C,Covered
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,1449000354,*64,Patients with the CYP2D6*64 allele may have de...,Uncertain function,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,,,,,
15649,1449000354,*65,Patients with the CYP2D6*65 allele may have de...,Uncertain function,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,,,,,
15650,1449000354,*70,Patients with the CYP2D6*70 allele may have de...,Uncertain function,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,,,,,
15651,1449000354,*71,Patients with the CYP2D6*71 allele may have de...,Uncertain function,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,,,,,


In [26]:
main.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Main_pharmgkb_data.xlsx', index=False)

In [9]:
df1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/new_data_considered.xlsx')
df1.head()

Unnamed: 0,PharmGKB_ID,Level,rsID,Gene,Drugs,Phenotype Categories,Phenotype,Pediatric,CHROM,POS,Covered/Not_Covered
0,981201535,1A,rs2297595,DPYD,fluorouracil,Toxicity,Neoplasms,False,chr1,97699535.0,Covered
1,1451274140,1A,rs115232898,DPYD,fluorouracil,Other,,False,chr1,97699474.0,Covered
2,1448640327,3,TPMT*1; TPMT*3B; TPMT*3C,TPMT,fluorouracil,Efficacy,Neoplasms,False,,,
3,1446903310,3,rs17109924,LGR5,fluorouracil,Efficacy,Colonic Neoplasms,False,chr12,71584007.0,Covered
4,981202240,3,rs11280056,TYMS,fluorouracil,Efficacy,Colorectal Neoplasms,False,chr18,673444.0,Covered


In [10]:
df2 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Ordered_db.xlsx')
df2.head()

Unnamed: 0,Profile (Header),Category(Class of Drugs),Chromosome,Gene,Variant(rsID),is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function
0,Colorectal Neoplasms,Neoplasms,chr18,TYMS,rs11280056,Covered,-,TTTAAAG>T,TTAAAGTTA/TTAAAGTTA,Wild type,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981202240,Patients with the TTAAAGTTA/TTAAAGTTA genotype...,Colorectal Neoplasms,Efficacy,Good,,,,,3,1,1,3.5,,Tier 1 VIP,,44279,
1,Colorectal Neoplasms,Neoplasms,chr18,TYMS,rs11280056,Covered,-,TTTAAAG>T,TTAAAGTTA/del,Heterozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981202240,Patients with the TTAAAGTTA/del genotype who a...,Colorectal Neoplasms,Efficacy,Poor,,,,,3,1,1,3.5,,Tier 1 VIP,,44279,
2,Colorectal Neoplasms,Neoplasms,chr18,TYMS,rs11280056,Covered,-,TTTAAAG>T,del/del,Homozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981202240,Patients with the del/del genotype who also ha...,Colorectal Neoplasms,Efficacy,Poor,,,,,3,1,1,3.5,,Tier 1 VIP,,44279,
3,Colorectal Neoplasms,Neoplasms,chr19,XRCC1,rs25487,Covered,-,"T>C,G",CC,Homozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981345277,Patients with the CC genotype and cancer may h...,Colonic Neoplasms;Colorectal Neoplasms;Neoplas...,Efficacy,Good,Low SE,,,,3,1,2,3.0,,,,44279,
4,Colorectal Neoplasms,Neoplasms,chr19,XRCC1,rs25487,Covered,-,"T>C,G",CT,Heterozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981345277,Patients with the CT genotype and cancer may h...,Colonic Neoplasms;Colorectal Neoplasms;Neoplas...,Efficacy,Poor,High SE,,,,3,1,2,3.0,,,,44279,


In [12]:
# Find the common values between df1 and df2
common_values = set(df1['PharmGKB_ID']).intersection(df2['PharmGKB_ID'])

# Create DataFrames for matched and non-matched rows in df1
matched_rows_df1 = df1[df1['PharmGKB_ID'].isin(common_values)]
non_matched_rows_df1 = df1[~df1['PharmGKB_ID'].isin(common_values)]

# Add a "Match_Status" column to indicate match status
matched_rows_df1['Match_Status'] = 'Match'
non_matched_rows_df1['Match_Status'] = 'Not_Match'

# Concatenate the matched and non-matched DataFrames
result_df1 = pd.concat([matched_rows_df1, non_matched_rows_df1])

# Reset the index for the result DataFrame
result_df1 = result_df1.reset_index(drop=True)

# Print the result DataFrame
result_df1

Unnamed: 0,PharmGKB_ID,Level,rsID,Gene,Drugs,Phenotype Categories,Phenotype,Pediatric,CHROM,POS,Covered/Not_Covered,Match_Status
0,1446903310,3,rs17109924,LGR5,fluorouracil,Efficacy,Colonic Neoplasms,False,chr12,71584007.0,Covered,Match
1,981202240,3,rs11280056,TYMS,fluorouracil,Efficacy,Colorectal Neoplasms,False,chr18,673444.0,Covered,Match
2,982044867,3,rs9344,CCND1,fluorouracil,Efficacy,Colonic Neoplasms,False,chr11,69648142.0,Covered,Match
3,769171387,3,rs1695,GSTP1,fluorouracil,Toxicity,Rectal Neoplasms,False,chr11,67585218.0,Covered,Match
4,981345277,3,rs25487,XRCC1,fluorouracil,Efficacy,Colonic Neoplasms; Colorectal Neoplasms; Neopl...,False,chr19,43551574.0,Covered,Match
...,...,...,...,...,...,...,...,...,...,...,...,...
79,1447983184,3,rs370457585,NT5C1A,cladribine; fluorouracil; gemcitabine,Other,,False,chr1,39661180.0,Covered,Not_Match
80,1447982825,3,rs374150125,NT5C1A,cladribine; fluorouracil; gemcitabine,Other,,False,chr1,39663330.0,Covered,Not_Match
81,1447982818,3,rs201045130,NT5C1A,cladribine; fluorouracil; gemcitabine,Other,,False,chr1,39659467.0,Covered,Not_Match
82,1444666032,3,UGT1A7*3,UGT1A7,irinotecan; oxaliplatin; tegafur / gimeracil /...,Toxicity,Colorectal Neoplasms,False,,,,Not_Match


In [13]:
# Find the common values between df1 and df2
common_values = set(df2['PharmGKB_ID']).intersection(df1['PharmGKB_ID'])

# Create DataFrames for matched and non-matched rows in df1
matched_rows_df2 = df2[df2['PharmGKB_ID'].isin(common_values)]
non_matched_rows_df2 = df2[~df2['PharmGKB_ID'].isin(common_values)]

# Add a "Match_Status" column to indicate match status
matched_rows_df2['Match_Status'] = 'Match'
non_matched_rows_df2['Match_Status'] = 'Not_Match'

# Concatenate the matched and non-matched DataFrames
result_df2 = pd.concat([matched_rows_df2, non_matched_rows_df2])

# Reset the index for the result DataFrame
result_df2 = result_df2.reset_index(drop=True)

# Print the result DataFrame
result_df2

Unnamed: 0,Profile (Header),Category(Class of Drugs),Chromosome,Gene,Variant(rsID),is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function,Match_Status
0,Colorectal Neoplasms,Neoplasms,chr18,TYMS,rs11280056,Covered,-,TTTAAAG>T,TTAAAGTTA/TTAAAGTTA,Wild type,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981202240,Patients with the TTAAAGTTA/TTAAAGTTA genotype...,Colorectal Neoplasms,Efficacy,Good,,,,,3,1,1,3.50,,Tier 1 VIP,,44279,,Match
1,Colorectal Neoplasms,Neoplasms,chr18,TYMS,rs11280056,Covered,-,TTTAAAG>T,TTAAAGTTA/del,Heterozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981202240,Patients with the TTAAAGTTA/del genotype who a...,Colorectal Neoplasms,Efficacy,Poor,,,,,3,1,1,3.50,,Tier 1 VIP,,44279,,Match
2,Colorectal Neoplasms,Neoplasms,chr18,TYMS,rs11280056,Covered,-,TTTAAAG>T,del/del,Homozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981202240,Patients with the del/del genotype who also ha...,Colorectal Neoplasms,Efficacy,Poor,,,,,3,1,1,3.50,,Tier 1 VIP,,44279,,Match
3,Colorectal Neoplasms,Neoplasms,chr19,XRCC1,rs25487,Covered,-,"T>C,G",CC,Homozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981345277,Patients with the CC genotype and cancer may h...,Colonic Neoplasms;Colorectal Neoplasms;Neoplas...,Efficacy,Good,Low SE,,,,3,1,2,3.00,,,,44279,,Match
4,Colorectal Neoplasms,Neoplasms,chr19,XRCC1,rs25487,Covered,-,"T>C,G",CT,Heterozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981345277,Patients with the CT genotype and cancer may h...,Colonic Neoplasms;Colorectal Neoplasms;Neoplas...,Efficacy,Poor,High SE,,,,3,1,2,3.00,,,,44279,,Match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,Colonic Neoplasms,Neoplasms,chr5,HMGCR,rs12654264,Covered,-,A>T,AT,Heterozygous Mutant,-,Colonic Neoplasms,HMG CoA reductase inhibitors,HMG CoA reductase inhibitors,1183533592,Patients with the AT genotype may have a highe...,Colonic Neoplasms,Other,Intermediate,,,,,3,1,3,5.50,,,,44279,,Not_Match
126,Colonic Neoplasms,Neoplasms,chr5,HMGCR,rs12654264,Covered,-,A>T,TT,Homozygous Mutant,-,Colonic Neoplasms,HMG CoA reductase inhibitors,HMG CoA reductase inhibitors,1183533592,Patients with the TT genotype may have a lower...,Colonic Neoplasms,Other,Poor,,,,,3,1,3,5.50,,,,44279,,Not_Match
127,Rectal Neoplasms,Neoplasms,chr4,EGF,rs4444903,Covered,-,A>G,AA,Wild type,-,Rectal Neoplasms,Cetuximab,Cetuximab,981202598,Patients with the rs4444903 AA genotype may ha...,Colorectal Neoplasms;Rectal Neoplasms,Efficacy,Poor,,,,,3,5,5,3.75,,,,44279,,Not_Match
128,Rectal Neoplasms,Neoplasms,chr4,EGF,rs4444903,Covered,-,A>G,AG,Heterozygous Mutant,-,Rectal Neoplasms,Cetuximab,Cetuximab,981202598,Patients with the rs4444903 AG genotype may ha...,Colorectal Neoplasms;Rectal Neoplasms,Efficacy,Intermediate,,,,,3,5,5,3.75,,,,44279,,Not_Match


In [15]:
result_df1.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Final_new_data_considered.xlsx', index=False)
result_df2.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Final_Ordered_db.xlsx', index=False)

In [30]:
new_df1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Final_new_data_considered.xlsx')
new_df2 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Final_Ordered_db.xlsx')

In [17]:
vcf = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/KHHSPTGPCSP18_vcf_processed.xlsx')
vcf

Unnamed: 0,Gene Name,Rsid,Final Score,Consequence,Variant Consequence Score,Zygosity,Zygosity Score,CLNDN,Clinical Consequence Score,Weightage,Clinical System,Clinical Consequence,Clinical Significance,Clinical Significance Score,CLNREVSTAT,Review Status Score,Clinvar ID,Biotype,Strand,Protein Position and Amino Acid,Codons,Chromosome,Exon,Position,Allele
0,,,downstream gene variant,0,Heterozygous,0,,0,,,,0,,0,,protein_coding,-1.0,,,chr1,,685694,"[ T , C]",MODIFIER,rs1553142294&COSV60458873
1,,,"intron variant, non coding transcript variant",0,Heterozygous,0,,0,,,,0,,0,,processed_transcript,-1.0,,,chr1,,685694,"[ T , C]",MODIFIER,rs1553142294&COSV60458873
2,,,"intron variant, non coding transcript variant",0,Heterozygous,0,,0,,,,0,,0,,processed_transcript,-1.0,,,chr1,,685694,"[ T , C]",MODIFIER,rs1553142294&COSV60458873
3,,,"intron variant, non coding transcript variant",0,Heterozygous,0,,0,,,,0,,0,,processed_transcript,-1.0,,,chr1,,685694,"[ T , C]",MODIFIER,rs1553142294&COSV60458873
4,,,"intron variant, non coding transcript variant",0,Heterozygous,0,,0,,,,0,,0,,processed_transcript,-1.0,,,chr1,,685694,"[ T , C]",MODIFIER,rs1553142294&COSV60458873
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354603,NLGN4Y,rs767683335,synonymous variant,0,Homozygous,0,not provided,0,,,Likely benign,0,"criteria provided, single submitter",0,770316.0,protein_coding,1.0,,ccC/ccT,chrY,8/8,14840785,"[ C , T]",LOW,rs767683335&COSV59296056
354604,NLGN4Y,rs767683335,synonymous variant,0,Homozygous,0,not provided,0,,,Likely benign,0,"criteria provided, single submitter",0,770316.0,protein_coding,1.0,,ccC/ccT,chrY,6/6,14840785,"[ C , T]",LOW,rs767683335&COSV59296056
354605,NLGN4Y,rs767683335,non coding transcript exon variant,0,Homozygous,0,not provided,0,,,Likely benign,0,"criteria provided, single submitter",0,770316.0,protein_coding_CDS_not_defined,1.0,,,chrY,9/9,14840785,"[ C , T]",MODIFIER,rs767683335&COSV59296056
354606,NLGN4Y,rs767683335,synonymous variant,0,Homozygous,0,not provided,0,,,Likely benign,0,"criteria provided, single submitter",0,770316.0,protein_coding,1.0,,ccC/ccT,chrY,7/7,14840785,"[ C , T]",LOW,rs767683335&COSV59296056


In [18]:
rsID = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/match_not_match_variants_pharmgkb_ID.xlsx')
rsID

Unnamed: 0,Rsid
0,rs2297595
1,rs115232898
2,TPMT*1; TPMT*3B; TPMT*3C
3,rs11280056
4,rs1801159
...,...
63,rs11692021
64,rs712829
65,rs712830
66,rs112445441


In [20]:
vcf['rsID_Match'] = 'No'

# Iterate through each rsid in df1
for rsid in vcf['Rsid']:
    if isinstance(rsid, str):  # Check if the gene value is a non-null string
        rsid_list = rsid.split(',')  # Split the rsid by comma to create a list
        match = any(rsid in rsID['Rsid'].values for rsid in rsid_list)  # Check if any gene in the list exists in df2
        if match:
            vcf.loc[vcf['Rsid'] == rsid, 'rsID_Match'] = 'Yes'
            
vcf

Unnamed: 0,Gene Name,Rsid,Final Score,Consequence,Variant Consequence Score,Zygosity,Zygosity Score,CLNDN,Clinical Consequence Score,Weightage,Clinical System,Clinical Consequence,Clinical Significance,Clinical Significance Score,CLNREVSTAT,Review Status Score,Clinvar ID,Biotype,Strand,Protein Position and Amino Acid,Codons,Chromosome,Exon,Position,Allele,rsID_Match
0,,,downstream gene variant,0,Heterozygous,0,,0,,,,0,,0,,protein_coding,-1.0,,,chr1,,685694,"[ T , C]",MODIFIER,rs1553142294&COSV60458873,No
1,,,"intron variant, non coding transcript variant",0,Heterozygous,0,,0,,,,0,,0,,processed_transcript,-1.0,,,chr1,,685694,"[ T , C]",MODIFIER,rs1553142294&COSV60458873,No
2,,,"intron variant, non coding transcript variant",0,Heterozygous,0,,0,,,,0,,0,,processed_transcript,-1.0,,,chr1,,685694,"[ T , C]",MODIFIER,rs1553142294&COSV60458873,No
3,,,"intron variant, non coding transcript variant",0,Heterozygous,0,,0,,,,0,,0,,processed_transcript,-1.0,,,chr1,,685694,"[ T , C]",MODIFIER,rs1553142294&COSV60458873,No
4,,,"intron variant, non coding transcript variant",0,Heterozygous,0,,0,,,,0,,0,,processed_transcript,-1.0,,,chr1,,685694,"[ T , C]",MODIFIER,rs1553142294&COSV60458873,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354603,NLGN4Y,rs767683335,synonymous variant,0,Homozygous,0,not provided,0,,,Likely benign,0,"criteria provided, single submitter",0,770316.0,protein_coding,1.0,,ccC/ccT,chrY,8/8,14840785,"[ C , T]",LOW,rs767683335&COSV59296056,No
354604,NLGN4Y,rs767683335,synonymous variant,0,Homozygous,0,not provided,0,,,Likely benign,0,"criteria provided, single submitter",0,770316.0,protein_coding,1.0,,ccC/ccT,chrY,6/6,14840785,"[ C , T]",LOW,rs767683335&COSV59296056,No
354605,NLGN4Y,rs767683335,non coding transcript exon variant,0,Homozygous,0,not provided,0,,,Likely benign,0,"criteria provided, single submitter",0,770316.0,protein_coding_CDS_not_defined,1.0,,,chrY,9/9,14840785,"[ C , T]",MODIFIER,rs767683335&COSV59296056,No
354606,NLGN4Y,rs767683335,synonymous variant,0,Homozygous,0,not provided,0,,,Likely benign,0,"criteria provided, single submitter",0,770316.0,protein_coding,1.0,,ccC/ccT,chrY,7/7,14840785,"[ C , T]",LOW,rs767683335&COSV59296056,No


In [22]:
x = vcf[vcf['rsID_Match'] == 'Yes']
x = x[['Rsid', 'Variant Consequence Score', 'rsID_Match']]
x.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/rsid_zygosity.xlsx', index = False)

In [23]:
vcf.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/KHHSPTGPCSP18_vcf_processed_rsid_match.xlsx', index=False)

In [24]:
new_df1.head()

Unnamed: 0,PharmGKB_ID,Matched_Status,Level,rsID,Gene,Drugs,Phenotype Categories,Phenotype,Pediatric,CHROM,POS,Covered/Not_Covered
0,1446903310,Match,3,rs17109924,LGR5,fluorouracil,Efficacy,Colonic Neoplasms,False,chr12,71584007.0,Covered
1,981202240,Match,3,rs11280056,TYMS,fluorouracil,Efficacy,Colorectal Neoplasms,False,chr18,673444.0,Covered
2,982044867,Match,3,rs9344,CCND1,fluorouracil,Efficacy,Colonic Neoplasms,False,chr11,69648142.0,Covered
3,769171387,Match,3,rs1695,GSTP1,fluorouracil,Toxicity,Rectal Neoplasms,False,chr11,67585218.0,Covered
4,981345277,Match,3,rs25487,XRCC1,fluorouracil,Efficacy,Colonic Neoplasms; Colorectal Neoplasms; Neopl...,False,chr19,43551574.0,Covered


In [25]:
new_df2.head()

Unnamed: 0,Profile (Header),Category(Class of Drugs),Chromosome,Gene,Variant(rsID),is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID,Matched_Status,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function
0,Colorectal Neoplasms,Neoplasms,chr18,TYMS,rs11280056,Covered,-,TTTAAAG>T,TTAAAGTTA/TTAAAGTTA,Wild type,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981202240,Match,Patients with the TTAAAGTTA/TTAAAGTTA genotype...,Colorectal Neoplasms,Efficacy,Good,,,,,3,1,1,3.5,,Tier 1 VIP,,44279,
1,Colorectal Neoplasms,Neoplasms,chr18,TYMS,rs11280056,Covered,-,TTTAAAG>T,TTAAAGTTA/del,Heterozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981202240,Match,Patients with the TTAAAGTTA/del genotype who a...,Colorectal Neoplasms,Efficacy,Poor,,,,,3,1,1,3.5,,Tier 1 VIP,,44279,
2,Colorectal Neoplasms,Neoplasms,chr18,TYMS,rs11280056,Covered,-,TTTAAAG>T,del/del,Homozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981202240,Match,Patients with the del/del genotype who also ha...,Colorectal Neoplasms,Efficacy,Poor,,,,,3,1,1,3.5,,Tier 1 VIP,,44279,
3,Colorectal Neoplasms,Neoplasms,chr19,XRCC1,rs25487,Covered,-,"T>C,G",CC,Homozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981345277,Match,Patients with the CC genotype and cancer may h...,Colonic Neoplasms;Colorectal Neoplasms;Neoplas...,Efficacy,Good,Low SE,,,,3,1,2,3.0,,,,44279,
4,Colorectal Neoplasms,Neoplasms,chr19,XRCC1,rs25487,Covered,-,"T>C,G",CT,Heterozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981345277,Match,Patients with the CT genotype and cancer may h...,Colonic Neoplasms;Colorectal Neoplasms;Neoplas...,Efficacy,Poor,High SE,,,,3,1,2,3.0,,,,44279,


In [27]:
rsid_zygo = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/rsid_zygosity.xlsx')
rsid_zygo

Unnamed: 0,rsID,Matched_zygosity
0,rs1801131,Homozygous
1,rs2072671,Heterozygous
2,rs1801274,Homozygous
3,rs717620,Heterozygous
4,rs9344,Heterozygous
5,rs2306283,Homozygous
6,rs25487,Homozygous
7,rs13181,Heterozygous
8,rs11615,Heterozygous
9,rs1051266,Heterozygous


In [28]:
matched_zyo_new = pd.merge(new_df1, rsid_zygo, on = 'rsID', how = 'left', sort=False)
matched_zyo_new

Unnamed: 0,PharmGKB_ID,Matched_Status,Level,rsID,Gene,Drugs,Phenotype Categories,Phenotype,Pediatric,CHROM,POS,Covered/Not_Covered,Matched_zygosity
0,1446903310,Match,3,rs17109924,LGR5,fluorouracil,Efficacy,Colonic Neoplasms,False,chr12,71584007.0,Covered,
1,981202240,Match,3,rs11280056,TYMS,fluorouracil,Efficacy,Colorectal Neoplasms,False,chr18,673444.0,Covered,
2,982044867,Match,3,rs9344,CCND1,fluorouracil,Efficacy,Colonic Neoplasms,False,chr11,69648142.0,Covered,Heterozygous
3,769171387,Match,3,rs1695,GSTP1,fluorouracil,Toxicity,Rectal Neoplasms,False,chr11,67585218.0,Covered,
4,981345277,Match,3,rs25487,XRCC1,fluorouracil,Efficacy,Colonic Neoplasms; Colorectal Neoplasms; Neopl...,False,chr19,43551574.0,Covered,Homozygous
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,1447983184,Not_Match,3,rs370457585,NT5C1A,cladribine; fluorouracil; gemcitabine,Other,,False,chr1,39661180.0,Covered,
80,1447982825,Not_Match,3,rs374150125,NT5C1A,cladribine; fluorouracil; gemcitabine,Other,,False,chr1,39663330.0,Covered,
81,1447982818,Not_Match,3,rs201045130,NT5C1A,cladribine; fluorouracil; gemcitabine,Other,,False,chr1,39659467.0,Covered,
82,1444666032,Not_Match,3,UGT1A7*3,UGT1A7,irinotecan; oxaliplatin; tegafur / gimeracil /...,Toxicity,Colorectal Neoplasms,False,,,,


In [31]:
matched_zyo_ordered = pd.merge(new_df2, rsid_zygo, on = 'rsID', how = 'left', sort=False)
matched_zyo_ordered

Unnamed: 0,Profile (Header),Category(Class of Drugs),Chromosome,Gene,rsID,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID,Matched_Status,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function,Matched_zygosity
0,Colorectal Neoplasms,Neoplasms,chr18,TYMS,rs11280056,Covered,-,TTTAAAG>T,TTAAAGTTA/TTAAAGTTA,Wild type,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981202240,Match,Patients with the TTAAAGTTA/TTAAAGTTA genotype...,Colorectal Neoplasms,Efficacy,Good,,,,,3,1,1,3.50,,Tier 1 VIP,,44279,,
1,Colorectal Neoplasms,Neoplasms,chr18,TYMS,rs11280056,Covered,-,TTTAAAG>T,TTAAAGTTA/del,Heterozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981202240,Match,Patients with the TTAAAGTTA/del genotype who a...,Colorectal Neoplasms,Efficacy,Poor,,,,,3,1,1,3.50,,Tier 1 VIP,,44279,,
2,Colorectal Neoplasms,Neoplasms,chr18,TYMS,rs11280056,Covered,-,TTTAAAG>T,del/del,Homozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981202240,Match,Patients with the del/del genotype who also ha...,Colorectal Neoplasms,Efficacy,Poor,,,,,3,1,1,3.50,,Tier 1 VIP,,44279,,
3,Colorectal Neoplasms,Neoplasms,chr19,XRCC1,rs25487,Covered,-,"T>C,G",CC,Homozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981345277,Match,Patients with the CC genotype and cancer may h...,Colonic Neoplasms;Colorectal Neoplasms;Neoplas...,Efficacy,Good,Low SE,,,,3,1,2,3.00,,,,44279,,Homozygous
4,Colorectal Neoplasms,Neoplasms,chr19,XRCC1,rs25487,Covered,-,"T>C,G",CT,Heterozygous Mutant,-,Colorectal Neoplasms,Fluorouracil,Fluorouracil,981345277,Match,Patients with the CT genotype and cancer may h...,Colonic Neoplasms;Colorectal Neoplasms;Neoplas...,Efficacy,Poor,High SE,,,,3,1,2,3.00,,,,44279,,Homozygous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,Colonic Neoplasms,Neoplasms,chr5,HMGCR,rs12654264,Covered,-,A>T,AT,Heterozygous Mutant,-,Colonic Neoplasms,HMG CoA reductase inhibitors,HMG CoA reductase inhibitors,1183533592,Not_Match,Patients with the AT genotype may have a highe...,Colonic Neoplasms,Other,Intermediate,,,,,3,1,3,5.50,,,,44279,,Heterozygous
126,Colonic Neoplasms,Neoplasms,chr5,HMGCR,rs12654264,Covered,-,A>T,TT,Homozygous Mutant,-,Colonic Neoplasms,HMG CoA reductase inhibitors,HMG CoA reductase inhibitors,1183533592,Not_Match,Patients with the TT genotype may have a lower...,Colonic Neoplasms,Other,Poor,,,,,3,1,3,5.50,,,,44279,,Heterozygous
127,Rectal Neoplasms,Neoplasms,chr4,EGF,rs4444903,Covered,-,A>G,AA,Wild type,-,Rectal Neoplasms,Cetuximab,Cetuximab,981202598,Not_Match,Patients with the rs4444903 AA genotype may ha...,Colorectal Neoplasms;Rectal Neoplasms,Efficacy,Poor,,,,,3,5,5,3.75,,,,44279,,Homozygous
128,Rectal Neoplasms,Neoplasms,chr4,EGF,rs4444903,Covered,-,A>G,AG,Heterozygous Mutant,-,Rectal Neoplasms,Cetuximab,Cetuximab,981202598,Not_Match,Patients with the rs4444903 AG genotype may ha...,Colorectal Neoplasms;Rectal Neoplasms,Efficacy,Intermediate,,,,,3,5,5,3.75,,,,44279,,Homozygous


In [32]:
matched_zyo_new.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/matched_rsid_Final_new_data_considered.xlsx', index=False)
matched_zyo_ordered.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/matched_rsid_Final_Ordered_db.xlsx', index=False)

# main file zygosity

In [8]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Main_pharmgkb_data.xlsx')
df.head(4)

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,Variant,CHROM,POS,REF,ALT,Covered/Not_Covered,Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,981755803,AA,Patients with the rs75527207 AA genotype (two ...,,rs75527207,chr7,117587806.0,G,A,Covered,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
1,981755803,AG,Patients with the rs75527207 AG genotype (one ...,,rs75527207,chr7,117587806.0,G,A,Covered,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
2,981755803,GG,Patients with the rs75527207 GG genotype (do n...,,rs75527207,chr7,117587806.0,G,A,Covered,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
3,1449311190,CC,Patients with the CC genotype and Precursor Ce...,,rs4149056,chr12,21178615.0,T,C,Covered,,SLCO1B1,3,,Tier 1 VIP,2.0,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric


In [9]:
# Function to determine Zygosity
def determine_zygosity(row):
    # Skip rows with null values
    if pd.isnull(row['Genotype/Allele']) or pd.isnull(row['REF']) or pd.isnull(row['ALT']):
        return 'Unknown'

    if ',' in row['ALT']:
        # If ALT has a comma, take the first part
        alt = row['ALT'].split(',')[0]
    else:
        alt = row['ALT']

    if row['Genotype/Allele'] == row['REF'] + row['REF']:
        return 'Wildtype'
    elif row['Genotype/Allele'] == row['REF'] + alt or row['Genotype/Allele'] == alt + row['REF']:
        return 'Heterozygous'
    elif row['Genotype/Allele'] == alt + alt:
        return 'Homozygous'
    else:
        return 'Unknown'

# Apply the function to create the Zygosity column
df['Zygosity'] = df.apply(determine_zygosity, axis=1)
df

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,Variant,CHROM,POS,REF,ALT,Covered/Not_Covered,Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population,Zygosity
0,981755803,AA,Patients with the rs75527207 AA genotype (two ...,,rs75527207,chr7,117587806.0,G,A,Covered,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Homozygous
1,981755803,AG,Patients with the rs75527207 AG genotype (one ...,,rs75527207,chr7,117587806.0,G,A,Covered,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Heterozygous
2,981755803,GG,Patients with the rs75527207 GG genotype (do n...,,rs75527207,chr7,117587806.0,G,A,Covered,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Wildtype
3,1449311190,CC,Patients with the CC genotype and Precursor Ce...,,rs4149056,chr12,21178615.0,T,C,Covered,,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,Homozygous
4,1449311190,CT,Patients with the CT genotype and Precursor Ce...,,rs4149056,chr12,21178615.0,T,C,Covered,,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,Heterozygous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,1449000354,*64,Patients with the CYP2D6*64 allele may have de...,Uncertain function,,,,,,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown
15649,1449000354,*65,Patients with the CYP2D6*65 allele may have de...,Uncertain function,,,,,,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown
15650,1449000354,*70,Patients with the CYP2D6*70 allele may have de...,Uncertain function,,,,,,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown
15651,1449000354,*71,Patients with the CYP2D6*71 allele may have de...,Uncertain function,,,,,,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown


In [10]:
df.Zygosity.value_counts()

Unknown         6559
Wildtype        3692
Homozygous      2704
Heterozygous    2698
Name: Zygosity, dtype: int64

In [11]:
df[df['Clinical Annotation ID'] == 982036328]

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,Variant,CHROM,POS,REF,ALT,Covered/Not_Covered,Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population,Zygosity
1145,982036328,ATACAGTCACTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTC...,Patients with the ATACAGTCACTTTTTTTTTTTTTTTGAG...,,rs1799752,chr17,63488543.0,T,TTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCATACAGTCA...,Covered,,ACE,3,,Tier 1 VIP,2.25,Efficacy,15121491,1,1,spironolactone,Heart Failure,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,,Unknown
1146,982036328,ATACAGTCACTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTC...,Patients with the ATACAGTCACTTTTTTTTTTTTTTTGAG...,,rs1799752,chr17,63488543.0,T,TTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCATACAGTCA...,Covered,,ACE,3,,Tier 1 VIP,2.25,Efficacy,15121491,1,1,spironolactone,Heart Failure,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,,Unknown
1147,982036328,del/del,Patients with the del/del genotype may have le...,,rs1799752,chr17,63488543.0,T,TTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCATACAGTCA...,Covered,,ACE,3,,Tier 1 VIP,2.25,Efficacy,15121491,1,1,spironolactone,Heart Failure,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,,Unknown


In [12]:
df.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/zygosity.xlsx', index=False)

## Dummy data

In [34]:
data = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/pharmgkb_variants_VR_output.vcf', sep = '\t', comment='#', header = None)
data.columns= ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'Filter', 'INFO']
data = data[~data['CHROM'].str.startswith('LRG_')]
data['CHROM'] = 'chr' + data['CHROM']
data

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,Filter,INFO
0,chr7,117587806,rs75527207,G,A,.,.,"VARID=rs75527207,CM980337,CM900053,CD910491;VC..."
2,chr12,21178615,rs4149056,T,C,.,.,"VARID=rs4149056,CM043777,COSV57010105;VCF=12-2..."
4,chr6,154039662,rs1799971,A,G,.,.,"VARID=rs1799971,COSV57673061,CM003770;VCF=6-15..."
6,chr7,117606695,rs141033578,C,G,.,.,"VARID=rs141033578,CM972956;VCF=7-117606695-C-G;"
8,chr7,117606695,rs141033578,C,T,.,.,"VARID=rs141033578,CM972956;VCF=7-117606695-C-T;"
...,...,...,...,...,...,...,...,...
5485,chr8,63038892,rs11545076,A,C,.,.,"VARID=rs11545076,CR035861;VCF=8-63038892-A-C;"
5486,chr14,90397013,rs12885713,C,G,.,.,"VARID=rs12885713,CR051276;VCF=14-90397013-C-G;"
5487,chr14,90397013,rs12885713,C,A,.,.,"VARID=rs12885713,CR051276;VCF=14-90397013-C-A;"
5488,chr14,90397013,rs12885713,C,T,.,.,"VARID=rs12885713,CR051276;VCF=14-90397013-C-T;"


In [36]:
grouped = data.groupby(['rsID', 'CHROM', 'POS', 'REF'])['ALT'].agg(ALT=lambda x: ','.join(set(x))).reset_index()
grouped

Unnamed: 0,rsID,CHROM,POS,REF,ALT
0,rs10007051,chr4,129244309,C,T
1,rs1000940,chr17,5379957,A,"T,G,C"
2,rs1001179,chr11,34438684,C,T
3,rs10011796,chr4,88169725,T,"G,C"
4,rs10012,chr2,38075247,G,C
...,...,...,...,...,...
2788,rs9973653,chr2,46320970,G,T
2789,rs9977268,chr21,45487373,C,T
2790,rs997917,chr8,53239818,T,C
2791,rs9981861,chr21,40043117,T,C


In [40]:
grouped.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Annotated_vep_data.xlsx', index = False)

In [38]:
duplicates = grouped[grouped.duplicated(subset=['rsID'], keep=False)]
duplicates

Unnamed: 0,rsID,CHROM,POS,REF,ALT
2287,rs6973474,chr7,96804,C,T
2288,rs6973474,chrHSCHR7_1_CTG1,96804,C,T
2289,rs6973474,chrHSCHR7_2_CTG1,99125,C,T


In [39]:
grouped.CHROM.value_counts()

chr6                275
chr1                261
chr19               205
chr7                201
chr2                183
chr12               156
chr10               150
chr4                145
chr16               140
chr11               137
chr3                134
chr5                121
chr9                101
chr17                90
chr15                88
chr8                 75
chr22                63
chr20                52
chr14                48
chr13                48
chrX                 39
chr21                36
chr18                34
chrMT                 9
chrHSCHR7_1_CTG1      1
chrHSCHR7_2_CTG1      1
Name: CHROM, dtype: int64

In [43]:
df_1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Annotated_vep_data.xlsx')
df_1

Unnamed: 0,Variant,CHROM,POS,REF,ALT
0,rs10007051,chr4,129244309,C,T
1,rs1000940,chr17,5379957,A,"T,G,C"
2,rs1001179,chr11,34438684,C,T
3,rs10011796,chr4,88169725,T,"G,C"
4,rs10012,chr2,38075247,G,C
...,...,...,...,...,...
2786,rs9973653,chr2,46320970,G,T
2787,rs9977268,chr21,45487373,C,T
2788,rs997917,chr8,53239818,T,C
2789,rs9981861,chr21,40043117,T,C


In [46]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Pharmgkb_database_Updated_version4.xlsx')
df

Unnamed: 0,Covered/Not_Covered,Gene,Variant,Haplotypes,Genotype/Allele,Allele Function,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,Covered,CFTR,rs75527207,,AA,,Patients with the rs75527207 AA genotype (two ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
1,Covered,CFTR,rs75527207,,AG,,Patients with the rs75527207 AG genotype (one ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
2,Covered,CFTR,rs75527207,,GG,,Patients with the rs75527207 GG genotype (do n...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
3,Covered,SLCO1B1,rs4149056,,CC,,Patients with the CC genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
4,Covered,SLCO1B1,rs4149056,,CT,,Patients with the CT genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*64,Uncertain function,Patients with the CYP2D6*64 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15649,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*65,Uncertain function,Patients with the CYP2D6*65 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15650,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*70,Uncertain function,Patients with the CYP2D6*70 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15651,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*71,Uncertain function,Patients with the CYP2D6*71 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [45]:
merged_df = pd.merge(df, df_1, on = 'Variant', how = 'left', sort = False)
merged_df

Unnamed: 0,Covered/Not_Covered,Gene,Variant,Haplotypes,Genotype/Allele,Allele Function,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population,CHROM,POS,REF,ALT
0,Covered,CFTR,rs75527207,,AA,,Patients with the rs75527207 AA genotype (two ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,chr7,117587806.0,G,A
1,Covered,CFTR,rs75527207,,AG,,Patients with the rs75527207 AG genotype (one ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,chr7,117587806.0,G,A
2,Covered,CFTR,rs75527207,,GG,,Patients with the rs75527207 GG genotype (do n...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,chr7,117587806.0,G,A
3,Covered,SLCO1B1,rs4149056,,CC,,Patients with the CC genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,chr12,21178615.0,T,C
4,Covered,SLCO1B1,rs4149056,,CT,,Patients with the CT genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,chr12,21178615.0,T,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*64,Uncertain function,Patients with the CYP2D6*64 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,,,,
15649,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*65,Uncertain function,Patients with the CYP2D6*65 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,,,,
15650,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*70,Uncertain function,Patients with the CYP2D6*70 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,,,,
15651,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*71,Uncertain function,Patients with the CYP2D6*71 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,,,,


# main zygo code

In [47]:
import pandas as pd

def determine_zygosity(row):
    # Skip rows with null values
    if pd.isnull(row['Genotype/Allele']) or pd.isnull(row['REF']) or pd.isnull(row['ALT']):
        return 'Unknown'

    ref = row['REF']
    genotype = row['Genotype/Allele']
    alts = row['ALT'].split(',')

    for alt in alts:
        if genotype == ref + ref:
            return 'Wildtype'
        elif genotype == alt + alt:
            return 'Homozygous'
        elif genotype == ref + alt or genotype == alt + ref:
            return 'Heterozygous'

    return 'Unknown'

# Apply the function to create the Zygosity column
merged_df['Zygosity'] = merged_df.apply(determine_zygosity, axis=1)
merged_df

Unnamed: 0,Covered/Not_Covered,Gene,Variant,Haplotypes,Genotype/Allele,Allele Function,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population,CHROM,POS,REF,ALT,Zygosity
0,Covered,CFTR,rs75527207,,AA,,Patients with the rs75527207 AA genotype (two ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,chr7,117587806.0,G,A,Homozygous
1,Covered,CFTR,rs75527207,,AG,,Patients with the rs75527207 AG genotype (one ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,chr7,117587806.0,G,A,Heterozygous
2,Covered,CFTR,rs75527207,,GG,,Patients with the rs75527207 GG genotype (do n...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,chr7,117587806.0,G,A,Wildtype
3,Covered,SLCO1B1,rs4149056,,CC,,Patients with the CC genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,chr12,21178615.0,T,C,Homozygous
4,Covered,SLCO1B1,rs4149056,,CT,,Patients with the CT genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,chr12,21178615.0,T,C,Heterozygous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*64,Uncertain function,Patients with the CYP2D6*64 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,,,,,Unknown
15649,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*65,Uncertain function,Patients with the CYP2D6*65 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,,,,,Unknown
15650,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*70,Uncertain function,Patients with the CYP2D6*70 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,,,,,Unknown
15651,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*71,Uncertain function,Patients with the CYP2D6*71 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,,,,,Unknown


In [50]:
df_1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/new_data/Pharmgkb_database_Updated_version4.xlsx')
df_1

Unnamed: 0,CHROM,POS,Gene,Variant,Haplotypes,Genotype/Allele,REF,ALT,Allele Function,Zygosity,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,chr7,117587806.0,CFTR,rs75527207,,AA,G,A,,Homozygous,Patients with the rs75527207 AA genotype (two ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
1,chr7,117587806.0,CFTR,rs75527207,,AG,G,A,,Heterozygous,Patients with the rs75527207 AG genotype (one ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
2,chr7,117587806.0,CFTR,rs75527207,,GG,G,A,,Wildtype,Patients with the rs75527207 GG genotype (do n...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
3,chr12,21178615.0,SLCO1B1,rs4149056,,CC,T,C,,Homozygous,Patients with the CC genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
4,chr12,21178615.0,SLCO1B1,rs4149056,,CT,T,C,,Heterozygous,Patients with the CT genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*64,,,Uncertain function,Unknown,Patients with the CYP2D6*64 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15649,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*65,,,Uncertain function,Unknown,Patients with the CYP2D6*65 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15650,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*70,,,Uncertain function,Unknown,Patients with the CYP2D6*70 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15651,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*71,,,Uncertain function,Unknown,Patients with the CYP2D6*71 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [51]:
import pandas as pd
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KAPA HyperExome_hg38_capture_targets (1).bed', sep = '\t', header = None, error_bad_lines=False)
df.columns = ['chromosome', 'Start_pos', 'End_pos', 'INFO']
df['Extended_Start_pos'] = df['Start_pos'] - 20
df['Extended_End_pos'] = df['End_pos'] + 20
df['Gene'] = df['INFO'].str.extract(r'gene_symbol=([^;]+)')
df['Gene'] = df['Gene'].str.split(',').str[0]
df = df[['chromosome', 'Extended_Start_pos', 'Extended_End_pos', 'Gene']]
df

Unnamed: 0,chromosome,Extended_Start_pos,Extended_End_pos,Gene
0,chr1,65489,65649,OR4F5
1,chr1,69007,70037,OR4F5
2,chr1,450710,451706,OR4F29
3,chr1,685686,686682,OR4F16
4,chr1,924401,924977,SAMD11
...,...,...,...,...
208906,chrY,25038781,25038941,BPY2C
208907,chrY,25041746,25041906,BPY2C
208908,chrY,25043888,25044048,BPY2C
208909,chrY,25622413,25624093,CDY1


In [53]:
# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in data
df_1['Covered/Not_Covered'] = df_1.apply(check_coverage, axis=1)
df_1

Unnamed: 0,CHROM,POS,Gene,Variant,Haplotypes,Genotype/Allele,REF,ALT,Allele Function,Zygosity,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population,Covered/Not_Covered
0,chr7,117587806.0,CFTR,rs75527207,,AA,G,A,,Homozygous,Patients with the rs75527207 AA genotype (two ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Covered
1,chr7,117587806.0,CFTR,rs75527207,,AG,G,A,,Heterozygous,Patients with the rs75527207 AG genotype (one ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Covered
2,chr7,117587806.0,CFTR,rs75527207,,GG,G,A,,Wildtype,Patients with the rs75527207 GG genotype (do n...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Covered
3,chr12,21178615.0,SLCO1B1,rs4149056,,CC,T,C,,Homozygous,Patients with the CC genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,Covered
4,chr12,21178615.0,SLCO1B1,rs4149056,,CT,T,C,,Heterozygous,Patients with the CT genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,Covered
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*64,,,Uncertain function,Unknown,Patients with the CYP2D6*64 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Not_Covered
15649,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*65,,,Uncertain function,Unknown,Patients with the CYP2D6*65 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Not_Covered
15650,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*70,,,Uncertain function,Unknown,Patients with the CYP2D6*70 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Not_Covered
15651,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*71,,,Uncertain function,Unknown,Patients with the CYP2D6*71 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Not_Covered


In [54]:
df_1.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/new_data/Pharmgkb_database_Updated_version4.xlsx', index=False)

In [6]:
df[df['Zygosity_new'] == 'Unknown']

Unnamed: 0,Covered/Not_Covered,CHROM,POS,Gene,Variant,Haplotypes,Genotype/Allele,REF,ALT,Allele Function,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population,Zygosity_new
21,,,,HLA-B,,HLA-B*15:02,*15:02,,,Presence,Patients with one or two copies of the HLA-B*1...,981419266,1A,,Tier 1 VIP,315.750,Toxicity,"18637831, 20235791, 21216202, 22500513, 236924...",18,23,phenytoin,drug reaction with eosinophilia and systemic s...,2022-06-22,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Unknown
22,,,,CYP2D6,,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",*1,,,Normal function,The CYP2D6*1 allele is assigned as a normal fu...,1451259580,1A,,Tier 1 VIP,211.375,Toxicity,"28350251, 18070221, 21614669, 15590749, 170088...",6,9,amitriptyline,Depressive Disorder,2021-04-23,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown
23,,,,CYP2D6,,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",*1xN,,,Increased function,The CYP2D6*1xN alleles (*1x2 and *1x≥3) have b...,1451259580,1A,,Tier 1 VIP,211.375,Toxicity,"28350251, 18070221, 21614669, 15590749, 170088...",6,9,amitriptyline,Depressive Disorder,2021-04-23,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown
24,,,,CYP2D6,,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",*2,,,Normal function,The CYP2D6*2 allele is assigned as a normal fu...,1451259580,1A,,Tier 1 VIP,211.375,Toxicity,"28350251, 18070221, 21614669, 15590749, 170088...",6,9,amitriptyline,Depressive Disorder,2021-04-23,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown
25,,,,CYP2D6,,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2...",*3,,,No function,The CYP2D6*3 allele is assigned as a no functi...,1451259580,1A,,Tier 1 VIP,211.375,Toxicity,"28350251, 18070221, 21614669, 15590749, 170088...",6,9,amitriptyline,Depressive Disorder,2021-04-23,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*64,,,Uncertain function,Patients with the CYP2D6*64 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown
15649,,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*65,,,Uncertain function,Patients with the CYP2D6*65 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown
15650,,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*70,,,Uncertain function,Patients with the CYP2D6*70 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown
15651,,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*71,,,Uncertain function,Patients with the CYP2D6*71 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown


In [7]:
df.Zygosity_new.value_counts(dropna=False)

Unknown         4496
Homozygous      3741
Heterozygous    3724
Wildtype        3692
Name: Zygosity_new, dtype: int64

In [13]:
# Function to determine Zygosity
def determine_zygosity(row):
    # Skip rows with null values
    if pd.isnull(row['Genotype/Allele']) or pd.isnull(row['REF']) or pd.isnull(row['ALT']):
        return 'Unknown'

    if ',' in row['ALT']:
        # If ALT has a comma, take the first part
        alt = row['ALT'].split(',')[0]
    else:
        alt = row['ALT']

    if row['Genotype/Allele'] == row['REF'] + row['REF']:
        return 'Wildtype'
    elif row['Genotype/Allele'] == alt + alt:
        return 'Homozygous'
    elif row['Genotype/Allele'] == row['REF'] + alt or row['Genotype/Allele'] == alt + row['REF']:
        return 'Heterozygous'
    else:
        return 'Unknown'

# Apply the function to create the Zygosity column
df['Zygosity_new'] = df.apply(determine_zygosity, axis=1)
df

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,Variant,CHROM,POS,REF,ALT,Covered/Not_Covered,Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population,Zygosity,Zygosity_new
0,981755803,AA,Patients with the rs75527207 AA genotype (two ...,,rs75527207,chr7,117587806.0,G,A,Covered,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Homozygous,Homozygous
1,981755803,AG,Patients with the rs75527207 AG genotype (one ...,,rs75527207,chr7,117587806.0,G,A,Covered,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Heterozygous,Heterozygous
2,981755803,GG,Patients with the rs75527207 GG genotype (do n...,,rs75527207,chr7,117587806.0,G,A,Covered,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Wildtype,Wildtype
3,1449311190,CC,Patients with the CC genotype and Precursor Ce...,,rs4149056,chr12,21178615.0,T,C,Covered,,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,Homozygous,Homozygous
4,1449311190,CT,Patients with the CT genotype and Precursor Ce...,,rs4149056,chr12,21178615.0,T,C,Covered,,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,Heterozygous,Heterozygous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,1449000354,*64,Patients with the CYP2D6*64 allele may have de...,Uncertain function,,,,,,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown,Unknown
15649,1449000354,*65,Patients with the CYP2D6*65 allele may have de...,Uncertain function,,,,,,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown,Unknown
15650,1449000354,*70,Patients with the CYP2D6*70 allele may have de...,Uncertain function,,,,,,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown,Unknown
15651,1449000354,*71,Patients with the CYP2D6*71 allele may have de...,Uncertain function,,,,,,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown,Unknown


In [15]:
# Function to determine Zygosity
def determine_zygosity(row):
    # Skip rows with null values
    if pd.isnull(row['Genotype/Allele']) or pd.isnull(row['REF']) or pd.isnull(row['ALT']):
        return 'Unknown'

    if ',' in row['ALT']:
        # If ALT has a comma, count the number of values
        alt_values = row['ALT'].split(',')
        if len(alt_values) == 1:
            # If only one value, consider it as Homozygous
            alt = alt_values[0]
            if row['Genotype/Allele'] == row['REF'] + alt or row['Genotype/Allele'] == alt + row['REF']:
                return 'Homozygous'
        elif len(alt_values) == 2:
            # If two values, consider it as Heterozygous
            alt1, alt2 = alt_values
            if row['Genotype/Allele'] == row['REF'] + alt1 or row['Genotype/Allele'] == alt1 + row['REF'] or \
               row['Genotype/Allele'] == row['REF'] + alt2 or row['Genotype/Allele'] == alt2 + row['REF']:
                return 'Heterozygous'

    if row['Genotype/Allele'] == row['REF'] + row['REF']:
        return 'Wildtype'
    elif row['Genotype/Allele'] == row['ALT'] + row['ALT']:
        return 'Homozygous'
    elif row['Genotype/Allele'] == row['REF'] + row['ALT'] or row['Genotype/Allele'] == row['ALT'] + row['REF']:
        return 'Heterozygous'
    else:
        return 'Unknown'

# Apply the function to create the Zygosity column
df['Zygosity_2'] = df.apply(determine_zygosity, axis=1)
df

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,Variant,CHROM,POS,REF,ALT,Covered/Not_Covered,Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population,Zygosity,Zygosity_new,Zygosity_2
0,981755803,AA,Patients with the rs75527207 AA genotype (two ...,,rs75527207,chr7,117587806.0,G,A,Covered,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Homozygous,Homozygous,Homozygous
1,981755803,AG,Patients with the rs75527207 AG genotype (one ...,,rs75527207,chr7,117587806.0,G,A,Covered,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Heterozygous,Heterozygous,Heterozygous
2,981755803,GG,Patients with the rs75527207 GG genotype (do n...,,rs75527207,chr7,117587806.0,G,A,Covered,,CFTR,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric,Wildtype,Wildtype,Wildtype
3,1449311190,CC,Patients with the CC genotype and Precursor Ce...,,rs4149056,chr12,21178615.0,T,C,Covered,,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,Homozygous,Homozygous,Homozygous
4,1449311190,CT,Patients with the CT genotype and Precursor Ce...,,rs4149056,chr12,21178615.0,T,C,Covered,,SLCO1B1,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric,Heterozygous,Heterozygous,Heterozygous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,1449000354,*64,Patients with the CYP2D6*64 allele may have de...,Uncertain function,,,,,,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown,Unknown,Unknown
15649,1449000354,*65,Patients with the CYP2D6*65 allele may have de...,Uncertain function,,,,,,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown,Unknown,Unknown
15650,1449000354,*70,Patients with the CYP2D6*70 allele may have de...,Uncertain function,,,,,,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown,Unknown,Unknown
15651,1449000354,*71,Patients with the CYP2D6*71 allele may have de...,Uncertain function,,,,,,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",CYP2D6,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,,Unknown,Unknown,Unknown


In [16]:
df.iloc[12]

Clinical Annotation ID                                                     1449191746
Genotype/Allele                                                                    AA
Annotation Text                     Patients with the AA genotype (two copies of t...
Allele Function                                                                   NaN
Variant                                                                    rs78769542
CHROM                                                                            chr7
POS                                                                       117611650.0
REF                                                                                 G
ALT                                                                               C,A
Covered/Not_Covered                                                           Covered
Haplotypes                                                                        NaN
Gene                                                  

In [23]:
import pandas as pd
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/anticancer_pgx/Lung_rsID.xlsx')
df

Unnamed: 0,rsID
0,rs316019
1,rs3740066
2,rs4149015
3,rs1517114
4,rs3115672
5,rs3832043
6,rs7779029
7,rs1695
8,rs2227983
9,rs5877


In [24]:
df_1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/KHAPOLGPTTL15.xlsx')
df_1

Unnamed: 0,rsID,Zygosity
0,rs1473003496,Heterozygous
1,rs1187492588,Heterozygous
2,rs1416222198,Heterozygous
3,rs1260343719,Heterozygous
4,rs2808347,Heterozygous
...,...,...
35873,rs559165,Homozygous
35874,rs306888,Heterozygous
35875,rs2037999,Heterozygous
35876,rs77413923,Heterozygous


In [25]:
zygo_merge = pd.merge(df, df_1, on = 'rsID', how= 'left', sort = False)
zygo_merge

Unnamed: 0,rsID,Zygosity
0,rs316019,Homozygous
1,rs3740066,
2,rs4149015,
3,rs1517114,
4,rs3115672,
5,rs3832043,
6,rs7779029,Heterozygous
7,rs1695,
8,rs2227983,
9,rs5877,


In [18]:
zygo_merge.Zygosity.value_counts(dropna=False)

NaN             38
Heterozygous     5
Homozygous       4
Name: Zygosity, dtype: int64

In [26]:
zygo_merge.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/anticancer_pgx/KHAPOLGPTTL15_rsID_new.xlsx', index=False)

In [9]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/anticancer_pgx/Lung_diseases.xlsx')
df

Unnamed: 0,Phenotype(s)
0,Lung Neoplasms
1,"Adenocarcinoma;Carcinoma, Non-Small-Cell Lung;..."
2,"Carcinoma, Non-Small-Cell Lung"
3,"Carcinoma, Non-Small-Cell Lung;Colorectal Neop..."
4,"Carcinoma, Non-Small-Cell Lung;Neoplasms"
5,"Carcinoma, Non-Small-Cell Lung;Mesothelioma;Pa..."
6,"Carcinoma, Non-Small-Cell Lung;Colorectal Neop..."
7,"Carcinoma, Non-Small-Cell Lung;Mesothelioma"
8,"Carcinoma, Non-Small-Cell Lung;gastrointestina..."
9,"Carcinoma, Non-Small-Cell Lung;pneumonitis"


In [10]:
df_1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Pharmgkb_database_Updated_version4.xlsx')
df_1

Unnamed: 0,Covered/Not_Covered,CHROM,POS,Gene,Variant,Haplotypes,Genotype/Allele,REF,ALT,Allele Function,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,Covered,chr7,117587806.0,CFTR,rs75527207,,AA,G,A,,Patients with the rs75527207 AA genotype (two ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
1,Covered,chr7,117587806.0,CFTR,rs75527207,,AG,G,A,,Patients with the rs75527207 AG genotype (one ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
2,Covered,chr7,117587806.0,CFTR,rs75527207,,GG,G,A,,Patients with the rs75527207 GG genotype (do n...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
3,Covered,chr12,21178615.0,SLCO1B1,rs4149056,,CC,T,C,,Patients with the CC genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
4,Covered,chr12,21178615.0,SLCO1B1,rs4149056,,CT,T,C,,Patients with the CT genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*64,,,Uncertain function,Patients with the CYP2D6*64 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15649,,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*65,,,Uncertain function,Patients with the CYP2D6*65 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15650,,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*70,,,Uncertain function,Patients with the CYP2D6*70 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15651,,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*71,,,Uncertain function,Patients with the CYP2D6*71 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [11]:
merged_1 = pd.merge(df, df_1, on = 'Phenotype(s)', how = 'inner', sort=False)
merged_1

Unnamed: 0,Phenotype(s),Covered/Not_Covered,CHROM,POS,Gene,Variant,Haplotypes,Genotype/Allele,REF,ALT,Allele Function,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,Lung Neoplasms,Covered,chr10,119080683.0,EIF3A,rs3740556,,AA,G,A,,Patients with the AA genotype and lung cancer ...,1183615367,3,,Rare Variant,4.25,Efficacy,23127338,1,3,carboplatin;cisplatin,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/11...,
1,Lung Neoplasms,Covered,chr10,119080683.0,EIF3A,rs3740556,,AG,G,A,,Patients with the AG genotype and lung cancer ...,1183615367,3,,Rare Variant,4.25,Efficacy,23127338,1,3,carboplatin;cisplatin,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/11...,
2,Lung Neoplasms,Covered,chr10,119080683.0,EIF3A,rs3740556,,GG,G,A,,Patients with the GG genotype and lung cancer ...,1183615367,3,,Rare Variant,4.25,Efficacy,23127338,1,3,carboplatin;cisplatin,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/11...,
3,Lung Neoplasms,Not_Covered,chr14,104793397.0,AKT1,rs1130214,,AA,C,A,,Patients with the AA genotype and lung cancer ...,1183630276,3,,,2.00,Efficacy,20447721,1,1,carboplatin;cisplatin,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/11...,
4,Lung Neoplasms,Not_Covered,chr14,104793397.0,AKT1,rs1130214,,AC,C,A,,Patients with the AC genotype and lung cancer ...,1183630276,3,,,2.00,Efficacy,20447721,1,1,carboplatin;cisplatin,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/11...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,"Carcinoma, Non-Small-Cell Lung;Exanthema",Covered,chr1,206496132.0,IKBKE,rs3748022,,CT,C,T,,Patients with non-small cell lung cancer and t...,1451131820,3,,,2.00,Toxicity,31664190,1,1,gefitinib,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
394,"Carcinoma, Non-Small-Cell Lung;Exanthema",Covered,chr1,206496132.0,IKBKE,rs3748022,,TT,C,T,,Patients with non-small cell lung cancer and t...,1451131820,3,,,2.00,Toxicity,31664190,1,1,gefitinib,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
395,"Carcinoma, Non-Small-Cell Lung;Exanthema",Covered,chr14,35402011.0,NFKBIA,rs8904,,AA,G,"A,T,C",,Patients with non-small cell lung cancer and t...,1451131860,3,,,2.00,Toxicity,31664190,1,1,gefitinib,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
396,"Carcinoma, Non-Small-Cell Lung;Exanthema",Covered,chr14,35402011.0,NFKBIA,rs8904,,AG,G,"A,T,C",,Patients with non-small cell lung cancer and t...,1451131860,3,,,2.00,Toxicity,31664190,1,1,gefitinib,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [21]:
merged_1[merged_1['Variant'] == 'rs1058932']

Unnamed: 0,Phenotype(s),Covered/Not_Covered,CHROM,POS,Gene,Variant,Haplotypes,Genotype/Allele,REF,ALT,Allele Function,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
353,"Carcinoma, Non-Small-Cell Lung;Thrombocytopenia",Covered,chr10,95037104.0,CYP2C8,rs1058932,,AA,G,"C,A",,Patients with non-small cell lung cancer and t...,1451125646,3,,Tier 1 VIP,2.5,Toxicity,31616045,1,1,carboplatin;gemcitabine,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
354,"Carcinoma, Non-Small-Cell Lung;Thrombocytopenia",Covered,chr10,95037104.0,CYP2C8,rs1058932,,AG,G,"C,A",,Patients with non-small cell lung cancer and t...,1451125646,3,,Tier 1 VIP,2.5,Toxicity,31616045,1,1,carboplatin;gemcitabine,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
355,"Carcinoma, Non-Small-Cell Lung;Thrombocytopenia",Covered,chr10,95037104.0,CYP2C8,rs1058932,,GG,G,"C,A",,Patients with non-small cell lung cancer and t...,1451125646,3,,Tier 1 VIP,2.5,Toxicity,31616045,1,1,carboplatin;gemcitabine,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [12]:
merged_1_covered = merged_1[merged_1['Covered/Not_Covered'] == 'Covered']
merged_1_covered

Unnamed: 0,Phenotype(s),Covered/Not_Covered,CHROM,POS,Gene,Variant,Haplotypes,Genotype/Allele,REF,ALT,Allele Function,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,Lung Neoplasms,Covered,chr10,119080683.0,EIF3A,rs3740556,,AA,G,A,,Patients with the AA genotype and lung cancer ...,1183615367,3,,Rare Variant,4.25,Efficacy,23127338,1,3,carboplatin;cisplatin,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/11...,
1,Lung Neoplasms,Covered,chr10,119080683.0,EIF3A,rs3740556,,AG,G,A,,Patients with the AG genotype and lung cancer ...,1183615367,3,,Rare Variant,4.25,Efficacy,23127338,1,3,carboplatin;cisplatin,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/11...,
2,Lung Neoplasms,Covered,chr10,119080683.0,EIF3A,rs3740556,,GG,G,A,,Patients with the GG genotype and lung cancer ...,1183615367,3,,Rare Variant,4.25,Efficacy,23127338,1,3,carboplatin;cisplatin,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/11...,
12,Lung Neoplasms,Covered,chr4,88131171.0,ABCG2,rs2231142,,GG,G,"C,T",,Patients with the GG genotype and lung cancer ...,1447963662,4,,Tier 1 VIP,-0.25,Toxicity,"17148776, 21332310, 20035425, 25554506",4,4,gefitinib,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
13,Lung Neoplasms,Covered,chr4,88131171.0,ABCG2,rs2231142,,GT,G,"C,T",,Patients with the GT genotype and lung cancer ...,1447963662,4,,Tier 1 VIP,-0.25,Toxicity,"17148776, 21332310, 20035425, 25554506",4,4,gefitinib,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,"Carcinoma, Non-Small-Cell Lung;Exanthema",Covered,chr1,206496132.0,IKBKE,rs3748022,,CT,C,T,,Patients with non-small cell lung cancer and t...,1451131820,3,,,2.00,Toxicity,31664190,1,1,gefitinib,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
394,"Carcinoma, Non-Small-Cell Lung;Exanthema",Covered,chr1,206496132.0,IKBKE,rs3748022,,TT,C,T,,Patients with non-small cell lung cancer and t...,1451131820,3,,,2.00,Toxicity,31664190,1,1,gefitinib,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
395,"Carcinoma, Non-Small-Cell Lung;Exanthema",Covered,chr14,35402011.0,NFKBIA,rs8904,,AA,G,"A,T,C",,Patients with non-small cell lung cancer and t...,1451131860,3,,,2.00,Toxicity,31664190,1,1,gefitinib,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
396,"Carcinoma, Non-Small-Cell Lung;Exanthema",Covered,chr14,35402011.0,NFKBIA,rs8904,,AG,G,"A,T,C",,Patients with non-small cell lung cancer and t...,1451131860,3,,,2.00,Toxicity,31664190,1,1,gefitinib,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [13]:
print(set(merged_1_covered.Variant))

{'rs316019', 'rs3740066', 'rs4149015', 'rs1517114', 'rs3115672', 'rs3832043', 'rs7779029', 'rs1695', 'rs2227983', 'rs5877', 'rs2228000', 'rs1799782', 'rs1058932', 'rs11615', 'rs3130985', 'rs1049709', 'rs121434568', 'rs861539', 'rs8904', 'rs6504649', 'rs3748022', 'rs9262132', 'rs1128503', 'rs2233980', 'rs2293347', 'rs2839698', 'rs1052555', 'rs2242046', 'rs4149117', 'rs2228130', 'rs11545078', 'rs11572078', 'rs4149056', 'rs9262143', 'rs1051266', 'rs11229', 'rs6113', 'rs73450548', 'rs3130907', 'rs2072671', 'rs6119', 'rs10491684', 'rs2231137', 'rs3094086', 'rs13181', 'rs2231142', 'rs3740556', 'rs7921977', 'rs780668', 'rs1045642', 'rs12721627', 'rs10885', 'rs6118', 'rs430397', 'rs2234922', 'rs7311358'}


In [22]:
merged_1_covered.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/variants_mapped_old.xlsx', index=False)

In [13]:
print(set(merged_1.Variant))

{'rs25487', 'rs2269577', 'rs9535826', 'rs861539', 'rs2233914', 'rs10759637', 'rs4752219', 'rs1409314', 'rs4978536', 'rs1042522', 'rs619586', 'rs13181', 'rs1051266', 'rs4752220', 'rs1799782', 'rs11868547', 'rs316019', 'rs430397', 'rs1800566', 'rs4979223', 'rs4541111', 'rs10817464', 'rs1690924', 'rs50872', 'rs2839698', 'rs9535828', 'rs4413407', 'rs10510050', 'rs1695', 'rs12819505', 'rs3738948', 'rs7958904', 'rs12621220', 'rs2228000', 'rs1799793', 'rs1143623', 'rs1052555', 'rs3212986', 'rs10878232', 'rs1045642', 'rs3213239', 'rs6983267', 'rs7170924', 'rs1800975', 'rs1799801', 'rs7091672', 'rs1128503'}


In [15]:
rsid_sample = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/anticancer_pgx/KHAPOLGPTTL15_rsID.xlsx')
rsid_sample.head()

Unnamed: 0,Variant,Zygosity
0,rs25487,Homozygous
1,rs2269577,Wildtype
2,rs9535826,Wildtype
3,rs861539,Wildtype
4,rs2233914,Wildtype


In [14]:
rsid = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/variants_old_data.xlsx')
rsid.head()

Unnamed: 0,Variant
0,rs316019
1,rs3740066
2,rs4149015
3,rs1517114
4,rs3115672


In [17]:
new_rsId = pd.merge(rsid, rsid_sample, on = 'Variant', how = 'left', sort=False)
new_rsId

Unnamed: 0,Variant,Zygosity
0,rs316019,Homozygous
1,rs3740066,
2,rs4149015,
3,rs1517114,
4,rs3115672,
5,rs3832043,
6,rs7779029,
7,rs1695,Wildtype
8,rs2227983,
9,rs5877,


In [39]:
new_data = pd.merge(new_rsId, df_1, on = 'Variant', how = 'inner', sort = False)
new_data

Unnamed: 0,Variant,Zygosity,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function,CHROM,POS,REF,ALT,Covered/Not_Covered,Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,rs25487,Homozygous,981345277,CC,Patients with the CC genotype and cancer may h...,,chr19,43551574.0,T,"G,C",Covered,,XRCC1,3,,,3.00,Efficacy,23314736,1,2,fluorouracil,Colonic Neoplasms;Colorectal Neoplasms;Neoplas...,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,
1,rs25487,Homozygous,981345277,CT,Patients with the CT genotype and cancer may h...,,chr19,43551574.0,T,"G,C",Covered,,XRCC1,3,,,3.00,Efficacy,23314736,1,2,fluorouracil,Colonic Neoplasms;Colorectal Neoplasms;Neoplas...,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,
2,rs25487,Homozygous,981345277,TT,Patients with the TT genotype and cancer may h...,,chr19,43551574.0,T,"G,C",Covered,,XRCC1,3,,,3.00,Efficacy,23314736,1,2,fluorouracil,Colonic Neoplasms;Colorectal Neoplasms;Neoplas...,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,
3,rs25487,Homozygous,981345285,CC,Patients with the CC genotype may have 1) incr...,,chr19,43551574.0,T,"G,C",Covered,,XRCC1,4,,,-0.25,Efficacy;Toxicity,"19786980, 22188361",2,4,cyclophosphamide,Neoplasms;Ovarian Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,
4,rs25487,Homozygous,981345285,CT,Patients with the CT genotype may have 1) decr...,,chr19,43551574.0,T,"G,C",Covered,,XRCC1,4,,,-0.25,Efficacy;Toxicity,"19786980, 22188361",2,4,cyclophosphamide,Neoplasms;Ovarian Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,rs7170924,Wildtype,1448256872,GT,Patients with the GT genotype and non-small ce...,,chr15,81290798.0,G,"T,C",Not_Covered,,IL16,3,,,2.50,Toxicity,27498158,1,1,Platinum compounds,"Carcinoma, Non-Small-Cell Lung",2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
152,rs7170924,Wildtype,1448256872,TT,Patients with the TT genotype and non-small ce...,,chr15,81290798.0,G,"T,C",Not_Covered,,IL16,3,,,2.50,Toxicity,27498158,1,1,Platinum compounds,"Carcinoma, Non-Small-Cell Lung",2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
153,rs7091672,Wildtype,1447983050,CC,Patients with the CC genotype and non-small-ce...,,chr10,118836909.0,T,"A,C,G",Not_Covered,,EIF3A,3,,,3.25,Toxicity,25732572,1,1,Platinum compounds,"Carcinoma, Non-Small-Cell Lung",2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
154,rs7091672,Wildtype,1447983050,CT,Patients with the CT genotype and non-small-ce...,,chr10,118836909.0,T,"A,C,G",Not_Covered,,EIF3A,3,,,3.25,Toxicity,25732572,1,1,Platinum compounds,"Carcinoma, Non-Small-Cell Lung",2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [40]:
new_data.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/KHAPOLGPTTL15_rsID_new_data.xlsx', index=False)

In [18]:
df_2 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/anticancer_pgx/Old_anticancer_data.xlsx')
df_2

Unnamed: 0,Profile (Header),Category(Class of Drugs),Chromosome,Gene,Variant,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function
0,Lung Neoplasms,Neoplasms,chr10,EIF3A,rs3740556,Covered,-,G>A,AA,Homozygous Mutant,-,Lung Neoplasms,Carboplatin;Cisplatin,Carboplatin;Cisplatin,1183615367,Patients with the AA genotype and lung cancer ...,Lung Neoplasms,Efficacy,Good,,,,,3,1,3,4.25,,Rare Variant,,2021-03-24,
1,Lung Neoplasms,Neoplasms,chr10,EIF3A,rs3740556,Covered,-,G>A,AG,Heterozygous Mutant,-,Lung Neoplasms,Carboplatin;Cisplatin,Carboplatin;Cisplatin,1183615367,Patients with the AG genotype and lung cancer ...,Lung Neoplasms,Efficacy,Good,,,,,3,1,3,4.25,,Rare Variant,,2021-03-24,
2,Lung Neoplasms,Neoplasms,chr10,EIF3A,rs3740556,Covered,-,G>A,GG,Wild type,-,Lung Neoplasms,Carboplatin;Cisplatin,Carboplatin;Cisplatin,1183615367,Patients with the GG genotype and lung cancer ...,Lung Neoplasms,Efficacy,Poor,,,,,3,1,3,4.25,,Rare Variant,,2021-03-24,
3,Lung Neoplasms,Neoplasms,chr7,EGFR,rs121434569,Covered,-,C>T,CC,Wild type,-,Lung Neoplasms,Erlotinib,EGFR inhibitors,981475450,Patients with the somatic rs121434569 CC genot...,"Adenocarcinoma;Carcinoma, Non-Small-Cell Lung;...",Efficacy,Poor,,,,,2B,11,11,9.00,,Rare Variant,,2021-03-24,
4,Lung Neoplasms,Neoplasms,chr7,EGFR,rs121434569,Covered,-,C>T,CT,Heterozygous Mutant,-,Lung Neoplasms,Erlotinib,EGFR inhibitors,981475450,Patients with the somatic rs121434569 CT genot...,"Adenocarcinoma;Carcinoma, Non-Small-Cell Lung;...",Efficacy,Good,,,,,2B,11,11,9.00,,Rare Variant,,2021-03-24,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,Lung Neoplasms,Neoplasms,chr14,NFKBIA,rs8904,Covered,-,"G>A,C,T",AG,Heterozygous Mutant,-,"Carcinoma, Non-Small-Cell Lung",Gefitinib,Gefitinib,1451131860,Patients with non-small cell lung cancer and t...,"Carcinoma, Non-Small-Cell Lung;Exanthema",Toxicity,,High SE,,,,3,1,1,2.00,,,,2021-03-24,
193,Lung Neoplasms,Neoplasms,chr14,NFKBIA,rs8904,Covered,-,"G>A,C,T",GG,Wild type,-,"Carcinoma, Non-Small-Cell Lung",Gefitinib,Gefitinib,1451131860,Patients with non-small cell lung cancer and t...,"Carcinoma, Non-Small-Cell Lung;Exanthema",Toxicity,,High SE,,,,3,1,1,2.00,,,,2021-03-24,
194,Lung Neoplasms,Neoplasms,chr19,ERCC1,rs11615,Covered,-,A>G,AA,Wild type,-,"Carcinoma, Non-Small-Cell Lung",Cisplatin;Gemcitabine,Cisplatin;Gemcitabine,1451551426,Patients with the rs11615 AA genotype may have...,"Carcinoma, Non-Small-Cell Lung",Efficacy,Good,,,,,3,1,1,1.75,,,,2021-10-20,
195,Lung Neoplasms,Neoplasms,chr19,ERCC1,rs11615,Covered,-,A>G,AG,Heterozygous Mutant,-,"Carcinoma, Non-Small-Cell Lung",Cisplatin;Gemcitabine,Cisplatin;Gemcitabine,1451551426,Patients with the rs11615 AG genotype may have...,"Carcinoma, Non-Small-Cell Lung",Efficacy,Poor,,,,,3,1,1,1.75,,,,2021-10-20,


In [19]:
old_data = pd.merge(new_rsId, df_2, on = 'Variant', how = 'inner', sort = False)
old_data

Unnamed: 0,Variant,Zygosity_x,Profile (Header),Category(Class of Drugs),Chromosome,Gene,is present,Haplotype,Allele,Genotype,Zygosity_y,Hap-zygosity,Given Cancer type as Input,Drugs(Molecules),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function
0,rs316019,Homozygous,Lung Neoplasms,Neoplasms,chr6,SLC22A2,Covered,-,A>C,AA,Wild type,-,"Carcinoma, Non-Small-Cell Lung",Platinum compounds,Platinum compounds,1448266986,Patients with the AA genotype and non-small ce...,"Carcinoma, Non-Small-Cell Lung",Toxicity,,High SE,,,,3,1,2,5.00,,,,2021-03-24,
1,rs316019,Homozygous,Lung Neoplasms,Neoplasms,chr6,SLC22A2,Covered,-,A>C,AC,Heterozygous Mutant,-,"Carcinoma, Non-Small-Cell Lung",Platinum compounds,Platinum compounds,1448266986,Patients with the AC genotype and non-small ce...,"Carcinoma, Non-Small-Cell Lung",Toxicity,,High SE,,,,3,1,2,5.00,,,,2021-03-24,
2,rs316019,Homozygous,Lung Neoplasms,Neoplasms,chr6,SLC22A2,Covered,-,A>C,CC,Homozygous Mutant,-,"Carcinoma, Non-Small-Cell Lung",Platinum compounds,Platinum compounds,1448266986,Patients with the CC genotype and non-small ce...,"Carcinoma, Non-Small-Cell Lung",Toxicity,,Low SE,,,,3,1,2,5.00,,,,2021-03-24,
3,rs3740066,,Lung Neoplasms,Neoplasms,chr10,ABCC2,Covered,-,"C>G,T",CC,Wild type,-,"Carcinoma, Non-Small-Cell Lung",Irinotecan,Irinotecan,1183533945,Patients with the CC genotype and non-small ce...,"Carcinoma, Non-Small-Cell Lung",Toxicity,,High SE,,,,3,1,2,1.00,,,,2021-03-24,
4,rs3740066,,Lung Neoplasms,Neoplasms,chr10,ABCC2,Covered,-,"C>G,T",CT,Heterozygous Mutant,-,"Carcinoma, Non-Small-Cell Lung",Irinotecan,Irinotecan,1183533945,Patients with the CT genotype and non-small ce...,"Carcinoma, Non-Small-Cell Lung",Toxicity,,Low SE,,,,3,1,2,1.00,,,,2021-03-24,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,rs2234922,,Lung Neoplasms,Neoplasms,chr1,EPHX1,Covered,-,"A>G,T",AG,Heterozygous Mutant,-,"Carcinoma, Non-Small-Cell Lung",Docetaxel,Docetaxel,655386320,Patients with the AG genotype may have decreas...,"Carcinoma, Non-Small-Cell Lung",Dosage,,,,Decreased Dose,,3,1,1,0.25,,,,2021-03-24,
151,rs2234922,,Lung Neoplasms,Neoplasms,chr1,EPHX1,Covered,-,"A>G,T",GG,Homozygous Mutant,-,"Carcinoma, Non-Small-Cell Lung",Docetaxel,Docetaxel,655386320,Patients with the GG genotype may have decreas...,"Carcinoma, Non-Small-Cell Lung",Dosage,,,,Decreased Dose,,3,1,1,0.25,,,,2021-03-24,
152,rs7311358,,Lung Neoplasms,Neoplasms,chr12,SLCO1B3,Covered,-,G>A,AA,Homozygous Mutant,-,"Carcinoma, Non-Small-Cell Lung",Carboplatin;Paclitaxel,Carboplatin;Paclitaxel,1447982768,Patients with the AA genotype and Non-Small-Ce...,"Carcinoma, Non-Small-Cell Lung",Toxicity,,High SE,,,,3,1,2,2.50,,,,2021-03-24,
153,rs7311358,,Lung Neoplasms,Neoplasms,chr12,SLCO1B3,Covered,-,G>A,AG,Heterozygous Mutant,-,"Carcinoma, Non-Small-Cell Lung",Carboplatin;Paclitaxel,Carboplatin;Paclitaxel,1447982768,Patients with the AG genotype and Non-Small-Ce...,"Carcinoma, Non-Small-Cell Lung",Toxicity,,Low SE,,,,3,1,2,2.50,,,,2021-03-24,


In [20]:
old_data.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/KHAPOLGPTTL15_rsID_old_data_new_variants.xlsx', index=False)

In [2]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/genes_scores.xlsx')
df

Unnamed: 0,Gene Name,score
0,TRDN-AS1,6
1,PMS2,8
2,AASS,4
3,ABCA13,4
4,ABCA7,8
...,...,...
3946,MLYCD,6
3947,ZSWIM6,4
3948,PDHX,6
3949,LOC105378457,4


In [3]:
df_1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/sample_genes.xlsx')
df_1

Unnamed: 0,Gene Name,Condition,Headings,21_Conditions_list
0,TSPAN1,Muscular_health,Muscular_health,Muscular_health
1,POMGNT1,Muscular_health,Muscular_health,Muscular_health
2,TOR1AIP1,Muscular_health,Muscular_health,Muscular_health
3,DYSF,Muscular_health,Muscular_health,Muscular_health
4,LIMS2,Muscular_health,Muscular_health,Muscular_health
...,...,...,...,...
86,ITGA7,Muscular_health,Muscular_health,Muscular_health
87,ACTA1,Muscular_health,Muscular_health,Muscular_health
88,INPP5K,Muscular_health,Muscular_health,Muscular_health
89,LOC123864065,Muscular_health,Muscular_health,Muscular_health


In [5]:
df_new = pd.merge(df_1, df, on = 'Gene Name', how = 'left', sort = False)
df_new

Unnamed: 0,Gene Name,Condition,Headings,21_Conditions_list,score
0,TSPAN1,Muscular_health,Muscular_health,Muscular_health,4.0
1,POMGNT1,Muscular_health,Muscular_health,Muscular_health,4.0
2,TOR1AIP1,Muscular_health,Muscular_health,Muscular_health,4.0
3,DYSF,Muscular_health,Muscular_health,Muscular_health,6.0
4,LIMS2,Muscular_health,Muscular_health,Muscular_health,4.0
...,...,...,...,...,...
86,ITGA7,Muscular_health,Muscular_health,Muscular_health,4.0
87,ACTA1,Muscular_health,Muscular_health,Muscular_health,4.0
88,INPP5K,Muscular_health,Muscular_health,Muscular_health,
89,LOC123864065,Muscular_health,Muscular_health,Muscular_health,


In [6]:
df_new.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/sample_scores.xlsx', index = False)

# Zygosity mapping

In [7]:
zygo = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/Pharmgkb_database_Updated_version4.xlsx')
zygo

Unnamed: 0,Covered/Not_Covered,CHROM,POS,Gene,Variant,Haplotypes,Genotype/Allele,REF,ALT,Allele Function,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,Covered,chr7,117587806.0,CFTR,rs75527207,,AA,G,A,,Patients with the rs75527207 AA genotype (two ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
1,Covered,chr7,117587806.0,CFTR,rs75527207,,AG,G,A,,Patients with the rs75527207 AG genotype (one ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
2,Covered,chr7,117587806.0,CFTR,rs75527207,,GG,G,A,,Patients with the rs75527207 GG genotype (do n...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
3,Covered,chr12,21178615.0,SLCO1B1,rs4149056,,CC,T,C,,Patients with the CC genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
4,Covered,chr12,21178615.0,SLCO1B1,rs4149056,,CT,T,C,,Patients with the CT genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*64,,,Uncertain function,Patients with the CYP2D6*64 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15649,,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*65,,,Uncertain function,Patients with the CYP2D6*65 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15650,,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*70,,,Uncertain function,Patients with the CYP2D6*70 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15651,,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*71,,,Uncertain function,Patients with the CYP2D6*71 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,


# Brust Cancer

In [79]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/anticancer_pgx/KHAPOLGPTTL17_rsID.xlsx')
df

Unnamed: 0,Variant,Zygosity
0,rs201219564,Homozygous
1,rs2691305,Homozygous
2,rs200676709,Heterozygous
3,rs112703963,Homozygous
4,rs6672356,Homozygous
...,...,...
41045,rs782092227,Heterozygous
41046,rs57301248,Heterozygous
41047,rs559165,Homozygous
41048,rs2037999,Homozygous


In [80]:
df = df.drop_duplicates(subset='Variant', keep='first')
df

Unnamed: 0,Variant,Zygosity
0,rs201219564,Homozygous
1,rs2691305,Homozygous
2,rs200676709,Heterozygous
3,rs112703963,Homozygous
4,rs6672356,Homozygous
...,...,...
41045,rs782092227,Heterozygous
41046,rs57301248,Heterozygous
41047,rs559165,Homozygous
41048,rs2037999,Homozygous


In [63]:
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/anticancer_pgx/Breast_data.xlsx')
data

Unnamed: 0,Profile (Header),Category(Class of Drugs),Chromosome,Gene,Variant,is present,Haplotype,Allele,Genotype,Zygosity,Hap-zygosity,Given Cancer type as Input,Drug(s),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function
0,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the CT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,
1,Breast Neoplasms,Neoplasms,chr10,CYP2C19,rs12248560,Covered,-,"C>A,T",TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin;Fluorouracil,Cyclophosphamide;Doxorubicin;Fluorouracil,1449718265,Patients with the TT genotype and breast cance...,Breast Neoplasms,Toxicity,,Low SE,,,,3,1,1,2.25,,Tier 1 VIP,,2021-03-24,
2,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GG,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GG genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Decreased Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,
3,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",GT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the GT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,
4,Breast Neoplasms,Neoplasms,chr19,CYP2B6,rs3745274,Covered,-,"G>A,T",TT,Wild Type,-,Breast Neoplasms,Cyclophosphamide;Doxorubicin,Cyclophosphamide;Doxorubicin,981202356,Patients with the TT genotype and Breast Cance...,Breast Neoplasms,Dosage,,,,Intermediate Dose,,3,1,1,2.00,,Tier 1 VIP,,2021-03-24,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the CT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,Intermediate SE,,,,3,1,1,0.00,,,,2021-03-24,
226,Breast Neoplasms,Neoplasms,chr19,PIK3R2,rs56022120,Covered,-,C>T,TT,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Epirubicin;Fluorouracil,Cyclophosphamide;Epirubicin;Fluorouracil,1449270997,Patients with breast cancer and the TT genotyp...,Breast Neoplasms;Neutropenia,Toxicity,,High SE,,,,3,1,1,0.00,,,,2021-03-24,
227,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AA,Wild Type,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AA genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,
228,Breast Neoplasms,Neoplasms,chr15,CYP19A1,rs4646,Covered,-,A>C,AC,Heterozygous Mutant,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1451282140,Pre-menopausal women with the AC genotype and ...,Breast Neoplasms,Efficacy,Good,,,,Female,3,2,2,5.00,,,,2021-03-24,


In [64]:
df_1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/new_data/Pharmgkb_database_Updated_version4.xlsx')
df_1

Unnamed: 0,Covered/Not_Covered,CHROM,POS,Gene,Variant,Haplotypes,Genotype/Allele,REF,ALT,Allele Function,Zygosity,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,Covered,chr7,117587806.0,CFTR,rs75527207,,AA,G,A,,Homozygous,Patients with the rs75527207 AA genotype (two ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
1,Covered,chr7,117587806.0,CFTR,rs75527207,,AG,G,A,,Heterozygous,Patients with the rs75527207 AG genotype (one ...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
2,Covered,chr7,117587806.0,CFTR,rs75527207,,GG,G,A,,Wildtype,Patients with the rs75527207 GG genotype (do n...,981755803,1A,,Rare Variant; Tier 1 VIP,234.875,Efficacy,"21083385, 22047557, 23590265, 23757361, 238913...",28,30,ivacaftor,Cystic Fibrosis,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/98...,Pediatric
3,Covered,chr12,21178615.0,SLCO1B1,rs4149056,,CC,T,C,,Homozygous,Patients with the CC genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
4,Covered,chr12,21178615.0,SLCO1B1,rs4149056,,CT,T,C,,Heterozygous,Patients with the CT genotype and Precursor Ce...,1449311190,3,,Tier 1 VIP,2.000,Dosage,29683944,1,1,mercaptopurine;methotrexate,Precursor Cell Lymphoblastic Leukemia-Lymphoma,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,Pediatric
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15648,Not_Covered,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*64,,,Uncertain function,Haplotype,Patients with the CYP2D6*64 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15649,Not_Covered,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*65,,,Uncertain function,Haplotype,Patients with the CYP2D6*65 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15650,Not_Covered,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*70,,,Uncertain function,Haplotype,Patients with the CYP2D6*70 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,
15651,Not_Covered,,,CYP2D6,,"CYP2D6*1, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D...",*71,,,Uncertain function,Haplotype,Patients with the CYP2D6*71 allele may have de...,1449000354,3,,Tier 1 VIP,0.000,Metabolism/PK,24647041,1,6,n-desmethyltamoxifen,,2023-08-31,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [67]:
df_2 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/anticancer_pgx/Breast_drugs.xlsx')
df_2['Drug(s)'] = df_2['Drug(s)'].str.lower()
df_2

Unnamed: 0,Drug(s)
0,cyclophosphamide;doxorubicin;fluorouracil
1,cyclophosphamide;doxorubicin
2,carboplatin;docetaxel;trastuzumab
3,cyclophosphamide;epirubicin
4,cyclophosphamide;epirubicin;fluorouracil
5,tamoxifen
6,anthracyclines and related substances;taxanes
7,capecitabine;docetaxel
8,paclitaxel
9,doxorubicin;doxorubicinol


In [68]:
merged_1 = pd.merge(df_1, df_2, on = 'Drug(s)', how = 'inner', sort=False)
merged_1

Unnamed: 0,Covered/Not_Covered,CHROM,POS,Gene,Variant,Haplotypes,Genotype/Allele,REF,ALT,Allele Function,Zygosity,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,Not_Covered,,,CYP2D6,,"CYP2D6*1, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6...",*1,,,Normal function,Haplotype,The CYP2D6*1 allele is assigned as a normal fu...,1451285240,1A,,Tier 1 VIP,226.5625,Efficacy,"24329190, 23100173, 23842856, 25091503, 223956...",56,143,tamoxifen,Breast Neoplasms,2021-04-29,https://www.pharmgkb.org/clinicalAnnotation/14...,
1,Not_Covered,,,CYP2D6,,"CYP2D6*1, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6...",*2,,,Normal function,Haplotype,The CYP2D6*2 allele is assigned as a normal fu...,1451285240,1A,,Tier 1 VIP,226.5625,Efficacy,"24329190, 23100173, 23842856, 25091503, 223956...",56,143,tamoxifen,Breast Neoplasms,2021-04-29,https://www.pharmgkb.org/clinicalAnnotation/14...,
2,Not_Covered,,,CYP2D6,,"CYP2D6*1, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6...",*3,,,No function,Haplotype,The CYP2D6*3 allele is assigned as a no functi...,1451285240,1A,,Tier 1 VIP,226.5625,Efficacy,"24329190, 23100173, 23842856, 25091503, 223956...",56,143,tamoxifen,Breast Neoplasms,2021-04-29,https://www.pharmgkb.org/clinicalAnnotation/14...,
3,Not_Covered,,,CYP2D6,,"CYP2D6*1, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6...",*4,,,No function,Haplotype,The CYP2D6*4 allele is assigned as a no functi...,1451285240,1A,,Tier 1 VIP,226.5625,Efficacy,"24329190, 23100173, 23842856, 25091503, 223956...",56,143,tamoxifen,Breast Neoplasms,2021-04-29,https://www.pharmgkb.org/clinicalAnnotation/14...,
4,Not_Covered,,,CYP2D6,,"CYP2D6*1, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6...",*5,,,No function,Haplotype,The CYP2D6*5 allele is assigned as a no functi...,1451285240,1A,,Tier 1 VIP,226.5625,Efficacy,"24329190, 23100173, 23842856, 25091503, 223956...",56,143,tamoxifen,Breast Neoplasms,2021-04-29,https://www.pharmgkb.org/clinicalAnnotation/14...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
708,Covered,chr12,21178615.0,SLCO1B1,rs4149056,,CT,T,C,,Heterozygous,Patients with the CT genotype and hormone inse...,1448112147,3,,Tier 1 VIP,1.5000,Toxicity,27234217,1,1,cyclophosphamide;docetaxel;doxorubicin;epirubi...,Breast Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
709,Covered,chr12,21178615.0,SLCO1B1,rs4149056,,TT,T,C,,Wildtype,Patients with the TT genotype and hormone inse...,1448112147,3,,Tier 1 VIP,1.5000,Toxicity,27234217,1,1,cyclophosphamide;docetaxel;doxorubicin;epirubi...,Breast Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
710,Covered,chr5,177093242.0,FGFR4,rs351855,,AA,G,A,,Homozygous,Patients with the AA genotype and node-positiv...,1447963611,3,,,1.7500,Efficacy,16822847,1,1,cyclophosphamide;fluorouracil;methotrexate,Breast Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
711,Covered,chr5,177093242.0,FGFR4,rs351855,,AG,G,A,,Heterozygous,Patients with the AG genotype and node-positiv...,1447963611,3,,,1.7500,Efficacy,16822847,1,1,cyclophosphamide;fluorouracil;methotrexate,Breast Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [69]:
merged_2 = merged_1[['Variant']]
merged_2 = merged_2.drop_duplicates(subset='Variant', keep='first')
merged_2

Unnamed: 0,Variant
0,
26,rs1045642
36,rs4986938
39,rs9340799
46,rs11023197
...,...
692,rs2073618
695,rs7984870
698,rs10046
701,rs2289105


In [70]:
merged_2.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/merged_rsID.xlsx', index = False)

In [71]:
merged_2 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/merged_rsID.xlsx')
merged_2

Unnamed: 0,Variant
0,rs1045642
1,rs4986938
2,rs9340799
3,rs11023197
4,rs478437
...,...
173,rs2073618
174,rs7984870
175,rs10046
176,rs2289105


In [72]:
rsID_data = pd.merge(merged_2, df, on = 'Variant', how = 'left', sort = False)
rsID_data

Unnamed: 0,Variant,Zygosity
0,rs1045642,Heterozygous
1,rs4986938,
2,rs9340799,
3,rs11023197,
4,rs478437,
...,...,...
173,rs2073618,
174,rs7984870,
175,rs10046,
176,rs2289105,


In [73]:
rsID_data.Zygosity.value_counts()

Heterozygous    25
Homozygous      16
Name: Zygosity, dtype: int64

In [74]:
final = pd.merge(rsID_data, data, on = 'Variant', how = 'left', sort = False)
final

Unnamed: 0,Variant,Zygosity_x,Profile (Header),Category(Class of Drugs),Chromosome,Gene,is present,Haplotype,Allele,Genotype,Zygosity_y,Hap-zygosity,Given Cancer type as Input,Drug(s),MODIFIED_Drugs(Molecules),PharmGKB_ID/Clinical_Annotation_ID,Clinical Phenotype,Disease(Phenotype),Phenotype Category,Drug response,Side effects,Metabolism status,Dosage status,Gender,L_o_E (Level Of Evidence),PMID Count,Evidence Count,Total Score,Spacialty Population,Level Modifiers,Level Override,Latest History Date (YYYY-MM-DD),Allele Function
0,rs1045642,Heterozygous,Breast Neoplasms,Neoplasms,chr7,ABCB1,Covered,-,"A>C,G,T",AA,Wild Type,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1.183698e+09,Women with the AA genotype and breast cancer m...,Breast Neoplasms,Efficacy,Good,,,,Female,3.0,1.0,2.0,2.00,,Tier 1 VIP,,2021-03-24,
1,rs1045642,Heterozygous,Breast Neoplasms,Neoplasms,chr7,ABCB1,Covered,-,"A>C,G,T",AG,Heterozygous Mutant,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1.183698e+09,Women with the AG genotype and breast cancer m...,Breast Neoplasms,Efficacy,Poor,,,,Female,3.0,1.0,2.0,2.00,,Tier 1 VIP,,2021-03-24,
2,rs1045642,Heterozygous,Breast Neoplasms,Neoplasms,chr7,ABCB1,Covered,-,"A>C,G,T",GG,Homozygous Mutant,-,Breast Neoplasms,Tamoxifen,Tamoxifen,1.183698e+09,Women with the GG genotype and breast cancer m...,Breast Neoplasms,Efficacy,Good,,,,Female,3.0,1.0,2.0,2.00,,Tier 1 VIP,,2021-03-24,
3,rs1045642,Heterozygous,Breast Neoplasms,Neoplasms,chr7,ABCB1,Covered,-,"A>C,G,T",AA,Wild Type,-,Breast Neoplasms,Anthracyclines and related substances;Taxanes,Anthracyclines and related substances;Taxanes,1.183632e+09,Patients with the AA genotype may have increas...,Breast Neoplasms,Efficacy,Good,,,,,3.0,1.0,1.0,1.75,,Tier 1 VIP,,2021-03-24,
4,rs1045642,Heterozygous,Breast Neoplasms,Neoplasms,chr7,ABCB1,Covered,-,"A>C,G,T",AG,Heterozygous Mutant,-,Breast Neoplasms,Anthracyclines and related substances;Taxanes,Anthracyclines and related substances;Taxanes,1.183632e+09,Patients with the AG genotype may have decreas...,Breast Neoplasms,Efficacy,Poor,,,,,3.0,1.0,1.0,1.75,,Tier 1 VIP,,2021-03-24,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,rs10046,,Breast Neoplasms,Neoplasms,chr15,CYP19A1,Covered,-,G>A,GG,Wild Type,-,Breast Neoplasms,HDL cholesterol;Letrozole;Triglycerides,Letrozole,1.447682e+09,Post-menopausal women with the GG genotype and...,Breast Neoplasms;Menopause,Other,,High SE,,,Female,3.0,1.0,2.0,2.75,,,,2021-03-24,
352,rs2289105,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,
353,rs4149056,,Breast Neoplasms,Neoplasms,chr12,SLCO1B1,Covered,-,T>C,CC,Homozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Docetaxel;Doxorubicin;Epirubi...,Cyclophosphamide;Doxorubicin;Fluorouracil,1.448112e+09,Patients with the CC genotype and hormone inse...,Breast Neoplasms,Toxicity,,Low SE,,,,3.0,1.0,1.0,1.50,,Tier 1 VIP,,2021-03-24,
354,rs4149056,,Breast Neoplasms,Neoplasms,chr12,SLCO1B1,Covered,-,T>C,CT,Heterozygous Mutant,-,Breast Neoplasms,Cyclophosphamide;Docetaxel;Doxorubicin;Epirubi...,Cyclophosphamide;Doxorubicin;Fluorouracil,1.448112e+09,Patients with the CT genotype and hormone inse...,Breast Neoplasms,Toxicity,,Low SE,,,,3.0,1.0,1.0,1.50,,Tier 1 VIP,,2021-03-24,


In [77]:
rsID_data_final = pd.merge(rsID_data, merged_1, on = 'Variant', how = 'inner', sort = False)
rsID_data_final

Unnamed: 0,Variant,Zygosity_x,Covered/Not_Covered,CHROM,POS,Gene,Haplotypes,Genotype/Allele,REF,ALT,Allele Function,Zygosity_y,Annotation Text,Clinical Annotation ID,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
0,rs1045642,Heterozygous,Covered,chr7,87509329.0,ABCB1,,AA,A,"T,G,C",,Wildtype,Women with the AA genotype and breast cancer m...,1183697570,3,,Tier 1 VIP,2.00,Efficacy,24019753,1,2,tamoxifen,Breast Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/11...,
1,rs1045642,Heterozygous,Covered,chr7,87509329.0,ABCB1,,AG,A,"T,G,C",,Heterozygous,Women with the AG genotype and breast cancer m...,1183697570,3,,Tier 1 VIP,2.00,Efficacy,24019753,1,2,tamoxifen,Breast Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/11...,
2,rs1045642,Heterozygous,Covered,chr7,87509329.0,ABCB1,,GG,A,"T,G,C",,Homozygous,Women with the GG genotype and breast cancer m...,1183697570,3,,Tier 1 VIP,2.00,Efficacy,24019753,1,2,tamoxifen,Breast Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/11...,
3,rs1045642,Heterozygous,Covered,chr7,87509329.0,ABCB1,,AA,A,"T,G,C",,Wildtype,Postmenopausal women with HR+ breast cancer an...,1448615137,3,,Tier 1 VIP,2.00,Toxicity,27747906,1,1,anastrozole,Arthralgia;Breast Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
4,rs1045642,Heterozygous,Covered,chr7,87509329.0,ABCB1,,AG,A,"T,G,C",,Heterozygous,Postmenopausal women with HR+ breast cancer an...,1448615137,3,,Tier 1 VIP,2.00,Toxicity,27747906,1,1,anastrozole,Arthralgia;Breast Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,rs2289105,,Not_Covered,chr15,51215311.0,CYP19A1,,CT,T,"A,G,C",,Heterozygous,Post-menopausal women with the CT genotype and...,1447681831,3,,,2.75,Other,26463708,1,2,hdl cholesterol;letrozole;triglycerides,Breast Neoplasms;Menopause,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
641,rs2289105,,Not_Covered,chr15,51215311.0,CYP19A1,,TT,T,"A,G,C",,Wildtype,Post-menopausal women with the TT genotype and...,1447681831,3,,,2.75,Other,26463708,1,2,hdl cholesterol;letrozole;triglycerides,Breast Neoplasms;Menopause,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
642,rs4149056,,Covered,chr12,21178615.0,SLCO1B1,,CC,T,C,,Homozygous,Patients with the CC genotype and hormone inse...,1448112147,3,,Tier 1 VIP,1.50,Toxicity,27234217,1,1,cyclophosphamide;docetaxel;doxorubicin;epirubi...,Breast Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,
643,rs4149056,,Covered,chr12,21178615.0,SLCO1B1,,CT,T,C,,Heterozygous,Patients with the CT genotype and hormone inse...,1448112147,3,,Tier 1 VIP,1.50,Toxicity,27234217,1,1,cyclophosphamide;docetaxel;doxorubicin;epirubi...,Breast Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/14...,


In [78]:
rsID_data_final.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/anticancer_pgx/KHMBPRGPONC9_data_new.xlsx', index = False)