In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [50]:
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KHAIGPRX959_final_DP.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
df.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
df['HET'] = df['INFO'].str.extract(r'HET=(\d)')
df['HOM'] = df['INFO'].str.extract(r'HOM=(\d)')
# Create a new column 'Zygosity' based on conditions
df['Zygosity'] = ''
df.loc[df['HOM'] == '1', 'Zygosity'] = 'Homozygous'
df.loc[df['HET'] == '1', 'Zygosity'] = 'Heterozygous'
df = df.drop(columns=['HET', 'HOM'], axis = 1)
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/RSIDS_POSITION_DBSNP.xlsx')
df = pd.merge(df, data, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'inner', sort = False)
df2 = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/13_Genes_Key_Variants_Coordinates.csv', sep = '\t')
df2 = df2[['CHROM', 'POS', 'REF', 'ALT', 'Haplotype']]
merged = pd.merge(df, df2, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'left', sort = False)
madhu = merged.copy()
madhu['rsID_mapped'] = madhu.groupby('Haplotype')['rsID'].transform(lambda x: ','.join(set(x)))
madhu['Zygosity_mapped'] = madhu.groupby(['Haplotype'])['Zygosity'].transform(lambda x: ','.join(x))
madhu.drop_duplicates(subset='Haplotype', inplace=True)
madhu

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,Zygosity,rsID_updated,Haplotype,rsID_mapped,Zygosity_mapped
0,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*4,"rs1799930,rs1041983,rs1208,rs1799931","Homozygous,Heterozygous,Homozygous,Heterozygous"
1,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*6A,"rs1799930,rs1041983","Homozygous,Heterozygous"
2,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*6J,"rs1799930,rs1041983,rs1799931","Homozygous,Heterozygous,Heterozygous"
3,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*6O,"rs1799930,rs1041983","Homozygous,Heterozygous"
4,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*7B,"rs1041983,rs1799931","Homozygous,Heterozygous"
5,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*7G,"rs1041983,rs1799931","Homozygous,Heterozygous"
6,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*13A,rs1041983,Homozygous
9,chr8,18400593,rs1799930,G,A,.,PASS,"ADP=28;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.735,0.2...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:38:28:28:17:11:39.29%:1.4422E-4:67:60:14:3...,Heterozygous,rs1799930,NAT2*6B,rs1799930,Heterozygous
13,chr8,18400806,rs1208,G,A,.,PASS,"ADP=44;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.3229,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:254:44:44:0:44:100%:3.8097E-26:0:63:0:0:39:5,Homozygous,rs1208,NAT2*5B,rs1208,Homozygous
14,chr8,18400806,rs1208,G,A,.,PASS,"ADP=44;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.3229,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:254:44:44:0:44:100%:3.8097E-26:0:63:0:0:39:5,Homozygous,rs1208,NAT2*5C,rs1208,Homozygous


In [51]:
madhu['rsID_mapped_sorted'] = madhu['rsID_mapped'].apply(lambda x: ','.join(sorted(x.split(','))))
madhu

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,Zygosity,rsID_updated,Haplotype,rsID_mapped,Zygosity_mapped,rsID_mapped_sorted
0,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*4,"rs1799930,rs1041983,rs1208,rs1799931","Homozygous,Heterozygous,Homozygous,Heterozygous","rs1041983,rs1208,rs1799930,rs1799931"
1,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*6A,"rs1799930,rs1041983","Homozygous,Heterozygous","rs1041983,rs1799930"
2,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*6J,"rs1799930,rs1041983,rs1799931","Homozygous,Heterozygous,Heterozygous","rs1041983,rs1799930,rs1799931"
3,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*6O,"rs1799930,rs1041983","Homozygous,Heterozygous","rs1041983,rs1799930"
4,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*7B,"rs1041983,rs1799931","Homozygous,Heterozygous","rs1041983,rs1799931"
5,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*7G,"rs1041983,rs1799931","Homozygous,Heterozygous","rs1041983,rs1799931"
6,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:33:32:0:32:100%:5.4567E-19:0:57:0:0:24:8,Homozygous,rs1041983,NAT2*13A,rs1041983,Homozygous,rs1041983
9,chr8,18400593,rs1799930,G,A,.,PASS,"ADP=28;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.735,0.2...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:38:28:28:17:11:39.29%:1.4422E-4:67:60:14:3...,Heterozygous,rs1799930,NAT2*6B,rs1799930,Heterozygous,rs1799930
13,chr8,18400806,rs1208,G,A,.,PASS,"ADP=44;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.3229,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:254:44:44:0:44:100%:3.8097E-26:0:63:0:0:39:5,Homozygous,rs1208,NAT2*5B,rs1208,Homozygous,rs1208
14,chr8,18400806,rs1208,G,A,.,PASS,"ADP=44;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.3229,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:254:44:44:0:44:100%:3.8097E-26:0:63:0:0:39:5,Homozygous,rs1208,NAT2*5C,rs1208,Homozygous,rs1208


In [69]:
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KHAIGPRX959_final_DP.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
df.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
df['HET'] = df['INFO'].str.extract(r'HET=(\d)')
df['HOM'] = df['INFO'].str.extract(r'HOM=(\d)')
# Create a new column 'Zygosity' based on conditions
df['Zygosity'] = ''
df.loc[df['HOM'] == '1', 'Zygosity'] = 'Homozygous'
df.loc[df['HET'] == '1', 'Zygosity'] = 'Heterozygous'
df = df.drop(columns=['HET', 'HOM'], axis = 1)
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/RSIDS_POSITION_DBSNP.xlsx')
df = pd.merge(df, data, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'inner', sort = False)
df2 = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/13_Genes_Key_Variants_Coordinates.csv', sep = '\t')
df2 = df2[['CHROM', 'POS', 'REF', 'ALT', 'Haplotype']]
merged = pd.merge(df, df2, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'left', sort = False)
madhu = merged.copy()
madhu['rsID_mapped'] = madhu.groupby('Haplotype')['rsID'].transform(lambda x: ','.join(set(x)))
madhu['Zygosity_mapped'] = madhu.groupby(['Haplotype'])['Zygosity'].transform(lambda x: ','.join(x))
madhu.drop_duplicates(subset='Haplotype', inplace=True)

madhu['rsID_mapped_sorted'] = madhu['rsID_mapped'].apply(lambda x: ','.join(sorted(x.split(','))))
madhu = madhu.drop(columns=['rsID_updated', 'Haplotype'], axis=1)
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Multiple_Positions.xlsx')
data = data.rename({'Haplotype':'Haplotype_updated', 'rsID':'rsID_mapped'}, axis = 1)
data['rsID_mapped_sorted'] = data['rsID_mapped'].apply(lambda x: ','.join(sorted(x.split(','))))
data = data.drop(columns=['rsID_mapped'], axis=1)
mapped_df = pd.merge(madhu, data, on = 'rsID_mapped_sorted', how = 'inner', sort = False)
mapped_df.drop_duplicates(subset='Haplotype_updated', inplace=True)
mapped_df = mapped_df.drop(columns=['Zygosity'], axis=1)
mapped_df = mapped_df[['rsID_mapped_sorted', 'Zygosity_mapped', 'Haplotype_updated']]
mapped_df

Unnamed: 0,rsID_mapped_sorted,Zygosity_mapped,Haplotype_updated
0,"rs1041983,rs1799930","Homozygous,Heterozygous",NAT2*6A
2,"rs1041983,rs1799930,rs1799931","Homozygous,Heterozygous,Heterozygous",NAT2*6J
3,"rs1041983,rs1799931","Homozygous,Heterozygous",NAT2*7B
5,"rs12769205,rs4244285","Heterozygous,Heterozygous",CYP2C19*2


## check the sorting

In [52]:
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KHAIGPRX959_final_DP.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
df.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
df['HET'] = df['INFO'].str.extract(r'HET=(\d)')
df['HOM'] = df['INFO'].str.extract(r'HOM=(\d)')
# Create a new column 'Zygosity' based on conditions
df['Zygosity'] = ''
df.loc[df['HOM'] == '1', 'Zygosity'] = 'Homozygous'
df.loc[df['HET'] == '1', 'Zygosity'] = 'Heterozygous'
df = df.drop(columns=['HET', 'HOM'], axis = 1)
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/RSIDS_POSITION_DBSNP.xlsx')
df = pd.merge(df, data, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'inner', sort = False)
df2 = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/13_Genes_Key_Variants_Coordinates.csv', sep = '\t')
df2 = df2[['CHROM', 'POS', 'REF', 'ALT', 'Haplotype']]
merged = pd.merge(df, df2, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'left', sort = False)
madhu = merged.copy()
madhu['rsID_mapped'] = madhu.groupby('Haplotype')['rsID'].transform(lambda x: ','.join(set(x)))
madhu['Zygosity_mapped'] = madhu.groupby(['Haplotype'])['Zygosity'].transform(lambda x: ','.join(x))
madhu.drop_duplicates(subset='Haplotype', inplace=True)

madhu['rsID_mapped_sorted'] = madhu['rsID_mapped'].apply(lambda x: ','.join(sorted(x.split(','))))
madhu = madhu.drop(columns=['rsID_updated', 'Haplotype'], axis=1)
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Multiple_Positions.xlsx')
data = data.rename({'Haplotype':'Haplotype_updated', 'rsID':'rsID_mapped'}, axis = 1)
data['rsID_mapped_sorted'] = data['rsID_mapped'].apply(lambda x: ','.join(sorted(x.split(','))))
data = data.drop(columns=['rsID_mapped'], axis=1)
mapped_df = pd.merge(madhu, data, on = 'rsID_mapped_sorted', how = 'inner', sort = False)
mapped_df.drop_duplicates(subset='Haplotype_updated', inplace=True)
mapped_df = mapped_df.drop(columns=['Zygosity'], axis=1)
mapped_df = mapped_df[['rsID_mapped', 'Zygosity_mapped', 'Haplotype_updated']]
mapped_df

Unnamed: 0,rsID_mapped,Zygosity_mapped,Haplotype_updated
0,"rs1799930,rs1041983","Homozygous,Heterozygous",NAT2*6A
2,"rs1799930,rs1041983,rs1799931","Homozygous,Heterozygous,Heterozygous",NAT2*6J
3,"rs1041983,rs1799931","Homozygous,Heterozygous",NAT2*7B
5,"rs12769205,rs4244285","Heterozygous,Heterozygous",CYP2C19*2


In [70]:
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KHAIGPRX959_final_DP.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
df.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
df['HET'] = df['INFO'].str.extract(r'HET=(\d)')
df['HOM'] = df['INFO'].str.extract(r'HOM=(\d)')
# Create a new column 'Zygosity' based on conditions
df['Zygosity'] = ''
df.loc[df['HOM'] == '1', 'Zygosity'] = 'Homozygous'
df.loc[df['HET'] == '1', 'Zygosity'] = 'Heterozygous'
df = df.drop(columns=['HET', 'HOM'], axis = 1)
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/RSIDS_POSITION_DBSNP.xlsx')
df = pd.merge(df, data, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'inner', sort = False)
df[df['rsID_updated'] == 'rs1799930']

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,Zygosity,rsID_updated
1,chr8,18400593,rs1799930,G,A,.,PASS,"ADP=28;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.735,0.2...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:38:28:28:17:11:39.29%:1.4422E-4:67:60:14:3...,Heterozygous,rs1799930


In [4]:
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KHAIGPRX981_final_DP.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
df.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
df['HET'] = df['INFO'].str.extract(r'HET=(\d)')
df['HOM'] = df['INFO'].str.extract(r'HOM=(\d)')
# Create a new column 'Zygosity' based on conditions
df['Zygosity'] = ''
df.loc[df['HOM'] == '1', 'Zygosity'] = 'Homozygous'
df.loc[df['HET'] == '1', 'Zygosity'] = 'Heterozygous'
df = df.drop(columns=['HET', 'HOM'], axis = 1)
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/RSIDS_POSITION_DBSNP.xlsx')
df = pd.merge(df, data, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'inner', sort = False)
df

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,Zygosity,rsID_updated
0,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983
1,chr8,18400593,rs1799930,G,A,.,PASS,"ADP=35;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.735,0.2...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:57:35:35:19:16:45.71%:1.637E-6:60:54:15:4:...,Heterozygous,rs1799930
2,chr10,94775367,rs12769205,A,G,.,PASS,"ADP=51;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.7716,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:51:51:1:50:98.04%:1.3013E-28:74:51:1:0...,Homozygous,rs12769205
3,chr19,41006936,rs3745274,G,T,.,PASS,ADP=22;WT=0;HET=1;HOM=0;NC=0;ASP;ASS;CAF=0.684...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:35:22:22:12:10:45.45%:2.6061E-4:61:55:11:1...,Heterozygous,rs3745274
4,chr19,41009358,rs2279343,A,G,.,PASS,ADP=22;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEIN...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:31:22:22:13:9:40.91%:7.0165E-4:60:52:12:1:4:5,Heterozygous,rs2279343


In [9]:
df2 = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/13_Genes_Key_Variants_Coordinates.csv', sep = '\t')
df2 = df2[['CHROM', 'POS', 'REF', 'ALT', 'Haplotype']]
df2

Unnamed: 0,CHROM,POS,REF,ALT,Haplotype
0,chr19,40848628.0,A,T,CYP2A6*2
1,chr19,40843869.0,A,G,CYP2A6*7
2,chr19,40843749.0,CC,C,CYP2A6*7
3,chr19,40843746.0,T,C,CYP2A6*7
4,chr19,40843742.0,G,A,CYP2A6*7
...,...,...,...,...,...
724,chrX,154536019.0,G,A,G6PD*Musashino
725,chrX,154536151.0,G,A,G6PD*Kambos
726,chrX,154532258.0,G,A,"G6PD*Kamiube, Keelung"
727,chrX,154546116.0,C,T,G6PD*Lages


In [10]:
merged = pd.merge(df, df2, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'left', sort = False)
merged

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,Zygosity,rsID_updated,Haplotype
0,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*4
1,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*6A
2,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*6J
3,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*6O
4,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*7B
5,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*7G
6,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*13A
7,chr8,18400593,rs1799930,G,A,.,PASS,"ADP=35;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.735,0.2...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:57:35:35:19:16:45.71%:1.637E-6:60:54:15:4:...,Heterozygous,rs1799930,NAT2*4
8,chr8,18400593,rs1799930,G,A,.,PASS,"ADP=35;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.735,0.2...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:57:35:35:19:16:45.71%:1.637E-6:60:54:15:4:...,Heterozygous,rs1799930,NAT2*6A
9,chr8,18400593,rs1799930,G,A,.,PASS,"ADP=35;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.735,0.2...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:57:35:35:19:16:45.71%:1.637E-6:60:54:15:4:...,Heterozygous,rs1799930,NAT2*6B


In [41]:
madhu = merged.copy()
madhu['rsID_mapped'] = madhu.groupby('Haplotype')['rsID'].transform(lambda x: ','.join(set(x)))
madhu['Zygosity_mapped'] = madhu.groupby(['Haplotype'])['Zygosity'].transform(lambda x: ','.join(x))
madhu.drop_duplicates(subset='Haplotype', inplace=True)
madhu

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,Zygosity,rsID_updated,Haplotype,rsID_mapped,Zygosity_mapped
0,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*4,"rs1799930,rs1041983","Heterozygous,Heterozygous"
1,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*6A,"rs1799930,rs1041983","Heterozygous,Heterozygous"
2,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*6J,"rs1799930,rs1041983","Heterozygous,Heterozygous"
3,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*6O,"rs1799930,rs1041983","Heterozygous,Heterozygous"
4,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*7B,rs1041983,Heterozygous
5,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*7G,rs1041983,Heterozygous
6,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,rs1041983,NAT2*13A,rs1041983,Heterozygous
9,chr8,18400593,rs1799930,G,A,.,PASS,"ADP=35;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.735,0.2...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:57:35:35:19:16:45.71%:1.637E-6:60:54:15:4:...,Heterozygous,rs1799930,NAT2*6B,rs1799930,Heterozygous
12,chr10,94775367,rs12769205,A,G,.,PASS,"ADP=51;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.7716,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:51:51:1:50:98.04%:1.3013E-28:74:51:1:0...,Homozygous,rs12769205,CYP2C19*2,rs12769205,Homozygous
13,chr19,41006936,rs3745274,G,T,.,PASS,ADP=22;WT=0;HET=1;HOM=0;NC=0;ASP;ASS;CAF=0.684...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:35:22:22:12:10:45.45%:2.6061E-4:61:55:11:1...,Heterozygous,rs3745274,CYP2B6*6,"rs2279343,rs3745274","Heterozygous,Heterozygous"


In [42]:
madhu['rsID_mapped_sorted'] = madhu['rsID_mapped'].apply(lambda x: ','.join(sorted(x.split(','))))
madhu = madhu.drop(columns=['rsID_mapped', 'rsID_updated', 'Haplotype'], axis=1)
madhu

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,Zygosity,Zygosity_mapped,rsID_mapped_sorted
0,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,"Heterozygous,Heterozygous","rs1041983,rs1799930"
1,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,"Heterozygous,Heterozygous","rs1041983,rs1799930"
2,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,"Heterozygous,Heterozygous","rs1041983,rs1799930"
3,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,"Heterozygous,Heterozygous","rs1041983,rs1799930"
4,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,Heterozygous,rs1041983
5,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,Heterozygous,rs1041983
6,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,Heterozygous,Heterozygous,rs1041983
9,chr8,18400593,rs1799930,G,A,.,PASS,"ADP=35;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.735,0.2...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:57:35:35:19:16:45.71%:1.637E-6:60:54:15:4:...,Heterozygous,Heterozygous,rs1799930
12,chr10,94775367,rs12769205,A,G,.,PASS,"ADP=51;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.7716,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:51:51:1:50:98.04%:1.3013E-28:74:51:1:0...,Homozygous,Homozygous,rs12769205
13,chr19,41006936,rs3745274,G,T,.,PASS,ADP=22;WT=0;HET=1;HOM=0;NC=0;ASP;ASS;CAF=0.684...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:35:22:22:12:10:45.45%:2.6061E-4:61:55:11:1...,Heterozygous,"Heterozygous,Heterozygous","rs2279343,rs3745274"


In [43]:
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Multiple_Positions.xlsx')
data = data.rename({'Haplotype':'Haplotype_updated', 'rsID':'rsID_mapped'}, axis = 1)
data['rsID_mapped_sorted'] = data['rsID_mapped'].apply(lambda x: ','.join(sorted(x.split(','))))
data = data.drop(columns=['rsID_mapped'], axis=1)
data

Unnamed: 0,Haplotype_updated,rsID_mapped_sorted
0,NAT2*7B,"rs1041983,rs1799931"
1,NAT2*7G,"chr8_18400229_T_G,rs1041983,rs1799931"
2,CYP2D6*10,"rs1058164,rs1065852,rs1135840"
3,CYP2D6*2,"rs1058164,rs1135840,rs16947"
4,NAT2*6A,"rs1041983,rs1799930"
5,NAT2*6J,"rs1041983,rs1799930,rs1799931"
6,NAT2*6O,"rs1041983,rs1799930,rs56393504"
7,TPMT*3A,"rs1142345,rs1800460"
8,NAT2*5C,"rs1208,rs1801280"
9,NAT2*5A,"rs1799929,rs1801280"


In [45]:
mapped_df = pd.merge(madhu, data, on = 'rsID_mapped_sorted', how = 'inner', sort = False)
mapped_df.drop_duplicates(subset='Haplotype_updated', inplace=True)
mapped_df = mapped_df.drop(columns=['Zygosity'], axis=1)
mapped_df

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,Zygosity_mapped,rsID_mapped_sorted,Haplotype_updated
0,chr8,18400285,rs1041983,C,T,.,PASS,"ADP=34;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6026,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:77:34:34:14:20:58.82%:1.6951E-8:58:56:13:1...,"Heterozygous,Heterozygous","rs1041983,rs1799930",NAT2*6A
4,chr19,41006936,rs3745274,G,T,.,PASS,ADP=22;WT=0;HET=1;HOM=0;NC=0;ASP;ASS;CAF=0.684...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:35:22:22:12:10:45.45%:2.6061E-4:61:55:11:1...,"Heterozygous,Heterozygous","rs2279343,rs3745274",CYP2B6*6
5,chr19,41006936,rs3745274,G,T,.,PASS,ADP=22;WT=0;HET=1;HOM=0;NC=0;ASP;ASS;CAF=0.684...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:35:22:22:12:10:45.45%:2.6061E-4:61:55:11:1...,"Heterozygous,Heterozygous","rs2279343,rs3745274",CYP2B6*26


# zygosity classes excel files 

In [74]:
import pandas as pd
import os
import sys

# Assuming df is your DataFrame with 'Zygosity' column
# You may need to replace 'your_data.csv' with your actual data file
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/All_1232_updated_haplo_data_16_12_2023.xlsx')

# Get unique values in the 'Zygosity' column
zygosity_values = df['Zygosity_mapped'].unique()

# Create a folder to save the Excel files
output_folder = r'C:/Users/GenepoweRx_Madhu/Downloads/new_haplo_data_16_12_2023/'
os.makedirs(output_folder, exist_ok=True)

# Iterate through unique zygosity values
for zygosity_value in zygosity_values:
    # Create a subset DataFrame for each zygosity value
    subset_df = df[df['Zygosity_mapped'] == zygosity_value]

    # Save the subset as an Excel file
    output_filename = os.path.join(output_folder, f'{zygosity_value}.xlsx')
    subset_df.to_excel(output_filename, index=False)

print("Excel files saved successfully.")

Excel files saved successfully.


In [78]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/All_1232_updated_haplo_data_16_12_2023.xlsx')
df

Unnamed: 0,rsID_mapped_sorted,Zygosity_mapped,Haplotype_updated,Sample
0,"rs1041983,rs1799931","Heterozygous,Heterozygous",NAT2*7B,KHAIGPRX9
1,"rs1799929,rs1801280","Heterozygous,Heterozygous",NAT2*5A,KHAIGPRX9
2,"rs1208,rs1799929,rs1801280","Heterozygous,Heterozygous,Heterozygous",NAT2*5B,KHAIGPRX9
3,"rs1208,rs1801280","Heterozygous,Heterozygous",NAT2*5C,KHAIGPRX9
4,"rs1041983,rs1799930","Heterozygous,Heterozygous",NAT2*6A,KHAIGPRX99
...,...,...,...,...
3763,"rs12769205,rs4244285","Heterozygous,Heterozygous",CYP2C19*2,KHAIGPRX1005
3764,"rs1799929,rs1801280","Homozygous,Homozygous",NAT2*5A,KHAIGPRX1002
3765,"rs1041983,rs1799931","Heterozygous,Heterozygous",NAT2*7B,KHAIGPRX1001
3766,"rs1041983,rs1799930","Heterozygous,Heterozygous",NAT2*6A,KHAIGPRX1000


In [85]:
new_df = df.copy()
new_df['rsID_new'] = new_df['rsID_mapped_sorted'].str.split(',')
new_df['Zygosity_new'] = new_df['Zygosity_mapped'].str.split(',')
# Explode the lists into separate rows
new_df = new_df.explode('rsID_new')
new_df = new_df.explode('Zygosity_new')
new_df.drop_duplicates(subset=['rsID_new', 'Zygosity_new', 'Sample'], inplace=True)
new_df

Unnamed: 0,rsID_mapped_sorted,Zygosity_mapped,Haplotype_updated,Sample,rsID_new,Zygosity_new
0,"rs1041983,rs1799931","Heterozygous,Heterozygous",NAT2*7B,KHAIGPRX9,rs1041983,Heterozygous
0,"rs1041983,rs1799931","Heterozygous,Heterozygous",NAT2*7B,KHAIGPRX9,rs1799931,Heterozygous
1,"rs1799929,rs1801280","Heterozygous,Heterozygous",NAT2*5A,KHAIGPRX9,rs1799929,Heterozygous
1,"rs1799929,rs1801280","Heterozygous,Heterozygous",NAT2*5A,KHAIGPRX9,rs1801280,Heterozygous
2,"rs1208,rs1799929,rs1801280","Heterozygous,Heterozygous,Heterozygous",NAT2*5B,KHAIGPRX9,rs1208,Heterozygous
...,...,...,...,...,...,...
3765,"rs1041983,rs1799931","Heterozygous,Heterozygous",NAT2*7B,KHAIGPRX1001,rs1799931,Heterozygous
3766,"rs1041983,rs1799930","Heterozygous,Heterozygous",NAT2*6A,KHAIGPRX1000,rs1041983,Heterozygous
3766,"rs1041983,rs1799930","Heterozygous,Heterozygous",NAT2*6A,KHAIGPRX1000,rs1799930,Heterozygous
3767,"rs1799929,rs1801280","Heterozygous,Heterozygous",NAT2*5A,KHAIGPRX1000,rs1799929,Heterozygous


In [88]:
grouped_new = new_df.groupby(['rsID_new', 'Zygosity_new', 'Haplotype_updated']).agg({'Sample': lambda x: ','.join(x.unique())}).reset_index()
grouped_new['Sample_count'] = grouped_new['Sample'].apply(lambda x: len(x.split(',')))
grouped_new

Unnamed: 0,rsID_new,Zygosity_new,Haplotype_updated,Sample,Sample_count
0,rs1041983,Heterozygous,NAT2*6A,"KHAIGPRX99,KHAIGPRX997,KHAIGPRX995,KHAIGPRX991...",488
1,rs1041983,Heterozygous,NAT2*6J,"KHAIGPRX559,KHAIGPRX544,KHAIGPRX1078,KHAIGPRX1049",4
2,rs1041983,Heterozygous,NAT2*7B,"KHAIGPRX9,KHAIGPRX983,KHAIGPRX97,KHAIGPRX951,K...",89
3,rs1041983,Homozygous,NAT2*6A,"KHAIGPRX999,KHAIGPRX998,KHAIGPRX989,KHAIGPRX97...",178
4,rs1041983,Homozygous,NAT2*6J,"KHAIGPRX559,KHAIGPRX544,KHAIGPRX1078,KHAIGPRX1049",4
5,rs1041983,Homozygous,NAT2*7B,"KHAIGPRX97,KHAIGPRX90,KHAIGPRX758,KHAIGPRX662,...",7
6,rs11045819,Heterozygous,SLCO1B1*14,"KHAIGPRX952_RS,KHAIGPRX94,KHAIGPRX932,KHAIGPRX...",33
7,rs11045819,Homozygous,SLCO1B1*14,"KHAIGPRX882,KHAIGPRX852,KHAIGPRX821,KHAIGPRX77...",18
8,rs1142345,Heterozygous,TPMT*3A,"KHAIGPRX636,KHAIGPRX503,KHAIGPRX446,KHAIGPRX37...",6
9,rs1208,Heterozygous,NAT2*5B,"KHAIGPRX9,KHAIGPRX99,KHAIGPRX997,KHAIGPRX992,K...",422


In [89]:
grouped_new.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/haplotype_rsID_data.xlsx', index = False)

In [77]:
grouped = df.groupby(['rsID_mapped_sorted', 'Zygosity_mapped', 'Haplotype_updated']).agg({'Sample': lambda x: ', '.join(x.unique())}).reset_index()
grouped['Sample_count'] = grouped['Sample'].apply(lambda x: len(x.split(',')))
grouped.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/new_haplo_data_16_12_2023/1232_samples_Haplotype_updated.xlsx', index=False)
grouped

Unnamed: 0,rsID_mapped_sorted,Zygosity_mapped,Haplotype_updated,Sample,Sample_count
0,"rs1041983,rs1799930","Heterozygous,Heterozygous",NAT2*6A,"KHAIGPRX99, KHAIGPRX997, KHAIGPRX995, KHAIGPRX...",434
1,"rs1041983,rs1799930","Heterozygous,Homozygous",NAT2*6A,"KHAIGPRX431, KHAIGPRX279",2
2,"rs1041983,rs1799930","Homozygous,Heterozygous",NAT2*6A,"KHAIGPRX989, KHAIGPRX959, KHAIGPRX909, KHAIGPR...",56
3,"rs1041983,rs1799930","Homozygous,Homozygous",NAT2*6A,"KHAIGPRX999, KHAIGPRX998, KHAIGPRX972, KHAIGPR...",124
4,"rs1041983,rs1799930,rs1799931","Homozygous,Heterozygous,Heterozygous",NAT2*6J,"KHAIGPRX989, KHAIGPRX959, KHAIGPRX909, KHAIGPR...",52
5,"rs1041983,rs1799930,rs56393504","Heterozygous,Heterozygous,Heterozygous",NAT2*6O,KHAIGPRX370,1
6,"rs1041983,rs1799931","Heterozygous,Heterozygous",NAT2*7B,"KHAIGPRX9, KHAIGPRX983, KHAIGPRX951, KHAIGPRX9...",86
7,"rs1041983,rs1799931","Homozygous,Heterozygous",NAT2*7B,"KHAIGPRX989, KHAIGPRX97, KHAIGPRX959, KHAIGPRX...",55
8,"rs1041983,rs1799931","Homozygous,Homozygous",NAT2*7B,"KHAIGPRX90, KHAIGPRX758, KHAIGPRX662, KHAIGPRX456",4
9,"rs11045819,rs2306283","Heterozygous,Heterozygous",SLCO1B1*14,"KHAIGPRX952_RS, KHAIGPRX94, KHAIGPRX932, KHAIG...",15


# condition3_ haplotype_new mapped genes positions along with mapped data... 

In [90]:
import pandas as pd
variants = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/haplo_20_samples/KHAIGPRX1_final_DP.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
variants.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
variants

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,11786195,rs4846048,G,A,.,PASS,"ADP=28;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.2935,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:51:28:28:14:14:50%:6.911E-6:55:44:10:4:8:6
1,chr1,11786390,rs4845884,G,A,.,PASS,"ADP=28;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.06669,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:158:28:28:0:28:100%:1.3074E-16:0:53:0:0:23:5
2,chr1,11787392,rs3737967,G,A,.,PASS,"ADP=25;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9257,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:39:25:25:14:11:44%:1.1933E-4:54:47:14:0:9:2
3,chr1,11787715,rs3820192,G,T,.,PASS,"ADP=24;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8876,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:27:24:24:16:8:33.33%:1.949E-3:47:50:8:8:3:5
4,chr1,11788011,rs1537514,G,C,.,PASS,"ADP=29;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8824,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:46:29:29:16:13:44.83%:2.1506E-5:59:50:11:5...
...,...,...,...,...,...,...,...,...,...,...
108,chr21,45532180,rs914231,T,C,.,PASS,"ADP=17;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.4856,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:43:17:17:6:11:64.71%:4.3258E-5:47:47:4:2:5:6
109,chr21,45534483,rs150745916,A,C,.,PASS,"ADP=23;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.998,0.0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:54:23:23:9:14:60.87%:3.4067E-6:42:50:7:2:13:1
110,chrX,154531643,rs1050757,C,T,.,PASS,"ADP=28;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.5846,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:158:28:28:0:28:100%:1.3074E-16:0:44:0:0:16:12
111,chrX,154532293,rs2071429,G,A,.,PASS,"ADP=19;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.5852,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:105:19:19:0:19:100%:2.8292E-11:0:47:0:0:16:3


In [91]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/haplo_20_samples/13genes_coordinates_haplotypes_07122023.xlsx', header = None)
df.columns = ['Gene', 'chromosome', 'Extended_Start_pos', 'Extended_End_pos']
df

Unnamed: 0,Gene,chromosome,Extended_Start_pos,Extended_End_pos
0,GENE,CHR,Start_POS,END_POS
1,CYP2A6,chr19,40843541,40850447
2,CYP2B6,chr19,40991282,41018398
3,CYP2C19,chr10,94762681,94855547
4,CYP2C9,chr10,94938658,94990091
5,CYP2D6,chr22,42126499,42130810
6,CYP3A4,chr7,99756967,99784184
7,CYP3A5,chr7,99648194,99679996
8,NAT2,chr8,18391282,18401218
9,NUDT15,chr13,48037726,48047221


In [92]:
# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    gene = row['Gene']  # Assuming 'Gene' is the name of your gene column
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos, gene))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end, gene in ranges:
            if start <= pos <= end:
                return 'Covered', start, end, gene
    return 'Not_Covered', None, None, None  # Return None for start, end, and gene if not covered

# Step 3: Apply the function to create the new columns in data
variants['Covered/Not_Covered'], variants['Start_Pos_Covered'], variants['End_Pos_Covered'], variants['Gene'] = zip(*variants.apply(check_coverage, axis=1))

# Step 4: Create new columns for Covered rows
covered_rows = variants['Covered/Not_Covered'] == 'Covered'
variants.loc[covered_rows, 'Covered_Chromosome'] = variants.loc[covered_rows, 'CHROM']
variants.loc[covered_rows, 'Covered_Start_Pos'] = variants.loc[covered_rows, 'Start_Pos_Covered']
variants.loc[covered_rows, 'Covered_End_Pos'] = variants.loc[covered_rows, 'End_Pos_Covered']
variants.loc[covered_rows, 'Gene'] = variants.loc[covered_rows, 'Gene']

# Drop temporary columns
variants.drop(['Start_Pos_Covered', 'End_Pos_Covered'], axis=1, inplace=True)

# Display the DataFrame
variants = variants[variants['Covered/Not_Covered'] == 'Covered']
variants

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,Covered/Not_Covered,Gene,Covered_Chromosome,Covered_Start_Pos,Covered_End_Pos
45,chr6,18128428,rs7886,T,A,.,PASS,"ADP=17;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.2372,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:24:17:17:10:7:41.18%:3.6151E-3:40:47:1:9:3:4,Covered,TPMT,chr6,18128311.0,18155077.0
52,chr7,99662739,rs4646453,C,A,.,PASS,"ADP=15;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8546,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:34:15:15:6:9:60%:3.4983E-4:55:43:5:1:8:1,Covered,CYP3A5,chr7,99648194.0,99679996.0
53,chr7,99763843,rs2242480,C,T,.,PASS,"ADP=26;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.5783,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:146:26:26:0:26:100%:2.0165E-15:0:49:0:0:21:5,Covered,CYP3A4,chr7,99756967.0,99784184.0
56,chr8,18400344,rs1801280,T,C,.,PASS,"ADP=25;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.7073,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:48:25:25:12:13:52%:1.4654E-5:49:61:9:3:10:3,Covered,NAT2,chr8,18391282.0,18401218.0
57,chr8,18400806,rs1208,G,A,.,PASS,"ADP=23;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3229,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:49:23:23:10:13:56.52%:1.1242E-5:54:57:8:2:...,Covered,NAT2,chr8,18391282.0,18401218.0
58,chr10,94762804,rs17885098,C,T,.,PASS,"ADP=23;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9081,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:129:23:23:0:23:100%:1.2146E-13:0:54:0:0:17:6,Covered,CYP2C19,chr10,94762681.0,94855547.0
59,chr10,94942093,rs9332120,T,C,.,PASS,"ADP=44;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.8558,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:254:44:44:0:44:100%:3.8097E-26:0:52:0:0:31:13,Covered,CYP2C9,chr10,94938658.0,94990091.0
60,chr12,21239628,rs4149087,T,G,.,PASS,"ADP=19;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5523,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:28:19:19:11:8:42.11%:1.5455E-3:43:44:8:3:6:2,Covered,SLCO1B1,chr12,21131194.0,21239796.0
61,chr12,21239652,rs4149088,A,G,.,PASS,"ADP=17;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5515,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:20:17:17:11:6:35.29%:9.2021E-3:56:57:8:3:5:1,Covered,SLCO1B1,chr12,21131194.0,21239796.0
62,chr13,48040905,rs747497670,G,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=NUDT...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:24:18:18:11:7:38.89%:3.8123E-3:56:41:11:0:7:0,Covered,NUDT15,chr13,48037726.0,48047221.0


In [None]:
RSIDS_POSITION_DBSNP