In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px

pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [2]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/sen_spe_files_07_09_2023/VCC_input_files/12652705_BCFTOOL.vcf', comment= '#', sep = '\t', header=None, low_memory=False, encoding='latin-1')
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,817514,rs3131971,T,C,118.3520,.,"DP=248;ADF=0,201;ADR=1,28;AD=1,229;SCR=0;VDB=0...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:145,255,0:230:0:0,201:1,28:1,229:0"
1,chr1,826893,rs3115849,G,A,34.4159,.,"DP=97;ADF=0,38;ADR=0,55;AD=0,93;SCR=0;VDB=0.66...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:64,255,0:93:0:0,38:0,55:0,93:0"
2,chr1,930939,rs9988021,G,A,225.4170,.,"DP=67;ADF=0,53;ADR=0,6;AD=0,59;SCR=0;VDB=0.002...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:255,178,0:59:0:0,53:0,6:0,59:0"
3,chr1,941119,rs4372192,A,G,205.4170,.,"DP=23;ADF=0,15;ADR=0,2;AD=0,17;SCR=0;VDB=0.124...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:235,51,0:17:0:0,15:0,2:0,17:0"
4,chr1,942451,rs6672356,T,C,34.4159,.,"DP=2;ADF=0,2;ADR=0,0;AD=0,2;SCR=0;VDB=0.02;SGB...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:64,6,0:2:0:0,2:0,0:0,2:0"
...,...,...,...,...,...,...,...,...,...,...
79442,chrY,1387425,.,C,G,23.4340,.,"DP=65;ADF=0,14;ADR=0,41;AD=0,55;SCR=0;VDB=0.59...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:53,166,0:55:0:0,14:0,41:0,55:0"
79443,chrY,1418109,.,C,G,25.4267,.,"DP=111;ADF=0,26;ADR=0,73;AD=0,99;SCR=0;VDB=0.3...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:55,255,0:99:0:0,26:0,73:0,99:0"
79444,chrY,1425778,.,G,A,26.4242,.,"DP=79;ADF=0,17;ADR=0,52;AD=0,69;SCR=0;VDB=0.61...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:56,208,0:69:0:0,17:0,52:0,69:0"
79445,chrY,1428198,.,C,A,23.4340,.,"DP=57;ADF=0,12;ADR=0,37;AD=0,49;SCR=0;VDB=0.03...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:53,148,0:49:0:0,12:0,37:0,49:0"


In [3]:
vcf['AD'] = vcf['SAMPLE'].str.split(':').str[6]
vcf['RD'] = vcf['AD'].str.split(',').str[0].astype(int)
vcf['A_D'] = vcf['AD'].str.split(',').str[1].astype(int)
vcf['VAF'] = vcf['A_D'] / (vcf['RD'] + vcf['A_D'])
vcf['DP'] = vcf['SAMPLE'].str.split(':').str[2].fillna('0').astype(int)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,AD,RD,A_D,VAF,DP
0,chr1,817514,rs3131971,T,C,118.3520,.,"DP=248;ADF=0,201;ADR=1,28;AD=1,229;SCR=0;VDB=0...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:145,255,0:230:0:0,201:1,28:1,229:0",1229,1,229,0.995652,230
1,chr1,826893,rs3115849,G,A,34.4159,.,"DP=97;ADF=0,38;ADR=0,55;AD=0,93;SCR=0;VDB=0.66...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:64,255,0:93:0:0,38:0,55:0,93:0",093,0,93,1.000000,93
2,chr1,930939,rs9988021,G,A,225.4170,.,"DP=67;ADF=0,53;ADR=0,6;AD=0,59;SCR=0;VDB=0.002...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:255,178,0:59:0:0,53:0,6:0,59:0",059,0,59,1.000000,59
3,chr1,941119,rs4372192,A,G,205.4170,.,"DP=23;ADF=0,15;ADR=0,2;AD=0,17;SCR=0;VDB=0.124...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:235,51,0:17:0:0,15:0,2:0,17:0",017,0,17,1.000000,17
4,chr1,942451,rs6672356,T,C,34.4159,.,"DP=2;ADF=0,2;ADR=0,0;AD=0,2;SCR=0;VDB=0.02;SGB...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:64,6,0:2:0:0,2:0,0:0,2:0",02,0,2,1.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79442,chrY,1387425,.,C,G,23.4340,.,"DP=65;ADF=0,14;ADR=0,41;AD=0,55;SCR=0;VDB=0.59...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:53,166,0:55:0:0,14:0,41:0,55:0",055,0,55,1.000000,55
79443,chrY,1418109,.,C,G,25.4267,.,"DP=111;ADF=0,26;ADR=0,73;AD=0,99;SCR=0;VDB=0.3...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:55,255,0:99:0:0,26:0,73:0,99:0",099,0,99,1.000000,99
79444,chrY,1425778,.,G,A,26.4242,.,"DP=79;ADF=0,17;ADR=0,52;AD=0,69;SCR=0;VDB=0.61...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:56,208,0:69:0:0,17:0,52:0,69:0",069,0,69,1.000000,69
79445,chrY,1428198,.,C,A,23.4340,.,"DP=57;ADF=0,12;ADR=0,37;AD=0,49;SCR=0;VDB=0.03...",GT:PL:DP:SP:ADF:ADR:AD:SCR,"1/1:53,148,0:49:0:0,12:0,37:0,49:0",049,0,49,1.000000,49


In [4]:
vcf_1 = vcf[vcf['VAF'] >= 0.1]
vcf_1.shape

(79447, 15)

In [13]:
vcf['DP'] = vcf['INFO'].str.extract(r'DP=(\d+)')[0].fillna('0').astype(int)
vcf['Allele_Count'] = vcf['ALT'].apply(lambda x: len(x))
vcf['VAF'] = vcf['Allele_Count'] / vcf['DP']
vcf_new = vcf[vcf['VAF'] >= 0.01]
vcf_new

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,DP,Allele_Count,VAF
1,chr1,826893,rs3115849,G,A,34.42,.,"AC=2;AN=2;ASP;CAF=0.2682,0.7318;COMMON=1;CSQ=A...",GT:PL,"1/1:64,255,0",97,1,0.010309
2,chr1,930939,rs9988021,G,A,225.42,.,"AC=2;AN=2;ASP;CAF=0.04653,0.9535,.;COMMON=1;CS...",GT:PL,"1/1:255,178,0",67,1,0.014925
3,chr1,941119,rs4372192,A,G,205.42,.,"AC=2;AN=2;ASP;CAF=0.08686,0.9131;COMMON=1;CSQ=...",GT:PL,"1/1:235,51,0",23,1,0.043478
4,chr1,942451,rs6672356,T,C,34.42,.,"AC=2;AN=2;ASP;CAF=0,1;COMMON=0;CSQ=C|downstrea...",GT:PL,"1/1:64,6,0",2,1,0.500000
5,chr1,944296,rs6605067,G,A,71.41,.,"AC=2;AN=2;ASP;CAF=0.114,0.886;COMMON=1;CSQ=A|3...",GT:PL,"1/1:101,9,0",3,1,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79440,chrX,154929926,rs1800292,T,G,222.38,.,"AC=1;AN=2;ASP;BQB=0.999581;CAF=0.8395,0.1605;C...",GT:PL,"0/1:255,0,255",86,1,0.011628
79441,chrY,1286017,.,C,A,27.42,.,AC=2;AN=2;CSQ=A|intron_variant|MODIFIER|CSF2RA...,GT:PL,"1/1:57,151,0",54,1,0.018519
79442,chrY,1387425,.,C,G,23.43,.,AC=2;AN=2;CSQ=G|downstream_gene_variant|MODIFI...,GT:PL,"1/1:53,166,0",65,1,0.015385
79444,chrY,1425778,.,G,A,26.42,.,AC=2;AN=2;CSQ=A|intron_variant|MODIFIER|ASMTL|...,GT:PL,"1/1:56,208,0",79,1,0.012658


In [14]:
vcf_new = vcf_new[vcf_new['DP'] >= 10]
vcf_new

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,DP,Allele_Count,VAF
1,chr1,826893,rs3115849,G,A,34.42,.,"AC=2;AN=2;ASP;CAF=0.2682,0.7318;COMMON=1;CSQ=A...",GT:PL,"1/1:64,255,0",97,1,0.010309
2,chr1,930939,rs9988021,G,A,225.42,.,"AC=2;AN=2;ASP;CAF=0.04653,0.9535,.;COMMON=1;CS...",GT:PL,"1/1:255,178,0",67,1,0.014925
3,chr1,941119,rs4372192,A,G,205.42,.,"AC=2;AN=2;ASP;CAF=0.08686,0.9131;COMMON=1;CSQ=...",GT:PL,"1/1:235,51,0",23,1,0.043478
7,chr1,944858,rs3748592,A,G,225.42,.,"AC=2;AN=2;ASP;CAF=0.07907,0.9209;COMMON=1;CSQ=...",GT:PL,"1/1:255,36,0",14,1,0.071429
8,chr1,946653,rs2272756,G,A,191.42,.,"AC=2;AN=2;ASP;CAF=0.8632,0.1368,.;COMMON=1;CSQ...",GT:PL,"1/1:221,72,0",24,1,0.041667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79440,chrX,154929926,rs1800292,T,G,222.38,.,"AC=1;AN=2;ASP;BQB=0.999581;CAF=0.8395,0.1605;C...",GT:PL,"0/1:255,0,255",86,1,0.011628
79441,chrY,1286017,.,C,A,27.42,.,AC=2;AN=2;CSQ=A|intron_variant|MODIFIER|CSF2RA...,GT:PL,"1/1:57,151,0",54,1,0.018519
79442,chrY,1387425,.,C,G,23.43,.,AC=2;AN=2;CSQ=G|downstream_gene_variant|MODIFI...,GT:PL,"1/1:53,166,0",65,1,0.015385
79444,chrY,1425778,.,G,A,26.42,.,AC=2;AN=2;CSQ=A|intron_variant|MODIFIER|ASMTL|...,GT:PL,"1/1:56,208,0",79,1,0.012658


In [23]:
vcf['REF_Count'] = vcf['REF'].apply(lambda x: len(x))
vcf['ALT_Count'] = vcf['ALT'].apply(lambda x: len(x))
non_matching_rows = vcf[vcf['REF_Count'] != vcf['ALT_Count']]
non_matching_rows

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,REF_Count,ALT_Count
9,chr1,935662,rs199537431,C,CA,42.0,PASS,"ASP;CAF=0.9916,0.008387;COMMON=1;GENEINFO=SAMD...",GT:GQ:DP:AD:VAF:PL,"0/1:24:51:19,32:0.627451:41,0,24",1,2
10,chr1,939398,rs568340123,GCCTCCCCAGCCACGGTGAGGACCCACCCTGGCATGATCCCCCTCATCA,G,0.0,RefCall,ASP;GENEINFO=SAMD11:148398;RS=568340123;RSPOS=...,GT:GQ:DP:AD:VAF:PL,"0/0:23:97:78,19:0.195876:0,23,39",49,1
17,chr1,945259,rs35916504;rs397844509,TC,T,46.2,PASS,"ASP;CAF=0.8119,0.1881,0.8119,0.1881;COMMON=1,1...",GT:GQ:DP:AD:VAF:PL,"1/1:35:26:1,25:0.961538:46,34,0",2,1
22,chr1,948728,.,T,TC,0.2,RefCall,CSQ=C|intron_variant|MODIFIER|NOC2L|ENSG000001...,GT:GQ:DP:AD:VAF:PL,"./.:14:13:11,2:0.153846:0,14,34",1,2
23,chr1,948730,.,C,CCT,0.0,RefCall,CSQ=CT|intron_variant|MODIFIER|NOC2L|ENSG00000...,GT:GQ:DP:AD:VAF:PL,"0/0:28:12:10,2:0.166667:0,29,33",1,3
...,...,...,...,...,...,...,...,...,...,...,...,...
113459,chrX,155089365,rs1434044365,ATG,A,0.0,RefCall,ASP;GENEINFO=BRCC3:79184;INT;RS=1434044365;RSP...,GT:GQ:DP:AD:VAF:PL,"0/0:30:85:52,33:0.388235:0,30,44",3,1
113460,chrX,155099185,rs782693515,G,GT,0.0,RefCall,ASP;GENEINFO=BRCC3:79184;INT;RS=782693515;RSPO...,GT:GQ:DP:AD:VAF:PL,"0/0:30:14:11,2:0.142857:0,29,50",1,2
113461,chrX,155228363,rs781912204,A,AT,0.0,RefCall,ASP;GENEINFO=VBP1:7411;INT;NOV;RS=781912204;RS...,GT:GQ:DP:AD:VAF:PL,"0/0:33:37:29,7:0.189189:0,33,57",1,2
113462,chrX,155491679,.,CA,C,0.0,RefCall,CSQ=-|splice_polypyrimidine_tract_variant&intr...,GT:GQ:DP:AD:VAF:PL,"0/0:24:151:97,41:0.271523:0,25,28",2,1


In [20]:
a = str('GGAGAGAGAGAGAGAGAGAG')
len(a)

20

In [12]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/sen_spe_files_07_09_2023/VCC_input_files/12652705_VARSCAN2.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,69511,rs2691305,A,G,.,PASS,ADP=211;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:212:211:0:210:99.53%:9.4917E-126:0:39:...
1,chr1,817514,rs3131971,T,C,.,PASS,"ADP=368;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2468,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:369:368:1:367:99.73%:3.4721E-218:25:38...
2,chr1,826893,rs3115849,G,A,.,PASS,"ADP=91;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2682,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:93:91:0:91:100%:2.7621E-54:0:36:0:0:36:55
3,chr1,827209,rs3115848,G,C,.,PASS,"ADP=125;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2484,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:127:125:15:110:88%:5.9951E-55:36:38:0:...
4,chr1,827212,rs3131950,C,G,.,PASS,"ADP=128;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2484,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:129:128:15:113:88.28%:1.325E-56:36:38:...
...,...,...,...,...,...,...,...,...,...,...
69181,chrY,57084531,.,A,G,.,PASS,ADP=62;WT=0;HET=1;HOM=0;NC=0;CSQ=G|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:88:62:62:37:25:40.32%:1.4168E-9:38:40:36:1...
69182,chrY,57128393,.,G,C,.,PASS,ADP=223;WT=0;HET=1;HOM=0;NC=0;CSQ=C|splice_pol...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:255:223:223:112:111:49.78%:3.4158E-42:38:4...
69183,chrY,57184462,.,T,G,.,PASS,ADP=43;WT=0;HET=1;HOM=0;NC=0;CSQ=G|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:72:43:43:23:20:46.51%:5.2508E-8:38:37:0:23...
69184,chrY,57189953,.,T,C,.,PASS,ADP=50;WT=0;HET=1;HOM=0;NC=0;CSQ=C|splice_poly...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:70:50:50:30:20:40%:8.793E-8:37:37:15:15:11:9


In [13]:
vcf['DP'] = vcf['SAMPLE'].str.split(':').str[3].fillna('0').astype(int)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,DP
0,chr1,69511,rs2691305,A,G,.,PASS,ADP=211;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:212:211:0:210:99.53%:9.4917E-126:0:39:...,211
1,chr1,817514,rs3131971,T,C,.,PASS,"ADP=368;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2468,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:369:368:1:367:99.73%:3.4721E-218:25:38...,368
2,chr1,826893,rs3115849,G,A,.,PASS,"ADP=91;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2682,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:93:91:0:91:100%:2.7621E-54:0:36:0:0:36:55,91
3,chr1,827209,rs3115848,G,C,.,PASS,"ADP=125;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2484,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:127:125:15:110:88%:5.9951E-55:36:38:0:...,125
4,chr1,827212,rs3131950,C,G,.,PASS,"ADP=128;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2484,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:129:128:15:113:88.28%:1.325E-56:36:38:...,128
...,...,...,...,...,...,...,...,...,...,...,...
69181,chrY,57084531,.,A,G,.,PASS,ADP=62;WT=0;HET=1;HOM=0;NC=0;CSQ=G|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:88:62:62:37:25:40.32%:1.4168E-9:38:40:36:1...,62
69182,chrY,57128393,.,G,C,.,PASS,ADP=223;WT=0;HET=1;HOM=0;NC=0;CSQ=C|splice_pol...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:255:223:223:112:111:49.78%:3.4158E-42:38:4...,223
69183,chrY,57184462,.,T,G,.,PASS,ADP=43;WT=0;HET=1;HOM=0;NC=0;CSQ=G|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:72:43:43:23:20:46.51%:5.2508E-8:38:37:0:23...,43
69184,chrY,57189953,.,T,C,.,PASS,ADP=50;WT=0;HET=1;HOM=0;NC=0;CSQ=C|splice_poly...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:70:50:50:30:20:40%:8.793E-8:37:37:15:15:11:9,50


In [14]:
vcf['Allele_Count'] = vcf['ALT'].apply(lambda x: len(x))
vcf['VAF'] = vcf['Allele_Count'] / vcf['DP']
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,DP,Allele_Count,VAF
0,chr1,69511,rs2691305,A,G,.,PASS,ADP=211;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:212:211:0:210:99.53%:9.4917E-126:0:39:...,211,1,0.004739
1,chr1,817514,rs3131971,T,C,.,PASS,"ADP=368;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2468,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:369:368:1:367:99.73%:3.4721E-218:25:38...,368,1,0.002717
2,chr1,826893,rs3115849,G,A,.,PASS,"ADP=91;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2682,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:93:91:0:91:100%:2.7621E-54:0:36:0:0:36:55,91,1,0.010989
3,chr1,827209,rs3115848,G,C,.,PASS,"ADP=125;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2484,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:127:125:15:110:88%:5.9951E-55:36:38:0:...,125,1,0.008000
4,chr1,827212,rs3131950,C,G,.,PASS,"ADP=128;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2484,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:129:128:15:113:88.28%:1.325E-56:36:38:...,128,1,0.007812
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69181,chrY,57084531,.,A,G,.,PASS,ADP=62;WT=0;HET=1;HOM=0;NC=0;CSQ=G|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:88:62:62:37:25:40.32%:1.4168E-9:38:40:36:1...,62,1,0.016129
69182,chrY,57128393,.,G,C,.,PASS,ADP=223;WT=0;HET=1;HOM=0;NC=0;CSQ=C|splice_pol...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:255:223:223:112:111:49.78%:3.4158E-42:38:4...,223,1,0.004484
69183,chrY,57184462,.,T,G,.,PASS,ADP=43;WT=0;HET=1;HOM=0;NC=0;CSQ=G|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:72:43:43:23:20:46.51%:5.2508E-8:38:37:0:23...,43,1,0.023256
69184,chrY,57189953,.,T,C,.,PASS,ADP=50;WT=0;HET=1;HOM=0;NC=0;CSQ=C|splice_poly...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:70:50:50:30:20:40%:8.793E-8:37:37:15:15:11:9,50,1,0.020000


In [15]:
vcf_new = vcf[vcf['VAF'] >= 0.01]
vcf_new.shape

(57360, 13)

In [8]:
vcf.ALT.value_counts()

G                                                        20415
C                                                        20250
A                                                        19801
T                                                        19534
CT                                                          98
                                                         ...  
GGAGAGAGAGAGAGAGAGAG                                         1
GTATTATTATTA                                                 1
AGAATG                                                       1
CCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTC        1
AATATATATAT                                                  1
Name: ALT, Length: 3012, dtype: int64

In [29]:
vcf = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Epilepsy_cases_novel_variants.xlsx')
vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,Sample1,Barcode
0,chr9,24899,.,G,A,.,PASS,ADP=95;WT=0;HET=1;HOM=0;NC=0;CSQ=A|upstream_ge...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:65:95:95:75:20:21.05%:3.1216E-7:56:52:47:2...,KHCNCGPCSP3
1,chr2_KI270894v1_alt,41636,.,C,T,.,PASS,ADP=62;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:140:62:62:26:36:58.06%:9.6098E-15:58:63:19...,KHCNCGPCSP1
2,chr2_KI270894v1_alt,41636,.,C,T,.,PASS,ADP=75;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:48:75:75:60:15:20%:1.404E-5:55:46:41:19:7:8,KHCNCGPCSP2
3,chr16,61216,.,G,C,.,PASS,ADP=33;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:63:33:33:16:17:51.52%:4.6376E-7:52:51:9:7:...,KHCNCGPCSP2
4,chr1_KI270762v1_alt,69758,.,G,A,.,PASS,ADP=40;WT=0;HET=1;HOM=0;NC=0;CSQ=A|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:44:40:40:27:13:32.5%:3.8184E-5:56:63:20:7:...,KHCNCGPCSP4
...,...,...,...,...,...,...,...,...,...,...,...
1244,chr1,233294415,.,T,C,.,PASS,ADP=25;WT=0;HET=1;HOM=0;NC=0;CSQ=C|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:48:25:25:12:13:52%:1.4654E-5:51:44:11:1:12:1,KHCNCGPCSP4
1245,chr2,233671871,.,C,T,.,PASS,ADP=32;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:45:32:32:19:13:40.62%:2.6443E-5:47:49:15:4...,KHCNCGPCSP4
1246,chr1,236900958,.,A,G,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0;CSQ=G|downstream_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:132:52:52:19:33:63.46%:5.4755E-14:59:48:15...,KHCNCGPCSP3
1247,chr2,238150925,.,A,C,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:144:52:52:17:35:67.31%:3.7678E-15:52:59:13...,KHCNCGPCSP1


In [30]:
vcf['CSQ'] = vcf['INFO'].str.extract(r'CSQ=(.*)')
vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,Sample1,Barcode,CSQ
0,chr9,24899,.,G,A,.,PASS,ADP=95;WT=0;HET=1;HOM=0;NC=0;CSQ=A|upstream_ge...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:65:95:95:75:20:21.05%:3.1216E-7:56:52:47:2...,KHCNCGPCSP3,A|upstream_gene_variant|MODIFIER|MIR1302-9HG|E...
1,chr2_KI270894v1_alt,41636,.,C,T,.,PASS,ADP=62;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:140:62:62:26:36:58.06%:9.6098E-15:58:63:19...,KHCNCGPCSP1,T|intergenic_variant|MODIFIER|||||||||||||||||...
2,chr2_KI270894v1_alt,41636,.,C,T,.,PASS,ADP=75;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:48:75:75:60:15:20%:1.404E-5:55:46:41:19:7:8,KHCNCGPCSP2,T|intergenic_variant|MODIFIER|||||||||||||||||...
3,chr16,61216,.,G,C,.,PASS,ADP=33;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:63:33:33:16:17:51.52%:4.6376E-7:52:51:9:7:...,KHCNCGPCSP2,C|missense_variant|MODERATE|RHBDF1|ENSG0000000...
4,chr1_KI270762v1_alt,69758,.,G,A,.,PASS,ADP=40;WT=0;HET=1;HOM=0;NC=0;CSQ=A|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:44:40:40:27:13:32.5%:3.8184E-5:56:63:20:7:...,KHCNCGPCSP4,A|intergenic_variant|MODIFIER|||||||||||||||||...
...,...,...,...,...,...,...,...,...,...,...,...,...
1244,chr1,233294415,.,T,C,.,PASS,ADP=25;WT=0;HET=1;HOM=0;NC=0;CSQ=C|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:48:25:25:12:13:52%:1.4654E-5:51:44:11:1:12:1,KHCNCGPCSP4,C|intron_variant|MODIFIER|PCNX2|ENSG0000013574...
1245,chr2,233671871,.,C,T,.,PASS,ADP=32;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:45:32:32:19:13:40.62%:2.6443E-5:47:49:15:4...,KHCNCGPCSP4,T|intron_variant|MODIFIER|UGT1A10|ENSG00000242...
1246,chr1,236900958,.,A,G,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0;CSQ=G|downstream_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:132:52:52:19:33:63.46%:5.4755E-14:59:48:15...,KHCNCGPCSP3,G|downstream_gene_variant|MODIFIER|MTR|ENSG000...
1247,chr2,238150925,.,A,C,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:144:52:52:17:35:67.31%:3.7678E-15:52:59:13...,KHCNCGPCSP1,C|missense_variant|MODERATE|KLHL30|ENSG0000016...


In [34]:
def get_fourth_position(row):
    elements = row.split(',')
    fourth_element = ''
    for item in elements:
        sub_elements = item.split('|')
        if len(sub_elements) >= 1:
            fourth_element += sub_elements[3] + ','
    return fourth_element.rstrip(',')

vcf['Gene'] = vcf['CSQ'].apply(get_fourth_position)
vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,Sample1,Barcode,CSQ,Gene
0,chr9,24899,.,G,A,.,PASS,ADP=95;WT=0;HET=1;HOM=0;NC=0;CSQ=A|upstream_ge...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:65:95:95:75:20:21.05%:3.1216E-7:56:52:47:2...,KHCNCGPCSP3,A|upstream_gene_variant|MODIFIER|MIR1302-9HG|E...,"MIR1302-9HG,WASHC1,WASHC1,WASHC1,WASHC1,WASHC1"
1,chr2_KI270894v1_alt,41636,.,C,T,.,PASS,ADP=62;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:140:62:62:26:36:58.06%:9.6098E-15:58:63:19...,KHCNCGPCSP1,T|intergenic_variant|MODIFIER|||||||||||||||||...,
2,chr2_KI270894v1_alt,41636,.,C,T,.,PASS,ADP=75;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:48:75:75:60:15:20%:1.404E-5:55:46:41:19:7:8,KHCNCGPCSP2,T|intergenic_variant|MODIFIER|||||||||||||||||...,
3,chr16,61216,.,G,C,.,PASS,ADP=33;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:63:33:33:16:17:51.52%:4.6376E-7:52:51:9:7:...,KHCNCGPCSP2,C|missense_variant|MODERATE|RHBDF1|ENSG0000000...,"RHBDF1,SNRNP25,SNRNP25,SNRNP25,RHBDF1,SNRNP25,..."
4,chr1_KI270762v1_alt,69758,.,G,A,.,PASS,ADP=40;WT=0;HET=1;HOM=0;NC=0;CSQ=A|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:44:40:40:27:13:32.5%:3.8184E-5:56:63:20:7:...,KHCNCGPCSP4,A|intergenic_variant|MODIFIER|||||||||||||||||...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1244,chr1,233294415,.,T,C,.,PASS,ADP=25;WT=0;HET=1;HOM=0;NC=0;CSQ=C|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:48:25:25:12:13:52%:1.4654E-5:51:44:11:1:12:1,KHCNCGPCSP4,C|intron_variant|MODIFIER|PCNX2|ENSG0000013574...,"PCNX2,RPS7P3"
1245,chr2,233671871,.,C,T,.,PASS,ADP=32;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:45:32:32:19:13:40.62%:2.6443E-5:47:49:15:4...,KHCNCGPCSP4,T|intron_variant|MODIFIER|UGT1A10|ENSG00000242...,"UGT1A10,UGT1A9,UGT1A10,UGT1A8"
1246,chr1,236900958,.,A,G,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0;CSQ=G|downstream_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:132:52:52:19:33:63.46%:5.4755E-14:59:48:15...,KHCNCGPCSP3,G|downstream_gene_variant|MODIFIER|MTR|ENSG000...,"MTR,MTR,MTR,MTR,MTR,MTR,MTR,MTR,MTR,MTR,MTR,MT..."
1247,chr2,238150925,.,A,C,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:144:52:52:17:35:67.31%:3.7678E-15:52:59:13...,KHCNCGPCSP1,C|missense_variant|MODERATE|KLHL30|ENSG0000016...,"KLHL30,KLHL30-AS1,KLHL30-AS1"


In [35]:
vcf.CSQ.iloc[1]

'T|intergenic_variant|MODIFIER|||||||||||||||||||||||||'

In [36]:
vcf['GENE'] = vcf['Gene'].apply(lambda x: ','.join(sorted(set(x.split(',')))))
vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,Sample1,Barcode,CSQ,Gene,GENE
0,chr9,24899,.,G,A,.,PASS,ADP=95;WT=0;HET=1;HOM=0;NC=0;CSQ=A|upstream_ge...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:65:95:95:75:20:21.05%:3.1216E-7:56:52:47:2...,KHCNCGPCSP3,A|upstream_gene_variant|MODIFIER|MIR1302-9HG|E...,"MIR1302-9HG,WASHC1,WASHC1,WASHC1,WASHC1,WASHC1","MIR1302-9HG,WASHC1"
1,chr2_KI270894v1_alt,41636,.,C,T,.,PASS,ADP=62;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:140:62:62:26:36:58.06%:9.6098E-15:58:63:19...,KHCNCGPCSP1,T|intergenic_variant|MODIFIER|||||||||||||||||...,,
2,chr2_KI270894v1_alt,41636,.,C,T,.,PASS,ADP=75;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:48:75:75:60:15:20%:1.404E-5:55:46:41:19:7:8,KHCNCGPCSP2,T|intergenic_variant|MODIFIER|||||||||||||||||...,,
3,chr16,61216,.,G,C,.,PASS,ADP=33;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:63:33:33:16:17:51.52%:4.6376E-7:52:51:9:7:...,KHCNCGPCSP2,C|missense_variant|MODERATE|RHBDF1|ENSG0000000...,"RHBDF1,SNRNP25,SNRNP25,SNRNP25,RHBDF1,SNRNP25,...","RHBDF1,SNRNP25"
4,chr1_KI270762v1_alt,69758,.,G,A,.,PASS,ADP=40;WT=0;HET=1;HOM=0;NC=0;CSQ=A|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:44:40:40:27:13:32.5%:3.8184E-5:56:63:20:7:...,KHCNCGPCSP4,A|intergenic_variant|MODIFIER|||||||||||||||||...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1244,chr1,233294415,.,T,C,.,PASS,ADP=25;WT=0;HET=1;HOM=0;NC=0;CSQ=C|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:48:25:25:12:13:52%:1.4654E-5:51:44:11:1:12:1,KHCNCGPCSP4,C|intron_variant|MODIFIER|PCNX2|ENSG0000013574...,"PCNX2,RPS7P3","PCNX2,RPS7P3"
1245,chr2,233671871,.,C,T,.,PASS,ADP=32;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:45:32:32:19:13:40.62%:2.6443E-5:47:49:15:4...,KHCNCGPCSP4,T|intron_variant|MODIFIER|UGT1A10|ENSG00000242...,"UGT1A10,UGT1A9,UGT1A10,UGT1A8","UGT1A10,UGT1A8,UGT1A9"
1246,chr1,236900958,.,A,G,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0;CSQ=G|downstream_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:132:52:52:19:33:63.46%:5.4755E-14:59:48:15...,KHCNCGPCSP3,G|downstream_gene_variant|MODIFIER|MTR|ENSG000...,"MTR,MTR,MTR,MTR,MTR,MTR,MTR,MTR,MTR,MTR,MTR,MT...",MTR
1247,chr2,238150925,.,A,C,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:144:52:52:17:35:67.31%:3.7678E-15:52:59:13...,KHCNCGPCSP1,C|missense_variant|MODERATE|KLHL30|ENSG0000016...,"KLHL30,KLHL30-AS1,KLHL30-AS1","KLHL30,KLHL30-AS1"


In [37]:
vcf.columns

Index(['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'Sample1', 'Barcode', 'CSQ', 'Gene', 'GENE'],
      dtype='object')

In [38]:
vcf = vcf[['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'Sample1', 'Barcode', 'GENE']]
vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,Sample1,Barcode,GENE
0,chr9,24899,.,G,A,.,PASS,ADP=95;WT=0;HET=1;HOM=0;NC=0;CSQ=A|upstream_ge...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:65:95:95:75:20:21.05%:3.1216E-7:56:52:47:2...,KHCNCGPCSP3,"MIR1302-9HG,WASHC1"
1,chr2_KI270894v1_alt,41636,.,C,T,.,PASS,ADP=62;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:140:62:62:26:36:58.06%:9.6098E-15:58:63:19...,KHCNCGPCSP1,
2,chr2_KI270894v1_alt,41636,.,C,T,.,PASS,ADP=75;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:48:75:75:60:15:20%:1.404E-5:55:46:41:19:7:8,KHCNCGPCSP2,
3,chr16,61216,.,G,C,.,PASS,ADP=33;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:63:33:33:16:17:51.52%:4.6376E-7:52:51:9:7:...,KHCNCGPCSP2,"RHBDF1,SNRNP25"
4,chr1_KI270762v1_alt,69758,.,G,A,.,PASS,ADP=40;WT=0;HET=1;HOM=0;NC=0;CSQ=A|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:44:40:40:27:13:32.5%:3.8184E-5:56:63:20:7:...,KHCNCGPCSP4,
...,...,...,...,...,...,...,...,...,...,...,...,...
1244,chr1,233294415,.,T,C,.,PASS,ADP=25;WT=0;HET=1;HOM=0;NC=0;CSQ=C|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:48:25:25:12:13:52%:1.4654E-5:51:44:11:1:12:1,KHCNCGPCSP4,"PCNX2,RPS7P3"
1245,chr2,233671871,.,C,T,.,PASS,ADP=32;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:45:32:32:19:13:40.62%:2.6443E-5:47:49:15:4...,KHCNCGPCSP4,"UGT1A10,UGT1A8,UGT1A9"
1246,chr1,236900958,.,A,G,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0;CSQ=G|downstream_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:132:52:52:19:33:63.46%:5.4755E-14:59:48:15...,KHCNCGPCSP3,MTR
1247,chr2,238150925,.,A,C,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:144:52:52:17:35:67.31%:3.7678E-15:52:59:13...,KHCNCGPCSP1,"KLHL30,KLHL30-AS1"


In [39]:
vcf.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Epilepsy_cases_novel_variants_new.xlsx', index=False)

In [7]:
sample_cols = vcf['SAMPLE1'].str.split(':', expand=True)
sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']

# Assign the values to the newly created columns
vcf = pd.concat([vcf, sample_cols], axis=1)
vcf = vcf[['CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'GT', 'GQ', 'DP', 'FREQ', 'GENE']]
vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,FILTER,GT,GQ,DP,FREQ,GENE
0,chr1,69568737,.,C,G,PASS,0/1,29,40,22.5%,LRRC7
1,chr1,103688940,.,T,C,PASS,0/1,47,28,46.43%,AMY1B
2,chr1,149860814,.,G,T,PASS,0/1,142,141,29.79%,H4C15
3,chr1,152217438,.,C,G,PASS,0/1,32,11,72.73%,"FLG-AS1,HRNR"
4,chr1,152217444,.,T,G,PASS,0/1,32,15,53.33%,"FLG-AS1,HRNR"
...,...,...,...,...,...,...,...,...,...,...,...
225,chrX,102366132,.,G,A,PASS,0/1,52,69,23.19%,NXF2B
226,chrX,154265039,.,A,G,PASS,0/1,92,83,32.53%,OPN1MW3
227,chrX,154265043,.,A,C,PASS,0/1,92,83,32.53%,OPN1MW3
228,chrX,154265051,.,C,G,PASS,0/1,99,87,33.33%,OPN1MW3


In [13]:
vcf['dbsnp'] = vcf.apply(lambda row: '_'.join([str(row['CHROM']), str(row['POS']), str(row['REF']), str(row['ALT'])]), axis=1)
vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,FILTER,GT,GQ,DP,FREQ,GENE,dbsnp
0,chr1,69568737,.,C,G,PASS,0/1,29,40,22.5%,LRRC7,chr1_69568737_C_G
1,chr1,103688940,.,T,C,PASS,0/1,47,28,46.43%,AMY1B,chr1_103688940_T_C
2,chr1,149860814,.,G,T,PASS,0/1,142,141,29.79%,H4C15,chr1_149860814_G_T
3,chr1,152217438,.,C,G,PASS,0/1,32,11,72.73%,"FLG-AS1,HRNR",chr1_152217438_C_G
4,chr1,152217444,.,T,G,PASS,0/1,32,15,53.33%,"FLG-AS1,HRNR",chr1_152217444_T_G
...,...,...,...,...,...,...,...,...,...,...,...,...
225,chrX,102366132,.,G,A,PASS,0/1,52,69,23.19%,NXF2B,chrX_102366132_G_A
226,chrX,154265039,.,A,G,PASS,0/1,92,83,32.53%,OPN1MW3,chrX_154265039_A_G
227,chrX,154265043,.,A,C,PASS,0/1,92,83,32.53%,OPN1MW3,chrX_154265043_A_C
228,chrX,154265051,.,C,G,PASS,0/1,99,87,33.33%,OPN1MW3,chrX_154265051_C_G


In [15]:
vcf['Zygosity'] = ''

vcf.loc[vcf['GT'] == '1/1', 'Zygosity'] = 'Homozygous'
vcf.loc[vcf['GT'] == '0/1', 'Zygosity'] = 'Heterozygous'
vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,FILTER,GT,GQ,DP,FREQ,GENE,dbsnp,Zygosity
0,chr1,69568737,.,C,G,PASS,0/1,29,40,22.5%,LRRC7,chr1_69568737_C_G,Heterozygous
1,chr1,103688940,.,T,C,PASS,0/1,47,28,46.43%,AMY1B,chr1_103688940_T_C,Heterozygous
2,chr1,149860814,.,G,T,PASS,0/1,142,141,29.79%,H4C15,chr1_149860814_G_T,Heterozygous
3,chr1,152217438,.,C,G,PASS,0/1,32,11,72.73%,"FLG-AS1,HRNR",chr1_152217438_C_G,Heterozygous
4,chr1,152217444,.,T,G,PASS,0/1,32,15,53.33%,"FLG-AS1,HRNR",chr1_152217444_T_G,Heterozygous
...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,chrX,102366132,.,G,A,PASS,0/1,52,69,23.19%,NXF2B,chrX_102366132_G_A,Heterozygous
226,chrX,154265039,.,A,G,PASS,0/1,92,83,32.53%,OPN1MW3,chrX_154265039_A_G,Heterozygous
227,chrX,154265043,.,A,C,PASS,0/1,92,83,32.53%,OPN1MW3,chrX_154265043_A_C,Heterozygous
228,chrX,154265051,.,C,G,PASS,0/1,99,87,33.33%,OPN1MW3,chrX_154265051_C_G,Heterozygous


In [16]:
vcf.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/variants_ADHD_31_noval_format.xlsx', index=False)

In [18]:
vcf['new_column'] = vcf.apply(lambda row: '{}-{}_{}_{}'.format(row['CHROM'], row['POS'], row['REF'], row['ALT']), axis=1)
vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,FILTER,GT,GQ,DP,FREQ,GENE,dbsnp,Zygosity,new_column
0,chr1,69568737,.,C,G,PASS,0/1,29,40,22.5%,LRRC7,chr1_69568737_C_G,Heterozygous,chr1-69568737_C_G
1,chr1,103688940,.,T,C,PASS,0/1,47,28,46.43%,AMY1B,chr1_103688940_T_C,Heterozygous,chr1-103688940_T_C
2,chr1,149860814,.,G,T,PASS,0/1,142,141,29.79%,H4C15,chr1_149860814_G_T,Heterozygous,chr1-149860814_G_T
3,chr1,152217438,.,C,G,PASS,0/1,32,11,72.73%,"FLG-AS1,HRNR",chr1_152217438_C_G,Heterozygous,chr1-152217438_C_G
4,chr1,152217444,.,T,G,PASS,0/1,32,15,53.33%,"FLG-AS1,HRNR",chr1_152217444_T_G,Heterozygous,chr1-152217444_T_G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,chrX,102366132,.,G,A,PASS,0/1,52,69,23.19%,NXF2B,chrX_102366132_G_A,Heterozygous,chrX-102366132_G_A
226,chrX,154265039,.,A,G,PASS,0/1,92,83,32.53%,OPN1MW3,chrX_154265039_A_G,Heterozygous,chrX-154265039_A_G
227,chrX,154265043,.,A,C,PASS,0/1,92,83,32.53%,OPN1MW3,chrX_154265043_A_C,Heterozygous,chrX-154265043_A_C
228,chrX,154265051,.,C,G,PASS,0/1,99,87,33.33%,OPN1MW3,chrX_154265051_C_G,Heterozygous,chrX-154265051_C_G


In [18]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_bed.xlsx')
df.columns = ['chromosome', 'Start_pos', 'End_pos']
df.to_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/krishna_bed.bed', header=None, sep='\t', index=False)
df

Unnamed: 0,chromosome,Start_pos,End_pos
0,chr7,44143213,44198170


In [2]:
import pandas as pd
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/srinivas_sir_covered.bed', sep = '\t', header = None, error_bad_lines=False)
df.columns = ['chromosome', 'Start_pos', 'End_pos']
df

Unnamed: 0,chromosome,Start_pos,End_pos
0,chr1,65489,65645
1,chr1,65811,65993
2,chr1,69461,69620
3,chr1,785981,786159
4,chr1,786130,786446
...,...,...,...
230714,chrY,57190028,57190328
230715,chrY,57190299,57190439
230716,chrY,57190874,57191014
230717,chrY,57191846,57192058


In [3]:
# Add a new row at the bottom
new_row = {'chromosome': 'chrM', 'Start_pos': 1, 'End_pos': 16569}
df = df.append(new_row, ignore_index=True)
df

Unnamed: 0,chromosome,Start_pos,End_pos
0,chr1,65489,65645
1,chr1,65811,65993
2,chr1,69461,69620
3,chr1,785981,786159
4,chr1,786130,786446
...,...,...,...
230715,chrY,57190299,57190439
230716,chrY,57190874,57191014
230717,chrY,57191846,57192058
230718,chrY,57194102,57194243


In [22]:
import pandas as pd
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KAPA HyperExome_hg38_capture_targets (1).bed', sep = '\t', header = None, error_bad_lines=False)
df.columns = ['chromosome', 'Start_pos', 'End_pos', 'INFO']
df

Unnamed: 0,chromosome,Start_pos,End_pos,INFO
0,chr1,65509,65629,ensembl_gene_id=ENSG00000186092;gene_symbol=OR4F5
1,chr1,69027,70017,ccds_id=CCDS30547.1;ensembl_gene_id=ENSG000001...
2,chr1,450730,451686,ccds_id=CCDS72675.1;ensembl_gene_id=ENSG000002...
3,chr1,685706,686662,ccds_id=CCDS41221.1;ensembl_gene_id=ENSG000002...
4,chr1,924421,924957,ensembl_gene_id=ENSG00000187634;gene_symbol=SA...
...,...,...,...,...
208906,chrY,25038801,25038921,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...
208907,chrY,25041766,25041886,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...
208908,chrY,25043908,25044028,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...
208909,chrY,25622433,25624073,"ccds_id=CCDS14801.1,CCDS14802.1;ensembl_gene_i..."


In [23]:
df['Extended_Start_pos'] = df['Start_pos'] - 20
df['Extended_End_pos'] = df['End_pos'] + 20
df

Unnamed: 0,chromosome,Start_pos,End_pos,INFO,Extended_Start_pos,Extended_End_pos
0,chr1,65509,65629,ensembl_gene_id=ENSG00000186092;gene_symbol=OR4F5,65489,65649
1,chr1,69027,70017,ccds_id=CCDS30547.1;ensembl_gene_id=ENSG000001...,69007,70037
2,chr1,450730,451686,ccds_id=CCDS72675.1;ensembl_gene_id=ENSG000002...,450710,451706
3,chr1,685706,686662,ccds_id=CCDS41221.1;ensembl_gene_id=ENSG000002...,685686,686682
4,chr1,924421,924957,ensembl_gene_id=ENSG00000187634;gene_symbol=SA...,924401,924977
...,...,...,...,...,...,...
208906,chrY,25038801,25038921,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25038781,25038941
208907,chrY,25041766,25041886,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25041746,25041906
208908,chrY,25043908,25044028,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25043888,25044048
208909,chrY,25622433,25624073,"ccds_id=CCDS14801.1,CCDS14802.1;ensembl_gene_i...",25622413,25624093


In [24]:
df['Gene'] = df['INFO'].str.extract(r'gene_symbol=([^;]+)')
df['Gene'] = df['Gene'].str.split(',').str[0]
df

Unnamed: 0,chromosome,Start_pos,End_pos,INFO,Extended_Start_pos,Extended_End_pos,Gene
0,chr1,65509,65629,ensembl_gene_id=ENSG00000186092;gene_symbol=OR4F5,65489,65649,OR4F5
1,chr1,69027,70017,ccds_id=CCDS30547.1;ensembl_gene_id=ENSG000001...,69007,70037,OR4F5
2,chr1,450730,451686,ccds_id=CCDS72675.1;ensembl_gene_id=ENSG000002...,450710,451706,OR4F29
3,chr1,685706,686662,ccds_id=CCDS41221.1;ensembl_gene_id=ENSG000002...,685686,686682,OR4F16
4,chr1,924421,924957,ensembl_gene_id=ENSG00000187634;gene_symbol=SA...,924401,924977,SAMD11
...,...,...,...,...,...,...,...
208906,chrY,25038801,25038921,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25038781,25038941,BPY2C
208907,chrY,25041766,25041886,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25041746,25041906,BPY2C
208908,chrY,25043908,25044028,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25043888,25044048,BPY2C
208909,chrY,25622433,25624073,"ccds_id=CCDS14801.1,CCDS14802.1;ensembl_gene_i...",25622413,25624093,CDY1


In [25]:
data = df[df['Gene'] == 'GCK']
data = data[['chromosome', 'Extended_Start_pos', 'Extended_End_pos']]
data

Unnamed: 0,chromosome,Extended_Start_pos,Extended_End_pos
80561,chr7,44144154,44144314
80562,chr7,44145106,44145308
80563,chr7,44145464,44145768
80564,chr7,44146028,44146188
80565,chr7,44146430,44146654
80566,chr7,44147616,44147862
80567,chr7,44149727,44150093
80568,chr7,44150931,44151091
80569,chr7,44152235,44152461
80570,chr7,44153255,44153503


In [27]:
# Function to check if a row satisfies the conditions
def check_coverage(row):
    if row['chromosome'] == 'chr7' and 44143213 <= row['Extended_Start_pos'] <= 44198170 and 44143213 <= row['Extended_End_pos'] <= 44198170:
        return 'covered'
    else:
        return 'not_covered'

# Apply the function to create a new 'coverage' column
data['coverage'] = data.apply(check_coverage, axis=1)
data

Unnamed: 0,chromosome,Extended_Start_pos,Extended_End_pos,coverage
80561,chr7,44144154,44144314,covered
80562,chr7,44145106,44145308,covered
80563,chr7,44145464,44145768,covered
80564,chr7,44146028,44146188,covered
80565,chr7,44146430,44146654,covered
80566,chr7,44147616,44147862,covered
80567,chr7,44149727,44150093,covered
80568,chr7,44150931,44151091,covered
80569,chr7,44152235,44152461,covered
80570,chr7,44153255,44153503,covered


In [28]:
data.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/GCK_covered_positions.xlsx', index=False)

In [4]:
df = df[['chromosome', 'Extended_Start_pos', 'Extended_End_pos']]
#new_row = {'chromosome': 'chrM', 'Extended_Start_pos': 1, 'Extended_End_pos': 16569}

# Append the new row to the DataFrame
#df = df.append(new_row, ignore_index=True)
df

Unnamed: 0,chromosome,Extended_Start_pos,Extended_End_pos
0,chr1,65489,65649
1,chr1,69007,70037
2,chr1,450710,451706
3,chr1,685686,686682
4,chr1,924401,924977
...,...,...,...
208906,chrY,25038781,25038941
208907,chrY,25041746,25041906
208908,chrY,25043888,25044048
208909,chrY,25622413,25624093


In [None]:
44149722

In [16]:
# Value to check
value = 44146159

# Filter rows where the value lies between start and end positions
filtered_df = df[(df['Start_pos'] <= value) & (df['End_pos'] >= value)]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,chromosome,Start_pos,End_pos


In [43]:
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/sainee/sample_4.xlsx')
data

Unnamed: 0,CHROM,POS,rsID,DP,REF,ALT
0,chr1,1091327,rs4074137,22,C,A
1,chr1,1676661,rs770523611,36,C,G
2,chr1,1734736,rs61777509,55,C,T
3,chr1,7830024,rs1357370388,56,A,C
4,chr1,11098048,rs62623443,18,C,T
...,...,...,...,...,...,...
614,chrX,141589872,rs1397854713,33,G,A
615,chrX,141698301,rs782600137,40,C,T
616,chrX,141906704,rs59853306,90,G,A
617,chrX,153807864,rs58306331,23,G,A


In [12]:
# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in data
data['Covered/Not_Covered'] = data.apply(check_coverage, axis=1)
data

KeyError: 'POS'

In [45]:
data.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/sainee/sample_4_updated.xlsx', index=False)

In [4]:
df.to_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/srinivas_sir_covered_MITO.bed', header=None, sep='\t', index=False)

In [7]:
# Value to check
value = 9360634

# Filter rows where the value lies between start and end positions
filtered_df = df[(df['Extended_Start_pos'] <= value) & (df['Extended_End_pos'] >= value)]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,chromosome,Extended_Start_pos,Extended_End_pos


In [6]:
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/some_genes_covered.xlsx')
data

Unnamed: 0,Gene
0,SLC22A5
1,CPT1A
2,SLC25A20
3,EIF2AK3
4,HSPA5
5,HSPA5
6,PDIA4
7,PPP1R15A
8,DDIT3
9,MAPK8


In [7]:
merged_genes = pd.merge(data, df, how = 'inner', on = 'Gene', sort=False)
merged_genes

Unnamed: 0,Gene,chromosome,Extended_Start_pos,Extended_End_pos
0,SLC22A5,chr5,132369824,132370394
1,SLC22A5,chr5,132378118,132378278
2,SLC22A5,chr5,132378340,132378500
3,SLC22A5,chr5,132384115,132384331
4,SLC22A5,chr5,132385297,132385571
...,...,...,...,...
176,HMGB1,chr13,30462630,30462790
177,MCAT,chr22,43133012,43133514
178,MCAT,chr22,43137051,43137327
179,MCAT,chr22,43140997,43141277


In [8]:
merged_genes.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/genes_covered.xlsx', index = False)

In [26]:
# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in data
data['Covered/Not_Covered'] = data.apply(check_coverage, axis=1)
data

Unnamed: 0,rsID,CHROM,POS,RET,ALT,Covered/Not_Covered
0,rs75527207,chr7,117587806.0,G,A,Covered
1,rs4149056,chr12,21178615.0,T,C,Covered
2,rs1799971,chr6,154039662.0,A,G,Covered
3,rs141033578,chr7,117606695.0,C,G,Covered
4,rs141033578,chr7,117606695.0,C,T,Covered
...,...,...,...,...,...,...
3615,rs1059513,chr12,57095926.0,T,C,Not_Covered
3616,rs12885713,chr14,90397013.0,C,G,Not_Covered
3617,rs12885713,chr14,90397013.0,C,T,Not_Covered
3618,rs12885713,chr14,90397013.0,C,A,Not_Covered


In [25]:
data.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/KHBSTLGPTTL1_updated.xlsx', index=False)

In [21]:
# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
gene_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    gene = row['Gene']  # Assuming 'Gene' is the column name for gene information
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
        gene_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))
    gene_dict[chromosome].append(gene)

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    gene = row['Gene']  # Assuming 'GENE' is the column name for gene information
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        genes = gene_dict[chromosome]
        for i, (start, end) in enumerate(ranges):
            if start <= pos <= end:
                if gene == genes[i]:
                    return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in dataset2
data['Covered/Not_Covered'] = data.apply(check_coverage, axis=1)
data

KeyError: 'Gene'

In [7]:
# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in dataset2
data['Covered/Not_Covered'] = data.apply(check_coverage, axis=1)
data

Unnamed: 0,rsID,Gene,CHROM,POS,Covered/Not_Covered
0,rs55886062,DPYD,chr1,97515787,Covered
1,rs11615,ERCC1,chr19,45420395,Covered
2,rs699517,TYMS,chr18,673016,Not_Covered
3,rs3218592,REV3L,chr6,111322635,Covered
4,rs1045642,ABCB1,chr7,87509329,Covered
...,...,...,...,...,...
171,rs9389568,PERP,chr6,138145853,Not_Covered
172,rs2960436,,chr7,45937683,Not_Covered
173,rs117458836,CYP2C8,chr10,95013625,Not_Covered
174,rs118088833,TP53AIP1,chr11,128937143,Not_Covered


In [9]:
df_1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/DYPD gene_Capcetabine, flurouracil, oxaliplatin.xlsx')
df_1

Unnamed: 0,PharmGKB ID,Level,rsID,Gene,Drugs,Phenotype Categories,Phenotype,Pediatric
0,1451276160,1A,rs55886062,DPYD,capecitabine,Toxicity,Neoplasms,False
1,1445401125,3,rs11615,ERCC1,capecitabine; radiotherapy,Efficacy,Rectal Neoplasms,False
2,1448616922,3,rs699517,TYMS,capecitabine,Toxicity,Asthenia; Nausea; Neoplasms; Vomiting,False
3,1444667292,3,rs3218592,REV3L,capecitabine; fluorouracil,Efficacy,Neoplasm Metastasis,False
4,981204466,3,rs1045642,ABCB1,capecitabine,Toxicity,Neoplasms,False
...,...,...,...,...,...,...,...,...
256,1184986201,3,rs11479,TYMP,capecitabine; fluorouracil,Toxicity,Neoplasms,False
257,981204864,3,rs1801131,MTHFR,capecitabine; fluorouracil; leucovorin; oxalip...,Efficacy,Neoplasms,False
258,1449270937,3,rs117458836,CYP2C8,cyclophosphamide; epirubicin; fluorouracil,Toxicity,Breast Neoplasms; Neutropenia,False
259,1449270943,3,rs118088833,TP53AIP1,cyclophosphamide; epirubicin; fluorouracil,Toxicity,Breast Neoplasms; Neutropenia,False


In [10]:
merged_1 = pd.merge(df_1, data, on = 'rsID', how='left', sort=False)
merged_1

Unnamed: 0,PharmGKB ID,Level,rsID,Gene_x,Drugs,Phenotype Categories,Phenotype,Pediatric,Gene_y,CHROM,POS,Covered/Not_Covered
0,1451276160,1A,rs55886062,DPYD,capecitabine,Toxicity,Neoplasms,False,DPYD,chr1,97515787.0,Covered
1,1445401125,3,rs11615,ERCC1,capecitabine; radiotherapy,Efficacy,Rectal Neoplasms,False,ERCC1,chr19,45420395.0,Covered
2,1448616922,3,rs699517,TYMS,capecitabine,Toxicity,Asthenia; Nausea; Neoplasms; Vomiting,False,TYMS,chr18,673016.0,Not_Covered
3,1444667292,3,rs3218592,REV3L,capecitabine; fluorouracil,Efficacy,Neoplasm Metastasis,False,REV3L,chr6,111322635.0,Covered
4,981204466,3,rs1045642,ABCB1,capecitabine,Toxicity,Neoplasms,False,ABCB1,chr7,87509329.0,Covered
...,...,...,...,...,...,...,...,...,...,...,...,...
256,1184986201,3,rs11479,TYMP,capecitabine; fluorouracil,Toxicity,Neoplasms,False,TYMP,chr22,50525807.0,Covered
257,981204864,3,rs1801131,MTHFR,capecitabine; fluorouracil; leucovorin; oxalip...,Efficacy,Neoplasms,False,MTHFR,chr1,11794419.0,Covered
258,1449270937,3,rs117458836,CYP2C8,cyclophosphamide; epirubicin; fluorouracil,Toxicity,Breast Neoplasms; Neutropenia,False,CYP2C8,chr10,95013625.0,Not_Covered
259,1449270943,3,rs118088833,TP53AIP1,cyclophosphamide; epirubicin; fluorouracil,Toxicity,Breast Neoplasms; Neutropenia,False,TP53AIP1,chr11,128937143.0,Not_Covered


In [11]:
df_2 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/DYPD_gene.xlsx')
df_2

Unnamed: 0,PharmGKB ID,Level,rsID,Gene,Drugs,Phenotype Categories,Phenotype,Pediatric
0,1451276160,1A,rs55886062,DPYD,capecitabine,Toxicity,Neoplasms,False
1,1451286020,1A,rs1801160,DPYD,fluorouracil,Toxicity,Neoplasms,True
2,981201535,1A,rs2297595,DPYD,fluorouracil,Toxicity,Neoplasms,False
3,1447989706,1A,rs1801266,DPYD,fluorouracil,Other,,False
4,1449575662,1A,rs148994843,DPYD,fluorouracil,Other,,False
5,1451275020,1A,rs3918290,DPYD,fluorouracil,Other,,True
6,1449575656,1A,rs59086055,DPYD,fluorouracil,Other,,False
7,1451287440,1A,rs17376848,DPYD,capecitabine,Toxicity,Neoplasms,False
8,1451266065,1A,rs3918290,DPYD,tegafur,Toxicity,Neoplasms,False
9,1449575668,3,rs748620513,DPYD,fluorouracil,Other,,False


In [12]:
merged_2 = pd.merge(df_2, data, on = 'rsID', how='left', sort=False)
merged_2

Unnamed: 0,PharmGKB ID,Level,rsID,Gene_x,Drugs,Phenotype Categories,Phenotype,Pediatric,Gene_y,CHROM,POS,Covered/Not_Covered
0,1451276160,1A,rs55886062,DPYD,capecitabine,Toxicity,Neoplasms,False,DPYD,chr1,97515787,Covered
1,1451286020,1A,rs1801160,DPYD,fluorouracil,Toxicity,Neoplasms,True,DPYD,chr1,97305364,Covered
2,981201535,1A,rs2297595,DPYD,fluorouracil,Toxicity,Neoplasms,False,DPYD,chr1,97699535,Covered
3,1447989706,1A,rs1801266,DPYD,fluorouracil,Other,,False,DPYD,chr1,97691776,Covered
4,1449575662,1A,rs148994843,DPYD,fluorouracil,Other,,False,DPYD,chr1,97515923,Covered
5,1451275020,1A,rs3918290,DPYD,fluorouracil,Other,,True,DPYD,chr1,97450058,Covered
6,1449575656,1A,rs59086055,DPYD,fluorouracil,Other,,False,DPYD,chr1,97450190,Covered
7,1451287440,1A,rs17376848,DPYD,capecitabine,Toxicity,Neoplasms,False,DPYD,chr1,97450068,Covered
8,1451266065,1A,rs3918290,DPYD,tegafur,Toxicity,Neoplasms,False,DPYD,chr1,97450058,Covered
9,1449575668,3,rs748620513,DPYD,fluorouracil,Other,,False,DPYD,chr1,97573799,Covered


In [13]:
merged_1.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/DYPD_gene_Capcetabine_flurouracil_oxaliplatin_new.xlsx', index=False)
merged_2.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/krishna_covered_pos/DYPD_gene_Capcetabine_flurouracil_oxaliplatin_2.xlsx', index=False)

In [15]:
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Genes for rare disorders (Orphanet DB).xlsx')
data

Unnamed: 0,gene_symbol
0,KIF7
1,AGA
2,CWC27
3,SUMF1
4,MANBA
...,...
4482,KCNJ1
4483,MIR140
4484,NCKAP1L
4485,SOCS1


In [16]:
data['Matching'] = data['gene_symbol'].apply(lambda gene: 'Yes' if gene in df['gene_symbol'].values else 'No')
data

Unnamed: 0,gene_symbol,Matching
0,KIF7,Yes
1,AGA,Yes
2,CWC27,Yes
3,SUMF1,Yes
4,MANBA,Yes
...,...,...
4482,KCNJ1,Yes
4483,MIR140,No
4484,NCKAP1L,Yes
4485,SOCS1,Yes


In [17]:
data.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/4487_genes_status.xlsx', index=False)

In [19]:
df[df['gene_symbol'] == 'SUMF1']

Unnamed: 0,chromosome,Extended_Start_pos,Extended_End_pos,gene_symbol
37354,chr3,4361101,4361381,SUMF1
37355,chr3,4361397,4361753,SUMF1
37356,chr3,4361895,4362291,SUMF1
37357,chr3,4376282,4376442,SUMF1
37358,chr3,4410840,4411000,SUMF1
37359,chr3,4417104,4417264,SUMF1
37360,chr3,4417990,4418150,SUMF1
37361,chr3,4420018,4420178,SUMF1
37362,chr3,4449217,4449377,SUMF1
37363,chr3,4452845,4453085,SUMF1


In [17]:
# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in dataset2
data['Covered/Not_Covered'] = data.apply(check_coverage, axis=1)
data

Unnamed: 0,PharmGKB ID,Level,Variant,CHROM,POS,Gene,Drugs,Phenotype Categories,Phenotype,Pediatric,Covered/Not_Covered
0,1449167328,3,rs4902333,chr14,64909368.0,CHURC1,methylphenidate,Efficacy,Attention Deficit Disorder with Hyperactivity,True,Not_Covered
1,1449167200,3,rs12302749,chr12,6867132.0,SPSB2,methylphenidate,Efficacy,Attention Deficit Disorder with Hyperactivity,True,Not_Covered
2,1449167584,3,rs2013169,chr5,140118020.0,PURA,methylphenidate,Efficacy,Attention Deficit Disorder with Hyperactivity,True,Not_Covered
3,1447964236,3,rs6551665,chr4,61873823.0,ADGRL3,methylphenidate,Efficacy,Attention Deficit Disorder with Hyperactivity,True,Not_Covered
4,1450376443,3,rs28386840,chr16,55652906.0,SLC6A2,methylphenidate,Efficacy,Attention Deficit Disorder with Hyperactivity,True,Not_Covered
5,1449167211,3,rs10420097,chr19,57633193.0,ZNF211,methylphenidate,Efficacy,Attention Deficit Disorder with Hyperactivity,True,Not_Covered
6,1449167339,3,rs2295490,chr20,388261.0,TRIB3,methylphenidate,Efficacy,Attention Deficit Disorder with Hyperactivity,True,Covered
7,1449167469,3,rs11552708,chr17,7559238.0,SENP3,methylphenidate,Efficacy,Attention Deficit Disorder with Hyperactivity,True,Covered
8,1451420300,3,CYP2D6*4; CYP2D6*5,,,CYP2D6,atomoxetine; methylphenidate,Metabolism/PK,,False,Not_Covered
9,1450374881,3,rs2032582,chr7,87531302.0,ABCB1,methylphenidate,Toxicity,Attention Deficit Disorder with Hyperactivity,True,Covered


In [21]:
data.to_excel(R'C:/Users/GenepoweRx_Madhu/Downloads/Methylphenidate_covered_not_covered.xlsx', index=False)

In [47]:
df.to_csv(r'C:/Users/GenepoweRx_Madhu/Desktop/kalyani_mam_covered.bed', header=None, sep='\t', index=False)

In [6]:
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Desktop/kalyani_mam_covered.bed', header=None, sep='\t', error_bad_lines=False)
df.columns = ['chromosome', 'Start_pos', 'End_pos']
df

Unnamed: 0,chromosome,Start_pos,End_pos
0,chr1,65489,65649
1,chr1,69007,70037
2,chr1,450710,451706
3,chr1,685686,686682
4,chr1,924401,924977
...,...,...,...
208906,chrY,25038781,25038941
208907,chrY,25041746,25041906
208908,chrY,25043888,25044048
208909,chrY,25622413,25624093


In [19]:
# Value to check
value = 4382028

# Filter rows where the value lies between start and end positions
filtered_df = df[(df['Extended_Start_pos'] <= value) & (df['Extended_End_pos'] >= value)]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,chromosome,Extended_Start_pos,Extended_End_pos,gene_symbol
155302,chr16,4380848,4382928,VASN


In [26]:
filtered_df.INFO.iloc[0]

'ccds_id=CCDS30547.1;ensembl_gene_id=ENSG00000186092;gene_symbol=OR4F5'

In [21]:
df.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/covered_clingeno_bed.xlsx', index=False)

In [7]:
df.chromosome.value_counts()

chr1     20980
chr2     16263
chr3     12382
chr17    12130
chr12    12015
chr11    11675
chr19    11373
chr6     10469
chr7      9921
chr5      9467
chr10     8780
chr16     8717
chr9      8493
chr4      8436
chr15     7524
chrX      7213
chr8      7059
chr14     6618
chr20     5088
chr22     4418
chr13     3802
chr18     3392
chr21     2276
chrY       420
Name: chromosome, dtype: int64

In [10]:
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/MODY_covered_list.xlsx')
data

Unnamed: 0,samples,GENE,CHROM,POS,ID
0,12652713,,chr16,31779,rs141542623
1,12652713,TUBB8,chr10,49286,rs6560829
2,17751397,TUBB8,chr10,49286,rs6560829
3,17751406,TUBB8,chr10,49286,rs6560829
4,12652700,TUBB8,chr10,49286,rs6560829
...,...,...,...,...,...
165543,17751406,ZNF692,chr1,248855917,rs13313088
165544,17751406,ZNF692,chr1,248856006,rs13313009
165545,17751397,PGBD2,chr1,248913954,rs12025760
165546,17751397,PGBD2,chr1,248916897,rs74157349


# check a dataset positions Covered/ not

In [12]:
# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in dataset2
data['Madhu_Covered/Not_Covered'] = data.apply(check_coverage, axis=1)
data

Unnamed: 0,samples,GENE,CHROM,POS,ID,Madhu_Covered/Not_Covered
0,12652713,,chr16,31779,rs141542623,Not_Covered
1,12652713,TUBB8,chr10,49286,rs6560829,Covered
2,17751397,TUBB8,chr10,49286,rs6560829,Covered
3,17751406,TUBB8,chr10,49286,rs6560829,Covered
4,12652700,TUBB8,chr10,49286,rs6560829,Covered
...,...,...,...,...,...,...
165543,17751406,ZNF692,chr1,248855917,rs13313088,Covered
165544,17751406,ZNF692,chr1,248856006,rs13313009,Not_Covered
165545,17751397,PGBD2,chr1,248913954,rs12025760,Not_Covered
165546,17751397,PGBD2,chr1,248916897,rs74157349,Covered


In [16]:
x = data[data['Madhu_Covered/Not_Covered'] == 'Not_Covered']
x

Unnamed: 0,samples,GENE,CHROM,POS,ID,Madhu_Covered/Not_Covered
0,12652713,,chr16,31779,rs141542623,Not_Covered
5,17751406,TUBB8,chr10,49289,rs6560830,Not_Covered
6,12652700,TUBB8,chr10,49289,rs6560830,Not_Covered
7,17751397,TUBB8,chr10,49289,rs6560830,Not_Covered
8,12652713,TUBB8,chr10,49289,rs6560830,Not_Covered
...,...,...,...,...,...,...
165539,12652713,SH3BP5L,chr1,248826485,rs11205415,Not_Covered
165540,17751406,SH3BP5L,chr1,248826485,rs11205415,Not_Covered
165541,17751397,SH3BP5L,chr1,248826485,rs11205415,Not_Covered
165544,17751406,ZNF692,chr1,248856006,rs13313009,Not_Covered


In [19]:
data1 = set(x['ID'])
print(len(data1))

24153


In [20]:
data.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Updated_genomic_db.xlsx', index=False)

In [26]:
# Function to check if 'pos' is covered or not_covered
def check_coverage(row):
    pos = row['pos']
    chrom = row['chromosome']
    is_covered = any((chrom == df['chromosome']) & (pos >= df['Extended_Start_pos']) & (pos <= df['Extended_End_pos']))
    return 'Covered' if is_covered else 'Not_Covered'

# Apply the function to create the new column in dataset2
data['Madhu_Covered/Not_Covered'] = data.apply(check_coverage, axis=1)
data

KeyboardInterrupt: 

In [None]:
data.columns

In [None]:
data = data[['Covered/Not_Covered', 'Madhu_Covered/Not_Covered', 'geneinfo', 'rsid', 'clndn', 'clinical_significance',
            'review_status', 'molecular_consequence', 'clinvar_id', 'chromosome', 'pos', 'ref_allele', 'alt_allele', 
            'disease_name', 'conflicting_clinical_signifincance', 'clnsig_inclusive', 'dbvar_id', 'origin', 'is_indel', 
            'is_snp', 'is_structural_variant', 'is_transition', 'var_type', 'var_subtype', 'CLNDN Include/Exclude', 
            'Group\n(Adult/Neo,Pediatric/Both', 'one liner', 'Section Keyword']]
data

In [None]:
data.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Updated_genomic_db.xlsx', index=False)

In [13]:
# Function to check if 'pos' is covered or not_covered
def check_coverage(row):
    pos = row['pos']
    chrom = row['chromosome']
    is_covered = any((chrom == df['chromosome']) & (pos >= df['Extended_Start_pos']) & (pos <= df['Extended_End_pos']))
    return 'covered' if is_covered else 'not_covered'

# Apply the function to create the new column in dataset2
data['coverage'] = data.apply(check_coverage, axis=1)

data


Unnamed: 0,Covered/Not_Covered,geneinfo,rsid,clndn,clinical_significance,review_status,molecular_consequence,clinvar_id,chromosome,pos,ref_allele,alt_allele,disease_name,conflicting_clinical_signifincance,clnsig_inclusive,dbvar_id,origin,is_indel,is_snp,is_structural_variant,is_transition,var_type,var_subtype,CLNDN Include/Exclude,"Group\n(Adult/Neo,Pediatric/Both",one liner,Section Keyword,coverage
0,Covered,CFTR:1080,rs75961395,,Pathogenic,['practice_guideline'],['SO:0001583|missense_variant'],7143,chr7,117509123,G,[A],CFTR-related disorders,,,,['NA'],0,1,0,1,snp,Transition,Include,Neonatal/Pediatric,,Lung Health,covered
1,Covered,CFTR:1080,rs75961395,,Pathogenic,['practice_guideline'],['SO:0001583|missense_variant'],7143,chr7,117509123,G,[A],Cystic fibrosis,,,,['NA'],0,1,0,1,snp,Transition,Include,Both,Cystic fibrosis,Lung Health,covered
2,Covered,CFTR:1080,rs75961395,,Pathogenic,['practice_guideline'],['SO:0001583|missense_variant'],7143,chr7,117509123,G,[A],Congenital bilateral aplasia of vas deferens f...,,,,['NA'],0,1,0,1,snp,Transition,Exclude,--,,Miscellaneous,covered
3,Covered,CFTR:1080,rs75961395,,Pathogenic,['practice_guideline'],['SO:0001583|missense_variant'],7143,chr7,117509123,G,[A],Inborn genetic diseases,,,,['NA'],0,1,0,1,snp,Transition,Exclude,--,,DD,covered
4,Covered,CFTR:1080,rs75961395,,Pathogenic,['practice_guideline'],['SO:0001583|missense_variant'],7143,chr7,117509123,G,[A],not provided,,,,['NA'],0,1,0,1,snp,Transition,Exclude,--,,Miscellaneous,covered
5,Covered,CFTR:1080,rs78655421,['Cystic_fibrosis'],Pathogenic,['practice_guideline'],['SO:0001583|missense_variant'],7109,chr7,117530975,G,[A],CFTR-related disorders,,['209047:Pathogenic'],,['NA'],0,1,0,1,snp,Transition,Include,Neonatal/Pediatric,,Lung Health,covered
6,Covered,CFTR:1080,rs78655421,['Cystic_fibrosis'],Pathogenic,['practice_guideline'],['SO:0001583|missense_variant'],7109,chr7,117530975,G,[A],Obstructive azoospermia,,['209047:Pathogenic'],,['NA'],0,1,0,1,snp,Transition,Include,Both,,RH_M,covered
7,Covered,CFTR:1080,rs78655421,['Cystic_fibrosis'],Pathogenic,['practice_guideline'],['SO:0001583|missense_variant'],7109,chr7,117530975,G,[A],Hereditary pancreatitis,,['209047:Pathogenic'],,['NA'],0,1,0,1,snp,Transition,Include,Adult,Pancreatitis,GH_Pancrea,covered
8,Covered,CFTR:1080,rs78655421,['Cystic_fibrosis'],Pathogenic,['practice_guideline'],['SO:0001583|missense_variant'],7109,chr7,117530975,G,[A],Bronchiectasis with or without elevated sweat ...,,['209047:Pathogenic'],,['NA'],0,1,0,1,snp,Transition,Include,Both,Bronchiectasis,Lung Health,covered
9,Covered,CFTR:1080,rs78655421,['Cystic_fibrosis'],Pathogenic,['practice_guideline'],['SO:0001583|missense_variant'],7109,chr7,117530975,G,[A],Cystic fibrosis,,['209047:Pathogenic'],,['NA'],0,1,0,1,snp,Transition,Include,Both,Cystic fibrosis,Lung Health,covered


## wrong

In [None]:
# Function to check if 'pos' is covered or not_covered
def check_coverage(row):
    pos = row['pos']
    is_covered = any((pos >= df['Extended_Start_pos']) & (pos <= df['Extended_End_pos']))
    return 'covered' if is_covered else 'not_covered'

# Apply the function to create the new column in dataset2
data['coverage'] = data.apply(check_coverage, axis=1)

data

In [None]:
data.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Updated_genomic_db.xlsx', index=False)

# check the single position Covered/ not

In [7]:
# Value to check
value = 31779

# Filter rows where the value lies between start and end positions
filtered_df = df[(df['Extended_Start_pos'] <= value) & (df['Extended_End_pos'] >= value)]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,chromosome,Extended_Start_pos,Extended_End_pos,INFO,gene_symbol


# check the multiple positions Covered/ not

In [13]:
values = [627579,1436793,1447646,1447739,1448375,1588020,1588028,1588032,1589645,1589647,1589661,1589693,1592156,1816872,1816897,505927,169394,240834,240886,432137,196140,196153,196161,724415,1979403,2025553,57035,138279,1781290,2387652,501653,748202,748266,748361,748445,2835920,196129,196142,196150,292900,548240,2891829,3255108,3255111,3283757,3439362,3224872,500147,577842,615982,616026,650242]

# Iterate over each value and filter rows
for value in values:
    filtered_df = df[(df['Extended_Start_pos'] <= value) & (df['Extended_End_pos'] >= value)]
    if not filtered_df.empty:
        print(f"{value} Covered")
        filtered_df
    else:
        print(f"{value} Not Covered")
    print()

627579 Covered

1436793 Covered

1447646 Covered

1447739 Covered

1448375 Covered

1588020 Covered

1588028 Covered

1588032 Covered

1589645 Covered

1589647 Covered

1589661 Covered

1589693 Covered

1592156 Covered

1816872 Covered

1816897 Covered

505927 Covered

169394 Covered

240834 Covered

240886 Covered

432137 Covered

196140 Covered

196153 Covered

196161 Covered

724415 Covered

1979403 Covered

2025553 Covered

57035 Covered

138279 Covered

1781290 Covered

2387652 Covered

501653 Covered

748202 Covered

748266 Covered

748361 Covered

748445 Covered

2835920 Covered

196129 Covered

196142 Covered

196150 Covered

292900 Covered

548240 Covered

2891829 Covered

3255108 Covered

3255111 Covered

3283757 Covered

3439362 Covered

3224872 Covered

500147 Covered

577842 Covered

615982 Covered

616026 Covered

650242 Covered



In [None]:
import pandas as pd

# Sample data
data = {
    'start': [100, 200, 300, 400],
    'end': [250, 350, 400, 500]
}

# Create a DataFrame from the data
df = pd.DataFrame(data)

# List of values to check
values = [275, 150, 450]

# Iterate over each value and filter rows
for value in values:
    filtered_df = df[(df['start'] <= value) & (df['end'] >= value)]
    if not filtered_df.empty:
        print(f"Value {value} lies between start and end positions in the DataFrame:")
        print(filtered_df)
    else:
        print(f"Value {value} does not lie between start and end positions in the DataFrame.")
    print()


In [32]:
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/16chrom.xlsx')
data

Unnamed: 0,Gene,Change,New_Change
0,PKD1,16:2135686:G:A,16_2135686_G_A
1,PKD1,16:2135686:G:C,16_2135686_G_C
2,PKD1,16:2135677:C:T,16_2135677_C_T
3,PKD1,16:2135673:G:T,16_2135673_G_T
4,PKD1,16:2135668:G:GGGCGGGCGC,16_2135668_G_GGGCGGGCGC
...,...,...,...
1136,PKD1,16:2089782:C:T,16_2089782_C_T
1137,PKD1,16:2089773:C:A,16_2089773_C_A
1138,PKD1,16:2089750:T:C,16_2089750_T_C
1139,PKD1,16:2089747:C:T,16_2089747_C_T


In [34]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/PKD2_PKD1.xlsx')
df

Unnamed: 0,Gene,New_Change,HGVSc
0,PKD1,16_2135686_G_A,c.4C>T
1,PKD1,16_2135686_G_C,c.4C>G
2,PKD1,16_2135683_G_T,c.7C>A
3,PKD1,16_2135682_G_C,c.8C>G
4,PKD1,16_2135677_C_T,c.13G>A
...,...,...,...
3553,PKD1,16_2089747_C_T,c.12892G>A
3554,PKD1,16_2089741_G_A,c.12898C>T
3555,PKD1,16_2089731_G_A,c.12908C>T
3556,PKD1,DEL_16_1984218_2112963,c.CNV_Del


In [35]:
merged = pd.merge(data, df, on = 'New_Change', how = 'left', sort=False)
merged

Unnamed: 0,Gene,Change,New_Change,Gene.1,HGVSc
0,PKD1,16:2135686:G:A,16_2135686_G_A,PKD1,c.4C>T
1,PKD1,16:2135686:G:C,16_2135686_G_C,PKD1,c.4C>G
2,PKD1,16:2135677:C:T,16_2135677_C_T,PKD1,c.13G>A
3,PKD1,16:2135673:G:T,16_2135673_G_T,PKD1,c.17C>A
4,PKD1,16:2135668:G:GGGCGGGCGC,16_2135668_G_GGGCGGGCGC,PKD1,c.13_21dup
...,...,...,...,...,...
1136,PKD1,16:2089782:C:T,16_2089782_C_T,PKD1,c.12857G>A
1137,PKD1,16:2089773:C:A,16_2089773_C_A,PKD1,c.12866G>T
1138,PKD1,16:2089750:T:C,16_2089750_T_C,PKD1,c.12889A>G
1139,PKD1,16:2089747:C:T,16_2089747_C_T,PKD1,c.12892G>A


In [36]:
df[df['New_Change'] == '16_2089750_T_C']

Unnamed: 0,Gene,New_Change,HGVSc
3552,PKD1,16_2089750_T_C,c.12889A>G


In [37]:
merged.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/CHROM_4.xlsx', index=False)

In [38]:
x = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Nephropathy_Lit_final_Positions_hg38_hg37.xlsx')
x['chrom'] = x['Chrom-pos-Ref-Alt_38'].str.split(',')
x = x.explode('chrom')

x['CHROM'] = x['chrom'].str.split('-').str[0]

# Function to add 'chr' prefix conditionally
def add_chr_prefix(chrom):
    if pd.notnull(chrom) and chrom.strip() != '':
        return 'chr' + str(chrom)
    else:
        return chrom

# Applying the function to the 'chromosome' column
x['CHROM'] = x['CHROM'].apply(add_chr_prefix)
x['CHROM'] = x['CHROM'].str.strip()
x['CHROM'] = x['CHROM'].str.replace(r'\s+', '')
x['POS'] = x['chrom'].str.split('-').str[1]

x.dropna(subset=['CHROM'], inplace=True)
# Drop rows with empty cells after removing leading and trailing whitespaces
x['CHROM'] = x['CHROM'].str.strip()
x['POS'] = x['POS'].str.strip()
# Dropping rows with empty cells and NaN values in both 'chromosome' and 'position' columns
x.dropna(subset=['CHROM', 'POS'], inplace=True)
df_3 = x[['CHROM', 'POS']]
df_3['Literature'] = 'Yes'
df_3

Unnamed: 0,CHROM,POS,Literature
0,chr2,227303904,Yes
2,chr2,227307878,Yes
2,chr2,227314020,Yes
3,chr2,227263944,Yes
4,chr2,227290810,Yes
...,...,...,...
3233,chr1,17033058,Yes
3233,chr1,17028723,Yes
3233,chr1,17028652,Yes
3234,chr1,17024076,Yes


In [41]:
df_3[df_3['POS'] == "2114176"]

Unnamed: 0,CHROM,POS,Literature
1754,chr16,2114176,Yes


In [19]:
import pandas as pd
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Protein position_Onckokb.xlsx')
df

Unnamed: 0,Alterations
0,A1742P
1,A2062V
2,A2067D
3,A59S
4,C2488Y
...,...
895,V1398D
896,K1423E
897,K1436Q
898,S1463F


In [20]:
df[['before_number', 'after_number']] = df['Alterations'].str.split(r'(\d+)', expand=True).iloc[:, [0, 2]]
df['number'] = df['Alterations'].str.extract(r'(\d+)')
df

Unnamed: 0,Alterations,before_number,after_number,number
0,A1742P,A,P,1742
1,A2062V,A,V,2062
2,A2067D,A,D,2067
3,A59S,A,S,59
4,C2488Y,C,Y,2488
...,...,...,...,...
895,V1398D,V,D,1398
896,K1423E,K,E,1423
897,K1436Q,K,Q,1436
898,S1463F,S,F,1463


In [21]:
df.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/final_df.xlsx', index=False)

In [22]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/final_df.xlsx')
df

Unnamed: 0,Alterations,change
0,A1742P,p.Ala1742Pro
1,A2062V,p.Ala2062Val
2,A2067D,p.Ala2067Asp
3,A59S,p.Ala59Ser
4,C2488Y,p.Cys2488Tyr
...,...,...
865,R1391S,p.Arg1391Ser
866,V1398D,p.Val1398Asp
867,K1423E,p.Lys1423Glu
868,K1436Q,p.Lys1436Gln


In [23]:
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Oncokb Database.xlsx')
data

Unnamed: 0,Gene,Alterations,Oncogenic,Mutation effect,Description from OncoKB_Database,Citations,PMID,Count
0,ATM,A1742P,Likely Oncogenic,Likely Loss-of-function,The ATM A1742P mutation is located outside of ...,Mutations in the ATM gene lead to impaired ove...,PMID: 16014569,1.0
1,ATM,A2062V,Likely Oncogenic,Likely Loss-of-function,The ATM A2062V mutation is located in the FAT ...,ATM mutations uniformly lead to ATM dysfunctio...,PMID: 23585524,1.0
2,ATM,A2067D,Likely Oncogenic,Likely Loss-of-function,The ATM A2067D mutation is located in the FAT ...,A-TWinnipeg: Pathogenesis of rare ATM missense...,PMID: 25077176,1.0
3,ATM,A59S,Likely Oncogenic,Likely Loss-of-function,The ATM A59S mutation is located outside of a ...,Loss of DNA Damage Response in Neuroblastoma a...,PMID: 29059438,1.0
4,ATM,C2488Y,Likely Oncogenic,Likely Loss-of-function,The ATM C2488Y mutation is located in the FAT ...,ATM mutations uniformly lead to ATM dysfunctio...,PMID: 23585524,1.0
...,...,...,...,...,...,...,...,...
881,NF1,V1398D,Likely Oncogenic,Likely Loss-of-function,The NF1 V1398D mutation is located in the GTPa...,Assessment of the potential pathogenicity of m...,PMID: 22807134,1.0
882,NF1,K1423E,Oncogenic,Loss-of-function,The NF1 K1423E mutation is located in the GTPa...,Somatic mutations in the neurofibromatosis 1 g...,"PMID: 1568247, PMID: 16513807, PMID: 32697994",3.0
883,NF1,K1436Q,Likely Oncogenic,Likely Loss-of-function,The NF1 K1436Q mutation is located in the GTPa...,Somatic mutations in the neurofibromatosis 1 g...,"PMID: 1568247, PMID: 16513807, PMID: 32697994",3.0
884,NF1,S1463F,Likely Oncogenic,Likely Loss-of-function,The NF1 S1463F mutation is located in the GTPa...,Exploring the somatic NF1 mutational spectrum ...,"PMID: 22108604, PMID: 22807134",2.0


In [24]:
merged = pd.merge(data, df, on = 'Alterations', how = 'inner', sort=False)
merged

Unnamed: 0,Gene,Alterations,Oncogenic,Mutation effect,Description from OncoKB_Database,Citations,PMID,Count,change
0,ATM,A1742P,Likely Oncogenic,Likely Loss-of-function,The ATM A1742P mutation is located outside of ...,Mutations in the ATM gene lead to impaired ove...,PMID: 16014569,1.0,p.Ala1742Pro
1,ATM,A2062V,Likely Oncogenic,Likely Loss-of-function,The ATM A2062V mutation is located in the FAT ...,ATM mutations uniformly lead to ATM dysfunctio...,PMID: 23585524,1.0,p.Ala2062Val
2,ATM,A2067D,Likely Oncogenic,Likely Loss-of-function,The ATM A2067D mutation is located in the FAT ...,A-TWinnipeg: Pathogenesis of rare ATM missense...,PMID: 25077176,1.0,p.Ala2067Asp
3,ATM,A59S,Likely Oncogenic,Likely Loss-of-function,The ATM A59S mutation is located outside of a ...,Loss of DNA Damage Response in Neuroblastoma a...,PMID: 29059438,1.0,p.Ala59Ser
4,ATM,C2488Y,Likely Oncogenic,Likely Loss-of-function,The ATM C2488Y mutation is located in the FAT ...,ATM mutations uniformly lead to ATM dysfunctio...,PMID: 23585524,1.0,p.Cys2488Tyr
...,...,...,...,...,...,...,...,...,...
864,NF1,R1391S,Likely Oncogenic,Likely Loss-of-function,The NF1 R1391S mutation is located in the GTPa...,Mutational and functional analysis of the neur...,"PMID: 9003501, PMID: 16513807, PMID: 9668168",3.0,p.Arg1391Ser
865,NF1,V1398D,Likely Oncogenic,Likely Loss-of-function,The NF1 V1398D mutation is located in the GTPa...,Assessment of the potential pathogenicity of m...,PMID: 22807134,1.0,p.Val1398Asp
866,NF1,K1423E,Oncogenic,Loss-of-function,The NF1 K1423E mutation is located in the GTPa...,Somatic mutations in the neurofibromatosis 1 g...,"PMID: 1568247, PMID: 16513807, PMID: 32697994",3.0,p.Lys1423Glu
867,NF1,K1436Q,Likely Oncogenic,Likely Loss-of-function,The NF1 K1436Q mutation is located in the GTPa...,Somatic mutations in the neurofibromatosis 1 g...,"PMID: 1568247, PMID: 16513807, PMID: 32697994",3.0,p.Lys1436Gln


In [25]:
merged.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/final_onco.xlsx', index=False)

In [22]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/final_onco.xlsx')
df.head()

Unnamed: 0,Gene Names,Alterations,Oncogenic,Mutation effect,Description from OncoKB_Database,Citations,PMID,Count
0,ATM,p.Ala1742Pro,Likely Oncogenic,Likely Loss-of-function,The ATM A1742P mutation is located outside of ...,Mutations in the ATM gene lead to impaired ove...,PMID: 16014569,1.0
1,ATM,p.Ala2062Val,Likely Oncogenic,Likely Loss-of-function,The ATM A2062V mutation is located in the FAT ...,ATM mutations uniformly lead to ATM dysfunctio...,PMID: 23585524,1.0
2,ATM,p.Ala2067Asp,Likely Oncogenic,Likely Loss-of-function,The ATM A2067D mutation is located in the FAT ...,A-TWinnipeg: Pathogenesis of rare ATM missense...,PMID: 25077176,1.0
3,ATM,p.Ala59Ser,Likely Oncogenic,Likely Loss-of-function,The ATM A59S mutation is located outside of a ...,Loss of DNA Damage Response in Neuroblastoma a...,PMID: 29059438,1.0
4,ATM,p.Cys2488Tyr,Likely Oncogenic,Likely Loss-of-function,The ATM C2488Y mutation is located in the FAT ...,ATM mutations uniformly lead to ATM dysfunctio...,PMID: 23585524,1.0


# ONCO samples code

In [2]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/final_onco.xlsx')

data1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/ONCO_17_samples/input/Germline_KHULTRGPCSP1 - KHULTRGPCSP1_S76_L004_R.xlsx', header=1, usecols=list(range(18)))
data1['Alterations'] = data1['HGVS p. (Clinically Relevant)'].str.split(':').str[1]

data1['Gene Match'] = 'No'
data1['Matched_Gene'] = ''
    
# Iterate through each gene in vcf['Gene']
for index, genes in data1['Gene Names'].iteritems():
    if isinstance(genes, str):
        gene_list = genes.split(',')
        for gene in gene_list:
            if gene in df['Gene Names'].values:
                data1.at[index, 'Gene Match'] = 'Yes'
                data1.at[index, 'Matched_Gene'] = gene
                break
    
df = df.rename({'Gene Names':'Matched_Gene'}, axis=1)
mer = pd.merge(data1, df, on = ['Matched_Gene', 'Alterations'], how = 'left', sort=False)
mer = mer.dropna(subset=['Oncogenic'])
mer = mer.drop(['Gene Match', 'Matched_Gene'], axis=1)
mer.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/ONCO_17_samples/Germline_KHULTRGPCSP1 - KHULTRGPCSP1_S76_L004_R.xlsx', index=False)
mer

Unnamed: 0,Chr:Pos,Ref/Alt,Identifier,Variants to track,Filter,Allelic Depths (AD),Variant Allele Fraction,Genotype Qualities (GQ),0/1 Genotypes (GT),Gene Names,Sequence Ontology (Combined),Effect (Combined),N of 4 Predicted Splicing Disrupted (Combined),Predicted Splicing Disrupted (Combined),Transcript Name (Clinically Relevant),HGVS c. (Clinically Relevant),HGVS p. (Clinically Relevant),Transcript Name,Alterations,Oncogenic,Mutation effect,Description from OncoKB_Database,Citations,PMID,Count
27192,13:32332592,A/C,,False,,3932,0.450704,99,0/1,BRCA2,missense_variant,Missense,,,NM_000059.4,NM_000059.4:c.1114A>C,NP_000050.3:p.Asn372His,NM_000059.4,p.Asn372His,Likely Neutral,Likely Neutral,The BRCA2 N372H mutation is located in the P/C...,Functional assays for classification of BRCA2 ...,"PMID: 18451181, PMID: 15695382",1.0
27198,13:32355250,T/C,,False,,101,1.0,99,1/1,BRCA2,missense_variant,Missense,,,NM_000059.4,NM_000059.4:c.7397T>C,NP_000050.3:p.Val2466Ala,NM_000059.4,p.Val2466Ala,Likely Neutral,Likely Neutral,The BRCA2 V2466A mutation is located in the FA...,Assessment of the Clinical Relevance of BRCA2 ...,PMID: 29394989,1.0
33666,17:43071077,T/C,,False,,104,1.0,99,1/1,BRCA1,missense_variant,Missense,,,NM_007294.4,NM_007294.4:c.4837A>G,NP_009225.1:p.Ser1613Gly,"NM_007298.3,NM_007299.4,NM_007297.4,NM_007300....",p.Ser1613Gly,Likely Neutral,Likely Neutral,The BRCA1 S1613G mutation is likely neutral. I...,Human BRCA1 inhibits growth in yeast: potentia...,PMID: 9159158,1.0


# --------------------------------------------------------------------------------------------------------------

In [78]:
mer.shape

(13, 25)

In [12]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/final_onco.xlsx')
df

Unnamed: 0,Gene Names,Alterations,Oncogenic,Mutation effect,Description from OncoKB_Database,Citations,PMID,Count
0,ATM,p.Ala1742Pro,Likely Oncogenic,Likely Loss-of-function,The ATM A1742P mutation is located outside of ...,Mutations in the ATM gene lead to impaired ove...,PMID: 16014569,1.0
1,ATM,p.Ala2062Val,Likely Oncogenic,Likely Loss-of-function,The ATM A2062V mutation is located in the FAT ...,ATM mutations uniformly lead to ATM dysfunctio...,PMID: 23585524,1.0
2,ATM,p.Ala2067Asp,Likely Oncogenic,Likely Loss-of-function,The ATM A2067D mutation is located in the FAT ...,A-TWinnipeg: Pathogenesis of rare ATM missense...,PMID: 25077176,1.0
3,ATM,p.Ala59Ser,Likely Oncogenic,Likely Loss-of-function,The ATM A59S mutation is located outside of a ...,Loss of DNA Damage Response in Neuroblastoma a...,PMID: 29059438,1.0
4,ATM,p.Cys2488Tyr,Likely Oncogenic,Likely Loss-of-function,The ATM C2488Y mutation is located in the FAT ...,ATM mutations uniformly lead to ATM dysfunctio...,PMID: 23585524,1.0
...,...,...,...,...,...,...,...,...
864,NF1,p.Arg1391Ser,Likely Oncogenic,Likely Loss-of-function,The NF1 R1391S mutation is located in the GTPa...,Mutational and functional analysis of the neur...,"PMID: 9003501, PMID: 16513807, PMID: 9668168",3.0
865,NF1,p.Val1398Asp,Likely Oncogenic,Likely Loss-of-function,The NF1 V1398D mutation is located in the GTPa...,Assessment of the potential pathogenicity of m...,PMID: 22807134,1.0
866,NF1,p.Lys1423Glu,Oncogenic,Loss-of-function,The NF1 K1423E mutation is located in the GTPa...,Somatic mutations in the neurofibromatosis 1 g...,"PMID: 1568247, PMID: 16513807, PMID: 32697994",3.0
867,NF1,p.Lys1436Gln,Likely Oncogenic,Likely Loss-of-function,The NF1 K1436Q mutation is located in the GTPa...,Somatic mutations in the neurofibromatosis 1 g...,"PMID: 1568247, PMID: 16513807, PMID: 32697994",3.0


In [4]:
data1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Germline_KHSCQTGPTTL2 - KHSCQTGPTTL2_S31_L004_R.xlsx', header=1, usecols=list(range(18)))
data1['Alterations'] = data1['HGVS p. (Clinically Relevant)'].str.split(':').str[1]
data1

Unnamed: 0,Chr:Pos,Ref/Alt,Identifier,Filter,Variant Allele Fraction,Allelic Depths (AD),Read Depths (DP),Genotype Qualities (GQ),0/1 Genotypes (GT),Gene Names,Sequence Ontology (Combined),Effect (Combined),N of 4 Predicted Splicing Disrupted (Combined),Predicted Splicing Disrupted (Combined),Transcript Name (Clinically Relevant),HGVS c. (Clinically Relevant),HGVS p. (Clinically Relevant),Transcript Name
0,1:14522,G/A,,,1,02,2,6,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1609C>T,,NR_024540.1
1,1:14542,A/G,,,1,03,3,9,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1589T>C,,NR_024540.1
2,1:14574,A/G,,,1,03,3,9,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1557T>C,,NR_024540.1
3,1:14590,G/A,,,1,03,3,9,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1541C>T,,NR_024540.1
4,1:14599,T/A,,,1,03,3,9,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1532A>T,,NR_024540.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164045,Un_KI270756v1:2828,A/T,,,1,02,2,6,1/1,,intergenic_variant,Other,,,,,,
164046,Un_KI270757v1:23426,G/A,,,1,02,2,6,1/1,,intergenic_variant,Other,,,,,,
164047,Un_KI270757v1:23500,A/C,,,1,02,2,6,1/1,,intergenic_variant,Other,,,,,,
164048,Un_KI270757v1:52356,A/G,,LowQual,1,02,2,6,1/1,,intergenic_variant,Other,,,,,,


In [8]:
data1['Alterations'] = data1['HGVS p. (Clinically Relevant)'].str.split(':').str[1]
data1.Alterations.value_counts()

p.Leu12=       8
p.Leu89=       8
p.Ala63=       8
p.Leu101=      7
p.Leu189=      7
              ..
p.Ile287Leu    1
p.Asp158=      1
p.His61Arg     1
p.Arg232Ser    1
p.Gly465=      1
Name: Alterations, Length: 22441, dtype: int64

In [14]:
data1['Gene Match'] = 'No'
data1['Matched_Gene'] = ''
    
# Iterate through each gene in vcf['Gene']
for index, genes in data1['Gene Names'].iteritems():
    if isinstance(genes, str):
        gene_list = genes.split(',')
        for gene in gene_list:
            if gene in df['Gene Names'].values:
                data1.at[index, 'Gene Match'] = 'Yes'
                data1.at[index, 'Matched_Gene'] = gene
                break
    
data1

Unnamed: 0,Chr:Pos,Ref/Alt,Identifier,Filter,Variant Allele Fraction,Allelic Depths (AD),Read Depths (DP),Genotype Qualities (GQ),0/1 Genotypes (GT),Gene Names,Sequence Ontology (Combined),Effect (Combined),N of 4 Predicted Splicing Disrupted (Combined),Predicted Splicing Disrupted (Combined),Transcript Name (Clinically Relevant),HGVS c. (Clinically Relevant),HGVS p. (Clinically Relevant),Transcript Name,Alterations,Gene Match,Matched_Gene
0,1:14522,G/A,,,1,02,2,6,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1609C>T,,NR_024540.1,,No,
1,1:14542,A/G,,,1,03,3,9,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1589T>C,,NR_024540.1,,No,
2,1:14574,A/G,,,1,03,3,9,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1557T>C,,NR_024540.1,,No,
3,1:14590,G/A,,,1,03,3,9,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1541C>T,,NR_024540.1,,No,
4,1:14599,T/A,,,1,03,3,9,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1532A>T,,NR_024540.1,,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164045,Un_KI270756v1:2828,A/T,,,1,02,2,6,1/1,,intergenic_variant,Other,,,,,,,,No,
164046,Un_KI270757v1:23426,G/A,,,1,02,2,6,1/1,,intergenic_variant,Other,,,,,,,,No,
164047,Un_KI270757v1:23500,A/C,,,1,02,2,6,1/1,,intergenic_variant,Other,,,,,,,,No,
164048,Un_KI270757v1:52356,A/G,,LowQual,1,02,2,6,1/1,,intergenic_variant,Other,,,,,,,,No,


In [15]:
data1.Matched_Gene.value_counts()

          163870
NF1           32
BRCA2         30
ATM           24
BARD1         22
BRIP1         16
MSH2          14
MSH6           8
MRE11          8
CHEK2          8
CDH1           7
CDKN2A         4
MUTYH          3
BRCA1          2
HOXB13         2
Name: Matched_Gene, dtype: int64

In [16]:
df = df.rename({'Gene Names':'Matched_Gene'}, axis=1)
df

Unnamed: 0,Matched_Gene,Alterations,Oncogenic,Mutation effect,Description from OncoKB_Database,Citations,PMID,Count
0,ATM,p.Ala1742Pro,Likely Oncogenic,Likely Loss-of-function,The ATM A1742P mutation is located outside of ...,Mutations in the ATM gene lead to impaired ove...,PMID: 16014569,1.0
1,ATM,p.Ala2062Val,Likely Oncogenic,Likely Loss-of-function,The ATM A2062V mutation is located in the FAT ...,ATM mutations uniformly lead to ATM dysfunctio...,PMID: 23585524,1.0
2,ATM,p.Ala2067Asp,Likely Oncogenic,Likely Loss-of-function,The ATM A2067D mutation is located in the FAT ...,A-TWinnipeg: Pathogenesis of rare ATM missense...,PMID: 25077176,1.0
3,ATM,p.Ala59Ser,Likely Oncogenic,Likely Loss-of-function,The ATM A59S mutation is located outside of a ...,Loss of DNA Damage Response in Neuroblastoma a...,PMID: 29059438,1.0
4,ATM,p.Cys2488Tyr,Likely Oncogenic,Likely Loss-of-function,The ATM C2488Y mutation is located in the FAT ...,ATM mutations uniformly lead to ATM dysfunctio...,PMID: 23585524,1.0
...,...,...,...,...,...,...,...,...
864,NF1,p.Arg1391Ser,Likely Oncogenic,Likely Loss-of-function,The NF1 R1391S mutation is located in the GTPa...,Mutational and functional analysis of the neur...,"PMID: 9003501, PMID: 16513807, PMID: 9668168",3.0
865,NF1,p.Val1398Asp,Likely Oncogenic,Likely Loss-of-function,The NF1 V1398D mutation is located in the GTPa...,Assessment of the potential pathogenicity of m...,PMID: 22807134,1.0
866,NF1,p.Lys1423Glu,Oncogenic,Loss-of-function,The NF1 K1423E mutation is located in the GTPa...,Somatic mutations in the neurofibromatosis 1 g...,"PMID: 1568247, PMID: 16513807, PMID: 32697994",3.0
867,NF1,p.Lys1436Gln,Likely Oncogenic,Likely Loss-of-function,The NF1 K1436Q mutation is located in the GTPa...,Somatic mutations in the neurofibromatosis 1 g...,"PMID: 1568247, PMID: 16513807, PMID: 32697994",3.0


In [17]:
mer = pd.merge(data1, df, on = ['Matched_Gene', 'Alterations'], how = 'left', sort=False)
mer

Unnamed: 0,Chr:Pos,Ref/Alt,Identifier,Filter,Variant Allele Fraction,Allelic Depths (AD),Read Depths (DP),Genotype Qualities (GQ),0/1 Genotypes (GT),Gene Names,Sequence Ontology (Combined),Effect (Combined),N of 4 Predicted Splicing Disrupted (Combined),Predicted Splicing Disrupted (Combined),Transcript Name (Clinically Relevant),HGVS c. (Clinically Relevant),HGVS p. (Clinically Relevant),Transcript Name,Alterations,Gene Match,Matched_Gene,Oncogenic,Mutation effect,Description from OncoKB_Database,Citations,PMID,Count
0,1:14522,G/A,,,1,02,2,6,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1609C>T,,NR_024540.1,,No,,,,,,,
1,1:14542,A/G,,,1,03,3,9,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1589T>C,,NR_024540.1,,No,,,,,,,
2,1:14574,A/G,,,1,03,3,9,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1557T>C,,NR_024540.1,,No,,,,,,,
3,1:14590,G/A,,,1,03,3,9,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1541C>T,,NR_024540.1,,No,,,,,,,
4,1:14599,T/A,,,1,03,3,9,1/1,WASH7P,non_coding_exon_variant,Other,,,NR_024540.1,NR_024540.1:n.1532A>T,,NR_024540.1,,No,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164045,Un_KI270756v1:2828,A/T,,,1,02,2,6,1/1,,intergenic_variant,Other,,,,,,,,No,,,,,,,
164046,Un_KI270757v1:23426,G/A,,,1,02,2,6,1/1,,intergenic_variant,Other,,,,,,,,No,,,,,,,
164047,Un_KI270757v1:23500,A/C,,,1,02,2,6,1/1,,intergenic_variant,Other,,,,,,,,No,,,,,,,
164048,Un_KI270757v1:52356,A/G,,LowQual,1,02,2,6,1/1,,intergenic_variant,Other,,,,,,,,No,,,,,,,


In [19]:
mer = mer.dropna(subset=['Oncogenic'])
mer = mer.drop(['Gene Match', 'Matched_Gene'], axis=1)
mer

Unnamed: 0,Chr:Pos,Ref/Alt,Identifier,Filter,Variant Allele Fraction,Allelic Depths (AD),Read Depths (DP),Genotype Qualities (GQ),0/1 Genotypes (GT),Gene Names,Sequence Ontology (Combined),Effect (Combined),N of 4 Predicted Splicing Disrupted (Combined),Predicted Splicing Disrupted (Combined),Transcript Name (Clinically Relevant),HGVS c. (Clinically Relevant),HGVS p. (Clinically Relevant),Transcript Name,Alterations,Oncogenic,Mutation effect,Description from OncoKB_Database,Citations,PMID,Count
103734,13:32332592,A/C,,,0.377358,3320,53,99,0/1,BRCA2,missense_variant,Missense,,,NM_000059.4,NM_000059.4:c.1114A>C,NP_000050.3:p.Asn372His,NM_000059.4,p.Asn372His,Likely Neutral,Likely Neutral,The BRCA2 N372H mutation is located in the P/C...,Functional assays for classification of BRCA2 ...,"PMID: 18451181, PMID: 15695382",1.0
103744,13:32355250,T/C,,,1.0,45,45,99,1/1,BRCA2,missense_variant,Missense,,,NM_000059.4,NM_000059.4:c.7397T>C,NP_000050.3:p.Val2466Ala,NM_000059.4,p.Val2466Ala,Likely Neutral,Likely Neutral,The BRCA2 V2466A mutation is located in the FA...,Assessment of the Clinical Relevance of BRCA2 ...,PMID: 29394989,1.0


In [19]:
import pandas as pd
import xlsxwriter

# Sample DataFrame
data = {
    '21_conditions_list': [
        'Cholesterol Disorders',
        'Skin Health',
        'Gut Health (Irritable Bowel Disease, Inflammatory Bowel Syndrome)',
        'Gut Health (Irritable Bowel Disease, Inflammatory Bowel Syndrome)',
        'Skin Health',
        'Skin Health',
        'Skin Health',
        'Skin Health',
        'Diabetes, Cardiac Health(CAD, Arrythmias, Cardiomyopathy), Cholesterol Disorders',
        'Gut Health (Irritable Bowel Disease, Inflammatory Bowel Syndrome)',
        'Gut Health (Irritable Bowel Disease, Inflammatory Bowel Syndrome)'
    ],
    'rsID': [
        'rs6687605',
        'rs1764391',
        'rs3737240',
        'rs13294',
        'rs7518080',
        'rs61816760',
        'rs192116923',
        'rs34806697',
        'rs513043',
        'rs1801274',
        'rs382627'
    ],
    'Literature': [
        'No',
        'No',
        'No',
        'No',
        'No',
        'No',
        'No',
        'No',
        'No',
        'No',
        'No'
    ],
    'CHROM': [
        'chr1',
        'chr1',
        'chr1',
        'chr1',
        'chr1',
        'chr1',
        'chr1',
        'chr1',
        'chr1',
        'chr1',
        'chr1'
    ],
    'POS': [
        25563141,
        34795168,
        150510879,
        150512511,
        152304195,
        152305958,
        152306930,
        152309145,
        156129878,
        161509955,
        161518015
    ],
    'REF': [
        'T',
        'C',
        'C',
        'G',
        'C',
        'T',
        'T',
        'T',
        'T',
        'A',
        'T'
    ],
    'ALT': [
        'C',
        'T',
        'T',
        'A',
        'A',
        'G',
        'G',
        'C',
        'G',
        'G',
        'C'
    ]
}

df = pd.DataFrame(data)
df

Unnamed: 0,21_conditions_list,rsID,Literature,CHROM,POS,REF,ALT
0,Cholesterol Disorders,rs6687605,No,chr1,25563141,T,C
1,Skin Health,rs1764391,No,chr1,34795168,C,T
2,"Gut Health (Irritable Bowel Disease, Inflammat...",rs3737240,No,chr1,150510879,C,T
3,"Gut Health (Irritable Bowel Disease, Inflammat...",rs13294,No,chr1,150512511,G,A
4,Skin Health,rs7518080,No,chr1,152304195,C,A
5,Skin Health,rs61816760,No,chr1,152305958,T,G
6,Skin Health,rs192116923,No,chr1,152306930,T,G
7,Skin Health,rs34806697,No,chr1,152309145,T,C
8,"Diabetes, Cardiac Health(CAD, Arrythmias, Card...",rs513043,No,chr1,156129878,T,G
9,"Gut Health (Irritable Bowel Disease, Inflammat...",rs1801274,No,chr1,161509955,A,G


In [22]:
# Create an Excel writer object
output_path = r'C:/Users/GenepoweRx_Madhu/Downloads/sample_output.xlsx'
workbook = xlsxwriter.Workbook(output_path)
worksheet_names = []  # To keep track of worksheet names

# Iterate through unique conditions
for condition in df['21_conditions_list'].explode().unique():
    # Filter rows for the current condition, including rows with multiple conditions
    condition_df = df[df['21_conditions_list'].apply(lambda x: condition in x)]
    
    # Truncate long worksheet names to a maximum of 31 characters
    sheet_name = condition[:31]
    
    # Ensure that the sheet name is unique
    suffix = 1
    while sheet_name in worksheet_names:
        suffix += 1
        sheet_name = condition[:28] + f'_{suffix}'
    
    worksheet_names.append(sheet_name)
    
    # Create a new worksheet with the truncated sheet name
    worksheet = workbook.add_worksheet(sheet_name)
    
    # Write the filtered DataFrame to the worksheet
    for row_num, (_, row_data) in enumerate(condition_df.iterrows(), start=1):
        for col_num, value in enumerate(row_data, start=1):
            if isinstance(value, list):
                # Convert lists to a comma-separated string
                value = ', '.join(value)
            worksheet.write(row_num, col_num, value)

# Close the workbook to save it
workbook.close()

In [13]:
import pandas as pd

# Sample DataFrame
data = {
    'Gene': ['A', 'B', 'C', 'D', 'E'],
    'rsID': ['rs1', 'rs2', 'rs1', 'rs3', 'rs1'],
    'CHROM': ['chr1', 'chr2', 'chr1', 'chr3', 'chr1'],
    'POS': [100, 200, 300, 400, 500],
    'REF': ['A', 'C', 'T', 'G', 'A'],
    'ALT': ['T', 'G', 'C', 'A', 'T'],
    'EXON': ['5/6', '6/8', '6/8', '2/5', '6/8']
}

df = pd.DataFrame(data)
df

Unnamed: 0,Gene,rsID,CHROM,POS,REF,ALT,EXON
0,A,rs1,chr1,100,A,T,5/6
1,B,rs2,chr2,200,C,G,6/8
2,C,rs1,chr1,300,T,C,6/8
3,D,rs3,chr3,400,G,A,2/5
4,E,rs1,chr1,500,A,T,6/8


In [14]:
# Step 1: Identify rows with duplicate 'rsID' values
duplicate_rsID = df[df['rsID'].duplicated(keep=False)]

# Step 2: Select the row with the highest denominator in the 'EXON' column
duplicate_rsID['EXON'] = duplicate_rsID['EXON'].apply(lambda x: max(x.split('/'), key=int))

# Step 3: Keep the first occurrence of rows with the same 'rsID' and 'EXON' values
result_df = duplicate_rsID.drop_duplicates(subset=['rsID', 'EXON'], keep='first')

# Print or manipulate the result DataFrame as needed
result_df

Unnamed: 0,Gene,rsID,CHROM,POS,REF,ALT,EXON
0,A,rs1,chr1,100,A,T,6
2,C,rs1,chr1,300,T,C,8


In [39]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Documents/Processed_vcf_files/21 Conditions+Genes info (1).xlsx')
df

Unnamed: 0,Genes,Condition,Headings,21_conditions_list
0,CTLA4,Type 1 Diabetes,Predisposition for diabetes,Diabetes
1,INS,Type 1 Diabetes,Predisposition for diabetes,Diabetes
2,IL2RA,Type 1 Diabetes,Predisposition for diabetes,Diabetes
3,HNF1A,Type 1 Diabetes,Predisposition for diabetes,Diabetes
4,CEL,Type 1 Diabetes,Predisposition for diabetes,Diabetes
...,...,...,...,...
757,RYR1,Obesity,Obesity,Obesity
758,SIM1,Obesity,Obesity,Obesity
759,BRCA2,Obesity,Obesity,Obesity
760,SCN1A,Obesity,Obesity,Obesity


In [40]:
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Documents/Processed_vcf_files/21-cond.xlsx')
data

Unnamed: 0,21_conditions_list,21_Conditions_list
0,Diabetes,Diabetes
1,High Blood Pressure,High_Blood_Pressure
2,"Cardiac Health(CAD, Arrythmias, Cardiomyopathy)",Cardiac_Health
3,Cholesterol Disorders,Cholesterol_Disorders
4,Thyroid Disorders,Thyroid_Disorders
5,Parkinson's,Parkinsons
6,Dementia,Dementia
7,Headaches,Headaches
8,"Allergies (Respiratory, Food)",Allergies
9,Anemia,Anemia


In [41]:
merged_df = pd.merge(df, data, on = '21_conditions_list', how = 'left', sort=False)
merged_df = merged_df[['Genes', 'Condition', 'Headings', '21_Conditions_list']]
merged_df

Unnamed: 0,Genes,Condition,Headings,21_Conditions_list
0,CTLA4,Type 1 Diabetes,Predisposition for diabetes,Diabetes
1,INS,Type 1 Diabetes,Predisposition for diabetes,Diabetes
2,IL2RA,Type 1 Diabetes,Predisposition for diabetes,Diabetes
3,HNF1A,Type 1 Diabetes,Predisposition for diabetes,Diabetes
4,CEL,Type 1 Diabetes,Predisposition for diabetes,Diabetes
...,...,...,...,...
757,RYR1,Obesity,Obesity,Obesity
758,SIM1,Obesity,Obesity,Obesity
759,BRCA2,Obesity,Obesity,Obesity
760,SCN1A,Obesity,Obesity,Obesity


In [70]:
merged_df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/dummy_genes.xlsx')
merged_df

Unnamed: 0,Genes,Condition,Headings,21_Conditions_list
0,CTLA4,Type 1 Diabetes,Predisposition for diabetes,Diabetes
1,INS,Type 1 Diabetes,Predisposition for diabetes,Diabetes
2,IL2RA,Type 1 Diabetes,Predisposition for diabetes,Diabetes
3,HNF1A,Type 1 Diabetes,Predisposition for diabetes,Diabetes
4,CEL,Type 1 Diabetes,Predisposition for diabetes,Diabetes
...,...,...,...,...
757,RYR1,Obesity,Obesity,Obesity
758,SIM1,Obesity,Obesity,Obesity
759,BRCA2,Obesity,Obesity,Obesity
760,SCN1A,Obesity,Obesity,Obesity


In [71]:
result = merged_df.groupby('Genes').agg({
    'Condition': lambda x: '; '.join(x),
    'Headings': lambda x: '; '.join(x),
    '21_Conditions_list': lambda x: '; '.join(x.astype(str))
}).reset_index()
result = result.rename({'Genes': 'Gene Name'}, axis=1)
result

Unnamed: 0,Gene Name,Condition,Headings,21_Conditions_list
0,A2ML1,Other Skin Conditions,Skin Health,Skin_Health
1,AAGAB,Dry Skin/fragile skin,Skin Health,Skin_Health
2,ABCA1,Coronary artery disease risk; Polygenic (Famil...,Cardiac; Cholesterol disorders; Gallstones,Cardiac_Health; Cholesterol_Disorders; Gall_st...
3,ABCA12,Dry Skin/fragile skin,Skin Health,Skin_Health
4,ABCB4,"Polygenic (Familial combined, small dense LDL,...",Cholesterol disorders,Cholesterol_Disorders
...,...,...,...,...
579,XPC,Dry Skin/fragile skin,Skin Health,Skin_Health
580,ZMPSTE24,"Polygenic (Familial combined, small dense LDL,...",Cholesterol disorders,Cholesterol_Disorders
581,ZNF804A,Schizophrenia,Mood Disorder,Mood_Disorders
582,ZNRF1,Diabetic Retinopathy,Diabetic Retinopathy,Diabetes


In [72]:
result[result['21_Conditions_list'] == 'Diabetes; nan']

Unnamed: 0,Gene Name,Condition,Headings,21_Conditions_list


In [73]:
scores = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/condition_specific/Condition_specific_all_lit_genes/Final_lit/last_conditions/genes/genes_scores.xlsx')
scores

Unnamed: 0,Gene Name,Gene_Score
0,TRDN-AS1,6
1,PMS2,8
2,AASS,4
3,ABCA13,4
4,ABCA7,8
...,...,...
3935,MLYCD,6
3936,ZSWIM6,4
3937,PDHX,6
3938,LOC105378457,4


In [74]:
merged_2 = pd.merge(result, scores, on = 'Gene Name', how = 'left', sort = False)
merged_2

Unnamed: 0,Gene Name,Condition,Headings,21_Conditions_list,Gene_Score
0,A2ML1,Other Skin Conditions,Skin Health,Skin_Health,6.0
1,AAGAB,Dry Skin/fragile skin,Skin Health,Skin_Health,6.0
2,ABCA1,Coronary artery disease risk; Polygenic (Famil...,Cardiac; Cholesterol disorders; Gallstones,Cardiac_Health; Cholesterol_Disorders; Gall_st...,7.0
3,ABCA12,Dry Skin/fragile skin,Skin Health,Skin_Health,6.0
4,ABCB4,"Polygenic (Familial combined, small dense LDL,...",Cholesterol disorders,Cholesterol_Disorders,6.0
...,...,...,...,...,...
579,XPC,Dry Skin/fragile skin,Skin Health,Skin_Health,6.0
580,ZMPSTE24,"Polygenic (Familial combined, small dense LDL,...",Cholesterol disorders,Cholesterol_Disorders,4.0
581,ZNF804A,Schizophrenia,Mood Disorder,Mood_Disorders,
582,ZNRF1,Diabetic Retinopathy,Diabetic Retinopathy,Diabetes,


In [66]:
nan_rows = merged_2[pd.isna(merged_2['21_Conditions_list'])]
nan_rows

Unnamed: 0,Gene Name,Condition,Headings,21_Conditions_list,Gene_Score


In [67]:
merged_2['21_Conditions_list'].value_counts()

Skin_Health                                                                                          83
Cardiac_Health                                                                                       68
Cholesterol_Disorders                                                                                64
Diabetes                                                                                             53
Glomerular_Diseases                                                                                  39
Allergies                                                                                            29
Obesity                                                                                              25
Thyroid_Disorders                                                                                    22
High_Blood_Pressure                                                                                  22
Parkinsons                                                      

In [75]:
merged_2.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/condition_specific/Condition_specific_all_lit_genes/Final_lit/last_conditions/genes/Conditions_final_genes.xlsx', index=False)

In [60]:
merged_df.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/dummy_genes.xlsx', index=False)

In [83]:
import pandas as pd

data = {
    'Gene Name': ['PADI4', 'PADI4', 'LDLRAP1', 'LOC105378642,GJA4', 'MYSM1', 'LMNA', 'FCGR2A', 'FCGR2A', 'SELP', 'SELP', 'LOC102724528,C1orf105,PIGC', 'CACNA1S'],
    'Gene': ['PADI4', 'PADI4', 'LDLRAP1', 'LOC105378642', 'MYSM1', 'LMNA', 'FCGR2A', 'FCGR2A', 'SELP', 'SELP', 'PIGC', 'CACNA1S'],
    'Gene_Score': [0, 0, 8, 7, 4, 6, 4, 4, 4, 4, 4, 6],
    'Condition': ['Rheumatoid Arthritis', 'Rheumatoid Arthritis', 'Polygenic (Familial combined, small dense LDL, Miscellaneous)', 'Dry Skin/fragile skin', 'Diabetic Retinopathy', 'Risk of Heart Disease; Hypertrophic cardiomyopathy risk; Long QT syndrome; Cardiac Channelopathies; Polygenic (Familial combined, small dense LDL, Miscellaneous)', 'Gut Health', 'Gut Health', 'Diabetic Retinopathy', 'Diabetic Retinopathy', 'Polygenic (Familial combined, small dense LDL, Miscellaneous)', 'Migraines and other headaches'],
    'Headings': ['Autoimmune Disorder', 'Autoimmune Disorder', 'Cholesterol disorders', 'Skin Health', 'Diabetic Retinopathy', 'Risk of Heart Disease; Cardiac; Arrhythmia; Cardiac; Cholesterol disorders', 'Gut Health', 'Gut Health', 'Diabetic Retinopathy', 'Diabetic Retinopathy', 'Cholesterol disorders', 'Headaches'],
    '21_Conditions_list': ['Arthritis_Degenerative_Joint_Disease', 'Arthritis_Degenerative_Joint_Disease', '', 'Skin_Health', 'Diabetes', 'Diabetes; Cardiac_Health; Cardiac_Health; Cardiac_Health; Cholesterol_Disorders', 'Gut_Health', 'Gut_Health', 'Diabetes', 'Diabetes', 'Cholesterol_Disorders', 'Headaches'],
    'rsID': ['rs11203366', 'rs874881', 'rs6687605', 'rs1764391', 'rs12139511', 'rs513043', 'rs1801274', 'rs382627', 'rs6136', 'rs6127', 'rs1063412', 'rs3850625'],
    'Literature': ['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No']
}

df = pd.DataFrame(data)
df

Unnamed: 0,Gene Name,Gene,Gene_Score,Condition,Headings,21_Conditions_list,rsID,Literature
0,PADI4,PADI4,0,Rheumatoid Arthritis,Autoimmune Disorder,Arthritis_Degenerative_Joint_Disease,rs11203366,No
1,PADI4,PADI4,0,Rheumatoid Arthritis,Autoimmune Disorder,Arthritis_Degenerative_Joint_Disease,rs874881,No
2,LDLRAP1,LDLRAP1,8,"Polygenic (Familial combined, small dense LDL,...",Cholesterol disorders,,rs6687605,No
3,"LOC105378642,GJA4",LOC105378642,7,Dry Skin/fragile skin,Skin Health,Skin_Health,rs1764391,No
4,MYSM1,MYSM1,4,Diabetic Retinopathy,Diabetic Retinopathy,Diabetes,rs12139511,No
5,LMNA,LMNA,6,Risk of Heart Disease; Hypertrophic cardiomyop...,Risk of Heart Disease; Cardiac; Arrhythmia; Ca...,Diabetes; Cardiac_Health; Cardiac_Health; Card...,rs513043,No
6,FCGR2A,FCGR2A,4,Gut Health,Gut Health,Gut_Health,rs1801274,No
7,FCGR2A,FCGR2A,4,Gut Health,Gut Health,Gut_Health,rs382627,No
8,SELP,SELP,4,Diabetic Retinopathy,Diabetic Retinopathy,Diabetes,rs6136,No
9,SELP,SELP,4,Diabetic Retinopathy,Diabetic Retinopathy,Diabetes,rs6127,No


In [84]:
# Split the 'Condition', 'Headings', and '21_Conditions_list' columns into lists
df['Condition'] = df['Condition'].str.split('; ')
df['Headings'] = df['Headings'].str.split('; ')
df['21_Conditions_list'] = df['21_Conditions_list'].str.split('; ')

# Duplicate rows only if they contain a semicolon
df_with_semicolon = df[df['Condition'].apply(lambda x: len(x) > 1)]
df_without_semicolon = df[df['Condition'].apply(lambda x: len(x) == 1)]

# Duplicate rows with semicolons and explode the lists
df_with_semicolon = df_with_semicolon.explode('Condition').explode('Headings').explode('21_Conditions_list')

# Concatenate the dataframes back together
df = pd.concat([df_with_semicolon, df_without_semicolon], ignore_index=True)

# Reset the index
df.reset_index(drop=True, inplace=True)

# Print the DataFrame
df

Unnamed: 0,Gene Name,Gene,Gene_Score,Condition,Headings,21_Conditions_list,rsID,Literature
0,LMNA,LMNA,6,Risk of Heart Disease,Risk of Heart Disease,Diabetes,rs513043,No
1,LMNA,LMNA,6,Risk of Heart Disease,Risk of Heart Disease,Cardiac_Health,rs513043,No
2,LMNA,LMNA,6,Risk of Heart Disease,Risk of Heart Disease,Cardiac_Health,rs513043,No
3,LMNA,LMNA,6,Risk of Heart Disease,Risk of Heart Disease,Cardiac_Health,rs513043,No
4,LMNA,LMNA,6,Risk of Heart Disease,Risk of Heart Disease,Cholesterol_Disorders,rs513043,No
...,...,...,...,...,...,...,...,...
131,FCGR2A,FCGR2A,4,[Gut Health],[Gut Health],[Gut_Health],rs382627,No
132,SELP,SELP,4,[Diabetic Retinopathy],[Diabetic Retinopathy],[Diabetes],rs6136,No
133,SELP,SELP,4,[Diabetic Retinopathy],[Diabetic Retinopathy],[Diabetes],rs6127,No
134,"LOC102724528,C1orf105,PIGC",PIGC,4,"[Polygenic (Familial combined, small dense LDL...",[Cholesterol disorders],[Cholesterol_Disorders],rs1063412,No


In [91]:
import pandas as pd

data = {
    'column1': ['data; gene1', 'gene; gene1; gene2', 'gene3'],
    'column2': ['gene4; gene6', 'gene6; gene7; gene88', 'gene99']
}

# Create a DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,column1,column2
0,data; gene1,gene4; gene6
1,gene; gene1; gene2,gene6; gene7; gene88
2,gene3,gene99


In [92]:


# Split the cells in each column by commas
df['column1'] = df['column1'].str.split('; ')
df['column2'] = df['column2'].str.split('; ')

# Initialize an empty list to store the result
result = []

# Iterate through the rows and combine elements from both columns
for index, row in df.iterrows():
    column1_elements = row['column1']
    column2_elements = row['column2']
    
    # Pair elements from both columns
    pairs = zip(column1_elements, column2_elements)
    
    # Append pairs to the result list
    result.extend(pairs)

# Create a new DataFrame from the result list
result_df = pd.DataFrame(result, columns=['column1', 'column2'])
result_df

Unnamed: 0,column1,column2
0,data,gene4
1,gene1,gene6
2,gene,gene6
3,gene1,gene7
4,gene2,gene88
5,gene3,gene99


In [112]:
import pandas as pd

# Sample DataFrame
data = {'Column1': ['A;B', 'C;D', 'E', 'F;G;H'],
        'Column2': ['1;2', '3;4', '5', '6;7;8'],
        'Column3': ['X;Y', 'Z;W', 'V', 'U;T;S'],
        'OtherData1': ['foo', 'bar', 'baz', 'qux'],
        'OtherData2': ['apple', 'banana', 'cherry', 'date']}

df = pd.DataFrame(data)
df

Unnamed: 0,Column1,Column2,Column3,OtherData1,OtherData2
0,A;B,1;2,X;Y,foo,apple
1,C;D,3;4,Z;W,bar,banana
2,E,5,V,baz,cherry
3,F;G;H,6;7;8,U;T;S,qux,date


In [113]:
# Split the columns and stack them
df = df.set_index(['OtherData1', 'OtherData2']).apply(lambda x: x.str.split(';').explode()).reset_index()

# Rename the columns
#df.columns = ['Column1', 'Column2', 'Column3', 'OtherData1', 'OtherData2']

# Display the result
df

Unnamed: 0,OtherData1,OtherData2,Column1,Column2,Column3
0,foo,apple,A,1,X
1,foo,apple,B,2,Y
2,bar,banana,C,3,Z
3,bar,banana,D,4,W
4,baz,cherry,E,5,V
5,qux,date,F,6,U
6,qux,date,G,7,T
7,qux,date,H,8,S


In [5]:
import pandas as pd
import numpy as np

# Assuming you have two DataFrames: df1 and df2
# Replace 'POS' with the actual column name in your datasets

# Sample data for illustration purposes
data1 = {'ID': [1, 2, 3], 'POS': [10, 25, 40]}
data2 = {'ID': [4, 5, 6,7], 'POS': [12, 24, 42, 55]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Create an empty 'Distance' column in df1
df1['Distance'] = None
df1['Nearest_POS'] = None

# Loop through each row in df1
for index1, row1 in df1.iterrows():
    # Calculate absolute differences between POS values
    differences = np.abs(df2['POS'] - row1['POS'])
    
    # Find the index of the minimum difference
    min_index = np.argmin(differences)
    
    # Assign the minimum difference and nearest POS to the corresponding row in df1
    df1.at[index1, 'Distance'] = differences[min_index]
    df1.at[index1, 'Nearest_POS'] = df2.at[min_index, 'POS']

# Print the updated df1
df1

Unnamed: 0,ID,POS,Distance,Nearest_POS
0,1,10,2,12
1,2,25,1,24
2,3,40,2,42
