In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [12]:
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KAPA HyperExome_hg38_capture_targets (1).bed', sep = '\t', header = None, error_bad_lines=False)
df.columns = ['chromosome', 'Start_pos', 'End_pos', 'INFO']
df

Unnamed: 0,chromosome,Start_pos,End_pos,INFO
0,chr1,65509,65629,ensembl_gene_id=ENSG00000186092;gene_symbol=OR4F5
1,chr1,69027,70017,ccds_id=CCDS30547.1;ensembl_gene_id=ENSG000001...
2,chr1,450730,451686,ccds_id=CCDS72675.1;ensembl_gene_id=ENSG000002...
3,chr1,685706,686662,ccds_id=CCDS41221.1;ensembl_gene_id=ENSG000002...
4,chr1,924421,924957,ensembl_gene_id=ENSG00000187634;gene_symbol=SA...
...,...,...,...,...
208906,chrY,25038801,25038921,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...
208907,chrY,25041766,25041886,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...
208908,chrY,25043908,25044028,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...
208909,chrY,25622433,25624073,"ccds_id=CCDS14801.1,CCDS14802.1;ensembl_gene_i..."


In [13]:
df['Extended_Start_pos'] = df['Start_pos'] - 20
df['Extended_End_pos'] = df['End_pos'] + 20
df

Unnamed: 0,chromosome,Start_pos,End_pos,INFO,Extended_Start_pos,Extended_End_pos
0,chr1,65509,65629,ensembl_gene_id=ENSG00000186092;gene_symbol=OR4F5,65489,65649
1,chr1,69027,70017,ccds_id=CCDS30547.1;ensembl_gene_id=ENSG000001...,69007,70037
2,chr1,450730,451686,ccds_id=CCDS72675.1;ensembl_gene_id=ENSG000002...,450710,451706
3,chr1,685706,686662,ccds_id=CCDS41221.1;ensembl_gene_id=ENSG000002...,685686,686682
4,chr1,924421,924957,ensembl_gene_id=ENSG00000187634;gene_symbol=SA...,924401,924977
...,...,...,...,...,...,...
208906,chrY,25038801,25038921,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25038781,25038941
208907,chrY,25041766,25041886,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25041746,25041906
208908,chrY,25043908,25044028,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25043888,25044048
208909,chrY,25622433,25624073,"ccds_id=CCDS14801.1,CCDS14802.1;ensembl_gene_i...",25622413,25624093


In [14]:
df['Gene'] = df['INFO'].str.extract(r'gene_symbol=([^;]+)')
df

Unnamed: 0,chromosome,Start_pos,End_pos,INFO,Extended_Start_pos,Extended_End_pos,Gene
0,chr1,65509,65629,ensembl_gene_id=ENSG00000186092;gene_symbol=OR4F5,65489,65649,OR4F5
1,chr1,69027,70017,ccds_id=CCDS30547.1;ensembl_gene_id=ENSG000001...,69007,70037,OR4F5
2,chr1,450730,451686,ccds_id=CCDS72675.1;ensembl_gene_id=ENSG000002...,450710,451706,OR4F29
3,chr1,685706,686662,ccds_id=CCDS41221.1;ensembl_gene_id=ENSG000002...,685686,686682,OR4F16
4,chr1,924421,924957,ensembl_gene_id=ENSG00000187634;gene_symbol=SA...,924401,924977,SAMD11
...,...,...,...,...,...,...,...
208906,chrY,25038801,25038921,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25038781,25038941,BPY2C
208907,chrY,25041766,25041886,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25041746,25041906,BPY2C
208908,chrY,25043908,25044028,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25043888,25044048,BPY2C
208909,chrY,25622433,25624073,"ccds_id=CCDS14801.1,CCDS14802.1;ensembl_gene_i...",25622413,25624093,CDY1


In [15]:
df = df[['chromosome', 'Extended_Start_pos', 'Extended_End_pos', 'INFO', 'Gene']]
df

Unnamed: 0,chromosome,Extended_Start_pos,Extended_End_pos,INFO,Gene
0,chr1,65489,65649,ensembl_gene_id=ENSG00000186092;gene_symbol=OR4F5,OR4F5
1,chr1,69007,70037,ccds_id=CCDS30547.1;ensembl_gene_id=ENSG000001...,OR4F5
2,chr1,450710,451706,ccds_id=CCDS72675.1;ensembl_gene_id=ENSG000002...,OR4F29
3,chr1,685686,686682,ccds_id=CCDS41221.1;ensembl_gene_id=ENSG000002...,OR4F16
4,chr1,924401,924977,ensembl_gene_id=ENSG00000187634;gene_symbol=SA...,SAMD11
...,...,...,...,...,...
208906,chrY,25038781,25038941,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,BPY2C
208907,chrY,25041746,25041906,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,BPY2C
208908,chrY,25043888,25044048,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,BPY2C
208909,chrY,25622413,25624093,"ccds_id=CCDS14801.1,CCDS14802.1;ensembl_gene_i...",CDY1


In [16]:
# Value to check
value = 11259021

# Filter rows where the value lies between start and end positions
filtered_df = df[(df['Extended_Start_pos'] <= value) & (df['Extended_End_pos'] >= value)]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,chromosome,Extended_Start_pos,Extended_End_pos,INFO,Gene
37892,chr3,11259007,11260529,ccds_id=CCDS2604.1;ensembl_gene_id=ENSG0000019...,HRH1


In [2]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_4/KHAIGHGPTTL190/POS_1.xlsx')
df

Unnamed: 0,POS
0,"17-7676154-G-C, 17-7676154-G-T, 17-7676154-G-A,"
1,"17-7674953-T-C, 17-7670654-T-C,"
2,"17-7674921-C-A, 17-7670703-C-A, 17-7670682-C-A..."
3,"17-7674917-T-C, 17-7673323-T-C, 17-7676011-T-C..."
4,"17-7674894-G-A, 17-7674209-G-A, 17-7673300-G-A..."
...,...
419,17-7578205-A-AT
420,17-7578397-G-GGG
421,17-7577070-A-AGA
422,17-7577095-A-ATCTC


In [3]:
df['POS_new'] = df['POS'].str.split(', ')
df = df.explode('POS_new')
df

Unnamed: 0,POS,POS_new
0,"17-7676154-G-C, 17-7676154-G-T, 17-7676154-G-A,",17-7676154-G-C
0,"17-7676154-G-C, 17-7676154-G-T, 17-7676154-G-A,",17-7676154-G-T
0,"17-7676154-G-C, 17-7676154-G-T, 17-7676154-G-A,",17-7676154-G-A
0,"17-7676154-G-C, 17-7676154-G-T, 17-7676154-G-A,",
1,"17-7674953-T-C, 17-7670654-T-C,",17-7674953-T-C
...,...,...
419,17-7578205-A-AT,17-7578205-A-AT
420,17-7578397-G-GGG,17-7578397-G-GGG
421,17-7577070-A-AGA,17-7577070-A-AGA
422,17-7577095-A-ATCTC,17-7577095-A-ATCTC


In [4]:
df.POS_new.dropna()
df

Unnamed: 0,POS,POS_new
0,"17-7676154-G-C, 17-7676154-G-T, 17-7676154-G-A,",17-7676154-G-C
0,"17-7676154-G-C, 17-7676154-G-T, 17-7676154-G-A,",17-7676154-G-T
0,"17-7676154-G-C, 17-7676154-G-T, 17-7676154-G-A,",17-7676154-G-A
0,"17-7676154-G-C, 17-7676154-G-T, 17-7676154-G-A,",
1,"17-7674953-T-C, 17-7670654-T-C,",17-7674953-T-C
...,...,...
419,17-7578205-A-AT,17-7578205-A-AT
420,17-7578397-G-GGG,17-7578397-G-GGG
421,17-7577070-A-AGA,17-7577070-A-AGA
422,17-7577095-A-ATCTC,17-7577095-A-ATCTC


In [5]:
df.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/scc_pos.xlsx', index=False)

In [4]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/Cardiac_pos.xlsx')
df

Unnamed: 0,POS_new
0,9-104858586-C-T
1,9-104858586-C-G
2,9-104826974-C-T
3,9-104826974-C-G
4,9-104824472-T-C
...,...
2152,18-31595256-GTG-ATA
2153,18-31598691-GTG-ATT
2154,18-31595256-GTG-ATT
2155,18-31598691-GTG-ATC


In [5]:
df['POS'] = df['POS_new'].str.split('-').str[1]
df

Unnamed: 0,POS_new,POS
0,9-104858586-C-T,104858586
1,9-104858586-C-G,104858586
2,9-104826974-C-T,104826974
3,9-104826974-C-G,104826974
4,9-104824472-T-C,104824472
...,...,...
2152,18-31595256-GTG-ATA,31595256
2153,18-31598691-GTG-ATT,31598691
2154,18-31595256-GTG-ATT,31595256
2155,18-31598691-GTG-ATC,31598691


In [6]:
df.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/Cardiac_pos.xlsx', index=False)

In [2]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_4/KHHSPTGPCSP6/KHHSPTGPCSP6_annotated_indel.vcf', comment = '#', sep="\t", low_memory=False)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,943686,rs539654690,T,TC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9994,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:5:6:54.55%:6.192E-3:32:46:2:3:4:2
1,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=15;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5208,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:40:15:15:5:10:66.67%:9.995E-5:44:22:5:0:10:0
2,chr1,1286108,rs555847974,CAA,C,.,PASS,"ADP=60;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9978,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:93:60:60:34:26:43.33%:4.5813E-10:48:52:21:...
3,chr1,1341593,rs145370195,G,GACAC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.121,.,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:4:6:54.55%:5.418E-3:43:55:4:0:6:0
4,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=37;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:75:37:37:17:20:54.05%:2.7004E-8:54:49:9:8:...
...,...,...,...,...,...,...,...,...,...,...
5408,chrY,21311242,rs202147921,A,AT,.,PASS,ADP=16;WT=0;HET=0;HOM=1;NC=0;ASP;RS=202147921;...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:58:16:16:1:12:75%:1.3461E-6:74:60:1:0:9:3
5409,chrY,21431502,rs113251312,G,GA,.,PASS,ADP=10;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=PROR...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:34:10:10:2:8:80%:3.5723E-4:61:52:2:0:8:0
5410,chrY,23188123,.,G,GA,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:24:18:18:9:7:38.89%:3.3988E-3:32:35:9:0:7:0
5411,chrY,56855218,rs148562705;rs869025766,TATA,T,.,PASS,"ADP=12;WT=0;HET=0;HOM=1;NC=0;ASP;RS=148562705,...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:37:12:12:3:9:75%:1.6826E-4:41:31:3:0:8:1


In [3]:
sample_cols = vcf['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']

# Assign the values to the newly created columns
vcf = pd.concat([vcf, sample_cols], axis=1)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR
0,chr1,943686,rs539654690,T,TC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9994,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:5:6:54.55%:6.192E-3:32:46:2:3:4:2,0/1,22,11,11,5,6,54.55%,6.192E-3,32,46,2,3,4,2
1,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=15;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5208,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:40:15:15:5:10:66.67%:9.995E-5:44:22:5:0:10:0,0/1,40,15,15,5,10,66.67%,9.995E-5,44,22,5,0,10,0
2,chr1,1286108,rs555847974,CAA,C,.,PASS,"ADP=60;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9978,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:93:60:60:34:26:43.33%:4.5813E-10:48:52:21:...,0/1,93,60,60,34,26,43.33%,4.5813E-10,48,52,21,13,17,9
3,chr1,1341593,rs145370195,G,GACAC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.121,.,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:4:6:54.55%:5.418E-3:43:55:4:0:6:0,0/1,22,11,11,4,6,54.55%,5.418E-3,43,55,4,0,6,0
4,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=37;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:75:37:37:17:20:54.05%:2.7004E-8:54:49:9:8:...,0/1,75,37,37,17,20,54.05%,2.7004E-8,54,49,9,8,16,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5408,chrY,21311242,rs202147921,A,AT,.,PASS,ADP=16;WT=0;HET=0;HOM=1;NC=0;ASP;RS=202147921;...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:58:16:16:1:12:75%:1.3461E-6:74:60:1:0:9:3,1/1,58,16,16,1,12,75%,1.3461E-6,74,60,1,0,9,3
5409,chrY,21431502,rs113251312,G,GA,.,PASS,ADP=10;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=PROR...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:34:10:10:2:8:80%:3.5723E-4:61:52:2:0:8:0,1/1,34,10,10,2,8,80%,3.5723E-4,61,52,2,0,8,0
5410,chrY,23188123,.,G,GA,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:24:18:18:9:7:38.89%:3.3988E-3:32:35:9:0:7:0,0/1,24,18,18,9,7,38.89%,3.3988E-3,32,35,9,0,7,0
5411,chrY,56855218,rs148562705;rs869025766,TATA,T,.,PASS,"ADP=12;WT=0;HET=0;HOM=1;NC=0;ASP;RS=148562705,...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:37:12:12:3:9:75%:1.6826E-4:41:31:3:0:8:1,1/1,37,12,12,3,9,75%,1.6826E-4,41,31,3,0,8,1


In [5]:
vcf['HET'] = vcf['INFO'].str.extract(r'HET=(\d)')
vcf['HOM'] = vcf['INFO'].str.extract(r'HOM=(\d)')

# Create a new column 'Zygosity' based on conditions
vcf['Zygosity'] = ''

vcf.loc[vcf['HOM'] == '1', 'Zygosity'] = 'Homozygous'
vcf.loc[vcf['HET'] == '1', 'Zygosity'] = 'Heterozygous'
vcf['GT'] = vcf['GT'].astype(str)

vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR,HET,HOM,Zygosity
0,chr1,943686,rs539654690,T,TC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9994,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:5:6:54.55%:6.192E-3:32:46:2:3:4:2,0/1,22,11,11,5,6,54.55%,6.192E-3,32,46,2,3,4,2,1,0,Heterozygous
1,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=15;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5208,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:40:15:15:5:10:66.67%:9.995E-5:44:22:5:0:10:0,0/1,40,15,15,5,10,66.67%,9.995E-5,44,22,5,0,10,0,1,0,Heterozygous
2,chr1,1286108,rs555847974,CAA,C,.,PASS,"ADP=60;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9978,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:93:60:60:34:26:43.33%:4.5813E-10:48:52:21:...,0/1,93,60,60,34,26,43.33%,4.5813E-10,48,52,21,13,17,9,1,0,Heterozygous
3,chr1,1341593,rs145370195,G,GACAC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.121,.,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:4:6:54.55%:5.418E-3:43:55:4:0:6:0,0/1,22,11,11,4,6,54.55%,5.418E-3,43,55,4,0,6,0,1,0,Heterozygous
4,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=37;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:75:37:37:17:20:54.05%:2.7004E-8:54:49:9:8:...,0/1,75,37,37,17,20,54.05%,2.7004E-8,54,49,9,8,16,4,1,0,Heterozygous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5408,chrY,21311242,rs202147921,A,AT,.,PASS,ADP=16;WT=0;HET=0;HOM=1;NC=0;ASP;RS=202147921;...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:58:16:16:1:12:75%:1.3461E-6:74:60:1:0:9:3,1/1,58,16,16,1,12,75%,1.3461E-6,74,60,1,0,9,3,0,1,Homozygous
5409,chrY,21431502,rs113251312,G,GA,.,PASS,ADP=10;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=PROR...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:34:10:10:2:8:80%:3.5723E-4:61:52:2:0:8:0,1/1,34,10,10,2,8,80%,3.5723E-4,61,52,2,0,8,0,0,1,Homozygous
5410,chrY,23188123,.,G,GA,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:24:18:18:9:7:38.89%:3.3988E-3:32:35:9:0:7:0,0/1,24,18,18,9,7,38.89%,3.3988E-3,32,35,9,0,7,0,1,0,Heterozygous
5411,chrY,56855218,rs148562705;rs869025766,TATA,T,.,PASS,"ADP=12;WT=0;HET=0;HOM=1;NC=0;ASP;RS=148562705,...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:37:12:12:3:9:75%:1.6826E-4:41:31:3:0:8:1,1/1,37,12,12,3,9,75%,1.6826E-4,41,31,3,0,8,1,0,1,Homozygous


In [14]:
vcf.INFO.iloc[900]

'ADP=39;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.769,0.231,.,.;COMMON=1;G5;G5A;GENEINFO=HEPACAM:220296|HEPN1:641654|LOC107984406:107984406;INT;KGPhase3;LSD;NOV;PM;RS=560243987;RSPOS=124919336;SAO=0;SSR=0;TOPMED=0.76814156472986748,0.23183454383282364,0.00001592762487257,0.00000796381243628;U3;U5;VC=DIV;VLD;VP=0x050060c8000517002e100204;WGT=1;dbSNPBuildID=136'

In [15]:
vcf["Gene_Name"] = vcf["INFO"].str.extract('GENEINFO=(?P<GENEINFO>.+?);')
vcf['Gene Name'] = vcf['Gene_Name'].apply(lambda x: ','.join([segment.split(':')[0] for segment in x.split('|')]) if pd.notnull(x) else '')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name
0,chr1,943686,rs539654690,T,TC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9994,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:5:6:54.55%:6.192E-3:32:46:2:3:4:2,0/1,22,11,11,5,6,54.55%,6.192E-3,32,46,2,3,4,2,1,0,Heterozygous,SAMD11:148398,SAMD11
1,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=15;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5208,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:40:15:15:5:10:66.67%:9.995E-5:44:22:5:0:10:0,0/1,40,15,15,5,10,66.67%,9.995E-5,44,22,5,0,10,0,1,0,Heterozygous,AGRN:375790,AGRN
2,chr1,1286108,rs555847974,CAA,C,.,PASS,"ADP=60;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9978,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:93:60:60:34:26:43.33%:4.5813E-10:48:52:21:...,0/1,93,60,60,34,26,43.33%,4.5813E-10,48,52,21,13,17,9,1,0,Heterozygous,SCNN1D:6339,SCNN1D
3,chr1,1341593,rs145370195,G,GACAC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.121,.,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:4:6:54.55%:5.418E-3:43:55:4:0:6:0,0/1,22,11,11,4,6,54.55%,5.418E-3,43,55,4,0,6,0,1,0,Heterozygous,MIR6808:102466740|DVL1:1855,"MIR6808,DVL1"
4,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=37;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:75:37:37:17:20:54.05%:2.7004E-8:54:49:9:8:...,0/1,75,37,37,17,20,54.05%,2.7004E-8,54,49,9,8,16,4,1,0,Heterozygous,MXRA8:54587,MXRA8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5408,chrY,21311242,rs202147921,A,AT,.,PASS,ADP=16;WT=0;HET=0;HOM=1;NC=0;ASP;RS=202147921;...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:58:16:16:1:12:75%:1.3461E-6:74:60:1:0:9:3,1/1,58,16,16,1,12,75%,1.3461E-6,74,60,1,0,9,3,0,1,Homozygous,,
5409,chrY,21431502,rs113251312,G,GA,.,PASS,ADP=10;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=PROR...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:34:10:10:2:8:80%:3.5723E-4:61:52:2:0:8:0,1/1,34,10,10,2,8,80%,3.5723E-4,61,52,2,0,8,0,0,1,Homozygous,PRORY:100533178,PRORY
5410,chrY,23188123,.,G,GA,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:24:18:18:9:7:38.89%:3.3988E-3:32:35:9:0:7:0,0/1,24,18,18,9,7,38.89%,3.3988E-3,32,35,9,0,7,0,1,0,Heterozygous,,
5411,chrY,56855218,rs148562705;rs869025766,TATA,T,.,PASS,"ADP=12;WT=0;HET=0;HOM=1;NC=0;ASP;RS=148562705,...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:37:12:12:3:9:75%:1.6826E-4:41:31:3:0:8:1,1/1,37,12,12,3,9,75%,1.6826E-4,41,31,3,0,8,1,0,1,Homozygous,,


In [16]:
vcf['CSQ'] = vcf['INFO'].str.extract(r'CSQ=(.*)')

vcf['csq'] = vcf['CSQ'].str.split(',')
vcf = vcf.explode('csq')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq
0,chr1,943686,rs539654690,T,TC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9994,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:5:6:54.55%:6.192E-3:32:46:2:3:4:2,0/1,22,11,11,5,6,54.55%,6.192E-3,32,46,2,3,4,2,1,0,Heterozygous,SAMD11:148398,SAMD11,,
1,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=15;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5208,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:40:15:15:5:10:66.67%:9.995E-5:44:22:5:0:10:0,0/1,40,15,15,5,10,66.67%,9.995E-5,44,22,5,0,10,0,1,0,Heterozygous,AGRN:375790,AGRN,,
2,chr1,1286108,rs555847974,CAA,C,.,PASS,"ADP=60;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9978,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:93:60:60:34:26:43.33%:4.5813E-10:48:52:21:...,0/1,93,60,60,34,26,43.33%,4.5813E-10,48,52,21,13,17,9,1,0,Heterozygous,SCNN1D:6339,SCNN1D,,
3,chr1,1341593,rs145370195,G,GACAC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.121,.,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:4:6:54.55%:5.418E-3:43:55:4:0:6:0,0/1,22,11,11,4,6,54.55%,5.418E-3,43,55,4,0,6,0,1,0,Heterozygous,MIR6808:102466740|DVL1:1855,"MIR6808,DVL1",,
4,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=37;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:75:37:37:17:20:54.05%:2.7004E-8:54:49:9:8:...,0/1,75,37,37,17,20,54.05%,2.7004E-8,54,49,9,8,16,4,1,0,Heterozygous,MXRA8:54587,MXRA8,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5408,chrY,21311242,rs202147921,A,AT,.,PASS,ADP=16;WT=0;HET=0;HOM=1;NC=0;ASP;RS=202147921;...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:58:16:16:1:12:75%:1.3461E-6:74:60:1:0:9:3,1/1,58,16,16,1,12,75%,1.3461E-6,74,60,1,0,9,3,0,1,Homozygous,,,,
5409,chrY,21431502,rs113251312,G,GA,.,PASS,ADP=10;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=PROR...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:34:10:10:2:8:80%:3.5723E-4:61:52:2:0:8:0,1/1,34,10,10,2,8,80%,3.5723E-4,61,52,2,0,8,0,0,1,Homozygous,PRORY:100533178,PRORY,,
5410,chrY,23188123,.,G,GA,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:24:18:18:9:7:38.89%:3.3988E-3:32:35:9:0:7:0,0/1,24,18,18,9,7,38.89%,3.3988E-3,32,35,9,0,7,0,1,0,Heterozygous,,,,
5411,chrY,56855218,rs148562705;rs869025766,TATA,T,.,PASS,"ADP=12;WT=0;HET=0;HOM=1;NC=0;ASP;RS=148562705,...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:37:12:12:3:9:75%:1.6826E-4:41:31:3:0:8:1,1/1,37,12,12,3,9,75%,1.6826E-4,41,31,3,0,8,1,0,1,Homozygous,,,,


In [17]:
vcf['SYMBOL / Gene Name'] = vcf['csq'].str.split('|').str[3]
vcf['ClinVar_CLNDN'] = vcf['csq'].str.split('|').str[82]
vcf['CLIN_SIG'] = vcf['csq'].str.split('|').str[70]
vcf['ClinVar_CLNREVSTAT'] = vcf['csq'].str.split('|').str[81]
vcf['ClinVar'] = vcf['csq'].str.split('|').str[79]
vcf['HGVSC'] = vcf['csq'].str.split('|').str[10]
vcf['HGVSP'] = vcf['csq'].str.split('|').str[11]
vcf['PolyPhen'] = vcf['csq'].str.split('|').str[38]
vcf['BIOTYPE'] = vcf['csq'].str.split('|').str[7]
vcf['EXON'] = vcf['csq'].str.split('|').str[8]
vcf['INTRON'] = vcf['csq'].str.split('|').str[9]
vcf['Protein_position'] = vcf['csq'].str.split('|').str[14]
vcf['Amino_acids'] = vcf['csq'].str.split('|').str[15]
vcf['Codons'] = vcf['csq'].str.split('|').str[16]
vcf['STRAND'] = vcf['csq'].str.split('|').str[19]
vcf['PUBMED'] = vcf['csq'].str.split('|').str[73]
vcf['Consequence'] = vcf['csq'].str.split('|').str[1]
vcf['IMPACT'] = vcf['csq'].str.split('|').str[2]
vcf['SIFT'] = vcf['csq'].str.split('|').str[37]
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq,SYMBOL / Gene Name,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT
0,chr1,943686,rs539654690,T,TC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9994,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:5:6:54.55%:6.192E-3:32:46:2:3:4:2,0/1,22,11,11,5,6,54.55%,6.192E-3,32,46,2,3,4,2,1,0,Heterozygous,SAMD11:148398,SAMD11,,,,,,,,,,,,,,,,,,,,,
1,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=15;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5208,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:40:15:15:5:10:66.67%:9.995E-5:44:22:5:0:10:0,0/1,40,15,15,5,10,66.67%,9.995E-5,44,22,5,0,10,0,1,0,Heterozygous,AGRN:375790,AGRN,,,,,,,,,,,,,,,,,,,,,
2,chr1,1286108,rs555847974,CAA,C,.,PASS,"ADP=60;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9978,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:93:60:60:34:26:43.33%:4.5813E-10:48:52:21:...,0/1,93,60,60,34,26,43.33%,4.5813E-10,48,52,21,13,17,9,1,0,Heterozygous,SCNN1D:6339,SCNN1D,,,,,,,,,,,,,,,,,,,,,
3,chr1,1341593,rs145370195,G,GACAC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.121,.,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:4:6:54.55%:5.418E-3:43:55:4:0:6:0,0/1,22,11,11,4,6,54.55%,5.418E-3,43,55,4,0,6,0,1,0,Heterozygous,MIR6808:102466740|DVL1:1855,"MIR6808,DVL1",,,,,,,,,,,,,,,,,,,,,
4,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=37;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:75:37:37:17:20:54.05%:2.7004E-8:54:49:9:8:...,0/1,75,37,37,17,20,54.05%,2.7004E-8,54,49,9,8,16,4,1,0,Heterozygous,MXRA8:54587,MXRA8,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5408,chrY,21311242,rs202147921,A,AT,.,PASS,ADP=16;WT=0;HET=0;HOM=1;NC=0;ASP;RS=202147921;...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:58:16:16:1:12:75%:1.3461E-6:74:60:1:0:9:3,1/1,58,16,16,1,12,75%,1.3461E-6,74,60,1,0,9,3,0,1,Homozygous,,,,,,,,,,,,,,,,,,,,,,,
5409,chrY,21431502,rs113251312,G,GA,.,PASS,ADP=10;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=PROR...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:34:10:10:2:8:80%:3.5723E-4:61:52:2:0:8:0,1/1,34,10,10,2,8,80%,3.5723E-4,61,52,2,0,8,0,0,1,Homozygous,PRORY:100533178,PRORY,,,,,,,,,,,,,,,,,,,,,
5410,chrY,23188123,.,G,GA,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:24:18:18:9:7:38.89%:3.3988E-3:32:35:9:0:7:0,0/1,24,18,18,9,7,38.89%,3.3988E-3,32,35,9,0,7,0,1,0,Heterozygous,,,,,,,,,,,,,,,,,,,,,,,
5411,chrY,56855218,rs148562705;rs869025766,TATA,T,.,PASS,"ADP=12;WT=0;HET=0;HOM=1;NC=0;ASP;RS=148562705,...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:37:12:12:3:9:75%:1.6826E-4:41:31:3:0:8:1,1/1,37,12,12,3,9,75%,1.6826E-4,41,31,3,0,8,1,0,1,Homozygous,,,,,,,,,,,,,,,,,,,,,,,


In [23]:
vcf.fillna('', inplace=True)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq,SYMBOL / Gene Name,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,Protein Position and Amino Acid
0,chr1,943686,rs539654690,T,TC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9994,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:5:6:54.55%:6.192E-3:32:46:2:3:4:2,0/1,22,11,11,5,6,54.55%,6.192E-3,32,46,2,3,4,2,1,0,Heterozygous,SAMD11:148398,SAMD11,,,,,,,,,,,,,,,,,,,,,,
1,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=15;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5208,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:40:15:15:5:10:66.67%:9.995E-5:44:22:5:0:10:0,0/1,40,15,15,5,10,66.67%,9.995E-5,44,22,5,0,10,0,1,0,Heterozygous,AGRN:375790,AGRN,,,,,,,,,,,,,,,,,,,,,,
2,chr1,1286108,rs555847974,CAA,C,.,PASS,"ADP=60;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9978,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:93:60:60:34:26:43.33%:4.5813E-10:48:52:21:...,0/1,93,60,60,34,26,43.33%,4.5813E-10,48,52,21,13,17,9,1,0,Heterozygous,SCNN1D:6339,SCNN1D,,,,,,,,,,,,,,,,,,,,,,
3,chr1,1341593,rs145370195,G,GACAC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.121,.,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:4:6:54.55%:5.418E-3:43:55:4:0:6:0,0/1,22,11,11,4,6,54.55%,5.418E-3,43,55,4,0,6,0,1,0,Heterozygous,MIR6808:102466740|DVL1:1855,"MIR6808,DVL1",,,,,,,,,,,,,,,,,,,,,,
4,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=37;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:75:37:37:17:20:54.05%:2.7004E-8:54:49:9:8:...,0/1,75,37,37,17,20,54.05%,2.7004E-8,54,49,9,8,16,4,1,0,Heterozygous,MXRA8:54587,MXRA8,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5408,chrY,21311242,rs202147921,A,AT,.,PASS,ADP=16;WT=0;HET=0;HOM=1;NC=0;ASP;RS=202147921;...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:58:16:16:1:12:75%:1.3461E-6:74:60:1:0:9:3,1/1,58,16,16,1,12,75%,1.3461E-6,74,60,1,0,9,3,0,1,Homozygous,,,,,,,,,,,,,,,,,,,,,,,,
5409,chrY,21431502,rs113251312,G,GA,.,PASS,ADP=10;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=PROR...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:34:10:10:2:8:80%:3.5723E-4:61:52:2:0:8:0,1/1,34,10,10,2,8,80%,3.5723E-4,61,52,2,0,8,0,0,1,Homozygous,PRORY:100533178,PRORY,,,,,,,,,,,,,,,,,,,,,,
5410,chrY,23188123,.,G,GA,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:24:18:18:9:7:38.89%:3.3988E-3:32:35:9:0:7:0,0/1,24,18,18,9,7,38.89%,3.3988E-3,32,35,9,0,7,0,1,0,Heterozygous,,,,,,,,,,,,,,,,,,,,,,,,
5411,chrY,56855218,rs148562705;rs869025766,TATA,T,.,PASS,"ADP=12;WT=0;HET=0;HOM=1;NC=0;ASP;RS=148562705,...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:37:12:12:3:9:75%:1.6826E-4:41:31:3:0:8:1,1/1,37,12,12,3,9,75%,1.6826E-4,41,31,3,0,8,1,0,1,Homozygous,,,,,,,,,,,,,,,,,,,,,,,,


In [22]:
vcf['Protein Position and Amino Acid'] = vcf['Amino_acids'].str[0] + vcf['Protein_position'] + np.where(vcf['Amino_acids'].str[-1] == vcf['Amino_acids'].str[0], '', vcf['Amino_acids'].str[-1])
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq,SYMBOL / Gene Name,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,Protein Position and Amino Acid
0,chr1,943686,rs539654690,T,TC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9994,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:5:6:54.55%:6.192E-3:32:46:2:3:4:2,0/1,22,11,11,5,6,54.55%,6.192E-3,32,46,2,3,4,2,1,0,Heterozygous,SAMD11:148398,SAMD11,,,,,,,,,,,,,,,,,,,,,,
1,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=15;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5208,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:40:15:15:5:10:66.67%:9.995E-5:44:22:5:0:10:0,0/1,40,15,15,5,10,66.67%,9.995E-5,44,22,5,0,10,0,1,0,Heterozygous,AGRN:375790,AGRN,,,,,,,,,,,,,,,,,,,,,,
2,chr1,1286108,rs555847974,CAA,C,.,PASS,"ADP=60;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9978,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:93:60:60:34:26:43.33%:4.5813E-10:48:52:21:...,0/1,93,60,60,34,26,43.33%,4.5813E-10,48,52,21,13,17,9,1,0,Heterozygous,SCNN1D:6339,SCNN1D,,,,,,,,,,,,,,,,,,,,,,
3,chr1,1341593,rs145370195,G,GACAC,.,PASS,"ADP=11;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.121,.,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:11:11:4:6:54.55%:5.418E-3:43:55:4:0:6:0,0/1,22,11,11,4,6,54.55%,5.418E-3,43,55,4,0,6,0,1,0,Heterozygous,MIR6808:102466740|DVL1:1855,"MIR6808,DVL1",,,,,,,,,,,,,,,,,,,,,,
4,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=37;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:75:37:37:17:20:54.05%:2.7004E-8:54:49:9:8:...,0/1,75,37,37,17,20,54.05%,2.7004E-8,54,49,9,8,16,4,1,0,Heterozygous,MXRA8:54587,MXRA8,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5408,chrY,21311242,rs202147921,A,AT,.,PASS,ADP=16;WT=0;HET=0;HOM=1;NC=0;ASP;RS=202147921;...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:58:16:16:1:12:75%:1.3461E-6:74:60:1:0:9:3,1/1,58,16,16,1,12,75%,1.3461E-6,74,60,1,0,9,3,0,1,Homozygous,,,,,,,,,,,,,,,,,,,,,,,,
5409,chrY,21431502,rs113251312,G,GA,.,PASS,ADP=10;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=PROR...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:34:10:10:2:8:80%:3.5723E-4:61:52:2:0:8:0,1/1,34,10,10,2,8,80%,3.5723E-4,61,52,2,0,8,0,0,1,Homozygous,PRORY:100533178,PRORY,,,,,,,,,,,,,,,,,,,,,,
5410,chrY,23188123,.,G,GA,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:24:18:18:9:7:38.89%:3.3988E-3:32:35:9:0:7:0,0/1,24,18,18,9,7,38.89%,3.3988E-3,32,35,9,0,7,0,1,0,Heterozygous,,,,,,,,,,,,,,,,,,,,,,,,
5411,chrY,56855218,rs148562705;rs869025766,TATA,T,.,PASS,"ADP=12;WT=0;HET=0;HOM=1;NC=0;ASP;RS=148562705,...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:37:12:12:3:9:75%:1.6826E-4:41:31:3:0:8:1,1/1,37,12,12,3,9,75%,1.6826E-4,41,31,3,0,8,1,0,1,Homozygous,,,,,,,,,,,,,,,,,,,,,,,,


In [4]:
import pandas as pd

# Create a DataFrame with the given rows
data = {
    'Row': ['rs539769126, CM110656, COSV54627430, -',
            'rs146496725, CM110660, -, COSV99074759',
            '-, rs1040877016, CM110671']
}
df = pd.DataFrame(data)
df

Unnamed: 0,Row
0,"rs539769126, CM110656, COSV54627430, -"
1,"rs146496725, CM110660, -, COSV99074759"
2,"-, rs1040877016, CM110671"


In [5]:

# Drop the words that start with "C"
df['Row'] = df['Row'].str.replace(r'\bC\w*,?\s*', '', regex=True)

# Print the updated DataFrame
df

Unnamed: 0,Row
0,rs539769126-
1,"rs146496725-,"
2,"-, rs1040877016,"


In [7]:
import pandas as pd

# Create a DataFrame with the column containing the values
data = {'Value': ['rs146496725, ', 'rs123456789, ', 'rs987654321, ']}
df = pd.DataFrame(data)

# Remove the comma at the end of the values in the 'ID' column
df['Value'] = df['Value'].str.replace(r',\s*$', '', regex=True)

# Print the updated DataFrame
df

Unnamed: 0,Value
0,rs146496725
1,rs123456789
2,rs987654321


In [8]:
import pandas as pd

# Create a sample DataFrame
data = {'ID': [1, 2, 3],
        'Value': [', rs146496725', ', rs789456123', ', rs987654321']}
df = pd.DataFrame(data)

# Remove the comma and space before "rsid" in the 'Value' column
df['Value'] = df['Value'].str.replace(r',\s*(rsid)', r'\1', regex=True)

# Display the updated DataFrame
df

Unnamed: 0,ID,Value
0,1,", rs146496725"
1,2,", rs789456123"
2,3,", rs987654321"


In [9]:
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/HLH_Lit_variants (2).xlsx')
data

Unnamed: 0,Gene,Nucleotide change,Amino acid change
0,PRF1,c.272C>T,Ala91Val
1,PRF1,c.386G > C,p.Trp129Ser
2,PRF1,c.50DelT,p.Leu17fsTer
3,PRF1,c.190C>T,p.Gln64Ter
4,PRF1,c.148G>A,p.Val50Met
...,...,...,...
73,STXBP2,c.795-4C>T,
74,RHOG,,
75,CDC42,,p.R186C
76,CDC42,,p.C188Y


In [None]:
data['Nucleotide change'].fillna