In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [2]:
def read_bed_file(bed_file):
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 3:
                chrom = fields[0]
                try:
                    start = int(fields[1])
                    end = int(fields[2])
                except ValueError:
                    continue  # Skip this line if start or end position is not an integer
                for pos in range(start, end + 1):
                    bed_positions.add((chrom, pos))
    return bed_positions

def normalize_chrom_name(chrom):
    return chrom.split('_')[0]

def filter_vcf_file(vcf_file, bed_positions):
    filtered_vcf_records = []
    with open(vcf_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Preserve header lines in the output
                filtered_vcf_records.append(line)
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 2:
                raw_chrom = fields[0]
                chrom = normalize_chrom_name(raw_chrom)
                try:
                    pos = int(fields[1])
                except ValueError:
                    continue  # Skip this line if 'POS' is not an integer
                if (chrom, pos) in bed_positions:
                    filtered_vcf_records.append(line)
    return filtered_vcf_records

def write_filtered_vcf(filtered_vcf_records, output_file):
    with open(output_file, 'w') as f:
        for record in filtered_vcf_records:
            f.write(record)

def main():
    bed_file = r'C:/Users/GenepoweRx_Madhu/Desktop/Covered_regions.bed'
    vcf_file = r'C:/Users/GenepoweRx_Madhu/Downloads/KHCDPRGPTTL6_annotated_indel.vcf'
    output_file = r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHCDPRGPTTL6_annotated_indel.vcf'

    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)
    write_filtered_vcf(filtered_vcf_records, output_file)

if __name__ == "__main__":
    main()


In [85]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Desktop/output_filtered_recently.vcf', comment='#', sep="\t", header=None, low_memory=False)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:127:138:138:100:38:27.54%:1.8855E-13:60:45...
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=114;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:114:114:0:114:100%:4.392E-68:0:59:0:0:...
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:56:64:64:47:17:26.56%:2.243E-6:55:58:33:14...
3,chr1,686266,.,A,C,.,PASS,ADP=79;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:85:79:79:54:25:31.65%:3.1101E-9:55:54:43:1...
4,chr1,924533,rs112703963,A,G,.,PASS,"ADP=11;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:58:11:11:0:11:100%:1.4176E-6:0:63:0:0:10:1
...,...,...,...,...,...,...,...,...,...,...
36647,chrY,9528488,rs1179775265,T,G,.,PASS,ADP=186;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:130:186:186:146:40:21.51%:8.6616E-14:52:49...
36648,chrY,13360045,rs2032674,T,C,.,PASS,"ADP=54;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.8094,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:54:54:0:54:100%:4.0229E-32:0:55:0:0:35:19
36649,chrY,14941891,rs17269816,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.91,0.09...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:182:32:32:0:32:100%:5.4567E-19:0:62:0:0:28:4
36650,chrY,15174113,rs17307398,T,C,.,PASS,"ADP=29;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.266,0.7...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:164:29:29:0:29:100%:3.3259E-17:0:58:0:0:24:5


In [86]:
sample_cols = vcf['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']

# Assign the values to the newly created columns
vcf = pd.concat([vcf, sample_cols], axis=1)
vcf = vcf[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL','RDF', 'RDR', 'ADF', 'ADR']]
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=114;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2
3,chr1,686266,.,A,C,.,PASS,ADP=79;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,85,79,79,54,25,31.65%,3.1101E-9,43,11,23,2
4,chr1,924533,rs112703963,A,G,.,PASS,"ADP=11;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",1/1,58,11,11,0,11,100%,1.4176E-6,0,0,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36647,chrY,9528488,rs1179775265,T,G,.,PASS,ADP=186;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,130,186,186,146,40,21.51%,8.6616E-14,77,69,17,23
36648,chrY,13360045,rs2032674,T,C,.,PASS,"ADP=54;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.8094,0....",1/1,255,54,54,0,54,100%,4.0229E-32,0,0,35,19
36649,chrY,14941891,rs17269816,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.91,0.09...",1/1,182,32,32,0,32,100%,5.4567E-19,0,0,28,4
36650,chrY,15174113,rs17307398,T,C,.,PASS,"ADP=29;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.266,0.7...",1/1,164,29,29,0,29,100%,3.3259E-17,0,0,24,5


In [87]:
vcf['HET'] = vcf['INFO'].str.extract(r'HET=(\d)')
vcf['HOM'] = vcf['INFO'].str.extract(r'HOM=(\d)')

# Create a new column 'Zygosity' based on conditions
vcf['Zygosity'] = ''

vcf.loc[vcf['HOM'] == '1', 'Zygosity'] = 'Homozygous'
vcf.loc[vcf['HET'] == '1', 'Zygosity'] = 'Heterozygous'
vcf['GT'] = vcf['GT'].astype(str)

vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=114;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,0,1,Homozygous
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous
3,chr1,686266,.,A,C,.,PASS,ADP=79;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,85,79,79,54,25,31.65%,3.1101E-9,43,11,23,2,1,0,Heterozygous
4,chr1,924533,rs112703963,A,G,.,PASS,"ADP=11;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",1/1,58,11,11,0,11,100%,1.4176E-6,0,0,10,1,0,1,Homozygous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36647,chrY,9528488,rs1179775265,T,G,.,PASS,ADP=186;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,130,186,186,146,40,21.51%,8.6616E-14,77,69,17,23,1,0,Heterozygous
36648,chrY,13360045,rs2032674,T,C,.,PASS,"ADP=54;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.8094,0....",1/1,255,54,54,0,54,100%,4.0229E-32,0,0,35,19,0,1,Homozygous
36649,chrY,14941891,rs17269816,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.91,0.09...",1/1,182,32,32,0,32,100%,5.4567E-19,0,0,28,4,0,1,Homozygous
36650,chrY,15174113,rs17307398,T,C,.,PASS,"ADP=29;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.266,0.7...",1/1,164,29,29,0,29,100%,3.3259E-17,0,0,24,5,0,1,Homozygous


In [88]:
vcf["Gene_Name"] = vcf["INFO"].str.extract('GENEINFO=(?P<GENEINFO>.+?);')
vcf['Gene Name'] = vcf['Gene_Name'].apply(lambda x: ','.join([segment.split(':')[0] for segment in x.split('|')]) if pd.notnull(x) else '')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous,OR4F5:79501,OR4F5
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=114;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,0,1,Homozygous,OR4F5:79501,OR4F5
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous,,
3,chr1,686266,.,A,C,.,PASS,ADP=79;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,85,79,79,54,25,31.65%,3.1101E-9,43,11,23,2,1,0,Heterozygous,,
4,chr1,924533,rs112703963,A,G,.,PASS,"ADP=11;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",1/1,58,11,11,0,11,100%,1.4176E-6,0,0,10,1,0,1,Homozygous,LOC107985728:107985728|SAMD11:148398,"LOC107985728,SAMD11"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36647,chrY,9528488,rs1179775265,T,G,.,PASS,ADP=186;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,130,186,186,146,40,21.51%,8.6616E-14,77,69,17,23,1,0,Heterozygous,TSPY10:100289087,TSPY10
36648,chrY,13360045,rs2032674,T,C,.,PASS,"ADP=54;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.8094,0....",1/1,255,54,54,0,54,100%,4.0229E-32,0,0,35,19,0,1,Homozygous,UTY:7404,UTY
36649,chrY,14941891,rs17269816,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.91,0.09...",1/1,182,32,32,0,32,100%,5.4567E-19,0,0,28,4,0,1,Homozygous,,
36650,chrY,15174113,rs17307398,T,C,.,PASS,"ADP=29;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.266,0.7...",1/1,164,29,29,0,29,100%,3.3259E-17,0,0,24,5,0,1,Homozygous,,


In [89]:
vcf['CSQ'] = vcf['INFO'].str.extract(r'CSQ=(.*)')

vcf['csq'] = vcf['CSQ'].str.split(',')
vcf = vcf.explode('csq')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous,OR4F5:79501,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous,OR4F5:79501,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|regulatory_region_variant|MODIFIER|||Regulat...
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=114;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,0,1,Homozygous,OR4F5:79501,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous,,,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,C|missense_variant|MODERATE|OR4F29|ENSG0000028...
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous,,,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,C|intron_variant&non_coding_transcript_variant...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...


In [91]:
vcf.CHROM.value_counts()

chr1                       29303
chr19                      25115
chr17                      23572
chr11                      22962
chr2                       22900
chr16                      20186
chr3                       17731
chr7                       17335
chr12                      16069
chr9                       13364
chr5                       12909
chr6                       12758
chr8                       12545
chr15                      12444
chr10                      12436
chr14                      11737
chr4                       10735
chr22                       9572
chr20                       7483
chr18                       4874
chr13                       4688
chrX                        3826
chr21                       3676
chrY                          65
chr16_KI270728v1_random       11
chr19_GL949746v1_alt           6
chr19_KI270938v1_alt           6
chr16_KI270853v1_alt           4
chr12_KI270904v1_alt           4
chr12_GL877876v1_alt           3
chr6_GL000

In [11]:
#vcf['SYMBOL / Gene Name'] = vcf['csq'].str.split('|').str[3]
vcf['ClinVar_CLNDN'] = vcf['csq'].str.split('|').str[82]
vcf['CLIN_SIG'] = vcf['csq'].str.split('|').str[70]
vcf['ClinVar_CLNREVSTAT'] = vcf['csq'].str.split('|').str[81]
vcf['ClinVar'] = vcf['csq'].str.split('|').str[79]
vcf['HGVSC'] = vcf['csq'].str.split('|').str[10]
vcf['HGVSP'] = vcf['csq'].str.split('|').str[11]
vcf['PolyPhen'] = vcf['csq'].str.split('|').str[38]
vcf['BIOTYPE'] = vcf['csq'].str.split('|').str[7]
vcf['EXON'] = vcf['csq'].str.split('|').str[8]
vcf['INTRON'] = vcf['csq'].str.split('|').str[9]
vcf['Protein_position'] = vcf['csq'].str.split('|').str[14]
vcf['Amino_acids'] = vcf['csq'].str.split('|').str[15]
vcf['Codons'] = vcf['csq'].str.split('|').str[16]
vcf['STRAND'] = vcf['csq'].str.split('|').str[19]
vcf['PUBMED'] = vcf['csq'].str.split('|').str[73]
vcf['Consequence'] = vcf['csq'].str.split('|').str[1]
vcf['IMPACT'] = vcf['csq'].str.split('|').str[2]
vcf['SIFT'] = vcf['csq'].str.split('|').str[37]
vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,,,,,,,,transcribed_unprocessed_pseudogene,,,,,,1,,downstream_gene_variant,MODIFIER,
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|non_coding_transcript_exon_variant|MODIFIER|...,,,,,ENST00000456328.2:n.932C>T,,,lncRNA,3/3,,,,,1,,non_coding_transcript_exon_variant,MODIFIER,
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|downstream_gene_variant|MODIFIER|WASH7P|ENSG...,,,,,,,,unprocessed_pseudogene,,,,,,-1,,downstream_gene_variant,MODIFIER,
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|downstream_gene_variant|MODIFIER|MIR6859-1|E...,,,,,,,,miRNA,,,,,,-1,,downstream_gene_variant,MODIFIER,
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|regulatory_region_variant|MODIFIER|||Regulat...,,,,,,,,enhancer,,,,,,,,regulatory_region_variant,MODIFIER,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83759,chrY,57211949,.,T,C,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,1,0,Heterozygous,,,C|non_coding_transcript_exon_variant|MODIFIER|...,C|non_coding_transcript_exon_variant|MODIFIER|...,,,,,ENST00000484415.6:n.2746T>C,,,retained_intron,5/5,,,,,1,,non_coding_transcript_exon_variant,MODIFIER,
83759,chrY,57211949,.,T,C,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,1,0,Heterozygous,,,C|non_coding_transcript_exon_variant|MODIFIER|...,C|non_coding_transcript_exon_variant|MODIFIER|...,,,,,ENST00000492963.6:n.2950T>C,,,retained_intron,6/6,,,,,1,,non_coding_transcript_exon_variant,MODIFIER,
83759,chrY,57211949,.,T,C,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,1,0,Heterozygous,,,C|non_coding_transcript_exon_variant|MODIFIER|...,C|downstream_gene_variant|MODIFIER|WASH6P|ENSG...,,,,,,,,retained_intron,,,,,,1,,downstream_gene_variant,MODIFIER,
83759,chrY,57211949,.,T,C,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,1,0,Heterozygous,,,C|non_coding_transcript_exon_variant|MODIFIER|...,C|non_coding_transcript_exon_variant|MODIFIER|...,,,,,ENST00000496301.6:n.1404T>C,,,retained_intron,2/2,,,,,1,,non_coding_transcript_exon_variant,MODIFIER,


In [12]:
vcf['Protein Position and Amino Acid'] = vcf['Amino_acids'].str[0] + vcf['Protein_position'] + np.where(vcf['Amino_acids'].str[-1] == vcf['Amino_acids'].str[0], '', vcf['Amino_acids'].str[-1])
vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,Protein Position and Amino Acid
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,,,,,,,,transcribed_unprocessed_pseudogene,,,,,,1,,downstream_gene_variant,MODIFIER,,
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|non_coding_transcript_exon_variant|MODIFIER|...,,,,,ENST00000456328.2:n.932C>T,,,lncRNA,3/3,,,,,1,,non_coding_transcript_exon_variant,MODIFIER,,
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|downstream_gene_variant|MODIFIER|WASH7P|ENSG...,,,,,,,,unprocessed_pseudogene,,,,,,-1,,downstream_gene_variant,MODIFIER,,
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|downstream_gene_variant|MODIFIER|MIR6859-1|E...,,,,,,,,miRNA,,,,,,-1,,downstream_gene_variant,MODIFIER,,
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|regulatory_region_variant|MODIFIER|||Regulat...,,,,,,,,enhancer,,,,,,,,regulatory_region_variant,MODIFIER,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83759,chrY,57211949,.,T,C,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,1,0,Heterozygous,,,C|non_coding_transcript_exon_variant|MODIFIER|...,C|non_coding_transcript_exon_variant|MODIFIER|...,,,,,ENST00000484415.6:n.2746T>C,,,retained_intron,5/5,,,,,1,,non_coding_transcript_exon_variant,MODIFIER,,
83759,chrY,57211949,.,T,C,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,1,0,Heterozygous,,,C|non_coding_transcript_exon_variant|MODIFIER|...,C|non_coding_transcript_exon_variant|MODIFIER|...,,,,,ENST00000492963.6:n.2950T>C,,,retained_intron,6/6,,,,,1,,non_coding_transcript_exon_variant,MODIFIER,,
83759,chrY,57211949,.,T,C,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,1,0,Heterozygous,,,C|non_coding_transcript_exon_variant|MODIFIER|...,C|downstream_gene_variant|MODIFIER|WASH6P|ENSG...,,,,,,,,retained_intron,,,,,,1,,downstream_gene_variant,MODIFIER,,
83759,chrY,57211949,.,T,C,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,1,0,Heterozygous,,,C|non_coding_transcript_exon_variant|MODIFIER|...,C|non_coding_transcript_exon_variant|MODIFIER|...,,,,,ENST00000496301.6:n.1404T>C,,,retained_intron,2/2,,,,,1,,non_coding_transcript_exon_variant,MODIFIER,,


In [13]:
vcf.INFO.iloc[0]

'ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX11L1:100287102;GNO;RS=71260404;RSPOS=13684;RV;SAO=0;SLO;SSR=0;TOPMED=0.72495380988786952,0.27504619011213047;VC=SNV;VP=0x050100000005000102000100;WGT=1;dbSNPBuildID=130;CSQ=T|downstream_gene_variant|MODIFIER|DDX11L1|ENSG00000223972|Transcript|ENST00000450305|transcribed_unprocessed_pseudogene||||||||||rs71260404|14|1||SNV|HGNC|HGNC:37102|YES|||||||||||||||||||||||||||||||||0.1495|0.02914|0.2292|0.1784|0.2692|0.1122|0.1975|0.18|0.2406|0.149|0.1845|0.2692|gnomADg_ASJ|||||||||||||,T|non_coding_transcript_exon_variant|MODIFIER|DDX11L2|ENSG00000290825|Transcript|ENST00000456328|lncRNA|3/3||ENST00000456328.2:n.932C>T||932|||||rs71260404||1||SNV|EntrezGene||YES|||1||||||||||||||||||||||||||||||0.1495|0.02914|0.2292|0.1784|0.2692|0.1122|0.1975|0.18|0.2406|0.149|0.1845|0.2692|gnomADg_ASJ|||||||||||||,T|downstream_gene_variant|MODIFIER|WASH7P|ENSG00000227232|Transcript|ENST00000488147|unprocessed_pseudogene||||||||||rs71260404|720|-1||SNV|HGNC|HGNC:3

In [14]:
vcf[['HGVSc', 'HGVSc (Transcript)']] = vcf['HGVSC'].str.split(':', 1, expand=True)
vcf[['HGVSp', 'HGVSp (Transcript)']] = vcf['HGVSP'].str.split(':', 1, expand=True)
vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,Protein Position and Amino Acid,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript)
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,,,,,,,,transcribed_unprocessed_pseudogene,,,,,,1,,downstream_gene_variant,MODIFIER,,,,,,
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|non_coding_transcript_exon_variant|MODIFIER|...,,,,,ENST00000456328.2:n.932C>T,,,lncRNA,3/3,,,,,1,,non_coding_transcript_exon_variant,MODIFIER,,,ENST00000456328.2,n.932C>T,,
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|downstream_gene_variant|MODIFIER|WASH7P|ENSG...,,,,,,,,unprocessed_pseudogene,,,,,,-1,,downstream_gene_variant,MODIFIER,,,,,,
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|downstream_gene_variant|MODIFIER|MIR6859-1|E...,,,,,,,,miRNA,,,,,,-1,,downstream_gene_variant,MODIFIER,,,,,,
0,chr1,13684,rs71260404,C,T,.,PASS,ADP=30;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,1,0,Heterozygous,DDX11L1:100287102,DDX11L1,T|downstream_gene_variant|MODIFIER|DDX11L1|ENS...,T|regulatory_region_variant|MODIFIER|||Regulat...,,,,,,,,enhancer,,,,,,,,regulatory_region_variant,MODIFIER,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83759,chrY,57211949,.,T,C,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,1,0,Heterozygous,,,C|non_coding_transcript_exon_variant|MODIFIER|...,C|non_coding_transcript_exon_variant|MODIFIER|...,,,,,ENST00000484415.6:n.2746T>C,,,retained_intron,5/5,,,,,1,,non_coding_transcript_exon_variant,MODIFIER,,,ENST00000484415.6,n.2746T>C,,
83759,chrY,57211949,.,T,C,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,1,0,Heterozygous,,,C|non_coding_transcript_exon_variant|MODIFIER|...,C|non_coding_transcript_exon_variant|MODIFIER|...,,,,,ENST00000492963.6:n.2950T>C,,,retained_intron,6/6,,,,,1,,non_coding_transcript_exon_variant,MODIFIER,,,ENST00000492963.6,n.2950T>C,,
83759,chrY,57211949,.,T,C,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,1,0,Heterozygous,,,C|non_coding_transcript_exon_variant|MODIFIER|...,C|downstream_gene_variant|MODIFIER|WASH6P|ENSG...,,,,,,,,retained_intron,,,,,,1,,downstream_gene_variant,MODIFIER,,,,,,
83759,chrY,57211949,.,T,C,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,1,0,Heterozygous,,,C|non_coding_transcript_exon_variant|MODIFIER|...,C|non_coding_transcript_exon_variant|MODIFIER|...,,,,,ENST00000496301.6:n.1404T>C,,,retained_intron,2/2,,,,,1,,non_coding_transcript_exon_variant,MODIFIER,,,ENST00000496301.6,n.1404T>C,,


In [15]:
vcf_final = vcf[['Gene Name', 'ID','CHROM', 'POS', 'REF', 'ALT', 'Zygosity', 'Consequence', 'IMPACT',
          'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT',
          'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF',
       'ADR', 'SIFT', 'PolyPhen', 'BIOTYPE', 'EXON', 'INTRON',
          'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED']]
vcf_final

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,downstream_gene_variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,,,transcribed_unprocessed_pseudogene,,,,,1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,non_coding_transcript_exon_variant,MODIFIER,,,,,ENST00000456328.2,n.932C>T,,,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,,,lncRNA,3/3,,,,1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,downstream_gene_variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,,,unprocessed_pseudogene,,,,,-1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,downstream_gene_variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,,,miRNA,,,,,-1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,regulatory_region_variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,,,enhancer,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83759,,.,chrY,57211949,T,C,Heterozygous,non_coding_transcript_exon_variant,MODIFIER,,,,,ENST00000484415.6,n.2746T>C,,,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,,,retained_intron,5/5,,,,1,
83759,,.,chrY,57211949,T,C,Heterozygous,non_coding_transcript_exon_variant,MODIFIER,,,,,ENST00000492963.6,n.2950T>C,,,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,,,retained_intron,6/6,,,,1,
83759,,.,chrY,57211949,T,C,Heterozygous,downstream_gene_variant,MODIFIER,,,,,,,,,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,,,retained_intron,,,,,1,
83759,,.,chrY,57211949,T,C,Heterozygous,non_coding_transcript_exon_variant,MODIFIER,,,,,ENST00000496301.6,n.1404T>C,,,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,,,retained_intron,2/2,,,,1,


In [16]:
vcf_final.ClinVar_CLNDN.value_counts()

                                                                                                                                                                                                                                                               476553
not_provided                                                                                                                                                                                                                                                    76180
not_specified&not_provided                                                                                                                                                                                                                                       2856
not_specified                                                                                                                                                                                                         

In [17]:
# Define the terms to remove
remove_terms = set(["not_specified", "not_provided"])

# Apply the filtering operation to 'Column1' only
vcf_final['ClinVar_CLNDN'] = vcf_final['ClinVar_CLNDN'].apply(lambda row: "&".join(
    [term for term in row.split("&") if term not in remove_terms]
    ) if isinstance(row, str) and not all(term in remove_terms for term in row.split("&")) else row)


vcf_final['CLIN_SIG'] = vcf_final['CLIN_SIG'].apply(lambda row: "&".join(
    [term for term in row.split("&") if term not in remove_terms]
    ) if isinstance(row, str) and not all(term in remove_terms for term in row.split("&")) else row)


vcf_final['ClinVar_CLNREVSTAT'] = vcf_final['ClinVar_CLNREVSTAT'].apply(lambda row: "&".join(
    [term for term in row.split("&") if term not in remove_terms]
    ) if isinstance(row, str) and not all(term in remove_terms for term in row.split("&")) else row)

# Print the modified DataFrame
vcf_final

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,downstream_gene_variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,,,transcribed_unprocessed_pseudogene,,,,,1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,non_coding_transcript_exon_variant,MODIFIER,,,,,ENST00000456328.2,n.932C>T,,,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,,,lncRNA,3/3,,,,1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,downstream_gene_variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,,,unprocessed_pseudogene,,,,,-1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,downstream_gene_variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,,,miRNA,,,,,-1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,regulatory_region_variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.005271,5,18,4,3,,,enhancer,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83759,,.,chrY,57211949,T,C,Heterozygous,non_coding_transcript_exon_variant,MODIFIER,,,,,ENST00000484415.6,n.2746T>C,,,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,,,retained_intron,5/5,,,,1,
83759,,.,chrY,57211949,T,C,Heterozygous,non_coding_transcript_exon_variant,MODIFIER,,,,,ENST00000492963.6,n.2950T>C,,,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,,,retained_intron,6/6,,,,1,
83759,,.,chrY,57211949,T,C,Heterozygous,downstream_gene_variant,MODIFIER,,,,,,,,,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,,,retained_intron,,,,,1,
83759,,.,chrY,57211949,T,C,Heterozygous,non_coding_transcript_exon_variant,MODIFIER,,,,,ENST00000496301.6,n.1404T>C,,,0/1,29,15,15,7,8,53.33%,0.001099,4,3,2,6,,,retained_intron,2/2,,,,1,


In [18]:
vcf_final = vcf_final.astype(str).applymap(lambda x: x.replace('&', ',').replace('_', ' '))
vcf_final

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,downstream gene variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.0052713,5,18,4,3,,,transcribed unprocessed pseudogene,,,,,1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,non coding transcript exon variant,MODIFIER,,,,,ENST00000456328.2,n.932C>T,,,0/1,22,30,30,23,7,23.33%,0.0052713,5,18,4,3,,,lncRNA,3/3,,,,1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,downstream gene variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.0052713,5,18,4,3,,,unprocessed pseudogene,,,,,-1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,downstream gene variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.0052713,5,18,4,3,,,miRNA,,,,,-1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,regulatory region variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.0052713,5,18,4,3,,,enhancer,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83759,,.,chrY,57211949,T,C,Heterozygous,non coding transcript exon variant,MODIFIER,,,,,ENST00000484415.6,n.2746T>C,,,0/1,29,15,15,7,8,53.33%,0.0010995,4,3,2,6,,,retained intron,5/5,,,,1,
83759,,.,chrY,57211949,T,C,Heterozygous,non coding transcript exon variant,MODIFIER,,,,,ENST00000492963.6,n.2950T>C,,,0/1,29,15,15,7,8,53.33%,0.0010995,4,3,2,6,,,retained intron,6/6,,,,1,
83759,,.,chrY,57211949,T,C,Heterozygous,downstream gene variant,MODIFIER,,,,,,,,,0/1,29,15,15,7,8,53.33%,0.0010995,4,3,2,6,,,retained intron,,,,,1,
83759,,.,chrY,57211949,T,C,Heterozygous,non coding transcript exon variant,MODIFIER,,,,,ENST00000496301.6,n.1404T>C,,,0/1,29,15,15,7,8,53.33%,0.0010995,4,3,2,6,,,retained intron,2/2,,,,1,


In [19]:
vcf_final['POS'] = vcf_final['POS'].astype('int64')
vcf_final

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,downstream gene variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.0052713,5,18,4,3,,,transcribed unprocessed pseudogene,,,,,1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,non coding transcript exon variant,MODIFIER,,,,,ENST00000456328.2,n.932C>T,,,0/1,22,30,30,23,7,23.33%,0.0052713,5,18,4,3,,,lncRNA,3/3,,,,1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,downstream gene variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.0052713,5,18,4,3,,,unprocessed pseudogene,,,,,-1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,downstream gene variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.0052713,5,18,4,3,,,miRNA,,,,,-1,
0,DDX11L1,rs71260404,chr1,13684,C,T,Heterozygous,regulatory region variant,MODIFIER,,,,,,,,,0/1,22,30,30,23,7,23.33%,0.0052713,5,18,4,3,,,enhancer,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83759,,.,chrY,57211949,T,C,Heterozygous,non coding transcript exon variant,MODIFIER,,,,,ENST00000484415.6,n.2746T>C,,,0/1,29,15,15,7,8,53.33%,0.0010995,4,3,2,6,,,retained intron,5/5,,,,1,
83759,,.,chrY,57211949,T,C,Heterozygous,non coding transcript exon variant,MODIFIER,,,,,ENST00000492963.6,n.2950T>C,,,0/1,29,15,15,7,8,53.33%,0.0010995,4,3,2,6,,,retained intron,6/6,,,,1,
83759,,.,chrY,57211949,T,C,Heterozygous,downstream gene variant,MODIFIER,,,,,,,,,0/1,29,15,15,7,8,53.33%,0.0010995,4,3,2,6,,,retained intron,,,,,1,
83759,,.,chrY,57211949,T,C,Heterozygous,non coding transcript exon variant,MODIFIER,,,,,ENST00000496301.6,n.1404T>C,,,0/1,29,15,15,7,8,53.33%,0.0010995,4,3,2,6,,,retained intron,2/2,,,,1,


In [20]:
vcf_final.isnull().sum()

Gene Name                          0
ID                                 0
CHROM                              0
POS                                0
REF                                0
ALT                                0
Zygosity                           0
Consequence                        0
IMPACT                             0
ClinVar_CLNDN                      0
CLIN_SIG                           0
ClinVar_CLNREVSTAT                 0
ClinVar                            0
HGVSc                              0
HGVSc (Transcript)                 0
HGVSp                              0
HGVSp (Transcript)                 0
GT                                 0
GQ                                 0
SDP                                0
DP                                 0
RD                                 0
AD                                 0
FREQ                               0
PVAL                               0
RDF                                0
RDR                                0
A

In [21]:
vcf_final.CLIN_SIG.value_counts()

                                                                            477203
benign                                                                      168597
benign,likely benign                                                          4708
likely benign                                                                 2996
benign,benign/likely benign                                                   1331
                                                                             ...  
benign,benign/likely benign,conflicting interpretations of pathogenicity         3
uncertain significance,risk factor                                               2
association,likely benign                                                        2
likely benign,drug response,other                                                2
conflicting interpretations of pathogenicity,risk factor,benign                  2
Name: CLIN_SIG, Length: 109, dtype: int64

In [22]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/KHGLBS535/POS.xlsx')
df

Unnamed: 0,POS
0,69270
1,69511
2,69735
3,69897
4,451229
...,...
38110,2789135
38111,3019783
38112,6885470
38113,9466925


In [23]:
merged_vcf = pd.merge(vcf_final, df, on = 'POS', how = 'inner')
merged_vcf

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,protein coding,3/3,,S81,tcA/tcG,1,
1,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,MODIFIER,,,,,,,,,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,TF binding site,,,,,,
2,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,143,143,0,143,100%,1.7063e-85,0,0,100,43,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,
3,OR4F5,rs1245478362,chr1,69735,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.708A>G,ENSP00000493376.2,p.Leu236%3D,0/1,163,180,180,131,49,27.22%,4.0111e-17,84,47,39,10,,,protein coding,3/3,,L236,ctA/ctG,1,
4,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,75,96,96,73,23,23.96%,2.6678e-08,30,43,13,10,,,protein coding,3/3,,S290,tcT/tcC,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333893,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,MODIFIER,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,transcribed unprocessed pseudogene,,,,,-1,
333894,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,MODERATE,,,,,ENST00000423647.6,c.257G>A,ENSP00000389324.3,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.375),protein coding,1/6,,R86Q,cGg/cAg,1,
333895,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,MODERATE,,,,,ENST00000451548.6,c.257G>A,ENSP00000403304.1,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.101),protein coding,1/6,,R86Q,cGg/cAg,1,
333896,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,MODIFIER,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,lncRNA,,,,,-1,


In [24]:
336150-335796

354

In [25]:
#merged_vcf.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/KHHSPTGPCSP6_processed_vcf.xlsx', index=False)

In [26]:
#df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/KHHSPTGPCSP6_processed_vcf.xlsx')
#df.head()

In [27]:
merged_vcf.Consequence.value_counts()

downstream gene variant                                     58852
intron variant                                              46047
upstream gene variant                                       43542
synonymous variant                                          35881
missense variant                                            28788
                                                            ...  
stop gained,splice region variant                               4
coding sequence variant                                         4
start lost,splice region variant,NMD transcript variant         3
stop gained,splice region variant,NMD transcript variant        3
stop lost,splice region variant,NMD transcript variant          1
Name: Consequence, Length: 63, dtype: int64

In [28]:
merged_vcf['consequence'] = merged_vcf['Consequence'].str.split(',').str[0]
merged_vcf

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,protein coding,3/3,,S81,tcA/tcG,1,,synonymous variant
1,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,MODIFIER,,,,,,,,,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,TF binding site,,,,,,,regulatory region variant
2,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,143,143,0,143,100%,1.7063e-85,0,0,100,43,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,,missense variant
3,OR4F5,rs1245478362,chr1,69735,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.708A>G,ENSP00000493376.2,p.Leu236%3D,0/1,163,180,180,131,49,27.22%,4.0111e-17,84,47,39,10,,,protein coding,3/3,,L236,ctA/ctG,1,,synonymous variant
4,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,75,96,96,73,23,23.96%,2.6678e-08,30,43,13,10,,,protein coding,3/3,,S290,tcT/tcC,1,,synonymous variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333893,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,MODIFIER,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,transcribed unprocessed pseudogene,,,,,-1,,upstream gene variant
333894,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,MODERATE,,,,,ENST00000423647.6,c.257G>A,ENSP00000389324.3,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.375),protein coding,1/6,,R86Q,cGg/cAg,1,,missense variant
333895,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,MODERATE,,,,,ENST00000451548.6,c.257G>A,ENSP00000403304.1,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.101),protein coding,1/6,,R86Q,cGg/cAg,1,,missense variant
333896,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,MODIFIER,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,lncRNA,,,,,-1,,upstream gene variant


In [29]:
merged_vcf[merged_vcf['ID'] == 'rs6672356']

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence
46,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,protein coding,,,,,-1.0,,downstream gene variant
47,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",1166513,ENST00000341065.8,c.751T>C,ENSP00000349216.4,p.Trp251Arg,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,tolerated(1),benign(0),protein coding,8/12,,W251R,Tgg/Cgg,1.0,,missense variant
48,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",1166513,ENST00000342066.8,c.1027T>C,ENSP00000342313.3,p.Trp343Arg,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,tolerated(1),benign(0),protein coding,10/14,,W343R,Tgg/Cgg,1.0,,missense variant
49,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",1166513,ENST00000455979.1,c.508T>C,ENSP00000412228.1,p.Trp170Arg,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,tolerated(1),benign(0),protein coding,4/7,,W170R,Tgg/Cgg,1.0,,missense variant
50,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,non coding transcript exon variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,ENST00000464948.1,n.286T>C,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,1/2,,,,1.0,,non coding transcript exon variant
51,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,non coding transcript exon variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,ENST00000466827.1,n.191T>C,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,2/2,,,,1.0,,non coding transcript exon variant
52,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,non coding transcript exon variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,ENST00000474461.1,n.389T>C,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,3/4,,,,1.0,,non coding transcript exon variant
53,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,,,,,-1.0,,downstream gene variant
54,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,protein coding CDS not defined,,,,,1.0,,downstream gene variant
55,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,,,,,-1.0,,downstream gene variant


In [30]:
df_1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/consequence.xlsx')
df_1

Unnamed: 0,consequence,Consequence_score
0,transcript ablation,10/10
1,splice acceptor variant,8/10
2,splice donor variant,8/10
3,stop gained,10/10
4,frameshift variant,10/10
5,stop lost,9/10
6,start lost,9/10
7,transcript amplification,8/10
8,inframe insertion,6/10
9,inframe deletion,6/10


In [31]:
merged_1 = pd.merge(merged_vcf, df_1, on='consequence', how='left', sort=False)
merged_1

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence,Consequence_score
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,protein coding,3/3,,S81,tcA/tcG,1,,synonymous variant,3/10
1,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,MODIFIER,,,,,,,,,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,TF binding site,,,,,,,regulatory region variant,2/10
2,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,143,143,0,143,100%,1.7063e-85,0,0,100,43,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,,missense variant,7/10
3,OR4F5,rs1245478362,chr1,69735,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.708A>G,ENSP00000493376.2,p.Leu236%3D,0/1,163,180,180,131,49,27.22%,4.0111e-17,84,47,39,10,,,protein coding,3/3,,L236,ctA/ctG,1,,synonymous variant,3/10
4,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,75,96,96,73,23,23.96%,2.6678e-08,30,43,13,10,,,protein coding,3/3,,S290,tcT/tcC,1,,synonymous variant,3/10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333893,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,MODIFIER,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,transcribed unprocessed pseudogene,,,,,-1,,upstream gene variant,2/10
333894,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,MODERATE,,,,,ENST00000423647.6,c.257G>A,ENSP00000389324.3,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.375),protein coding,1/6,,R86Q,cGg/cAg,1,,missense variant,7/10
333895,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,MODERATE,,,,,ENST00000451548.6,c.257G>A,ENSP00000403304.1,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.101),protein coding,1/6,,R86Q,cGg/cAg,1,,missense variant,7/10
333896,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,MODIFIER,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,lncRNA,,,,,-1,,upstream gene variant,2/10


In [32]:
merged_1[merged_1['ID'] == 'rs6672356']

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence,Consequence_score
46,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,protein coding,,,,,-1.0,,downstream gene variant,2/10
47,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",1166513,ENST00000341065.8,c.751T>C,ENSP00000349216.4,p.Trp251Arg,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,tolerated(1),benign(0),protein coding,8/12,,W251R,Tgg/Cgg,1.0,,missense variant,7/10
48,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",1166513,ENST00000342066.8,c.1027T>C,ENSP00000342313.3,p.Trp343Arg,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,tolerated(1),benign(0),protein coding,10/14,,W343R,Tgg/Cgg,1.0,,missense variant,7/10
49,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",1166513,ENST00000455979.1,c.508T>C,ENSP00000412228.1,p.Trp170Arg,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,tolerated(1),benign(0),protein coding,4/7,,W170R,Tgg/Cgg,1.0,,missense variant,7/10
50,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,non coding transcript exon variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,ENST00000464948.1,n.286T>C,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,1/2,,,,1.0,,non coding transcript exon variant,2/10
51,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,non coding transcript exon variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,ENST00000466827.1,n.191T>C,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,2/2,,,,1.0,,non coding transcript exon variant,2/10
52,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,non coding transcript exon variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,ENST00000474461.1,n.389T>C,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,3/4,,,,1.0,,non coding transcript exon variant,2/10
53,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,,,,,-1.0,,downstream gene variant,2/10
54,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,protein coding CDS not defined,,,,,1.0,,downstream gene variant,2/10
55,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,,,,,-1.0,,downstream gene variant,2/10


In [33]:
df_2 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/IMPACT.xlsx')
df_2

Unnamed: 0,IMPACT,IMPACT_score
0,HIGH,10.0
1,MODERATE,5.0
2,LOW,2.5
3,MODIFIER,1.5


In [34]:
merged_2 = pd.merge(merged_1, df_2, on = 'IMPACT', how='left', sort=False)
merged_2

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence,Consequence_score,IMPACT_score
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,protein coding,3/3,,S81,tcA/tcG,1,,synonymous variant,3/10,2.5
1,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,MODIFIER,,,,,,,,,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,TF binding site,,,,,,,regulatory region variant,2/10,1.5
2,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,143,143,0,143,100%,1.7063e-85,0,0,100,43,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,,missense variant,7/10,5.0
3,OR4F5,rs1245478362,chr1,69735,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.708A>G,ENSP00000493376.2,p.Leu236%3D,0/1,163,180,180,131,49,27.22%,4.0111e-17,84,47,39,10,,,protein coding,3/3,,L236,ctA/ctG,1,,synonymous variant,3/10,2.5
4,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,75,96,96,73,23,23.96%,2.6678e-08,30,43,13,10,,,protein coding,3/3,,S290,tcT/tcC,1,,synonymous variant,3/10,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333893,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,MODIFIER,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,transcribed unprocessed pseudogene,,,,,-1,,upstream gene variant,2/10,1.5
333894,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,MODERATE,,,,,ENST00000423647.6,c.257G>A,ENSP00000389324.3,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.375),protein coding,1/6,,R86Q,cGg/cAg,1,,missense variant,7/10,5.0
333895,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,MODERATE,,,,,ENST00000451548.6,c.257G>A,ENSP00000403304.1,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.101),protein coding,1/6,,R86Q,cGg/cAg,1,,missense variant,7/10,5.0
333896,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,MODIFIER,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,lncRNA,,,,,-1,,upstream gene variant,2/10,1.5


In [35]:
merged_2[merged_2['ID'] == 'rs6672356']

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence,Consequence_score,IMPACT_score
46,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,protein coding,,,,,-1.0,,downstream gene variant,2/10,1.5
47,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",1166513,ENST00000341065.8,c.751T>C,ENSP00000349216.4,p.Trp251Arg,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,tolerated(1),benign(0),protein coding,8/12,,W251R,Tgg/Cgg,1.0,,missense variant,7/10,5.0
48,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",1166513,ENST00000342066.8,c.1027T>C,ENSP00000342313.3,p.Trp343Arg,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,tolerated(1),benign(0),protein coding,10/14,,W343R,Tgg/Cgg,1.0,,missense variant,7/10,5.0
49,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",1166513,ENST00000455979.1,c.508T>C,ENSP00000412228.1,p.Trp170Arg,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,tolerated(1),benign(0),protein coding,4/7,,W170R,Tgg/Cgg,1.0,,missense variant,7/10,5.0
50,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,non coding transcript exon variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,ENST00000464948.1,n.286T>C,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,1/2,,,,1.0,,non coding transcript exon variant,2/10,1.5
51,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,non coding transcript exon variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,ENST00000466827.1,n.191T>C,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,2/2,,,,1.0,,non coding transcript exon variant,2/10,1.5
52,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,non coding transcript exon variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,ENST00000474461.1,n.389T>C,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,3/4,,,,1.0,,non coding transcript exon variant,2/10,1.5
53,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,,,,,-1.0,,downstream gene variant,2/10,1.5
54,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,protein coding CDS not defined,,,,,1.0,,downstream gene variant,2/10,1.5
55,SAMD11,rs6672356,chr1,942451,T,C,Homozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,,,,,-1.0,,downstream gene variant,2/10,1.5


In [36]:
merged_2 = merged_2[['Gene Name', 'ID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
       'Consequence','Consequence_score', 'IMPACT', 'IMPACT_score', 'ClinVar_CLNDN', 'CLIN_SIG',
       'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp',
       'HGVSp (Transcript)', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ',
       'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT', 'PolyPhen', 'BIOTYPE',
       'EXON', 'INTRON', 'Protein Position and Amino Acid', 'Codons', 'STRAND',
       'PUBMED']]
merged_2

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,protein coding,3/3,,S81,tcA/tcG,1,
1,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,TF binding site,,,,,,
2,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,143,143,0,143,100%,1.7063e-85,0,0,100,43,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,
3,OR4F5,rs1245478362,chr1,69735,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.708A>G,ENSP00000493376.2,p.Leu236%3D,0/1,163,180,180,131,49,27.22%,4.0111e-17,84,47,39,10,,,protein coding,3/3,,L236,ctA/ctG,1,
4,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,75,96,96,73,23,23.96%,2.6678e-08,30,43,13,10,,,protein coding,3/3,,S290,tcT/tcC,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333893,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,transcribed unprocessed pseudogene,,,,,-1,
333894,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000423647.6,c.257G>A,ENSP00000389324.3,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.375),protein coding,1/6,,R86Q,cGg/cAg,1,
333895,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000451548.6,c.257G>A,ENSP00000403304.1,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.101),protein coding,1/6,,R86Q,cGg/cAg,1,
333896,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,lncRNA,,,,,-1,


In [37]:
df_gene = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Vitiligo_genes.xlsx')
df_gene

Unnamed: 0,Gene Name
0,HLADR4
1,HLADR7
2,HLADQ7
3,HLADR1
4,HLAB13
5,HLADQW3
6,HLACW6
7,HLAA30
8,AIS1
9,FOXD3


In [38]:
#merged_2 = pd.merge(merged_2, df_gene, on = 'Gene Name', how='left', sort=False)
#merged_2['Gene_Match_Condition'] = merged_2['Gene_Match_Condition'].fillna('No')
#merged_2

In [39]:
merged_2['Gene_Match'] = 'No'

# Iterate through each gene in df1
for genes in merged_2['Gene Name']:
    if isinstance(genes, str):  # Check if the gene value is a non-null string
        gene_list = genes.split(',')  # Split the genes by comma to create a list
        match = any(gene in df_gene['Gene Name'].values for gene in gene_list)  # Check if any gene in the list exists in df2
        if match:
            merged_2.loc[merged_2['Gene Name'] == genes, 'Gene_Match'] = 'Yes'
            
merged_2

Unnamed: 0,Gene Name,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Gene_Match
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,protein coding,3/3,,S81,tcA/tcG,1,,No
1,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,TF binding site,,,,,,,No
2,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,143,143,0,143,100%,1.7063e-85,0,0,100,43,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,,No
3,OR4F5,rs1245478362,chr1,69735,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.708A>G,ENSP00000493376.2,p.Leu236%3D,0/1,163,180,180,131,49,27.22%,4.0111e-17,84,47,39,10,,,protein coding,3/3,,L236,ctA/ctG,1,,No
4,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,75,96,96,73,23,23.96%,2.6678e-08,30,43,13,10,,,protein coding,3/3,,S290,tcT/tcC,1,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333893,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,transcribed unprocessed pseudogene,,,,,-1,,No
333894,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000423647.6,c.257G>A,ENSP00000389324.3,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.375),protein coding,1/6,,R86Q,cGg/cAg,1,,No
333895,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000451548.6,c.257G>A,ENSP00000403304.1,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.101),protein coding,1/6,,R86Q,cGg/cAg,1,,No
333896,TSPY1,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,lncRNA,,,,,-1,,No


In [40]:
merged_2.Gene_Match.value_counts()

No     333870
Yes        28
Name: Gene_Match, dtype: int64

In [41]:
merged_2.columns

Index(['Gene Name', 'ID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
       'Consequence', 'Consequence_score', 'IMPACT', 'IMPACT_score',
       'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc',
       'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'GT', 'GQ', 'SDP',
       'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT',
       'PolyPhen', 'BIOTYPE', 'EXON', 'INTRON',
       'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED',
       'Gene_Match'],
      dtype='object')

In [42]:
merged_2 = merged_2[['Gene Name', 'Gene_Match', 'ID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
       'Consequence', 'Consequence_score', 'IMPACT', 'IMPACT_score',
       'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc',
       'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'GT', 'GQ', 'SDP',
       'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT',
       'PolyPhen', 'BIOTYPE', 'EXON', 'INTRON',
       'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED']]
merged_2

Unnamed: 0,Gene Name,Gene_Match,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,No,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,protein coding,3/3,,S81,tcA/tcG,1,
1,OR4F5,No,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,TF binding site,,,,,,
2,OR4F5,No,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,143,143,0,143,100%,1.7063e-85,0,0,100,43,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,
3,OR4F5,No,rs1245478362,chr1,69735,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.708A>G,ENSP00000493376.2,p.Leu236%3D,0/1,163,180,180,131,49,27.22%,4.0111e-17,84,47,39,10,,,protein coding,3/3,,L236,ctA/ctG,1,
4,OR4F5,No,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,75,96,96,73,23,23.96%,2.6678e-08,30,43,13,10,,,protein coding,3/3,,S290,tcT/tcC,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333893,TSPY1,No,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,transcribed unprocessed pseudogene,,,,,-1,
333894,TSPY1,No,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000423647.6,c.257G>A,ENSP00000389324.3,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.375),protein coding,1/6,,R86Q,cGg/cAg,1,
333895,TSPY1,No,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000451548.6,c.257G>A,ENSP00000403304.1,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.101),protein coding,1/6,,R86Q,cGg/cAg,1,
333896,TSPY1,No,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,lncRNA,,,,,-1,


In [376]:
330219 - 329950

269

In [43]:
df_3 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Condition_pos/Vitiligo_pos.xlsx')
df_3

Unnamed: 0,POS,Literature
0,113834946,Yes
1,188371466,Yes
2,188394766,Yes
3,188396801,Yes
4,188406566,Yes


In [44]:
merged_3 = pd.merge(merged_2, df_3, on = 'POS', how = 'left', sort=False)
merged_3

Unnamed: 0,Gene Name,Gene_Match,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Literature
0,OR4F5,No,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,protein coding,3/3,,S81,tcA/tcG,1,,
1,OR4F5,No,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,TF binding site,,,,,,,
2,OR4F5,No,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,143,143,0,143,100%,1.7063e-85,0,0,100,43,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,,
3,OR4F5,No,rs1245478362,chr1,69735,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.708A>G,ENSP00000493376.2,p.Leu236%3D,0/1,163,180,180,131,49,27.22%,4.0111e-17,84,47,39,10,,,protein coding,3/3,,L236,ctA/ctG,1,,
4,OR4F5,No,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,75,96,96,73,23,23.96%,2.6678e-08,30,43,13,10,,,protein coding,3/3,,S290,tcT/tcC,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333893,TSPY1,No,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,transcribed unprocessed pseudogene,,,,,-1,,
333894,TSPY1,No,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000423647.6,c.257G>A,ENSP00000389324.3,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.375),protein coding,1/6,,R86Q,cGg/cAg,1,,
333895,TSPY1,No,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000451548.6,c.257G>A,ENSP00000403304.1,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.101),protein coding,1/6,,R86Q,cGg/cAg,1,,
333896,TSPY1,No,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,lncRNA,,,,,-1,,


In [45]:
merged_3['Literature'] = merged_3['Literature'].fillna('No')
merged_3

Unnamed: 0,Gene Name,Gene_Match,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Literature
0,OR4F5,No,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,protein coding,3/3,,S81,tcA/tcG,1,,No
1,OR4F5,No,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,TF binding site,,,,,,,No
2,OR4F5,No,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,143,143,0,143,100%,1.7063e-85,0,0,100,43,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,,No
3,OR4F5,No,rs1245478362,chr1,69735,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.708A>G,ENSP00000493376.2,p.Leu236%3D,0/1,163,180,180,131,49,27.22%,4.0111e-17,84,47,39,10,,,protein coding,3/3,,L236,ctA/ctG,1,,No
4,OR4F5,No,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,75,96,96,73,23,23.96%,2.6678e-08,30,43,13,10,,,protein coding,3/3,,S290,tcT/tcC,1,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333893,TSPY1,No,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,transcribed unprocessed pseudogene,,,,,-1,,No
333894,TSPY1,No,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000423647.6,c.257G>A,ENSP00000389324.3,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.375),protein coding,1/6,,R86Q,cGg/cAg,1,,No
333895,TSPY1,No,rs777840135,chrY,9467257,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000451548.6,c.257G>A,ENSP00000403304.1,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.101),protein coding,1/6,,R86Q,cGg/cAg,1,,No
333896,TSPY1,No,rs777840135,chrY,9467257,G,A,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,lncRNA,,,,,-1,,No


In [46]:
merged_3.Literature.value_counts()

No     333889
Yes         9
Name: Literature, dtype: int64

In [47]:
merged_3[merged_3['Literature'] == 'Yes']

Unnamed: 0,Gene Name,Gene_Match,ID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Literature
16704,"PTPN22,AP4B1-AS1",Yes,rs2476601,chr1,113834946,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,"benign,risk factor",,,ENST00000359785.10,c.1858T>C,ENSP00000352833.5,p.Trp620Arg,1/1,70,13,13,0,13,100%,9.6148e-08,0,0,9,4,tolerated(1),benign(0),protein coding,14/21,,W620R,Tgg/Cgg,-1,"18248681,30409984,20941391,27417569,20444268,2...",Yes
16705,"PTPN22,AP4B1-AS1",Yes,rs2476601,chr1,113834946,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,"benign,risk factor",,,ENST00000420377.6,c.1858T>C,ENSP00000388229.2,p.Trp620Arg,1/1,70,13,13,0,13,100%,9.6148e-08,0,0,9,4,tolerated(1),benign(0),protein coding,14/20,,W620R,Tgg/Cgg,-1,"18248681,30409984,20941391,27417569,20444268,2...",Yes
16706,"PTPN22,AP4B1-AS1",Yes,rs2476601,chr1,113834946,A,G,Homozygous,intron variant,2/10,MODIFIER,1.5,,"benign,risk factor",,,ENST00000460620.5,c.469-15292T>C,,,1/1,70,13,13,0,13,100%,9.6148e-08,0,0,9,4,,,protein coding,,6/7,,,-1,"18248681,30409984,20941391,27417569,20444268,2...",Yes
16707,"PTPN22,AP4B1-AS1",Yes,rs2476601,chr1,113834946,A,G,Homozygous,non coding transcript exon variant,2/10,MODIFIER,1.5,,"benign,risk factor",,,ENST00000484147.5,n.1899T>C,,,1/1,70,13,13,0,13,100%,9.6148e-08,0,0,9,4,,,retained intron,14/16,,,,-1,"18248681,30409984,20941391,27417569,20444268,2...",Yes
16708,"PTPN22,AP4B1-AS1",Yes,rs2476601,chr1,113834946,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,"benign,risk factor",,,ENST00000525799.1,c.1477T>C,ENSP00000432674.1,p.Trp493Arg,1/1,70,13,13,0,13,100%,9.6148e-08,0,0,9,4,tolerated(1),benign(0),protein coding,9/15,,W493R,Tgg/Cgg,-1,"18248681,30409984,20941391,27417569,20444268,2...",Yes
16709,"PTPN22,AP4B1-AS1",Yes,rs2476601,chr1,113834946,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,"benign,risk factor",,,ENST00000528414.5,c.1693T>C,ENSP00000435176.1,p.Trp565Arg,1/1,70,13,13,0,13,100%,9.6148e-08,0,0,9,4,tolerated(1),benign(0),protein coding,12/19,,W565R,Tgg/Cgg,-1,"18248681,30409984,20941391,27417569,20444268,2...",Yes
16710,"PTPN22,AP4B1-AS1",Yes,rs2476601,chr1,113834946,A,G,Homozygous,"3 prime UTR variant,NMD transcript variant",3/10,MODIFIER,1.5,,"benign,risk factor",,,ENST00000532224.5,c.*1136T>C,,,1/1,70,13,13,0,13,100%,9.6148e-08,0,0,9,4,,,nonsense mediated decay,10/17,,,,-1,"18248681,30409984,20941391,27417569,20444268,2...",Yes
16711,"PTPN22,AP4B1-AS1",Yes,rs2476601,chr1,113834946,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,"benign,risk factor",,,ENST00000538253.5,c.1786T>C,ENSP00000439372.2,p.Trp596Arg,1/1,70,13,13,0,13,100%,9.6148e-08,0,0,9,4,tolerated(1),benign(0),protein coding,13/20,,W596R,Tgg/Cgg,-1,"18248681,30409984,20941391,27417569,20444268,2...",Yes
16712,"PTPN22,AP4B1-AS1",Yes,rs2476601,chr1,113834946,A,G,Homozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,"benign,risk factor",,,ENST00000664434.1,n.470+3133A>G,,,1/1,70,13,13,0,13,100%,9.6148e-08,0,0,9,4,,,lncRNA,,4/5,,,1,"18248681,30409984,20941391,27417569,20444268,2...",Yes


In [48]:
merged_3.columns

Index(['Gene Name', 'Gene_Match', 'ID', 'CHROM', 'POS', 'REF', 'ALT',
       'Zygosity', 'Consequence', 'Consequence_score', 'IMPACT',
       'IMPACT_score', 'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT',
       'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)',
       'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR',
       'ADF', 'ADR', 'SIFT', 'PolyPhen', 'BIOTYPE', 'EXON', 'INTRON',
       'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED',
       'Literature'],
      dtype='object')

In [49]:
merged_3 = merged_3[['Gene Name', 'Gene_Match', 'ID', 'CHROM', 'POS', 'Literature', 'REF', 'ALT', 'Zygosity',
       'Consequence','Consequence_score', 'IMPACT', 'IMPACT_score', 'ClinVar_CLNDN', 'CLIN_SIG',
       'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp',
       'HGVSp (Transcript)', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ',
       'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT', 'PolyPhen', 'BIOTYPE',
       'EXON', 'INTRON', 'Protein Position and Amino Acid', 'Codons', 'STRAND',
       'PUBMED']]
merged_3

Unnamed: 0,Gene Name,Gene_Match,ID,CHROM,POS,Literature,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,No,rs201219564,chr1,69270,No,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,protein coding,3/3,,S81,tcA/tcG,1,
1,OR4F5,No,rs201219564,chr1,69270,No,A,G,Heterozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,120,101,101,66,35,34.65%,8.1311e-13,48,18,24,11,,,TF binding site,,,,,,
2,OR4F5,No,rs2691305,chr1,69511,No,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,143,143,0,143,100%,1.7063e-85,0,0,100,43,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,
3,OR4F5,No,rs1245478362,chr1,69735,No,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.708A>G,ENSP00000493376.2,p.Leu236%3D,0/1,163,180,180,131,49,27.22%,4.0111e-17,84,47,39,10,,,protein coding,3/3,,L236,ctA/ctG,1,
4,OR4F5,No,rs200676709,chr1,69897,No,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,75,96,96,73,23,23.96%,2.6678e-08,30,43,13,10,,,protein coding,3/3,,S290,tcT/tcC,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333893,TSPY1,No,rs777840135,chrY,9467257,No,G,A,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,transcribed unprocessed pseudogene,,,,,-1,
333894,TSPY1,No,rs777840135,chrY,9467257,No,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000423647.6,c.257G>A,ENSP00000389324.3,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.375),protein coding,1/6,,R86Q,cGg/cAg,1,
333895,TSPY1,No,rs777840135,chrY,9467257,No,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000451548.6,c.257G>A,ENSP00000403304.1,p.Arg86Gln,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,tolerated(1),benign(0.101),protein coding,1/6,,R86Q,cGg/cAg,1,
333896,TSPY1,No,rs777840135,chrY,9467257,No,G,A,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,255,383,383,273,110,28.72%,7.984299999999999e-38,136,137,83,27,,,lncRNA,,,,,-1,


In [50]:
merged_3[merged_3['ID'] == 'rs6672356']

Unnamed: 0,Gene Name,Gene_Match,ID,CHROM,POS,Literature,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
46,SAMD11,No,rs6672356,chr1,942451,No,T,C,Homozygous,downstream gene variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,protein coding,,,,,-1.0,
47,SAMD11,No,rs6672356,chr1,942451,No,T,C,Homozygous,missense variant,7/10,MODERATE,5.0,not provided,benign,"criteria provided, single submitter",1166513,ENST00000341065.8,c.751T>C,ENSP00000349216.4,p.Trp251Arg,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,tolerated(1),benign(0),protein coding,8/12,,W251R,Tgg/Cgg,1.0,
48,SAMD11,No,rs6672356,chr1,942451,No,T,C,Homozygous,missense variant,7/10,MODERATE,5.0,not provided,benign,"criteria provided, single submitter",1166513,ENST00000342066.8,c.1027T>C,ENSP00000342313.3,p.Trp343Arg,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,tolerated(1),benign(0),protein coding,10/14,,W343R,Tgg/Cgg,1.0,
49,SAMD11,No,rs6672356,chr1,942451,No,T,C,Homozygous,missense variant,7/10,MODERATE,5.0,not provided,benign,"criteria provided, single submitter",1166513,ENST00000455979.1,c.508T>C,ENSP00000412228.1,p.Trp170Arg,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,tolerated(1),benign(0),protein coding,4/7,,W170R,Tgg/Cgg,1.0,
50,SAMD11,No,rs6672356,chr1,942451,No,T,C,Homozygous,non coding transcript exon variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",1166513,ENST00000464948.1,n.286T>C,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,1/2,,,,1.0,
51,SAMD11,No,rs6672356,chr1,942451,No,T,C,Homozygous,non coding transcript exon variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",1166513,ENST00000466827.1,n.191T>C,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,2/2,,,,1.0,
52,SAMD11,No,rs6672356,chr1,942451,No,T,C,Homozygous,non coding transcript exon variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",1166513,ENST00000474461.1,n.389T>C,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,3/4,,,,1.0,
53,SAMD11,No,rs6672356,chr1,942451,No,T,C,Homozygous,downstream gene variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,,,,,-1.0,
54,SAMD11,No,rs6672356,chr1,942451,No,T,C,Homozygous,downstream gene variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,protein coding CDS not defined,,,,,1.0,
55,SAMD11,No,rs6672356,chr1,942451,No,T,C,Homozygous,downstream gene variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",1166513,,,,,1/1,255,79,79,0,79,100%,4.3185e-47,0,0,38,41,,,retained intron,,,,,-1.0,


In [51]:
merged_3.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/KHGLBS535_depth_vcf_processed.xlsx', index=False)

# InDel Analysis

In [2]:
indel = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/cdpr_and_csps_all_files/KHCDPRGPTTL5/KHCDPRGPTTL5_indel_clinical_significance.xlsx')
indel = indel.rename({'gene_name' : 'Gene Name'}, axis=1)
indel

Unnamed: 0,allele,zygocity,Gene Name,rsid,consequence,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious
0,"['AAGAGAG', A]",Heterozygous,TRPS1,rs10546472,5_prime_UTR_variant,Uncertain_significance,Trichorhinophalangeal_syndrome,"criteria_provided', '_single_submitter",germline,indel,Insertion,MayBe No,No,No
1,"['AAGAGAG', A]",Heterozygous,TRPS1,rs10546472,5_prime_UTR_variant,Uncertain_significance,Trichorhinophalangeal_syndrome,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No
2,"['ACCTT', A]",Heterozygous,ADAMTS17,rs10549565,3_prime_UTR_variant,Benign,"Weill-Marchesani_4_syndrome', '_recessive|not_...","criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,No
3,"['ATCC', A]",Heterozygous,KIF1A,rs10594016,"inframe_insertion', 'SO:0001627",Conflicting_interpretations_of_pathogenicity,Hereditary_spastic_paraplegia_30|not_specified...,"criteria_provided', '_conflicting_interpretations",germline,indel,Insertion,MayBe No,No,Yes
4,"['ATCC', A]",Heterozygous,KIF1A,rs10594016,"inframe_deletion', 'SO:0001627",Benign,not_specified|not_provided,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578,"['AACTT', A]",Heterozygous,PLCB1,rs78940282,3_prime_UTR_variant,Likely_benign,"Early_Infantile_Epileptic_Encephalopathy', '_A...","criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No
579,"['AT', A]",Heterozygous,ATCAY,rs796133913,3_prime_UTR_variant,Benign,Cayman_type_cerebellar_ataxia,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No
580,"['AT', A]",Heterozygous,ATCAY,rs796133913,3_prime_UTR_variant,Uncertain_significance,Cayman_type_cerebellar_ataxia,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No
581,"['GT', G]",Heterozygous,CD2AP,rs797004904,3_prime_UTR_variant,Benign,Focal_segmental_glomerulosclerosis,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No


In [3]:
df_gene = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/cardiac genes.xlsx')
df_gene

Unnamed: 0,Gene Name
0,LDLR
1,LDLR-AS1
2,NOS3
3,PRKAG2
4,LRP6
...,...
75,TRDN
76,LDLRAP1
77,PCSK9
78,APOB


In [4]:
indel['Gene_Match'] = 'No'

# Iterate through each gene in df1
for genes in indel['Gene Name']:
    if isinstance(genes, str):  # Check if the gene value is a non-null string
        gene_list = genes.split(',')  # Split the genes by comma to create a list
        match = any(gene in df_gene['Gene Name'].values for gene in gene_list)  # Check if any gene in the list exists in df2
        if match:
            indel.loc[indel['Gene Name'] == genes, 'Gene_Match'] = 'Yes'
            
indel

Unnamed: 0,allele,zygocity,Gene Name,rsid,consequence,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious,Gene_Match
0,"['AAGAGAG', A]",Heterozygous,TRPS1,rs10546472,5_prime_UTR_variant,Uncertain_significance,Trichorhinophalangeal_syndrome,"criteria_provided', '_single_submitter",germline,indel,Insertion,MayBe No,No,No,No
1,"['AAGAGAG', A]",Heterozygous,TRPS1,rs10546472,5_prime_UTR_variant,Uncertain_significance,Trichorhinophalangeal_syndrome,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,No
2,"['ACCTT', A]",Heterozygous,ADAMTS17,rs10549565,3_prime_UTR_variant,Benign,"Weill-Marchesani_4_syndrome', '_recessive|not_...","criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,No,No
3,"['ATCC', A]",Heterozygous,KIF1A,rs10594016,"inframe_insertion', 'SO:0001627",Conflicting_interpretations_of_pathogenicity,Hereditary_spastic_paraplegia_30|not_specified...,"criteria_provided', '_conflicting_interpretations",germline,indel,Insertion,MayBe No,No,Yes,No
4,"['ATCC', A]",Heterozygous,KIF1A,rs10594016,"inframe_deletion', 'SO:0001627",Benign,not_specified|not_provided,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578,"['AACTT', A]",Heterozygous,PLCB1,rs78940282,3_prime_UTR_variant,Likely_benign,"Early_Infantile_Epileptic_Encephalopathy', '_A...","criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,No
579,"['AT', A]",Heterozygous,ATCAY,rs796133913,3_prime_UTR_variant,Benign,Cayman_type_cerebellar_ataxia,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,No
580,"['AT', A]",Heterozygous,ATCAY,rs796133913,3_prime_UTR_variant,Uncertain_significance,Cayman_type_cerebellar_ataxia,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,No
581,"['GT', G]",Heterozygous,CD2AP,rs797004904,3_prime_UTR_variant,Benign,Focal_segmental_glomerulosclerosis,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,No


In [5]:
indel.Gene_Match.value_counts()

No     556
Yes     27
Name: Gene_Match, dtype: int64

## InDel Analysis

In [7]:
data = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHCDPRGPTTL6_annotated_indel.vcf', comment='#', sep= '\t', header=None, low_memory=False)
data.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
data

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=29;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.5208,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:150:29:29:1:28:96.55%:9.9776E-16:32:21:1:0...
1,chr1,1299382,rs143128930,AG,A,.,PASS,"ADP=65;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8806,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:109:65:65:35:30:46.15%:1.1519E-11:55:57:27...
2,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=76;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:151:76:76:36:40:52.63%:7.6319E-16:58:56:28...
3,chr1,1398672,rs368050244;rs3831366,CTAGAG,C,.,PASS,ADP=60;WT=0;HET=1;HOM=0;NC=0;ASP;ASS;CAF=0.925...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:131:60:60:26:34:56.67%:7.4725E-14:57:50:21...
4,chr1,1629572,rs112177324,TG,T,.,PASS,ADP=21;WT=0;HET=1;HOM=0;NC=0;ASP;G5;GENEINFO=M...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:50:22:21:9:13:59.09%:9.5813E-6:57:56:9:0:11:2
...,...,...,...,...,...,...,...,...,...,...
2606,chrX,153913005,rs5904376;rs77485258,G,GC,.,PASS,ADP=63;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.0002649...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:63:63:2:61:96.83%:3.4466E-34:74:55:2:0...
2607,chrX,155228363,rs781912204,A,AT,.,PASS,ADP=25;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=VBP1...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:64:25:25:5:15:60%:3.8543E-7:64:55:5:0:14:1
2608,chrX,155492733,rs1169019545;rs376271737,T,TG,.,PASS,ADP=17;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=TMLH...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:81:17:17:1:16:94.12%:7.7134E-9:35:55:0:1:11:5
2609,chrY,12786501,rs760255651,CT,C,.,PASS,ADP=21;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=USP9...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:50:21:21:4:12:57.14%:8.0605E-6:42:49:3:1:10:2


In [8]:
sample_cols = data['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']

# Assign the values to the newly created columns
data = pd.concat([data, sample_cols], axis=1)
data = data[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL','RDF', 'RDR', 'ADF', 'ADR']]
data

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
0,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=29;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.5208,0....",1/1,150,29,29,1,28,96.55%,9.9776E-16,1,0,25,3
1,chr1,1299382,rs143128930,AG,A,.,PASS,"ADP=65;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8806,0....",0/1,109,65,65,35,30,46.15%,1.1519E-11,27,8,26,4
2,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=76;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",0/1,151,76,76,36,40,52.63%,7.6319E-16,28,8,34,6
3,chr1,1398672,rs368050244;rs3831366,CTAGAG,C,.,PASS,ADP=60;WT=0;HET=1;HOM=0;NC=0;ASP;ASS;CAF=0.925...,0/1,131,60,60,26,34,56.67%,7.4725E-14,21,5,26,8
4,chr1,1629572,rs112177324,TG,T,.,PASS,ADP=21;WT=0;HET=1;HOM=0;NC=0;ASP;G5;GENEINFO=M...,0/1,50,22,21,9,13,59.09%,9.5813E-6,9,0,11,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2606,chrX,153913005,rs5904376;rs77485258,G,GC,.,PASS,ADP=63;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.0002649...,1/1,255,63,63,2,61,96.83%,3.4466E-34,2,0,50,11
2607,chrX,155228363,rs781912204,A,AT,.,PASS,ADP=25;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=VBP1...,0/1,64,25,25,5,15,60%,3.8543E-7,5,0,14,1
2608,chrX,155492733,rs1169019545;rs376271737,T,TG,.,PASS,ADP=17;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=TMLH...,1/1,81,17,17,1,16,94.12%,7.7134E-9,0,1,11,5
2609,chrY,12786501,rs760255651,CT,C,.,PASS,ADP=21;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=USP9...,0/1,50,21,21,4,12,57.14%,8.0605E-6,3,1,10,2


In [9]:
data.INFO.iloc[0]

'ADP=29;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.5208,0.4792;COMMON=1;G5;GENEINFO=AGRN:375790;GNO;INT;KGPhase1;KGPhase3;PM;PMC;RS=35881187;RSPOS=1043224;RV;SAO=0;SLO;SSR=0;TOPMED=0.55968081039755351,0.44031918960244648;VC=DIV;VLD;VP=0x05012808000515013e000200;WGT=1;dbSNPBuildID=126'

In [11]:
# Create empty columns
columns = ['ADP', 'WT', 'HET', 'HOM', 'NC', 'CDA', 'OTH', 'S3D', 'WTD', 'dbSNPBuildID', 'SLO',
           'NSF', 'R3', 'R5', 'NSN', 'NSM', 'G5A', 'COMMON', 'RS', 'RV', 'TPA', 'CFL', 'GNO',
           'VLD', 'ASP', 'ASS', 'Ref', 'U3', 'U5', 'TOPMED', 'WGT', 'MTP', 'LSD', 'NOC',
           'DSS', 'SYN', 'KGPhase3', 'CAF', 'VC', 'MUT', 'KGPhase1', 'NOV', 'VP', 'SAO',
           'GENEINFO', 'INT', 'G5', 'OM', 'PMC', 'SSR', 'RSPOS', 'HD', 'PM', 'ClinVar',
           'ClinVar_CLNSIG']

for col in columns:
    data[col] = ''

# Populate columns based on 'info' values
for i, row in data.iterrows():
    info = row['INFO']
    items = info.split(';')
    for item in items:
        key_value = item.split('=')
        key = key_value[0]
        if key in columns:
            if len(key_value) > 1:
                value = key_value[1]
                data.at[i, key] = f"{key}={value}"
            else:
                data.at[i, key] = key
        else:
            data.at[i, key] = 'null'


            
data["Gene_Name"] = data["INFO"].str.extract('GENEINFO=(?P<GENEINFO>.+?);')
data['Gene Name'] = data['Gene_Name'].apply(lambda x: ','.join([segment.split(':')[0] for segment in x.split('|')]) if pd.notnull(x) else '')

# Print the resulting DataFrame
data

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,GENEINFO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,ClinVar,ClinVar_CLNSIG,Gene_Name,Gene Name
0,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=29;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.5208,0....",1/1,150,29,29,1,28,96.55%,9.9776E-16,1,0,25,3,ADP=29,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,GENEINFO=AGRN:375790,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM,,,AGRN:375790,AGRN
1,chr1,1299382,rs143128930,AG,A,.,PASS,"ADP=65;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8806,0....",0/1,109,65,65,35,30,46.15%,1.1519E-11,27,8,26,4,ADP=65,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,,,,R5,,,,COMMON=1,RS=143128930,,,,,VLD,ASP,,,,,"TOPMED=0.91126720183486238,0.08873279816513761",WGT=1,,,,,,KGPhase3,"CAF=0.8806,0.1194",VC=DIV,,KGPhase1,,VP=0x0500000a000515003e000200,SAO=0,GENEINFO=ACAP3:116983,INT,G5,,,SSR=0,RSPOS=1299383,,,,,ACAP3:116983,ACAP3
2,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=76;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",0/1,151,76,76,36,40,52.63%,7.6319E-16,28,8,34,6,ADP=76,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,GENEINFO=MXRA8:54587,INT,G5,,,SSR=0,RSPOS=1353988,,,,,MXRA8:54587,MXRA8
3,chr1,1398672,rs368050244;rs3831366,CTAGAG,C,.,PASS,ADP=60;WT=0;HET=1;HOM=0;NC=0;ASP;ASS;CAF=0.925...,0/1,131,60,60,26,34,56.67%,7.4725E-14,21,5,26,8,ADP=60,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=107,138",SLO,,,R5,,,,"COMMON=1,1","RS=3831366,368050244",,,,GNO,VLD,ASP,ASS,,,,"TOPMED=0.94513729612640163,0.05486270387359836","WGT=1,1",,,,,,KGPhase3,"CAF=0.9259,0.07408,0.9259,0.07408","VC=DIV,DIV",,KGPhase1,,"VP=0x05012822000515013e000200,0x0500000a000500...","SAO=0,0","GENEINFO=LOC148413:148413|CCNL2:81669,LOC14841...",INT,G5,,PMC,"SSR=0,0","RSPOS=1398673,1398677",,PM,,,"LOC148413:148413|CCNL2:81669,LOC148413:148413|...","LOC148413,CCNL2,CCNL2"
4,chr1,1629572,rs112177324,,T,.,PASS,ADP=21;WT=0;HET=1;HOM=0;NC=0;ASP;G5;GENEINFO=M...,0/1,50,22,21,9,13,59.09%,9.5813E-6,9,0,11,2,ADP=21,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=132,SLO,NSF,,,,,,,RS=112177324,,,,GNO,VLD,ASP,,,,,"TOPMED=0.75031058868501529,0.24968941131498470",WGT=1,,,,,,KGPhase3,,VC=DIV,,KGPhase1,,VP=0x05010008120515013e000200,SAO=0,GENEINFO=MIB2:142678,INT,G5,,,SSR=0,RSPOS=1629573,,,,,MIB2:142678,MIB2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2606,chrX,153913005,rs5904376;rs77485258,G,GC,.,PASS,ADP=63;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.0002649...,1/1,255,63,63,2,61,96.83%,3.4466E-34,2,0,50,11,ADP=63,WT=0,HET=0,HOM=1,NC=0,,,,,"dbSNPBuildID=131,126",SLO,,,,,,,"COMMON=0,0","RS=77485258,5904376",,,,GNO,,ASP,,,,,"TOPMED=0.01672400611620795,0.98327599388379204","WGT=1,1",,,,,,KGPhase3,"CAF=0.0002649,0.9997,0.0002649,0.9997","VC=DIV,DIV",,KGPhase1,,"VP=0x05000008000501003e000200,0x05010008000500...","SAO=0,0","GENEINFO=ARHGAP4:393,ARHGAP4:393",INT,G5,,,"SSR=0,0","RSPOS=153913005,153913008",,,,,"ARHGAP4:393,ARHGAP4:393",ARHGAP4
2607,chrX,155228363,rs781912204,A,AT,.,PASS,ADP=25;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=VBP1...,0/1,64,25,25,5,15,60%,3.8543E-7,5,0,14,1,ADP=25,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=144,,,,,,,,,RS=781912204,,,,,,ASP,,,,,"TOPMED=0.43169438073394495,0.56314506880733944...",WGT=1,,,,,,,,VC=DIV,,,NOV,VP=0x050000080005000002000204,SAO=0,GENEINFO=VBP1:7411,INT,,,,SSR=0,RSPOS=155228363,,,,,VBP1:7411,VBP1
2608,chrX,155492733,rs1169019545;rs376271737,T,TG,.,PASS,ADP=17;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=TMLH...,1/1,81,17,17,1,16,94.12%,7.7134E-9,0,1,11,5,ADP=17,WT=0,HET=0,HOM=1,NC=0,,,,,"dbSNPBuildID=151,138",,,,,,,,,"RS=1169019545,376271737",,,,,,ASP,,,,,"TOPMED=0.35031218144750254,0.64968781855249745","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,,"VP=0x050000080005000002000200,0x05000008000500...","SAO=0,0","GENEINFO=TMLHE:55217|TMLHE-AS1:100507404,TMLHE...",INT,,,,"SSR=0,0","RSPOS=155492733,155492734",,,,,"TMLHE:55217|TMLHE-AS1:100507404,TMLHE:55217|TM...","TMLHE,TMLHE-AS1,TMLHE-AS1"
2609,chrY,12786501,rs760255651,CT,C,.,PASS,ADP=21;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=USP9...,0/1,50,21,21,4,12,57.14%,8.0605E-6,3,1,10,2,ADP=21,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=144,,,,,,,,,RS=760255651,,,,,,ASP,,,,,,WGT=1,,,,,,,,VC=DIV,,,,VP=0x050000080005000002000200,SAO=0,GENEINFO=USP9Y:8287,INT,,,,SSR=0,RSPOS=12786503,,,,,USP9Y:8287,USP9Y


In [12]:
data.columns

Index(['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'GT',
       'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF',
       'ADR', 'ADP', 'WT', 'HET', 'HOM', 'NC', 'CDA', 'OTH', 'S3D', 'WTD',
       'dbSNPBuildID', 'SLO', 'NSF', 'R3', 'R5', 'NSN', 'NSM', 'G5A', 'COMMON',
       'RS', 'RV', 'TPA', 'CFL', 'GNO', 'VLD', 'ASP', 'ASS', 'Ref', 'U3', 'U5',
       'TOPMED', 'WGT', 'MTP', 'LSD', 'NOC', 'DSS', 'SYN', 'KGPhase3', 'CAF',
       'VC', 'MUT', 'KGPhase1', 'NOV', 'VP', 'SAO', 'GENEINFO', 'INT', 'G5',
       'OM', 'PMC', 'SSR', 'RSPOS', 'HD', 'PM', 'ClinVar', 'ClinVar_CLNSIG',
       'Gene_Name', 'Gene Name'],
      dtype='object')

In [14]:
data = data[['CHROM', 'POS', 'Gene Name', 'rsID', 'REF', 'ALT', 'GT',
       'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF',
       'ADR', 'ADP', 'WT', 'HET', 'HOM', 'NC', 'CDA', 'OTH', 'S3D', 'WTD',
       'dbSNPBuildID', 'SLO', 'NSF', 'R3', 'R5', 'NSN', 'NSM', 'G5A', 'COMMON',
       'RS', 'RV', 'TPA', 'CFL', 'GNO', 'VLD', 'ASP', 'ASS', 'Ref', 'U3', 'U5',
       'TOPMED', 'WGT', 'MTP', 'LSD', 'NOC', 'DSS', 'SYN', 'KGPhase3', 'CAF',
       'VC', 'MUT', 'KGPhase1', 'NOV', 'VP', 'SAO', 'INT', 'G5',
       'OM', 'PMC', 'SSR', 'RSPOS', 'HD', 'PM', 'ClinVar', 'ClinVar_CLNSIG']]
data

Unnamed: 0,CHROM,POS,Gene Name,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,ClinVar,ClinVar_CLNSIG
0,chr1,1043223,AGRN,rs35881187,CCT,C,1/1,150,29,29,1,28,96.55%,9.9776E-16,1,0,25,3,ADP=29,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM,,
1,chr1,1299382,ACAP3,rs143128930,AG,A,0/1,109,65,65,35,30,46.15%,1.1519E-11,27,8,26,4,ADP=65,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,,,,R5,,,,COMMON=1,RS=143128930,,,,,VLD,ASP,,,,,"TOPMED=0.91126720183486238,0.08873279816513761",WGT=1,,,,,,KGPhase3,"CAF=0.8806,0.1194",VC=DIV,,KGPhase1,,VP=0x0500000a000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1299383,,,,
2,chr1,1353987,MXRA8,rs140777846,CTG,C,0/1,151,76,76,36,40,52.63%,7.6319E-16,28,8,34,6,ADP=76,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,,
3,chr1,1398672,"LOC148413,CCNL2,CCNL2",rs368050244;rs3831366,CTAGAG,C,0/1,131,60,60,26,34,56.67%,7.4725E-14,21,5,26,8,ADP=60,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=107,138",SLO,,,R5,,,,"COMMON=1,1","RS=3831366,368050244",,,,GNO,VLD,ASP,ASS,,,,"TOPMED=0.94513729612640163,0.05486270387359836","WGT=1,1",,,,,,KGPhase3,"CAF=0.9259,0.07408,0.9259,0.07408","VC=DIV,DIV",,KGPhase1,,"VP=0x05012822000515013e000200,0x0500000a000500...","SAO=0,0",INT,G5,,PMC,"SSR=0,0","RSPOS=1398673,1398677",,PM,,
4,chr1,1629572,MIB2,rs112177324,,T,0/1,50,22,21,9,13,59.09%,9.5813E-6,9,0,11,2,ADP=21,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=132,SLO,NSF,,,,,,,RS=112177324,,,,GNO,VLD,ASP,,,,,"TOPMED=0.75031058868501529,0.24968941131498470",WGT=1,,,,,,KGPhase3,,VC=DIV,,KGPhase1,,VP=0x05010008120515013e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1629573,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2606,chrX,153913005,ARHGAP4,rs5904376;rs77485258,G,GC,1/1,255,63,63,2,61,96.83%,3.4466E-34,2,0,50,11,ADP=63,WT=0,HET=0,HOM=1,NC=0,,,,,"dbSNPBuildID=131,126",SLO,,,,,,,"COMMON=0,0","RS=77485258,5904376",,,,GNO,,ASP,,,,,"TOPMED=0.01672400611620795,0.98327599388379204","WGT=1,1",,,,,,KGPhase3,"CAF=0.0002649,0.9997,0.0002649,0.9997","VC=DIV,DIV",,KGPhase1,,"VP=0x05000008000501003e000200,0x05010008000500...","SAO=0,0",INT,G5,,,"SSR=0,0","RSPOS=153913005,153913008",,,,
2607,chrX,155228363,VBP1,rs781912204,A,AT,0/1,64,25,25,5,15,60%,3.8543E-7,5,0,14,1,ADP=25,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=144,,,,,,,,,RS=781912204,,,,,,ASP,,,,,"TOPMED=0.43169438073394495,0.56314506880733944...",WGT=1,,,,,,,,VC=DIV,,,NOV,VP=0x050000080005000002000204,SAO=0,INT,,,,SSR=0,RSPOS=155228363,,,,
2608,chrX,155492733,"TMLHE,TMLHE-AS1,TMLHE-AS1",rs1169019545;rs376271737,T,TG,1/1,81,17,17,1,16,94.12%,7.7134E-9,0,1,11,5,ADP=17,WT=0,HET=0,HOM=1,NC=0,,,,,"dbSNPBuildID=151,138",,,,,,,,,"RS=1169019545,376271737",,,,,,ASP,,,,,"TOPMED=0.35031218144750254,0.64968781855249745","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,,"VP=0x050000080005000002000200,0x05000008000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=155492733,155492734",,,,
2609,chrY,12786501,USP9Y,rs760255651,CT,C,0/1,50,21,21,4,12,57.14%,8.0605E-6,3,1,10,2,ADP=21,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=144,,,,,,,,,RS=760255651,,,,,,ASP,,,,,,WGT=1,,,,,,,,VC=DIV,,,,VP=0x050000080005000002000200,SAO=0,INT,,,,SSR=0,RSPOS=12786503,,,,


In [17]:
data.ClinVar_CLNSIG.value_counts()

    2611
Name: ClinVar_CLNSIG, dtype: int64

In [22]:
data['Allele'] = '[' + data['REF'] + ', ' + data['ALT'] + ']'
data

Unnamed: 0,CHROM,POS,Gene Name,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,ClinVar,ClinVar_CLNSIG,Allele
0,chr1,1043223,AGRN,rs35881187,CCT,C,1/1,150,29,29,1,28,96.55%,9.9776E-16,1,0,25,3,ADP=29,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM,,,"[CCT, C]"
1,chr1,1299382,ACAP3,rs143128930,AG,A,0/1,109,65,65,35,30,46.15%,1.1519E-11,27,8,26,4,ADP=65,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,,,,R5,,,,COMMON=1,RS=143128930,,,,,VLD,ASP,,,,,"TOPMED=0.91126720183486238,0.08873279816513761",WGT=1,,,,,,KGPhase3,"CAF=0.8806,0.1194",VC=DIV,,KGPhase1,,VP=0x0500000a000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1299383,,,,,"[AG, A]"
2,chr1,1353987,MXRA8,rs140777846,CTG,C,0/1,151,76,76,36,40,52.63%,7.6319E-16,28,8,34,6,ADP=76,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,,,"[CTG, C]"
3,chr1,1398672,"LOC148413,CCNL2,CCNL2",rs368050244;rs3831366,CTAGAG,C,0/1,131,60,60,26,34,56.67%,7.4725E-14,21,5,26,8,ADP=60,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=107,138",SLO,,,R5,,,,"COMMON=1,1","RS=3831366,368050244",,,,GNO,VLD,ASP,ASS,,,,"TOPMED=0.94513729612640163,0.05486270387359836","WGT=1,1",,,,,,KGPhase3,"CAF=0.9259,0.07408,0.9259,0.07408","VC=DIV,DIV",,KGPhase1,,"VP=0x05012822000515013e000200,0x0500000a000500...","SAO=0,0",INT,G5,,PMC,"SSR=0,0","RSPOS=1398673,1398677",,PM,,,"[CTAGAG, C]"
4,chr1,1629572,MIB2,rs112177324,,T,0/1,50,22,21,9,13,59.09%,9.5813E-6,9,0,11,2,ADP=21,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=132,SLO,NSF,,,,,,,RS=112177324,,,,GNO,VLD,ASP,,,,,"TOPMED=0.75031058868501529,0.24968941131498470",WGT=1,,,,,,KGPhase3,,VC=DIV,,KGPhase1,,VP=0x05010008120515013e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1629573,,,,,"[null, T]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2606,chrX,153913005,ARHGAP4,rs5904376;rs77485258,G,GC,1/1,255,63,63,2,61,96.83%,3.4466E-34,2,0,50,11,ADP=63,WT=0,HET=0,HOM=1,NC=0,,,,,"dbSNPBuildID=131,126",SLO,,,,,,,"COMMON=0,0","RS=77485258,5904376",,,,GNO,,ASP,,,,,"TOPMED=0.01672400611620795,0.98327599388379204","WGT=1,1",,,,,,KGPhase3,"CAF=0.0002649,0.9997,0.0002649,0.9997","VC=DIV,DIV",,KGPhase1,,"VP=0x05000008000501003e000200,0x05010008000500...","SAO=0,0",INT,G5,,,"SSR=0,0","RSPOS=153913005,153913008",,,,,"[G, GC]"
2607,chrX,155228363,VBP1,rs781912204,A,AT,0/1,64,25,25,5,15,60%,3.8543E-7,5,0,14,1,ADP=25,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=144,,,,,,,,,RS=781912204,,,,,,ASP,,,,,"TOPMED=0.43169438073394495,0.56314506880733944...",WGT=1,,,,,,,,VC=DIV,,,NOV,VP=0x050000080005000002000204,SAO=0,INT,,,,SSR=0,RSPOS=155228363,,,,,"[A, AT]"
2608,chrX,155492733,"TMLHE,TMLHE-AS1,TMLHE-AS1",rs1169019545;rs376271737,T,TG,1/1,81,17,17,1,16,94.12%,7.7134E-9,0,1,11,5,ADP=17,WT=0,HET=0,HOM=1,NC=0,,,,,"dbSNPBuildID=151,138",,,,,,,,,"RS=1169019545,376271737",,,,,,ASP,,,,,"TOPMED=0.35031218144750254,0.64968781855249745","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,,"VP=0x050000080005000002000200,0x05000008000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=155492733,155492734",,,,,"[T, TG]"
2609,chrY,12786501,USP9Y,rs760255651,CT,C,0/1,50,21,21,4,12,57.14%,8.0605E-6,3,1,10,2,ADP=21,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=144,,,,,,,,,RS=760255651,,,,,,ASP,,,,,,WGT=1,,,,,,,,VC=DIV,,,,VP=0x050000080005000002000200,SAO=0,INT,,,,SSR=0,RSPOS=12786503,,,,,"[CT, C]"


In [18]:
data.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/InDel_columns_Extracted.xlsx', index=False)

In [25]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/KHCDPRGPTTL6_InDel.xlsx')
df = df.rename({'zygocity':'zygosity'}, axis = 1)
df['Allele'] = df['allele'].str.replace('\'', '')
df

Unnamed: 0,allele,zygosity,Gene Name,Gene_Match,rsid,consequence,Consequence_score,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious,Allele
0,"['ACCTT', A]",Homozygous,ADAMTS17,No,rs10549565,3_prime_UTR_variant,3/10,Benign,"Weill-Marchesani_4_syndrome', '_recessive|not_...","criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,No,"[ACCTT, A]"
1,"['CACA', C]",Heterozygous,GIGYF2,No,rs10555297,"non-coding_transcript_variant', 'SO:0001822",2/10,Benign,"Parkinson_disease_11', '_autosomal_dominant', ...","criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,Yes,"[CACA, C]"
2,"['GCA', G]",Heterozygous,TRPV3,No,rs10573788,"5_prime_UTR_variant', 'SO:0001624",3/10,Benign,Olmsted_syndrome_1|not_provided,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,No,"[GCA, G]"
3,"['CAT', C]",Homozygous,PTGES3,No,rs10579382,"initiatior_codon_variant', 'SO:0001589",,Benign,not_provided,no_assertion_criteria_provided,germline,indel,Deletion,MayBe No,No,No,"[CAT, C]"
4,"['CAA', C]",Heterozygous,JAK3,No,rs10580414,,,Benign,Severe_combined_immunodeficiency_disease,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,"[CAA, C]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,"['AAAGGAAGT', A]",Heterozygous,ATR,No,rs797045404,intron_variant,2/10,Uncertain_significance,not_specified|not_provided,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes,"[AAAGGAAGT, A]"
665,"['TA', T]",Heterozygous,WDR62,No,rs797046108,intron_variant,2/10,Conflicting_interpretations_of_pathogenicity,not_specified|Primary_Microcephaly_2_With_or_W...,"criteria_provided', '_conflicting_interpretations",germline,indel,Deletion,MayBe No,No,Yes,"[TA, T]"
666,"['CTT', C]",Heterozygous,ANTXR2,No,rs80314910,"5_prime_UTR_variant', 'SO:0001627",3/10,Benign,Hyaline_fibromatosis_syndrome|not_provided,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes,"[CTT, C]"
667,"['CCTTT', C]",Heterozygous,CNGB1,No,rs866017919,3_prime_UTR_variant,3/10,Likely_benign,"Retinitis_Pigmentosa', '_Recessive","criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,"[CCTTT, C]"
