In [48]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Extracting the VCF records at positions covered by the BED file

In [49]:
def read_bed_file(bed_file):
    """Collect every position covered by a BED file as ``(chrom, pos)`` tuples.

    BED intervals are 0-based and half-open (the start column is inclusive,
    the end column is exclusive), whereas the VCF ``POS`` values these tuples
    are later compared against are 1-based.  An interval ``start, end``
    therefore covers the 1-based positions ``start + 1`` through ``end``
    inclusive, and exactly those positions are stored.

    Parameters
    ----------
    bed_file : str or path-like
        Path to a tab-separated BED file.

    Returns
    -------
    set of (str, int)
        One ``(chrom, pos)`` entry per covered 1-based position.  Every
        position is materialised individually, so memory use grows with the
        total number of covered bases.

    Notes
    -----
    Lines starting with ``#``, lines with fewer than three tab-separated
    fields, and lines whose start or end is not an integer are skipped.
    """
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            chrom = fields[0]
            try:
                start = int(fields[1])
                end = int(fields[2])
            except ValueError:
                continue  # Non-numeric coordinates (e.g. a column-title row)
            # 0-based half-open [start, end)  ->  1-based closed [start + 1, end]
            bed_positions.update((chrom, pos) for pos in range(start + 1, end + 1))
    return bed_positions

def normalize_chrom_name(chrom):
    """Return the part of ``chrom`` that precedes its first underscore.

    A name that contains no underscore is returned unchanged.
    """
    prefix, _, _ = chrom.partition('_')
    return prefix

def filter_vcf_file(vcf_file, bed_positions):
    """Return the lines of a VCF file that fall on positions in ``bed_positions``.

    Parameters
    ----------
    vcf_file : str or path-like
        Path to a plain-text, tab-separated VCF file.
    bed_positions : container of (str, int)
        ``(chrom, pos)`` pairs to keep; only membership tests are performed.

    Returns
    -------
    list of str
        The raw lines (line terminators included) in file order.  Lines that
        start with ``#`` are always kept.  A data line is kept when its
        chromosome, passed through ``normalize_chrom_name``, together with its
        integer position is in ``bed_positions``.  Data lines with fewer than
        two fields or a non-integer position are dropped.
    """
    kept_lines = []
    with open(vcf_file, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith('#'):
                kept_lines.append(raw_line)
                continue

            columns = raw_line.strip().split('\t')
            if len(columns) < 2:
                continue

            try:
                position = int(columns[1])
            except ValueError:
                continue

            if (normalize_chrom_name(columns[0]), position) in bed_positions:
                kept_lines.append(raw_line)
    return kept_lines

def write_filtered_vcf(filtered_vcf_records, output_file):
    """Write the record strings to ``output_file``, replacing any existing file.

    Records are written verbatim and in order; no separator or line
    terminator is added, so each record must already carry its own.
    """
    with open(output_file, 'w') as out_handle:
        out_handle.writelines(filtered_vcf_records)

def main(
    bed_file=r'C:/Users/GenepoweRx_Madhu/Desktop/Covered_regions.bed',
    vcf_file=r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/KHAIGHGPTTL187/KHAIGHGPTTL187_final.vcf',
    output_file=r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHAIGHGPTTL187_final.vcf',
):
    """Filter one VCF file down to the positions listed in a BED file.

    Reads the BED positions, keeps the VCF header lines plus the records that
    fall on those positions, and writes the result to ``output_file``.

    Parameters
    ----------
    bed_file : str
        Path to the BED file of covered regions.
    vcf_file : str
        Path to the input VCF file.
    output_file : str
        Path the filtered VCF is written to (overwritten if it exists).

    The defaults are the paths that were previously hard-coded, so calling
    ``main()`` with no arguments behaves as before; pass explicit paths to
    process another sample or to run on another machine.
    """
    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)
    write_filtered_vcf(filtered_vcf_records, output_file)

if __name__ == "__main__":
    main()


In [50]:
# Load the BED-filtered VCF as a headerless, tab-separated table.  Lines that
# begin with '#' (the VCF meta/header lines) are dropped by comment='#'.
vcf = pd.read_csv(
    r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHAIGHGPTTL187_final.vcf',
    sep='\t',
    comment='#',
    header=None,
    low_memory=False,
)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']

# Expand the colon-delimited SAMPLE string into one column per field.
sample_cols = vcf['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']

# Attach the per-sample columns, then discard the ones that are not carried
# forward (the raw FORMAT/SAMPLE strings and the RBQ/ABQ fields).
vcf = vcf.join(sample_cols).drop(columns=['FORMAT', 'SAMPLE', 'RBQ', 'ABQ'])
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
0,chr1,65529,rs1473003496,C,T,.,PASS,ADP=22;WT=0;HET=1;HOM=0;NC=0;ASP;RS=1473003496...,0/1,23,22,22,15,7,31.82%,4.4505E-3,14,1,6,1
1,chr1,69270,rs201219564,A,G,.,PASS,ADP=102;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,132,102,102,64,38,37.25%,5.1448E-14,41,23,28,10
2,chr1,69511,rs2691305,A,G,.,PASS,ADP=160;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,160,160,0,160,100%,1.0505E-95,0,0,116,44
3,chr1,69897,rs200676709,T,C,.,PASS,"ADP=86;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,165,86,86,42,44,51.16%,3.1151E-17,27,15,34,10
4,chr1,924533,rs112703963,A,G,.,PASS,"ADP=33;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",1/1,188,33,33,0,33,100%,1.3852E-19,0,0,20,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34676,chrY,23179733,.,T,C,.,PASS,ADP=89;WT=0;HET=1;HOM=0;NC=0;CSQ=C|synonymous_...,0/1,91,89,89,62,27,30.34%,7.2581E-10,44,18,23,4
34677,chrY,23190581,.,T,C,.,PASS,ADP=67;WT=0;HET=1;HOM=0;NC=0;CSQ=C|synonymous_...,0/1,48,67,67,52,15,22.39%,1.2648E-5,31,21,10,5
34678,chrY,23227961,.,A,G,.,PASS,ADP=75;WT=0;HET=1;HOM=0;NC=0;CSQ=G|synonymous_...,0/1,77,75,75,52,23,30.67%,1.6219E-8,41,11,21,2
34679,chrY,24842348,.,A,G,.,PASS,ADP=65;WT=0;HET=1;HOM=0;NC=0;CSQ=G|synonymous_...,0/1,60,65,65,47,18,27.69%,9.7371E-7,31,16,14,4


In [17]:
# Pull the single-digit HET / HOM flags out of the INFO string (kept as text).
vcf['HET'] = vcf['INFO'].str.extract(r'HET=(\d)')
vcf['HOM'] = vcf['INFO'].str.extract(r'HOM=(\d)')

# Label each row from the flags.  HET is listed first so that it takes
# precedence if both flags are '1'; rows with neither flag set get ''.
vcf['Zygosity'] = np.select(
    [vcf['HET'] == '1', vcf['HOM'] == '1'],
    ['Heterozygous', 'Homozygous'],
    default='',
)
vcf['GT'] = vcf['GT'].astype(str)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=114;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,0,1,Homozygous
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous
3,chr1,686266,.,A,C,.,PASS,ADP=79;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,85,79,79,54,25,31.65%,3.1101E-9,43,11,23,2,1,0,Heterozygous
4,chr1,924533,rs112703963,A,G,.,PASS,"ADP=11;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",1/1,58,11,11,0,11,100%,1.4176E-6,0,0,10,1,0,1,Homozygous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36647,chrY,9528488,rs1179775265,T,G,.,PASS,ADP=186;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,130,186,186,146,40,21.51%,8.6616E-14,77,69,17,23,1,0,Heterozygous
36648,chrY,13360045,rs2032674,T,C,.,PASS,"ADP=54;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.8094,0....",1/1,255,54,54,0,54,100%,4.0229E-32,0,0,35,19,0,1,Homozygous
36649,chrY,14941891,rs17269816,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.91,0.09...",1/1,182,32,32,0,32,100%,5.4567E-19,0,0,28,4,0,1,Homozygous
36650,chrY,15174113,rs17307398,T,C,.,PASS,"ADP=29;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.266,0.7...",1/1,164,29,29,0,29,100%,3.3259E-17,0,0,24,5,0,1,Homozygous


In [18]:
# Raw GENEINFO value from INFO: the text between 'GENEINFO=' and the next ';'.
vcf["Gene_Name"] = vcf["INFO"].str.extract('GENEINFO=(?P<GENEINFO>.+?);')

# GENEINFO holds '|'-separated entries; keep the text before the first ':' of
# each entry and join the results with commas.  Missing values become ''.
gene_symbols = [
    ','.join(entry.split(':')[0] for entry in raw_geneinfo.split('|'))
    if pd.notnull(raw_geneinfo) else ''
    for raw_geneinfo in vcf['Gene_Name']
]
vcf['Gene Name'] = gene_symbols
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous,OR4F5:79501,OR4F5
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=114;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,0,1,Homozygous,OR4F5:79501,OR4F5
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous,,
3,chr1,686266,.,A,C,.,PASS,ADP=79;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,85,79,79,54,25,31.65%,3.1101E-9,43,11,23,2,1,0,Heterozygous,,
4,chr1,924533,rs112703963,A,G,.,PASS,"ADP=11;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",1/1,58,11,11,0,11,100%,1.4176E-6,0,0,10,1,0,1,Homozygous,LOC107985728:107985728|SAMD11:148398,"LOC107985728,SAMD11"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36647,chrY,9528488,rs1179775265,T,G,.,PASS,ADP=186;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,130,186,186,146,40,21.51%,8.6616E-14,77,69,17,23,1,0,Heterozygous,TSPY10:100289087,TSPY10
36648,chrY,13360045,rs2032674,T,C,.,PASS,"ADP=54;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.8094,0....",1/1,255,54,54,0,54,100%,4.0229E-32,0,0,35,19,0,1,Homozygous,UTY:7404,UTY
36649,chrY,14941891,rs17269816,C,T,.,PASS,"ADP=32;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.91,0.09...",1/1,182,32,32,0,32,100%,5.4567E-19,0,0,28,4,0,1,Homozygous,,
36650,chrY,15174113,rs17307398,T,C,.,PASS,"ADP=29;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.266,0.7...",1/1,164,29,29,0,29,100%,3.3259E-17,0,0,24,5,0,1,Homozygous,,


In [19]:
# Everything that follows 'CSQ=' in INFO (up to the end of the string).
vcf['CSQ'] = vcf['INFO'].str.extract(r'CSQ=(.*)')

# Split the CSQ text on commas and give each piece its own row.  The other
# columns, and the original index label, are repeated for every piece.
vcf = vcf.assign(csq=vcf['CSQ'].str.split(',')).explode('csq')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous,OR4F5:79501,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous,OR4F5:79501,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|regulatory_region_variant|MODIFIER|||Regulat...
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=114;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,0,1,Homozygous,OR4F5:79501,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous,,,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,C|missense_variant|MODERATE|OR4F29|ENSG0000028...
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous,,,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,C|intron_variant&non_coding_transcript_variant...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...


In [20]:
# Zero-based position of each field of interest inside a '|'-delimited csq
# entry.  Dict order is the order in which the columns are added to `vcf`.
csq_field_index = {
    'ClinVar_CLNDN': 82,
    'CLIN_SIG': 70,
    'ClinVar_CLNREVSTAT': 81,
    'ClinVar': 79,
    'HGVSC': 10,
    'HGVSP': 11,
    'PolyPhen': 38,
    'BIOTYPE': 7,
    'EXON': 8,
    'INTRON': 9,
    'Protein_position': 14,
    'Amino_acids': 15,
    'Codons': 16,
    'STRAND': 19,
    'PUBMED': 73,
    'Consequence': 1,
    'IMPACT': 2,
    'SIFT': 37,
}

# Split every csq entry once and reuse the pieces for all target columns.
csq_fields = vcf['csq'].str.split('|')
for column_name, field_position in csq_field_index.items():
    vcf[column_name] = csq_fields.str[field_position]
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous,OR4F5:79501,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,,,,,ENST00000641515.2:c.243A>G,ENSP00000493376.2:p.Ser81%3D,,protein_coding,3/3,,81,S,tcA/tcG,1,,synonymous_variant,LOW,
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous,OR4F5:79501,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|regulatory_region_variant|MODIFIER|||Regulat...,,,,,,,,TF_binding_site,,,,,,,,regulatory_region_variant,MODIFIER,
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=114;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,0,1,Homozygous,OR4F5:79501,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,,,,,ENST00000641515.2:c.484A>G,ENSP00000493376.2:p.Thr162Ala,benign(0),protein_coding,3/3,,162,T/A,Aca/Gca,1,,missense_variant,MODERATE,tolerated(0.92)
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous,,,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,,,,,ENST00000426406.4:c.389T>G,ENSP00000409316.1:p.Leu130Arg,benign(0.336),protein_coding,1/1,,130,L/R,cTc/cGc,-1,,missense_variant,MODERATE,deleterious_low_confidence(0.01)
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous,,,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,C|intron_variant&non_coding_transcript_variant...,,,,,ENST00000455207.5:n.169+33750T>G,,,lncRNA,,1/2,,,,-1,,intron_variant&non_coding_transcript_variant,MODIFIER,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...,,,,,,,,lncRNA,,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...,upstream_gene_variant,MODIFIER,
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...,,,,,,,,lncRNA,,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...,upstream_gene_variant,MODIFIER,
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...,,,,,,,,lncRNA,,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...,upstream_gene_variant,MODIFIER,
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...,,,,,,,,lncRNA,,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...,upstream_gene_variant,MODIFIER,


In [21]:
# Short protein-change label: first character of Amino_acids, then
# Protein_position, then the last character of Amino_acids.  The trailing
# character is left out when it equals the leading one
# (e.g. 'T/A' at 162 -> 'T162A', 'S' at 81 -> 'S81').
first_residue = vcf['Amino_acids'].str[0]
last_residue = vcf['Amino_acids'].str[-1]
changed_residue = np.where(last_residue == first_residue, '', last_residue)

vcf['Protein Position and Amino Acid'] = first_residue + vcf['Protein_position'] + changed_residue
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,Protein Position and Amino Acid
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous,OR4F5:79501,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,,,,,ENST00000641515.2:c.243A>G,ENSP00000493376.2:p.Ser81%3D,,protein_coding,3/3,,81,S,tcA/tcG,1,,synonymous_variant,LOW,,S81
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous,OR4F5:79501,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|regulatory_region_variant|MODIFIER|||Regulat...,,,,,,,,TF_binding_site,,,,,,,,regulatory_region_variant,MODIFIER,,
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=114;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,0,1,Homozygous,OR4F5:79501,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,,,,,ENST00000641515.2:c.484A>G,ENSP00000493376.2:p.Thr162Ala,benign(0),protein_coding,3/3,,162,T/A,Aca/Gca,1,,missense_variant,MODERATE,tolerated(0.92),T162A
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous,,,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,,,,,ENST00000426406.4:c.389T>G,ENSP00000409316.1:p.Leu130Arg,benign(0.336),protein_coding,1/1,,130,L/R,cTc/cGc,-1,,missense_variant,MODERATE,deleterious_low_confidence(0.01),L130R
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous,,,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,C|intron_variant&non_coding_transcript_variant...,,,,,ENST00000455207.5:n.169+33750T>G,,,lncRNA,,1/2,,,,-1,,intron_variant&non_coding_transcript_variant,MODIFIER,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...,,,,,,,,lncRNA,,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...,upstream_gene_variant,MODIFIER,,
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...,,,,,,,,lncRNA,,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...,upstream_gene_variant,MODIFIER,,
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...,,,,,,,,lncRNA,,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...,upstream_gene_variant,MODIFIER,,
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...,,,,,,,,lncRNA,,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...,upstream_gene_variant,MODIFIER,,


In [22]:
# Split each HGVS string at its first ':' only: the part before the colon goes
# into the first named column and the remainder into the second.
# `n` is passed by keyword because pandas 2.0 made every argument of
# Series.str.split after `pat` keyword-only; the positional form
# `.str.split(':', 1, expand=True)` raises TypeError there.
# NOTE(review): the identifier (e.g. ENST.../ENSP...) lands in 'HGVSc'/'HGVSp'
# and the change (e.g. c.243A>G) in the '(Transcript)' columns - confirm the
# column names are not swapped.
vcf[['HGVSc', 'HGVSc (Transcript)']] = vcf['HGVSC'].str.split(':', n=1, expand=True)
vcf[['HGVSp', 'HGVSp (Transcript)']] = vcf['HGVSP'].str.split(':', n=1, expand=True)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,Protein Position and Amino Acid,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript)
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous,OR4F5:79501,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,,,,,ENST00000641515.2:c.243A>G,ENSP00000493376.2:p.Ser81%3D,,protein_coding,3/3,,81,S,tcA/tcG,1,,synonymous_variant,LOW,,S81,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=138;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,1,0,Heterozygous,OR4F5:79501,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|regulatory_region_variant|MODIFIER|||Regulat...,,,,,,,,TF_binding_site,,,,,,,,regulatory_region_variant,MODIFIER,,,,,,
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=114;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,0,1,Homozygous,OR4F5:79501,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,,,,,ENST00000641515.2:c.484A>G,ENSP00000493376.2:p.Thr162Ala,benign(0),protein_coding,3/3,,162,T/A,Aca/Gca,1,,missense_variant,MODERATE,tolerated(0.92),T162A,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous,,,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,,,,,ENST00000426406.4:c.389T>G,ENSP00000409316.1:p.Leu130Arg,benign(0.336),protein_coding,1/1,,130,L/R,cTc/cGc,-1,,missense_variant,MODERATE,deleterious_low_confidence(0.01),L130R,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg
2,chr1,451290,.,A,C,.,PASS,ADP=64;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,1,0,Heterozygous,,,C|missense_variant|MODERATE|OR4F29|ENSG0000028...,C|intron_variant&non_coding_transcript_variant...,,,,,ENST00000455207.5:n.169+33750T>G,,,lncRNA,,1/2,,,,-1,,intron_variant&non_coding_transcript_variant,MODIFIER,,,ENST00000455207.5,n.169+33750T>G,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...,,,,,,,,lncRNA,,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...,upstream_gene_variant,MODIFIER,,,,,,
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...,,,,,,,,lncRNA,,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...,upstream_gene_variant,MODIFIER,,,,,,
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...,,,,,,,,lncRNA,,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...,upstream_gene_variant,MODIFIER,,,,,,
36651,chrY,20577481,rs9786153,C,T,.,PASS,"ADP=22;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1727,0....",1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,0,1,Homozygous,EIF1AY:9086,EIF1AY,T|intron_variant|MODIFIER|EIF1AY|ENSG000001986...,T|upstream_gene_variant|MODIFIER|TTTY10|ENSG00...,,,,,,,,lncRNA,,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...,upstream_gene_variant,MODIFIER,,,,,,


In [24]:
# Keep only the report columns, in presentation order.  The explicit .copy()
# makes `vcf_final` an independent frame, so the column assignments performed
# on it later neither raise SettingWithCopyWarning nor depend on whether the
# selection happened to share data with `vcf`.
vcf_final = vcf[['Gene Name', 'rsID','CHROM', 'POS', 'REF', 'ALT', 'Zygosity', 'Consequence', 'IMPACT',
          'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT',
          'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF',
       'ADR', 'SIFT', 'PolyPhen', 'BIOTYPE', 'EXON', 'INTRON',
          'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED']].copy()
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous_variant,LOW,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,protein_coding,3/3,,S81,tcA/tcG,1,
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory_region_variant,MODIFIER,,,,,,,,,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,TF_binding_site,,,,,,
1,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense_variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,tolerated(0.92),benign(0),protein_coding,3/3,,T162A,Aca/Gca,1,
2,,.,chr1,451290,A,C,Heterozygous,missense_variant,MODERATE,,,,,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,deleterious_low_confidence(0.01),benign(0.336),protein_coding,1/1,,L130R,cTc/cGc,-1,
2,,.,chr1,451290,A,C,Heterozygous,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000455207.5,n.169+33750T>G,,,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,,,lncRNA,,1/2,,,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream_gene_variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream_gene_variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream_gene_variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream_gene_variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...


In [25]:
# Placeholder terms that should be stripped from '&'-joined annotation values.
remove_terms = set(["not_specified", "not_provided"])


def _drop_placeholder_terms(value):
    """Remove the placeholder terms from one '&'-joined annotation value.

    Non-string values are returned unchanged.  A string made up solely of
    placeholder terms is also returned unchanged, so a cell is never emptied
    by the filter.  Otherwise the remaining terms are re-joined with '&'.
    """
    if not isinstance(value, str):
        return value
    terms = value.split("&")
    if all(term in remove_terms for term in terms):
        return value
    return "&".join(term for term in terms if term not in remove_terms)


# Apply the same clean-up to each of the three ClinVar-related columns.
for clinvar_column in ('ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT'):
    vcf_final[clinvar_column] = vcf_final[clinvar_column].apply(_drop_placeholder_terms)

# Show the modified DataFrame
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous_variant,LOW,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,protein_coding,3/3,,S81,tcA/tcG,1,
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory_region_variant,MODIFIER,,,,,,,,,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,TF_binding_site,,,,,,
1,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense_variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,tolerated(0.92),benign(0),protein_coding,3/3,,T162A,Aca/Gca,1,
2,,.,chr1,451290,A,C,Heterozygous,missense_variant,MODERATE,,,,,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,deleterious_low_confidence(0.01),benign(0.336),protein_coding,1/1,,L130R,cTc/cGc,-1,
2,,.,chr1,451290,A,C,Heterozygous,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000455207.5,n.169+33750T>G,,,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,,,lncRNA,,1/2,,,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream_gene_variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream_gene_variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream_gene_variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream_gene_variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,22271044&29391530&24262073&15896936&32996047&3...


In [26]:
# Convert every cell to text, then swap '&' for ',' and '_' for ' ' in all
# columns using literal (non-regex) replacement, one column at a time.
# NOTE(review): astype(str) also turns missing values into literal strings
# such as 'nan' / 'None' - confirm that is intended.
vcf_final = vcf_final.astype(str).apply(
    lambda column: column.str.replace('&', ',', regex=False).str.replace('_', ' ', regex=False)
)
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,protein coding,3/3,,S81,tcA/tcG,1,
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,MODIFIER,,,,,,,,,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,TF binding site,,,,,,
1,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,
2,,.,chr1,451290,A,C,Heterozygous,missense variant,MODERATE,,,,,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,deleterious low confidence(0.01),benign(0.336),protein coding,1/1,,L130R,cTc/cGc,-1,
2,,.,chr1,451290,A,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000455207.5,n.169+33750T>G,,,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,,,lncRNA,,1/2,,,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."


In [27]:
# The frame was cast to str earlier in the notebook; POS is converted back to
# a 64-bit integer here.
vcf_final = vcf_final.astype({'POS': 'int64'})
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,protein coding,3/3,,S81,tcA/tcG,1,
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,MODIFIER,,,,,,,,,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,TF binding site,,,,,,
1,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,
2,,.,chr1,451290,A,C,Heterozygous,missense variant,MODERATE,,,,,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,deleterious low confidence(0.01),benign(0.336),protein coding,1/1,,L130R,cTc/cGc,-1,
2,,.,chr1,451290,A,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000455207.5,n.169+33750T>G,,,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,,,lncRNA,,1/2,,,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."


In [28]:
# Keep only the first comma-separated term of 'Consequence' in a lowercase
# 'consequence' column.
vcf_final = vcf_final.assign(
    consequence=lambda frame: frame['Consequence'].str.split(',', n=1).str[0]
)
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,protein coding,3/3,,S81,tcA/tcG,1,,synonymous variant
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,MODIFIER,,,,,,,,,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,TF binding site,,,,,,,regulatory region variant
1,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,,missense variant
2,,.,chr1,451290,A,C,Heterozygous,missense variant,MODERATE,,,,,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,deleterious low confidence(0.01),benign(0.336),protein coding,1/1,,L130R,cTc/cGc,-1,,missense variant
2,,.,chr1,451290,A,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000455207.5,n.169+33750T>G,,,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,,,lncRNA,,1/2,,,-1,,intron variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",upstream gene variant
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",upstream gene variant
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",upstream gene variant
36651,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",upstream gene variant


In [29]:
# Lookup sheet: one 'Consequence_score' per 'consequence' term.
df_1 = pd.read_excel(
    io=r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/consequence.xlsx'
)
df_1

Unnamed: 0,consequence,Consequence_score
0,transcript ablation,10/10
1,splice acceptor variant,8/10
2,splice donor variant,8/10
3,stop gained,10/10
4,frameshift variant,10/10
5,stop lost,9/10
6,start lost,9/10
7,transcript amplification,8/10
8,inframe insertion,6/10
9,inframe deletion,6/10


In [30]:
# Attach 'Consequence_score' to every variant row; the left join keeps rows
# whose consequence term has no entry in the lookup.
merged_1 = vcf_final.merge(
    df_1,
    how='left',
    on='consequence',
    sort=False,
)
merged_1

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence,Consequence_score
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,protein coding,3/3,,S81,tcA/tcG,1,,synonymous variant,3/10
1,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,MODIFIER,,,,,,,,,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,TF binding site,,,,,,,regulatory region variant,2/10
2,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,,missense variant,7/10
3,,.,chr1,451290,A,C,Heterozygous,missense variant,MODERATE,,,,,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,deleterious low confidence(0.01),benign(0.336),protein coding,1/1,,L130R,cTc/cGc,-1,,missense variant,7/10
4,,.,chr1,451290,A,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000455207.5,n.169+33750T>G,,,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,,,lncRNA,,1/2,,,-1,,intron variant,2/10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328329,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",upstream gene variant,2/10
328330,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",upstream gene variant,2/10
328331,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",upstream gene variant,2/10
328332,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",upstream gene variant,2/10


In [31]:
# Lookup sheet: one 'IMPACT_score' per 'IMPACT' category.
df_2 = pd.read_excel(
    io=r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/IMPACT.xlsx'
)
df_2

Unnamed: 0,IMPACT,IMPACT_score
0,HIGH,10.0
1,MODERATE,5.0
2,LOW,2.5
3,MODIFIER,1.5


In [32]:
# Attach 'IMPACT_score' by IMPACT category; the left join keeps every variant row.
merged_2 = merged_1.merge(
    df_2,
    how='left',
    on='IMPACT',
    sort=False,
)
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence,Consequence_score,IMPACT_score
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,protein coding,3/3,,S81,tcA/tcG,1,,synonymous variant,3/10,2.5
1,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,MODIFIER,,,,,,,,,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,TF binding site,,,,,,,regulatory region variant,2/10,1.5
2,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,,missense variant,7/10,5.0
3,,.,chr1,451290,A,C,Heterozygous,missense variant,MODERATE,,,,,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,deleterious low confidence(0.01),benign(0.336),protein coding,1/1,,L130R,cTc/cGc,-1,,missense variant,7/10,5.0
4,,.,chr1,451290,A,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000455207.5,n.169+33750T>G,,,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,,,lncRNA,,1/2,,,-1,,intron variant,2/10,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328329,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",upstream gene variant,2/10,1.5
328330,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",upstream gene variant,2/10,1.5
328331,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",upstream gene variant,2/10,1.5
328332,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,MODIFIER,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",upstream gene variant,2/10,1.5


In [34]:
# Column layout: each score sits directly after the field it belongs to.
# The lowercase helper column 'consequence' is not selected.
merged_2 = merged_2.loc[:, [
    'Gene Name', 'rsID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
    'Consequence', 'Consequence_score',
    'IMPACT', 'IMPACT_score',
    'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar',
    'HGVSc', 'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)',
    'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL',
    'RDF', 'RDR', 'ADF', 'ADR',
    'SIFT', 'PolyPhen', 'BIOTYPE', 'EXON', 'INTRON',
    'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED',
]]
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,protein coding,3/3,,S81,tcA/tcG,1,
1,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,TF binding site,,,,,,
2,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,
3,,.,chr1,451290,A,C,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,deleterious low confidence(0.01),benign(0.336),protein coding,1/1,,L130R,cTc/cGc,-1,
4,,.,chr1,451290,A,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000455207.5,n.169+33750T>G,,,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,,,lncRNA,,1/2,,,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328329,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
328330,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
328331,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
328332,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."


In [35]:
# Gene list read from 'cardiac genes.xlsx' (single 'Gene Name' column).
df_gene = pd.read_excel(
    io=r'C:/Users/GenepoweRx_Madhu/Downloads/cardiac genes.xlsx'
)
df_gene

Unnamed: 0,Gene Name
0,LDLR
1,LDLR-AS1
2,NOS3
3,PRKAG2
4,LRP6
...,...
75,TRDN
76,LDLRAP1
77,PCSK9
78,APOB


In [36]:
# Flag each variant row with Gene_Match = 'Yes' when at least one of the
# comma-separated genes in its 'Gene Name' cell appears in df_gene['Gene Name'],
# otherwise 'No'.
#
# The previous loop visited every row, scanned the gene array linearly for each
# gene, and rebuilt a full-column boolean mask for every matching row. A set
# lookup plus a single pass over the column gives the same labels without the
# quadratic work.
cardiac_genes = set(df_gene['Gene Name'])


def _in_cardiac_panel(genes):
    """Return True when any comma-separated gene in `genes` is in the panel."""
    # Non-string cells (e.g. NaN) never match, exactly as in the original loop.
    return isinstance(genes, str) and any(
        gene in cardiac_genes for gene in genes.split(',')
    )


merged_2['Gene_Match'] = np.where(
    merged_2['Gene Name'].map(_in_cardiac_panel), 'Yes', 'No'
)

merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Gene_Match
0,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,protein coding,3/3,,S81,tcA/tcG,1,,No
1,OR4F5,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,TF binding site,,,,,,,No
2,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,,No
3,,.,chr1,451290,A,C,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,deleterious low confidence(0.01),benign(0.336),protein coding,1/1,,L130R,cTc/cGc,-1,,No
4,,.,chr1,451290,A,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000455207.5,n.169+33750T>G,,,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,,,lncRNA,,1/2,,,-1,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328329,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",No
328330,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",No
328331,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",No
328332,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",No


In [37]:
# How many variant rows fall inside / outside the gene list.
merged_2['Gene_Match'].value_counts()

No     322967
Yes      5367
Name: Gene_Match, dtype: int64

In [38]:
# Same column layout as before, with 'Gene_Match' moved next to 'Gene Name'.
merged_2 = merged_2.loc[:, [
    'Gene Name', 'Gene_Match',
    'rsID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
    'Consequence', 'Consequence_score',
    'IMPACT', 'IMPACT_score',
    'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar',
    'HGVSc', 'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)',
    'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL',
    'RDF', 'RDR', 'ADF', 'ADR',
    'SIFT', 'PolyPhen', 'BIOTYPE', 'EXON', 'INTRON',
    'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED',
]]
merged_2

Unnamed: 0,Gene Name,Gene_Match,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,No,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,protein coding,3/3,,S81,tcA/tcG,1,
1,OR4F5,No,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,TF binding site,,,,,,
2,OR4F5,No,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,
3,,No,.,chr1,451290,A,C,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,deleterious low confidence(0.01),benign(0.336),protein coding,1/1,,L130R,cTc/cGc,-1,
4,,No,.,chr1,451290,A,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000455207.5,n.169+33750T>G,,,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,,,lncRNA,,1/2,,,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328329,EIF1AY,No,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
328330,EIF1AY,No,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
328331,EIF1AY,No,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
328332,EIF1AY,No,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."


In [40]:
# Position sheet read from 'Cardiac_pos.xlsx' ('POS' and 'Literature' columns).
df_3 = pd.read_excel(
    io=r'C:/Users/GenepoweRx_Madhu/Desktop/Condition_pos/Cardiac_pos.xlsx'
)
df_3

Unnamed: 0,POS,Literature
0,104858586,Yes
1,104826974,Yes
2,104824472,Yes
3,104858554,Yes
4,104794495,Yes
...,...,...
1861,31592980,Yes
1862,31595223,Yes
1863,31593004,Yes
1864,31598691,Yes


In [41]:
# Tag variants whose position appears in the literature sheet.
#
# The sheet is deduplicated on POS first: a left merge against a lookup that
# lists the same position more than once would emit one output row per
# duplicate and silently inflate the variant table.
#
# NOTE(review): the join key is POS alone, so a listed position matches a
# variant at that coordinate on any chromosome. The sheet shows only POS and
# Literature, so this cannot be tightened here; confirm whether a CHROM
# column can be added to the sheet.
literature_by_pos = df_3.drop_duplicates(subset='POS')
merged_3 = pd.merge(merged_2, literature_by_pos, on='POS', how='left', sort=False)
# Positions not found in the sheet come back as NaN; label them 'No'.
merged_3['Literature'] = merged_3['Literature'].fillna('No')
merged_3

Unnamed: 0,Gene Name,Gene_Match,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Literature
0,OR4F5,No,rs201219564,chr1,69270,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,protein coding,3/3,,S81,tcA/tcG,1,,No
1,OR4F5,No,rs201219564,chr1,69270,A,G,Heterozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,TF binding site,,,,,,,No
2,OR4F5,No,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,,No
3,,No,.,chr1,451290,A,C,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,deleterious low confidence(0.01),benign(0.336),protein coding,1/1,,L130R,cTc/cGc,-1,,No
4,,No,.,chr1,451290,A,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000455207.5,n.169+33750T>G,,,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,,,lncRNA,,1/2,,,-1,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328329,EIF1AY,No,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",No
328330,EIF1AY,No,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",No
328331,EIF1AY,No,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",No
328332,EIF1AY,No,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",No


In [42]:
# How many variant rows carry a literature flag.
merged_3['Literature'].value_counts()

No     328255
Yes        79
Name: Literature, dtype: int64

In [43]:
# Show only the rows flagged as present in the literature sheet.
merged_3.loc[merged_3['Literature'].eq('Yes')]

Unnamed: 0,Gene Name,Gene_Match,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Literature
15189,CASQ2,Yes,rs7521023,chr1,115700759,G,A,Heterozygous,3 prime UTR variant,3/10,MODIFIER,1.5,"Caudal regression sequence,Catecholaminergic p...","benign,likely benign","criteria provided, single submitter",292122,ENST00000261448.6,c.*482C>T,,,0/1,52,26,26,12,14,53.85%,5.4595E-6,10,2,11,3,,,protein coding,11/11,,,,-1,27899944244444463105329232765588,Yes
15190,CASQ2,Yes,rs7521023,chr1,115700759,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,"Caudal regression sequence,Catecholaminergic p...","benign,likely benign","criteria provided, single submitter",292122,,,,,0/1,52,26,26,12,14,53.85%,5.4595E-6,10,2,11,3,,,protein coding,,,,,1,27899944244444463105329232765588,Yes
15191,CASQ2,Yes,rs7521023,chr1,115700759,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,"Caudal regression sequence,Catecholaminergic p...","benign,likely benign","criteria provided, single submitter",292122,,,,,0/1,52,26,26,12,14,53.85%,5.4595E-6,10,2,11,3,,,protein coding,,,,,1,27899944244444463105329232765588,Yes
23277,TNNT2,Yes,rs3730238,chr1,201361301,T,C,Heterozygous,missense variant,7/10,MODERATE,5.0,"Dilated Cardiomyopathy, Dominant,Cardiovascula...","benign,likely benign","criteria provided, multiple submitters, no con...",43669,ENST00000236918.11,c.788A>G,ENSP00000236918.8,p.Lys263Arg,0/1,68,32,32,14,18,56.25%,1.3089E-7,14,0,12,6,deleterious low confidence(0.04),benign(0.007),protein coding,14/16,,K263R,aAg/aGg,-1,"25741868,24033266,18414213,23861362,15542288,2...",Yes
23278,TNNT2,Yes,rs3730238,chr1,201361301,T,C,Heterozygous,missense variant,7/10,MODERATE,5.0,"Dilated Cardiomyopathy, Dominant,Cardiovascula...","benign,likely benign","criteria provided, multiple submitters, no con...",43669,ENST00000360372.8,c.659A>G,ENSP00000353535.5,p.Lys220Arg,0/1,68,32,32,14,18,56.25%,1.3089E-7,14,0,12,6,deleterious(0.02),benign(0.388),protein coding,12/14,,K220R,aAg/aGg,-1,"25741868,24033266,18414213,23861362,15542288,2...",Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297400,KCNH2,Yes,rs1805123,chr7,150948446,T,G,Homozygous,downstream gene variant,2/10,MODIFIER,1.5,"Cardiac arrhythmia,Cardiovascular phenotype,At...",benign,"criteria provided, multiple submitters, no con...",67427,,,,,1/1,93,18,18,0,17,94.44%,4.2852E-10,0,0,12,5,,,retained intron,,,,,-1,"25741868,21056700,24033266,20850564,23820649,2...",Yes
297401,KCNH2,Yes,rs1805123,chr7,150948446,T,G,Homozygous,downstream gene variant,2/10,MODIFIER,1.5,"Cardiac arrhythmia,Cardiovascular phenotype,At...",benign,"criteria provided, multiple submitters, no con...",67427,,,,,1/1,93,18,18,0,17,94.44%,4.2852E-10,0,0,12,5,,,retained intron,,,,,-1,"25741868,21056700,24033266,20850564,23820649,2...",Yes
297402,KCNH2,Yes,rs1805123,chr7,150948446,T,G,Homozygous,downstream gene variant,2/10,MODIFIER,1.5,"Cardiac arrhythmia,Cardiovascular phenotype,At...",benign,"criteria provided, multiple submitters, no con...",67427,,,,,1/1,93,18,18,0,17,94.44%,4.2852E-10,0,0,12,5,,,retained intron,,,,,-1,"25741868,21056700,24033266,20850564,23820649,2...",Yes
297403,KCNH2,Yes,rs1805123,chr7,150948446,T,G,Homozygous,downstream gene variant,2/10,MODIFIER,1.5,"Cardiac arrhythmia,Cardiovascular phenotype,At...",benign,"criteria provided, multiple submitters, no con...",67427,,,,,1/1,93,18,18,0,17,94.44%,4.2852E-10,0,0,12,5,,,retained intron,,,,,-1,"25741868,21056700,24033266,20850564,23820649,2...",Yes


In [45]:
# Column layout for the export, with 'Literature' moved directly after 'POS'.
merged_3 = merged_3.loc[:, [
    'Gene Name', 'Gene_Match',
    'rsID', 'CHROM', 'POS', 'Literature',
    'REF', 'ALT', 'Zygosity',
    'Consequence', 'Consequence_score',
    'IMPACT', 'IMPACT_score',
    'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar',
    'HGVSc', 'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)',
    'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL',
    'RDF', 'RDR', 'ADF', 'ADR',
    'SIFT', 'PolyPhen', 'BIOTYPE', 'EXON', 'INTRON',
    'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED',
]]
merged_3

Unnamed: 0,Gene Name,Gene_Match,rsID,CHROM,POS,Literature,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,No,rs201219564,chr1,69270,No,A,G,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.243A>G,ENSP00000493376.2,p.Ser81%3D,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,protein coding,3/3,,S81,tcA/tcG,1,
1,OR4F5,No,rs201219564,chr1,69270,No,A,G,Heterozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,127,138,138,100,38,27.54%,1.8855E-13,86,14,29,9,,,TF binding site,,,,,,
2,OR4F5,No,rs2691305,chr1,69511,No,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,114,114,0,114,100%,4.392E-68,0,0,98,16,tolerated(0.92),benign(0),protein coding,3/3,,T162A,Aca/Gca,1,
3,,No,.,chr1,451290,No,A,C,Heterozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000426406.4,c.389T>G,ENSP00000409316.1,p.Leu130Arg,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,deleterious low confidence(0.01),benign(0.336),protein coding,1/1,,L130R,cTc/cGc,-1,
4,,No,.,chr1,451290,No,A,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000455207.5,n.169+33750T>G,,,0/1,56,64,64,47,17,26.56%,2.243E-6,33,14,15,2,,,lncRNA,,1/2,,,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328329,EIF1AY,No,rs9786153,chrY,20577481,No,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
328330,EIF1AY,No,rs9786153,chrY,20577481,No,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
328331,EIF1AY,No,rs9786153,chrY,20577481,No,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
328332,EIF1AY,No,rs9786153,chrY,20577481,No,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,123,22,22,0,22,100%,4.7526E-13,0,0,14,8,,,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."


In [47]:
# Write the annotated variant table to Excel without the row index.
merged_3.to_excel(
    excel_writer=r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/KHCDPRGPTTL6_depth_vcf_processed.xlsx',
    index=False,
)