In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [25]:
def read_bed_file(bed_file):
    """Expand the intervals of a BED file into a set of covered positions.

    BED intervals are 0-based and half-open (``chromStart`` is included,
    ``chromEnd`` is not), whereas VCF ``POS`` values are 1-based.  An interval
    ``start``/``end`` therefore covers the 1-based positions
    ``start + 1 .. end`` inclusive, which is what is stored here so the result
    can be compared directly against VCF positions.

    Parameters
    ----------
    bed_file : str or path-like
        Path to a tab-separated BED file.

    Returns
    -------
    set of (str, int)
        ``(chrom, pos)`` tuples, one per covered 1-based position.  The
        chromosome name is taken verbatim from the first BED column.

    Notes
    -----
    Lines starting with ``#``, lines with fewer than three tab-separated
    fields, and lines whose start/end are not integers (e.g. ``track`` or
    ``browser`` headers) are skipped silently.  Every position is
    materialised individually, so memory use grows with the total number of
    covered bases.
    """
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            chrom = fields[0]
            try:
                start = int(fields[1])
                end = int(fields[2])
            except ValueError:
                continue  # Skip this line if start or end position is not an integer
            # 0-based half-open [start, end)  ->  1-based closed [start + 1, end]
            bed_positions.update((chrom, pos) for pos in range(start + 1, end + 1))
    return bed_positions

def normalize_chrom_name(chrom):
    """Return the part of ``chrom`` that precedes its first underscore.

    A name without an underscore is returned unchanged.
    """
    prefix, _sep, _rest = chrom.partition('_')
    return prefix

def filter_vcf_file(vcf_file, bed_positions):
    """Collect the VCF lines whose position is present in ``bed_positions``.

    Parameters
    ----------
    vcf_file : str or path-like
        Path to a tab-separated VCF file.
    bed_positions : container of (str, int)
        ``(chrom, pos)`` pairs to keep; only membership tests are performed.

    Returns
    -------
    list of str
        All lines starting with ``#`` plus every record line whose
        ``(normalized chrom, POS)`` pair is in ``bed_positions``, in file
        order and with their original line endings.
    """
    kept_lines = []
    with open(vcf_file, 'r') as handle:
        for raw_line in handle:
            # Header / meta lines are always carried over to the output.
            if raw_line.startswith('#'):
                kept_lines.append(raw_line)
                continue

            columns = raw_line.strip().split('\t')
            if len(columns) < 2:
                continue

            try:
                position = int(columns[1])
            except ValueError:
                # POS is not an integer: drop the record.
                continue

            # The chromosome name is reduced to its prefix before the lookup.
            if (normalize_chrom_name(columns[0]), position) in bed_positions:
                kept_lines.append(raw_line)
    return kept_lines

def write_filtered_vcf(filtered_vcf_records, output_file):
    """Write the given record strings to ``output_file``, replacing its content.

    The records are written back to back exactly as given; no separator or
    newline is added.
    """
    with open(output_file, 'w') as sink:
        sink.writelines(filtered_vcf_records)

def main(bed_file=None, vcf_file=None, output_file=None):
    """Filter a VCF down to the positions covered by a BED file.

    Parameters
    ----------
    bed_file : str, optional
        BED file describing the covered regions.
    vcf_file : str, optional
        VCF file to filter.
    output_file : str, optional
        Destination of the filtered VCF; its parent directory is created
        when it does not exist yet.

    Any argument left as ``None`` falls back to the path that was previously
    hard-coded in this notebook, so calling ``main()`` behaves as before.
    """
    if bed_file is None:
        bed_file = r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/kalyani_mam_covered.bed'
    if vcf_file is None:
        vcf_file = r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/KHGLBS448_final.vcf'
    if output_file is None:
        output_file = r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHGLBS448_final.vcf'

    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)

    # Opening the output for writing fails if its folder is missing, so make
    # sure it exists once the (slow) filtering step has succeeded.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    write_filtered_vcf(filtered_vcf_records, output_file)

if __name__ == "__main__":
    main()

In [45]:
# Load the BED-filtered VCF records into a DataFrame.
# NOTE(review): comment='#' makes pandas stop parsing a line at any '#', not
# only at the start of header lines -- confirm no record contains '#'.
vcf = pd.read_csv(
    r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHGLBS448_final.vcf',
    sep='\t',
    comment='#',
    header=None,
    low_memory=False,
)
vcf.columns = [
    'CHROM', 'POS', 'rsID', 'REF', 'ALT',
    'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE',
]

# Break the colon-delimited SAMPLE string into one column per genotype field.
# The 14 names are fixed here; they are not read from the FORMAT column.
sample_cols = vcf['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = [
    'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ',
    'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR',
]

# Attach the genotype fields and keep the working subset of columns
# (FORMAT, SAMPLE, RBQ and ABQ are dropped).
vcf = pd.concat([vcf, sample_cols], axis=1)[
    [
        'CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
        'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL',
        'RDF', 'RDR', 'ADF', 'ADR',
    ]
]
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=143;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,137,143,143,102,41,28.67%,1.5868E-14,99,3,33,8
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=168;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,168,168,0,168,100%,1.6424E-100,0,0,145,23
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=98;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,109,98,98,66,32,32.65%,1.1206E-11,52,14,25,7
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0
4,chr1,686266,.,A,C,.,PASS,ADP=89;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,58,89,89,71,18,20.22%,1.4677E-6,58,13,14,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33484,chrX,155997799,rs6642401,C,T,.,PASS,"ADP=27;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9483,0....",0/1,47,27,27,14,13,48.15%,1.81E-5,12,2,12,1
33485,chrX,156003433,rs2037999,T,C,.,PASS,"ADP=9;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.4079,0.5...",1/1,29,9,9,2,7,77.78%,1.1312E-3,2,0,6,1
33486,chrY,2789135,rs2534636,C,T,.,PASS,"ADP=54;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9278,0....",1/1,255,55,54,0,54,100%,4.0229E-32,0,0,47,7
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9


In [46]:
# Pull the single-digit HET / HOM flags out of the INFO string (kept as text).
vcf['HET'] = vcf['INFO'].str.extract(r'HET=(\d)', expand=False)
vcf['HOM'] = vcf['INFO'].str.extract(r'HOM=(\d)', expand=False)

# Label each record from those flags: HET == '1' takes precedence over
# HOM == '1'; records with neither flag set get an empty string.
vcf['Zygosity'] = np.where(
    vcf['HET'] == '1',
    'Heterozygous',
    np.where(vcf['HOM'] == '1', 'Homozygous', ''),
)

vcf['GT'] = vcf['GT'].astype(str)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=143;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,137,143,143,102,41,28.67%,1.5868E-14,99,3,33,8,1,0,Heterozygous
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=168;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,168,168,0,168,100%,1.6424E-100,0,0,145,23,0,1,Homozygous
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=98;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,109,98,98,66,32,32.65%,1.1206E-11,52,14,25,7,1,0,Heterozygous
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous
4,chr1,686266,.,A,C,.,PASS,ADP=89;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,58,89,89,71,18,20.22%,1.4677E-6,58,13,14,4,1,0,Heterozygous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33484,chrX,155997799,rs6642401,C,T,.,PASS,"ADP=27;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9483,0....",0/1,47,27,27,14,13,48.15%,1.81E-5,12,2,12,1,1,0,Heterozygous
33485,chrX,156003433,rs2037999,T,C,.,PASS,"ADP=9;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.4079,0.5...",1/1,29,9,9,2,7,77.78%,1.1312E-3,2,0,6,1,0,1,Homozygous
33486,chrY,2789135,rs2534636,C,T,.,PASS,"ADP=54;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9278,0....",1/1,255,55,54,0,54,100%,4.0229E-32,0,0,47,7,0,1,Homozygous
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous


In [47]:
# Gene symbols from the GENEINFO INFO key, whose value is a '|'-separated list
# of 'SYMBOL:id' entries.
#  * The value is captured up to the next ';' OR the end of INFO, so GENEINFO
#    is still found when it is the last key.
#  * Symbols are de-duplicated with dict.fromkeys, which keeps first-seen
#    order; joining a set() made the order vary between runs.
#  * Records without GENEINFO get an empty string.
vcf['Gene Name'] = (
    vcf['INFO']
    .str.extract(r'(?:^|;)GENEINFO=([^;]+)', expand=False)
    .apply(
        lambda x: ','.join(dict.fromkeys(segment.split(':')[0] for segment in x.split('|')))
        if pd.notnull(x) else ''
    )
)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene Name
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=143;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,137,143,143,102,41,28.67%,1.5868E-14,99,3,33,8,1,0,Heterozygous,OR4F5
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=168;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,168,168,0,168,100%,1.6424E-100,0,0,145,23,0,1,Homozygous,OR4F5
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=98;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,109,98,98,66,32,32.65%,1.1206E-11,52,14,25,7,1,0,Heterozygous,OR4F5
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,
4,chr1,686266,.,A,C,.,PASS,ADP=89;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,58,89,89,71,18,20.22%,1.4677E-6,58,13,14,4,1,0,Heterozygous,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33484,chrX,155997799,rs6642401,C,T,.,PASS,"ADP=27;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9483,0....",0/1,47,27,27,14,13,48.15%,1.81E-5,12,2,12,1,1,0,Heterozygous,IL9R
33485,chrX,156003433,rs2037999,T,C,.,PASS,"ADP=9;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.4079,0.5...",1/1,29,9,9,2,7,77.78%,1.1312E-3,2,0,6,1,0,1,Homozygous,IL9R
33486,chrY,2789135,rs2534636,C,T,.,PASS,"ADP=54;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9278,0....",1/1,255,55,54,0,54,100%,4.0229E-32,0,0,47,7,0,1,Homozygous,SRY
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1


In [48]:
# Raw CSQ annotation string of each record.  The capture stops at the next ';'
# so INFO keys that follow CSQ are not swallowed into the last annotation, and
# the key must start at a key boundary (beginning of INFO or right after ';').
vcf['CSQ'] = vcf['INFO'].str.extract(r'(?:^|;)CSQ=([^;]*)', expand=False)

# One row per comma-separated annotation entry; 'CSQ' keeps the full string,
# 'csq' holds the single entry of that row.
# NOTE: explode() repeats the original index label on every produced row, and
# re-running this cell on the already exploded frame multiplies the rows again.
vcf['csq'] = vcf['CSQ'].str.split(',')
vcf = vcf.explode('csq')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene Name,CSQ,csq
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=143;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,137,143,143,102,41,28.67%,1.5868E-14,99,3,33,8,1,0,Heterozygous,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=168;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,168,168,0,168,100%,1.6424E-100,0,0,145,23,0,1,Homozygous,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=98;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,109,98,98,66,32,32.65%,1.1206E-11,52,14,25,7,1,0,Heterozygous,OR4F5,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|intron_variant&non_coding_transcript_variant...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...


In [49]:
# ---- Required columns extraction from the CSQ column ----
# Column names for the '|'-separated sub-fields of one 'csq' entry, by position.
# NOTE(review): presumably mirrors the CSQ "Format:" declared in the VCF
# header -- confirm the order against that header.
CSQ_FIELDS = [
    'Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Gene', 'Feature_type',
    'Feature', 'BIOTYPE', 'EXON', 'INTRON', 'HGVSc', 'HGVSp',
    'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids',
    'Codons', 'Existing_variation', 'DISTANCE', 'STRAND', 'FLAGS',
    'SYMBOL_SOURCE', 'HGNC_ID', 'SOURCE', 'ClinVar', 'ClinVar_CLNSIG',
    'ClinVar_CLNREVSTAT', 'ClinVar_CLNDN',
]

# Split every entry once and reuse the result for all columns; entries that
# are missing or have fewer sub-fields yield NaN for the affected columns.
csq_parts = vcf['csq'].str.split('|')
for field_index, field_name in enumerate(CSQ_FIELDS):
    vcf[field_name] = csq_parts.str[field_index]
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene Name,CSQ,csq,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,SYMBOL_SOURCE,HGNC_ID,SOURCE,ClinVar,ClinVar_CLNSIG,ClinVar_CLNREVSTAT,ClinVar_CLNDN
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=143;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,137,143,143,102,41,28.67%,1.5868E-14,99,3,33,8,1,0,Heterozygous,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G,synonymous_variant,LOW,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,303,243,81,S,tcA/tcG,,,1,,HGNC,HGNC:14825,,,,,
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=168;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,168,168,0,168,100%,1.6424E-100,0,0,145,23,0,1,Homozygous,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G,missense_variant,MODERATE,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,544,484,162,T/A,Aca/Gca,,,1,,HGNC,HGNC:14825,,,,,
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=98;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,109,98,98,66,32,32.65%,1.1206E-11,52,14,25,7,1,0,Heterozygous,OR4F5,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C,synonymous_variant,LOW,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,930,870,290,S,tcT/tcC,,,1,,HGNC,HGNC:14825,,,,,
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C,downstream_gene_variant,MODIFIER,OR4F16,ENSG00000284662,Transcript,ENST00000332831,protein_coding,,,,,,,,,,,22,-1,,HGNC,HGNC:15079,,,,,
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|intron_variant&non_coding_transcript_variant...,C,intron_variant&non_coding_transcript_variant,MODIFIER,,ENSG00000230021,Transcript,ENST00000419394,processed_transcript,,3/3,,,,,,,,,,-1,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...,A,missense_variant,MODERATE,TSPY1,ENSG00000258992,Transcript,ENST00000423647,protein_coding,1/6,,,,303,257,86,R/Q,cGg/cAg,,,1,,HGNC,HGNC:12381,,,,,
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...,A,missense_variant,MODERATE,TSPY1,ENSG00000258992,Transcript,ENST00000451548,protein_coding,1/6,,,,303,257,86,R/Q,cGg/cAg,,,1,,HGNC,HGNC:12381,,,,,
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A,upstream_gene_variant,MODIFIER,FAM197Y5,ENSG00000225516,Transcript,ENST00000615605,processed_transcript,,,,,,,,,,,2900,-1,,HGNC,HGNC:37467,,,,,
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A,upstream_gene_variant,MODIFIER,FAM197Y5,ENSG00000225516,Transcript,ENST00000619815,processed_transcript,,,,,,,,,,,2912,-1,,HGNC,HGNC:37467,,,,,


In [50]:
def _first_consequence_term(value):
    """Return the text before the first '&' for strings; pass anything else through."""
    if not isinstance(value, str):
        return value
    return value.split('&')[0]


# 'Consequence' can hold several '&'-joined terms; keep only the first one in
# the new lower-case 'consequence' column.
vcf['consequence'] = vcf['Consequence'].map(_first_consequence_term)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene Name,CSQ,csq,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,SYMBOL_SOURCE,HGNC_ID,SOURCE,ClinVar,ClinVar_CLNSIG,ClinVar_CLNREVSTAT,ClinVar_CLNDN,consequence
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=143;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,137,143,143,102,41,28.67%,1.5868E-14,99,3,33,8,1,0,Heterozygous,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G,synonymous_variant,LOW,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,303,243,81,S,tcA/tcG,,,1,,HGNC,HGNC:14825,,,,,,synonymous_variant
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=168;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,168,168,0,168,100%,1.6424E-100,0,0,145,23,0,1,Homozygous,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G,missense_variant,MODERATE,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,544,484,162,T/A,Aca/Gca,,,1,,HGNC,HGNC:14825,,,,,,missense_variant
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=98;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,109,98,98,66,32,32.65%,1.1206E-11,52,14,25,7,1,0,Heterozygous,OR4F5,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C,synonymous_variant,LOW,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,930,870,290,S,tcT/tcC,,,1,,HGNC,HGNC:14825,,,,,,synonymous_variant
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C,downstream_gene_variant,MODIFIER,OR4F16,ENSG00000284662,Transcript,ENST00000332831,protein_coding,,,,,,,,,,,22,-1,,HGNC,HGNC:15079,,,,,,downstream_gene_variant
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|intron_variant&non_coding_transcript_variant...,C,intron_variant&non_coding_transcript_variant,MODIFIER,,ENSG00000230021,Transcript,ENST00000419394,processed_transcript,,3/3,,,,,,,,,,-1,,,,,,,,,intron_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...,A,missense_variant,MODERATE,TSPY1,ENSG00000258992,Transcript,ENST00000423647,protein_coding,1/6,,,,303,257,86,R/Q,cGg/cAg,,,1,,HGNC,HGNC:12381,,,,,,missense_variant
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...,A,missense_variant,MODERATE,TSPY1,ENSG00000258992,Transcript,ENST00000451548,protein_coding,1/6,,,,303,257,86,R/Q,cGg/cAg,,,1,,HGNC,HGNC:12381,,,,,,missense_variant
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A,upstream_gene_variant,MODIFIER,FAM197Y5,ENSG00000225516,Transcript,ENST00000615605,processed_transcript,,,,,,,,,,,2900,-1,,HGNC,HGNC:37467,,,,,,upstream_gene_variant
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A,upstream_gene_variant,MODIFIER,FAM197Y5,ENSG00000225516,Transcript,ENST00000619815,processed_transcript,,,,,,,,,,,2912,-1,,HGNC,HGNC:37467,,,,,,upstream_gene_variant


In [51]:
# Keep the text before the first comma, then turn underscores into spaces
# (e.g. 'missense_variant' -> 'missense variant').
vcf['consequence'] = (
    vcf['consequence']
    .str.split(',').str[0]
    .str.replace('_', ' ', regex=False)
)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene Name,CSQ,csq,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,SYMBOL_SOURCE,HGNC_ID,SOURCE,ClinVar,ClinVar_CLNSIG,ClinVar_CLNREVSTAT,ClinVar_CLNDN,consequence
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=143;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,137,143,143,102,41,28.67%,1.5868E-14,99,3,33,8,1,0,Heterozygous,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G,synonymous_variant,LOW,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,303,243,81,S,tcA/tcG,,,1,,HGNC,HGNC:14825,,,,,,synonymous variant
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=168;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,168,168,0,168,100%,1.6424E-100,0,0,145,23,0,1,Homozygous,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G,missense_variant,MODERATE,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,544,484,162,T/A,Aca/Gca,,,1,,HGNC,HGNC:14825,,,,,,missense variant
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=98;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,109,98,98,66,32,32.65%,1.1206E-11,52,14,25,7,1,0,Heterozygous,OR4F5,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C,synonymous_variant,LOW,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,930,870,290,S,tcT/tcC,,,1,,HGNC,HGNC:14825,,,,,,synonymous variant
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C,downstream_gene_variant,MODIFIER,OR4F16,ENSG00000284662,Transcript,ENST00000332831,protein_coding,,,,,,,,,,,22,-1,,HGNC,HGNC:15079,,,,,,downstream gene variant
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|intron_variant&non_coding_transcript_variant...,C,intron_variant&non_coding_transcript_variant,MODIFIER,,ENSG00000230021,Transcript,ENST00000419394,processed_transcript,,3/3,,,,,,,,,,-1,,,,,,,,,intron variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...,A,missense_variant,MODERATE,TSPY1,ENSG00000258992,Transcript,ENST00000423647,protein_coding,1/6,,,,303,257,86,R/Q,cGg/cAg,,,1,,HGNC,HGNC:12381,,,,,,missense variant
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...,A,missense_variant,MODERATE,TSPY1,ENSG00000258992,Transcript,ENST00000451548,protein_coding,1/6,,,,303,257,86,R/Q,cGg/cAg,,,1,,HGNC,HGNC:12381,,,,,,missense variant
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A,upstream_gene_variant,MODIFIER,FAM197Y5,ENSG00000225516,Transcript,ENST00000615605,processed_transcript,,,,,,,,,,,2900,-1,,HGNC,HGNC:37467,,,,,,upstream gene variant
33487,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A,upstream_gene_variant,MODIFIER,FAM197Y5,ENSG00000225516,Transcript,ENST00000619815,processed_transcript,,,,,,,,,,,2912,-1,,HGNC,HGNC:37467,,,,,,upstream gene variant


In [52]:
# Lookup table with one score per consequence term
# (columns shown in the output: 'consequence', 'Consequence_score').
consequence_scores_path = r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/consequence.xlsx'
df_1 = pd.read_excel(consequence_scores_path)
df_1

Unnamed: 0,consequence,Consequence_score
0,transcript ablation,10/10
1,splice acceptor variant,8/10
2,splice donor variant,8/10
3,stop gained,10/10
4,frameshift variant,10/10
5,stop lost,9/10
6,start lost,9/10
7,transcript amplification,8/10
8,inframe insertion,6/10
9,inframe deletion,6/10


In [53]:
# Left join: every annotation row is kept and receives the columns of df_1
# for its 'consequence' value (NaN when the term is not in the table).
merged_1 = vcf.merge(df_1, how='left', on='consequence', sort=False)
merged_1

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene Name,CSQ,csq,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,SYMBOL_SOURCE,HGNC_ID,SOURCE,ClinVar,ClinVar_CLNSIG,ClinVar_CLNREVSTAT,ClinVar_CLNDN,consequence,Consequence_score
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=143;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,137,143,143,102,41,28.67%,1.5868E-14,99,3,33,8,1,0,Heterozygous,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G,synonymous_variant,LOW,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,303,243,81,S,tcA/tcG,,,1,,HGNC,HGNC:14825,,,,,,synonymous variant,3/10
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=168;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,168,168,0,168,100%,1.6424E-100,0,0,145,23,0,1,Homozygous,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G,missense_variant,MODERATE,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,544,484,162,T/A,Aca/Gca,,,1,,HGNC,HGNC:14825,,,,,,missense variant,7/10
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=98;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,109,98,98,66,32,32.65%,1.1206E-11,52,14,25,7,1,0,Heterozygous,OR4F5,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C,synonymous_variant,LOW,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,930,870,290,S,tcT/tcC,,,1,,HGNC,HGNC:14825,,,,,,synonymous variant,3/10
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C,downstream_gene_variant,MODIFIER,OR4F16,ENSG00000284662,Transcript,ENST00000332831,protein_coding,,,,,,,,,,,22,-1,,HGNC,HGNC:15079,,,,,,downstream gene variant,2/10
4,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|intron_variant&non_coding_transcript_variant...,C,intron_variant&non_coding_transcript_variant,MODIFIER,,ENSG00000230021,Transcript,ENST00000419394,processed_transcript,,3/3,,,,,,,,,,-1,,,,,,,,,intron variant,2/10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275974,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...,A,missense_variant,MODERATE,TSPY1,ENSG00000258992,Transcript,ENST00000423647,protein_coding,1/6,,,,303,257,86,R/Q,cGg/cAg,,,1,,HGNC,HGNC:12381,,,,,,missense variant,7/10
275975,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...,A,missense_variant,MODERATE,TSPY1,ENSG00000258992,Transcript,ENST00000451548,protein_coding,1/6,,,,303,257,86,R/Q,cGg/cAg,,,1,,HGNC,HGNC:12381,,,,,,missense variant,7/10
275976,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A,upstream_gene_variant,MODIFIER,FAM197Y5,ENSG00000225516,Transcript,ENST00000615605,processed_transcript,,,,,,,,,,,2900,-1,,HGNC,HGNC:37467,,,,,,upstream gene variant,2/10
275977,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A,upstream_gene_variant,MODIFIER,FAM197Y5,ENSG00000225516,Transcript,ENST00000619815,processed_transcript,,,,,,,,,,,2912,-1,,HGNC,HGNC:37467,,,,,,upstream gene variant,2/10


In [54]:
# Lookup table with one score per IMPACT category
# (columns shown in the output: 'IMPACT', 'IMPACT_score').
impact_scores_path = r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/IMPACT.xlsx'
df_2 = pd.read_excel(impact_scores_path)
df_2

Unnamed: 0,IMPACT,IMPACT_score
0,HIGH,10.0
1,MODERATE,5.0
2,LOW,2.5
3,MODIFIER,1.5


In [55]:
# Left join on 'IMPACT': rows of merged_1 are kept and receive the columns of
# df_2 for their IMPACT value (NaN when the category is not in the table).
merged_2 = merged_1.merge(df_2, how='left', on='IMPACT', sort=False)
merged_2

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene Name,CSQ,csq,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,SYMBOL_SOURCE,HGNC_ID,SOURCE,ClinVar,ClinVar_CLNSIG,ClinVar_CLNREVSTAT,ClinVar_CLNDN,consequence,Consequence_score,IMPACT_score
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=143;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,137,143,143,102,41,28.67%,1.5868E-14,99,3,33,8,1,0,Heterozygous,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G,synonymous_variant,LOW,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,303,243,81,S,tcA/tcG,,,1,,HGNC,HGNC:14825,,,,,,synonymous variant,3/10,2.5
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=168;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,168,168,0,168,100%,1.6424E-100,0,0,145,23,0,1,Homozygous,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G,missense_variant,MODERATE,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,544,484,162,T/A,Aca/Gca,,,1,,HGNC,HGNC:14825,,,,,,missense variant,7/10,5.0
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=98;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,109,98,98,66,32,32.65%,1.1206E-11,52,14,25,7,1,0,Heterozygous,OR4F5,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C,synonymous_variant,LOW,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,930,870,290,S,tcT/tcC,,,1,,HGNC,HGNC:14825,,,,,,synonymous variant,3/10,2.5
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C,downstream_gene_variant,MODIFIER,OR4F16,ENSG00000284662,Transcript,ENST00000332831,protein_coding,,,,,,,,,,,22,-1,,HGNC,HGNC:15079,,,,,,downstream gene variant,2/10,1.5
4,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|intron_variant&non_coding_transcript_variant...,C,intron_variant&non_coding_transcript_variant,MODIFIER,,ENSG00000230021,Transcript,ENST00000419394,processed_transcript,,3/3,,,,,,,,,,-1,,,,,,,,,intron variant,2/10,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275974,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...,A,missense_variant,MODERATE,TSPY1,ENSG00000258992,Transcript,ENST00000423647,protein_coding,1/6,,,,303,257,86,R/Q,cGg/cAg,,,1,,HGNC,HGNC:12381,,,,,,missense variant,7/10,5.0
275975,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...,A,missense_variant,MODERATE,TSPY1,ENSG00000258992,Transcript,ENST00000451548,protein_coding,1/6,,,,303,257,86,R/Q,cGg/cAg,,,1,,HGNC,HGNC:12381,,,,,,missense variant,7/10,5.0
275976,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A,upstream_gene_variant,MODIFIER,FAM197Y5,ENSG00000225516,Transcript,ENST00000615605,processed_transcript,,,,,,,,,,,2900,-1,,HGNC,HGNC:37467,,,,,,upstream gene variant,2/10,1.5
275977,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A,upstream_gene_variant,MODIFIER,FAM197Y5,ENSG00000225516,Transcript,ENST00000619815,processed_transcript,,,,,,,,,,,2912,-1,,HGNC,HGNC:37467,,,,,,upstream gene variant,2/10,1.5


In [56]:
# Load the gene panel workbook; its 'Gene Name' column is used below to flag
# variants that fall in a panel gene.
df_gene = pd.read_excel(
    io=r'C:/Users/GenepoweRx_Madhu/Desktop/Hereditary_Cancer_New_genes.xlsx',
)
df_gene

Unnamed: 0,Gene Name
0,ATM
1,BARD1
2,BRCA1
3,BRCA2
4,BRIP1
5,CDH1
6,CDKN2A
7,CHEK2
8,MSH2
9,MSH6


In [57]:
# Flag each variant row with Gene_Match = 'Yes' when at least one gene in its
# comma-separated 'Gene Name' entry appears in the df_gene panel, else 'No'.
#
# The panel is converted to a set once, so every row costs a single split plus
# a set lookup. This avoids building a full-length boolean mask over merged_2
# for each row, which is quadratic on a frame with ~276k rows.
panel_genes = set(df_gene['Gene Name'].dropna())

# NOTE(review): names are compared exactly as produced by split(','), so
# surrounding whitespace or different letter case would prevent a match.
# Confirm the upstream 'Gene Name' format if that can occur.
merged_2['Gene_Match'] = [
    'Yes'
    if isinstance(genes, str) and not panel_genes.isdisjoint(genes.split(','))
    else 'No'  # missing (non-string) gene names are never a match
    for genes in merged_2['Gene Name']
]

merged_2

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene Name,CSQ,csq,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,SYMBOL_SOURCE,HGNC_ID,SOURCE,ClinVar,ClinVar_CLNSIG,ClinVar_CLNREVSTAT,ClinVar_CLNDN,consequence,Consequence_score,IMPACT_score,Gene_Match
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=143;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,137,143,143,102,41,28.67%,1.5868E-14,99,3,33,8,1,0,Heterozygous,OR4F5,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,G,synonymous_variant,LOW,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,303,243,81,S,tcA/tcG,,,1,,HGNC,HGNC:14825,,,,,,synonymous variant,3/10,2.5,No
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=168;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,168,168,0,168,100%,1.6424E-100,0,0,145,23,0,1,Homozygous,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G,missense_variant,MODERATE,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,544,484,162,T/A,Aca/Gca,,,1,,HGNC,HGNC:14825,,,,,,missense variant,7/10,5.0,No
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=98;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,109,98,98,66,32,32.65%,1.1206E-11,52,14,25,7,1,0,Heterozygous,OR4F5,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C,synonymous_variant,LOW,OR4F5,ENSG00000186092,Transcript,ENST00000641515,protein_coding,3/3,,,,930,870,290,S,tcT/tcC,,,1,,HGNC,HGNC:14825,,,,,,synonymous variant,3/10,2.5,No
3,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C,downstream_gene_variant,MODIFIER,OR4F16,ENSG00000284662,Transcript,ENST00000332831,protein_coding,,,,,,,,,,,22,-1,,HGNC,HGNC:15079,,,,,,downstream gene variant,2/10,1.5,No
4,chr1,685694,.,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,20,18,18,12,6,33.33%,9.5308E-3,12,0,6,0,1,0,Heterozygous,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|intron_variant&non_coding_transcript_variant...,C,intron_variant&non_coding_transcript_variant,MODIFIER,,ENSG00000230021,Transcript,ENST00000419394,processed_transcript,,3/3,,,,,,,,,,-1,,,,,,,,,intron variant,2/10,1.5,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275974,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...,A,missense_variant,MODERATE,TSPY1,ENSG00000258992,Transcript,ENST00000423647,protein_coding,1/6,,,,303,257,86,R/Q,cGg/cAg,,,1,,HGNC,HGNC:12381,,,,,,missense variant,7/10,5.0,No
275975,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|missense_variant|MODERATE|TSPY1|ENSG00000258...,A,missense_variant,MODERATE,TSPY1,ENSG00000258992,Transcript,ENST00000451548,protein_coding,1/6,,,,303,257,86,R/Q,cGg/cAg,,,1,,HGNC,HGNC:12381,,,,,,missense variant,7/10,5.0,No
275976,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A,upstream_gene_variant,MODIFIER,FAM197Y5,ENSG00000225516,Transcript,ENST00000615605,processed_transcript,,,,,,,,,,,2900,-1,,HGNC,HGNC:37467,,,,,,upstream gene variant,2/10,1.5,No
275977,chrY,9467257,rs777840135,G,A,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=TSP...,0/1,224,241,241,174,67,27.8%,3.236E-23,132,42,58,9,1,0,Heterozygous,TSPY1,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A|upstream_gene_variant|MODIFIER|FAM197Y5|ENSG...,A,upstream_gene_variant,MODIFIER,FAM197Y5,ENSG00000225516,Transcript,ENST00000619815,processed_transcript,,,,,,,,,,,2912,-1,,HGNC,HGNC:37467,,,,,,upstream gene variant,2/10,1.5,No


In [59]:
# Display only the variant rows whose Gene_Match flag is 'Yes'.
merged_2.loc[merged_2['Gene_Match'].eq('Yes')]

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene Name,CSQ,csq,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,DISTANCE,STRAND,FLAGS,SYMBOL_SOURCE,HGNC_ID,SOURCE,ClinVar,ClinVar_CLNSIG,ClinVar_CLNREVSTAT,ClinVar_CLNDN,consequence,Consequence_score,IMPACT_score,Gene_Match
10581,chr1,45331833,rs3219489,C,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6865,0....",0/1,63,32,32,15,17,53.12%,4.1013E-7,13,2,15,2,1,0,Heterozygous,MUTYH,G|downstream_gene_variant|MODIFIER|HPDL|ENSG00...,G|downstream_gene_variant|MODIFIER|HPDL|ENSG00...,G,downstream_gene_variant,MODIFIER,HPDL,ENSG00000186603,Transcript,ENST00000334815,protein_coding,,,,,,,,,,,3123,1,,HGNC,HGNC:28242,,41767,Benign,criteria_provided&_multiple_submitters&_no_con...,Carcinoma_of_colon&MUTYH-related_attenuated_fa...,downstream gene variant,2/10,1.5,Yes
10582,chr1,45331833,rs3219489,C,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6865,0....",0/1,63,32,32,15,17,53.12%,4.1013E-7,13,2,15,2,1,0,Heterozygous,MUTYH,G|downstream_gene_variant|MODIFIER|HPDL|ENSG00...,G|missense_variant|MODERATE|MUTYH|ENSG00000132...,G,missense_variant,MODERATE,MUTYH,ENSG00000132781,Transcript,ENST00000354383,protein_coding,12/16,,,,1044,933,311,Q/H,caG/caC,,,-1,,HGNC,HGNC:7527,,41767,Benign,criteria_provided&_multiple_submitters&_no_con...,Carcinoma_of_colon&MUTYH-related_attenuated_fa...,missense variant,7/10,5.0,Yes
10583,chr1,45331833,rs3219489,C,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6865,0....",0/1,63,32,32,15,17,53.12%,4.1013E-7,13,2,15,2,1,0,Heterozygous,MUTYH,G|downstream_gene_variant|MODIFIER|HPDL|ENSG00...,G|missense_variant|MODERATE|MUTYH|ENSG00000132...,G,missense_variant,MODERATE,MUTYH,ENSG00000132781,Transcript,ENST00000355498,protein_coding,12/16,,,,1076,930,310,Q/H,caG/caC,,,-1,,HGNC,HGNC:7527,,41767,Benign,criteria_provided&_multiple_submitters&_no_con...,Carcinoma_of_colon&MUTYH-related_attenuated_fa...,missense variant,7/10,5.0,Yes
10584,chr1,45331833,rs3219489,C,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6865,0....",0/1,63,32,32,15,17,53.12%,4.1013E-7,13,2,15,2,1,0,Heterozygous,MUTYH,G|downstream_gene_variant|MODIFIER|HPDL|ENSG00...,G|missense_variant|MODERATE|MUTYH|ENSG00000132...,G,missense_variant,MODERATE,MUTYH,ENSG00000132781,Transcript,ENST00000372098,protein_coding,12/16,,,,1139,1005,335,Q/H,caG/caC,,,-1,,HGNC,HGNC:7527,,41767,Benign,criteria_provided&_multiple_submitters&_no_con...,Carcinoma_of_colon&MUTYH-related_attenuated_fa...,missense variant,7/10,5.0,Yes
10585,chr1,45331833,rs3219489,C,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.6865,0....",0/1,63,32,32,15,17,53.12%,4.1013E-7,13,2,15,2,1,0,Heterozygous,MUTYH,G|downstream_gene_variant|MODIFIER|HPDL|ENSG00...,G|missense_variant|MODERATE|MUTYH|ENSG00000132...,G,missense_variant,MODERATE,MUTYH,ENSG00000132781,Transcript,ENST00000372104,protein_coding,13/17,,,,1123,930,310,Q/H,caG/caC,,,-1,,HGNC,HGNC:7527,,41767,Benign,criteria_provided&_multiple_submitters&_no_con...,Carcinoma_of_colon&MUTYH-related_attenuated_fa...,missense variant,7/10,5.0,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266041,chr9,95099836,rs4647558,C,T,.,PASS,"ADP=59;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5865,0....",0/1,98,59,59,32,27,45.76%,1.5211E-10,24,8,23,4,1,0,Heterozygous,"FANCC,C9orf3",T|3_prime_UTR_variant|MODIFIER|FANCC|ENSG00000...,T|non_coding_transcript_exon_variant|MODIFIER|...,T,non_coding_transcript_exon_variant,MODIFIER,FANCC,ENSG00000158169,Transcript,ENST00000696260,retained_intron,3/3,,,,4363,,,,,,,-1,,HGNC,HGNC:3584,,367581,Benign,criteria_provided&_single_submitter,Fanconi_anemia_complementation_group_C,non coding transcript exon variant,2/10,1.5,Yes
266042,chr9,95100419,rs4647554,A,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.4972,0....",0/1,45,32,32,19,13,40.62%,2.6443E-5,15,4,11,2,1,0,Heterozygous,"FANCC,C9orf3",G|3_prime_UTR_variant|MODIFIER|FANCC|ENSG00000...,G|3_prime_UTR_variant|MODIFIER|FANCC|ENSG00000...,G,3_prime_UTR_variant,MODIFIER,FANCC,ENSG00000158169,Transcript,ENST00000289081,protein_coding,15/15,,,,3227,,,,,,,-1,,HGNC,HGNC:3584,,367589,Benign,criteria_provided&_single_submitter,Fanconi_anemia_complementation_group_C,3 prime UTR variant,3/10,1.5,Yes
266043,chr9,95100419,rs4647554,A,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.4972,0....",0/1,45,32,32,19,13,40.62%,2.6443E-5,15,4,11,2,1,0,Heterozygous,"FANCC,C9orf3",G|3_prime_UTR_variant|MODIFIER|FANCC|ENSG00000...,G|3_prime_UTR_variant|MODIFIER|FANCC|ENSG00000...,G,3_prime_UTR_variant,MODIFIER,FANCC,ENSG00000158169,Transcript,ENST00000375305,protein_coding,15/15,,,,3169,,,,,,,-1,,HGNC,HGNC:3584,,367589,Benign,criteria_provided&_single_submitter,Fanconi_anemia_complementation_group_C,3 prime UTR variant,3/10,1.5,Yes
266044,chr9,95100419,rs4647554,A,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.4972,0....",0/1,45,32,32,19,13,40.62%,2.6443E-5,15,4,11,2,1,0,Heterozygous,"FANCC,C9orf3",G|3_prime_UTR_variant|MODIFIER|FANCC|ENSG00000...,G|downstream_gene_variant|MODIFIER|FANCC|ENSG0...,G,downstream_gene_variant,MODIFIER,FANCC,ENSG00000158169,Transcript,ENST00000649334,nonsense_mediated_decay,,,,,,,,,,,1291,-1,,HGNC,HGNC:3584,,367589,Benign,criteria_provided&_single_submitter,Fanconi_anemia_complementation_group_C,downstream gene variant,2/10,1.5,Yes


In [58]:
# Write merged_2 to an Excel workbook, leaving out the DataFrame index.
merged_2.to_excel(
    excel_writer=r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/KHGLBS448_final_depth_vcf_processed.xlsx',
    index=False,
)