In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [2]:
def read_bed_file(bed_file, zero_based=True):
    """Expand the intervals of a BED file into a set of ``(chrom, pos)`` tuples.

    The returned positions are meant to be compared with the 1-based ``POS``
    column of a VCF file. BED coordinates are 0-based and half-open
    (``chromStart`` is inclusive counting from 0, ``chromEnd`` is exclusive),
    so an interval ``start, end`` covers the 1-based positions
    ``start + 1 .. end``. Adding ``start`` itself, as a plain
    ``range(start, end + 1)`` does, lets through the base immediately before
    every interval.

    Lines starting with ``#``, lines with fewer than three tab-separated
    fields, and lines whose start or end is not an integer are skipped.

    Parameters
    ----------
    bed_file : str or path-like
        Path of the tab-separated BED file to read.
    zero_based : bool, default True
        When True, interpret intervals as standard BED (0-based, half-open).
        When False, interpret them as 1-based closed intervals, i.e. every
        position ``start .. end`` inclusive is added.

    Returns
    -------
    set of (str, int)
        One ``(chrom, pos)`` tuple per covered 1-based position. The chromosome
        name is taken verbatim from the first field.
    """
    # Shift the first covered position by one when converting 0-based starts
    # to 1-based coordinates; the (exclusive) 0-based end already equals the
    # last covered 1-based position.
    first_offset = 1 if zero_based else 0
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            chrom = fields[0]
            try:
                start = int(fields[1])
                end = int(fields[2])
            except ValueError:
                continue  # e.g. "track"/"browser" lines or a column header row
            bed_positions.update(
                (chrom, pos) for pos in range(start + first_offset, end + 1)
            )
    return bed_positions

def normalize_chrom_name(chrom):
    """Return the part of ``chrom`` that precedes its first underscore.

    A name without an underscore is returned unchanged.
    """
    prefix, _sep, _rest = chrom.partition('_')
    return prefix

def filter_vcf_file(vcf_file, bed_positions):
    """Collect the VCF lines whose ``(chrom, pos)`` is in ``bed_positions``.

    Every line starting with ``#`` is kept as-is so the header survives.
    Data lines are kept when the normalised chromosome name (first field) and
    the integer position (second field) form a tuple present in
    ``bed_positions``. Lines with fewer than two tab-separated fields, or with
    a non-integer second field, are dropped.

    Parameters
    ----------
    vcf_file : str or path-like
        Path of the VCF file to read.
    bed_positions : container of (str, int)
        Positions to keep; only membership tests are performed on it.

    Returns
    -------
    list of str
        The retained lines, unmodified and in file order.
    """
    kept_lines = []
    with open(vcf_file, 'r') as vcf_handle:
        for raw_line in vcf_handle:
            # Header/meta lines pass through untouched.
            if raw_line.startswith('#'):
                kept_lines.append(raw_line)
                continue

            parts = raw_line.strip().split('\t')
            if len(parts) < 2:
                continue

            try:
                position = int(parts[1])
            except ValueError:
                # A non-numeric POS cannot be matched against the BED set.
                continue

            if (normalize_chrom_name(parts[0]), position) in bed_positions:
                kept_lines.append(raw_line)
    return kept_lines

def write_filtered_vcf(filtered_vcf_records, output_file):
    """Write the given records to ``output_file``, replacing any existing file.

    Records are written exactly as supplied; no line terminator is appended,
    so each record is expected to carry its own.
    """
    with open(output_file, 'w') as out_handle:
        out_handle.writelines(filtered_vcf_records)

def main(
    bed_file=r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/kalyani_mam_covered.bed',
    vcf_file=r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/KHGLBS567/KHGLBS567_annotated_indel.vcf',
    output_file=r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHGLBS567_annotated_indel.vcf',
):
    """Restrict a VCF file to the positions covered by a BED file.

    Reads the BED intervals, keeps the VCF header plus every record whose
    position is covered, and writes the result to ``output_file``.

    The paths used to be hard-coded inside the function; they are now keyword
    parameters whose defaults are the original values, so ``main()`` behaves as
    before while other samples can be processed without editing the code.

    Parameters
    ----------
    bed_file : str or path-like
        BED file describing the covered regions.
    vcf_file : str or path-like
        VCF file to filter.
    output_file : str or path-like
        Destination of the filtered VCF; overwritten if it exists.
    """
    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)
    write_filtered_vcf(filtered_vcf_records, output_file)


if __name__ == "__main__":
    main()

In [3]:
# Load the records of the BED-filtered VCF into a DataFrame. The file has no
# column header row that pandas can use: comment='#' makes pandas discard the
# '#'-prefixed VCF header lines, so the columns are named positionally below.
# NOTE(review): pandas applies comment='#' anywhere in a line, so a '#' inside
# a field would truncate that record — confirm the inputs never contain one.
filtered_vcf_path = r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHGLBS567_annotated_indel.vcf'
data = pd.read_csv(
    filtered_vcf_path,
    comment='#',
    sep= '\t',
    header=None,
    low_memory=False,
)

# Ten names are assigned, so the file must have exactly ten columns.
data.columns = [
    'CHROM', 'POS', 'rsID', 'REF', 'ALT',
    'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE',
]
data

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=24;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.5208,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:135:24:24:0:24:100%:3.101E-14:0:19:0:0:20:4
1,chr1,1331945,rs200330269,G,GC,.,PASS,"ADP=50;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8105,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:58:50:50:32:17:34%:1.4216E-6:42:35:9:23:1:16
2,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=58;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1793,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:58:58:2:56:96.55%:2.8822E-31:63:49:2:0...
3,chr1,1355779,rs201260508,GA,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:60:18:18:4:14:77.78%:8.0605E-7:32:40:0:4:9:5
4,chr1,1387763,rs35654872,CCT,C,.,PASS,"ADP=19;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6565,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:105:19:19:0:19:100%:2.8292E-11:0:19:0:0:17:2
...,...,...,...,...,...,...,...,...,...,...
2926,chrX,154585478,rs1372477485,ATTCTG,A,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=CTAG...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:42:18:18:7:11:61.11%:5.2969E-5:42:58:2:5:7:4
2927,chrX,155228363,rs781912204,A,AT,.,PASS,ADP=23;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=VBP1...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:66:23:23:4:15:65.22%:2.5053E-7:46:47:4:0:13:2
2928,chrX,155492733,rs1169019545;rs376271737,T,TG,.,PASS,ADP=23;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=TMLH...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:115:23:23:1:22:95.65%:2.9149E-12:35:46:0:1...
2929,chrY,12786501,rs760255651,CT,C,.,PASS,ADP=20;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=USP9...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:71:20:20:2:15:75%:7.3277E-8:55:44:2:0:13:2


In [4]:
# Split the colon-delimited SAMPLE string into one column per field.
# Fourteen names are assigned, so every SAMPLE value must yield 14 fields.
sample_field_names = [
    'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ',
    'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR',
]
sample_cols = data['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = sample_field_names

# Attach the per-field columns and keep only the columns used downstream
# (FORMAT, SAMPLE, RBQ and ABQ are dropped here).
retained_columns = [
    'CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
    'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL',
    'RDF', 'RDR', 'ADF', 'ADR',
]
data = pd.concat([data, sample_cols], axis=1)[retained_columns]
data

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
0,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=24;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.5208,0....",1/1,135,24,24,0,24,100%,3.101E-14,0,0,20,4
1,chr1,1331945,rs200330269,G,GC,.,PASS,"ADP=50;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8105,0....",0/1,58,50,50,32,17,34%,1.4216E-6,9,23,1,16
2,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=58;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1793,0....",1/1,255,58,58,2,56,96.55%,2.8822E-31,2,0,51,5
3,chr1,1355779,rs201260508,GA,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",1/1,60,18,18,4,14,77.78%,8.0605E-7,0,4,9,5
4,chr1,1387763,rs35654872,CCT,C,.,PASS,"ADP=19;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6565,0....",1/1,105,19,19,0,19,100%,2.8292E-11,0,0,17,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2926,chrX,154585478,rs1372477485,ATTCTG,A,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=CTAG...,0/1,42,18,18,7,11,61.11%,5.2969E-5,2,5,7,4
2927,chrX,155228363,rs781912204,A,AT,.,PASS,ADP=23;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=VBP1...,0/1,66,23,23,4,15,65.22%,2.5053E-7,4,0,13,2
2928,chrX,155492733,rs1169019545;rs376271737,T,TG,.,PASS,ADP=23;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=TMLH...,1/1,115,23,23,1,22,95.65%,2.9149E-12,0,1,8,14
2929,chrY,12786501,rs760255651,CT,C,.,PASS,ADP=20;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=USP9...,1/1,71,20,20,2,15,75%,7.3277E-8,2,0,13,2


In [5]:
# INFO keys that get a column of their own. Keys found in INFO that are not
# listed here are ignored; writing them back into the frame would silently
# add one stray column per unknown key.
columns = ['ADP', 'WT', 'HET', 'HOM', 'NC', 'CDA', 'OTH', 'S3D', 'WTD', 'dbSNPBuildID', 'SLO',
           'NSF', 'R3', 'R5', 'NSN', 'NSM', 'G5A', 'COMMON', 'RS', 'RV', 'TPA', 'CFL', 'GNO',
           'VLD', 'ASP', 'ASS', 'Ref', 'U3', 'U5', 'TOPMED', 'WGT', 'MTP', 'LSD', 'NOC',
           'DSS', 'SYN', 'KGPhase3', 'CAF', 'VC', 'MUT', 'KGPhase1', 'NOV', 'VP', 'SAO',
           'GENEINFO', 'INT', 'G5', 'OM', 'PMC', 'SSR', 'RSPOS', 'HD', 'PM', 'ClinVar',
           'ClinVar_CLNSIG']


def _parse_info(info, wanted_keys):
    """Return ``{key: text}`` for every wanted key present in one INFO string.

    ``KEY=VALUE`` entries are stored as the text ``"KEY=VALUE"`` and bare flag
    entries as ``"KEY"``. When a key occurs more than once the last occurrence
    wins.
    """
    parsed = {}
    for item in info.split(';'):
        # Split on the first '=' only, so a value that itself contains '='
        # is kept whole instead of being cut at the second '='.
        key, has_value, value = item.partition('=')
        if key in wanted_keys:
            parsed[key] = f"{key}={value}" if has_value else key
    return parsed


def _gene_symbols(geneinfo):
    """Return the unique gene symbols of a GENEINFO value, comma-joined.

    Entries look like ``SYMBOL:ID`` and are separated by ``|``; records that
    carry several rsIDs additionally join their GENEINFO values with ``,``,
    so both separators are split on. Symbols keep their first-seen order,
    which makes the result reproducible between runs (a ``set`` does not).
    Missing input yields an empty string.
    """
    if pd.isnull(geneinfo):
        return ''
    symbols = (
        entry.split(':')[0]
        for entry in re.split(r'[|,]', geneinfo)
        if entry
    )
    return ','.join(dict.fromkeys(symbols))


# Parse every INFO string once and build all INFO-derived columns in a single
# frame instead of writing cell by cell; keys absent from a record become ''.
wanted_keys = frozenset(columns)
info_records = [_parse_info(info, wanted_keys) for info in data['INFO']]
info_columns = pd.DataFrame(
    {col: [record.get(col, '') for record in info_records] for col in columns},
    index=data.index,
)
# Dropping first keeps the cell safe to re-run on a frame that already has
# these columns.
data = pd.concat([data.drop(columns=columns, errors='ignore'), info_columns], axis=1)

# GENEINFO runs up to the next ';' or to the end of INFO; requiring a trailing
# ';' would miss it whenever it is the last INFO entry.
data["Gene_Name"] = data["INFO"].str.extract(r'GENEINFO=([^;]+)', expand=False)
data['Gene Name'] = data['Gene_Name'].apply(_gene_symbols)

# Print the resulting DataFrame
data

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,GENEINFO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,ClinVar,ClinVar_CLNSIG,Gene_Name,Gene Name
0,chr1,1043223,rs35881187,CCT,C,.,PASS,"ADP=24;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.5208,0....",1/1,135,24,24,0,24,100%,3.101E-14,0,0,20,4,ADP=24,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,GENEINFO=AGRN:375790,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM,,,AGRN:375790,AGRN
1,chr1,1331945,rs200330269,G,GC,.,PASS,"ADP=50;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8105,0....",0/1,58,50,50,32,17,34%,1.4216E-6,9,23,1,16,ADP=50,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=137,,,,,,,,COMMON=1,RS=200330269,,,,,VLD,ASP,,,,,"TOPMED=0.82975758154943934,0.17023445463812436...",WGT=1,,,,,,KGPhase3,"CAF=0.8105,0.1895,.",VC=DIV,,KGPhase1,,VP=0x05000008000515003e000200,SAO=0,GENEINFO=TAS1R3:83756,INT,G5,,,SSR=0,RSPOS=1331945,,,,,TAS1R3:83756,TAS1R3
2,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=58;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1793,0....",1/1,255,58,58,2,56,96.55%,2.8822E-31,2,0,51,5,ADP=58,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,GENEINFO=MXRA8:54587,INT,G5,,,SSR=0,RSPOS=1353988,,,,,MXRA8:54587,MXRA8
3,chr1,1355779,rs201260508,GA,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",1/1,60,18,18,4,14,77.78%,8.0605E-7,0,4,9,5,ADP=18,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=137,,,,,,,,COMMON=0,RS=201260508,,,,,,ASP,,,,,"TOPMED=0.22555109582059123,0.77444890417940876",WGT=1,,,,,,KGPhase3,"CAF=0,1",VC=DIV,,KGPhase1,,VP=0x05000008000500003e000200,SAO=0,GENEINFO=MXRA8:54587,INT,,,,SSR=0,RSPOS=1355780,,,,,MXRA8:54587,MXRA8
4,chr1,1387763,rs35654872,CCT,C,.,PASS,"ADP=19;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6565,0....",1/1,105,19,19,0,19,100%,2.8292E-11,0,0,17,2,ADP=19,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35654872,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.43493565239551478,0.56506434760448521",WGT=1,,,,,,KGPhase3,"CAF=0.6565,0.3435",VC=DIV,,KGPhase1,,VP=0x05010008000515013e000200,SAO=0,GENEINFO=CCNL2:81669,INT,G5,,,SSR=0,RSPOS=1387764,,,,,CCNL2:81669,CCNL2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2926,chrX,154585478,rs1372477485,ATTCTG,A,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=CTAG...,0/1,42,18,18,7,11,61.11%,5.2969E-5,2,5,7,4,ADP=18,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=151,,,,,,,,,RS=1372477485,,,,,,ASP,,,,,,WGT=1,,,,,,,,VC=DIV,,,,VP=0x050000080005000002000200,SAO=0,GENEINFO=CTAG1A:246100,INT,,,,SSR=0,RSPOS=154585479,,,,,CTAG1A:246100,CTAG1A
2927,chrX,155228363,rs781912204,A,AT,.,PASS,ADP=23;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=VBP1...,0/1,66,23,23,4,15,65.22%,2.5053E-7,4,0,13,2,ADP=23,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=144,,,,,,,,,RS=781912204,,,,,,ASP,,,,,"TOPMED=0.43169438073394495,0.56314506880733944...",WGT=1,,,,,,,,VC=DIV,,,NOV,VP=0x050000080005000002000204,SAO=0,GENEINFO=VBP1:7411,INT,,,,SSR=0,RSPOS=155228363,,,,,VBP1:7411,VBP1
2928,chrX,155492733,rs1169019545;rs376271737,T,TG,.,PASS,ADP=23;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=TMLH...,1/1,115,23,23,1,22,95.65%,2.9149E-12,0,1,8,14,ADP=23,WT=0,HET=0,HOM=1,NC=0,,,,,"dbSNPBuildID=151,138",,,,,,,,,"RS=1169019545,376271737",,,,,,ASP,,,,,"TOPMED=0.35031218144750254,0.64968781855249745","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,,"VP=0x050000080005000002000200,0x05000008000500...","SAO=0,0","GENEINFO=TMLHE:55217|TMLHE-AS1:100507404,TMLHE...",INT,,,,"SSR=0,0","RSPOS=155492733,155492734",,,,,"TMLHE:55217|TMLHE-AS1:100507404,TMLHE:55217|TM...","TMLHE-AS1,TMLHE"
2929,chrY,12786501,rs760255651,CT,C,.,PASS,ADP=20;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=USP9...,1/1,71,20,20,2,15,75%,7.3277E-8,2,0,13,2,ADP=20,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=144,,,,,,,,,RS=760255651,,,,,,ASP,,,,,,WGT=1,,,,,,,,VC=DIV,,,,VP=0x050000080005000002000200,SAO=0,GENEINFO=USP9Y:8287,INT,,,,SSR=0,RSPOS=12786503,,,,,USP9Y:8287,USP9Y


In [6]:
# Keep the locus, gene symbol, genotype fields and the parsed INFO columns, in
# this order. Columns not listed (QUAL, FILTER, INFO, GENEINFO, ClinVar,
# ClinVar_CLNSIG, Gene_Name) are dropped from the frame.
report_columns = [
    # locus / alleles
    'CHROM', 'POS', 'Gene Name', 'rsID', 'REF', 'ALT',
    # per-sample fields
    'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL',
    'RDF', 'RDR', 'ADF', 'ADR',
    # INFO-derived fields
    'ADP', 'WT', 'HET', 'HOM', 'NC', 'CDA', 'OTH', 'S3D', 'WTD',
    'dbSNPBuildID', 'SLO', 'NSF', 'R3', 'R5', 'NSN', 'NSM', 'G5A', 'COMMON',
    'RS', 'RV', 'TPA', 'CFL', 'GNO', 'VLD', 'ASP', 'ASS', 'Ref', 'U3', 'U5',
    'TOPMED', 'WGT', 'MTP', 'LSD', 'NOC', 'DSS', 'SYN', 'KGPhase3', 'CAF',
    'VC', 'MUT', 'KGPhase1', 'NOV', 'VP', 'SAO', 'INT', 'G5',
    'OM', 'PMC', 'SSR', 'RSPOS', 'HD', 'PM',
]
data = data[report_columns]
data

Unnamed: 0,CHROM,POS,Gene Name,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM
0,chr1,1043223,AGRN,rs35881187,CCT,C,1/1,135,24,24,0,24,100%,3.101E-14,0,0,20,4,ADP=24,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM
1,chr1,1331945,TAS1R3,rs200330269,G,GC,0/1,58,50,50,32,17,34%,1.4216E-6,9,23,1,16,ADP=50,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=137,,,,,,,,COMMON=1,RS=200330269,,,,,VLD,ASP,,,,,"TOPMED=0.82975758154943934,0.17023445463812436...",WGT=1,,,,,,KGPhase3,"CAF=0.8105,0.1895,.",VC=DIV,,KGPhase1,,VP=0x05000008000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1331945,,
2,chr1,1353987,MXRA8,rs140777846,CTG,C,1/1,255,58,58,2,56,96.55%,2.8822E-31,2,0,51,5,ADP=58,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,
3,chr1,1355779,MXRA8,rs201260508,GA,G,1/1,60,18,18,4,14,77.78%,8.0605E-7,0,4,9,5,ADP=18,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=137,,,,,,,,COMMON=0,RS=201260508,,,,,,ASP,,,,,"TOPMED=0.22555109582059123,0.77444890417940876",WGT=1,,,,,,KGPhase3,"CAF=0,1",VC=DIV,,KGPhase1,,VP=0x05000008000500003e000200,SAO=0,INT,,,,SSR=0,RSPOS=1355780,,
4,chr1,1387763,CCNL2,rs35654872,CCT,C,1/1,105,19,19,0,19,100%,2.8292E-11,0,0,17,2,ADP=19,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35654872,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.43493565239551478,0.56506434760448521",WGT=1,,,,,,KGPhase3,"CAF=0.6565,0.3435",VC=DIV,,KGPhase1,,VP=0x05010008000515013e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1387764,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2926,chrX,154585478,CTAG1A,rs1372477485,ATTCTG,A,0/1,42,18,18,7,11,61.11%,5.2969E-5,2,5,7,4,ADP=18,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=151,,,,,,,,,RS=1372477485,,,,,,ASP,,,,,,WGT=1,,,,,,,,VC=DIV,,,,VP=0x050000080005000002000200,SAO=0,INT,,,,SSR=0,RSPOS=154585479,,
2927,chrX,155228363,VBP1,rs781912204,A,AT,0/1,66,23,23,4,15,65.22%,2.5053E-7,4,0,13,2,ADP=23,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=144,,,,,,,,,RS=781912204,,,,,,ASP,,,,,"TOPMED=0.43169438073394495,0.56314506880733944...",WGT=1,,,,,,,,VC=DIV,,,NOV,VP=0x050000080005000002000204,SAO=0,INT,,,,SSR=0,RSPOS=155228363,,
2928,chrX,155492733,"TMLHE-AS1,TMLHE",rs1169019545;rs376271737,T,TG,1/1,115,23,23,1,22,95.65%,2.9149E-12,0,1,8,14,ADP=23,WT=0,HET=0,HOM=1,NC=0,,,,,"dbSNPBuildID=151,138",,,,,,,,,"RS=1169019545,376271737",,,,,,ASP,,,,,"TOPMED=0.35031218144750254,0.64968781855249745","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,,"VP=0x050000080005000002000200,0x05000008000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=155492733,155492734",,
2929,chrY,12786501,USP9Y,rs760255651,CT,C,1/1,71,20,20,2,15,75%,7.3277E-8,2,0,13,2,ADP=20,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=144,,,,,,,,,RS=760255651,,,,,,ASP,,,,,,WGT=1,,,,,,,,VC=DIV,,,,VP=0x050000080005000002000200,SAO=0,INT,,,,SSR=0,RSPOS=12786503,,


In [7]:
# A record may list several rsIDs separated by ';'. Give each rsID its own row
# in a new 'rsid' column; all other columns (including the original 'rsID')
# are repeated, and the repeated rows keep the same index label.
rsid_lists = data['rsID'].str.split(';')
data = data.assign(rsid=rsid_lists).explode('rsid')
data

Unnamed: 0,CHROM,POS,Gene Name,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,rsid
0,chr1,1043223,AGRN,rs35881187,CCT,C,1/1,135,24,24,0,24,100%,3.101E-14,0,0,20,4,ADP=24,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM,rs35881187
1,chr1,1331945,TAS1R3,rs200330269,G,GC,0/1,58,50,50,32,17,34%,1.4216E-6,9,23,1,16,ADP=50,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=137,,,,,,,,COMMON=1,RS=200330269,,,,,VLD,ASP,,,,,"TOPMED=0.82975758154943934,0.17023445463812436...",WGT=1,,,,,,KGPhase3,"CAF=0.8105,0.1895,.",VC=DIV,,KGPhase1,,VP=0x05000008000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1331945,,,rs200330269
2,chr1,1353987,MXRA8,rs140777846,CTG,C,1/1,255,58,58,2,56,96.55%,2.8822E-31,2,0,51,5,ADP=58,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,rs140777846
3,chr1,1355779,MXRA8,rs201260508,GA,G,1/1,60,18,18,4,14,77.78%,8.0605E-7,0,4,9,5,ADP=18,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=137,,,,,,,,COMMON=0,RS=201260508,,,,,,ASP,,,,,"TOPMED=0.22555109582059123,0.77444890417940876",WGT=1,,,,,,KGPhase3,"CAF=0,1",VC=DIV,,KGPhase1,,VP=0x05000008000500003e000200,SAO=0,INT,,,,SSR=0,RSPOS=1355780,,,rs201260508
4,chr1,1387763,CCNL2,rs35654872,CCT,C,1/1,105,19,19,0,19,100%,2.8292E-11,0,0,17,2,ADP=19,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35654872,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.43493565239551478,0.56506434760448521",WGT=1,,,,,,KGPhase3,"CAF=0.6565,0.3435",VC=DIV,,KGPhase1,,VP=0x05010008000515013e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1387764,,,rs35654872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2927,chrX,155228363,VBP1,rs781912204,A,AT,0/1,66,23,23,4,15,65.22%,2.5053E-7,4,0,13,2,ADP=23,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=144,,,,,,,,,RS=781912204,,,,,,ASP,,,,,"TOPMED=0.43169438073394495,0.56314506880733944...",WGT=1,,,,,,,,VC=DIV,,,NOV,VP=0x050000080005000002000204,SAO=0,INT,,,,SSR=0,RSPOS=155228363,,,rs781912204
2928,chrX,155492733,"TMLHE-AS1,TMLHE",rs1169019545;rs376271737,T,TG,1/1,115,23,23,1,22,95.65%,2.9149E-12,0,1,8,14,ADP=23,WT=0,HET=0,HOM=1,NC=0,,,,,"dbSNPBuildID=151,138",,,,,,,,,"RS=1169019545,376271737",,,,,,ASP,,,,,"TOPMED=0.35031218144750254,0.64968781855249745","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,,"VP=0x050000080005000002000200,0x05000008000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=155492733,155492734",,,rs1169019545
2928,chrX,155492733,"TMLHE-AS1,TMLHE",rs1169019545;rs376271737,T,TG,1/1,115,23,23,1,22,95.65%,2.9149E-12,0,1,8,14,ADP=23,WT=0,HET=0,HOM=1,NC=0,,,,,"dbSNPBuildID=151,138",,,,,,,,,"RS=1169019545,376271737",,,,,,ASP,,,,,"TOPMED=0.35031218144750254,0.64968781855249745","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,,"VP=0x050000080005000002000200,0x05000008000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=155492733,155492734",,,rs376271737
2929,chrY,12786501,USP9Y,rs760255651,CT,C,1/1,71,20,20,2,15,75%,7.3277E-8,2,0,13,2,ADP=20,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=144,,,,,,,,,RS=760255651,,,,,,ASP,,,,,,WGT=1,,,,,,,,VC=DIV,,,,VP=0x050000080005000002000200,SAO=0,INT,,,,SSR=0,RSPOS=12786503,,,rs760255651


In [27]:
# Load the per-variant clinical-significance workbook and normalise two column
# names ('gene_name' -> 'Gene name', 'zygocity' -> 'zygosity').
# NOTE(review): this workbook is for sample KHHSPTGPCSP16, while the VCF read
# earlier in the notebook is for KHGLBS567 — confirm the two are meant to be
# merged.
clinical_xlsx_path = r'C:/Users/GenepoweRx_Madhu/Downloads/KHHSPTGPCSP15_and_16/KHHSPTGPCSP16/KHHSPTGPCSP16_indel_clinical_significance.xlsx'
column_renames = {'gene_name': 'Gene name', 'zygocity':'zygosity'}
df = pd.read_excel(clinical_xlsx_path).rename(columns=column_renames)
df

Unnamed: 0,allele,zygosity,Gene name,rsid,consequence,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious
0,"['TA', T]",Heterozygous,WDR41,rs10536220,3_prime_UTR_variant,Uncertain_significance,Striatal_Degeneration,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,Yes
1,"['TA', T]",Heterozygous,WDR41,rs10536220,3_prime_UTR_variant,Benign,Striatal_Degeneration,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,Yes
2,"['G', GA]",Heterozygous,TRAPPC2,rs1057515794,3_prime_UTR_variant,Conflicting_interpretations_of_pathogenicity,Spondyloepiphyseal_dysplasia_congenita|not_pro...,"criteria_provided', '_conflicting_interpretations",germline,indel,Insertion,MayBe No,No,Yes
3,"['CAT', C]",Heterozygous,PTGES3,rs10579382,"initiatior_codon_variant', 'SO:0001589",Benign,not_provided,no_assertion_criteria_provided,germline,indel,Deletion,MayBe No,No,No
4,"['CCT', C]",Homozygous,EDA,rs10579679,intron_variant,Benign,Hypohidrotic_X-linked_ectodermal_dysplasia|not...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765,"['AT', A]",Heterozygous,ATCAY,rs796133913,3_prime_UTR_variant,Uncertain_significance,Cayman_type_cerebellar_ataxia,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No
766,"['ACCT', A]",Homozygous,RYR1,rs796750554,intron_variant,Benign,Congenital_multicore_myopathy_with_external_op...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes
767,"['GT', G]",Homozygous,CD2AP,rs797004904,3_prime_UTR_variant,Benign,Focal_segmental_glomerulosclerosis,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No
768,"['TA', T]",Homozygous,BRCA1,rs8176144,intron_variant,Benign,Hereditary_breast_ovarian_cancer_syndrome|Brea...,reviewed_by_expert_panel,germline,indel,Deletion,MayBe No,No,Yes


In [39]:
# Outer-join the VCF rows with the clinical annotations on 'rsid', so rsIDs
# present on only one side are kept with NaN in the other side's columns.
merged = data.merge(df, how='outer', on='rsid', sort=False)
merged

Unnamed: 0,CHROM,POS,Gene Name,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,rsid,allele,zygosity,Gene name,consequence,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious
0,chr1,1043223.0,AGRN,rs35881187,CCT,C,0/1,93,30,30,8,22,73.33%,4.1351E-10,8,0,21,1,ADP=30,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM,rs35881187,"['CCT', C]",Heterozygous,AGRN,intron_variant,Benign,Congenital_myasthenic_syndrome_8|not_specified...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes
1,chr1,1299382.0,ACAP3,rs143128930,AG,A,0/1,66,23,23,7,16,69.57%,2.4726E-7,5,2,11,5,ADP=23,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,,,,R5,,,,COMMON=1,RS=143128930,,,,,VLD,ASP,,,,,"TOPMED=0.91126720183486238,0.08873279816513761",WGT=1,,,,,,KGPhase3,"CAF=0.8806,0.1194",VC=DIV,,KGPhase1,,VP=0x0500000a000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1299383,,,rs143128930,,,,,,,,,,,,,
2,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,rs70949570,,,,,,,,,,,,,
3,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,rs752605219,,,,,,,,,,,,,
4,chr1,1353987.0,MXRA8,rs140777846,CTG,C,0/1,93,59,59,33,26,44.07%,4.2407E-10,20,13,16,10,ADP=59,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,rs140777846,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4913,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs70976124,"['GC', G]",Homozygous,HEXB,intron_variant,Benign,Sandhoff_disease|not_provided,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,No
4914,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs71027384,"['A', AT]",Heterozygous,SERAC1,intron_variant,Benign,"3-methylglutaconic_aciduria_with_deafness', '_...","criteria_provided', '_multiple_submitters', '_...",germline,indel,Insertion,MayBe No,No,No
4915,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs72459173,"['GAAATCTCAGGACTAGA', G]",Heterozygous,ADAM9,intron_variant,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No
4916,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs74733861,"['AAGTCTC', A]",Homozygous,PSAP,intron_variant,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No


In [40]:
# Take the workbook's 'Gene name' wherever the VCF-derived 'Gene Name' is
# missing. fillna() alone only replaces NaN, but VCF rows whose INFO had no
# GENEINFO carry an empty string rather than NaN, so '' is treated as missing
# as well. A value is only replaced when the workbook actually has a gene name,
# so rows with no fallback keep whatever they had ('' or NaN).
vcf_gene_name = merged['Gene Name']
lacks_gene_name = vcf_gene_name.isna() | vcf_gene_name.eq('')
has_fallback = merged['Gene name'].notna()
merged['Gene Name'] = vcf_gene_name.mask(lacks_gene_name & has_fallback, merged['Gene name'])
merged

Unnamed: 0,CHROM,POS,Gene Name,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,rsid,allele,zygosity,Gene name,consequence,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious
0,chr1,1043223.0,AGRN,rs35881187,CCT,C,0/1,93,30,30,8,22,73.33%,4.1351E-10,8,0,21,1,ADP=30,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM,rs35881187,"['CCT', C]",Heterozygous,AGRN,intron_variant,Benign,Congenital_myasthenic_syndrome_8|not_specified...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes
1,chr1,1299382.0,ACAP3,rs143128930,AG,A,0/1,66,23,23,7,16,69.57%,2.4726E-7,5,2,11,5,ADP=23,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,,,,R5,,,,COMMON=1,RS=143128930,,,,,VLD,ASP,,,,,"TOPMED=0.91126720183486238,0.08873279816513761",WGT=1,,,,,,KGPhase3,"CAF=0.8806,0.1194",VC=DIV,,KGPhase1,,VP=0x0500000a000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1299383,,,rs143128930,,,,,,,,,,,,,
2,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,rs70949570,,,,,,,,,,,,,
3,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,rs752605219,,,,,,,,,,,,,
4,chr1,1353987.0,MXRA8,rs140777846,CTG,C,0/1,93,59,59,33,26,44.07%,4.2407E-10,20,13,16,10,ADP=59,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,rs140777846,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4913,,,HEXB,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs70976124,"['GC', G]",Homozygous,HEXB,intron_variant,Benign,Sandhoff_disease|not_provided,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,No
4914,,,SERAC1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs71027384,"['A', AT]",Heterozygous,SERAC1,intron_variant,Benign,"3-methylglutaconic_aciduria_with_deafness', '_...","criteria_provided', '_multiple_submitters', '_...",germline,indel,Insertion,MayBe No,No,No
4915,,,ADAM9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs72459173,"['GAAATCTCAGGACTAGA', G]",Heterozygous,ADAM9,intron_variant,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No
4916,,,PSAP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs74733861,"['AAGTCTC', A]",Homozygous,PSAP,intron_variant,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No


In [41]:
# Load the list of genes for the condition of interest; later cells read its
# 'Gene Name' column.
condition_genes_path = r'C:/Users/GenepoweRx_Madhu/Desktop/Hereditary_cancer_genes.xlsx'
cond_genes = pd.read_excel(condition_genes_path)
cond_genes

Unnamed: 0,Gene Name
0,ATM
1,BRCA1
2,BRCA2
3,BRIP1
4,CHEK2
5,BARD1
6,CDH1
7,CDKN2A
8,EPCAM
9,FANCC


# Multiple Genes

In [17]:
# Flag every row of `merged` whose comma-separated 'Gene Name' entry contains a
# gene listed in cond_genes['Gene Name'], and record the first such gene.
merged['Gene Match'] = 'No'
merged['Matched_Gene'] = ''

# Build the lookup once; a set avoids re-scanning the whole column for every
# candidate gene. Non-string cells (e.g. NaN) can never equal a gene symbol.
condition_gene_set = {
    name.strip() for name in cond_genes['Gene Name'] if isinstance(name, str)
}

# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for index, genes in merged['Gene Name'].items():
    if not isinstance(genes, str):
        continue  # missing gene annotation: keep the 'No' / '' defaults
    for gene in genes.split(','):
        gene = gene.strip()  # tolerate "GENE1, GENE2" style separators
        if gene in condition_gene_set:
            merged.at[index, 'Gene Match'] = 'Yes'
            merged.at[index, 'Matched_Gene'] = gene
            break  # only the first matching gene of the row is recorded
merged

Unnamed: 0,CHROM,POS,Gene Name,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,ClinVar,ClinVar_CLNSIG,rsid,allele,zygocity,Gene name,consequence,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious,Gene Match,Matched_Gene
0,chr1,1299382.0,ACAP3,rs143128930,AG,A,0/1,97,50,50,24,26,52%,1.7375E-10,18,6,17,9,ADP=50,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,,,,R5,,,,COMMON=1,RS=143128930,,,,,VLD,ASP,,,,,"TOPMED=0.91126720183486238,0.08873279816513761",WGT=1,,,,,,KGPhase3,"CAF=0.8806,0.1194",VC=DIV,,KGPhase1,,VP=0x0500000a000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1299383,,,,,rs143128930,,,,,,,,,,,,,,No,
1,chr1,1331945.0,TAS1R3,rs200330269,G,GC,0/1,48,41,41,26,14,34.15%,1.5388E-5,12,14,2,12,ADP=41,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=137,,,,,,,,COMMON=1,RS=200330269,,,,,VLD,ASP,,,,,"TOPMED=0.82975758154943934,0.17023445463812436...",WGT=1,,,,,,KGPhase3,"CAF=0.8105,0.1895,.",VC=DIV,,KGPhase1,,VP=0x05000008000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1331945,,,,,rs200330269,,,,,,,,,,,,,,No,
2,chr1,1353987.0,MXRA8,rs140777846,CTG,C,0/1,219,70,70,19,51,72.86%,1.2075E-22,12,7,34,17,ADP=70,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,,,rs140777846,,,,,,,,,,,,,,No,
3,chr1,1355779.0,MXRA8,rs201260508,GA,G,0/1,88,29,29,8,21,72.41%,1.2841E-9,7,1,8,13,ADP=29,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=137,,,,,,,,COMMON=0,RS=201260508,,,,,,ASP,,,,,"TOPMED=0.22555109582059123,0.77444890417940876",WGT=1,,,,,,KGPhase3,"CAF=0,1",VC=DIV,,KGPhase1,,VP=0x05000008000500003e000200,SAO=0,INT,,,,SSR=0,RSPOS=1355780,,,,,rs201260508,,,,,,,,,,,,,,No,
4,chr1,1657358.0,CDK11B,rs377230281,T,TA,0/1,78,112,112,88,24,21.43%,1.4999E-8,65,23,4,20,ADP=112,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=138,,,,,,,,,RS=377230281,,,,,,ASP,,,,,"TOPMED=0.98587219673802242,0.01412780326197757",WGT=1,,,,,,,,VC=DIV,,,,VP=0x050000080005000002000200,SAO=0,INT,,,,SSR=0,RSPOS=1657358,,,,,rs377230281,,,,,,,,,,,,,,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4953,,,KAT6B,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs71929101,"['GGAA', G]",Heterozygous,KAT6B,inframe_deletion,Benign/Likely_benign,Genitopatellar_syndrome|not_specified|not_prov...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes,No,
4954,,,KAT6B,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs71929101,"['GGAA', G]",Heterozygous,KAT6B,inframe_deletion,Benign,Genitopatellar_syndrome|not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,Yes,No,
4955,,,GNPTAB,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs76300806,"['AGCC', A]",Heterozygous,GNPTAB,5_prime_UTR_variant,Benign,Mucolipidosis_type_II|Pseudo-Hurler_polydystro...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes,No,
4956,,,EEF1A2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs77467883,"['AG', A]",Homozygous,EEF1A2,intron_variant,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,No,


In [18]:
# Number of rows per value of 'Matched_Gene'.
merged['Matched_Gene'].value_counts()

           4876
POLG         19
NPC1          9
APTX          5
SNCA          4
ASS1          3
LRRK2         3
ERCC4         3
NR4A2         3
RNF168        2
SETX          2
XPA           2
VLDLR         2
TTPA          2
CA8           2
CSTB          2
CYP27A1       2
VPS13C        2
TRPM7         2
IVD           2
PCSK9         2
PDHB          1
TNK2          1
PINK1         1
KCNN2         1
PEX7          1
VPS35         1
ATP7B         1
ATM           1
PHYH          1
Name: Matched_Gene, dtype: int64

# Single Gene

In [42]:
# Set 'Gene_Match' to 'Yes' for every row whose comma-separated 'Gene Name'
# entry contains at least one gene from cond_genes['Gene Name'], else 'No'.

# Build the lookup once; non-string cells (e.g. NaN) can never equal a gene symbol.
condition_gene_set = {
    name.strip() for name in cond_genes['Gene Name'] if isinstance(name, str)
}


def _has_condition_gene(gene_field):
    """Return True when any gene in a comma-separated field is in the gene list.

    Non-string values (missing annotations) never match.
    """
    if not isinstance(gene_field, str):
        return False
    return any(gene.strip() in condition_gene_set for gene in gene_field.split(','))


# One pass over the column: each row is decided from its own value, instead of
# re-masking the whole frame once per row as the previous loop did.
merged['Gene_Match'] = [
    'Yes' if _has_condition_gene(gene_field) else 'No'
    for gene_field in merged['Gene Name']
]

merged

Unnamed: 0,CHROM,POS,Gene Name,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,rsid,allele,zygosity,Gene name,consequence,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious,Gene_Match
0,chr1,1043223.0,AGRN,rs35881187,CCT,C,0/1,93,30,30,8,22,73.33%,4.1351E-10,8,0,21,1,ADP=30,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM,rs35881187,"['CCT', C]",Heterozygous,AGRN,intron_variant,Benign,Congenital_myasthenic_syndrome_8|not_specified...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes,No
1,chr1,1299382.0,ACAP3,rs143128930,AG,A,0/1,66,23,23,7,16,69.57%,2.4726E-7,5,2,11,5,ADP=23,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,,,,R5,,,,COMMON=1,RS=143128930,,,,,VLD,ASP,,,,,"TOPMED=0.91126720183486238,0.08873279816513761",WGT=1,,,,,,KGPhase3,"CAF=0.8806,0.1194",VC=DIV,,KGPhase1,,VP=0x0500000a000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1299383,,,rs143128930,,,,,,,,,,,,,,No
2,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,rs70949570,,,,,,,,,,,,,,No
3,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,rs752605219,,,,,,,,,,,,,,No
4,chr1,1353987.0,MXRA8,rs140777846,CTG,C,0/1,93,59,59,33,26,44.07%,4.2407E-10,20,13,16,10,ADP=59,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,rs140777846,,,,,,,,,,,,,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4913,,,HEXB,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs70976124,"['GC', G]",Homozygous,HEXB,intron_variant,Benign,Sandhoff_disease|not_provided,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,No,No
4914,,,SERAC1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs71027384,"['A', AT]",Heterozygous,SERAC1,intron_variant,Benign,"3-methylglutaconic_aciduria_with_deafness', '_...","criteria_provided', '_multiple_submitters', '_...",germline,indel,Insertion,MayBe No,No,No,No
4915,,,ADAM9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs72459173,"['GAAATCTCAGGACTAGA', G]",Heterozygous,ADAM9,intron_variant,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,No
4916,,,PSAP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs74733861,"['AAGTCTC', A]",Homozygous,PSAP,intron_variant,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,No


In [14]:
# Rename the gene-list columns; labels that are not present are left as they are.
column_renames = {'Gene Name': 'Matched_Gene', 'Gene_Match': 'Gene Match'}
cond_genes = cond_genes.rename(columns=column_renames)
cond_genes

Unnamed: 0,Matched_Gene
0,BRCA1
1,BRCA2
2,PALB2
3,CDKN2A
4,ATM
5,TP53
6,STK11
7,MLH1
8,MSH2
9,MSH6


In [21]:
# Remove the existing 'Gene Match' flag from `merged`, then left-join the gene
# list on 'Matched_Gene' so unmatched rows are kept.
# NOTE(review): the fillna below needs 'Gene Match' to come from cond_genes — confirm
# that column exists there when this cell runs.
merged = merged.drop('Gene Match', axis=1)
merged_2 = merged.merge(cond_genes, how='left', on='Matched_Gene', sort=False)
# Rows without a partner in cond_genes get NaN from the join; label them 'No'.
merged_2['Gene Match'] = merged_2['Gene Match'].fillna(value='No')
merged_2

Unnamed: 0,CHROM,POS,Gene Name,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,ClinVar,ClinVar_CLNSIG,rsid,allele,zygocity,Gene name,consequence,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious,Matched_Gene,Gene Match
0,chr1,1299382.0,ACAP3,rs143128930,AG,A,0/1,97,50,50,24,26,52%,1.7375E-10,18,6,17,9,ADP=50,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,,,,R5,,,,COMMON=1,RS=143128930,,,,,VLD,ASP,,,,,"TOPMED=0.91126720183486238,0.08873279816513761",WGT=1,,,,,,KGPhase3,"CAF=0.8806,0.1194",VC=DIV,,KGPhase1,,VP=0x0500000a000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1299383,,,,,rs143128930,,,,,,,,,,,,,,,No
1,chr1,1331945.0,TAS1R3,rs200330269,G,GC,0/1,48,41,41,26,14,34.15%,1.5388E-5,12,14,2,12,ADP=41,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=137,,,,,,,,COMMON=1,RS=200330269,,,,,VLD,ASP,,,,,"TOPMED=0.82975758154943934,0.17023445463812436...",WGT=1,,,,,,KGPhase3,"CAF=0.8105,0.1895,.",VC=DIV,,KGPhase1,,VP=0x05000008000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1331945,,,,,rs200330269,,,,,,,,,,,,,,,No
2,chr1,1353987.0,MXRA8,rs140777846,CTG,C,0/1,219,70,70,19,51,72.86%,1.2075E-22,12,7,34,17,ADP=70,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,,,rs140777846,,,,,,,,,,,,,,,No
3,chr1,1355779.0,MXRA8,rs201260508,GA,G,0/1,88,29,29,8,21,72.41%,1.2841E-9,7,1,8,13,ADP=29,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=137,,,,,,,,COMMON=0,RS=201260508,,,,,,ASP,,,,,"TOPMED=0.22555109582059123,0.77444890417940876",WGT=1,,,,,,KGPhase3,"CAF=0,1",VC=DIV,,KGPhase1,,VP=0x05000008000500003e000200,SAO=0,INT,,,,SSR=0,RSPOS=1355780,,,,,rs201260508,,,,,,,,,,,,,,,No
4,chr1,1657358.0,CDK11B,rs377230281,T,TA,0/1,78,112,112,88,24,21.43%,1.4999E-8,65,23,4,20,ADP=112,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=138,,,,,,,,,RS=377230281,,,,,,ASP,,,,,"TOPMED=0.98587219673802242,0.01412780326197757",WGT=1,,,,,,,,VC=DIV,,,,VP=0x050000080005000002000200,SAO=0,INT,,,,SSR=0,RSPOS=1657358,,,,,rs377230281,,,,,,,,,,,,,,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4953,,,KAT6B,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs71929101,"['GGAA', G]",Heterozygous,KAT6B,inframe_deletion,Benign/Likely_benign,Genitopatellar_syndrome|not_specified|not_prov...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes,,No
4954,,,KAT6B,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs71929101,"['GGAA', G]",Heterozygous,KAT6B,inframe_deletion,Benign,Genitopatellar_syndrome|not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,Yes,,No
4955,,,GNPTAB,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs76300806,"['AGCC', A]",Heterozygous,GNPTAB,5_prime_UTR_variant,Benign,Mucolipidosis_type_II|Pseudo-Hurler_polydystro...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes,,No
4956,,,EEF1A2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs77467883,"['AG', A]",Homozygous,EEF1A2,intron_variant,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,,No


In [43]:
# Derive a 'Consequence' label from 'consequence': cast to text, turn '&' into
# ',', turn '_' and '-' into spaces, drop single quotes, then keep only the
# part before the first comma.
consequence_text = merged['consequence'].astype(str)
for old, new in (('&', ','), ('_', ' '), ("'", ''), ('-', ' ')):
    # Literal (non-regex) replacement, applied in the same order as before.
    consequence_text = consequence_text.str.replace(old, new, regex=False)
merged['Consequence'] = consequence_text.str.split(',').str[0]
merged

Unnamed: 0,CHROM,POS,Gene Name,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,rsid,allele,zygosity,Gene name,consequence,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious,Gene_Match,Consequence
0,chr1,1043223.0,AGRN,rs35881187,CCT,C,0/1,93,30,30,8,22,73.33%,4.1351E-10,8,0,21,1,ADP=30,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM,rs35881187,"['CCT', C]",Heterozygous,AGRN,intron_variant,Benign,Congenital_myasthenic_syndrome_8|not_specified...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes,No,intron variant
1,chr1,1299382.0,ACAP3,rs143128930,AG,A,0/1,66,23,23,7,16,69.57%,2.4726E-7,5,2,11,5,ADP=23,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,,,,R5,,,,COMMON=1,RS=143128930,,,,,VLD,ASP,,,,,"TOPMED=0.91126720183486238,0.08873279816513761",WGT=1,,,,,,KGPhase3,"CAF=0.8806,0.1194",VC=DIV,,KGPhase1,,VP=0x0500000a000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1299383,,,rs143128930,,,,,,,,,,,,,,No,
2,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,rs70949570,,,,,,,,,,,,,,No,
3,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,rs752605219,,,,,,,,,,,,,,No,
4,chr1,1353987.0,MXRA8,rs140777846,CTG,C,0/1,93,59,59,33,26,44.07%,4.2407E-10,20,13,16,10,ADP=59,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,rs140777846,,,,,,,,,,,,,,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4913,,,HEXB,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs70976124,"['GC', G]",Homozygous,HEXB,intron_variant,Benign,Sandhoff_disease|not_provided,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,No,No,intron variant
4914,,,SERAC1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs71027384,"['A', AT]",Heterozygous,SERAC1,intron_variant,Benign,"3-methylglutaconic_aciduria_with_deafness', '_...","criteria_provided', '_multiple_submitters', '_...",germline,indel,Insertion,MayBe No,No,No,No,intron variant
4915,,,ADAM9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs72459173,"['GAAATCTCAGGACTAGA', G]",Heterozygous,ADAM9,intron_variant,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,No,intron variant
4916,,,PSAP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs74733861,"['AAGTCTC', A]",Homozygous,PSAP,intron_variant,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,No,intron variant


In [44]:
# Load the consequence table and align its key column name with
# merged['Consequence'] so the two frames can be joined on it.
consequence_scores_xlsx = r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/consequence.xlsx'
df_1 = pd.read_excel(consequence_scores_xlsx).rename(columns={'consequence': 'Consequence'})
df_1

Unnamed: 0,Consequence,Consequence_score
0,transcript ablation,10/10
1,splice acceptor variant,8/10
2,splice donor variant,8/10
3,stop gained,10/10
4,frameshift variant,10/10
5,stop lost,9/10
6,start lost,9/10
7,transcript amplification,8/10
8,inframe insertion,6/10
9,inframe deletion,6/10


In [45]:
# Left-join df_1 onto the variants by 'Consequence'; every row of `merged`
# is kept and the original row order is not re-sorted.
merged_1 = merged.merge(df_1, how='left', on='Consequence', sort=False)
merged_1

Unnamed: 0,CHROM,POS,Gene Name,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,rsid,allele,zygosity,Gene name,consequence,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious,Gene_Match,Consequence,Consequence_score
0,chr1,1043223.0,AGRN,rs35881187,CCT,C,0/1,93,30,30,8,22,73.33%,4.1351E-10,8,0,21,1,ADP=30,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM,rs35881187,"['CCT', C]",Heterozygous,AGRN,intron_variant,Benign,Congenital_myasthenic_syndrome_8|not_specified...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes,No,intron variant,2/10
1,chr1,1299382.0,ACAP3,rs143128930,AG,A,0/1,66,23,23,7,16,69.57%,2.4726E-7,5,2,11,5,ADP=23,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,,,,R5,,,,COMMON=1,RS=143128930,,,,,VLD,ASP,,,,,"TOPMED=0.91126720183486238,0.08873279816513761",WGT=1,,,,,,KGPhase3,"CAF=0.8806,0.1194",VC=DIV,,KGPhase1,,VP=0x0500000a000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1299383,,,rs143128930,,,,,,,,,,,,,,No,,
2,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,rs70949570,,,,,,,,,,,,,,No,,
3,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,rs752605219,,,,,,,,,,,,,,No,,
4,chr1,1353987.0,MXRA8,rs140777846,CTG,C,0/1,93,59,59,33,26,44.07%,4.2407E-10,20,13,16,10,ADP=59,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,rs140777846,,,,,,,,,,,,,,No,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4913,,,HEXB,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs70976124,"['GC', G]",Homozygous,HEXB,intron_variant,Benign,Sandhoff_disease|not_provided,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,No,No,intron variant,2/10
4914,,,SERAC1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs71027384,"['A', AT]",Heterozygous,SERAC1,intron_variant,Benign,"3-methylglutaconic_aciduria_with_deafness', '_...","criteria_provided', '_multiple_submitters', '_...",germline,indel,Insertion,MayBe No,No,No,No,intron variant,2/10
4915,,,ADAM9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs72459173,"['GAAATCTCAGGACTAGA', G]",Heterozygous,ADAM9,intron_variant,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,No,intron variant,2/10
4916,,,PSAP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs74733861,"['AAGTCTC', A]",Homozygous,PSAP,intron_variant,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No,No,intron variant,2/10


In [46]:
# Display the column labels of merged_1.
merged_1.columns

Index(['CHROM', 'POS', 'Gene Name', 'rsID', 'REF', 'ALT', 'GT', 'GQ', 'SDP',
       'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'ADP',
       'WT', 'HET', 'HOM', 'NC', 'CDA', 'OTH', 'S3D', 'WTD', 'dbSNPBuildID',
       'SLO', 'NSF', 'R3', 'R5', 'NSN', 'NSM', 'G5A', 'COMMON', 'RS', 'RV',
       'TPA', 'CFL', 'GNO', 'VLD', 'ASP', 'ASS', 'Ref', 'U3', 'U5', 'TOPMED',
       'WGT', 'MTP', 'LSD', 'NOC', 'DSS', 'SYN', 'KGPhase3', 'CAF', 'VC',
       'MUT', 'KGPhase1', 'NOV', 'VP', 'SAO', 'INT', 'G5', 'OM', 'PMC', 'SSR',
       'RSPOS', 'HD', 'PM', 'rsid', 'allele', 'zygosity', 'Gene name',
       'consequence', 'clinical_significance', 'associated_diseases',
       'review_status', 'origin', 'variant_type', 'variant_subtype',
       'Phargkb_ann_exists', 'is_mutation', 'Variant_is_precious',
       'Gene_Match', 'Consequence', 'Consequence_score'],
      dtype='object')

In [47]:
# Columns to keep in merged_1, in the order they should appear.
report_columns = [
    'CHROM', 'POS', 'Gene Name', 'Gene_Match', 'rsID', 'REF', 'ALT', 'GT',
    'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF',
    'ADR', 'ADP', 'WT', 'HET', 'HOM', 'NC', 'CDA', 'OTH', 'S3D', 'WTD',
    'dbSNPBuildID', 'SLO', 'NSF', 'R3', 'R5', 'NSN', 'NSM', 'G5A', 'COMMON',
    'RS', 'RV', 'TPA', 'CFL', 'GNO', 'VLD', 'ASP', 'ASS', 'Ref', 'U3', 'U5',
    'TOPMED', 'WGT', 'MTP', 'LSD', 'NOC', 'DSS', 'SYN', 'KGPhase3', 'CAF',
    'VC', 'MUT', 'KGPhase1', 'NOV', 'VP', 'SAO', 'INT', 'G5', 'OM', 'PMC',
    'SSR', 'RSPOS', 'HD', 'PM',
    'allele', 'zygosity', 'consequence', 'Consequence_score',
    'clinical_significance', 'associated_diseases', 'review_status',
    'origin', 'variant_type', 'variant_subtype', 'Phargkb_ann_exists',
    'is_mutation', 'Variant_is_precious',
]
# Selecting with a list both drops the unlisted columns and reorders the rest.
merged_1 = merged_1[report_columns]
merged_1

Unnamed: 0,CHROM,POS,Gene Name,Gene_Match,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,allele,zygosity,consequence,Consequence_score,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious
0,chr1,1043223.0,AGRN,No,rs35881187,CCT,C,0/1,93,30,30,8,22,73.33%,4.1351E-10,8,0,21,1,ADP=30,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM,"['CCT', C]",Heterozygous,intron_variant,2/10,Benign,Congenital_myasthenic_syndrome_8|not_specified...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes
1,chr1,1299382.0,ACAP3,No,rs143128930,AG,A,0/1,66,23,23,7,16,69.57%,2.4726E-7,5,2,11,5,ADP=23,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,,,,R5,,,,COMMON=1,RS=143128930,,,,,VLD,ASP,,,,,"TOPMED=0.91126720183486238,0.08873279816513761",WGT=1,,,,,,KGPhase3,"CAF=0.8806,0.1194",VC=DIV,,KGPhase1,,VP=0x0500000a000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1299383,,,,,,,,,,,,,,,
2,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",No,rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,,,,,,,,,,,,,
3,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",No,rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,,,,,,,,,,,,,
4,chr1,1353987.0,MXRA8,No,rs140777846,CTG,C,0/1,93,59,59,33,26,44.07%,4.2407E-10,20,13,16,10,ADP=59,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4913,,,HEXB,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"['GC', G]",Homozygous,intron_variant,2/10,Benign,Sandhoff_disease|not_provided,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,No
4914,,,SERAC1,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"['A', AT]",Heterozygous,intron_variant,2/10,Benign,"3-methylglutaconic_aciduria_with_deafness', '_...","criteria_provided', '_multiple_submitters', '_...",germline,indel,Insertion,MayBe No,No,No
4915,,,ADAM9,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"['GAAATCTCAGGACTAGA', G]",Heterozygous,intron_variant,2/10,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No
4916,,,PSAP,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"['AAGTCTC', A]",Homozygous,intron_variant,2/10,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No


In [48]:
# Replace every missing value in merged_1 with the literal text 'NA'.
merged_1 = merged_1.fillna(value='NA')
merged_1

Unnamed: 0,CHROM,POS,Gene Name,Gene_Match,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,allele,zygosity,consequence,Consequence_score,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious
0,chr1,1043223.0,AGRN,No,rs35881187,CCT,C,0/1,93,30,30,8,22,73.33%,4.1351E-10,8,0,21,1,ADP=30,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=126,SLO,,,,,,,COMMON=1,RS=35881187,RV,,,GNO,VLD,ASP,,,,,"TOPMED=0.55968081039755351,0.44031918960244648",WGT=1,,,,,,KGPhase3,"CAF=0.5208,0.4792",VC=DIV,,KGPhase1,,VP=0x05012808000515013e000200,SAO=0,INT,G5,,PMC,SSR=0,RSPOS=1043224,,PM,"['CCT', C]",Heterozygous,intron_variant,2/10,Benign,Congenital_myasthenic_syndrome_8|not_specified...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes
1,chr1,1299382.0,ACAP3,No,rs143128930,AG,A,0/1,66,23,23,7,16,69.57%,2.4726E-7,5,2,11,5,ADP=23,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,,,,R5,,,,COMMON=1,RS=143128930,,,,,VLD,ASP,,,,,"TOPMED=0.91126720183486238,0.08873279816513761",WGT=1,,,,,,KGPhase3,"CAF=0.8806,0.1194",VC=DIV,,KGPhase1,,VP=0x0500000a000515003e000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1299383,,,,,,,,,,,,,,,
2,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",No,rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,,,,,,,,,,,,,
3,chr1,1312198.0,"INTS11,MIR6727,CPSF3L",No,rs70949570;rs752605219,T,TGGGGG,0/1,29,19,19,2,7,36.84%,1.1312E-3,0,2,7,0,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,"dbSNPBuildID=144,130",SLO,,R3,,,,,,"RS=752605219,70949570",,,,GNO,VLD,ASP,,,,,"TOPMED=0.42793546126401630,0.00001592762487257...","WGT=1,1",,,,,,,,"VC=DIV,DIV",,,NOV,"VP=0x0500000c0005040002000204,0x0501000c000500...","SAO=0,0",INT,,,,"SSR=0,0","RSPOS=1312198,1312208",,,,,,,,,,,,,,,
4,chr1,1353987.0,MXRA8,No,rs140777846,CTG,C,0/1,93,59,59,33,26,44.07%,4.2407E-10,20,13,16,10,ADP=59,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4913,,,HEXB,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"['GC', G]",Homozygous,intron_variant,2/10,Benign,Sandhoff_disease|not_provided,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,No
4914,,,SERAC1,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"['A', AT]",Heterozygous,intron_variant,2/10,Benign,"3-methylglutaconic_aciduria_with_deafness', '_...","criteria_provided', '_multiple_submitters', '_...",germline,indel,Insertion,MayBe No,No,No
4915,,,ADAM9,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"['GAAATCTCAGGACTAGA', G]",Heterozygous,intron_variant,2/10,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No
4916,,,PSAP,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"['AAGTCTC', A]",Homozygous,intron_variant,2/10,Benign,not_provided,"criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No


In [49]:
# Write merged_1 to an Excel workbook without the DataFrame index column.
indel_report_xlsx = r'C:/Users/GenepoweRx_Madhu/Downloads/InDel_files/KHHSPTGPCSP16_InDel_final.xlsx'
merged_1.to_excel(indel_report_xlsx, index=False)

# Multiple Genes: gene-wise analysis

In [None]:
# Flag every row of `merged_2` whose comma-separated 'Gene Name' entry contains
# a gene listed in df_gene['Gene Name'], and record the first such gene.
# NOTE(review): df_gene is not defined in this part of the notebook — confirm it
# is loaded before this cell runs.
merged_2['Gene Match'] = 'No'
merged_2['Matched_Gene'] = ''

# Build the lookup once; a set avoids re-scanning the whole column for every
# candidate gene. Non-string cells (e.g. NaN) can never equal a gene symbol.
listed_gene_set = {
    name.strip() for name in df_gene['Gene Name'] if isinstance(name, str)
}

# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for index, genes in merged_2['Gene Name'].items():
    if not isinstance(genes, str):
        continue  # missing gene annotation: keep the 'No' / '' defaults
    for gene in genes.split(','):
        gene = gene.strip()  # tolerate "GENE1, GENE2" style separators
        if gene in listed_gene_set:
            merged_2.at[index, 'Gene Match'] = 'Yes'
            merged_2.at[index, 'Matched_Gene'] = gene
            break  # only the first matching gene of the row is recorded

merged_2