In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [2]:
def read_bed_file(bed_file, zero_based_half_open=False):
    """Expand the intervals of a BED file into a set of ``(chrom, pos)`` tuples.

    Lines that start with ``#`` are skipped, as are lines with fewer than
    three tab-separated fields or whose start/end columns are not integers.

    Parameters
    ----------
    bed_file : str or path-like
        Path of the tab-separated BED file to read.
    zero_based_half_open : bool, default False
        How the start/end columns are interpreted.

        * ``False`` (default, the historical behaviour of this function):
          every integer from ``start`` to ``end`` inclusive is emitted.
        * ``True``: the interval is treated as 0-based, half-open (the
          standard BED convention), so the emitted 1-based positions are
          ``start + 1 .. end``.

    Returns
    -------
    set of (str, int)
        One tuple per covered position. Every position is materialised, so
        memory use grows with the total length of all intervals.
    """
    # With 0-based half-open input the first covered 1-based position is start + 1.
    first_offset = 1 if zero_based_half_open else 0

    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            chrom = fields[0]
            try:
                start = int(fields[1])
                end = int(fields[2])
            except ValueError:
                continue  # Skip this line if start or end position is not an integer
            bed_positions.update(
                (chrom, pos) for pos in range(start + first_offset, end + 1)
            )
    return bed_positions

def normalize_chrom_name(chrom):
    """Return the part of ``chrom`` that precedes its first underscore.

    A name without an underscore is returned unchanged.
    """
    base_name, _, _ = chrom.partition('_')
    return base_name

def filter_vcf_file(vcf_file, bed_positions):
    """Collect the lines of a VCF file whose (chromosome, POS) is in ``bed_positions``.

    Every line starting with ``#`` is kept unconditionally. A data line is
    kept when its first column (reduced by ``normalize_chrom_name``) together
    with its integer second column is a member of ``bed_positions``. Data
    lines with fewer than two tab-separated fields, or with a non-integer
    second column, are dropped.

    Returns the kept lines as a list of strings, in file order and with their
    original line endings.
    """
    # NOTE(review): the chromosome is normalised only on the VCF side, so a
    # name such as 'chr1_xxx' is looked up as 'chr1' — confirm that is intended.
    kept_lines = []
    with open(vcf_file, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith('#'):
                # Header/meta lines always pass through
                kept_lines.append(raw_line)
                continue

            columns = raw_line.strip().split('\t')
            if len(columns) < 2:
                continue

            contig = normalize_chrom_name(columns[0])
            try:
                position = int(columns[1])
            except ValueError:
                # POS is not an integer: ignore the record
                continue

            if (contig, position) in bed_positions:
                kept_lines.append(raw_line)
    return kept_lines

def write_filtered_vcf(filtered_vcf_records, output_file):
    """Write the given record strings to ``output_file``, replacing any existing file.

    The records are written back to back; no separator or newline is added.
    """
    with open(output_file, 'w') as out_handle:
        out_handle.writelines(filtered_vcf_records)

def main(
    bed_file=r'C:/Users/GenepoweRx_Madhu/Desktop/Covered_regions.bed',
    vcf_file=r'C:/Users/GenepoweRx_Madhu/Downloads/KHAIGHGPTTL207_annotated_indel.vcf',
    output_file=r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHAIGHGPTTL207_annotated_indel.vcf',
):
    """Filter a VCF file down to the positions covered by a BED file.

    Parameters
    ----------
    bed_file : str
        BED file whose intervals define the positions to keep.
    vcf_file : str
        Input VCF file.
    output_file : str
        Destination of the filtered VCF; an existing file is overwritten.

    The defaults are the paths that used to be hard-coded here, so calling
    ``main()`` without arguments behaves as before. Pass other paths to run
    the same filter on another sample.
    """
    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)
    write_filtered_vcf(filtered_vcf_records, output_file)

if __name__ == "__main__":
    main()


In [9]:
# BED-filtered VCF written by main()
filtered_vcf_path = r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHAIGHGPTTL207_annotated_indel.vcf'

# Names for the ten tab-separated columns of a data line
vcf_fixed_columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
# Names for the colon-separated sub-fields of the SAMPLE column
sample_subfields = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']
# Columns kept for the rest of the notebook (FORMAT, SAMPLE, RBQ and ABQ are dropped)
report_columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL','RDF', 'RDR', 'ADF', 'ADR']

# NOTE(review): comment='#' makes pandas stop parsing a line at the first '#'
# wherever it occurs, not only on header lines — confirm no data field can contain '#'.
vcf = pd.read_csv(filtered_vcf_path, sep='\t', comment='#', header=None, low_memory=False)
vcf.columns = vcf_fixed_columns

# One column per SAMPLE sub-field; the assignment fails if a file does not
# yield exactly len(sample_subfields) pieces.
sample_cols = vcf['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = sample_subfields

# Put the sub-fields next to the fixed columns and keep the report subset
vcf = pd.concat([vcf, sample_cols], axis=1)[report_columns]
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
0,chr1,1299382,rs143128930,AG,A,.,PASS,"ADP=50;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8806,0....",0/1,97,50,50,24,26,52%,1.7375E-10,18,6,17,9
1,chr1,1331945,rs200330269,G,GC,.,PASS,"ADP=41;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8105,0....",0/1,48,41,41,26,14,34.15%,1.5388E-5,12,14,2,12
2,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=70;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",0/1,219,70,70,19,51,72.86%,1.2075E-22,12,7,34,17
3,chr1,1355779,rs201260508,GA,G,.,PASS,"ADP=29;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0,1;COMMO...",0/1,88,29,29,8,21,72.41%,1.2841E-9,7,1,8,13
4,chr1,1657358,rs377230281,T,TA,.,PASS,ADP=112;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=CDK...,0/1,78,112,112,88,24,21.43%,1.4999E-8,65,23,4,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759,chrX,153462077,rs138590468;rs369801831,TA,T,.,PASS,"ADP=40;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5624,0....",0/1,56,40,40,24,16,40%,2.3315E-6,20,4,13,3
2760,chrX,153692201,rs199498083,C,CA,.,PASS,"ADP=79;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8972,0....",0/1,113,79,79,47,32,40.51%,4.4277E-12,30,17,23,9
2761,chrX,153909413,rs11405331;rs34046683;rs869136125,T,TG,.,PASS,"ADP=30;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,.,1,0,1...",1/1,170,30,30,0,30,100%,8.4556E-18,0,0,16,14
2762,chrX,153913005,rs5904376;rs77485258,G,GC,.,PASS,ADP=108;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.000264...,1/1,255,108,108,7,101,93.52%,7.6722E-54,4,3,76,25


In [10]:
# Pull the single-digit HET / HOM flags out of the INFO string
info_text = vcf['INFO']
vcf['HET'] = info_text.str.extract(r'HET=(\d)')
vcf['HOM'] = info_text.str.extract(r'HOM=(\d)')

# Derive a readable label from the flags. HET is applied last, so it wins
# when both flags are '1'; rows with neither flag set keep the empty string.
vcf['Zygosity'] = ''
for flag_column, label in (('HOM', 'Homozygous'), ('HET', 'Heterozygous')):
    vcf.loc[vcf[flag_column] == '1', 'Zygosity'] = label

vcf['GT'] = vcf['GT'].astype(str)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity
0,chr1,1299382,rs143128930,AG,A,.,PASS,"ADP=50;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8806,0....",0/1,97,50,50,24,26,52%,1.7375E-10,18,6,17,9,1,0,Heterozygous
1,chr1,1331945,rs200330269,G,GC,.,PASS,"ADP=41;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8105,0....",0/1,48,41,41,26,14,34.15%,1.5388E-5,12,14,2,12,1,0,Heterozygous
2,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=70;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",0/1,219,70,70,19,51,72.86%,1.2075E-22,12,7,34,17,1,0,Heterozygous
3,chr1,1355779,rs201260508,GA,G,.,PASS,"ADP=29;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0,1;COMMO...",0/1,88,29,29,8,21,72.41%,1.2841E-9,7,1,8,13,1,0,Heterozygous
4,chr1,1657358,rs377230281,T,TA,.,PASS,ADP=112;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=CDK...,0/1,78,112,112,88,24,21.43%,1.4999E-8,65,23,4,20,1,0,Heterozygous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759,chrX,153462077,rs138590468;rs369801831,TA,T,.,PASS,"ADP=40;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5624,0....",0/1,56,40,40,24,16,40%,2.3315E-6,20,4,13,3,1,0,Heterozygous
2760,chrX,153692201,rs199498083,C,CA,.,PASS,"ADP=79;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8972,0....",0/1,113,79,79,47,32,40.51%,4.4277E-12,30,17,23,9,1,0,Heterozygous
2761,chrX,153909413,rs11405331;rs34046683;rs869136125,T,TG,.,PASS,"ADP=30;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,.,1,0,1...",1/1,170,30,30,0,30,100%,8.4556E-18,0,0,16,14,0,1,Homozygous
2762,chrX,153913005,rs5904376;rs77485258,G,GC,.,PASS,ADP=108;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.000264...,1/1,255,108,108,7,101,93.52%,7.6722E-54,4,3,76,25,0,1,Homozygous


In [11]:
# Raw GENEINFO value of each record (NaN when the key is absent).
# The capture stops at the next ';' or at the end of INFO, so GENEINFO is
# also found when it is the last key (the previous pattern required a
# trailing ';' and silently missed that case).
vcf["Gene_Name"] = vcf["INFO"].str.extract(r'GENEINFO=(?P<GENEINFO>[^;]+)', expand=False)

# Keep only the text before ':' of every '|'-separated GENEINFO entry and
# join the results with commas; records without GENEINFO get ''.
vcf['Gene Name'] = vcf['Gene_Name'].apply(
    lambda x: ','.join([segment.split(':')[0] for segment in x.split('|')]) if pd.notnull(x) else ''
)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name
0,chr1,1299382,rs143128930,AG,A,.,PASS,"ADP=50;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8806,0....",0/1,97,50,50,24,26,52%,1.7375E-10,18,6,17,9,1,0,Heterozygous,ACAP3:116983,ACAP3
1,chr1,1331945,rs200330269,G,GC,.,PASS,"ADP=41;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8105,0....",0/1,48,41,41,26,14,34.15%,1.5388E-5,12,14,2,12,1,0,Heterozygous,TAS1R3:83756,TAS1R3
2,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=70;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",0/1,219,70,70,19,51,72.86%,1.2075E-22,12,7,34,17,1,0,Heterozygous,MXRA8:54587,MXRA8
3,chr1,1355779,rs201260508,GA,G,.,PASS,"ADP=29;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0,1;COMMO...",0/1,88,29,29,8,21,72.41%,1.2841E-9,7,1,8,13,1,0,Heterozygous,MXRA8:54587,MXRA8
4,chr1,1657358,rs377230281,T,TA,.,PASS,ADP=112;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=CDK...,0/1,78,112,112,88,24,21.43%,1.4999E-8,65,23,4,20,1,0,Heterozygous,CDK11B:984,CDK11B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759,chrX,153462077,rs138590468;rs369801831,TA,T,.,PASS,"ADP=40;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5624,0....",0/1,56,40,40,24,16,40%,2.3315E-6,20,4,13,3,1,0,Heterozygous,"HAUS7:55559,HAUS7:55559",HAUS7
2760,chrX,153692201,rs199498083,C,CA,.,PASS,"ADP=79;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8972,0....",0/1,113,79,79,47,32,40.51%,4.4277E-12,30,17,23,9,1,0,Heterozygous,SLC6A8:6535,SLC6A8
2761,chrX,153909413,rs11405331;rs34046683;rs869136125,T,TG,.,PASS,"ADP=30;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,.,1,0,1...",1/1,170,30,30,0,30,100%,8.4556E-18,0,0,16,14,0,1,Homozygous,"ARHGAP4:393,ARHGAP4:393,ARHGAP4:393",ARHGAP4
2762,chrX,153913005,rs5904376;rs77485258,G,GC,.,PASS,ADP=108;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.000264...,1/1,255,108,108,7,101,93.52%,7.6722E-54,4,3,76,25,0,1,Homozygous,"ARHGAP4:393,ARHGAP4:393",ARHGAP4


In [12]:
# Full CSQ annotation string of each record (NaN when INFO has no CSQ key).
# The capture ends at the next ';', the INFO key separator, so keys that
# follow CSQ are no longer appended to the last annotation field.
vcf['CSQ'] = vcf['INFO'].str.extract(r'CSQ=([^;]*)', expand=False)

# One row per comma-separated annotation entry. explode() repeats the
# original index label on every row produced from the same record.
vcf['csq'] = vcf['CSQ'].str.split(',')
vcf = vcf.explode('csq')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq
0,chr1,1299382,rs143128930,AG,A,.,PASS,"ADP=50;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8806,0....",0/1,97,50,50,24,26,52%,1.7375E-10,18,6,17,9,1,0,Heterozygous,ACAP3:116983,ACAP3,,
1,chr1,1331945,rs200330269,G,GC,.,PASS,"ADP=41;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8105,0....",0/1,48,41,41,26,14,34.15%,1.5388E-5,12,14,2,12,1,0,Heterozygous,TAS1R3:83756,TAS1R3,,
2,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=70;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",0/1,219,70,70,19,51,72.86%,1.2075E-22,12,7,34,17,1,0,Heterozygous,MXRA8:54587,MXRA8,,
3,chr1,1355779,rs201260508,GA,G,.,PASS,"ADP=29;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0,1;COMMO...",0/1,88,29,29,8,21,72.41%,1.2841E-9,7,1,8,13,1,0,Heterozygous,MXRA8:54587,MXRA8,,
4,chr1,1657358,rs377230281,T,TA,.,PASS,ADP=112;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=CDK...,0/1,78,112,112,88,24,21.43%,1.4999E-8,65,23,4,20,1,0,Heterozygous,CDK11B:984,CDK11B,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759,chrX,153462077,rs138590468;rs369801831,TA,T,.,PASS,"ADP=40;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5624,0....",0/1,56,40,40,24,16,40%,2.3315E-6,20,4,13,3,1,0,Heterozygous,"HAUS7:55559,HAUS7:55559",HAUS7,,
2760,chrX,153692201,rs199498083,C,CA,.,PASS,"ADP=79;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8972,0....",0/1,113,79,79,47,32,40.51%,4.4277E-12,30,17,23,9,1,0,Heterozygous,SLC6A8:6535,SLC6A8,,
2761,chrX,153909413,rs11405331;rs34046683;rs869136125,T,TG,.,PASS,"ADP=30;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,.,1,0,1...",1/1,170,30,30,0,30,100%,8.4556E-18,0,0,16,14,0,1,Homozygous,"ARHGAP4:393,ARHGAP4:393,ARHGAP4:393",ARHGAP4,,
2762,chrX,153913005,rs5904376;rs77485258,G,GC,.,PASS,ADP=108;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.000264...,1/1,255,108,108,7,101,93.52%,7.6722E-54,4,3,76,25,0,1,Homozygous,"ARHGAP4:393,ARHGAP4:393",ARHGAP4,,


In [7]:
# Output column -> 0-based position inside a '|'-separated csq entry.
# Insertion order is the order in which the columns are added to `vcf`.
# NOTE(review): the positions are hard-coded; presumably they follow the
# CSQ "Format:" declaration in this VCF's header — verify whenever the
# annotation settings change.
CSQ_FIELD_INDEX = {
    'ClinVar_CLNDN': 82,
    'CLIN_SIG': 70,
    'ClinVar_CLNREVSTAT': 81,
    'ClinVar': 79,
    'HGVSC': 10,
    'HGVSP': 11,
    'PolyPhen': 38,
    'BIOTYPE': 7,
    'EXON': 8,
    'INTRON': 9,
    'Protein_position': 14,
    'Amino_acids': 15,
    'Codons': 16,
    'STRAND': 19,
    'PUBMED': 73,
    'Consequence': 1,
    'IMPACT': 2,
    'SIFT': 37,
    'AF': 42,
    'AFR_AF': 43,
    'AMR_AF': 44,
    'EAS_AF': 45,
    'EUR_AF': 46,
    'SAS_AF': 47,
    'gnomADe_AF': 48,
    'gnomADe_AFR_AF': 49,
    'gnomADe_AMR_AF': 50,
    'gnomADe_ASJ_AF': 51,
    'gnomADe_EAS_AF': 52,
    'gnomADe_FIN_AF': 53,
    'gnomADe_NFE_AF': 54,
    'gnomADe_OTH_AF': 55,
    'gnomADe_SAS_AF': 56,
    'gnomADg_AF': 57,
    'gnomADg_AFR_AF': 58,
    'gnomADg_AMI_AF': 59,
    'gnomADg_AMR_AF': 60,
    'gnomADg_ASJ_AF': 61,
    'gnomADg_EAS_AF': 62,
    'gnomADg_FIN_AF': 63,
    'gnomADg_MID_AF': 64,
    'gnomADg_NFE_AF': 65,
    'gnomADg_OTH_AF': 66,
    'gnomADg_SAS_AF': 67,
    'MAX_AF': 68,
    'MAX_AF_POPS': 69,
}

# Split every annotation entry once and reuse the pieces for all columns,
# instead of re-splitting the whole column for each extracted field.
csq_parts = vcf['csq'].str.split('|')
for column_name, field_index in CSQ_FIELD_INDEX.items():
    # .str[i] yields NaN where the entry is missing or has too few fields
    vcf[column_name] = csq_parts.str[field_index]
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS
0,chr1,1299382,rs143128930,AG,A,.,PASS,"ADP=50;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8806,0....",0/1,97,50,50,24,26,52%,1.7375E-10,18,6,17,9,1,0,Heterozygous,ACAP3:116983,ACAP3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,chr1,1331945,rs200330269,G,GC,.,PASS,"ADP=41;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8105,0....",0/1,48,41,41,26,14,34.15%,1.5388E-5,12,14,2,12,1,0,Heterozygous,TAS1R3:83756,TAS1R3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,chr1,1353987,rs140777846,CTG,C,.,PASS,"ADP=70;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.1793,0....",0/1,219,70,70,19,51,72.86%,1.2075E-22,12,7,34,17,1,0,Heterozygous,MXRA8:54587,MXRA8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,chr1,1355779,rs201260508,GA,G,.,PASS,"ADP=29;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0,1;COMMO...",0/1,88,29,29,8,21,72.41%,1.2841E-9,7,1,8,13,1,0,Heterozygous,MXRA8:54587,MXRA8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,chr1,1657358,rs377230281,T,TA,.,PASS,ADP=112;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=CDK...,0/1,78,112,112,88,24,21.43%,1.4999E-8,65,23,4,20,1,0,Heterozygous,CDK11B:984,CDK11B,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759,chrX,153462077,rs138590468;rs369801831,TA,T,.,PASS,"ADP=40;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5624,0....",0/1,56,40,40,24,16,40%,2.3315E-6,20,4,13,3,1,0,Heterozygous,"HAUS7:55559,HAUS7:55559",HAUS7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2760,chrX,153692201,rs199498083,C,CA,.,PASS,"ADP=79;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8972,0....",0/1,113,79,79,47,32,40.51%,4.4277E-12,30,17,23,9,1,0,Heterozygous,SLC6A8:6535,SLC6A8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2761,chrX,153909413,rs11405331;rs34046683;rs869136125,T,TG,.,PASS,"ADP=30;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,.,1,0,1...",1/1,170,30,30,0,30,100%,8.4556E-18,0,0,16,14,0,1,Homozygous,"ARHGAP4:393,ARHGAP4:393,ARHGAP4:393",ARHGAP4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2762,chrX,153913005,rs5904376;rs77485258,G,GC,.,PASS,ADP=108;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.000264...,1/1,255,108,108,7,101,93.52%,7.6722E-54,4,3,76,25,0,1,Homozygous,"ARHGAP4:393,ARHGAP4:393",ARHGAP4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [8]:
# Literature workbook; 'Chrom-pos-Ref-Alt_38' holds comma-separated entries
literature_path = r'C:/Users/GenepoweRx_Madhu/Downloads/Cerebellarataxia_Lit_final_Positions_hg38_hg37.xlsx'
x = pd.read_excel(literature_path)

# One row per comma-separated entry of the column
x['chrom'] = x['Chrom-pos-Ref-Alt_38'].str.split(',')
x = x.explode('chrom')

# The chromosome label is the text before the first '-' of an entry
entry_parts = x['chrom'].str.split('-')
x['CHROM'] = entry_parts.str[0]

# Function to add 'chr' prefix conditionally
def add_chr_prefix(chrom):
    """Return ``chrom`` as a 'chr'-prefixed chromosome label.

    * Null values (``None``/NaN) and blank strings are returned unchanged.
    * Surrounding whitespace is removed before the prefix is added, so
      ``' 7'`` becomes ``'chr7'`` rather than ``'chr 7'``.
    * A value that already starts with ``'chr'`` is not prefixed again.
    * Non-string values are converted with ``str()`` first.
    """
    if pd.isnull(chrom):
        return chrom
    label = str(chrom).strip()
    if label == '':
        return chrom
    if label.startswith('chr'):
        return label
    return 'chr' + label

# Prefix the chromosome labels, then remove every whitespace character.
# regex=True is explicit because Series.str.replace treats the pattern as a
# literal string by default from pandas 2.0 on, which would turn the
# whitespace removal into a no-op.
x['CHROM'] = x['CHROM'].apply(add_chr_prefix)
x['CHROM'] = x['CHROM'].str.strip().str.replace(r'\s+', '', regex=True)

# The position is the text between the first and second '-' of an entry
x['POS'] = x['chrom'].str.split('-').str[1].str.strip()

# Blank strings are not NaN: convert them first, otherwise dropna() keeps
# them and the int64 cast below fails on ''.
for column_name in ('CHROM', 'POS'):
    x[column_name] = x[column_name].replace('', np.nan)
x = x.dropna(subset=['CHROM', 'POS'])

# .copy() gives an independent frame, so the assignments below do not act on
# a slice of `x` (avoids SettingWithCopyWarning).
df_3 = x[['CHROM', 'POS']].copy()
df_3['Literature'] = 'Yes'
# A position is a duplicate only when the chromosome matches too; de-duplicating
# on POS alone discarded positions shared by different chromosomes.
df_3 = df_3.drop_duplicates(subset=['CHROM', 'POS'])
df_3['POS'] = df_3['POS'].astype('int64')
df_3 = df_3.reset_index(drop=True)
df_3

Unnamed: 0,CHROM,POS,Literature
0,chrX,38352177,Yes
1,chrX,38352183,Yes
2,chrX,38352181,Yes
3,chrX,38352256,Yes
4,chrX,38352332,Yes
...,...,...,...
1882,chr7,107915692,Yes
1883,chr7,107916864,Yes
1884,chr7,107915532,Yes
1885,chr7,107917349,Yes


In [12]:
# Save df_3 to an Excel workbook; index=False leaves the row index out of the file.
df_3.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Cerebellarataxia.xlsx', index=False)

In [10]:
# Rebind df_3 to the contents of another workbook. This is not the file written
# by the to_excel() call in this notebook, so the df_3 built from the literature
# sheet is discarded at this point.
# NOTE(review): presumably a merged Parkinson + cerebellar-ataxia position list
# prepared outside this notebook — confirm its provenance.
df_3 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/parkin_Cerebellarataxia_lit_pos.xlsx')
df_3

Unnamed: 0,CHROM,POS,Literature
0,chr1,7962863,Parkinson
1,chr1,7984930,Parkinson
2,chr1,7984981,Parkinson
3,chr1,7984954,Parkinson
4,chr1,7970951,Parkinson
...,...,...,...
3146,chr7,107915692,Cerebellarataxia
3147,chr7,107916864,Cerebellarataxia
3148,chr7,107915532,Cerebellarataxia
3149,chr7,107917349,Cerebellarataxia


In [49]:
# Load the gene list workbook into df_gene.
# NOTE(review): the displayed output has the columns 'Gene Name' and
# 'Gene_Match'; presumably meant to be joined to `vcf` on 'Gene Name' — confirm.
df_gene = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Park_cere_genes.xlsx')
df_gene

Unnamed: 0,Gene Name,Gene_Match
0,MYO6,Parkinson
1,GBA1,Parkinson
2,PRKN,Parkinson
3,MRE11,Parkinson
4,LRRK2,Parkinson
...,...,...
79,CLN1,Cerebellarataxia
80,CLN2,Cerebellarataxia
81,CLN3,Cerebellarataxia
82,PHYH,Cerebellarataxia


In [12]:
# Build a compact protein-change label: <first character of Amino_acids>
# <Protein_position> <last character of Amino_acids>. The trailing character
# is omitted when it equals the leading one (e.g. 'S' + '290' -> 'S290').
amino_acids = vcf['Amino_acids']
first_residue = amino_acids.str[0]
last_residue = amino_acids.str[-1]

# '' where both ends are the same character, otherwise the last character
changed_residue = np.where(last_residue == first_residue, '', last_residue)

vcf['Protein Position and Amino Acid'] = first_residue + vcf['Protein_position'] + changed_residue
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,Protein Position and Amino Acid
0,chr1,69511,rs2691305,A,G,.,PASS,ADP=120;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,0,1,Homozygous,OR4F5:79501,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,,,,,ENST00000641515.2:c.484A>G,ENSP00000493376.2:p.Thr162Ala,benign(0),protein_coding,3/3,,162,T/A,Aca/Gca,1,,missense_variant,MODERATE,tolerated(0.92),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg_EAS,T162A
1,chr1,69897,rs200676709,T,C,.,PASS,"ADP=98;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,1,0,Heterozygous,OR4F5:79501,OR4F5,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,,,,,ENST00000641515.2:c.870T>C,ENSP00000493376.2:p.Ser290%3D,,protein_coding,3/3,,290,S,tcT/tcC,1,,synonymous_variant,LOW,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,S290
2,chr1,685694,.,T,C,.,PASS,ADP=28;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,1,0,Heterozygous,,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,,,,,,,,protein_coding,,,,,,-1,,downstream_gene_variant,MODIFIER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,chr1,685694,.,T,C,.,PASS,ADP=28;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,1,0,Heterozygous,,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|intron_variant&non_coding_transcript_variant...,,,,,ENST00000419394.2:n.480+17991A>G,,,processed_transcript,,3/3,,,,-1,,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,chr1,685694,.,T,C,.,PASS,ADP=28;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,1,0,Heterozygous,,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|intron_variant&non_coding_transcript_variant...,,,,,ENST00000440200.5:n.169+34338A>G,,,processed_transcript,,1/2,,,,-1,,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38700,chrX,156003433,rs2037999,T,C,.,PASS,"ADP=11;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.4079,0....",1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,0,1,Homozygous,IL9R:3581,IL9R,C|splice_polypyrimidine_tract_variant&intron_v...,C|regulatory_region_variant|MODIFIER|||Regulat...,,,,,,,,enhancer,,,,,,,,regulatory_region_variant,MODIFIER,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe_AFR,
38701,chrX,156009937,rs147385831,G,A,.,PASS,ADP=20;WT=0;HET=1;HOM=0;NC=0;ASP;CFL;GENEINFO=...,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,1,0,Heterozygous,IL9R:3581,IL9R,A|missense_variant|MODERATE|IL9R|ENSG000001243...,A|missense_variant|MODERATE|IL9R|ENSG000001243...,not_provided,benign,criteria_provided&_single_submitter,771064,ENST00000244174.11:c.1094G>A,ENSP00000244174.5:p.Arg365His,benign(0),protein_coding,9/9,,365,R/H,cGt/cAt,1,,missense_variant,MODERATE,tolerated(0.56),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg_FIN,R365H
38701,chrX,156009937,rs147385831,G,A,.,PASS,ADP=20;WT=0;HET=1;HOM=0;NC=0;ASP;CFL;GENEINFO=...,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,1,0,Heterozygous,IL9R:3581,IL9R,A|missense_variant|MODERATE|IL9R|ENSG000001243...,A|3_prime_UTR_variant|MODIFIER|IL9R|ENSG000001...,not_provided,benign,criteria_provided&_single_submitter,771064,ENST00000369423.7:c.*85G>A,,,protein_coding,9/9,,,,,1,,3_prime_UTR_variant,MODIFIER,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg_FIN,
38701,chrX,156009937,rs147385831,G,A,.,PASS,ADP=20;WT=0;HET=1;HOM=0;NC=0;ASP;CFL;GENEINFO=...,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,1,0,Heterozygous,IL9R:3581,IL9R,A|missense_variant|MODERATE|IL9R|ENSG000001243...,A|downstream_gene_variant|MODIFIER|WASIR1|ENSG...,not_provided,benign,criteria_provided&_single_submitter,771064,,,,lncRNA,,,,,,-1,,downstream_gene_variant,MODIFIER,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg_FIN,


In [13]:
# Split each HGVS string at its first ':' into two columns.
# `n` is passed by keyword: a positional split limit is rejected by
# Series.str.split on pandas >= 2.0, while the keyword form works on old and
# new versions alike.
# NOTE(review): the text before ':' goes to 'HGVSc' / 'HGVSp' and the text
# after it to '... (Transcript)'. In the displayed output that puts the
# ENST/ENSP accession under 'HGVSc' / 'HGVSp' — confirm the labels are
# intended that way round.
vcf[['HGVSc', 'HGVSc (Transcript)']] = vcf['HGVSC'].str.split(':', n=1, expand=True)
vcf[['HGVSp', 'HGVSp (Transcript)']] = vcf['HGVSP'].str.split(':', n=1, expand=True)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,HET,HOM,Zygosity,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,Protein Position and Amino Acid,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript)
0,chr1,69511,rs2691305,A,G,.,PASS,ADP=120;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,0,1,Homozygous,OR4F5:79501,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,,,,,ENST00000641515.2:c.484A>G,ENSP00000493376.2:p.Thr162Ala,benign(0),protein_coding,3/3,,162,T/A,Aca/Gca,1,,missense_variant,MODERATE,tolerated(0.92),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg_EAS,T162A,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala
1,chr1,69897,rs200676709,T,C,.,PASS,"ADP=98;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,1,0,Heterozygous,OR4F5:79501,OR4F5,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,,,,,ENST00000641515.2:c.870T>C,ENSP00000493376.2:p.Ser290%3D,,protein_coding,3/3,,290,S,tcT/tcC,1,,synonymous_variant,LOW,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,S290,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D
2,chr1,685694,.,T,C,.,PASS,ADP=28;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,1,0,Heterozygous,,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,,,,,,,,protein_coding,,,,,,-1,,downstream_gene_variant,MODIFIER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,chr1,685694,.,T,C,.,PASS,ADP=28;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,1,0,Heterozygous,,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|intron_variant&non_coding_transcript_variant...,,,,,ENST00000419394.2:n.480+17991A>G,,,processed_transcript,,3/3,,,,-1,,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ENST00000419394.2,n.480+17991A>G,,
2,chr1,685694,.,T,C,.,PASS,ADP=28;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,1,0,Heterozygous,,,C|downstream_gene_variant|MODIFIER|OR4F16|ENSG...,C|intron_variant&non_coding_transcript_variant...,,,,,ENST00000440200.5:n.169+34338A>G,,,processed_transcript,,1/2,,,,-1,,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ENST00000440200.5,n.169+34338A>G,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38700,chrX,156003433,rs2037999,T,C,.,PASS,"ADP=11;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.4079,0....",1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,0,1,Homozygous,IL9R:3581,IL9R,C|splice_polypyrimidine_tract_variant&intron_v...,C|regulatory_region_variant|MODIFIER|||Regulat...,,,,,,,,enhancer,,,,,,,,regulatory_region_variant,MODIFIER,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe_AFR,,,,,
38701,chrX,156009937,rs147385831,G,A,.,PASS,ADP=20;WT=0;HET=1;HOM=0;NC=0;ASP;CFL;GENEINFO=...,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,1,0,Heterozygous,IL9R:3581,IL9R,A|missense_variant|MODERATE|IL9R|ENSG000001243...,A|missense_variant|MODERATE|IL9R|ENSG000001243...,not_provided,benign,criteria_provided&_single_submitter,771064,ENST00000244174.11:c.1094G>A,ENSP00000244174.5:p.Arg365His,benign(0),protein_coding,9/9,,365,R/H,cGt/cAt,1,,missense_variant,MODERATE,tolerated(0.56),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg_FIN,R365H,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His
38701,chrX,156009937,rs147385831,G,A,.,PASS,ADP=20;WT=0;HET=1;HOM=0;NC=0;ASP;CFL;GENEINFO=...,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,1,0,Heterozygous,IL9R:3581,IL9R,A|missense_variant|MODERATE|IL9R|ENSG000001243...,A|3_prime_UTR_variant|MODIFIER|IL9R|ENSG000001...,not_provided,benign,criteria_provided&_single_submitter,771064,ENST00000369423.7:c.*85G>A,,,protein_coding,9/9,,,,,1,,3_prime_UTR_variant,MODIFIER,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg_FIN,,ENST00000369423.7,c.*85G>A,,
38701,chrX,156009937,rs147385831,G,A,.,PASS,ADP=20;WT=0;HET=1;HOM=0;NC=0;ASP;CFL;GENEINFO=...,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,1,0,Heterozygous,IL9R:3581,IL9R,A|missense_variant|MODERATE|IL9R|ENSG000001243...,A|downstream_gene_variant|MODIFIER|WASIR1|ENSG...,not_provided,benign,criteria_provided&_single_submitter,771064,,,,lncRNA,,,,,,-1,,downstream_gene_variant,MODIFIER,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg_FIN,,,,,


In [14]:
# Report columns, grouped by topic; the concatenation order below is the
# column order of vcf_final.
variant_columns = ['Gene Name', 'rsID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity', 'Consequence', 'IMPACT']
clinvar_columns = ['ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar']
hgvs_columns = ['HGVSc', 'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)']
genotype_columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF', 'ADR']
prediction_columns = ['SIFT', 'PolyPhen']
frequency_columns = [
    'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
    'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF',
    'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF', 'gnomADe_OTH_AF',
    'gnomADe_SAS_AF',
    'gnomADg_AF', 'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF',
    'gnomADg_ASJ_AF', 'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF',
    'gnomADg_NFE_AF', 'gnomADg_OTH_AF', 'gnomADg_SAS_AF',
    'MAX_AF', 'MAX_AF_POPS',
]
transcript_columns = ['BIOTYPE', 'EXON', 'INTRON', 'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED']

final_columns = (
    variant_columns
    + clinvar_columns
    + hgvs_columns
    + genotype_columns
    + prediction_columns
    + frequency_columns
    + transcript_columns
)

# Column subset of `vcf` used for the report
vcf_final = vcf[final_columns]
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense_variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg_EAS,protein_coding,3/3,,T162A,Aca/Gca,1,
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous_variant,LOW,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein_coding,3/3,,S290,tcT/tcC,1,
2,,.,chr1,685694,T,C,Heterozygous,downstream_gene_variant,MODIFIER,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein_coding,,,,,-1,
2,,.,chr1,685694,T,C,Heterozygous,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed_transcript,,3/3,,,-1,
2,,.,chr1,685694,T,C,Heterozygous,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed_transcript,,1/2,,,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38700,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory_region_variant,MODIFIER,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe_AFR,enhancer,,,,,,
38701,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense_variant,MODERATE,not_provided,benign,criteria_provided&_single_submitter,771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg_FIN,protein_coding,9/9,,R365H,cGt/cAt,1,
38701,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3_prime_UTR_variant,MODIFIER,not_provided,benign,criteria_provided&_single_submitter,771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg_FIN,protein_coding,9/9,,,,1,
38701,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream_gene_variant,MODIFIER,not_provided,benign,criteria_provided&_single_submitter,771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg_FIN,lncRNA,,,,,-1,


In [15]:
# Placeholder terms to strip from the '&'-separated ClinVar annotation fields.
remove_terms = set(["not_specified", "not_provided"])


def _strip_placeholder_terms(value):
    """Drop placeholder terms from one '&'-separated annotation value.

    Non-string values (e.g. NaN) are returned unchanged. A string made up
    only of placeholder terms is also returned unchanged, so a field whose
    sole content is "not_provided" keeps that value instead of becoming empty.
    """
    if not isinstance(value, str):
        return value
    kept_terms = [term for term in value.split("&") if term not in remove_terms]
    # An empty result means every term was a placeholder: keep the original.
    return "&".join(kept_terms) if kept_terms else value


# The same clean-up applies to each of these ClinVar-derived columns.
clinvar_columns = ['ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT']

# assign() builds a new frame (column order is preserved for existing columns)
# rather than writing onto a frame that may be a selection of `vcf`.
vcf_final = vcf_final.assign(
    **{column: vcf_final[column].apply(_strip_placeholder_terms) for column in clinvar_columns}
)

# Display the cleaned DataFrame
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense_variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg_EAS,protein_coding,3/3,,T162A,Aca/Gca,1,
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous_variant,LOW,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein_coding,3/3,,S290,tcT/tcC,1,
2,,.,chr1,685694,T,C,Heterozygous,downstream_gene_variant,MODIFIER,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein_coding,,,,,-1,
2,,.,chr1,685694,T,C,Heterozygous,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed_transcript,,3/3,,,-1,
2,,.,chr1,685694,T,C,Heterozygous,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed_transcript,,1/2,,,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38700,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory_region_variant,MODIFIER,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe_AFR,enhancer,,,,,,
38701,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense_variant,MODERATE,not_provided,benign,criteria_provided&_single_submitter,771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg_FIN,protein_coding,9/9,,R365H,cGt/cAt,1,
38701,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3_prime_UTR_variant,MODIFIER,not_provided,benign,criteria_provided&_single_submitter,771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg_FIN,protein_coding,9/9,,,,1,
38701,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream_gene_variant,MODIFIER,not_provided,benign,criteria_provided&_single_submitter,771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg_FIN,lncRNA,,,,,-1,


In [16]:
# Make annotation strings human-readable: '&' separators become ',' and
# '_' becomes a space. Every column is first cast to str, so numeric
# columns are strings from this point on.
# NOTE(review): this also rewrites '_' inside the HGVSc/HGVSp columns, where
# HGVS nomenclature uses '_' to denote ranges, and inside CHROM -- confirm
# that is intended before relying on those columns downstream.
separator_table = str.maketrans({'&': ',', '_': ' '})
vcf_final = vcf_final.astype(str).applymap(lambda cell: cell.translate(separator_table))
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,
2,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,MODIFIER,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,
2,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,3/3,,,-1,
2,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,1/2,,,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38700,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory region variant,MODIFIER,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe AFR,enhancer,,,,,,
38701,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,R365H,cGt/cAt,1,
38701,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3 prime UTR variant,MODIFIER,not provided,benign,"criteria provided, single submitter",771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,,,1,
38701,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,lncRNA,,,,,-1,


In [17]:
# Lower-case 'consequence' holds only the first term of each comma-separated
# 'Consequence' value; it is the key used to look up Consequence_score.
consequence_terms = vcf_final['Consequence'].str.split(',')
vcf_final['consequence'] = consequence_terms.str.get(0)
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,,missense variant
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,,synonymous variant
2,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,MODIFIER,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,downstream gene variant
2,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,3/3,,,-1,,intron variant
2,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,1/2,,,-1,,intron variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38700,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory region variant,MODIFIER,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe AFR,enhancer,,,,,,,regulatory region variant
38701,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,R365H,cGt/cAt,1,,missense variant
38701,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3 prime UTR variant,MODIFIER,not provided,benign,"criteria provided, single submitter",771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,,,1,,3 prime UTR variant
38701,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,lncRNA,,,,,-1,,downstream gene variant


In [18]:
# Lookup table with one row per consequence term and its 'Consequence_score'.
# NOTE(review): absolute, machine-specific path -- the notebook only runs
# where this file exists at exactly this location.
consequence_scores_path = r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/consequence.xlsx'
df_1 = pd.read_excel(consequence_scores_path)
df_1

Unnamed: 0,consequence,Consequence_score
0,transcript ablation,10/10
1,splice acceptor variant,8/10
2,splice donor variant,8/10
3,stop gained,10/10
4,frameshift variant,10/10
5,stop lost,9/10
6,start lost,9/10
7,transcript amplification,8/10
8,inframe insertion,6/10
9,inframe deletion,6/10


In [19]:
# Attach Consequence_score to every variant row. A left join keeps all rows of
# vcf_final; terms missing from df_1 get NaN. sort=False keeps the left order.
# NOTE(review): if df_1 lists a consequence term more than once, the matching
# variant rows are duplicated by this join -- confirm the lookup keys are unique.
merged_1 = vcf_final.merge(df_1, how='left', on='consequence', sort=False)
merged_1

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence,Consequence_score
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,,missense variant,7/10
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,,synonymous variant,3/10
2,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,MODIFIER,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,downstream gene variant,2/10
3,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,3/3,,,-1,,intron variant,2/10
4,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,1/2,,,-1,,intron variant,2/10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340652,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory region variant,MODIFIER,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe AFR,enhancer,,,,,,,regulatory region variant,2/10
340653,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,R365H,cGt/cAt,1,,missense variant,7/10
340654,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3 prime UTR variant,MODIFIER,not provided,benign,"criteria provided, single submitter",771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,,,1,,3 prime UTR variant,3/10
340655,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,lncRNA,,,,,-1,,downstream gene variant,2/10


In [74]:
# Lookup table mapping each IMPACT category to its numeric 'IMPACT_score'.
# NOTE(review): absolute, machine-specific path -- the notebook only runs
# where this file exists at exactly this location.
impact_scores_path = r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/IMPACT.xlsx'
df_2 = pd.read_excel(impact_scores_path)
df_2

Unnamed: 0,IMPACT,IMPACT_score
0,HIGH,10.0
1,MODERATE,5.0
2,LOW,2.5
3,MODIFIER,1.5


In [75]:
# Attach IMPACT_score to every row of merged_1. A left join keeps all rows;
# IMPACT values missing from df_2 get NaN. sort=False keeps the left order.
merged_2 = merged_1.merge(df_2, how='left', on='IMPACT', sort=False)
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence,Consequence_score,IMPACT_score
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,MODERATE,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,,missense variant,7/10,5.0
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,LOW,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,,synonymous variant,3/10,2.5
2,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,MODIFIER,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,downstream gene variant,2/10,1.5
3,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,3/3,,,-1,,intron variant,2/10,1.5
4,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,1/2,,,-1,,intron variant,2/10,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340652,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory region variant,MODIFIER,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe AFR,enhancer,,,,,,,regulatory region variant,2/10,1.5
340653,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense variant,MODERATE,not provided,benign,"criteria provided, single submitter",771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,R365H,cGt/cAt,1,,missense variant,7/10,5.0
340654,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3 prime UTR variant,MODIFIER,not provided,benign,"criteria provided, single submitter",771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,,,1,,3 prime UTR variant,3/10,1.5
340655,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream gene variant,MODIFIER,not provided,benign,"criteria provided, single submitter",771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,lncRNA,,,,,-1,,downstream gene variant,2/10,1.5


In [76]:
# Final column order: each score sits next to the field it was derived from.
# The lower-case 'consequence' merge key is not listed, so it is dropped here.
scored_columns = [
    'Gene Name', 'rsID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
    'Consequence', 'Consequence_score', 'IMPACT', 'IMPACT_score', 'ClinVar_CLNDN', 'CLIN_SIG',
    'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp',
    'HGVSp (Transcript)', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ',
    'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT', 'PolyPhen', 'AF', 'AFR_AF',
    'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomADe_AF', 'gnomADe_AFR_AF',
    'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF',
    'gnomADe_NFE_AF', 'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'gnomADg_AF',
    'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF', 'gnomADg_ASJ_AF',
    'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF', 'gnomADg_NFE_AF',
    'gnomADg_OTH_AF', 'gnomADg_SAS_AF', 'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE',
    'EXON', 'INTRON', 'Protein Position and Amino Acid', 'Codons', 'STRAND',
    'PUBMED',
]

# .copy() makes merged_2 an independent frame, so the columns added to it in
# the following cell are not writes onto a selection (SettingWithCopyWarning).
merged_2 = merged_2[scored_columns].copy()
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,
2,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,
3,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,3/3,,,-1,
4,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,1/2,,,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340652,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe AFR,enhancer,,,,,,
340653,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,not provided,benign,"criteria provided, single submitter",771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,R365H,cGt/cAt,1,
340654,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3 prime UTR variant,3/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,,,1,
340655,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,lncRNA,,,,,-1,


In [77]:
# Load the disease gene panel from the Excel workbook. The saved output shows
# one row per gene with a 'Gene Name' column and a 'Gene_Match' disease label.
gene_panel_path = r'C:/Users/GenepoweRx_Madhu/Desktop/Park_cere_genes.xlsx'
df_gene = pd.read_excel(gene_panel_path)
df_gene

Unnamed: 0,Gene Name,Gene_Match
0,MYO6,Parkinson
1,GBA1,Parkinson
2,PRKN,Parkinson
3,MRE11,Parkinson
4,LRRK2,Parkinson
...,...,...
79,CLN1,Cerebellarataxia
80,CLN2,Cerebellarataxia
81,CLN3,Cerebellarataxia
82,PHYH,Cerebellarataxia


In [78]:
# Flag every variant row whose annotated gene(s) belong to the disease gene panel:
#   'Gene Match'   -> 'Yes' when at least one gene of the row is in the panel, else 'No'
#   'Matched_Gene' -> the first panel gene found in the row, else ''
# The panel is materialised once as a set so each membership test is O(1)
# instead of rescanning df_gene['Gene Name'] for every gene of every row.
panel_genes = set(df_gene['Gene Name'].dropna())


def _first_panel_gene(genes):
    """Return the first comma-separated gene in `genes` that is in the panel.

    Returns None when `genes` is not a string (e.g. a missing value) or when
    none of its comma-separated entries is in `panel_genes`.
    """
    if not isinstance(genes, str):
        return None
    # NOTE(review): entries are compared verbatim (no whitespace stripping), so
    # 'A, B' would not match a panel gene 'B' — confirm the 'Gene Name' format.
    return next((gene for gene in genes.split(',') if gene in panel_genes), None)


# A single pass over the column replaces the former Series.iteritems() loop:
# iteritems() was removed in pandas 2.0, and per-cell .at writes are slow on a
# frame of this size. Values are assigned positionally, one per row.
panel_hits = [_first_panel_gene(genes) for genes in merged_2['Gene Name']]
merged_2['Gene Match'] = ['No' if hit is None else 'Yes' for hit in panel_hits]
merged_2['Matched_Gene'] = ['' if hit is None else hit for hit in panel_hits]

merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Gene Match,Matched_Gene
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,,No,
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,,No,
2,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,No,
3,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,3/3,,,-1,,No,
4,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,1/2,,,-1,,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340652,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe AFR,enhancer,,,,,,,No,
340653,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,not provided,benign,"criteria provided, single submitter",771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,R365H,cGt/cAt,1,,No,
340654,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3 prime UTR variant,3/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,,,1,,No,
340655,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,lncRNA,,,,,-1,,No,


In [84]:
# Relabel the panel columns so they line up with the names used on merged_2:
# the gene column becomes 'Matched_Gene' and the disease label 'Gene Match'.
panel_column_labels = {'Gene Name': 'Matched_Gene', 'Gene_Match': 'Gene Match'}
df_gene = df_gene.rename(columns=panel_column_labels)
df_gene

Unnamed: 0,Matched_Gene,Gene Match
0,MYO6,Parkinson
1,GBA1,Parkinson
2,PRKN,Parkinson
3,MRE11,Parkinson
4,LRRK2,Parkinson
...,...,...
79,CLN1,Cerebellarataxia
80,CLN2,Cerebellarataxia
81,CLN3,Cerebellarataxia
82,PHYH,Cerebellarataxia


In [56]:
# Inspect the column labels currently present on merged_2.
# NOTE(review): this cell's execution count (56) is lower than that of the
# surrounding cells, so its saved output may be stale — re-run to confirm.
merged_2.columns

Index(['Gene Name', 'rsID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
       'Consequence', 'Consequence_score', 'IMPACT', 'IMPACT_score',
       'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc',
       'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'GT', 'GQ', 'SDP',
       'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT',
       'PolyPhen', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
       'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF',
       'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF', 'gnomADe_OTH_AF',
       'gnomADe_SAS_AF', 'gnomADg_AF', 'gnomADg_AFR_AF', 'gnomADg_AMI_AF',
       'gnomADg_AMR_AF', 'gnomADg_ASJ_AF', 'gnomADg_EAS_AF', 'gnomADg_FIN_AF',
       'gnomADg_MID_AF', 'gnomADg_NFE_AF', 'gnomADg_OTH_AF', 'gnomADg_SAS_AF',
       'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE', 'EXON', 'INTRON',
       'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED',
       'Gene_Match', 'Matched_Ge

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Gene_Match,Matched_Gene,Gene Match_x,Gene Match_y,Gene Match
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,,No,,,,
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,,No,,,,
2,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,No,,,,
3,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,3/3,,,-1,,No,,,,
4,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,1/2,,,-1,,No,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340652,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe AFR,enhancer,,,,,,,No,,,,
340653,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,not provided,benign,"criteria provided, single submitter",771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,R365H,cGt/cAt,1,,No,,,,
340654,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3 prime UTR variant,3/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,,,1,,No,,,,
340655,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,lncRNA,,,,,-1,,No,,,,


In [79]:
# Show only the variant rows that were flagged as hitting a panel gene.
is_panel_hit = merged_2['Gene Match'].eq('Yes')
merged_2.loc[is_panel_hit]

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Gene Match,Matched_Gene
3631,PARK7,rs2640906,chr1,7969449,G,A,Homozygous,intron variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",1267406,ENST00000338639.10,c.252+45G>A,,,1/1,126,27,25,1,24,96%,2.0568E-13,1,0,13,11,,,0.3323,0.3472,0.3689,0.6508,0.163,0.1319,0.2549,0.3386,0.4742,0.08915,0.6243,0.2772,0.1654,0.2089,0.1092,0.2623,0.3352,0.06872,0.352,0.09267,0.645,0.3337,0.1007,0.1774,0.2253,0.1582,0.6508,EAS,protein coding,,4/6,,,1,,Yes,PARK7
3632,PARK7,rs2640906,chr1,7969449,G,A,Homozygous,intron variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",1267406,ENST00000377488.5,c.252+45G>A,,,1/1,126,27,25,1,24,96%,2.0568E-13,1,0,13,11,,,0.3323,0.3472,0.3689,0.6508,0.163,0.1319,0.2549,0.3386,0.4742,0.08915,0.6243,0.2772,0.1654,0.2089,0.1092,0.2623,0.3352,0.06872,0.352,0.09267,0.645,0.3337,0.1007,0.1774,0.2253,0.1582,0.6508,EAS,protein coding,,4/6,,,1,,Yes,PARK7
3633,PARK7,rs2640906,chr1,7969449,G,A,Homozygous,intron variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",1267406,ENST00000377491.5,c.252+45G>A,,,1/1,126,27,25,1,24,96%,2.0568E-13,1,0,13,11,,,0.3323,0.3472,0.3689,0.6508,0.163,0.1319,0.2549,0.3386,0.4742,0.08915,0.6243,0.2772,0.1654,0.2089,0.1092,0.2623,0.3352,0.06872,0.352,0.09267,0.645,0.3337,0.1007,0.1774,0.2253,0.1582,0.6508,EAS,protein coding,,4/6,,,1,,Yes,PARK7
3634,PARK7,rs2640906,chr1,7969449,G,A,Homozygous,intron variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",1267406,ENST00000377493.9,c.193-1445G>A,,,1/1,126,27,25,1,24,96%,2.0568E-13,1,0,13,11,,,0.3323,0.3472,0.3689,0.6508,0.163,0.1319,0.2549,0.3386,0.4742,0.08915,0.6243,0.2772,0.1654,0.2089,0.1092,0.2623,0.3352,0.06872,0.352,0.09267,0.645,0.3337,0.1007,0.1774,0.2253,0.1582,0.6508,EAS,protein coding,,3/5,,,1,,Yes,PARK7
3635,PARK7,rs2640906,chr1,7969449,G,A,Homozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",1267406,ENST00000460192.5,n.412+45G>A,,,1/1,126,27,25,1,24,96%,2.0568E-13,1,0,13,11,,,0.3323,0.3472,0.3689,0.6508,0.163,0.1319,0.2549,0.3386,0.4742,0.08915,0.6243,0.2772,0.1654,0.2089,0.1092,0.2623,0.3352,0.06872,0.352,0.09267,0.645,0.3337,0.1007,0.1774,0.2253,0.1582,0.6508,EAS,retained intron,,5/5,,,1,,Yes,PARK7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339719,ABCD1,rs11803,chrX,153744612,C,T,Homozygous,TF binding site variant,2/10,MODIFIER,1.5,Adrenoleukodystrophy,benign,"criteria provided, single submitter",368066,,,,,1/1,255,68,68,0,68,100%,1.6809E-40,0,0,46,22,,,0.7311,0.8784,0.7309,0.6008,0.6149,0.7883,,,,,,,,,,0.6966,0.8398,0.5541,0.6935,0.5757,0.6118,0.6302,0.6667,0.6329,0.662,0.7649,0.8784,AFR,,,,,,-1,,Yes,ABCD1
339720,ABCD1,rs1055847,chrX,153744629,G,A,Heterozygous,3 prime UTR variant,3/10,MODIFIER,1.5,Adrenoleukodystrophy,benign,"criteria provided, single submitter",368067,ENST00000218104.6,c.*894G>A,,,0/1,122,58,58,26,32,55.17%,5.693E-13,19,7,20,12,,,0.4599,0.1944,0.6069,0.4921,0.5196,0.6253,,,,,,,,,,0.4496,0.2174,0.5332,0.5703,0.5165,0.5175,0.5434,0.5805,0.5324,0.457,0.606,0.6253,SAS,protein coding,10/10,,,,1,,Yes,ABCD1
339721,ABCD1,rs1055847,chrX,153744629,G,A,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,Adrenoleukodystrophy,benign,"criteria provided, single submitter",368067,ENST00000434284.1,n.72-6051C>T,,,0/1,122,58,58,26,32,55.17%,5.693E-13,19,7,20,12,,,0.4599,0.1944,0.6069,0.4921,0.5196,0.6253,,,,,,,,,,0.4496,0.2174,0.5332,0.5703,0.5165,0.5175,0.5434,0.5805,0.5324,0.457,0.606,0.6253,SAS,lncRNA,,1/2,,,-1,,Yes,ABCD1
339722,ABCD1,rs1055847,chrX,153744629,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,Adrenoleukodystrophy,benign,"criteria provided, single submitter",368067,,,,,0/1,122,58,58,26,32,55.17%,5.693E-13,19,7,20,12,,,0.4599,0.1944,0.6069,0.4921,0.5196,0.6253,,,,,,,,,,0.4496,0.2174,0.5332,0.5703,0.5165,0.5175,0.5434,0.5805,0.5324,0.457,0.606,0.6253,SAS,protein coding CDS not defined,,,,,1,,Yes,ABCD1


In [81]:
# Inspect merged_2's column labels; the saved output shows 'Gene Match' and
# the start of 'Matched_Gene' at the end of the index.
merged_2.columns

Index(['Gene Name', 'rsID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
       'Consequence', 'Consequence_score', 'IMPACT', 'IMPACT_score',
       'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc',
       'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'GT', 'GQ', 'SDP',
       'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT',
       'PolyPhen', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
       'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF',
       'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF', 'gnomADe_OTH_AF',
       'gnomADe_SAS_AF', 'gnomADg_AF', 'gnomADg_AFR_AF', 'gnomADg_AMI_AF',
       'gnomADg_AMR_AF', 'gnomADg_ASJ_AF', 'gnomADg_EAS_AF', 'gnomADg_FIN_AF',
       'gnomADg_MID_AF', 'gnomADg_NFE_AF', 'gnomADg_OTH_AF', 'gnomADg_SAS_AF',
       'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE', 'EXON', 'INTRON',
       'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED',
       'Gene Match', 'Matched_Ge

In [83]:
# Remove the 'Gene Match' flag column from merged_2; 'Matched_Gene' is kept.
merged_2 = merged_2.drop(labels='Gene Match', axis='columns')
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Matched_Gene
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,,
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,,
2,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,
3,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,3/3,,,-1,,
4,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,1/2,,,-1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340652,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe AFR,enhancer,,,,,,,
340653,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,not provided,benign,"criteria provided, single submitter",771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,R365H,cGt/cAt,1,,
340654,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3 prime UTR variant,3/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,,,1,,
340655,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,lncRNA,,,,,-1,,


In [85]:
# Attach the disease label of the matched panel gene to every variant row.
# Any 'Gene Match' column already on merged_2 is dropped first so the cell can
# be re-run safely: merging a frame that already carries 'Gene Match' would
# yield 'Gene Match_x' / 'Gene Match_y' and make the fillna line below fail.
# NOTE(review): a left merge repeats a variant row for every df_gene row that
# shares its 'Matched_Gene' — confirm each gene appears only once in df_gene.
merged_2 = pd.merge(
    merged_2.drop(columns=['Gene Match'], errors='ignore'),
    df_gene,
    on='Matched_Gene',
    how='left',
    sort=False,
)
# Rows whose 'Matched_Gene' has no partner in df_gene get a missing label;
# mark them explicitly as 'No'.
merged_2['Gene Match'] = merged_2['Gene Match'].fillna('No')
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Matched_Gene,Gene Match
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,,,No
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,,,No
2,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,,No
3,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,3/3,,,-1,,,No
4,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,1/2,,,-1,,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340652,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe AFR,enhancer,,,,,,,,No
340653,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,not provided,benign,"criteria provided, single submitter",771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,R365H,cGt/cAt,1,,,No
340654,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3 prime UTR variant,3/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,,,1,,,No
340655,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,lncRNA,,,,,-1,,,No


In [86]:
# Count how many variant rows carry each 'Gene Match' label.
gene_match_labels = merged_2['Gene Match']
gene_match_labels.value_counts()

No                  337941
Cerebellarataxia      1873
Parkinson              843
Name: Gene Match, dtype: int64

In [87]:
# Display df_3; the saved output shows CHROM, POS and Literature columns,
# with Literature holding a disease label per position.
df_3

Unnamed: 0,CHROM,POS,Literature
0,chr1,7962863,Parkinson
1,chr1,7984930,Parkinson
2,chr1,7984981,Parkinson
3,chr1,7984954,Parkinson
4,chr1,7970951,Parkinson
...,...,...,...
3146,chr7,107915692,Cerebellarataxia
3147,chr7,107916864,Cerebellarataxia
3148,chr7,107915532,Cerebellarataxia
3149,chr7,107917349,Cerebellarataxia


In [92]:
# Cast POS to int64 on both frames so the (CHROM, POS) join keys share a dtype.
for frame in (merged_2, df_3):
    frame['POS'] = frame['POS'].astype('int64')

# Report the dtype of each join key column, frame by frame.
for frame_label, frame in (('merged_2', merged_2), ('df_3', df_3)):
    for key_column in ('CHROM', 'POS'):
        print(f"{frame_label}['{key_column}'] data type:", frame[key_column].dtype)

merged_2['CHROM'] data type: object
merged_2['POS'] data type: int64
df_3['CHROM'] data type: object
df_3['POS'] data type: int64


In [93]:
# Left-join the df_3 labels onto the variants by (CHROM, POS); variants with
# no matching position end up with Literature == 'No'.
merged_3 = (
    merged_2
    .merge(df_3, how='left', on=['CHROM', 'POS'], sort=False)
    .assign(Literature=lambda frame: frame['Literature'].fillna('No'))
)
merged_3

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Matched_Gene,Gene Match,Literature
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,,,No,No
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,,,No,No
2,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,,No,No
3,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,3/3,,,-1,,,No,No
4,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,1/2,,,-1,,,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340652,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe AFR,enhancer,,,,,,,,No,No
340653,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,not provided,benign,"criteria provided, single submitter",771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,R365H,cGt/cAt,1,,,No,No
340654,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3 prime UTR variant,3/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,,,1,,,No,No
340655,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,lncRNA,,,,,-1,,,No,No


In [94]:
# Count how many variant rows carry each 'Literature' label.
literature_labels = merged_3['Literature']
literature_labels.value_counts()

No                  340591
Parkinson               40
Cerebellarataxia        26
Name: Literature, dtype: int64

In [95]:
# Inspect merged_3's column labels before reordering them in the next step.
merged_3.columns

Index(['Gene Name', 'rsID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
       'Consequence', 'Consequence_score', 'IMPACT', 'IMPACT_score',
       'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc',
       'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'GT', 'GQ', 'SDP',
       'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT',
       'PolyPhen', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
       'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF',
       'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF', 'gnomADe_OTH_AF',
       'gnomADe_SAS_AF', 'gnomADg_AF', 'gnomADg_AFR_AF', 'gnomADg_AMI_AF',
       'gnomADg_AMR_AF', 'gnomADg_ASJ_AF', 'gnomADg_EAS_AF', 'gnomADg_FIN_AF',
       'gnomADg_MID_AF', 'gnomADg_NFE_AF', 'gnomADg_OTH_AF', 'gnomADg_SAS_AF',
       'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE', 'EXON', 'INTRON',
       'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED',
       'Matched_Gene', 'Gene Mat

In [96]:
# Final column layout: the gene, its panel label, the rsID and the literature
# label come first, followed by the remaining annotation columns.
# 'Matched_Gene' is not listed and is therefore dropped from merged_3.
report_columns = [
    'Gene Name', 'Gene Match', 'rsID', 'Literature',
    'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
    'Consequence', 'Consequence_score', 'IMPACT', 'IMPACT_score',
    'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar',
    'HGVSc', 'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)',
    'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL',
    'RDF', 'RDR', 'ADF', 'ADR',
    'SIFT', 'PolyPhen',
    'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
    'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF',
    'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF', 'gnomADe_OTH_AF',
    'gnomADe_SAS_AF',
    'gnomADg_AF', 'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF',
    'gnomADg_ASJ_AF', 'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF',
    'gnomADg_NFE_AF', 'gnomADg_OTH_AF', 'gnomADg_SAS_AF',
    'MAX_AF', 'MAX_AF_POPS',
    'BIOTYPE', 'EXON', 'INTRON',
    'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED',
]
merged_3 = merged_3.loc[:, report_columns]
merged_3

Unnamed: 0,Gene Name,Gene Match,rsID,Literature,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,OR4F5,No,rs2691305,No,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,
1,OR4F5,No,rs200676709,No,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,
2,,No,.,No,chr1,685694,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,
3,,No,.,No,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,3/3,,,-1,
4,,No,.,No,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,1/2,,,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340652,IL9R,No,rs2037999,No,chrX,156003433,T,C,Homozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe AFR,enhancer,,,,,,
340653,IL9R,No,rs147385831,No,chrX,156009937,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,not provided,benign,"criteria provided, single submitter",771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,R365H,cGt/cAt,1,
340654,IL9R,No,rs147385831,No,chrX,156009937,G,A,Heterozygous,3 prime UTR variant,3/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,,,1,
340655,IL9R,No,rs147385831,No,chrX,156009937,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,lncRNA,,,,,-1,


In [97]:
# Write the trimmed report to an Excel workbook, leaving out the row index.
processed_report_path = r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/KHAIGHGPTTL207_depth_vcf_processed.xlsx'
merged_3.to_excel(processed_report_path, index=False)

# Literature check

In [115]:
# Read the MODY covered-list workbook into `data` and preview it.
mody_covered_list_path = r'C:/Users/GenepoweRx_Madhu/Downloads/MODY_covered_list.xlsx'
data = pd.read_excel(mody_covered_list_path)
data

Unnamed: 0,samples,GENE,CHROM,POS,ID
0,12652713,,chr16,31779,rs141542623
1,12652713,TUBB8,chr10,49286,rs6560829
2,17751397,TUBB8,chr10,49286,rs6560829
3,17751406,TUBB8,chr10,49286,rs6560829
4,12652700,TUBB8,chr10,49286,rs6560829
...,...,...,...,...,...
165543,17751406,ZNF692,chr1,248855917,rs13313088
165544,17751406,ZNF692,chr1,248856006,rs13313009
165545,17751397,PGBD2,chr1,248913954,rs12025760
165546,17751397,PGBD2,chr1,248916897,rs74157349


In [127]:
# Load the covered-region BED file (tab-separated, no header row).
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is its replacement and keeps the same behaviour:
# malformed rows are skipped and a warning is emitted for each of them.
df = pd.read_csv(
    r'C:/Users/GenepoweRx_Madhu/Downloads/Depth_files/srinivas_sir_covered.bed',
    sep='\t',
    header=None,
    on_bad_lines='warn',
)
df.columns = ['chromosome', 'Start_pos', 'End_pos']
df

Unnamed: 0,chromosome,Start_pos,End_pos
0,chr1,65489,65645
1,chr1,65811,65993
2,chr1,69461,69620
3,chr1,785981,786159
4,chr1,786130,786446
...,...,...,...
230714,chrY,57190028,57190328
230715,chrY,57190299,57190439
230716,chrY,57190874,57191014
230717,chrY,57191846,57192058


In [101]:
# Widen every interval by a fixed number of bases on both sides.
# The padded start is clipped at 0 so an interval that begins within
# PADDING_BP of the start of a chromosome never gets a negative coordinate.
# NOTE(review): the rendered output below shows an INFO column, so this cell
# was last run against a different `df` than the three-column one loaded in
# the preceding cell - confirm which BED file is intended.
PADDING_BP = 20
df['Extended_Start_pos'] = (df['Start_pos'] - PADDING_BP).clip(lower=0)
df['Extended_End_pos'] = df['End_pos'] + PADDING_BP
df

Unnamed: 0,chromosome,Start_pos,End_pos,INFO,Extended_Start_pos,Extended_End_pos
0,chr1,65509,65629,ensembl_gene_id=ENSG00000186092;gene_symbol=OR4F5,65489,65649
1,chr1,69027,70017,ccds_id=CCDS30547.1;ensembl_gene_id=ENSG000001...,69007,70037
2,chr1,450730,451686,ccds_id=CCDS72675.1;ensembl_gene_id=ENSG000002...,450710,451706
3,chr1,685706,686662,ccds_id=CCDS41221.1;ensembl_gene_id=ENSG000002...,685686,686682
4,chr1,924421,924957,ensembl_gene_id=ENSG00000187634;gene_symbol=SA...,924401,924977
...,...,...,...,...,...,...
208906,chrY,25038801,25038921,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25038781,25038941
208907,chrY,25041766,25041886,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25041746,25041906
208908,chrY,25043908,25044028,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25043888,25044048
208909,chrY,25622433,25624073,"ccds_id=CCDS14801.1,CCDS14802.1;ensembl_gene_i...",25622413,25624093


In [102]:
# Capture the text after 'gene_symbol=' (up to the next ';') from INFO.
gene_symbol_pattern = r'gene_symbol=([^;]+)'
df['gene_symbol'] = df['INFO'].str.extract(gene_symbol_pattern, expand=False)
df

Unnamed: 0,chromosome,Start_pos,End_pos,INFO,Extended_Start_pos,Extended_End_pos,gene_symbol
0,chr1,65509,65629,ensembl_gene_id=ENSG00000186092;gene_symbol=OR4F5,65489,65649,OR4F5
1,chr1,69027,70017,ccds_id=CCDS30547.1;ensembl_gene_id=ENSG000001...,69007,70037,OR4F5
2,chr1,450730,451686,ccds_id=CCDS72675.1;ensembl_gene_id=ENSG000002...,450710,451706,OR4F29
3,chr1,685706,686662,ccds_id=CCDS41221.1;ensembl_gene_id=ENSG000002...,685686,686682,OR4F16
4,chr1,924421,924957,ensembl_gene_id=ENSG00000187634;gene_symbol=SA...,924401,924977,SAMD11
...,...,...,...,...,...,...,...
208906,chrY,25038801,25038921,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25038781,25038941,BPY2C
208907,chrY,25041766,25041886,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25041746,25041906,BPY2C
208908,chrY,25043908,25044028,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,25043888,25044048,BPY2C
208909,chrY,25622433,25624073,"ccds_id=CCDS14801.1,CCDS14802.1;ensembl_gene_i...",25622413,25624093,CDY1


In [103]:
# Keep only the padded coordinates plus the annotation columns.
padded_bed_columns = ['chromosome', 'Extended_Start_pos', 'Extended_End_pos', 'INFO', 'gene_symbol']
df = df.loc[:, padded_bed_columns]
df

Unnamed: 0,chromosome,Extended_Start_pos,Extended_End_pos,INFO,gene_symbol
0,chr1,65489,65649,ensembl_gene_id=ENSG00000186092;gene_symbol=OR4F5,OR4F5
1,chr1,69007,70037,ccds_id=CCDS30547.1;ensembl_gene_id=ENSG000001...,OR4F5
2,chr1,450710,451706,ccds_id=CCDS72675.1;ensembl_gene_id=ENSG000002...,OR4F29
3,chr1,685686,686682,ccds_id=CCDS41221.1;ensembl_gene_id=ENSG000002...,OR4F16
4,chr1,924401,924977,ensembl_gene_id=ENSG00000187634;gene_symbol=SA...,SAMD11
...,...,...,...,...,...
208906,chrY,25038781,25038941,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,BPY2C
208907,chrY,25041746,25041906,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,BPY2C
208908,chrY,25043888,25044048,ccds_id=CCDS44030.1;ensembl_gene_id=ENSG000001...,BPY2C
208909,chrY,25622413,25624093,"ccds_id=CCDS14801.1,CCDS14802.1;ensembl_gene_i...",CDY1


In [128]:
# Step 1: Group the BED intervals by chromosome.
# chromosome_dict maps each chromosome label to a list of (start, end) tuples,
# in the same order as the rows of df.
chromosome_dict = {}
for chrom_label, interval_start, interval_end in zip(df['chromosome'], df['Start_pos'], df['End_pos']):
    chromosome_dict.setdefault(chrom_label, []).append((interval_start, interval_end))

# Step 2: Define a function to check coverage
def check_coverage(row):
    """Classify one row as 'Covered' or 'Not_Covered'.

    Reads ``row['POS']`` and ``row['CHROM']`` and looks the chromosome up in
    the module-level ``chromosome_dict``. The row is 'Covered' when POS lies
    inside at least one ``(start, end)`` pair for that chromosome, with both
    bounds inclusive; otherwise (including an unknown chromosome) the result
    is 'Not_Covered'.
    """
    position = row['POS']
    intervals = chromosome_dict.get(row['CHROM'], ())
    if any(lower <= position <= upper for lower, upper in intervals):
        return 'Covered'
    return 'Not_Covered'

# Step 3: Label every row of `data` with its coverage status
coverage_labels = data.apply(check_coverage, axis='columns')
data['Covered/Not_Covered'] = coverage_labels
data

Unnamed: 0,samples,GENE,CHROM,POS,ID,Covered/Not_Covered
0,12652713,,chr16,31779,rs141542623,Covered
1,12652713,TUBB8,chr10,49286,rs6560829,Covered
2,17751397,TUBB8,chr10,49286,rs6560829,Covered
3,17751406,TUBB8,chr10,49286,rs6560829,Covered
4,12652700,TUBB8,chr10,49286,rs6560829,Covered
...,...,...,...,...,...,...
165543,17751406,ZNF692,chr1,248855917,rs13313088,Covered
165544,17751406,ZNF692,chr1,248856006,rs13313009,Covered
165545,17751397,PGBD2,chr1,248913954,rs12025760,Covered
165546,17751397,PGBD2,chr1,248916897,rs74157349,Covered


In [129]:
# Count how many rows of `data` fall under each coverage label.
data['Covered/Not_Covered'].value_counts()

Covered    165548
Name: Covered/Not_Covered, dtype: int64

In [131]:
import pandas as pd

# Load the literature positions workbook. Each cell of the hg38 key column
# may hold several comma-separated 'chrom-pos-ref-alt' entries, so split the
# cell and explode it to one entry per row (stored in 'chrom').
literature_key_column = 'Chrom-pos-Ref-Alt_38'
x = (
    pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Mody_Lit_final_Positions_hg38_hg37.xlsx')
    .assign(chrom=lambda frame: frame[literature_key_column].str.split(','))
    .explode('chrom')
)

# The first '-'-separated field of each entry is the chromosome label.
x['CHROM'] = x['chrom'].str.split('-').str.get(0)

# Function to add 'chr' prefix conditionally
def add_chr_prefix(chrom):
    """Return ``chrom`` with a leading 'chr', unless that would be wrong.

    The value is returned unchanged when it is null (None/NaN), when it is
    blank after stripping whitespace, or when it already starts with 'chr'
    (so the helper can be applied twice without producing 'chrchr12').
    Otherwise the result is ``'chr' + str(chrom)``; surrounding whitespace is
    kept as-is so callers that strip afterwards see the same text as before.
    Non-string labels such as the integer 7 are accepted and become 'chr7'.
    """
    if pd.isnull(chrom):
        return chrom
    label = str(chrom)
    stripped = label.strip()
    if stripped == '' or stripped.startswith('chr'):
        return chrom
    return 'chr' + label

# Prefix the chromosome label, then remove every run of whitespace from it.
# regex=True is explicit because pandas 2.0 changed the default of
# Series.str.replace to regex=False, which would treat r'\s+' as literal text.
chrom_labels = x['CHROM'].apply(add_chr_prefix).str.replace(r'\s+', '', regex=True)
# The second '-'-separated field of each entry is the position.
pos_labels = x['chrom'].str.split('-').str[1].str.strip()

# Turn empty strings into NaN so the dropna below really removes blank cells;
# a bare '' is not NaN and would otherwise break the int64 conversion.
x['CHROM'] = chrom_labels.where(chrom_labels != '')
x['POS'] = pos_labels.where(pos_labels != '')
x = x.dropna(subset=['CHROM', 'POS'])

# Build the literature lookup table: one row per distinct (CHROM, POS).
# De-duplicating on POS alone would drop a site whenever another chromosome
# happens to have a site at the same coordinate. The column selection returns
# a new frame, so no chained assignment on a slice of `x` takes place.
df_3 = (
    x[['CHROM', 'POS']]
    .astype({'POS': 'int64'})
    .drop_duplicates(subset=['CHROM', 'POS'])
    .assign(Literature='Yes')
    .reset_index(drop=True)
)

# Load the capture-target BED file (tab-separated, no header row).
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KAPA HyperExome_hg38_capture_targets (1).bed', sep='\t', header=None)
df.columns = ['chromosome', 'Start_pos', 'End_pos', 'INFO']

# Widen every target by a fixed number of bases on both sides. The padded
# start is clipped at 0 so a target within PADDING_BP of the start of a
# chromosome never gets a negative coordinate.
PADDING_BP = 20
df['Extended_Start_pos'] = (df['Start_pos'] - PADDING_BP).clip(lower=0)
df['Extended_End_pos'] = df['End_pos'] + PADDING_BP

# Capture the text after 'gene_symbol=' (up to the next ';') from INFO;
# expand=False yields a Series rather than a one-column DataFrame.
df['gene_symbol'] = df['INFO'].str.extract(r'gene_symbol=([^;]+)', expand=False)
df = df[['chromosome', 'Extended_Start_pos', 'Extended_End_pos', 'INFO', 'gene_symbol']]


# Step 1: Group the padded intervals by chromosome.
# chromosome_dict maps each chromosome label to a list of
# (Extended_Start_pos, Extended_End_pos) tuples, in the row order of df.
chromosome_dict = {}
padded_intervals = zip(df['chromosome'], df['Extended_Start_pos'], df['Extended_End_pos'])
for chrom_label, padded_start, padded_end in padded_intervals:
    chromosome_dict.setdefault(chrom_label, []).append((padded_start, padded_end))

# Step 2: Define a function to check coverage
def check_coverage(row):
    """Return 'Covered' when the row's position falls in a known interval.

    ``row['CHROM']`` selects the interval list from the module-level
    ``chromosome_dict``; ``row['POS']`` is then compared against every
    ``(start, end)`` pair with ``start <= pos <= end``. A chromosome that is
    not in the dictionary, or a position outside all of its intervals,
    yields 'Not_Covered'.

    NOTE(review): both bounds are treated as inclusive. BED files are
    conventionally 0-based and half-open, so the start bound may admit one
    extra base - confirm this is intended.
    """
    pos, chrom = row['POS'], row['CHROM']
    if chrom not in chromosome_dict:
        return 'Not_Covered'
    inside_any = any(start <= pos <= end for start, end in chromosome_dict[chrom])
    return 'Covered' if inside_any else 'Not_Covered'

# Step 3: Flag each literature position, then keep only the covered ones
df_3['Covered_status'] = df_3.apply(check_coverage, axis='columns')

is_covered = df_3['Covered_status'].eq('Covered')
df_3 = df_3.loc[is_covered]
df_3

Unnamed: 0,CHROM,POS,Literature,Covered_status
0,chr12,120978691,Yes,Covered
1,chr12,120978908,Yes,Covered
2,chr12,120997590,Yes,Covered
3,chr12,120996531,Yes,Covered
4,chr12,120978928,Yes,Covered
...,...,...,...,...
2206,chr11,17394331,Yes,Covered
2207,chr11,17396955,Yes,Covered
2208,chr11,17395249,Yes,Covered
2209,chr11,17413396,Yes,Covered


In [132]:
# Left-join the literature table onto `data` by chromosome and position;
# rows of `data` without a match get NaN in the literature columns.
merged_3 = data.merge(df_3, how='left', on=['CHROM', 'POS'], sort=False)
merged_3

Unnamed: 0,samples,GENE,CHROM,POS,ID,Covered/Not_Covered,Literature,Covered_status
0,12652713,,chr16,31779,rs141542623,Covered,,
1,12652713,TUBB8,chr10,49286,rs6560829,Covered,,
2,17751397,TUBB8,chr10,49286,rs6560829,Covered,,
3,17751406,TUBB8,chr10,49286,rs6560829,Covered,,
4,12652700,TUBB8,chr10,49286,rs6560829,Covered,,
...,...,...,...,...,...,...,...,...
165543,17751406,ZNF692,chr1,248855917,rs13313088,Covered,,
165544,17751406,ZNF692,chr1,248856006,rs13313009,Covered,,
165545,17751397,PGBD2,chr1,248913954,rs12025760,Covered,,
165546,17751397,PGBD2,chr1,248916897,rs74157349,Covered,,


In [133]:
# Tally the non-null Literature labels after the join.
merged_3['Literature'].value_counts()

Yes    9
Name: Literature, dtype: int64

In [135]:
# Keep only the rows whose Literature flag is 'Yes'.
literature_hits = merged_3['Literature'].eq('Yes')
x = merged_3.loc[literature_hits]
x

Unnamed: 0,samples,GENE,CHROM,POS,ID,Covered/Not_Covered,Literature,Covered_status
28075,12652713,,chr11,17394397,.,Covered,Yes,Covered
128673,12652713,HNF1A,chr12,120978847,rs1169288,Covered,Yes,Covered
128674,17751397,HNF1A,chr12,120979061,rs1800574,Covered,Yes,Covered
128675,17751406,HNF1A,chr12,120979061,rs1800574,Covered,Yes,Covered
128678,12652700,HNF1A,chr12,120989017,rs1057520291,Covered,Yes,Covered
128682,12652700,HNF1A,chr12,120994314,rs56348580,Covered,Yes,Covered
128685,17751406,HNF1A,chr12,120997672,rs2464195,Covered,Yes,Covered
128686,17751397,HNF1A,chr12,120997672,rs2464195,Covered,Yes,Covered
128687,12652713,HNF1A,chr12,120997672,rs2464195,Covered,Yes,Covered


In [136]:
# Save the literature-matched rows to Excel (no index column) and show them.
literature_hits_path = r'C:/Users/GenepoweRx_Madhu/Downloads/lit_mody.xlsx'
x.to_excel(literature_hits_path, index=False)
x

Unnamed: 0,samples,GENE,CHROM,POS,ID,Covered/Not_Covered,Literature,Covered_status
28075,12652713,,chr11,17394397,.,Covered,Yes,Covered
128673,12652713,HNF1A,chr12,120978847,rs1169288,Covered,Yes,Covered
128674,17751397,HNF1A,chr12,120979061,rs1800574,Covered,Yes,Covered
128675,17751406,HNF1A,chr12,120979061,rs1800574,Covered,Yes,Covered
128678,12652700,HNF1A,chr12,120989017,rs1057520291,Covered,Yes,Covered
128682,12652700,HNF1A,chr12,120994314,rs56348580,Covered,Yes,Covered
128685,17751406,HNF1A,chr12,120997672,rs2464195,Covered,Yes,Covered
128686,17751397,HNF1A,chr12,120997672,rs2464195,Covered,Yes,Covered
128687,12652713,HNF1A,chr12,120997672,rs2464195,Covered,Yes,Covered


In [107]:
# Count how many rows of `data` fall under each coverage label.
# NOTE(review): this cell's execution count is lower than that of the cells
# above it, and its output includes 'Not_Covered' rows, so it was run against
# an earlier `data` - re-run in order before relying on the numbers.
data['Covered/Not_Covered'].value_counts()

Covered        2353
Not_Covered     798
Name: Covered/Not_Covered, dtype: int64

In [110]:
# Display only the rows of `data` labelled 'Covered'.
data.loc[data['Covered/Not_Covered'].eq('Covered')]

Unnamed: 0,CHROM,POS,Literature,Covered/Not_Covered
0,chr1,7962863,Parkinson,Covered
4,chr1,7970951,Parkinson,Covered
5,chr1,7985014,Parkinson,Covered
8,chr1,7985041,Parkinson,Covered
9,chr1,7965425,Parkinson,Covered
...,...,...,...,...
3146,chr7,107915692,Cerebellarataxia,Covered
3147,chr7,107916864,Cerebellarataxia,Covered
3148,chr7,107915532,Cerebellarataxia,Covered
3149,chr7,107917349,Cerebellarataxia,Covered


In [112]:
# Left-join `data` onto merged_2 by chromosome and position, then mark every
# row without a Literature label as 'No'.
merged_3 = merged_2.merge(data, how='left', on=['CHROM', 'POS'], sort=False)
has_literature = merged_3['Literature'].notna()
merged_3['Literature'] = merged_3['Literature'].where(has_literature, 'No')
merged_3

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Matched_Gene,Gene Match,Literature,Covered/Not_Covered
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,86,34,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,,,No,No,
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,82,98,98,73,25,25.51%,5.1517E-9,41,32,11,14,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,,,No,No,
2,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,,No,No,
3,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000419394.2,n.480+17991A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,3/3,,,-1,,,No,No,
4,,.,chr1,685694,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000440200.5,n.169+34338A>G,,,0/1,34,28,28,18,10,35.71%,3.6855E-4,17,1,9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,processed transcript,,1/2,,,-1,,,No,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340652,IL9R,rs2037999,chrX,156003433,T,C,Homozygous,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,58,11,11,0,11,100%,1.4176E-6,0,0,7,4,,,,,,,,,0.441,0.8217,0.3483,0.4304,0.702,0.5274,0.3271,0.4035,0.5578,0.5073,0.809,0.356,0.3772,0.4374,0.7099,0.5742,0.4209,0.3323,0.4516,0.5465,0.8217,gnomADe AFR,enhancer,,,,,,,,No,No,
340653,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,missense variant,7/10,MODERATE,5.0,not provided,benign,"criteria provided, single submitter",771064,ENST00000244174.11,c.1094G>A,ENSP00000244174.5,p.Arg365His,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,tolerated(0.56),benign(0),,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,R365H,cGt/cAt,1,,,No,No,
340654,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,3 prime UTR variant,3/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,ENST00000369423.7,c.*85G>A,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,protein coding,9/9,,,,1,,,No,No,
340655,IL9R,rs147385831,chrX,156009937,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,not provided,benign,"criteria provided, single submitter",771064,,,,,0/1,27,20,20,12,8,40%,1.638E-3,12,0,8,0,,,,,,,,,0.01405,0.02867,0.007828,0.01496,0.01587,0.005438,0.008185,0.02348,0.0411,0.02864,0.03691,0.04154,0.01196,0.03051,0.02488,0.0722,0.03478,0.02331,0.02312,0.04483,0.0722,gnomADg FIN,lncRNA,,,,,-1,,,No,No,


In [114]:
# Count the non-null coverage labels in merged_3 (value_counts skips NaN).
merged_3['Covered/Not_Covered'].value_counts()

Covered    66
Name: Covered/Not_Covered, dtype: int64