In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [2]:
def read_bed_file(bed_file):
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 3:
                chrom = fields[0]
                try:
                    start = int(fields[1])
                    end = int(fields[2])
                except ValueError:
                    continue  # Skip this line if start or end position is not an integer
                for pos in range(start, end + 1):
                    bed_positions.add((chrom, pos))
    return bed_positions

def normalize_chrom_name(chrom):
    return chrom.split('_')[0]

def filter_vcf_file(vcf_file, bed_positions):
    filtered_vcf_records = []
    with open(vcf_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Preserve header lines in the output
                filtered_vcf_records.append(line)
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 2:
                raw_chrom = fields[0]
                chrom = normalize_chrom_name(raw_chrom)
                try:
                    pos = int(fields[1])
                except ValueError:
                    continue  # Skip this line if 'POS' is not an integer
                if (chrom, pos) in bed_positions:
                    # Add a new INFO field indicating coverage
                    fields[7] += f';Coverage=Covered'
                else:
                    # Add a new INFO field indicating non-coverage
                    fields[7] += f';Coverage=Not_Covered'
                filtered_vcf_records.append('\t'.join(fields) + '\n')
    return filtered_vcf_records

def write_filtered_vcf(filtered_vcf_records, output_file):
    with open(output_file, 'w') as f:
        for record in filtered_vcf_records:
            f.write(record)

def main():
    bed_file = r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/kalyani_mam_covered.bed'
    vcf_file = r'C:/Users/GenepoweRx_Madhu/Downloads/Gstones_Annotations.vcf'
    output_file = r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/Condition_specific/Gstones_Annotations.vcf'

    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)
    write_filtered_vcf(filtered_vcf_records, output_file)

if __name__ == "__main__":
    main()

In [3]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/Condition_specific/Gstones_Annotations.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...
1,chr2,43844604,rs4148211,A,G,.,.,RS=4148211;dbSNPBuildID=110;SSR=0;GENEINFO=ABC...
2,chr2,43845437,rs4299376,G,"C,T",.,.,RS=4299376;dbSNPBuildID=111;SSR=0;GENEINFO=ABC...
3,chr2,43846861,rs4953023,G,A,.,.,RS=4953023;dbSNPBuildID=111;SSR=0;GENEINFO=ABC...
4,chr2,43872294,rs4148217,C,"A,T",.,.,RS=4148217;dbSNPBuildID=110;SSR=0;GENEINFO=ABC...


In [4]:
vcf['Coverage'] = vcf['INFO'].str.extract(r'Coverage=([^;]+)')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Coverage
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered
1,chr2,43844604,rs4148211,A,G,.,.,RS=4148211;dbSNPBuildID=110;SSR=0;GENEINFO=ABC...,Covered
2,chr2,43845437,rs4299376,G,"C,T",.,.,RS=4299376;dbSNPBuildID=111;SSR=0;GENEINFO=ABC...,Not_Covered
3,chr2,43846861,rs4953023,G,A,.,.,RS=4953023;dbSNPBuildID=111;SSR=0;GENEINFO=ABC...,Not_Covered
4,chr2,43872294,rs4148217,C,"A,T",.,.,RS=4148217;dbSNPBuildID=110;SSR=0;GENEINFO=ABC...,Covered


In [5]:
vcf["Gene_Name"] = vcf["INFO"].str.extract('GENEINFO=(?P<GENEINFO>.+?);')
#vcf['Gene Name'] = vcf['Gene_Name'].apply(lambda x: ','.join([segment.split(':')[0] for segment in x.split('|')]) if pd.notnull(x) else '')
vcf['Gene Name'] = vcf['Gene_Name'].apply(lambda x: ','.join(set([segment.split(':')[0] for segment in x.split('|')])) if pd.notnull(x) else '')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Coverage,Gene_Name,Gene Name
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5"
1,chr2,43844604,rs4148211,A,G,.,.,RS=4148211;dbSNPBuildID=110;SSR=0;GENEINFO=ABC...,Covered,ABCG8:64241|LOC102725159:102725159,"ABCG8,LOC102725159"
2,chr2,43845437,rs4299376,G,"C,T",.,.,RS=4299376;dbSNPBuildID=111;SSR=0;GENEINFO=ABC...,Not_Covered,ABCG8:64241|LOC102725159:102725159,"ABCG8,LOC102725159"
3,chr2,43846861,rs4953023,G,A,.,.,RS=4953023;dbSNPBuildID=111;SSR=0;GENEINFO=ABC...,Not_Covered,ABCG8:64241,ABCG8
4,chr2,43872294,rs4148217,C,"A,T",.,.,RS=4148217;dbSNPBuildID=110;SSR=0;GENEINFO=ABC...,Covered,ABCG8:64241,ABCG8


In [6]:
vcf['CSQ'] = vcf['INFO'].str.extract(r'CSQ=(.*)')
vcf['csq'] = vcf['CSQ'].str.split(',')
vcf = vcf.explode('csq')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Coverage,Gene_Name,Gene Name,CSQ,csq
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|missense_variant|MODERATE|ABCG8|ENSG00000143...
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|missense_variant|MODERATE|ABCG8|ENSG00000143...
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|intron_variant&non_coding_transcript_variant...
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|intron_variant&non_coding_transcript_variant...


In [7]:
########################################################### Required columns extraction from the CSQ column ####################
vcf['ClinVar_CLNDN'] = vcf['csq'].str.split('|').str[82]
vcf['Allele'] = vcf['csq'].str.split('|').str[0]
vcf['CLIN_SIG'] = vcf['csq'].str.split('|').str[70]
vcf['ClinVar_CLNREVSTAT'] = vcf['csq'].str.split('|').str[81]
vcf['ClinVar'] = vcf['csq'].str.split('|').str[79]
vcf['HGVSC'] = vcf['csq'].str.split('|').str[10]
vcf['HGVSP'] = vcf['csq'].str.split('|').str[11]
vcf['PolyPhen'] = vcf['csq'].str.split('|').str[38]
vcf['BIOTYPE'] = vcf['csq'].str.split('|').str[7]
vcf['EXON'] = vcf['csq'].str.split('|').str[8]
vcf['INTRON'] = vcf['csq'].str.split('|').str[9]
vcf['Protein_position'] = vcf['csq'].str.split('|').str[14]
vcf['Amino_acids'] = vcf['csq'].str.split('|').str[15]
vcf['Codons'] = vcf['csq'].str.split('|').str[16]
vcf['STRAND'] = vcf['csq'].str.split('|').str[19]
vcf['PUBMED'] = vcf['csq'].str.split('|').str[73]
vcf['Consequence'] = vcf['csq'].str.split('|').str[1]
vcf['IMPACT'] = vcf['csq'].str.split('|').str[2]
vcf['SIFT'] = vcf['csq'].str.split('|').str[37]
################################################## Frequency columns extraction ################################################
vcf['AF'] = vcf['csq'].str.split('|').str[42]
vcf['AFR_AF'] = vcf['csq'].str.split('|').str[43]
vcf['AMR_AF'] = vcf['csq'].str.split('|').str[44]
vcf['EAS_AF'] = vcf['csq'].str.split('|').str[45]
vcf['EUR_AF'] = vcf['csq'].str.split('|').str[46]
vcf['SAS_AF'] = vcf['csq'].str.split('|').str[47]
vcf['gnomADe_AF'] = vcf['csq'].str.split('|').str[48]
vcf['gnomADe_AFR_AF'] = vcf['csq'].str.split('|').str[49]
vcf['gnomADe_AMR_AF'] = vcf['csq'].str.split('|').str[50]
vcf['gnomADe_ASJ_AF'] = vcf['csq'].str.split('|').str[51]
vcf['gnomADe_EAS_AF'] = vcf['csq'].str.split('|').str[52]
vcf['gnomADe_FIN_AF'] = vcf['csq'].str.split('|').str[53]
vcf['gnomADe_NFE_AF'] = vcf['csq'].str.split('|').str[54]
vcf['gnomADe_OTH_AF'] = vcf['csq'].str.split('|').str[55]
vcf['gnomADe_SAS_AF'] = vcf['csq'].str.split('|').str[56]
vcf['gnomADg_AF'] = vcf['csq'].str.split('|').str[57]
vcf['gnomADg_AFR_AF'] = vcf['csq'].str.split('|').str[58]
vcf['gnomADg_AMI_AF'] = vcf['csq'].str.split('|').str[59]
vcf['gnomADg_AMR_AF'] = vcf['csq'].str.split('|').str[60]
vcf['gnomADg_ASJ_AF'] = vcf['csq'].str.split('|').str[61]
vcf['gnomADg_EAS_AF'] = vcf['csq'].str.split('|').str[62]
vcf['gnomADg_FIN_AF'] = vcf['csq'].str.split('|').str[63]
vcf['gnomADg_MID_AF'] = vcf['csq'].str.split('|').str[64]
vcf['gnomADg_NFE_AF'] = vcf['csq'].str.split('|').str[65]
vcf['gnomADg_OTH_AF'] = vcf['csq'].str.split('|').str[66]
vcf['gnomADg_SAS_AF'] = vcf['csq'].str.split('|').str[67]
vcf['MAX_AF'] = vcf['csq'].str.split('|').str[68]
vcf['MAX_AF_POPS'] = vcf['csq'].str.split('|').str[69]
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Coverage,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,Allele,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|missense_variant|MODERATE|ABCG8|ENSG00000143...,,A,,,,ENST00000272286.4:c.55G>A,ENSP00000272286.2:p.Asp19Asn,benign(0.084),protein_coding,1/13,,19.0,D/N,Gat/Aat,1,21708280&28652652&21862702&27286809&19060906&1...,missense_variant,MODERATE,tolerated_low_confidence(0.18),,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|missense_variant|MODERATE|ABCG8|ENSG00000143...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,ENST00000272286.4:c.55G>C,ENSP00000272286.2:p.Asp19His,possibly_damaging(0.535),protein_coding,1/13,,19.0,D/H,Gat/Cat,1,21708280&28652652&21862702&27286809&19060906&1...,missense_variant,MODERATE,tolerated_low_confidence(0.05),0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,,A,,,,,,,protein_coding,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,protein_coding,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,,A,,,,,,,retained_intron,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,retained_intron,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,,A,,,,,,,retained_intron,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,retained_intron,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|intron_variant&non_coding_transcript_variant...,,A,,,,ENST00000643284.1:n.521-5399G>A,,,retained_intron,,1/2,,,,1,21708280&28652652&21862702&27286809&19060906&1...,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|intron_variant&non_coding_transcript_variant...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,ENST00000643284.1:n.521-5399G>C,,,retained_intron,,1/2,,,,1,21708280&28652652&21862702&27286809&19060906&1...,intron_variant&non_coding_transcript_variant,MODIFIER,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR


In [8]:
vcf['Protein Position and Amino Acid'] = vcf['Amino_acids'].str[0] + vcf['Protein_position'] + np.where(vcf['Amino_acids'].str[-1] == vcf['Amino_acids'].str[0], '', vcf['Amino_acids'].str[-1])
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Coverage,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,Allele,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,Protein Position and Amino Acid
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|missense_variant|MODERATE|ABCG8|ENSG00000143...,,A,,,,ENST00000272286.4:c.55G>A,ENSP00000272286.2:p.Asp19Asn,benign(0.084),protein_coding,1/13,,19.0,D/N,Gat/Aat,1,21708280&28652652&21862702&27286809&19060906&1...,missense_variant,MODERATE,tolerated_low_confidence(0.18),,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,D19N
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|missense_variant|MODERATE|ABCG8|ENSG00000143...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,ENST00000272286.4:c.55G>C,ENSP00000272286.2:p.Asp19His,possibly_damaging(0.535),protein_coding,1/13,,19.0,D/H,Gat/Cat,1,21708280&28652652&21862702&27286809&19060906&1...,missense_variant,MODERATE,tolerated_low_confidence(0.05),0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,D19H
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,,A,,,,,,,protein_coding,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,protein_coding,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,,A,,,,,,,retained_intron,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,retained_intron,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,,A,,,,,,,retained_intron,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,retained_intron,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|intron_variant&non_coding_transcript_variant...,,A,,,,ENST00000643284.1:n.521-5399G>A,,,retained_intron,,1/2,,,,1,21708280&28652652&21862702&27286809&19060906&1...,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|intron_variant&non_coding_transcript_variant...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,ENST00000643284.1:n.521-5399G>C,,,retained_intron,,1/2,,,,1,21708280&28652652&21862702&27286809&19060906&1...,intron_variant&non_coding_transcript_variant,MODIFIER,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,


In [9]:
vcf[['HGVSc', 'HGVSc (Transcript)']] = vcf['HGVSC'].str.split(':', 1, expand=True)
vcf[['HGVSp', 'HGVSp (Transcript)']] = vcf['HGVSP'].str.split(':', 1, expand=True)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Coverage,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,Allele,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,Protein Position and Amino Acid,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript)
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|missense_variant|MODERATE|ABCG8|ENSG00000143...,,A,,,,ENST00000272286.4:c.55G>A,ENSP00000272286.2:p.Asp19Asn,benign(0.084),protein_coding,1/13,,19.0,D/N,Gat/Aat,1,21708280&28652652&21862702&27286809&19060906&1...,missense_variant,MODERATE,tolerated_low_confidence(0.18),,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,D19N,ENST00000272286.4,c.55G>A,ENSP00000272286.2,p.Asp19Asn
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|missense_variant|MODERATE|ABCG8|ENSG00000143...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,ENST00000272286.4:c.55G>C,ENSP00000272286.2:p.Asp19His,possibly_damaging(0.535),protein_coding,1/13,,19.0,D/H,Gat/Cat,1,21708280&28652652&21862702&27286809&19060906&1...,missense_variant,MODERATE,tolerated_low_confidence(0.05),0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,D19H,ENST00000272286.4,c.55G>C,ENSP00000272286.2,p.Asp19His
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,,A,,,,,,,protein_coding,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,,,,,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,protein_coding,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,,,,,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,,A,,,,,,,retained_intron,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,,,,,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,retained_intron,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,,,,,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,,A,,,,,,,retained_intron,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,,,,,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|upstream_gene_variant|MODIFIER|ABCG5|ENSG000...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,retained_intron,,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...,upstream_gene_variant,MODIFIER,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,,,,,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,A|intron_variant&non_coding_transcript_variant...,,A,,,,ENST00000643284.1:n.521-5399G>A,,,retained_intron,,1/2,,,,1,21708280&28652652&21862702&27286809&19060906&1...,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,,ENST00000643284.1,n.521-5399G>A,,
0,chr2,43839108,rs11887534,G,"A,C",.,.,RS=11887534;dbSNPBuildID=120;SSR=0;GENEINFO=AB...,Covered,ABCG5:64240|ABCG8:64241,"ABCG8,ABCG5",A|missense_variant|MODERATE|ABCG8|ENSG00000143...,C|intron_variant&non_coding_transcript_variant...,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,C,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,ENST00000643284.1:n.521-5399G>C,,,retained_intron,,1/2,,,,1,21708280&28652652&21862702&27286809&19060906&1...,intron_variant&non_coding_transcript_variant,MODIFIER,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,,ENST00000643284.1,n.521-5399G>C,,


In [10]:
vcf_final = vcf[['Gene Name', 'rsID','Coverage', 'CHROM', 'POS', 'REF', 'ALT', 'Allele', 'Consequence', 'IMPACT',
          'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT',
          'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'SIFT', 'PolyPhen', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
       'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF',
       'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF', 'gnomADe_OTH_AF',
       'gnomADe_SAS_AF', 'gnomADg_AF', 'gnomADg_AFR_AF', 'gnomADg_AMI_AF',
       'gnomADg_AMR_AF', 'gnomADg_ASJ_AF', 'gnomADg_EAS_AF', 'gnomADg_FIN_AF',
       'gnomADg_MID_AF', 'gnomADg_NFE_AF', 'gnomADg_OTH_AF', 'gnomADg_SAS_AF',
       'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE', 'EXON', 'INTRON',
          'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED']]
vcf_final

Unnamed: 0,Gene Name,rsID,Coverage,CHROM,POS,REF,ALT,Allele,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,missense_variant,MODERATE,,,,,ENST00000272286.4,c.55G>A,ENSP00000272286.2,p.Asp19Asn,tolerated_low_confidence(0.18),benign(0.084),,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,protein_coding,1/13,,D19N,Gat/Aat,1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,missense_variant,MODERATE,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,ENST00000272286.4,c.55G>C,ENSP00000272286.2,p.Asp19His,tolerated_low_confidence(0.05),possibly_damaging(0.535),0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,protein_coding,1/13,,D19H,Gat/Cat,1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream_gene_variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,protein_coding,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream_gene_variant,MODIFIER,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,protein_coding,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream_gene_variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,retained_intron,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream_gene_variant,MODIFIER,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,retained_intron,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream_gene_variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,retained_intron,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream_gene_variant,MODIFIER,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,retained_intron,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000643284.1,n.521-5399G>A,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,retained_intron,,1/2,,,1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,intron_variant&non_coding_transcript_variant,MODIFIER,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,ENST00000643284.1,n.521-5399G>C,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,retained_intron,,1/2,,,1,21708280&28652652&21862702&27286809&19060906&1...


In [11]:
# Define the terms to remove
remove_terms = set(["not_specified", "not_provided"])

# Apply the filtering operation to 'Column1' only
vcf_final['ClinVar_CLNDN'] = vcf_final['ClinVar_CLNDN'].apply(lambda row: "&".join(
    [term for term in row.split("&") if term not in remove_terms]
    ) if isinstance(row, str) and not all(term in remove_terms for term in row.split("&")) else row)


vcf_final['CLIN_SIG'] = vcf_final['CLIN_SIG'].apply(lambda row: "&".join(
    [term for term in row.split("&") if term not in remove_terms]
    ) if isinstance(row, str) and not all(term in remove_terms for term in row.split("&")) else row)


vcf_final['ClinVar_CLNREVSTAT'] = vcf_final['ClinVar_CLNREVSTAT'].apply(lambda row: "&".join(
    [term for term in row.split("&") if term not in remove_terms]
    ) if isinstance(row, str) and not all(term in remove_terms for term in row.split("&")) else row)

# Print the modified DataFrame
vcf_final

Unnamed: 0,Gene Name,rsID,Coverage,CHROM,POS,REF,ALT,Allele,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,missense_variant,MODERATE,,,,,ENST00000272286.4,c.55G>A,ENSP00000272286.2,p.Asp19Asn,tolerated_low_confidence(0.18),benign(0.084),,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,protein_coding,1/13,,D19N,Gat/Aat,1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,missense_variant,MODERATE,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,ENST00000272286.4,c.55G>C,ENSP00000272286.2,p.Asp19His,tolerated_low_confidence(0.05),possibly_damaging(0.535),0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,protein_coding,1/13,,D19H,Gat/Cat,1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream_gene_variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,protein_coding,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream_gene_variant,MODIFIER,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,protein_coding,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream_gene_variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,retained_intron,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream_gene_variant,MODIFIER,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,retained_intron,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream_gene_variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,retained_intron,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream_gene_variant,MODIFIER,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,retained_intron,,,,,-1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000643284.1,n.521-5399G>A,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe_NFE,retained_intron,,1/2,,,1,21708280&28652652&21862702&27286809&19060906&1...
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,intron_variant&non_coding_transcript_variant,MODIFIER,Sitosterolemia_1&Sitosterolemia_2&Cardiovascul...,benign&benign/likely_benign&pathogenic,criteria_provided&_multiple_submitters&_no_con...,4975.0,ENST00000643284.1,n.521-5399G>C,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe_AMR,retained_intron,,1/2,,,1,21708280&28652652&21862702&27286809&19060906&1...


In [12]:
vcf_final = vcf_final.astype(str).applymap(lambda x: x.replace('&', ',').replace('_', ' '))
vcf_final

Unnamed: 0,Gene Name,rsID,Coverage,CHROM,POS,REF,ALT,Allele,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,missense variant,MODERATE,,,,,ENST00000272286.4,c.55G>A,ENSP00000272286.2,p.Asp19Asn,tolerated low confidence(0.18),benign(0.084),,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,protein coding,1/13,,D19N,Gat/Aat,1,"21708280,28652652,21862702,27286809,19060906,1..."
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,missense variant,MODERATE,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,ENST00000272286.4,c.55G>C,ENSP00000272286.2,p.Asp19His,tolerated low confidence(0.05),possibly damaging(0.535),0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,protein coding,1/13,,D19H,Gat/Cat,1,"21708280,28652652,21862702,27286809,19060906,1..."
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream gene variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,protein coding,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1..."
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream gene variant,MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,protein coding,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1..."
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream gene variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1..."
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream gene variant,MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1..."
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream gene variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1..."
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream gene variant,MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1..."
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000643284.1,n.521-5399G>A,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,1/2,,,1,"21708280,28652652,21862702,27286809,19060906,1..."
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,"intron variant,non coding transcript variant",MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,ENST00000643284.1,n.521-5399G>C,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,1/2,,,1,"21708280,28652652,21862702,27286809,19060906,1..."


In [13]:
vcf_final['consequence'] = vcf_final['Consequence'].str.split(',').str[0]
vcf_final

Unnamed: 0,Gene Name,rsID,Coverage,CHROM,POS,REF,ALT,Allele,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,missense variant,MODERATE,,,,,ENST00000272286.4,c.55G>A,ENSP00000272286.2,p.Asp19Asn,tolerated low confidence(0.18),benign(0.084),,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,protein coding,1/13,,D19N,Gat/Aat,1,"21708280,28652652,21862702,27286809,19060906,1...",missense variant
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,missense variant,MODERATE,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,ENST00000272286.4,c.55G>C,ENSP00000272286.2,p.Asp19His,tolerated low confidence(0.05),possibly damaging(0.535),0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,protein coding,1/13,,D19H,Gat/Cat,1,"21708280,28652652,21862702,27286809,19060906,1...",missense variant
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream gene variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,protein coding,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream gene variant,MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,protein coding,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream gene variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream gene variant,MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream gene variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream gene variant,MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000643284.1,n.521-5399G>A,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,1/2,,,1,"21708280,28652652,21862702,27286809,19060906,1...",intron variant
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,"intron variant,non coding transcript variant",MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,ENST00000643284.1,n.521-5399G>C,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,1/2,,,1,"21708280,28652652,21862702,27286809,19060906,1...",intron variant


In [14]:
df_1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/consequence.xlsx')
df_1

Unnamed: 0,consequence,Consequence_score
0,transcript ablation,10/10
1,splice acceptor variant,8/10
2,splice donor variant,8/10
3,stop gained,10/10
4,frameshift variant,10/10
5,stop lost,9/10
6,start lost,9/10
7,transcript amplification,8/10
8,inframe insertion,6/10
9,inframe deletion,6/10


In [15]:
merged_1 = pd.merge(vcf_final, df_1, on='consequence', how='left', sort=False)
merged_1

Unnamed: 0,Gene Name,rsID,Coverage,CHROM,POS,REF,ALT,Allele,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence,Consequence_score
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,missense variant,MODERATE,,,,,ENST00000272286.4,c.55G>A,ENSP00000272286.2,p.Asp19Asn,tolerated low confidence(0.18),benign(0.084),,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,protein coding,1/13,,D19N,Gat/Aat,1,"21708280,28652652,21862702,27286809,19060906,1...",missense variant,7/10
1,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,missense variant,MODERATE,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,ENST00000272286.4,c.55G>C,ENSP00000272286.2,p.Asp19His,tolerated low confidence(0.05),possibly damaging(0.535),0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,protein coding,1/13,,D19H,Gat/Cat,1,"21708280,28652652,21862702,27286809,19060906,1...",missense variant,7/10
2,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream gene variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,protein coding,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant,2/10
3,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream gene variant,MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,protein coding,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant,2/10
4,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream gene variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant,2/10
5,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream gene variant,MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant,2/10
6,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream gene variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant,2/10
7,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream gene variant,MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant,2/10
8,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000643284.1,n.521-5399G>A,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,1/2,,,1,"21708280,28652652,21862702,27286809,19060906,1...",intron variant,2/10
9,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,"intron variant,non coding transcript variant",MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,ENST00000643284.1,n.521-5399G>C,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,1/2,,,1,"21708280,28652652,21862702,27286809,19060906,1...",intron variant,2/10


In [16]:
df_2 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/IMPACT.xlsx')
df_2

Unnamed: 0,IMPACT,IMPACT_score
0,HIGH,10.0
1,MODERATE,5.0
2,LOW,2.5
3,MODIFIER,1.5


In [17]:
merged_2 = pd.merge(merged_1, df_2, on = 'IMPACT', how='left', sort=False)
merged_2

Unnamed: 0,Gene Name,rsID,Coverage,CHROM,POS,REF,ALT,Allele,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence,Consequence_score,IMPACT_score
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,missense variant,MODERATE,,,,,ENST00000272286.4,c.55G>A,ENSP00000272286.2,p.Asp19Asn,tolerated low confidence(0.18),benign(0.084),,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,protein coding,1/13,,D19N,Gat/Aat,1,"21708280,28652652,21862702,27286809,19060906,1...",missense variant,7/10,5.0
1,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,missense variant,MODERATE,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,ENST00000272286.4,c.55G>C,ENSP00000272286.2,p.Asp19His,tolerated low confidence(0.05),possibly damaging(0.535),0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,protein coding,1/13,,D19H,Gat/Cat,1,"21708280,28652652,21862702,27286809,19060906,1...",missense variant,7/10,5.0
2,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream gene variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,protein coding,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant,2/10,1.5
3,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream gene variant,MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,protein coding,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant,2/10,1.5
4,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream gene variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant,2/10,1.5
5,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream gene variant,MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant,2/10,1.5
6,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,upstream gene variant,MODIFIER,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant,2/10,1.5
7,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,upstream gene variant,MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,,,,-1,"21708280,28652652,21862702,27286809,19060906,1...",upstream gene variant,2/10,1.5
8,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",A,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000643284.1,n.521-5399G>A,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,1/2,,,1,"21708280,28652652,21862702,27286809,19060906,1...",intron variant,2/10,1.5
9,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,"A,C",C,"intron variant,non coding transcript variant",MODIFIER,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,ENST00000643284.1,n.521-5399G>C,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,1/2,,,1,"21708280,28652652,21862702,27286809,19060906,1...",intron variant,2/10,1.5


In [18]:
merged_2.columns

Index(['Gene Name', 'rsID', 'Coverage', 'CHROM', 'POS', 'REF', 'ALT', 'Allele',
       'Consequence', 'IMPACT', 'ClinVar_CLNDN', 'CLIN_SIG',
       'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp',
       'HGVSp (Transcript)', 'SIFT', 'PolyPhen', 'AF', 'AFR_AF', 'AMR_AF',
       'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomADe_AF', 'gnomADe_AFR_AF',
       'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF',
       'gnomADe_NFE_AF', 'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'gnomADg_AF',
       'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF', 'gnomADg_ASJ_AF',
       'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF', 'gnomADg_NFE_AF',
       'gnomADg_OTH_AF', 'gnomADg_SAS_AF', 'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE',
       'EXON', 'INTRON', 'Protein Position and Amino Acid', 'Codons', 'STRAND',
       'PUBMED', 'consequence', 'Consequence_score', 'IMPACT_score'],
      dtype='object')

In [19]:
merged_2 = merged_2[['Gene Name', 'rsID', 'Coverage', 'CHROM', 'POS', 'REF', 'Allele',
       'Consequence','Consequence_score', 'IMPACT', 'IMPACT_score', 'ClinVar_CLNDN', 'CLIN_SIG',
       'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp',
       'HGVSp (Transcript)', 'SIFT', 'PolyPhen', 'AF', 'AFR_AF',
       'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomADe_AF', 'gnomADe_AFR_AF',
       'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF',
       'gnomADe_NFE_AF', 'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'gnomADg_AF',
       'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF', 'gnomADg_ASJ_AF',
       'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF', 'gnomADg_NFE_AF',
       'gnomADg_OTH_AF', 'gnomADg_SAS_AF', 'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE',
       'EXON', 'INTRON', 'Protein Position and Amino Acid', 'Codons', 'STRAND']]
merged_2

Unnamed: 0,Gene Name,rsID,Coverage,CHROM,POS,REF,Allele,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,A,missense variant,7/10,MODERATE,5.0,,,,,ENST00000272286.4,c.55G>A,ENSP00000272286.2,p.Asp19Asn,tolerated low confidence(0.18),benign(0.084),,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,protein coding,1/13,,D19N,Gat/Aat,1
1,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,C,missense variant,7/10,MODERATE,5.0,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,ENST00000272286.4,c.55G>C,ENSP00000272286.2,p.Asp19His,tolerated low confidence(0.05),possibly damaging(0.535),0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,protein coding,1/13,,D19H,Gat/Cat,1
2,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,A,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,protein coding,,,,,-1
3,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,C,upstream gene variant,2/10,MODIFIER,1.5,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,protein coding,,,,,-1
4,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,A,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,,,,-1
5,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,C,upstream gene variant,2/10,MODIFIER,1.5,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,,,,-1
6,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,A,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,,,,-1
7,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,C,upstream gene variant,2/10,MODIFIER,1.5,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,,,,-1
8,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,A,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000643284.1,n.521-5399G>A,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,1/2,,,1
9,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,C,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,ENST00000643284.1,n.521-5399G>C,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,1/2,,,1


In [22]:
df_3 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Gall_stones.xlsx')
df_3 = df_3.rename(columns={'Variant Alteration': 'rsID'})
df_3

Unnamed: 0,Gene,rsID,Citation
0,ABCG8,rs11887534,PMID: 29283909
1,ABCG8,rs4953023,PMID: 29283909
2,ABCG8,rs4299376,PMID: 29283909
3,ABCG8,rs4148211,PMID: 35390434
4,ABCG8,rs4148217,PMID: 35390434


In [23]:
merged_3 = pd.merge(merged_2, df_3, on = ['rsID'], how= 'left', sort=False)
merged_3

Unnamed: 0,Gene Name,rsID,Coverage,CHROM,POS,REF,Allele,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,Gene,Citation
0,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,A,missense variant,7/10,MODERATE,5.0,,,,,ENST00000272286.4,c.55G>A,ENSP00000272286.2,p.Asp19Asn,tolerated low confidence(0.18),benign(0.084),,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,protein coding,1/13,,D19N,Gat/Aat,1,ABCG8,PMID: 29283909
1,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,C,missense variant,7/10,MODERATE,5.0,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,ENST00000272286.4,c.55G>C,ENSP00000272286.2,p.Asp19His,tolerated low confidence(0.05),possibly damaging(0.535),0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,protein coding,1/13,,D19H,Gat/Cat,1,ABCG8,PMID: 29283909
2,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,A,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,protein coding,,,,,-1,ABCG8,PMID: 29283909
3,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,C,upstream gene variant,2/10,MODIFIER,1.5,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,protein coding,,,,,-1,ABCG8,PMID: 29283909
4,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,A,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,,,,-1,ABCG8,PMID: 29283909
5,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,C,upstream gene variant,2/10,MODIFIER,1.5,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,,,,-1,ABCG8,PMID: 29283909
6,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,A,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,,,,-1,ABCG8,PMID: 29283909
7,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,C,upstream gene variant,2/10,MODIFIER,1.5,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,,,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,,,,-1,ABCG8,PMID: 29283909
8,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,A,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000643284.1,n.521-5399G>A,,,,,,,,,,,6.448e-06,0.0,0.0,0.0,0.0,0.0,1.684e-05,0.0,0.0,,,,,,,,,,,,1.684e-05,gnomADe NFE,retained intron,,1/2,,,1,ABCG8,PMID: 29283909
9,"ABCG8,ABCG5",rs11887534,Covered,chr2,43839108,G,C,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,"Sitosterolemia 1,Sitosterolemia 2,Cardiovascul...","benign,benign/likely benign,pathogenic","criteria provided, multiple submitters, no con...",4975.0,ENST00000643284.1,n.521-5399G>C,,,,,0.0605,0.0764,0.0965,0.0139,0.0795,0.0419,0.06687,0.06379,0.09901,0.09368,0.01328,0.09419,0.06588,0.06782,0.03366,0.06442,0.06511,0.04825,0.06919,0.08963,0.01508,0.08878,0.05732,0.06413,0.05683,0.03558,0.09901,gnomADe AMR,retained intron,,1/2,,,1,ABCG8,PMID: 29283909


In [24]:
merged_3.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/Gstones_Annotations.xlsx', index=False)