In [30]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [31]:
def read_bed_file(bed_file):
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 3:
                chrom = fields[0]
                try:
                    start = int(fields[1])
                    end = int(fields[2])
                except ValueError:
                    continue  # Skip this line if start or end position is not an integer
                for pos in range(start, end + 1):
                    bed_positions.add((chrom, pos))
    return bed_positions

def normalize_chrom_name(chrom):
    return chrom.split('_')[0]

def filter_vcf_file(vcf_file, bed_positions):
    filtered_vcf_records = []
    with open(vcf_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Preserve header lines in the output
                filtered_vcf_records.append(line)
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 2:
                raw_chrom = fields[0]
                chrom = normalize_chrom_name(raw_chrom)
                try:
                    pos = int(fields[1])
                except ValueError:
                    continue  # Skip this line if 'POS' is not an integer
                if (chrom, pos) in bed_positions:
                    filtered_vcf_records.append(line)
    return filtered_vcf_records

def write_filtered_vcf(filtered_vcf_records, output_file):
    with open(output_file, 'w') as f:
        for record in filtered_vcf_records:
            f.write(record)

def main():
    bed_file = r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/kalyani_mam_covered.bed'
    vcf_file = r'C:/Users/GenepoweRx_Madhu/Downloads/Schizophrenia_Annotations.vcf'
    output_file = r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/Schizophrenia_Annotations.vcf'

    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)
    write_filtered_vcf(filtered_vcf_records, output_file)

if __name__ == "__main__":
    main()

In [9]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/Fatty_liver_Annotations.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']#, 'FORMAT', 'SAMPLE']

#sample_cols = vcf['SAMPLE'].str.split(':', expand=True)
#sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']

# Assign the values to the newly created columns
#vcf = pd.concat([vcf, sample_cols], axis=1)
#vcf = vcf[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL','RDF', 'RDR', 'ADF', 'ADR']]
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...
1,chr2,27518370,rs780094,T,C,.,.,RS=780094;dbSNPBuildID=86;SSR=0;GENEINFO=GCKR:...
2,chr2,27519736,rs780093,T,C,.,.,RS=780093;dbSNPBuildID=86;SSR=0;GENEINFO=GCKR:...
3,chr4,87310240,rs72613567,T,TA,.,.,RS=72613567;dbSNPBuildID=130;SSR=0;GENEINFO=HS...
4,chr4,87310277,rs62305723,G,A,.,.,RS=62305723;dbSNPBuildID=129;SSR=0;GENEINFO=HS...
5,chr19,19268740,rs58542926,C,"A,T",.,.,RS=58542926;dbSNPBuildID=129;SSR=0;GENEINFO=TM...
6,chr22,43928847,rs738409,C,"A,G,T",.,.,RS=738409;dbSNPBuildID=86;SSR=0;GENEINFO=PNPLA...
7,chr22,43930116,rs12483959,G,"A,C,T",.,.,RS=12483959;dbSNPBuildID=120;SSR=0;GENEINFO=PN...
8,chr22,43932850,rs4823173,G,A,.,.,RS=4823173;dbSNPBuildID=111;SSR=0;GENEINFO=PNP...
9,chr22,43933050,rs143392071,A,"C,G",.,.,RS=143392071;dbSNPBuildID=134;SSR=0;GENEINFO=P...


In [5]:
vcf['HET'] = vcf['INFO'].str.extract(r'HET=(\d)')
vcf['HOM'] = vcf['INFO'].str.extract(r'HOM=(\d)')

# Create a new column 'Zygosity' based on conditions
vcf['Zygosity'] = ''

vcf.loc[vcf['HOM'] == '1', 'Zygosity'] = 'Homozygous'
vcf.loc[vcf['HET'] == '1', 'Zygosity'] = 'Heterozygous'
vcf['GT'] = vcf['GT'].astype(str)
vcf

KeyError: 'GT'

In [10]:
vcf["Gene_Name"] = vcf["INFO"].str.extract('GENEINFO=(?P<GENEINFO>.+?);')
#vcf['Gene Name'] = vcf['Gene_Name'].apply(lambda x: ','.join([segment.split(':')[0] for segment in x.split('|')]) if pd.notnull(x) else '')
vcf['Gene Name'] = vcf['Gene_Name'].apply(lambda x: ','.join(set([segment.split(':')[0] for segment in x.split('|')])) if pd.notnull(x) else '')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Gene_Name,Gene Name
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR
1,chr2,27518370,rs780094,T,C,.,.,RS=780094;dbSNPBuildID=86;SSR=0;GENEINFO=GCKR:...,GCKR:2646,GCKR
2,chr2,27519736,rs780093,T,C,.,.,RS=780093;dbSNPBuildID=86;SSR=0;GENEINFO=GCKR:...,GCKR:2646,GCKR
3,chr4,87310240,rs72613567,T,TA,.,.,RS=72613567;dbSNPBuildID=130;SSR=0;GENEINFO=HS...,HSD17B13:345275,HSD17B13
4,chr4,87310277,rs62305723,G,A,.,.,RS=62305723;dbSNPBuildID=129;SSR=0;GENEINFO=HS...,HSD17B13:345275,HSD17B13
5,chr19,19268740,rs58542926,C,"A,T",.,.,RS=58542926;dbSNPBuildID=129;SSR=0;GENEINFO=TM...,TM6SF2:53345,TM6SF2
6,chr22,43928847,rs738409,C,"A,G,T",.,.,RS=738409;dbSNPBuildID=86;SSR=0;GENEINFO=PNPLA...,PNPLA3:80339,PNPLA3
7,chr22,43930116,rs12483959,G,"A,C,T",.,.,RS=12483959;dbSNPBuildID=120;SSR=0;GENEINFO=PN...,PNPLA3:80339,PNPLA3
8,chr22,43932850,rs4823173,G,A,.,.,RS=4823173;dbSNPBuildID=111;SSR=0;GENEINFO=PNP...,PNPLA3:80339,PNPLA3
9,chr22,43933050,rs143392071,A,"C,G",.,.,RS=143392071;dbSNPBuildID=134;SSR=0;GENEINFO=P...,PNPLA3:80339,PNPLA3


In [11]:
vcf['CSQ'] = vcf['INFO'].str.extract(r'CSQ=(.*)')
vcf['csq'] = vcf['CSQ'].str.split(',')
vcf = vcf.explode('csq')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Gene_Name,Gene Name,CSQ,csq
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,C|missense_variant&splice_region_variant|MODER...
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,G|missense_variant&splice_region_variant|MODER...
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,C|missense_variant&splice_region_variant|MODER...
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,G|missense_variant&splice_region_variant|MODER...
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,C|downstream_gene_variant|MODIFIER|GCKR|ENSG00...
...,...,...,...,...,...,...,...,...,...,...,...,...
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,C|intron_variant&non_coding_transcript_variant...
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,A|intron_variant&non_coding_transcript_variant...
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,C|intron_variant&non_coding_transcript_variant...
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,A|regulatory_region_variant|MODIFIER|||Regulat...


In [12]:
########################################################### Required columns extraction from the CSQ column ####################
vcf['ClinVar_CLNDN'] = vcf['csq'].str.split('|').str[82]
vcf['Allele'] = vcf['csq'].str.split('|').str[0]
vcf['CLIN_SIG'] = vcf['csq'].str.split('|').str[70]
vcf['ClinVar_CLNREVSTAT'] = vcf['csq'].str.split('|').str[81]
vcf['ClinVar'] = vcf['csq'].str.split('|').str[79]
vcf['HGVSC'] = vcf['csq'].str.split('|').str[10]
vcf['HGVSP'] = vcf['csq'].str.split('|').str[11]
vcf['PolyPhen'] = vcf['csq'].str.split('|').str[38]
vcf['BIOTYPE'] = vcf['csq'].str.split('|').str[7]
vcf['EXON'] = vcf['csq'].str.split('|').str[8]
vcf['INTRON'] = vcf['csq'].str.split('|').str[9]
vcf['Protein_position'] = vcf['csq'].str.split('|').str[14]
vcf['Amino_acids'] = vcf['csq'].str.split('|').str[15]
vcf['Codons'] = vcf['csq'].str.split('|').str[16]
vcf['STRAND'] = vcf['csq'].str.split('|').str[19]
vcf['PUBMED'] = vcf['csq'].str.split('|').str[73]
vcf['Consequence'] = vcf['csq'].str.split('|').str[1]
vcf['IMPACT'] = vcf['csq'].str.split('|').str[2]
vcf['SIFT'] = vcf['csq'].str.split('|').str[37]
################################################## Frequency columns extraction ################################################
vcf['AF'] = vcf['csq'].str.split('|').str[42]
vcf['AFR_AF'] = vcf['csq'].str.split('|').str[43]
vcf['AMR_AF'] = vcf['csq'].str.split('|').str[44]
vcf['EAS_AF'] = vcf['csq'].str.split('|').str[45]
vcf['EUR_AF'] = vcf['csq'].str.split('|').str[46]
vcf['SAS_AF'] = vcf['csq'].str.split('|').str[47]
vcf['gnomADe_AF'] = vcf['csq'].str.split('|').str[48]
vcf['gnomADe_AFR_AF'] = vcf['csq'].str.split('|').str[49]
vcf['gnomADe_AMR_AF'] = vcf['csq'].str.split('|').str[50]
vcf['gnomADe_ASJ_AF'] = vcf['csq'].str.split('|').str[51]
vcf['gnomADe_EAS_AF'] = vcf['csq'].str.split('|').str[52]
vcf['gnomADe_FIN_AF'] = vcf['csq'].str.split('|').str[53]
vcf['gnomADe_NFE_AF'] = vcf['csq'].str.split('|').str[54]
vcf['gnomADe_OTH_AF'] = vcf['csq'].str.split('|').str[55]
vcf['gnomADe_SAS_AF'] = vcf['csq'].str.split('|').str[56]
vcf['gnomADg_AF'] = vcf['csq'].str.split('|').str[57]
vcf['gnomADg_AFR_AF'] = vcf['csq'].str.split('|').str[58]
vcf['gnomADg_AMI_AF'] = vcf['csq'].str.split('|').str[59]
vcf['gnomADg_AMR_AF'] = vcf['csq'].str.split('|').str[60]
vcf['gnomADg_ASJ_AF'] = vcf['csq'].str.split('|').str[61]
vcf['gnomADg_EAS_AF'] = vcf['csq'].str.split('|').str[62]
vcf['gnomADg_FIN_AF'] = vcf['csq'].str.split('|').str[63]
vcf['gnomADg_MID_AF'] = vcf['csq'].str.split('|').str[64]
vcf['gnomADg_NFE_AF'] = vcf['csq'].str.split('|').str[65]
vcf['gnomADg_OTH_AF'] = vcf['csq'].str.split('|').str[66]
vcf['gnomADg_SAS_AF'] = vcf['csq'].str.split('|').str[67]
vcf['MAX_AF'] = vcf['csq'].str.split('|').str[68]
vcf['MAX_AF_POPS'] = vcf['csq'].str.split('|').str[69]
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,Allele,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,C|missense_variant&splice_region_variant|MODER...,not_provided&Fasting_plasma_glucose_level_quan...,C,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,ENST00000264717.7:c.1337T>C,ENSP00000264717.2:p.Leu446Pro,benign(0),protein_coding,15/19,,446,L/P,cTg/cCg,1,21467728&30409984&18439548&21054877&21647738&2...,missense_variant&splice_region_variant,MODERATE,tolerated(1),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,G|missense_variant&splice_region_variant|MODER...,,G,,,,ENST00000264717.7:c.1337T>G,ENSP00000264717.2:p.Leu446Arg,benign(0),protein_coding,15/19,,446,L/R,cTg/cGg,1,21467728&30409984&18439548&21054877&21647738&2...,missense_variant&splice_region_variant,MODERATE,tolerated(0.06),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe_EAS
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,C|missense_variant&splice_region_variant|MODER...,not_provided&Fasting_plasma_glucose_level_quan...,C,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,ENST00000411584.1:c.440T>C,ENSP00000416917.1:p.Leu147Pro,benign(0.001),protein_coding,5/7,,147,L/P,cTg/cCg,1,21467728&30409984&18439548&21054877&21647738&2...,missense_variant&splice_region_variant,MODERATE,tolerated(1),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,G|missense_variant&splice_region_variant|MODER...,,G,,,,ENST00000411584.1:c.440T>G,ENSP00000416917.1:p.Leu147Arg,benign(0.286),protein_coding,5/7,,147,L/R,cTg/cGg,1,21467728&30409984&18439548&21054877&21647738&2...,missense_variant&splice_region_variant,MODERATE,tolerated(0.06),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe_EAS
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,C|downstream_gene_variant|MODIFIER|GCKR|ENSG00...,not_provided&Fasting_plasma_glucose_level_quan...,C,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,,,,retained_intron,,,,,,1,21467728&30409984&18439548&21054877&21647738&2...,downstream_gene_variant,MODIFIER,,0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,C|intron_variant&non_coding_transcript_variant...,,C,,,,ENST00000493621.1:n.100+316G>C,,,protein_coding_CDS_not_defined,,1/1,,,,1,26780889&18940312&25018854&27458502&22719876&2...,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,A|intron_variant&non_coding_transcript_variant...,,A,,,,ENST00000494795.1:n.3057-532G>A,,,retained_intron,,2/2,,,,1,26780889&18940312&25018854&27458502&22719876&2...,intron_variant&non_coding_transcript_variant,MODIFIER,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg_EAS
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,C|intron_variant&non_coding_transcript_variant...,,C,,,,ENST00000494795.1:n.3057-532G>C,,,retained_intron,,2/2,,,,1,26780889&18940312&25018854&27458502&22719876&2...,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,A|regulatory_region_variant|MODIFIER|||Regulat...,,A,,,,,,,promoter,,,,,,,26780889&18940312&25018854&27458502&22719876&2...,regulatory_region_variant,MODIFIER,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg_EAS


In [13]:
vcf['Protein Position and Amino Acid'] = vcf['Amino_acids'].str[0] + vcf['Protein_position'] + np.where(vcf['Amino_acids'].str[-1] == vcf['Amino_acids'].str[0], '', vcf['Amino_acids'].str[-1])
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,Allele,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,Protein Position and Amino Acid
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,C|missense_variant&splice_region_variant|MODER...,not_provided&Fasting_plasma_glucose_level_quan...,C,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,ENST00000264717.7:c.1337T>C,ENSP00000264717.2:p.Leu446Pro,benign(0),protein_coding,15/19,,446,L/P,cTg/cCg,1,21467728&30409984&18439548&21054877&21647738&2...,missense_variant&splice_region_variant,MODERATE,tolerated(1),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,L446P
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,G|missense_variant&splice_region_variant|MODER...,,G,,,,ENST00000264717.7:c.1337T>G,ENSP00000264717.2:p.Leu446Arg,benign(0),protein_coding,15/19,,446,L/R,cTg/cGg,1,21467728&30409984&18439548&21054877&21647738&2...,missense_variant&splice_region_variant,MODERATE,tolerated(0.06),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe_EAS,L446R
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,C|missense_variant&splice_region_variant|MODER...,not_provided&Fasting_plasma_glucose_level_quan...,C,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,ENST00000411584.1:c.440T>C,ENSP00000416917.1:p.Leu147Pro,benign(0.001),protein_coding,5/7,,147,L/P,cTg/cCg,1,21467728&30409984&18439548&21054877&21647738&2...,missense_variant&splice_region_variant,MODERATE,tolerated(1),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,L147P
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,G|missense_variant&splice_region_variant|MODER...,,G,,,,ENST00000411584.1:c.440T>G,ENSP00000416917.1:p.Leu147Arg,benign(0.286),protein_coding,5/7,,147,L/R,cTg/cGg,1,21467728&30409984&18439548&21054877&21647738&2...,missense_variant&splice_region_variant,MODERATE,tolerated(0.06),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe_EAS,L147R
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,C|downstream_gene_variant|MODIFIER|GCKR|ENSG00...,not_provided&Fasting_plasma_glucose_level_quan...,C,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,,,,retained_intron,,,,,,1,21467728&30409984&18439548&21054877&21647738&2...,downstream_gene_variant,MODIFIER,,0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,C|intron_variant&non_coding_transcript_variant...,,C,,,,ENST00000493621.1:n.100+316G>C,,,protein_coding_CDS_not_defined,,1/1,,,,1,26780889&18940312&25018854&27458502&22719876&2...,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,A|intron_variant&non_coding_transcript_variant...,,A,,,,ENST00000494795.1:n.3057-532G>A,,,retained_intron,,2/2,,,,1,26780889&18940312&25018854&27458502&22719876&2...,intron_variant&non_coding_transcript_variant,MODIFIER,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg_EAS,
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,C|intron_variant&non_coding_transcript_variant...,,C,,,,ENST00000494795.1:n.3057-532G>C,,,retained_intron,,2/2,,,,1,26780889&18940312&25018854&27458502&22719876&2...,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,A|regulatory_region_variant|MODIFIER|||Regulat...,,A,,,,,,,promoter,,,,,,,26780889&18940312&25018854&27458502&22719876&2...,regulatory_region_variant,MODIFIER,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg_EAS,


In [14]:
vcf[['HGVSc', 'HGVSc (Transcript)']] = vcf['HGVSC'].str.split(':', 1, expand=True)
vcf[['HGVSp', 'HGVSp (Transcript)']] = vcf['HGVSP'].str.split(':', 1, expand=True)
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Gene_Name,Gene Name,CSQ,csq,ClinVar_CLNDN,Allele,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSC,HGVSP,PolyPhen,BIOTYPE,EXON,INTRON,Protein_position,Amino_acids,Codons,STRAND,PUBMED,Consequence,IMPACT,SIFT,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,Protein Position and Amino Acid,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript)
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,C|missense_variant&splice_region_variant|MODER...,not_provided&Fasting_plasma_glucose_level_quan...,C,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,ENST00000264717.7:c.1337T>C,ENSP00000264717.2:p.Leu446Pro,benign(0),protein_coding,15/19,,446,L/P,cTg/cCg,1,21467728&30409984&18439548&21054877&21647738&2...,missense_variant&splice_region_variant,MODERATE,tolerated(1),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,L446P,ENST00000264717.7,c.1337T>C,ENSP00000264717.2,p.Leu446Pro
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,G|missense_variant&splice_region_variant|MODER...,,G,,,,ENST00000264717.7:c.1337T>G,ENSP00000264717.2:p.Leu446Arg,benign(0),protein_coding,15/19,,446,L/R,cTg/cGg,1,21467728&30409984&18439548&21054877&21647738&2...,missense_variant&splice_region_variant,MODERATE,tolerated(0.06),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe_EAS,L446R,ENST00000264717.7,c.1337T>G,ENSP00000264717.2,p.Leu446Arg
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,C|missense_variant&splice_region_variant|MODER...,not_provided&Fasting_plasma_glucose_level_quan...,C,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,ENST00000411584.1:c.440T>C,ENSP00000416917.1:p.Leu147Pro,benign(0.001),protein_coding,5/7,,147,L/P,cTg/cCg,1,21467728&30409984&18439548&21054877&21647738&2...,missense_variant&splice_region_variant,MODERATE,tolerated(1),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,L147P,ENST00000411584.1,c.440T>C,ENSP00000416917.1,p.Leu147Pro
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,G|missense_variant&splice_region_variant|MODER...,,G,,,,ENST00000411584.1:c.440T>G,ENSP00000416917.1:p.Leu147Arg,benign(0.286),protein_coding,5/7,,147,L/R,cTg/cGg,1,21467728&30409984&18439548&21054877&21647738&2...,missense_variant&splice_region_variant,MODERATE,tolerated(0.06),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe_EAS,L147R,ENST00000411584.1,c.440T>G,ENSP00000416917.1,p.Leu147Arg
0,chr2,27508073,rs1260326,T,"C,G",.,.,RS=1260326;dbSNPBuildID=87;SSR=0;GENEINFO=GCKR...,GCKR:2646,GCKR,C|missense_variant&splice_region_variant|MODER...,C|downstream_gene_variant|MODIFIER|GCKR|ENSG00...,not_provided&Fasting_plasma_glucose_level_quan...,C,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,,,,retained_intron,,,,,,1,21467728&30409984&18439548&21054877&21647738&2...,downstream_gene_variant,MODIFIER,,0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,C|intron_variant&non_coding_transcript_variant...,,C,,,,ENST00000493621.1:n.100+316G>C,,,protein_coding_CDS_not_defined,,1/1,,,,1,26780889&18940312&25018854&27458502&22719876&2...,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ENST00000493621.1,n.100+316G>C,,
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,A|intron_variant&non_coding_transcript_variant...,,A,,,,ENST00000494795.1:n.3057-532G>A,,,retained_intron,,2/2,,,,1,26780889&18940312&25018854&27458502&22719876&2...,intron_variant&non_coding_transcript_variant,MODIFIER,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg_EAS,,ENST00000494795.1,n.3057-532G>A,,
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,C|intron_variant&non_coding_transcript_variant...,,C,,,,ENST00000494795.1:n.3057-532G>C,,,retained_intron,,2/2,,,,1,26780889&18940312&25018854&27458502&22719876&2...,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ENST00000494795.1,n.3057-532G>C,,
13,chr22,43995806,rs2143571,G,"A,C",.,.,RS=2143571;dbSNPBuildID=96;SSR=0;GENEINFO=SAMM...,SAMM50:25813,SAMM50,A|intron_variant|MODIFIER|SAMM50|ENSG000001003...,A|regulatory_region_variant|MODIFIER|||Regulat...,,A,,,,,,,promoter,,,,,,,26780889&18940312&25018854&27458502&22719876&2...,regulatory_region_variant,MODIFIER,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg_EAS,,,,,


In [15]:
vcf_final = vcf[['Gene Name', 'rsID','CHROM', 'POS', 'REF', 'ALT', 'Allele', 'Consequence', 'IMPACT',
          'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT',
          'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'SIFT', 'PolyPhen', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
       'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF',
       'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF', 'gnomADe_OTH_AF',
       'gnomADe_SAS_AF', 'gnomADg_AF', 'gnomADg_AFR_AF', 'gnomADg_AMI_AF',
       'gnomADg_AMR_AF', 'gnomADg_ASJ_AF', 'gnomADg_EAS_AF', 'gnomADg_FIN_AF',
       'gnomADg_MID_AF', 'gnomADg_NFE_AF', 'gnomADg_OTH_AF', 'gnomADg_SAS_AF',
       'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE', 'EXON', 'INTRON',
          'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED']]
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Allele,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,missense_variant&splice_region_variant,MODERATE,not_provided&Fasting_plasma_glucose_level_quan...,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,ENST00000264717.7,c.1337T>C,ENSP00000264717.2,p.Leu446Pro,tolerated(1),benign(0),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein_coding,15/19,,L446P,cTg/cCg,1,21467728&30409984&18439548&21054877&21647738&2...
0,GCKR,rs1260326,chr2,27508073,T,"C,G",G,missense_variant&splice_region_variant,MODERATE,,,,,ENST00000264717.7,c.1337T>G,ENSP00000264717.2,p.Leu446Arg,tolerated(0.06),benign(0),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe_EAS,protein_coding,15/19,,L446R,cTg/cGg,1,21467728&30409984&18439548&21054877&21647738&2...
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,missense_variant&splice_region_variant,MODERATE,not_provided&Fasting_plasma_glucose_level_quan...,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,ENST00000411584.1,c.440T>C,ENSP00000416917.1,p.Leu147Pro,tolerated(1),benign(0.001),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein_coding,5/7,,L147P,cTg/cCg,1,21467728&30409984&18439548&21054877&21647738&2...
0,GCKR,rs1260326,chr2,27508073,T,"C,G",G,missense_variant&splice_region_variant,MODERATE,,,,,ENST00000411584.1,c.440T>G,ENSP00000416917.1,p.Leu147Arg,tolerated(0.06),benign(0.286),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe_EAS,protein_coding,5/7,,L147R,cTg/cGg,1,21467728&30409984&18439548&21054877&21647738&2...
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,downstream_gene_variant,MODIFIER,not_provided&Fasting_plasma_glucose_level_quan...,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,,,,,,,0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,retained_intron,,,,,1,21467728&30409984&18439548&21054877&21647738&2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000493621.1,n.100+316G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein_coding_CDS_not_defined,,1/1,,,1,26780889&18940312&25018854&27458502&22719876&2...
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000494795.1,n.3057-532G>A,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg_EAS,retained_intron,,2/2,,,1,26780889&18940312&25018854&27458502&22719876&2...
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000494795.1,n.3057-532G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,retained_intron,,2/2,,,1,26780889&18940312&25018854&27458502&22719876&2...
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,regulatory_region_variant,MODIFIER,,,,,,,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg_EAS,promoter,,,,,,26780889&18940312&25018854&27458502&22719876&2...


In [16]:
# Define the terms to remove
remove_terms = set(["not_specified", "not_provided"])

# Apply the filtering operation to 'Column1' only
vcf_final['ClinVar_CLNDN'] = vcf_final['ClinVar_CLNDN'].apply(lambda row: "&".join(
    [term for term in row.split("&") if term not in remove_terms]
    ) if isinstance(row, str) and not all(term in remove_terms for term in row.split("&")) else row)


vcf_final['CLIN_SIG'] = vcf_final['CLIN_SIG'].apply(lambda row: "&".join(
    [term for term in row.split("&") if term not in remove_terms]
    ) if isinstance(row, str) and not all(term in remove_terms for term in row.split("&")) else row)


vcf_final['ClinVar_CLNREVSTAT'] = vcf_final['ClinVar_CLNREVSTAT'].apply(lambda row: "&".join(
    [term for term in row.split("&") if term not in remove_terms]
    ) if isinstance(row, str) and not all(term in remove_terms for term in row.split("&")) else row)

# Print the modified DataFrame
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Allele,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,missense_variant&splice_region_variant,MODERATE,Fasting_plasma_glucose_level_quantitative_trai...,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,ENST00000264717.7,c.1337T>C,ENSP00000264717.2,p.Leu446Pro,tolerated(1),benign(0),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein_coding,15/19,,L446P,cTg/cCg,1,21467728&30409984&18439548&21054877&21647738&2...
0,GCKR,rs1260326,chr2,27508073,T,"C,G",G,missense_variant&splice_region_variant,MODERATE,,,,,ENST00000264717.7,c.1337T>G,ENSP00000264717.2,p.Leu446Arg,tolerated(0.06),benign(0),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe_EAS,protein_coding,15/19,,L446R,cTg/cGg,1,21467728&30409984&18439548&21054877&21647738&2...
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,missense_variant&splice_region_variant,MODERATE,Fasting_plasma_glucose_level_quantitative_trai...,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,ENST00000411584.1,c.440T>C,ENSP00000416917.1,p.Leu147Pro,tolerated(1),benign(0.001),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein_coding,5/7,,L147P,cTg/cCg,1,21467728&30409984&18439548&21054877&21647738&2...
0,GCKR,rs1260326,chr2,27508073,T,"C,G",G,missense_variant&splice_region_variant,MODERATE,,,,,ENST00000411584.1,c.440T>G,ENSP00000416917.1,p.Leu147Arg,tolerated(0.06),benign(0.286),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe_EAS,protein_coding,5/7,,L147R,cTg/cGg,1,21467728&30409984&18439548&21054877&21647738&2...
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,downstream_gene_variant,MODIFIER,Fasting_plasma_glucose_level_quantitative_trai...,benign&association,criteria_provided&_multiple_submitters&_no_con...,8751,,,,,,,0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,retained_intron,,,,,1,21467728&30409984&18439548&21054877&21647738&2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000493621.1,n.100+316G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein_coding_CDS_not_defined,,1/1,,,1,26780889&18940312&25018854&27458502&22719876&2...
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000494795.1,n.3057-532G>A,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg_EAS,retained_intron,,2/2,,,1,26780889&18940312&25018854&27458502&22719876&2...
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,intron_variant&non_coding_transcript_variant,MODIFIER,,,,,ENST00000494795.1,n.3057-532G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,retained_intron,,2/2,,,1,26780889&18940312&25018854&27458502&22719876&2...
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,regulatory_region_variant,MODIFIER,,,,,,,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg_EAS,promoter,,,,,,26780889&18940312&25018854&27458502&22719876&2...


In [17]:
vcf_final = vcf_final.astype(str).applymap(lambda x: x.replace('&', ',').replace('_', ' '))
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Allele,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,"missense variant,splice region variant",MODERATE,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,ENST00000264717.7,c.1337T>C,ENSP00000264717.2,p.Leu446Pro,tolerated(1),benign(0),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein coding,15/19,,L446P,cTg/cCg,1,"21467728,30409984,18439548,21054877,21647738,2..."
0,GCKR,rs1260326,chr2,27508073,T,"C,G",G,"missense variant,splice region variant",MODERATE,,,,,ENST00000264717.7,c.1337T>G,ENSP00000264717.2,p.Leu446Arg,tolerated(0.06),benign(0),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe EAS,protein coding,15/19,,L446R,cTg/cGg,1,"21467728,30409984,18439548,21054877,21647738,2..."
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,"missense variant,splice region variant",MODERATE,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,ENST00000411584.1,c.440T>C,ENSP00000416917.1,p.Leu147Pro,tolerated(1),benign(0.001),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein coding,5/7,,L147P,cTg/cCg,1,"21467728,30409984,18439548,21054877,21647738,2..."
0,GCKR,rs1260326,chr2,27508073,T,"C,G",G,"missense variant,splice region variant",MODERATE,,,,,ENST00000411584.1,c.440T>G,ENSP00000416917.1,p.Leu147Arg,tolerated(0.06),benign(0.286),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe EAS,protein coding,5/7,,L147R,cTg/cGg,1,"21467728,30409984,18439548,21054877,21647738,2..."
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,downstream gene variant,MODIFIER,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,,,,,,,0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,retained intron,,,,,1,"21467728,30409984,18439548,21054877,21647738,2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000493621.1,n.100+316G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding CDS not defined,,1/1,,,1,"26780889,18940312,25018854,27458502,22719876,2..."
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000494795.1,n.3057-532G>A,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg EAS,retained intron,,2/2,,,1,"26780889,18940312,25018854,27458502,22719876,2..."
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000494795.1,n.3057-532G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,retained intron,,2/2,,,1,"26780889,18940312,25018854,27458502,22719876,2..."
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,regulatory region variant,MODIFIER,,,,,,,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg EAS,promoter,,,,,,"26780889,18940312,25018854,27458502,22719876,2..."


In [18]:
vcf_final['consequence'] = vcf_final['Consequence'].str.split(',').str[0]
vcf_final

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Allele,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,"missense variant,splice region variant",MODERATE,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,ENST00000264717.7,c.1337T>C,ENSP00000264717.2,p.Leu446Pro,tolerated(1),benign(0),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein coding,15/19,,L446P,cTg/cCg,1,"21467728,30409984,18439548,21054877,21647738,2...",missense variant
0,GCKR,rs1260326,chr2,27508073,T,"C,G",G,"missense variant,splice region variant",MODERATE,,,,,ENST00000264717.7,c.1337T>G,ENSP00000264717.2,p.Leu446Arg,tolerated(0.06),benign(0),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe EAS,protein coding,15/19,,L446R,cTg/cGg,1,"21467728,30409984,18439548,21054877,21647738,2...",missense variant
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,"missense variant,splice region variant",MODERATE,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,ENST00000411584.1,c.440T>C,ENSP00000416917.1,p.Leu147Pro,tolerated(1),benign(0.001),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein coding,5/7,,L147P,cTg/cCg,1,"21467728,30409984,18439548,21054877,21647738,2...",missense variant
0,GCKR,rs1260326,chr2,27508073,T,"C,G",G,"missense variant,splice region variant",MODERATE,,,,,ENST00000411584.1,c.440T>G,ENSP00000416917.1,p.Leu147Arg,tolerated(0.06),benign(0.286),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe EAS,protein coding,5/7,,L147R,cTg/cGg,1,"21467728,30409984,18439548,21054877,21647738,2...",missense variant
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,downstream gene variant,MODIFIER,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,,,,,,,0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,retained intron,,,,,1,"21467728,30409984,18439548,21054877,21647738,2...",downstream gene variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000493621.1,n.100+316G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding CDS not defined,,1/1,,,1,"26780889,18940312,25018854,27458502,22719876,2...",intron variant
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000494795.1,n.3057-532G>A,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg EAS,retained intron,,2/2,,,1,"26780889,18940312,25018854,27458502,22719876,2...",intron variant
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000494795.1,n.3057-532G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,retained intron,,2/2,,,1,"26780889,18940312,25018854,27458502,22719876,2...",intron variant
13,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,regulatory region variant,MODIFIER,,,,,,,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg EAS,promoter,,,,,,"26780889,18940312,25018854,27458502,22719876,2...",regulatory region variant


In [19]:
df_1 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/consequence.xlsx')
df_1

Unnamed: 0,consequence,Consequence_score
0,transcript ablation,10/10
1,splice acceptor variant,8/10
2,splice donor variant,8/10
3,stop gained,10/10
4,frameshift variant,10/10
5,stop lost,9/10
6,start lost,9/10
7,transcript amplification,8/10
8,inframe insertion,6/10
9,inframe deletion,6/10


In [20]:
merged_1 = pd.merge(vcf_final, df_1, on='consequence', how='left', sort=False)
merged_1

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Allele,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence,Consequence_score
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,"missense variant,splice region variant",MODERATE,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,ENST00000264717.7,c.1337T>C,ENSP00000264717.2,p.Leu446Pro,tolerated(1),benign(0),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein coding,15/19,,L446P,cTg/cCg,1,"21467728,30409984,18439548,21054877,21647738,2...",missense variant,7/10
1,GCKR,rs1260326,chr2,27508073,T,"C,G",G,"missense variant,splice region variant",MODERATE,,,,,ENST00000264717.7,c.1337T>G,ENSP00000264717.2,p.Leu446Arg,tolerated(0.06),benign(0),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe EAS,protein coding,15/19,,L446R,cTg/cGg,1,"21467728,30409984,18439548,21054877,21647738,2...",missense variant,7/10
2,GCKR,rs1260326,chr2,27508073,T,"C,G",C,"missense variant,splice region variant",MODERATE,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,ENST00000411584.1,c.440T>C,ENSP00000416917.1,p.Leu147Pro,tolerated(1),benign(0.001),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein coding,5/7,,L147P,cTg/cCg,1,"21467728,30409984,18439548,21054877,21647738,2...",missense variant,7/10
3,GCKR,rs1260326,chr2,27508073,T,"C,G",G,"missense variant,splice region variant",MODERATE,,,,,ENST00000411584.1,c.440T>G,ENSP00000416917.1,p.Leu147Arg,tolerated(0.06),benign(0.286),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe EAS,protein coding,5/7,,L147R,cTg/cGg,1,"21467728,30409984,18439548,21054877,21647738,2...",missense variant,7/10
4,GCKR,rs1260326,chr2,27508073,T,"C,G",C,downstream gene variant,MODIFIER,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,,,,,,,0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,retained intron,,,,,1,"21467728,30409984,18439548,21054877,21647738,2...",downstream gene variant,2/10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000493621.1,n.100+316G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding CDS not defined,,1/1,,,1,"26780889,18940312,25018854,27458502,22719876,2...",intron variant,2/10
106,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000494795.1,n.3057-532G>A,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg EAS,retained intron,,2/2,,,1,"26780889,18940312,25018854,27458502,22719876,2...",intron variant,2/10
107,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000494795.1,n.3057-532G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,retained intron,,2/2,,,1,"26780889,18940312,25018854,27458502,22719876,2...",intron variant,2/10
108,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,regulatory region variant,MODIFIER,,,,,,,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg EAS,promoter,,,,,,"26780889,18940312,25018854,27458502,22719876,2...",regulatory region variant,2/10


In [21]:
df_2 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/IMPACT.xlsx')
df_2

Unnamed: 0,IMPACT,IMPACT_score
0,HIGH,10.0
1,MODERATE,5.0
2,LOW,2.5
3,MODIFIER,1.5


In [22]:
merged_2 = pd.merge(merged_1, df_2, on = 'IMPACT', how='left', sort=False)
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Allele,Consequence,IMPACT,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,consequence,Consequence_score,IMPACT_score
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,"missense variant,splice region variant",MODERATE,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,ENST00000264717.7,c.1337T>C,ENSP00000264717.2,p.Leu446Pro,tolerated(1),benign(0),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein coding,15/19,,L446P,cTg/cCg,1,"21467728,30409984,18439548,21054877,21647738,2...",missense variant,7/10,5.0
1,GCKR,rs1260326,chr2,27508073,T,"C,G",G,"missense variant,splice region variant",MODERATE,,,,,ENST00000264717.7,c.1337T>G,ENSP00000264717.2,p.Leu446Arg,tolerated(0.06),benign(0),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe EAS,protein coding,15/19,,L446R,cTg/cGg,1,"21467728,30409984,18439548,21054877,21647738,2...",missense variant,7/10,5.0
2,GCKR,rs1260326,chr2,27508073,T,"C,G",C,"missense variant,splice region variant",MODERATE,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,ENST00000411584.1,c.440T>C,ENSP00000416917.1,p.Leu147Pro,tolerated(1),benign(0.001),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein coding,5/7,,L147P,cTg/cCg,1,"21467728,30409984,18439548,21054877,21647738,2...",missense variant,7/10,5.0
3,GCKR,rs1260326,chr2,27508073,T,"C,G",G,"missense variant,splice region variant",MODERATE,,,,,ENST00000411584.1,c.440T>G,ENSP00000416917.1,p.Leu147Arg,tolerated(0.06),benign(0.286),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe EAS,protein coding,5/7,,L147R,cTg/cGg,1,"21467728,30409984,18439548,21054877,21647738,2...",missense variant,7/10,5.0
4,GCKR,rs1260326,chr2,27508073,T,"C,G",C,downstream gene variant,MODIFIER,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,,,,,,,0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,retained intron,,,,,1,"21467728,30409984,18439548,21054877,21647738,2...",downstream gene variant,2/10,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000493621.1,n.100+316G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding CDS not defined,,1/1,,,1,"26780889,18940312,25018854,27458502,22719876,2...",intron variant,2/10,1.5
106,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000494795.1,n.3057-532G>A,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg EAS,retained intron,,2/2,,,1,"26780889,18940312,25018854,27458502,22719876,2...",intron variant,2/10,1.5
107,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,"intron variant,non coding transcript variant",MODIFIER,,,,,ENST00000494795.1,n.3057-532G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,retained intron,,2/2,,,1,"26780889,18940312,25018854,27458502,22719876,2...",intron variant,2/10,1.5
108,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,regulatory region variant,MODIFIER,,,,,,,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg EAS,promoter,,,,,,"26780889,18940312,25018854,27458502,22719876,2...",regulatory region variant,2/10,1.5


In [23]:
merged_2 = merged_2[['Gene Name', 'rsID', 'CHROM', 'POS', 'REF', 'ALT', 'Allele',
       'Consequence','Consequence_score', 'IMPACT', 'IMPACT_score', 'ClinVar_CLNDN', 'CLIN_SIG',
       'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp',
       'HGVSp (Transcript)', 'SIFT', 'PolyPhen', 'AF', 'AFR_AF',
       'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomADe_AF', 'gnomADe_AFR_AF',
       'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF',
       'gnomADe_NFE_AF', 'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'gnomADg_AF',
       'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF', 'gnomADg_ASJ_AF',
       'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF', 'gnomADg_NFE_AF',
       'gnomADg_OTH_AF', 'gnomADg_SAS_AF', 'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE',
       'EXON', 'INTRON', 'Protein Position and Amino Acid', 'Codons', 'STRAND',
       'PUBMED']]
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Allele,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,GCKR,rs1260326,chr2,27508073,T,"C,G",C,"missense variant,splice region variant",7/10,MODERATE,5.0,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,ENST00000264717.7,c.1337T>C,ENSP00000264717.2,p.Leu446Pro,tolerated(1),benign(0),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein coding,15/19,,L446P,cTg/cCg,1,"21467728,30409984,18439548,21054877,21647738,2..."
1,GCKR,rs1260326,chr2,27508073,T,"C,G",G,"missense variant,splice region variant",7/10,MODERATE,5.0,,,,,ENST00000264717.7,c.1337T>G,ENSP00000264717.2,p.Leu446Arg,tolerated(0.06),benign(0),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe EAS,protein coding,15/19,,L446R,cTg/cGg,1,"21467728,30409984,18439548,21054877,21647738,2..."
2,GCKR,rs1260326,chr2,27508073,T,"C,G",C,"missense variant,splice region variant",7/10,MODERATE,5.0,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,ENST00000411584.1,c.440T>C,ENSP00000416917.1,p.Leu147Pro,tolerated(1),benign(0.001),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,protein coding,5/7,,L147P,cTg/cCg,1,"21467728,30409984,18439548,21054877,21647738,2..."
3,GCKR,rs1260326,chr2,27508073,T,"C,G",G,"missense variant,splice region variant",7/10,MODERATE,5.0,,,,,ENST00000411584.1,c.440T>G,ENSP00000416917.1,p.Leu147Arg,tolerated(0.06),benign(0.286),,,,,,,3.982e-06,0,0,0,5.444e-05,0,0,0,0,,,,,,,,,,,,5.444e-05,gnomADe EAS,protein coding,5/7,,L147R,cTg/cGg,1,"21467728,30409984,18439548,21054877,21647738,2..."
4,GCKR,rs1260326,chr2,27508073,T,"C,G",C,downstream gene variant,2/10,MODIFIER,1.5,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751,,,,,,,0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.6331,0.8679,0.6662,0.4668,0.5108,0.6426,0.5902,0.6005,0.758,0.67,0.8582,0.7083,0.626,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.9062,AFR,retained intron,,,,,1,"21467728,30409984,18439548,21054877,21647738,2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000493621.1,n.100+316G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding CDS not defined,,1/1,,,1,"26780889,18940312,25018854,27458502,22719876,2..."
106,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000494795.1,n.3057-532G>A,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg EAS,retained intron,,2/2,,,1,"26780889,18940312,25018854,27458502,22719876,2..."
107,SAMM50,rs2143571,chr22,43995806,G,"A,C",C,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000494795.1,n.3057-532G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,retained intron,,2/2,,,1,"26780889,18940312,25018854,27458502,22719876,2..."
108,SAMM50,rs2143571,chr22,43995806,G,"A,C",A,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.227,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.3875,gnomADg EAS,promoter,,,,,,"26780889,18940312,25018854,27458502,22719876,2..."


In [42]:
df_gene = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Diab_all_genes_conditions.xlsx')
df_gene

Unnamed: 0,Gene Name,Matched_Gene
0,CTLA4,Type1Diabetes
1,INS,"Type1Diabetes, MODY"
2,IL2RA,Type1Diabetes
3,HNF1A,"Type1Diabetes, Type2Diabetes, MODY"
4,CEL,Type1Diabetes
5,HLADQ,Type1Diabetes
6,HLADR,Type1Diabetes
7,PTPN22,Type1Diabetes
8,IL2,Type1Diabetes
9,ERBB3,Type1Diabetes


In [43]:
merged_2['Gene Match'] = 'No'
merged_2['Matched_Gene'] = ''
    
# Iterate through each gene in vcf['Gene']
for index, genes in merged_2['Gene Name'].iteritems():
    if isinstance(genes, str):
        gene_list = genes.split(',')
        for gene in gene_list:
            if gene in df_gene['Gene Name'].values:
                merged_2.at[index, 'Gene Match'] = 'Yes'
                merged_2.at[index, 'Matched_Gene'] = gene
                break
    
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Gene Match,Matched_Gene
0,,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,transcribed unprocessed pseudogene,,,,,1,,No,
1,,rs1416222198,chr1,65591,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+18C>T,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,protein coding,,2/2,,,1,,No,
2,,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,lncRNA,,,,,1,,No,
3,OR4F5,rs1260343719,chr1,69063,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.36T>C,ENSP00000493376.2,p.Asn12%3D,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,protein coding,3/3,,N12,aaT/aaC,1,,No,
4,OR4F5,rs1260343719,chr1,69063,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,lncRNA,,,,,1,,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306164,,.,chrX,155459837,A,G,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,71,103,103,81,22,21.36%,6.7976E-8,54,27,14,8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,No,
306165,IL9R,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000244174.11,c.580-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,5/8,,,1,,No,
306166,IL9R,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000369423.7,c.685-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,6/8,,,1,,No,
306167,IL9R,rs2073467,chrX,156005252,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding CDS not defined,,,,,1,,No,


In [44]:
df_gene = df_gene.rename({'Gene Name':'Matched_Gene', 'Matched_Gene':'Gene Match'}, axis=1)
df_gene

Unnamed: 0,Matched_Gene,Gene Match
0,CTLA4,Type1Diabetes
1,INS,"Type1Diabetes, MODY"
2,IL2RA,Type1Diabetes
3,HNF1A,"Type1Diabetes, Type2Diabetes, MODY"
4,CEL,Type1Diabetes
5,HLADQ,Type1Diabetes
6,HLADR,Type1Diabetes
7,PTPN22,Type1Diabetes
8,IL2,Type1Diabetes
9,ERBB3,Type1Diabetes


In [45]:
merged_2 = merged_2.drop(columns=['Gene Match'], axis=1)
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Matched_Gene
0,,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,transcribed unprocessed pseudogene,,,,,1,,
1,,rs1416222198,chr1,65591,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+18C>T,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,protein coding,,2/2,,,1,,
2,,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,lncRNA,,,,,1,,
3,OR4F5,rs1260343719,chr1,69063,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.36T>C,ENSP00000493376.2,p.Asn12%3D,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,protein coding,3/3,,N12,aaT/aaC,1,,
4,OR4F5,rs1260343719,chr1,69063,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,lncRNA,,,,,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306164,,.,chrX,155459837,A,G,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,71,103,103,81,22,21.36%,6.7976E-8,54,27,14,8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,
306165,IL9R,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000244174.11,c.580-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,5/8,,,1,,
306166,IL9R,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000369423.7,c.685-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,6/8,,,1,,
306167,IL9R,rs2073467,chrX,156005252,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding CDS not defined,,,,,1,,


In [46]:
merged_2 = pd.merge(merged_2, df_gene, on= 'Matched_Gene', how = 'left', sort = False)
merged_2['Gene Match'] = merged_2['Gene Match'].fillna('No')
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Matched_Gene,Gene Match
0,,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,transcribed unprocessed pseudogene,,,,,1,,,No
1,,rs1416222198,chr1,65591,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+18C>T,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,protein coding,,2/2,,,1,,,No
2,,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,lncRNA,,,,,1,,,No
3,OR4F5,rs1260343719,chr1,69063,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.36T>C,ENSP00000493376.2,p.Asn12%3D,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,protein coding,3/3,,N12,aaT/aaC,1,,,No
4,OR4F5,rs1260343719,chr1,69063,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,lncRNA,,,,,1,,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306164,,.,chrX,155459837,A,G,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,71,103,103,81,22,21.36%,6.7976E-8,54,27,14,8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,,No
306165,IL9R,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000244174.11,c.580-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,5/8,,,1,,,No
306166,IL9R,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000369423.7,c.685-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,6/8,,,1,,,No
306167,IL9R,rs2073467,chrX,156005252,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding CDS not defined,,,,,1,,,No


In [47]:
merged_2['Gene Match'].value_counts()

No                                    304214
Type2Diabetes, MODY                      589
Type2Diabetes                            416
Type1Diabetes                            389
Type1Diabetes, Type2Diabetes, MODY       206
Diabetic_Nephropathy                     124
Diabetic_Neuropathy                      117
Type1Diabetes, MODY                       65
MODY                                      34
Diabetic_Retinopathy                      15
Name: Gene Match, dtype: int64

In [48]:
merged_2.columns

Index(['Gene Name', 'rsID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
       'Consequence', 'Consequence_score', 'IMPACT', 'IMPACT_score',
       'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc',
       'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'GT', 'GQ', 'SDP',
       'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT',
       'PolyPhen', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF',
       'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF',
       'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF', 'gnomADe_OTH_AF',
       'gnomADe_SAS_AF', 'gnomADg_AF', 'gnomADg_AFR_AF', 'gnomADg_AMI_AF',
       'gnomADg_AMR_AF', 'gnomADg_ASJ_AF', 'gnomADg_EAS_AF', 'gnomADg_FIN_AF',
       'gnomADg_MID_AF', 'gnomADg_NFE_AF', 'gnomADg_OTH_AF', 'gnomADg_SAS_AF',
       'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE', 'EXON', 'INTRON',
       'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED',
       'Matched_Gene', 'Gene Mat

In [49]:
merged_2 = merged_2[['Gene Name', 'Gene Match', 'rsID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
       'Consequence', 'Consequence_score', 'IMPACT', 'IMPACT_score',
       'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc',
       'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'GT', 'GQ', 'SDP',
       'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT',
       'PolyPhen', 'AF', 'AFR_AF',
       'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomADe_AF', 'gnomADe_AFR_AF',
       'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF',
       'gnomADe_NFE_AF', 'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'gnomADg_AF',
       'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF', 'gnomADg_ASJ_AF',
       'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF', 'gnomADg_NFE_AF',
       'gnomADg_OTH_AF', 'gnomADg_SAS_AF', 'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE', 'EXON', 'INTRON',
       'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED']]
merged_2['POS'] = merged_2['POS'].astype('int64')
merged_2

Unnamed: 0,Gene Name,Gene Match,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,,No,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,transcribed unprocessed pseudogene,,,,,1,
1,,No,rs1416222198,chr1,65591,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+18C>T,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,protein coding,,2/2,,,1,
2,,No,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,lncRNA,,,,,1,
3,OR4F5,No,rs1260343719,chr1,69063,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.36T>C,ENSP00000493376.2,p.Asn12%3D,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,protein coding,3/3,,N12,aaT/aaC,1,
4,OR4F5,No,rs1260343719,chr1,69063,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,lncRNA,,,,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306164,,No,.,chrX,155459837,A,G,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,71,103,103,81,22,21.36%,6.7976E-8,54,27,14,8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,
306165,IL9R,No,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000244174.11,c.580-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,5/8,,,1,
306166,IL9R,No,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000369423.7,c.685-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,6/8,,,1,
306167,IL9R,No,rs2073467,chrX,156005252,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding CDS not defined,,,,,1,


In [21]:
merged_2['Gene_Match'] = 'No'

# Iterate through each gene in df1
for genes in merged_2['Gene Name']:
    if isinstance(genes, str):  # Check if the gene value is a non-null string
        gene_list = genes.split(',')  # Split the genes by comma to create a list
        match = any(gene in df_gene['Gene Name'].values for gene in gene_list)  # Check if any gene in the list exists in df2
        if match:
            merged_2.loc[merged_2['Gene Name'] == genes, 'Gene_Match'] = 'Yes'
            
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Gene_Match
0,,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,transcribed unprocessed pseudogene,,,,,1,,No
1,,rs1416222198,chr1,65591,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+18C>T,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,protein coding,,2/2,,,1,,No
2,,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,lncRNA,,,,,1,,No
3,OR4F5,rs1260343719,chr1,69063,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.36T>C,ENSP00000493376.2,p.Asn12%3D,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,protein coding,3/3,,N12,aaT/aaC,1,,No
4,OR4F5,rs1260343719,chr1,69063,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,lncRNA,,,,,1,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306164,,.,chrX,155459837,A,G,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,71,103,103,81,22,21.36%,6.7976E-8,54,27,14,8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,No
306165,IL9R,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000244174.11,c.580-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,5/8,,,1,,No
306166,IL9R,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000369423.7,c.685-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,6/8,,,1,,No
306167,IL9R,rs2073467,chrX,156005252,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding CDS not defined,,,,,1,,No


In [22]:
merged_2.Gene_Match.value_counts()

No     303375
Yes      2794
Name: Gene_Match, dtype: int64

In [23]:
merged_2 = merged_2[['Gene Name', 'Gene_Match', 'rsID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
       'Consequence', 'Consequence_score', 'IMPACT', 'IMPACT_score',
       'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc',
       'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'GT', 'GQ', 'SDP',
       'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT',
       'PolyPhen', 'AF', 'AFR_AF',
       'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomADe_AF', 'gnomADe_AFR_AF',
       'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF',
       'gnomADe_NFE_AF', 'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'gnomADg_AF',
       'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF', 'gnomADg_ASJ_AF',
       'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF', 'gnomADg_NFE_AF',
       'gnomADg_OTH_AF', 'gnomADg_SAS_AF', 'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE', 'EXON', 'INTRON',
       'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED']]
merged_2['POS'] = merged_2['POS'].astype('int64')
merged_2

Unnamed: 0,Gene Name,Gene_Match,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,,No,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,transcribed unprocessed pseudogene,,,,,1,
1,,No,rs1416222198,chr1,65591,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+18C>T,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,protein coding,,2/2,,,1,
2,,No,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,lncRNA,,,,,1,
3,OR4F5,No,rs1260343719,chr1,69063,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.36T>C,ENSP00000493376.2,p.Asn12%3D,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,protein coding,3/3,,N12,aaT/aaC,1,
4,OR4F5,No,rs1260343719,chr1,69063,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,lncRNA,,,,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306164,,No,.,chrX,155459837,A,G,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,71,103,103,81,22,21.36%,6.7976E-8,54,27,14,8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,
306165,IL9R,No,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000244174.11,c.580-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,5/8,,,1,
306166,IL9R,No,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000369423.7,c.685-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,6/8,,,1,
306167,IL9R,No,rs2073467,chrX,156005252,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding CDS not defined,,,,,1,


In [50]:
import pandas as pd
x = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Diab_lit_variants_all.xlsx')
x['chrom'] = x['Chrom-pos-Ref-Alt_38'].str.split(',')
x = x.explode('chrom')

x['CHROM'] = x['chrom'].str.split('-').str[0]

# Function to add 'chr' prefix conditionally
def add_chr_prefix(chrom):
    if pd.notnull(chrom) and chrom.strip() != '':
        return 'chr' + str(chrom)
    else:
        return chrom

# Applying the function to the 'chromosome' column
x['CHROM'] = x['CHROM'].apply(add_chr_prefix)
x['CHROM'] = x['CHROM'].str.strip()
x['CHROM'] = x['CHROM'].str.replace(r'\s+', '')
x['POS'] = x['chrom'].str.split('-').str[1]

x.dropna(subset=['CHROM'], inplace=True)
# Drop rows with empty cells after removing leading and trailing whitespaces
x['CHROM'] = x['CHROM'].str.strip()
x['POS'] = x['POS'].str.strip()
# Dropping rows with empty cells and NaN values in both 'chromosome' and 'position' columns
x.dropna(subset=['CHROM', 'POS'], inplace=True)
df_3 = x[['CHROM', 'POS']]
df_3['Literature'] = 'Yes'
df_3.drop_duplicates(subset='POS', inplace=True)
df_3['POS'] = df_3['POS'].astype('int64')
df_3 = df_3.reset_index()
df_3 = df_3[['CHROM', 'POS', 'Literature']]

df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KAPA HyperExome_hg38_capture_targets (1).bed', sep = '\t', header = None)
df.columns = ['chromosome', 'Start_pos', 'End_pos', 'INFO']

df['Extended_Start_pos'] = df['Start_pos'] - 20
df['Extended_End_pos'] = df['End_pos'] + 20

df['gene_symbol'] = df['INFO'].str.extract(r'gene_symbol=([^;]+)')
df = df[['chromosome', 'Extended_Start_pos', 'Extended_End_pos', 'INFO', 'gene_symbol']]


# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in dataset2
df_3['Covered/Not_Covered'] = df_3.apply(check_coverage, axis=1)

df_3 = df_3[df_3['Covered/Not_Covered'] == 'Covered']
df_3.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/KHGLBS578/Hypoglycemia_lit_variants.xlsx', index = False) 

In [51]:
df_3

Unnamed: 0,CHROM,POS,Literature,Covered/Not_Covered
0,chr12,120978691,Yes,Covered
1,chr12,120978908,Yes,Covered
2,chr12,120997590,Yes,Covered
3,chr12,120996531,Yes,Covered
4,chr12,120978928,Yes,Covered
...,...,...,...,...
4761,chr11,17408513,Yes,Covered
4798,chr11,66616700,Yes,Covered
4803,chr11,66616749,Yes,Covered
4806,chr11,66616734,Yes,Covered


In [52]:
merged_3 = pd.merge(merged_2, df_3, on=['CHROM', 'POS'], how='left', sort=False)
merged_3['Literature'] = merged_3['Literature'].fillna('No')
merged_3

Unnamed: 0,Gene Name,Gene Match,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Literature,Covered/Not_Covered
0,,No,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,transcribed unprocessed pseudogene,,,,,1,,No,
1,,No,rs1416222198,chr1,65591,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+18C>T,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,protein coding,,2/2,,,1,,No,
2,,No,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,lncRNA,,,,,1,,No,
3,OR4F5,No,rs1260343719,chr1,69063,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.36T>C,ENSP00000493376.2,p.Asn12%3D,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,protein coding,3/3,,N12,aaT/aaC,1,,No,
4,OR4F5,No,rs1260343719,chr1,69063,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,lncRNA,,,,,1,,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306164,,No,.,chrX,155459837,A,G,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,71,103,103,81,22,21.36%,6.7976E-8,54,27,14,8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,No,
306165,IL9R,No,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000244174.11,c.580-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,5/8,,,1,,No,
306166,IL9R,No,rs2073467,chrX,156005252,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000369423.7,c.685-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,6/8,,,1,,No,
306167,IL9R,No,rs2073467,chrX,156005252,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding CDS not defined,,,,,1,,No,


In [53]:
merged_3.Literature.value_counts()

No     306097
Yes        72
Name: Literature, dtype: int64

In [54]:
merged_3[merged_3['Literature'] == 'Yes']

Unnamed: 0,Gene Name,Gene Match,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Literature,Covered/Not_Covered
69781,HNF1A,"Type1Diabetes, Type2Diabetes, MODY",rs1169288,chr12,120978847,A,C,Heterozygous,missense variant,7/10,MODERATE,5.0,"Maturity onset diabetes mellitus in young,Nonp...","risk factor,benign","criteria provided, conflicting interpretations",14937,ENST00000257555.11,c.79A>C,ENSP00000257555.5,p.Ile27Leu,0/1,64,40,40,22,18,45%,3.1919E-7,18,4,11,7,tolerated(0.11),benign(0.282),0.2985,0.0832,0.33,0.3889,0.339,0.4325,0.3549,0.1179,0.3666,0.4666,0.4147,0.3992,0.3365,0.3613,0.4268,0.2888,0.1248,0.3998,0.3444,0.4579,0.4113,0.4038,0.5222,0.3279,0.3109,0.4168,0.5222,gnomADg MID,protein coding,1/10,,I27L,Atc/Ctc,1,"25741868,21094359,21647738,21931794,21937998,2...",Yes,Covered
69782,HNF1A,"Type1Diabetes, Type2Diabetes, MODY",rs1169288,chr12,120978847,A,C,Heterozygous,missense variant,7/10,MODERATE,5.0,"Maturity onset diabetes mellitus in young,Nonp...","risk factor,benign","criteria provided, conflicting interpretations",14937,ENST00000400024.6,c.79A>C,ENSP00000476181.1,p.Ile27Leu,0/1,64,40,40,22,18,45%,3.1919E-7,18,4,11,7,tolerated(0.11),probably damaging(0.938),0.2985,0.0832,0.33,0.3889,0.339,0.4325,0.3549,0.1179,0.3666,0.4666,0.4147,0.3992,0.3365,0.3613,0.4268,0.2888,0.1248,0.3998,0.3444,0.4579,0.4113,0.4038,0.5222,0.3279,0.3109,0.4168,0.5222,gnomADg MID,protein coding,1/7,,I27L,Atc/Ctc,1,"25741868,21094359,21647738,21931794,21937998,2...",Yes,Covered
69783,HNF1A,"Type1Diabetes, Type2Diabetes, MODY",rs1169288,chr12,120978847,A,C,Heterozygous,non coding transcript exon variant,2/10,MODIFIER,1.5,"Maturity onset diabetes mellitus in young,Nonp...","risk factor,benign","criteria provided, conflicting interpretations",14937,ENST00000402929.5,n.214A>C,,,0/1,64,40,40,22,18,45%,3.1919E-7,18,4,11,7,,,0.2985,0.0832,0.33,0.3889,0.339,0.4325,0.3549,0.1179,0.3666,0.4666,0.4147,0.3992,0.3365,0.3613,0.4268,0.2888,0.1248,0.3998,0.3444,0.4579,0.4113,0.4038,0.5222,0.3279,0.3109,0.4168,0.5222,gnomADg MID,retained intron,1/6,,,,1,"25741868,21094359,21647738,21931794,21937998,2...",Yes,Covered
69784,HNF1A,"Type1Diabetes, Type2Diabetes, MODY",rs1169288,chr12,120978847,A,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,"Maturity onset diabetes mellitus in young,Nonp...","risk factor,benign","criteria provided, conflicting interpretations",14937,ENST00000433033.3,n.134+767T>G,,,0/1,64,40,40,22,18,45%,3.1919E-7,18,4,11,7,,,0.2985,0.0832,0.33,0.3889,0.339,0.4325,0.3549,0.1179,0.3666,0.4666,0.4147,0.3992,0.3365,0.3613,0.4268,0.2888,0.1248,0.3998,0.3444,0.4579,0.4113,0.4038,0.5222,0.3279,0.3109,0.4168,0.5222,gnomADg MID,lncRNA,,1/3,,,-1,"25741868,21094359,21647738,21931794,21937998,2...",Yes,Covered
69785,HNF1A,"Type1Diabetes, Type2Diabetes, MODY",rs1169288,chr12,120978847,A,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,"Maturity onset diabetes mellitus in young,Nonp...","risk factor,benign","criteria provided, conflicting interpretations",14937,ENST00000535301.2,n.322+1797T>G,,,0/1,64,40,40,22,18,45%,3.1919E-7,18,4,11,7,,,0.2985,0.0832,0.33,0.3889,0.339,0.4325,0.3549,0.1179,0.3666,0.4666,0.4147,0.3992,0.3365,0.3613,0.4268,0.2888,0.1248,0.3998,0.3444,0.4579,0.4113,0.4038,0.5222,0.3279,0.3109,0.4168,0.5222,gnomADg MID,lncRNA,,1/1,,,-1,"25741868,21094359,21647738,21931794,21937998,2...",Yes,Covered
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69860,HNF1A,"Type1Diabetes, Type2Diabetes, MODY",rs2464195,chr12,120997672,G,A,Homozygous,"3 prime UTR variant,NMD transcript variant",3/10,MODIFIER,1.5,"Nonpapillary renal cell carcinoma,Maturity ons...","benign,uncertain significance","criteria provided, conflicting interpretations",129231,ENST00000541924.5,c.*522G>A,,,1/1,206,36,36,0,36,100%,2.2598E-21,0,0,30,6,,,0.3596,0.1263,0.3963,0.4802,0.3777,0.5061,0.3898,0.1331,0.3995,0.5007,0.4853,0.3716,0.3637,0.3934,0.5206,0.3176,0.1446,0.3786,0.3923,0.5026,0.4909,0.3844,0.5854,0.3557,0.3654,0.5137,0.5854,gnomADg MID,nonsense mediated decay,6/6,,,,1,"25741868,24763700,24728327,18599530,18439552,1...",Yes,Covered
69861,HNF1A,"Type1Diabetes, Type2Diabetes, MODY",rs2464195,chr12,120997672,G,A,Homozygous,non coding transcript exon variant,2/10,MODIFIER,1.5,"Nonpapillary renal cell carcinoma,Maturity ons...","benign,uncertain significance","criteria provided, conflicting interpretations",129231,ENST00000543255.1,n.552G>A,,,1/1,206,36,36,0,36,100%,2.2598E-21,0,0,30,6,,,0.3596,0.1263,0.3963,0.4802,0.3777,0.5061,0.3898,0.1331,0.3995,0.5007,0.4853,0.3716,0.3637,0.3934,0.5206,0.3176,0.1446,0.3786,0.3923,0.5026,0.4909,0.3844,0.5854,0.3557,0.3654,0.5137,0.5854,gnomADg MID,retained intron,2/2,,,,1,"25741868,24763700,24728327,18599530,18439552,1...",Yes,Covered
69862,HNF1A,"Type1Diabetes, Type2Diabetes, MODY",rs2464195,chr12,120997672,G,A,Homozygous,"splice region variant,intron variant",4/10,LOW,2.5,"Nonpapillary renal cell carcinoma,Maturity ons...","benign,uncertain significance","criteria provided, conflicting interpretations",129231,ENST00000544413.2,c.1501+7G>A,,,1/1,206,36,36,0,36,100%,2.2598E-21,0,0,30,6,,,0.3596,0.1263,0.3963,0.4802,0.3777,0.5061,0.3898,0.1331,0.3995,0.5007,0.4853,0.3716,0.3637,0.3934,0.5206,0.3176,0.1446,0.3786,0.3923,0.5026,0.4909,0.3844,0.5854,0.3557,0.3654,0.5137,0.5854,gnomADg MID,protein coding,,7/9,,,1,"25741868,24763700,24728327,18599530,18439552,1...",Yes,Covered
69863,HNF1A,"Type1Diabetes, Type2Diabetes, MODY",rs2464195,chr12,120997672,G,A,Homozygous,"3 prime UTR variant,NMD transcript variant",3/10,MODIFIER,1.5,"Nonpapillary renal cell carcinoma,Maturity ons...","benign,uncertain significance","criteria provided, conflicting interpretations",129231,ENST00000544574.5,c.*271G>A,,,1/1,206,36,36,0,36,100%,2.2598E-21,0,0,30,6,,,0.3596,0.1263,0.3963,0.4802,0.3777,0.5061,0.3898,0.1331,0.3995,0.5007,0.4853,0.3716,0.3637,0.3934,0.5206,0.3176,0.1446,0.3786,0.3923,0.5026,0.4909,0.3844,0.5854,0.3557,0.3654,0.5137,0.5854,gnomADg MID,nonsense mediated decay,3/3,,,,1,"25741868,24763700,24728327,18599530,18439552,1...",Yes,Covered


In [55]:
merged_3 = merged_3[['Gene Name', 'Gene Match', 'rsID', 'CHROM', 'POS', 'Literature', 'REF', 'ALT', 'Zygosity',
       'Consequence','Consequence_score', 'IMPACT', 'IMPACT_score', 'ClinVar_CLNDN', 'CLIN_SIG',
       'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp',
       'HGVSp (Transcript)', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ',
       'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT', 'PolyPhen', 'AF', 'AFR_AF',
       'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomADe_AF', 'gnomADe_AFR_AF',
       'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF',
       'gnomADe_NFE_AF', 'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'gnomADg_AF',
       'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF', 'gnomADg_ASJ_AF',
       'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF', 'gnomADg_NFE_AF',
       'gnomADg_OTH_AF', 'gnomADg_SAS_AF', 'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE',
       'EXON', 'INTRON', 'Protein Position and Amino Acid', 'Codons', 'STRAND',
       'PUBMED']]
merged_3

Unnamed: 0,Gene Name,Gene Match,rsID,CHROM,POS,Literature,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,,No,rs1416222198,chr1,65591,No,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,transcribed unprocessed pseudogene,,,,,1,
1,,No,rs1416222198,chr1,65591,No,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+18C>T,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,protein coding,,2/2,,,1,
2,,No,rs1416222198,chr1,65591,No,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,22,31,31,24,7,22.58%,5.3469E-3,17,7,3,4,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,lncRNA,,,,,1,
3,OR4F5,No,rs1260343719,chr1,69063,No,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.36T>C,ENSP00000493376.2,p.Asn12%3D,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,protein coding,3/3,,N12,aaT/aaC,1,
4,OR4F5,No,rs1260343719,chr1,69063,No,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,67,45,45,26,19,42.22%,1.6979E-7,23,3,18,1,,,,,,,,,,,,,,,,,,0.05175,0.04058,0.02941,0.04766,0.04545,0.06778,0.0303,0.1538,0.06103,0.05789,0.1089,0.1538,gnomADg MID,lncRNA,,,,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306164,,No,.,chrX,155459837,No,A,G,Heterozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,71,103,103,81,22,21.36%,6.7976E-8,54,27,14,8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,
306165,IL9R,No,rs2073467,chrX,156005252,No,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000244174.11,c.580-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,5/8,,,1,
306166,IL9R,No,rs2073467,chrX,156005252,No,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000369423.7,c.685-26C>T,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding,,6/8,,,1,
306167,IL9R,No,rs2073467,chrX,156005252,No,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,31,12,12,4,8,66.67%,6.7304E-4,4,0,8,0,,,,,,,,,0.02802,0.001789,0.0527,9.934e-05,0.2012,0,0.0005125,0.01736,0.04299,0.01208,0.001739,0,0.02902,0.000288,0.2012,0,0,0.0003382,0.0134,0.04662,0.2012,"gnomADe EAS,gnomADg EAS",protein coding CDS not defined,,,,,1,


In [24]:
merged_2.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/Fatty_liver_data.xlsx', index=False)

In [20]:
merged_2['Gene Match'] = 'No'
merged_2['Matched_Gene'] = ''
    
# Iterate through each gene in vcf['Gene']
for index, genes in merged_2['Gene Name'].iteritems():
    if isinstance(genes, str):
        gene_list = genes.split(',')
        for gene in gene_list:
            if gene in df_gene['Gene Name'].values:
                merged_2.at[index, 'Gene Match'] = 'Yes'
                merged_2.at[index, 'Matched_Gene'] = gene
                break
    
df_gene = df_gene.rename({'Gene Name':'Matched_Gene', 'Gene_Match':'Gene Match'}, axis=1)

merged_2 = merged_2.drop(columns=['Gene Match'], axis=1)

merged_2 = pd.merge(merged_2, df_gene, on= 'Matched_Gene', how = 'left', sort = False)
#merged_2['Gene Match'] = merged_2['Gene Match'].fillna('No')
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Matched_Gene
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,251,251,0,250,99.6%,8.5657E-150,0,0,194,56,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,,
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,171,179,179,128,51,28.49%,6.8954E-18,86,42,35,16,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,,
2,OR4F29,rs2808347,chr1,450718,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,30,26,26,17,9,34.62%,8.4928E-4,17,0,9,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,
3,OR4F29,rs2808347,chr1,450718,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000455207.5,n.169+34322A>G,,,0/1,30,26,26,17,9,34.62%,8.4928E-4,17,0,9,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,lncRNA,,1/2,,,-1,,
4,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,30,25,25,16,9,36%,8.1542E-4,16,0,9,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336023,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,188,33,33,0,33,100%,1.3852E-19,0,0,24,9,,,0.8273,0.9624,0.5824,0.9959,0.4667,0.9962,,,,,,,,,,0.6593,0.8119,0.2864,0.5121,0.9004,0.9992,0.9714,0.9865,0.4599,0.6294,0.9906,0.9992,gnomADg EAS,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",
336024,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,188,33,33,0,33,100%,1.3852E-19,0,0,24,9,,,0.8273,0.9624,0.5824,0.9959,0.4667,0.9962,,,,,,,,,,0.6593,0.8119,0.2864,0.5121,0.9004,0.9992,0.9714,0.9865,0.4599,0.6294,0.9906,0.9992,gnomADg EAS,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",
336025,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,188,33,33,0,33,100%,1.3852E-19,0,0,24,9,,,0.8273,0.9624,0.5824,0.9959,0.4667,0.9962,,,,,,,,,,0.6593,0.8119,0.2864,0.5121,0.9004,0.9992,0.9714,0.9865,0.4599,0.6294,0.9906,0.9992,gnomADg EAS,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",
336026,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,188,33,33,0,33,100%,1.3852E-19,0,0,24,9,,,0.8273,0.9624,0.5824,0.9959,0.4667,0.9962,,,,,,,,,,0.6593,0.8119,0.2864,0.5121,0.9004,0.9992,0.9714,0.9865,0.4599,0.6294,0.9906,0.9992,gnomADg EAS,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",


In [21]:
merged_2.Matched_Gene.value_counts()

           333579
PMS2          446
BRCA1         421
NBN           302
TP53          211
SMARCA4       180
NF1           162
BARD1         133
ATM           116
MUTYH          96
BRCA2          74
BRIP1          72
RECQL          38
PTEN           30
RAD50          26
RAD51D         24
PALB2          22
MSH2           21
MSH6           18
CDKN2A         17
SLX4           14
STK11          10
CDH1           10
MRE11           6
Name: Matched_Gene, dtype: int64

In [22]:
df_3 = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Hereditary_Cancer_risk_Oncokb.xlsx')
df_3

Unnamed: 0,Matched_Gene,Protein Position and Amino Acid,Oncogenic,Mutation effect
0,ATM,Deletion,Likely Oncogenic,Likely Loss-of-function
1,ATM,Truncating Mutations,Likely Oncogenic,Likely Loss-of-function
2,ATM,A1742P,Likely Oncogenic,Likely Loss-of-function
3,ATM,A2062V,Likely Oncogenic,Likely Loss-of-function
4,ATM,A2067D,Likely Oncogenic,Likely Loss-of-function
...,...,...,...,...
1650,TP53,A347T,Likely Oncogenic,Likely Loss-of-function
1651,TP53,L348F,Likely Oncogenic,Likely Loss-of-function
1652,TP53,L348S,Likely Oncogenic,Likely Loss-of-function
1653,TP53,G356A,Inconclusive,Inconclusive


In [23]:
merged_3 = pd.merge(merged_2, df_3, on=['Matched_Gene', 'Protein Position and Amino Acid'], how='left', sort=False)
merged_3

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Matched_Gene,Oncogenic,Mutation effect
0,OR4F5,rs2691305,chr1,69511,A,G,Homozygous,missense variant,7/10,MODERATE,5.0,,,,,ENST00000641515.2,c.484A>G,ENSP00000493376.2,p.Thr162Ala,1/1,255,251,251,0,250,99.6%,8.5657E-150,0,0,194,56,tolerated(0.92),benign(0),,,,,,,0.9497,0.6075,0.9514,0.9767,0.9994,0.9916,0.9726,0.9506,0.9854,0.846,0.5948,0.998,0.8951,0.9784,0.9998,0.9907,0.9,0.9674,0.8624,0.9772,0.9998,gnomADg EAS,protein coding,3/3,,T162A,Aca/Gca,1,,,,
1,OR4F5,rs200676709,chr1,69897,T,C,Heterozygous,synonymous variant,3/10,LOW,2.5,,,,,ENST00000641515.2,c.870T>C,ENSP00000493376.2,p.Ser290%3D,0/1,171,179,179,128,51,28.49%,6.8954E-18,86,42,35,16,,,0.6881,0.407,0.6254,0.876,0.7942,0.8098,0.7209,0.307,0.6749,0.8131,0.7793,0.8652,0.8438,0.7932,0.8049,0.4864,0.2916,0.6511,0.4782,0.6714,0.6047,0.6987,0.6441,0.6751,0.4659,0.622,0.876,EAS,protein coding,3/3,,S290,tcT/tcC,1,,,,
2,OR4F29,rs2808347,chr1,450718,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,30,26,26,17,9,34.62%,8.4928E-4,17,0,9,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,,,
3,OR4F29,rs2808347,chr1,450718,T,C,Heterozygous,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000455207.5,n.169+34322A>G,,,0/1,30,26,26,17,9,34.62%,8.4928E-4,17,0,9,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,lncRNA,,1/2,,,-1,,,,
4,,.,chr1,685694,T,C,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,30,25,25,16,9,36%,8.1542E-4,16,0,9,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding,,,,,-1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336023,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,188,33,33,0,33,100%,1.3852E-19,0,0,24,9,,,0.8273,0.9624,0.5824,0.9959,0.4667,0.9962,,,,,,,,,,0.6593,0.8119,0.2864,0.5121,0.9004,0.9992,0.9714,0.9865,0.4599,0.6294,0.9906,0.9992,gnomADg EAS,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",,,
336024,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,188,33,33,0,33,100%,1.3852E-19,0,0,24,9,,,0.8273,0.9624,0.5824,0.9959,0.4667,0.9962,,,,,,,,,,0.6593,0.8119,0.2864,0.5121,0.9004,0.9992,0.9714,0.9865,0.4599,0.6294,0.9906,0.9992,gnomADg EAS,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",,,
336025,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,188,33,33,0,33,100%,1.3852E-19,0,0,24,9,,,0.8273,0.9624,0.5824,0.9959,0.4667,0.9962,,,,,,,,,,0.6593,0.8119,0.2864,0.5121,0.9004,0.9992,0.9714,0.9865,0.4599,0.6294,0.9906,0.9992,gnomADg EAS,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",,,
336026,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,188,33,33,0,33,100%,1.3852E-19,0,0,24,9,,,0.8273,0.9624,0.5824,0.9959,0.4667,0.9962,,,,,,,,,,0.6593,0.8119,0.2864,0.5121,0.9004,0.9992,0.9714,0.9865,0.4599,0.6294,0.9906,0.9992,gnomADg EAS,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",,,


In [24]:
merged_3.Oncogenic.value_counts()

Likely Neutral    21
Inconclusive      11
Name: Oncogenic, dtype: int64

In [25]:
merged_3['Mutation effect'].value_counts()

Likely- Neutral    12
Inconclusive       11
Likely Neutral      9
Name: Mutation effect, dtype: int64

In [26]:
merged_3.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/KHHSPTGPTTL60_depth_vcf_processed.xlsx', index=False)

In [50]:
merged_2['Gene_Match'] = 'No'

# Iterate through each gene in df1
for genes in merged_2['Gene Name']:
    if isinstance(genes, str):  # Check if the gene value is a non-null string
        gene_list = genes.split(',')  # Split the genes by comma to create a list
        match = any(gene in df_gene['Gene Name'].values for gene in gene_list)  # Check if any gene in the list exists in df2
        if match:
            merged_2.loc[merged_2['Gene Name'] == genes, 'Gene_Match'] = 'Yes'
            
merged_2

Unnamed: 0,Gene Name,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Gene_Match
0,,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,23,26,26,19,7,26.92%,4.9169E-3,10,9,4,3,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,transcribed unprocessed pseudogene,,,,,1,,No
1,,rs1416222198,chr1,65591,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+18C>T,,,0/1,23,26,26,19,7,26.92%,4.9169E-3,10,9,4,3,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,protein coding,,2/2,,,1,,No
2,,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,23,26,26,19,7,26.92%,4.9169E-3,10,9,4,3,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,lncRNA,,,,,1,,No
3,,rs1166163776,chr1,65649,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,20,19,19,13,6,31.58%,9.828E-3,1,12,2,4,,,,,,,,,,,,,,,,,,0.03432,0.01435,0.05263,0.03176,0.09639,0.001832,0.0122,0.07895,0.09393,0.02439,0.03321,0.09639,gnomADg ASJ,transcribed unprocessed pseudogene,,,,,1,,No
4,,rs1166163776,chr1,65649,G,A,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+76G>A,,,0/1,20,19,19,13,6,31.58%,9.828E-3,1,12,2,4,,,,,,,,,,,,,,,,,,0.03432,0.01435,0.05263,0.03176,0.09639,0.001832,0.0122,0.07895,0.09393,0.02439,0.03321,0.09639,gnomADg ASJ,protein coding,,2/2,,,1,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297877,EIF1AY,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,52,10,10,0,10,100%,5.4125E-6,0,0,3,7,,,0.8273,0.9624,0.5824,0.9959,0.4667,0.9962,,,,,,,,,,0.6593,0.8119,0.2864,0.5121,0.9004,0.9992,0.9714,0.9865,0.4599,0.6294,0.9906,0.9992,gnomADg EAS,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",No
297878,,rs17250535,chrY,21311315,T,A,Homozygous,non coding transcript exon variant,2/10,MODIFIER,1.5,,,,,ENST00000437359.1,n.48T>A,,,1/1,64,12,12,0,12,100%,3.698E-7,0,0,6,6,,,0.0681,0,0.0059,0,0.05,0.2731,,,,,,,,,,,,,,,,,,,,,0.2731,SAS,unprocessed pseudogene,1/9,,,,1,3442492935176104,No
297879,DAZ3,rs1156813394,chrY,24796170,C,G,Homozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000315357.9,c.714+2253G>C,,,1/1,74,22,22,5,17,77.27%,3.8368E-8,1,4,11,6,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,"gnomADg AFR,gnomADg AMI,gnomADg AMR,gnomADg AS...",protein coding,,9/15,,,-1,,No
297880,DAZ3,rs1156813394,chrY,24796170,C,G,Homozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000382365.7,c.715-72G>C,,,1/1,74,22,22,5,17,77.27%,3.8368E-8,1,4,11,6,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,"gnomADg AFR,gnomADg AMI,gnomADg AMR,gnomADg AS...",protein coding,,9/18,,,-1,,No


In [52]:
merged_2['Gene_Match'].value_counts()

No     296225
Yes      1657
Name: Gene_Match, dtype: int64

In [53]:
merged_2 = merged_2[['Gene Name', 'Gene_Match', 'rsID', 'CHROM', 'POS', 'REF', 'ALT', 'Zygosity',
       'Consequence', 'Consequence_score', 'IMPACT', 'IMPACT_score',
       'ClinVar_CLNDN', 'CLIN_SIG', 'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc',
       'HGVSc (Transcript)', 'HGVSp', 'HGVSp (Transcript)', 'GT', 'GQ', 'SDP',
       'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT',
       'PolyPhen', 'AF', 'AFR_AF',
       'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomADe_AF', 'gnomADe_AFR_AF',
       'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF',
       'gnomADe_NFE_AF', 'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'gnomADg_AF',
       'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF', 'gnomADg_ASJ_AF',
       'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF', 'gnomADg_NFE_AF',
       'gnomADg_OTH_AF', 'gnomADg_SAS_AF', 'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE', 'EXON', 'INTRON',
       'Protein Position and Amino Acid', 'Codons', 'STRAND', 'PUBMED']]
merged_2['POS'] = merged_2['POS'].astype('int64')
merged_2

Unnamed: 0,Gene Name,Gene_Match,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,,No,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,23,26,26,19,7,26.92%,4.9169E-3,10,9,4,3,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,transcribed unprocessed pseudogene,,,,,1,
1,,No,rs1416222198,chr1,65591,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+18C>T,,,0/1,23,26,26,19,7,26.92%,4.9169E-3,10,9,4,3,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,protein coding,,2/2,,,1,
2,,No,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,23,26,26,19,7,26.92%,4.9169E-3,10,9,4,3,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,lncRNA,,,,,1,
3,,No,rs1166163776,chr1,65649,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,20,19,19,13,6,31.58%,9.828E-3,1,12,2,4,,,,,,,,,,,,,,,,,,0.03432,0.01435,0.05263,0.03176,0.09639,0.001832,0.0122,0.07895,0.09393,0.02439,0.03321,0.09639,gnomADg ASJ,transcribed unprocessed pseudogene,,,,,1,
4,,No,rs1166163776,chr1,65649,G,A,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+76G>A,,,0/1,20,19,19,13,6,31.58%,9.828E-3,1,12,2,4,,,,,,,,,,,,,,,,,,0.03432,0.01435,0.05263,0.03176,0.09639,0.001832,0.0122,0.07895,0.09393,0.02439,0.03321,0.09639,gnomADg ASJ,protein coding,,2/2,,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297877,EIF1AY,No,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,52,10,10,0,10,100%,5.4125E-6,0,0,3,7,,,0.8273,0.9624,0.5824,0.9959,0.4667,0.9962,,,,,,,,,,0.6593,0.8119,0.2864,0.5121,0.9004,0.9992,0.9714,0.9865,0.4599,0.6294,0.9906,0.9992,gnomADg EAS,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
297878,,No,rs17250535,chrY,21311315,T,A,Homozygous,non coding transcript exon variant,2/10,MODIFIER,1.5,,,,,ENST00000437359.1,n.48T>A,,,1/1,64,12,12,0,12,100%,3.698E-7,0,0,6,6,,,0.0681,0,0.0059,0,0.05,0.2731,,,,,,,,,,,,,,,,,,,,,0.2731,SAS,unprocessed pseudogene,1/9,,,,1,3442492935176104
297879,DAZ3,No,rs1156813394,chrY,24796170,C,G,Homozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000315357.9,c.714+2253G>C,,,1/1,74,22,22,5,17,77.27%,3.8368E-8,1,4,11,6,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,"gnomADg AFR,gnomADg AMI,gnomADg AMR,gnomADg AS...",protein coding,,9/15,,,-1,
297880,DAZ3,No,rs1156813394,chrY,24796170,C,G,Homozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000382365.7,c.715-72G>C,,,1/1,74,22,22,5,17,77.27%,3.8368E-8,1,4,11,6,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,"gnomADg AFR,gnomADg AMI,gnomADg AMR,gnomADg AS...",protein coding,,9/18,,,-1,


In [54]:
import pandas as pd
x = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Reproductivehealth_Male_Lit_final_Positions_hg38_hg37.xlsx')
x['chrom'] = x['Chrom-pos-Ref-Alt_38'].str.split(',')
x = x.explode('chrom')

x['CHROM'] = x['chrom'].str.split('-').str[0]

# Function to add 'chr' prefix conditionally
def add_chr_prefix(chrom):
    if pd.notnull(chrom) and chrom.strip() != '':
        return 'chr' + str(chrom)
    else:
        return chrom

# Applying the function to the 'chromosome' column
x['CHROM'] = x['CHROM'].apply(add_chr_prefix)
x['CHROM'] = x['CHROM'].str.strip()
x['CHROM'] = x['CHROM'].str.replace(r'\s+', '')
x['POS'] = x['chrom'].str.split('-').str[1]

x.dropna(subset=['CHROM'], inplace=True)
# Drop rows with empty cells after removing leading and trailing whitespaces
x['CHROM'] = x['CHROM'].str.strip()
x['POS'] = x['POS'].str.strip()
# Dropping rows with empty cells and NaN values in both 'chromosome' and 'position' columns
x.dropna(subset=['CHROM', 'POS'], inplace=True)
df_3 = x[['CHROM', 'POS']]
df_3['Literature'] = 'Yes'
df_3.drop_duplicates(subset='POS', inplace=True)
df_3['POS'] = df_3['POS'].astype('int64')
df_3 = df_3.reset_index()
df_3 = df_3[['CHROM', 'POS', 'Literature']]

df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KAPA HyperExome_hg38_capture_targets (1).bed', sep = '\t', header = None)
df.columns = ['chromosome', 'Start_pos', 'End_pos', 'INFO']

df['Extended_Start_pos'] = df['Start_pos'] - 20
df['Extended_End_pos'] = df['End_pos'] + 20

df['gene_symbol'] = df['INFO'].str.extract(r'gene_symbol=([^;]+)')
df = df[['chromosome', 'Extended_Start_pos', 'Extended_End_pos', 'INFO', 'gene_symbol']]


# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in dataset2
df_3['Covered/Not_Covered'] = df_3.apply(check_coverage, axis=1)

df_3 = df_3[df_3['Covered/Not_Covered'] == 'Covered']
df_3

Unnamed: 0,CHROM,POS,Literature,Covered/Not_Covered
0,chr12,101728759,Yes,Covered
1,chr12,101729212,Yes,Covered
3,chr12,101728656,Yes,Covered
4,chr12,101729109,Yes,Covered
6,chr19,57232128,Yes,Covered
...,...,...,...,...
438,chr11,32396328,Yes,Covered
439,chr11,32400021,Yes,Covered
441,chr11,32396277,Yes,Covered
450,chr11,32392036,Yes,Covered


In [55]:
merged_3 = pd.merge(merged_2, df_3, on=['CHROM', 'POS'], how='left', sort=False)
merged_3['Literature'] = merged_3['Literature'].fillna('No')
merged_3

Unnamed: 0,Gene Name,Gene_Match,rsID,CHROM,POS,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED,Literature,Covered/Not_Covered
0,,No,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,23,26,26,19,7,26.92%,4.9169E-3,10,9,4,3,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,transcribed unprocessed pseudogene,,,,,1,,No,
1,,No,rs1416222198,chr1,65591,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+18C>T,,,0/1,23,26,26,19,7,26.92%,4.9169E-3,10,9,4,3,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,protein coding,,2/2,,,1,,No,
2,,No,rs1416222198,chr1,65591,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,23,26,26,19,7,26.92%,4.9169E-3,10,9,4,3,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,lncRNA,,,,,1,,No,
3,,No,rs1166163776,chr1,65649,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,20,19,19,13,6,31.58%,9.828E-3,1,12,2,4,,,,,,,,,,,,,,,,,,0.03432,0.01435,0.05263,0.03176,0.09639,0.001832,0.0122,0.07895,0.09393,0.02439,0.03321,0.09639,gnomADg ASJ,transcribed unprocessed pseudogene,,,,,1,,No,
4,,No,rs1166163776,chr1,65649,G,A,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+76G>A,,,0/1,20,19,19,13,6,31.58%,9.828E-3,1,12,2,4,,,,,,,,,,,,,,,,,,0.03432,0.01435,0.05263,0.03176,0.09639,0.001832,0.0122,0.07895,0.09393,0.02439,0.03321,0.09639,gnomADg ASJ,protein coding,,2/2,,,1,,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297877,EIF1AY,No,rs9786153,chrY,20577481,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,52,10,10,0,10,100%,5.4125E-6,0,0,3,7,,,0.8273,0.9624,0.5824,0.9959,0.4667,0.9962,,,,,,,,,,0.6593,0.8119,0.2864,0.5121,0.9004,0.9992,0.9714,0.9865,0.4599,0.6294,0.9906,0.9992,gnomADg EAS,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3...",No,
297878,,No,rs17250535,chrY,21311315,T,A,Homozygous,non coding transcript exon variant,2/10,MODIFIER,1.5,,,,,ENST00000437359.1,n.48T>A,,,1/1,64,12,12,0,12,100%,3.698E-7,0,0,6,6,,,0.0681,0,0.0059,0,0.05,0.2731,,,,,,,,,,,,,,,,,,,,,0.2731,SAS,unprocessed pseudogene,1/9,,,,1,3442492935176104,No,
297879,DAZ3,No,rs1156813394,chrY,24796170,C,G,Homozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000315357.9,c.714+2253G>C,,,1/1,74,22,22,5,17,77.27%,3.8368E-8,1,4,11,6,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,"gnomADg AFR,gnomADg AMI,gnomADg AMR,gnomADg AS...",protein coding,,9/15,,,-1,,No,
297880,DAZ3,No,rs1156813394,chrY,24796170,C,G,Homozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000382365.7,c.715-72G>C,,,1/1,74,22,22,5,17,77.27%,3.8368E-8,1,4,11,6,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,"gnomADg AFR,gnomADg AMI,gnomADg AMR,gnomADg AS...",protein coding,,9/18,,,-1,,No,


In [56]:
merged_3.Literature.value_counts()

No    297882
Name: Literature, dtype: int64

In [57]:
merged_3 = merged_3[['Gene Name', 'Gene_Match', 'rsID', 'CHROM', 'POS', 'Literature', 'REF', 'ALT', 'Zygosity',
       'Consequence','Consequence_score', 'IMPACT', 'IMPACT_score', 'ClinVar_CLNDN', 'CLIN_SIG',
       'ClinVar_CLNREVSTAT', 'ClinVar', 'HGVSc', 'HGVSc (Transcript)', 'HGVSp',
       'HGVSp (Transcript)', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ',
       'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'SIFT', 'PolyPhen', 'AF', 'AFR_AF',
       'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomADe_AF', 'gnomADe_AFR_AF',
       'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF',
       'gnomADe_NFE_AF', 'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'gnomADg_AF',
       'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF', 'gnomADg_ASJ_AF',
       'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF', 'gnomADg_NFE_AF',
       'gnomADg_OTH_AF', 'gnomADg_SAS_AF', 'MAX_AF', 'MAX_AF_POPS', 'BIOTYPE',
       'EXON', 'INTRON', 'Protein Position and Amino Acid', 'Codons', 'STRAND',
       'PUBMED']]
merged_3

Unnamed: 0,Gene Name,Gene_Match,rsID,CHROM,POS,Literature,REF,ALT,Zygosity,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,PUBMED
0,,No,rs1416222198,chr1,65591,No,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,23,26,26,19,7,26.92%,4.9169E-3,10,9,4,3,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,transcribed unprocessed pseudogene,,,,,1,
1,,No,rs1416222198,chr1,65591,No,C,T,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+18C>T,,,0/1,23,26,26,19,7,26.92%,4.9169E-3,10,9,4,3,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,protein coding,,2/2,,,1,
2,,No,rs1416222198,chr1,65591,No,C,T,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,23,26,26,19,7,26.92%,4.9169E-3,10,9,4,3,,,,,,,,,,,,,,,,,,0.03416,0.02306,0.1667,0.02711,0.03333,0.001639,0,0.05,0.09167,0.05357,0.02083,0.1667,gnomADg AMI,lncRNA,,,,,1,
3,,No,rs1166163776,chr1,65649,No,G,A,Heterozygous,downstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,0/1,20,19,19,13,6,31.58%,9.828E-3,1,12,2,4,,,,,,,,,,,,,,,,,,0.03432,0.01435,0.05263,0.03176,0.09639,0.001832,0.0122,0.07895,0.09393,0.02439,0.03321,0.09639,gnomADg ASJ,transcribed unprocessed pseudogene,,,,,1,
4,,No,rs1166163776,chr1,65649,No,G,A,Heterozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000641515.2,c.9+76G>A,,,0/1,20,19,19,13,6,31.58%,9.828E-3,1,12,2,4,,,,,,,,,,,,,,,,,,0.03432,0.01435,0.05263,0.03176,0.09639,0.001832,0.0122,0.07895,0.09393,0.02439,0.03321,0.09639,gnomADg ASJ,protein coding,,2/2,,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297877,EIF1AY,No,rs9786153,chrY,20577481,No,C,T,Homozygous,upstream gene variant,2/10,MODIFIER,1.5,,,,,,,,,1/1,52,10,10,0,10,100%,5.4125E-6,0,0,3,7,,,0.8273,0.9624,0.5824,0.9959,0.4667,0.9962,,,,,,,,,,0.6593,0.8119,0.2864,0.5121,0.9004,0.9992,0.9714,0.9865,0.4599,0.6294,0.9906,0.9992,gnomADg EAS,lncRNA,,,,,-1,"22271044,29391530,24262073,15896936,32996047,3..."
297878,,No,rs17250535,chrY,21311315,No,T,A,Homozygous,non coding transcript exon variant,2/10,MODIFIER,1.5,,,,,ENST00000437359.1,n.48T>A,,,1/1,64,12,12,0,12,100%,3.698E-7,0,0,6,6,,,0.0681,0,0.0059,0,0.05,0.2731,,,,,,,,,,,,,,,,,,,,,0.2731,SAS,unprocessed pseudogene,1/9,,,,1,3442492935176104
297879,DAZ3,No,rs1156813394,chrY,24796170,No,C,G,Homozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000315357.9,c.714+2253G>C,,,1/1,74,22,22,5,17,77.27%,3.8368E-8,1,4,11,6,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,"gnomADg AFR,gnomADg AMI,gnomADg AMR,gnomADg AS...",protein coding,,9/15,,,-1,
297880,DAZ3,No,rs1156813394,chrY,24796170,No,C,G,Homozygous,intron variant,2/10,MODIFIER,1.5,,,,,ENST00000382365.7,c.715-72G>C,,,1/1,74,22,22,5,17,77.27%,3.8368E-8,1,4,11,6,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,"gnomADg AFR,gnomADg AMI,gnomADg AMR,gnomADg AS...",protein coding,,9/18,,,-1,


In [58]:
merged_3.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/KHHSPTGPTTL59_depth_vcf_processed.xlsx', index=False)

In [75]:
import pandas as pd
x = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Mody_Lit_final_Positions_hg38_hg37 (2).xlsx')
x['chrom'] = x['Chrom-pos-Ref-Alt_38'].str.split(',')
x = x.explode('chrom')

x['CHROM'] = x['chrom'].str.split('-').str[0]

# Function to add 'chr' prefix conditionally
def add_chr_prefix(chrom):
    if pd.notnull(chrom) and chrom.strip() != '':
        return 'chr' + str(chrom)
    else:
        return chrom

# Applying the function to the 'chromosome' column
x['CHROM'] = x['CHROM'].apply(add_chr_prefix)
x['CHROM'] = x['CHROM'].str.strip()
x['CHROM'] = x['CHROM'].str.replace(r'\s+', '')
x['POS'] = x['chrom'].str.split('-').str[1]

x.dropna(subset=['CHROM'], inplace=True)
# Drop rows with empty cells after removing leading and trailing whitespaces
x['CHROM'] = x['CHROM'].str.strip()
x['POS'] = x['POS'].str.strip()
# Dropping rows with empty cells and NaN values in both 'chromosome' and 'position' columns
x.dropna(subset=['CHROM', 'POS'], inplace=True)
df_3 = x[['CHROM', 'POS']]
df_3['Literature'] = 'Yes'
df_3.drop_duplicates(subset='POS', inplace=True)
df_3['POS'] = df_3['POS'].astype('int64')
df_3 = df_3.reset_index()
df_3 = df_3[['CHROM', 'POS', 'Literature']]

df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KAPA HyperExome_hg38_capture_targets (1).bed', sep = '\t', header = None)
df.columns = ['chromosome', 'Start_pos', 'End_pos', 'INFO']

df['Extended_Start_pos'] = df['Start_pos'] - 20
df['Extended_End_pos'] = df['End_pos'] + 20

df['gene_symbol'] = df['INFO'].str.extract(r'gene_symbol=([^;]+)')
df = df[['chromosome', 'Extended_Start_pos', 'Extended_End_pos', 'INFO', 'gene_symbol']]


# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in dataset2
df_3['Covered/Not_Covered'] = df_3.apply(check_coverage, axis=1)

df_3 = df_3[df_3['Covered/Not_Covered'] == 'Covered']
df_3

Unnamed: 0,CHROM,POS,Literature,Covered/Not_Covered
0,chr12,120978691,Yes,Covered
1,chr12,120978908,Yes,Covered
2,chr12,120997590,Yes,Covered
3,chr12,120996531,Yes,Covered
4,chr12,120978928,Yes,Covered
...,...,...,...,...
2206,chr11,17394331,Yes,Covered
2207,chr11,17396955,Yes,Covered
2208,chr11,17395249,Yes,Covered
2209,chr11,17413396,Yes,Covered


In [76]:
df_3 = df_3[['CHROM', 'POS', 'Literature']]
df_3

Unnamed: 0,CHROM,POS,Literature
0,chr12,120978691,Yes
1,chr12,120978908,Yes
2,chr12,120997590,Yes
3,chr12,120996531,Yes
4,chr12,120978928,Yes
...,...,...,...
2206,chr11,17394331,Yes
2207,chr11,17396955,Yes
2208,chr11,17395249,Yes
2209,chr11,17413396,Yes


In [77]:
df_3.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Mody_lit_variants.xlsx', index=False)

In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [4]:
vcf_data_df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/220_final.vcf', comment= '#', sep = '\t', header=None, low_memory=False, encoding='ISO-8859-1')
vcf_data_df.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']

sample_cols = vcf_data_df['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']

# Assign the values to the newly created columns
vcf_data_df = pd.concat([vcf_data_df, sample_cols], axis=1)
vcf_data_df = vcf_data_df[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL','RDF', 'RDR', 'ADF', 'ADR']]
vcf_data_df

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
0,chr1,14574,rs28503599,A,G,.,PASS,ADP=8;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=DDX11...,1/1,24,8,8,2,6,75%,3.4965E-3,2,0,6,0
1,chr1,14590,rs707679,G,A,.,PASS,ADP=8;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=DDX11...,1/1,24,8,8,2,6,75%,3.4965E-3,2,0,6,0
2,chr1,14599,rs707680,T,A,.,PASS,"ADP=10;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8524,0....",0/1,22,10,10,4,6,60%,5.418E-3,4,0,6,0
3,chr1,14604,rs541940975,A,G,.,PASS,"ADP=9;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8524,.,0...",0/1,23,9,9,3,6,66.67%,4.5249E-3,3,0,6,0
4,chr1,14610,rs878986575,T,C,.,PASS,ADP=9;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX11...,0/1,23,9,9,3,6,66.67%,4.5249E-3,3,0,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61971,chrY,57196682,.,A,G,.,PASS,ADP=16;WT=0;HET=1;HOM=0;NC=0;CSQ=G|missense_va...,0/1,20,16,16,10,6,37.5%,8.837E-3,10,0,4,2
61972,chrY,57208367,.,A,G,.,PASS,ADP=21;WT=0;HET=0;HOM=1;NC=0;CSQ=G|intron_vari...,1/1,69,21,21,5,16,76.19%,1.2221E-7,4,1,7,9
61973,chrY,57208581,.,G,C,.,PASS,ADP=19;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,28,19,19,11,8,42.11%,1.5455E-3,11,0,8,0
61974,chrY,57209184,.,T,C,.,PASS,ADP=31;WT=0;HET=1;HOM=0;NC=0;CSQ=C|non_coding_...,0/1,46,31,31,18,13,41.94%,2.4825E-5,15,3,11,2


In [17]:
gene_list = ["CSF1", "FGF1", "FGF2", "ELK1", "FGF3", "IGF1R", "FGF4", "FGF5", "FGF6", "FGF7", "FGF8", "FGF9", "PRKACG", "AKT2", "AKT3", "KDR", "AKT1", "PRKACA", "PRKACB", "PDGFRB", "PRKCG", "PDGFRA", "MAP2K1", "MAP2K2", "PRKCB", "HGF", "FLT3LG", 
    "PRKCA", "PGF", "RAF1", "EPHA2", "CSF1R", "PDGFB", "PDGFBA", "TGFA", "STK4", 
    "EFNA5", "RASGRP2", "RASGRP1", "EFNA4", "RASGRP4", "RASGRP3", "RAP1B", "RAP1A", 
    "RRAS", "PDGFD", "PDGFC", "FGF20", "FGF23", "FGF22", "FGF21", "NGFR", "BDNF", 
    "INSR", "IGF2", "IGF1", "GNG12", "NFKB1", "FGF17", "EFNA1", "FGF16", "EFNA3", 
    "EFNA2", "FGF19", "FGF18", "NF1", "GRB2", "FGFR4", "FGFR3", "FGFR2", "FGF10", 
    "FGFR1", "FLT1", "FLT3", "FLT4", "RASGRF2", "RASGRF1", "FASLG", "IKBKB", "RAC2", 
    "RAC3", "IKBKG", "RAC1", "HRAS", "PLA2G4F", "PLA2G4D", "CHUK", "PLA2G4E", 
    "PLA2G4B", "PLA2G4C", "RRAS2", "PLA2G4A", "NGF", "MRAS", "RASA1", "RASA2", 
    "KIT", "SOS1", "MET", "SOS2", "RELA", "EGFR", "INS", "NTF4", "CDC42", "MAPK9", 
    "NRAS", "PAK1", "MAPK8", "NTF3", "MAPK1", "PAK2", "MAPK3", "NTRK1", "NTRK2", 
    "ANGPT4", "ANGPT2", "ANGPT1", "EGF", "VEGFB", "VEGFC", "VEGFD", "VEGFA", 
    "MAPK10", "KITLG", "KRAS", "TEK"]
print(len(gene_list))



["CSF1", "FGF1", "FGF2", "ELK1", "FGF3", "IGF1R", "FGF4", "FGF5", "FGF6", "FGF7", 
"FGF8", "FGF9", "PRKACG", "AKT2", "AKT3", "KDR", "AKT1", "PRKACA", "PRKACB", 
"PDGFRB", "PRKCG", "PDGFRA", "MAP2K1", "MAP2K2", "PRKCB", "HGF", "FLT3LG", 
"PRKCA", "PGF", "RAF1", "EPHA2", "CSF1R", "PDGFB", "PDGFA", "TGFA", "STK4", 
"EFNA5", "RASGRP2", "RASGRP1", "EFNA4", "RASGRP4", "RASGRP3", "RAP1B", "RAP1A", 
"RRAS", "PDGFD", "PDGFC", "FGF20", "FGF23", "FGF22", "FGF21", "NGFR", "BDNF", 
"INSR", "IGF2", "IGF1", "GNG12", "NFKB1", "FGF17", "EFNA1", "FGF16", "EFNA3", 
"EFNA2", "FGF19", "FGF18", "NF1", "GRB2", "FGFR4", "FGFR3", "FGFR2", "FGF10", 
"FGFR1", "FLT1", "FLT3", "FLT4", "RASGRF2", "RASGRF1", "FASLG", "IKBKB", "RAC2", 
"RAC3", "IKBKG", "RAC1", "HRAS", "PLA2G4F", "PLA2G4D", "CHUK", "PLA2G4E", 
"PLA2G4B", "PLA2G4C", "RRAS2", "PLA2G4A", "NGF", "MRAS", "RASA1", "RASA2", 
"KIT", "SOS1", "MET", "SOS2", "RELA", "EGFR", "INS", "NTF4", "CDC42", "MAPK9", 
"NRAS", "PAK1", "MAPK8", "NTF3", "MAPK1", "PAK2", "MAPK3", "NTRK1", "NTRK2", 
"ANGPT4", "ANGPT2", "ANGPT1", "EGF", "VEGFB", "VEGFC", "VEGFD", "VEGFA", 
"MAPK10", "KITLG", "KRAS", "TEK"]

127


In [5]:
# Create a list of genes you want to search for
gene_list = ["CSF1", "FGF1", "FGF2", "ELK1", "FGF3", "IGF1R", "FGF4", "FGF5", "FGF6", "FGF7", 
             "FGF8", "FGF9", "PRKACG", "AKT2", "AKT3", "KDR", "AKT1", "PRKACA", "PRKACB", 
             "PDGFRB", "PRKCG", "PDGFRA", "MAP2K1", "MAP2K2", "PRKCB", "HGF", "FLT3LG", 
             "PRKCA", "PGF", "RAF1", "EPHA2", "CSF1R", "PDGFB", "PDGFA", "TGFA", "STK4", 
             "EFNA5", "RASGRP2", "RASGRP1", "EFNA4", "RASGRP4", "RASGRP3", "RAP1B", "RAP1A", 
             "RRAS", "PDGFD", "PDGFC", "FGF20", "FGF23", "FGF22", "FGF21", "NGFR", "BDNF", 
             "INSR", "IGF2", "IGF1", "GNG12", "NFKB1", "FGF17", "EFNA1", "FGF16", "EFNA3", 
             "EFNA2", "FGF19", "FGF18", "NF1", "GRB2", "FGFR4", "FGFR3", "FGFR2", "FGF10", 
             "FGFR1", "FLT1", "FLT3", "FLT4", "RASGRF2", "RASGRF1", "FASLG", "IKBKB", "RAC2", 
             "RAC3", "IKBKG", "RAC1", "HRAS", "PLA2G4F", "PLA2G4D", "CHUK", "PLA2G4E", 
             "PLA2G4B", "PLA2G4C", "RRAS2", "PLA2G4A", "NGF", "MRAS", "RASA1", "RASA2", 
             "KIT", "SOS1", "MET", "SOS2", "RELA", "EGFR", "INS", "NTF4", "CDC42", "MAPK9", 
             "NRAS", "PAK1", "MAPK8", "NTF3", "MAPK1", "PAK2", "MAPK3", "NTRK1", "NTRK2", 
             "ANGPT4", "ANGPT2", "ANGPT1", "EGF", "VEGFB", "VEGFC", "VEGFD", "VEGFA", 
             "MAPK10", "KITLG", "KRAS", "TEK"]

# Create a new DataFrame to store the matching rows
matched_rows = pd.DataFrame(columns=vcf_data_df.columns)  # Create an empty DataFrame with the same columns

# Create a new column to store matched genes
matched_rows["Matched_Genes"] = ""

# Create a regex pattern to match genes within any separator
gene_pattern = r'\b(' + '|'.join(gene_list) + r')\b'

# Iterate through each row in the data
for index, row in vcf_data_df.iterrows():
    info = row["INFO"]
    
    # Use regex to find all matched genes within the INFO column
    matched_genes = re.findall(gene_pattern, info, re.IGNORECASE)
    
    if matched_genes:
        # Update the "Matched_Genes" column with matched genes
        matched_rows.at[index, "Matched_Genes"] = ', '.join(matched_genes)
        matched_rows = pd.concat([matched_rows, vcf_data_df.loc[[index]]])
        
matched_rows

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,Matched_Genes
866,,,,,,,,,,,,,,,,,,,,,"EPHA2, EPHA2"
866,chr1,16124918,rs1803527,C,T,.,PASS,"ADP=23;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.09425,0...",1/1,129,23,23,0,23,100%,1.2146E-13,0,0,16,7,
867,,,,,,,,,,,,,,,,,,,,,"EPHA2, EPHA2, EPHA2, EPHA2"
867,chr1,16132319,rs2291805,C,T,.,PASS,"ADP=38;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8726,0....",0/1,25,38,38,30,8,21.05%,2.5935E-3,18,12,7,1,
868,,,,,,,,,,,,,,,,,,,,,"EPHA2, EPHA2, EPHA2, EPHA2"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61808,chrX,154556836,.,C,T,.,PASS,ADP=20;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intron_vari...,0/1,27,20,20,12,8,40%,1.638E-3,9,3,4,4,
61809,,,,,,,,,,,,,,,,,,,,,"IKBKG, IKBKG, IKBKG, IKBKG, IKBKG, IKBKG, IKBK..."
61809,chrX,154558768,rs55810978,G,C,.,PASS,ADP=8;WT=0;HET=0;HOM=1;NC=0;ASP;GENEINFO=IKBKG...,1/1,41,8,8,0,8,100%,7.77E-5,0,0,1,7,
61810,,,,,,,,,,,,,,,,,,,,,"IKBKG, IKBKG, IKBKG, IKBKG, IKBKG, IKBKG, IKBK..."


In [6]:
758/2

379.0

In [35]:
matched_rows.INFO.iloc[3]

'ADP=9;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.003195,0.9968;COMMON=1;GENEINFO=CSF1:1435;GNO;HD;INT;KGPhase1;KGPhase3;RS=333972;RSPOS=109925123;SAO=0;SLO;SSR=0;TOPMED=0.00305810397553516,0.99694189602446483;VC=SNV;VLD;VP=0x05010008000504053e000100;WGT=1;dbSNPBuildID=79;CSQ=G|intron_variant|MODIFIER|CSF1|ENSG00000184371|Transcript|ENST00000329608|protein_coding||7/8||||||||||1||HGNC|HGNC:2432|||||,G|intron_variant|MODIFIER|CSF1|ENSG00000184371|Transcript|ENST00000369801|protein_coding||8/8||||||||||1||HGNC|HGNC:2432|||||,G|intron_variant|MODIFIER|CSF1|ENSG00000184371|Transcript|ENST00000369802|protein_coding||7/8||||||||||1||HGNC|HGNC:2432|||||,G|intron_variant|MODIFIER|CSF1|ENSG00000184371|Transcript|ENST00000420111|protein_coding||7/8||||||||||1||HGNC|HGNC:2432|||||,G|downstream_gene_variant|MODIFIER|CSF1|ENSG00000184371|Transcript|ENST00000488198|protein_coding|||||||||||1524|1|cds_end_NF|HGNC|HGNC:2432|||||,G|downstream_gene_variant|MODIFIER|CSF1|ENSG00000184371|Transcript|ENST00000525659|p

In [25]:
vcf_data_df.Matching_Genes.value_counts()

[]    61976
Name: Matching_Genes, dtype: int64

In [7]:
matched_rows.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/220_output.xlsx', index=False)

In [27]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Fatty_Liver.xlsx')
df

Unnamed: 0,Gene,rsID,Citatation
0,PNPLA3,rs738409,"PMID: 32333362, PMID: 27278285, PMID: 26409295"
1,PNPLA3,rs4823173,PMID: 36000237
2,PNPLA3,rs2896019,PMID: 36000237
3,PNPLA3,rs3810622,PMID: 36000237
4,PNPLA3,rs2281135,PMID: 36000237
5,PNPLA3,rs12483959,PMID: 36000237
6,PNPLA3,rs143392071,PMID: 36000237
7,PNPLA3,rs2143571,PMID: 36000237
8,TM6SF2,rs58542926,"PMID: 26807382, PMID: 36353245, PMID: 31713012..."
9,GCKR,rs1260326,"PMID: 32383295,PMID: 34535985, PMID: 34841290,..."


In [26]:
data = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/Fatty_liver_data.xlsx')
data

Unnamed: 0,Gene Name,rsID,Covered/Not_covered,CHROM,POS,REF,ALT,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND
0,GCKR,rs1260326,Covered,chr2,27508073,T,C,"missense variant,splice region variant",7/10,MODERATE,5.0,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751.0,ENST00000264717.7,c.1337T>C,ENSP00000264717.2,p.Leu446Pro,tolerated(1),benign(0),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.633100,0.8679,0.6662,0.4668,0.510800,0.6426,0.5902,0.6005,0.758,0.6700,0.8582,0.7083,0.6260,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.906200,AFR,protein coding,15/19,,L446P,cTg/cCg,1.0
1,GCKR,rs1260326,Covered,chr2,27508073,T,G,"missense variant,splice region variant",7/10,MODERATE,5.0,,,,,ENST00000264717.7,c.1337T>G,ENSP00000264717.2,p.Leu446Arg,tolerated(0.06),benign(0),,,,,,,0.000004,0.0000,0.0000,0.0000,0.000054,0.0000,0.0000,0.0000,0.000,,,,,,,,,,,,0.000054,gnomADe EAS,protein coding,15/19,,L446R,cTg/cGg,1.0
2,GCKR,rs1260326,Covered,chr2,27508073,T,C,"missense variant,splice region variant",7/10,MODERATE,5.0,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751.0,ENST00000411584.1,c.440T>C,ENSP00000416917.1,p.Leu147Pro,tolerated(1),benign(0.001),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.633100,0.8679,0.6662,0.4668,0.510800,0.6426,0.5902,0.6005,0.758,0.6700,0.8582,0.7083,0.6260,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.906200,AFR,protein coding,5/7,,L147P,cTg/cCg,1.0
3,GCKR,rs1260326,Covered,chr2,27508073,T,G,"missense variant,splice region variant",7/10,MODERATE,5.0,,,,,ENST00000411584.1,c.440T>G,ENSP00000416917.1,p.Leu147Arg,tolerated(0.06),benign(0.286),,,,,,,0.000004,0.0000,0.0000,0.0000,0.000054,0.0000,0.0000,0.0000,0.000,,,,,,,,,,,,0.000054,gnomADe EAS,protein coding,5/7,,L147R,cTg/cGg,1.0
4,GCKR,rs1260326,Covered,chr2,27508073,T,C,downstream gene variant,2/10,MODIFIER,1.5,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751.0,,,,,,,0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.633100,0.8679,0.6662,0.4668,0.510800,0.6426,0.5902,0.6005,0.758,0.6700,0.8582,0.7083,0.6260,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.906200,AFR,retained intron,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,SAMM50,rs2143571,Not_Covered,chr22,43995806,G,C,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000493621.1,n.100+316G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding CDS not defined,,1/1,,,1.0
106,SAMM50,rs2143571,Not_Covered,chr22,43995806,G,A,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000494795.1,n.3057-532G>A,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.2270,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.387500,gnomADg EAS,retained intron,,2/2,,,1.0
107,SAMM50,rs2143571,Not_Covered,chr22,43995806,G,C,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000494795.1,n.3057-532G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,retained intron,,2/2,,,1.0
108,SAMM50,rs2143571,Not_Covered,chr22,43995806,G,A,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.2270,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.387500,gnomADg EAS,promoter,,,,,


In [28]:
merged_df = pd.merge(data, df, on = 'rsID', how='left', sort=False)
merged_df

Unnamed: 0,Gene Name,rsID,Covered/Not_covered,CHROM,POS,REF,ALT,Consequence,Consequence_score,IMPACT,IMPACT_score,ClinVar_CLNDN,CLIN_SIG,ClinVar_CLNREVSTAT,ClinVar,HGVSc,HGVSc (Transcript),HGVSp,HGVSp (Transcript),SIFT,PolyPhen,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_NFE_AF,gnomADe_OTH_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_OTH_AF,gnomADg_SAS_AF,MAX_AF,MAX_AF_POPS,BIOTYPE,EXON,INTRON,Protein Position and Amino Acid,Codons,STRAND,Gene,Citatation
0,GCKR,rs1260326,Covered,chr2,27508073,T,C,"missense variant,splice region variant",7/10,MODERATE,5.0,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751.0,ENST00000264717.7,c.1337T>C,ENSP00000264717.2,p.Leu446Pro,tolerated(1),benign(0),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.633100,0.8679,0.6662,0.4668,0.510800,0.6426,0.5902,0.6005,0.758,0.6700,0.8582,0.7083,0.6260,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.906200,AFR,protein coding,15/19,,L446P,cTg/cCg,1.0,GCKR,"PMID: 32383295,PMID: 34535985, PMID: 34841290,..."
1,GCKR,rs1260326,Covered,chr2,27508073,T,G,"missense variant,splice region variant",7/10,MODERATE,5.0,,,,,ENST00000264717.7,c.1337T>G,ENSP00000264717.2,p.Leu446Arg,tolerated(0.06),benign(0),,,,,,,0.000004,0.0000,0.0000,0.0000,0.000054,0.0000,0.0000,0.0000,0.000,,,,,,,,,,,,0.000054,gnomADe EAS,protein coding,15/19,,L446R,cTg/cGg,1.0,GCKR,"PMID: 32383295,PMID: 34535985, PMID: 34841290,..."
2,GCKR,rs1260326,Covered,chr2,27508073,T,C,"missense variant,splice region variant",7/10,MODERATE,5.0,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751.0,ENST00000411584.1,c.440T>C,ENSP00000416917.1,p.Leu147Pro,tolerated(1),benign(0.001),0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.633100,0.8679,0.6662,0.4668,0.510800,0.6426,0.5902,0.6005,0.758,0.6700,0.8582,0.7083,0.6260,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.906200,AFR,protein coding,5/7,,L147P,cTg/cCg,1.0,GCKR,"PMID: 32383295,PMID: 34535985, PMID: 34841290,..."
3,GCKR,rs1260326,Covered,chr2,27508073,T,G,"missense variant,splice region variant",7/10,MODERATE,5.0,,,,,ENST00000411584.1,c.440T>G,ENSP00000416917.1,p.Leu147Arg,tolerated(0.06),benign(0.286),,,,,,,0.000004,0.0000,0.0000,0.0000,0.000054,0.0000,0.0000,0.0000,0.000,,,,,,,,,,,,0.000054,gnomADe EAS,protein coding,5/7,,L147R,cTg/cGg,1.0,GCKR,"PMID: 32383295,PMID: 34535985, PMID: 34841290,..."
4,GCKR,rs1260326,Covered,chr2,27508073,T,C,downstream gene variant,2/10,MODIFIER,1.5,Fasting plasma glucose level quantitative trai...,"benign,association","criteria provided, multiple submitters, no con...",8751.0,,,,,,,0.7067,0.9062,0.6383,0.5188,0.5895,0.7996,0.633100,0.8679,0.6662,0.4668,0.510800,0.6426,0.5902,0.6005,0.758,0.6700,0.8582,0.7083,0.6260,0.4585,0.5014,0.6378,0.4904,0.5898,0.6106,0.7566,0.906200,AFR,retained intron,,,,,1.0,GCKR,"PMID: 32383295,PMID: 34535985, PMID: 34841290,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,SAMM50,rs2143571,Not_Covered,chr22,43995806,G,C,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000493621.1,n.100+316G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,protein coding CDS not defined,,1/1,,,1.0,PNPLA3,PMID: 36000237
106,SAMM50,rs2143571,Not_Covered,chr22,43995806,G,A,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000494795.1,n.3057-532G>A,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.2270,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.387500,gnomADg EAS,retained intron,,2/2,,,1.0,PNPLA3,PMID: 36000237
107,SAMM50,rs2143571,Not_Covered,chr22,43995806,G,C,"intron variant,non coding transcript variant",2/10,MODIFIER,1.5,,,,,ENST00000494795.1,n.3057-532G>C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,retained intron,,2/2,,,1.0,PNPLA3,PMID: 36000237
108,SAMM50,rs2143571,Not_Covered,chr22,43995806,G,A,regulatory region variant,2/10,MODIFIER,1.5,,,,,,,,,,,0.3075,0.3797,0.3415,0.3661,0.2087,0.2270,,,,,,,,,,0.2519,0.3536,0.2325,0.3173,0.1317,0.3875,0.2268,0.2184,0.1789,0.2555,0.2035,0.387500,gnomADg EAS,promoter,,,,,,PNPLA3,PMID: 36000237


In [29]:
merged_df.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/Processed_vcf_files/Fatty_liver_data.xlsx', index=False)

In [32]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/KHHSPTGPCSP22_genes.xlsx')
df

Unnamed: 0,Gene Name,Matched Gene
0,CTLA4,Type1 Daibetes
1,INS,Type1 Daibetes
2,IL2RA,Type1 Daibetes
3,HNF1A,Type1 Daibetes
4,CEL,Type1 Daibetes
...,...,...
278,TRDN,Cardiac
279,LDLRAP1,Cardiac
280,PCSK9,Cardiac
281,APOB,Cardiac


In [33]:
result = df.groupby('Gene Name')['Matched Gene'].agg(lambda x: ', '.join(x.unique())).reset_index()
result

Unnamed: 0,Gene Name,Matched Gene
0,ABCA1,"Cholesterol, Cardiac"
1,ABCB4,Cholesterol
2,ABCC8,"Type2 Diabetes, MODY, Obesity"
3,ABCC9,Cardiac
4,ABCG5,Cholesterol
...,...,...
241,WNK1,Hypertension
242,WNK4,Hypertension
243,WRN,Type2 Diabetes
244,ZMPSTE24,Cholesterol


In [35]:
result.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/KHHSPTGPCSP22_genes.xlsx')