In [None]:
# KHGLBS433_final

In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas/numpy deprecation and dtype warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Display all columns of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns',None)
import psycopg2

# Progress marker printed once the imports and options above have run.
print('Libraries Loaded')

def read_bed_file(bed_file):
    """Expand a BED file into the set of 1-based positions it covers.

    BED intervals are 0-based and half-open (``chromStart`` is included,
    ``chromEnd`` is not), whereas VCF ``POS`` values are 1-based. An interval
    ``start``/``end`` therefore covers the 1-based positions ``start + 1``
    through ``end`` inclusive; position ``start`` itself lies just outside the
    interval and is not added.

    Parameters
    ----------
    bed_file : str
        Path to a tab-separated BED file.

    Returns
    -------
    set of (str, int)
        ``(chrom, pos)`` tuples, with ``pos`` in 1-based coordinates so it can
        be compared directly with a VCF ``POS`` column.

    Notes
    -----
    Lines starting with ``#``, lines with fewer than three tab-separated
    fields, and lines whose start/end are not integers are skipped silently.
    Every covered position is materialised, so memory grows with the total
    number of covered bases.
    """
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            chrom = fields[0]
            try:
                start = int(fields[1])
                end = int(fields[2])
            except ValueError:
                continue  # Skip this line if start or end position is not an integer
            # 0-based half-open [start, end)  ->  1-based closed [start + 1, end]
            bed_positions.update((chrom, pos) for pos in range(start + 1, end + 1))
    return bed_positions

def normalize_chrom_name(chrom):
    """Return the part of ``chrom`` that precedes its first underscore.

    A name without an underscore is returned unchanged.
    """
    prefix, _sep, _rest = chrom.partition('_')
    return prefix

def filter_vcf_file(vcf_file, bed_positions):
    """Collect the VCF lines whose (chromosome, position) is in ``bed_positions``.

    Header lines (those starting with ``#``) are always kept. For data lines
    the chromosome is passed through ``normalize_chrom_name`` before the
    lookup. Lines with fewer than two tab-separated fields, or with a
    non-integer second field, are dropped.

    Returns the kept lines as a list of strings, in file order, each exactly
    as read from the file.
    """
    kept_lines = []
    with open(vcf_file, 'r') as handle:
        for raw_line in handle:
            # Header lines are carried over unconditionally.
            if raw_line.startswith('#'):
                kept_lines.append(raw_line)
                continue

            columns = raw_line.strip().split('\t')
            if len(columns) < 2:
                continue

            contig = normalize_chrom_name(columns[0])
            try:
                position = int(columns[1])
            except ValueError:
                # 'POS' is not an integer: ignore the record.
                continue

            if (contig, position) in bed_positions:
                kept_lines.append(raw_line)
    return kept_lines

def write_filtered_vcf(filtered_vcf_records, output_file):
    """Write the given record strings, in order and unmodified, to ``output_file``.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(output_file, 'w') as sink:
        sink.writelines(filtered_vcf_records)

def main(
    bed_file=r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/srinivas_sir_covered.bed',
    vcf_file=r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/KHGLBS433_final.vcf',
    output_file=r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHGLBS433_final.vcf',
):
    """Filter a VCF down to the positions listed in a BED file and write the result.

    Parameters
    ----------
    bed_file : str
        Path of the BED file passed to ``read_bed_file``.
    vcf_file : str
        Path of the input VCF passed to ``filter_vcf_file``.
    output_file : str
        Path the filtered VCF lines are written to by ``write_filtered_vcf``.

    All three default to the paths this notebook was originally run with, so
    calling ``main()`` without arguments behaves as before; pass other paths
    to process a different sample.
    """
    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)
    write_filtered_vcf(filtered_vcf_records, output_file)

if __name__ == "__main__":
    main()

print('Covered Extracted')

# Read the BED-filtered VCF as a headerless tab-separated table ('#' lines are
# treated as comments) and label its ten columns.
vcf = pd.read_csv(
    r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHGLBS433_final.vcf',
    sep='\t',
    comment='#',
    header=None,
    low_memory=False,
).set_axis(
    ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE'],
    axis='columns',
)

# Break the colon-delimited SAMPLE column into one column per sub-field.
sample_cols = vcf['SAMPLE'].str.split(':', expand=True).set_axis(
    ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'Rvcf', 'RDR', 'Avcf', 'ADR'],
    axis='columns',
)

# Attach the sub-fields and keep only the columns used downstream
# (FORMAT, SAMPLE, RBQ and ABQ are left out).
vcf = vcf.join(sample_cols)[
    ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL','Rvcf', 'RDR', 'Avcf', 'ADR']
]

print('Data Loaded')

# Pull the single digit that follows HET= and HOM= in the INFO string.
vcf['HET'] = vcf['INFO'].str.extract(r'HET=(\d)', expand=False)
vcf['HOM'] = vcf['INFO'].str.extract(r'HOM=(\d)', expand=False)

# Derive 'Zygosity': blank by default, 'Homozygous' where HOM is '1', and
# 'Heterozygous' where HET is '1' (assigned last, so it wins if both are '1').
vcf['Zygosity'] = ''
vcf.loc[vcf['HOM'].eq('1'), 'Zygosity'] = 'Homozygous'
vcf.loc[vcf['HET'].eq('1'), 'Zygosity'] = 'Heterozygous'

# Make sure the genotype column holds strings.
vcf = vcf.astype({'GT': str})

print('Zygosity Extracted')

# Gene names: take the GENEINFO value from INFO (everything up to the next ';'
# or the end of the string, so a trailing GENEINFO field is still captured),
# split it on '|', and keep the part before ':' of every segment.
# dict.fromkeys removes duplicates while preserving first-seen order, which
# keeps the joined string identical from run to run (a set would not).
vcf['Gene Name'] = vcf['INFO'].str.extract(r'GENEINFO=([^;]+)', expand=False)
vcf['Gene Name'] = vcf['Gene Name'].apply(
    lambda x: ','.join(dict.fromkeys(segment.split(':')[0] for segment in x.split('|')))
    if pd.notnull(x) else ''
)

# CSQ: capture everything after 'CSQ=' and produce one row per comma-separated entry.
# NOTE(review): the pattern runs to the end of INFO, so it assumes CSQ is the
# last INFO field — confirm against the annotation pipeline.
vcf['CSQ'] = vcf['INFO'].str.extract(r'CSQ=(.*)')
vcf['csq'] = vcf['CSQ'].str.split(',')
vcf = vcf.explode('csq')  # original row index is repeated for each entry

print('Gene and CSQ splitted')

# IMPACT is the third '|'-separated field of each CSQ entry.
vcf['IMPACT'] = vcf['csq'].str.split('|').str[2]
vcf

Libraries Loaded
Covered Extracted
Data Loaded
Zygosity Extracted
Gene and CSQ splitted


Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,Rvcf,RDR,Avcf,ADR,HET,HOM,Zygosity,Gene Name,CSQ,csq,IMPACT
0,chr1,69511,rs2691305,A,G,.,PASS,ADP=76;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=O...,1/1,255,76,76,2,74,97.37%,8.1413E-42,2,0,57,17,0,1,Homozygous,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,MODERATE
1,chr1,942335,rs6605066,C,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,99,18,18,0,18,100%,1.1019E-10,0,0,8,10,0,1,Homozygous,SAMD11,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,MODIFIER
1,chr1,942335,rs6605066,C,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,99,18,18,0,18,100%,1.1019E-10,0,0,8,10,0,1,Homozygous,SAMD11,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,G|intron_variant|MODIFIER|SAMD11|ENSG000001876...,MODIFIER
1,chr1,942335,rs6605066,C,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,99,18,18,0,18,100%,1.1019E-10,0,0,8,10,0,1,Homozygous,SAMD11,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,G|intron_variant|MODIFIER|SAMD11|ENSG000001876...,MODIFIER
1,chr1,942335,rs6605066,C,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,99,18,18,0,18,100%,1.1019E-10,0,0,8,10,0,1,Homozygous,SAMD11,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,G|intron_variant|MODIFIER|SAMD11|ENSG000001876...,MODIFIER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22761,chrY,2221159,.,T,C,.,PASS,ADP=9;WT=0;HET=0;HOM=1;NC=0;CSQ=C|missense_var...,1/1,46,9,9,0,9,100%,2.0568E-5,0,0,6,3,0,1,Homozygous,,C|missense_variant|MODERATE|DHRSX|ENSG00000169...,C|missense_variant|MODERATE|DHRSX|ENSG00000169...,MODERATE
22762,chrY,12340004,rs377316799,G,T,.,PASS,"ADP=8;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9765,0.0...",1/1,41,8,8,0,8,100%,7.77E-5,0,0,1,7,0,1,Homozygous,,T|intron_variant&non_coding_transcript_variant...,T|intron_variant&non_coding_transcript_variant...,MODIFIER
22762,chrY,12340004,rs377316799,G,T,.,PASS,"ADP=8;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9765,0.0...",1/1,41,8,8,0,8,100%,7.77E-5,0,0,1,7,0,1,Homozygous,,T|intron_variant&non_coding_transcript_variant...,T|intron_variant&non_coding_transcript_variant...,MODIFIER
22762,chrY,12340004,rs377316799,G,T,.,PASS,"ADP=8;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9765,0.0...",1/1,41,8,8,0,8,100%,7.77E-5,0,0,1,7,0,1,Homozygous,,T|intron_variant&non_coding_transcript_variant...,T|intron_variant&non_coding_transcript_variant...,MODIFIER


In [2]:
# Load the IMPACT lookup workbook; it is joined to the variants on 'IMPACT' in a later cell.
df_2 = pd.read_excel(
    io=r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/IMPACT.xlsx',
)

In [4]:
# Left-join the lookup table onto the exploded variants by 'IMPACT', keeping
# every variant row and the left-hand row order.
merged_2 = vcf.merge(df_2, how='left', on='IMPACT', sort=False)
merged_2

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,Rvcf,RDR,Avcf,ADR,HET,HOM,Zygosity,Gene Name,CSQ,csq,IMPACT,IMPACT_score
0,chr1,69511,rs2691305,A,G,.,PASS,ADP=76;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=O...,1/1,255,76,76,2,74,97.37%,8.1413E-42,2,0,57,17,0,1,Homozygous,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,MODERATE,5.0
1,chr1,942335,rs6605066,C,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,99,18,18,0,18,100%,1.1019E-10,0,0,8,10,0,1,Homozygous,SAMD11,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,MODIFIER,1.5
2,chr1,942335,rs6605066,C,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,99,18,18,0,18,100%,1.1019E-10,0,0,8,10,0,1,Homozygous,SAMD11,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,G|intron_variant|MODIFIER|SAMD11|ENSG000001876...,MODIFIER,1.5
3,chr1,942335,rs6605066,C,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,99,18,18,0,18,100%,1.1019E-10,0,0,8,10,0,1,Homozygous,SAMD11,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,G|intron_variant|MODIFIER|SAMD11|ENSG000001876...,MODIFIER,1.5
4,chr1,942335,rs6605066,C,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,99,18,18,0,18,100%,1.1019E-10,0,0,8,10,0,1,Homozygous,SAMD11,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,G|intron_variant|MODIFIER|SAMD11|ENSG000001876...,MODIFIER,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191374,chrY,2221159,.,T,C,.,PASS,ADP=9;WT=0;HET=0;HOM=1;NC=0;CSQ=C|missense_var...,1/1,46,9,9,0,9,100%,2.0568E-5,0,0,6,3,0,1,Homozygous,,C|missense_variant|MODERATE|DHRSX|ENSG00000169...,C|missense_variant|MODERATE|DHRSX|ENSG00000169...,MODERATE,5.0
191375,chrY,12340004,rs377316799,G,T,.,PASS,"ADP=8;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9765,0.0...",1/1,41,8,8,0,8,100%,7.77E-5,0,0,1,7,0,1,Homozygous,,T|intron_variant&non_coding_transcript_variant...,T|intron_variant&non_coding_transcript_variant...,MODIFIER,1.5
191376,chrY,12340004,rs377316799,G,T,.,PASS,"ADP=8;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9765,0.0...",1/1,41,8,8,0,8,100%,7.77E-5,0,0,1,7,0,1,Homozygous,,T|intron_variant&non_coding_transcript_variant...,T|intron_variant&non_coding_transcript_variant...,MODIFIER,1.5
191377,chrY,12340004,rs377316799,G,T,.,PASS,"ADP=8;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9765,0.0...",1/1,41,8,8,0,8,100%,7.77E-5,0,0,1,7,0,1,Homozygous,,T|intron_variant&non_coding_transcript_variant...,T|intron_variant&non_coding_transcript_variant...,MODIFIER,1.5


In [9]:
# Load the gene list workbook; its 'Gene Name' column is used for panel matching below.
df_gene = pd.read_excel(
    io=r'C:/Users/GenepoweRx_Madhu/Downloads/433_genes.xlsx',
)
df_gene

Unnamed: 0,Gene Name
0,BARD1
1,BRCA1
2,BRCA2
3,BRIP1
4,CDK12
5,CHEK1
6,CHEK2
7,FANCL
8,NTRK1
9,NTRK2


In [10]:
# Flag each variant row with 'Yes' when at least one of its comma-separated
# gene names appears in the gene list (df_gene['Gene Name']), otherwise 'No'.
# The gene list is converted to a set once, and every row is evaluated in a
# single pass, instead of re-scanning the whole frame for every row.
panel_genes = set(df_gene['Gene Name'].dropna())


def _gene_match_flag(genes):
    """Return 'Yes' if `genes` is a string naming any panel gene, else 'No'."""
    # Non-string values (e.g. NaN) never match.
    if isinstance(genes, str) and not panel_genes.isdisjoint(genes.split(',')):
        return 'Yes'
    return 'No'


merged_2['Gene_Match'] = merged_2['Gene Name'].apply(_gene_match_flag)

merged_2

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,Rvcf,RDR,Avcf,ADR,HET,HOM,Zygosity,Gene Name,CSQ,csq,IMPACT,IMPACT_score,Gene_Match
0,chr1,69511,rs2691305,A,G,.,PASS,ADP=76;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=O...,1/1,255,76,76,2,74,97.37%,8.1413E-42,2,0,57,17,0,1,Homozygous,OR4F5,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,MODERATE,5.0,No
1,chr1,942335,rs6605066,C,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,99,18,18,0,18,100%,1.1019E-10,0,0,8,10,0,1,Homozygous,SAMD11,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,MODIFIER,1.5,No
2,chr1,942335,rs6605066,C,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,99,18,18,0,18,100%,1.1019E-10,0,0,8,10,0,1,Homozygous,SAMD11,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,G|intron_variant|MODIFIER|SAMD11|ENSG000001876...,MODIFIER,1.5,No
3,chr1,942335,rs6605066,C,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,99,18,18,0,18,100%,1.1019E-10,0,0,8,10,0,1,Homozygous,SAMD11,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,G|intron_variant|MODIFIER|SAMD11|ENSG000001876...,MODIFIER,1.5,No
4,chr1,942335,rs6605066,C,G,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,99,18,18,0,18,100%,1.1019E-10,0,0,8,10,0,1,Homozygous,SAMD11,G|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,G|intron_variant|MODIFIER|SAMD11|ENSG000001876...,MODIFIER,1.5,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191374,chrY,2221159,.,T,C,.,PASS,ADP=9;WT=0;HET=0;HOM=1;NC=0;CSQ=C|missense_var...,1/1,46,9,9,0,9,100%,2.0568E-5,0,0,6,3,0,1,Homozygous,,C|missense_variant|MODERATE|DHRSX|ENSG00000169...,C|missense_variant|MODERATE|DHRSX|ENSG00000169...,MODERATE,5.0,No
191375,chrY,12340004,rs377316799,G,T,.,PASS,"ADP=8;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9765,0.0...",1/1,41,8,8,0,8,100%,7.77E-5,0,0,1,7,0,1,Homozygous,,T|intron_variant&non_coding_transcript_variant...,T|intron_variant&non_coding_transcript_variant...,MODIFIER,1.5,No
191376,chrY,12340004,rs377316799,G,T,.,PASS,"ADP=8;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9765,0.0...",1/1,41,8,8,0,8,100%,7.77E-5,0,0,1,7,0,1,Homozygous,,T|intron_variant&non_coding_transcript_variant...,T|intron_variant&non_coding_transcript_variant...,MODIFIER,1.5,No
191377,chrY,12340004,rs377316799,G,T,.,PASS,"ADP=8;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.9765,0.0...",1/1,41,8,8,0,8,100%,7.77E-5,0,0,1,7,0,1,Homozygous,,T|intron_variant&non_coding_transcript_variant...,T|intron_variant&non_coding_transcript_variant...,MODIFIER,1.5,No


In [11]:
# Write the annotated variant table to an Excel workbook, without the index column.
merged_2.to_excel(
    excel_writer=r'C:/Users/GenepoweRx_Madhu/Downloads/KHGLBS433_final.xlsx',
    index=False,
)