In [1]:
######################################## Loading the Libraries #############################################################
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

################################## Covered rows extraction ###############################################################

def read_bed_file(bed_file):
    """Collect every 1-based genomic position covered by a BED file.

    BED intervals are 0-based and half-open: the start column is inclusive
    and the end column is exclusive. VCF ``POS`` values are 1-based, so an
    interval ``(start, end)`` covers the 1-based positions
    ``start + 1 .. end``. The returned positions use that 1-based numbering
    so they can be compared directly with VCF positions.

    Lines starting with ``#``, lines with fewer than three tab-separated
    fields, and lines whose start or end is not an integer are skipped.

    Args:
        bed_file: Path of the tab-separated BED file to read.

    Returns:
        A set of ``(chrom, pos)`` tuples, one per covered base.
    """
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            chrom = fields[0]
            try:
                start = int(fields[1])
                end = int(fields[2])
            except ValueError:
                continue  # Start or end is not an integer (e.g. a column-title row)
            # 0-based half-open [start, end) == 1-based closed [start + 1, end]
            bed_positions.update((chrom, pos) for pos in range(start + 1, end + 1))
    return bed_positions

def normalize_chrom_name(chrom):
    """Return the part of *chrom* that precedes its first underscore.

    A name without an underscore is returned unchanged.
    """
    base_name, _, _ = chrom.partition('_')
    return base_name

def filter_vcf_file(vcf_file, bed_positions):
    """Return the lines of a VCF file that fall on the given positions.

    Every line starting with ``#`` is kept as-is. A record line is kept when
    the pair (normalized chromosome name, integer second column) is a member
    of *bed_positions*. Record lines with fewer than two tab-separated
    fields, or with a non-integer second column, are dropped.

    Args:
        vcf_file: Path of the VCF file to read.
        bed_positions: Container of ``(chrom, pos)`` tuples to test against.

    Returns:
        The kept lines, in file order, each still carrying its line ending.
    """
    kept_lines = []
    with open(vcf_file, 'r') as handle:
        for raw_line in handle:
            # Header / meta lines always pass through to the output.
            if raw_line.startswith('#'):
                kept_lines.append(raw_line)
                continue

            columns = raw_line.strip().split('\t')
            if len(columns) < 2:
                continue

            try:
                position = int(columns[1])
            except ValueError:
                # 'POS' is not an integer: ignore the record.
                continue

            lookup_key = (normalize_chrom_name(columns[0]), position)
            if lookup_key in bed_positions:
                kept_lines.append(raw_line)
    return kept_lines

def write_filtered_vcf(filtered_vcf_records, output_file):
    """Write the given record strings to *output_file*, in order.

    The records are written verbatim; no separator or line ending is added,
    and an existing file is overwritten.
    """
    with open(output_file, 'w') as sink:
        sink.writelines(filtered_vcf_records)

def main():
    """Filter one annotated indel VCF by the covered-regions BED and save it.

    The three paths are fixed: the BED file, the input VCF, and the output
    VCF that receives the header lines plus the records at covered positions.
    """
    covered_regions_bed = r'C:/Users/GenepoweRx_Madhu/Desktop/Covered_regions.bed'
    annotated_vcf = r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/Madhu/KHCDPRGPTTL13/KHCDPRGPTTL13_annotated_indel.vcf'
    covered_vcf_out = r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHCDPRGPTTL13_annotated_indel.vcf'

    covered_positions = read_bed_file(covered_regions_bed)
    write_filtered_vcf(
        filter_vcf_file(annotated_vcf, covered_positions),
        covered_vcf_out,
    )


if __name__ == "__main__":
    main()


####################### Loading the Data #################################################################################

# Load the filtered VCF as a headerless, tab-separated table and name its ten columns.
data = pd.read_csv(
    r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHCDPRGPTTL13_annotated_indel.vcf',
    sep='\t',
    comment='#',
    header=None,
    low_memory=False,
)
data.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']

# Break the colon-separated SAMPLE string into one column per value.
sample_cols = data['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']

# Attach the per-sample columns, then discard the raw FORMAT/SAMPLE strings
# and the RBQ/ABQ values, which are not carried forward.
data = data.join(sample_cols)
data = data.drop(columns=['FORMAT', 'SAMPLE', 'RBQ', 'ABQ'])

######################################## Extracting the Total columns ###################################################

# INFO keys that each get a column of their own.
columns = ['ADP', 'WT', 'HET', 'HOM', 'NC', 'CDA', 'OTH', 'S3D', 'WTD', 'dbSNPBuildID', 'SLO',
           'NSF', 'R3', 'R5', 'NSN', 'NSM', 'G5A', 'COMMON', 'RS', 'RV', 'TPA', 'CFL', 'GNO',
           'VLD', 'ASP', 'ASS', 'Ref', 'U3', 'U5', 'TOPMED', 'WGT', 'MTP', 'LSD', 'NOC',
           'DSS', 'SYN', 'KGPhase3', 'CAF', 'VC', 'MUT', 'KGPhase1', 'NOV', 'VP', 'SAO',
           'GENEINFO', 'INT', 'G5', 'OM', 'PMC', 'SSR', 'RSPOS', 'HD', 'PM', 'ClinVar',
           'ClinVar_CLNSIG']


def _parse_info(info, wanted):
    """Return ``{key: text}`` for the keys of *wanted* found in one INFO string.

    ``KEY=value`` entries are stored as ``'KEY=value'`` and bare flags as
    ``'KEY'``; keys that do not occur map to ``''``. Entries whose key is not
    in *wanted* are ignored, so they can neither create stray columns nor
    overwrite an existing column that happens to share the key's name.
    """
    parsed = dict.fromkeys(wanted, '')
    if not isinstance(info, str):  # missing INFO: leave every column empty
        return parsed
    for item in info.split(';'):
        # Split on the first '=' only, so values containing '=' stay intact.
        key, separator, value = item.partition('=')
        if key in parsed:
            parsed[key] = f"{key}={value}" if separator else key
    return parsed


# Parse every INFO string once and attach the resulting columns in a single step.
info_table = pd.DataFrame(
    [_parse_info(info, columns) for info in data['INFO']],
    index=data.index,
    columns=columns,
)
data = pd.concat([data, info_table], axis=1)


            
# Pull the GENEINFO value out of INFO. The value runs up to the next ';' or
# to the end of the string, so it is also found when GENEINFO is the last entry.
data['Gene_Name'] = data['INFO'].str.extract(r'GENEINFO=([^;]+)', expand=False)


def _gene_symbols(gene_info):
    """Return the text before ':' of each '|'-separated segment, comma-joined.

    Missing values give an empty string.
    """
    if pd.isnull(gene_info):
        return ''
    return ','.join(segment.split(':')[0] for segment in gene_info.split('|'))


data['Gene Name'] = data['Gene_Name'].apply(_gene_symbols)


# Keep the reporting columns in their final order (drops the helper columns).
data = data[['CHROM', 'POS', 'Gene Name', 'rsID', 'REF', 'ALT', 'GT',
       'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF',
       'ADR', 'ADP', 'WT', 'HET', 'HOM', 'NC', 'CDA', 'OTH', 'S3D', 'WTD',
       'dbSNPBuildID', 'SLO', 'NSF', 'R3', 'R5', 'NSN', 'NSM', 'G5A', 'COMMON',
       'RS', 'RV', 'TPA', 'CFL', 'GNO', 'VLD', 'ASP', 'ASS', 'Ref', 'U3', 'U5',
       'TOPMED', 'WGT', 'MTP', 'LSD', 'NOC', 'DSS', 'SYN', 'KGPhase3', 'CAF',
       'VC', 'MUT', 'KGPhase1', 'NOV', 'VP', 'SAO', 'INT', 'G5',
       'OM', 'PMC', 'SSR', 'RSPOS', 'HD', 'PM', 'ClinVar', 'ClinVar_CLNSIG']]

# A record can list several ';'-separated IDs; give each ID its own row in 'rsid'.
data['rsid'] = data['rsID'].str.split(';')
data = data.explode('rsid')


# Clinical-significance sheet for the same sample; two of its columns are renamed.
df = pd.read_excel(
    r'C:/Users/GenepoweRx_Madhu/Downloads/KHCDPRGPTTL10_11_12_13_clnsig_files/KHCDPRGPTTL13/KHCDPRGPTTL13_indel_clinical_significance.xlsx'
).rename(columns={'gene_name': 'Gene name', 'zygocity': 'zygosity'})

# Outer join on 'rsid' keeps rows that exist on only one side.
merged = data.merge(df, how='outer', on='rsid', sort=False)

# Where the VCF side has no 'Gene Name' value, take the sheet's 'Gene name'.
vcf_gene_name = merged['Gene Name']
merged['Gene Name'] = vcf_gene_name.where(vcf_gene_name.notna(), merged['Gene name'])

cond_genes = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/cardiac genes.xlsx')

# Build the lookup once so each row costs a set lookup per gene instead of a
# scan of the whole gene list plus a full-column comparison.
condition_gene_set = set(cond_genes['Gene Name'].dropna())


def _gene_match_flag(genes):
    """Return 'Yes' if any comma-separated gene in *genes* is listed in cond_genes.

    Non-string values (e.g. missing gene names) give 'No'.
    """
    if not isinstance(genes, str):
        return 'No'
    if any(gene in condition_gene_set for gene in genes.split(',')):
        return 'Yes'
    return 'No'


merged['Gene_Match'] = [_gene_match_flag(genes) for genes in merged['Gene Name']]
            
            
# Normalise the raw consequence text in one pass: '&' -> ',', '_' and '-' ->
# space, apostrophes removed. Then keep only the part before the first comma.
_consequence_map = str.maketrans({'&': ',', '_': ' ', '-': ' ', "'": None})
consequence_text = merged['consequence'].astype(str)
merged['Consequence'] = consequence_text.apply(lambda text: text.translate(_consequence_map))
merged['Consequence'] = merged['Consequence'].str.split(',').str[0]


# Lookup sheet keyed by consequence; its key column is renamed to match.
df_1 = pd.read_excel(
    r'C:/Users/GenepoweRx_Madhu/Downloads/Madhu_folder_04_07_2023/kidney_health_final.vcf/consequence.xlsx'
).rename(columns={'consequence': 'Consequence'})


# Left join: every row of 'merged' is kept, matched on the normalised text.
merged_1 = merged.merge(df_1, how='left', on='Consequence', sort=False)


# Final report layout: column names in output order.
final_columns = [
    'CHROM', 'POS', 'Gene Name', 'Gene_Match', 'rsID', 'REF', 'ALT', 'GT', 'GQ', 'SDP',
    'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RDF', 'RDR', 'ADF', 'ADR', 'ADP',
    'WT', 'HET', 'HOM', 'NC', 'CDA', 'OTH', 'S3D', 'WTD', 'dbSNPBuildID',
    'SLO', 'NSF', 'R3', 'R5', 'NSN', 'NSM', 'G5A', 'COMMON', 'RS', 'RV',
    'TPA', 'CFL', 'GNO', 'VLD', 'ASP', 'ASS', 'Ref', 'U3', 'U5', 'TOPMED',
    'WGT', 'MTP', 'LSD', 'NOC', 'DSS', 'SYN', 'KGPhase3', 'CAF', 'VC',
    'MUT', 'KGPhase1', 'NOV', 'VP', 'SAO', 'INT', 'G5', 'OM', 'PMC', 'SSR',
    'RSPOS', 'HD', 'PM', 'ClinVar', 'ClinVar_CLNSIG', 'rsid', 'allele',
    'zygosity', 'consequence', 'Consequence_score', 'clinical_significance',
    'associated_diseases', 'review_status', 'origin', 'variant_type',
    'variant_subtype', 'Phargkb_ann_exists', 'is_mutation',
    'Variant_is_precious',
]

# Restrict to the report columns and show missing values as the text 'NA'.
merged_1 = merged_1.loc[:, final_columns].fillna('NA')
merged_1

#merged_1.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/InDel_files/KHAIGHGPTTL207_InDel_final.xlsx', index=False)

Unnamed: 0,CHROM,POS,Gene Name,Gene_Match,rsID,REF,ALT,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR,ADP,WT,HET,HOM,NC,CDA,OTH,S3D,WTD,dbSNPBuildID,SLO,NSF,R3,R5,NSN,NSM,G5A,COMMON,RS,RV,TPA,CFL,GNO,VLD,ASP,ASS,Ref,U3,U5,TOPMED,WGT,MTP,LSD,NOC,DSS,SYN,KGPhase3,CAF,VC,MUT,KGPhase1,NOV,VP,SAO,INT,G5,OM,PMC,SSR,RSPOS,HD,PM,ClinVar,ClinVar_CLNSIG,rsid,allele,zygosity,consequence,Consequence_score,clinical_significance,associated_diseases,review_status,origin,variant_type,variant_subtype,Phargkb_ann_exists,is_mutation,Variant_is_precious
0,chr1,1353987.0,MXRA8,No,rs140777846,CTG,C,1/1,255,81,81,1,80,98.77%,2.241E-46,1,0,62,18,ADP=81,WT=0,HET=0,HOM=1,NC=0,,,,,dbSNPBuildID=134,SLO,,,,,,G5A,COMMON=1,RS=140777846,,,,GNO,VLD,ASP,,,,,"TOPMED=0.13763857033639143,0.86236142966360856",WGT=1,,,,,,KGPhase3,"CAF=0.1793,0.8207",VC=DIV,,,,VP=0x050100080005170126000200,SAO=0,INT,G5,,,SSR=0,RSPOS=1353988,,,,,rs140777846,,,,,,,,,,,,,
1,chr1,1355779.0,MXRA8,No,rs201260508,GA,G,0/1,59,19,19,5,14,73.68%,1.2025E-6,5,0,10,4,ADP=19,WT=0,HET=1,HOM=0,NC=0,,,,,dbSNPBuildID=137,,,,,,,,COMMON=0,RS=201260508,,,,,,ASP,,,,,"TOPMED=0.22555109582059123,0.77444890417940876",WGT=1,,,,,,KGPhase3,"CAF=0,1",VC=DIV,,KGPhase1,,VP=0x05000008000500003e000200,SAO=0,INT,,,,SSR=0,RSPOS=1355780,,,,,rs201260508,,,,,,,,,,,,,
2,chr1,1402505.0,,No,.,CCCGGGGAA,C,0/1,33,30,30,20,10,33.33%,3.9851E-4,19,1,9,1,ADP=30,WT=0,HET=1,HOM=0,NC=0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,.,,,,,,,,,,,,,
3,chr1,45003826.0,,No,.,G,GC,0/1,25,36,36,28,8,22.22%,2.5282E-3,21,7,3,5,ADP=36,WT=0,HET=1,HOM=0,NC=0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,.,,,,,,,,,,,,,
4,chr1,147757520.0,,No,.,A,AAGGTG,0/1,23,25,24,18,7,28%,4.8126E-3,18,0,5,2,ADP=24,WT=0,HET=1,HOM=0,NC=0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,.,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5406,,,STIL,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs75930208,"['TAA', T]",Heterozygous,3_prime_UTR_variant,3/10,Benign,"Primary_Microcephaly', '_Recessive","criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No
5407,,,GNPTAB,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs76300806,"['AGCC', A]",Heterozygous,5_prime_UTR_variant,3/10,Benign,Mucolipidosis_type_II|Pseudo-Hurler_polydystro...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes
5408,,,LHFPL5,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs796097780,"['AT', A]",Heterozygous,3_prime_UTR_variant,3/10,Likely_benign,"Nonsyndromic_Hearing_Loss', '_Recessive","criteria_provided', '_single_submitter",germline,indel,Deletion,MayBe No,No,No
5409,,,ACADM,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,rs796117827,"['GT', G]",Homozygous,intron_variant,2/10,Benign,Medium-chain_acyl-coenzyme_A_dehydrogenase_def...,"criteria_provided', '_multiple_submitters', '_...",germline,indel,Deletion,MayBe No,No,Yes
