In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [25]:
father = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Covered_father_new.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
father.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
father['DP_father'] = father['SAMPLE'].str.split(':').str[3].fillna('0').replace('', 0).astype(int)
father['RD_father'] = father['SAMPLE'].str.split(':').str[4].fillna('0').replace('', 0).astype(int)
father['AD_father'] = father['SAMPLE'].str.split(':').str[5].fillna('0').replace('', 0).astype(int)
father['VAF_father'] = father['AD_father'] / (father['RD_father'] + father['AD_father']).replace('', 0).astype('float')
father['CSQ'] = father['INFO'].str.extract(r'CSQ=(.*)')
father['gnomADe_AF'] = father['CSQ'].str.split('|').str[48].replace('', 0).astype('float')
father['gnomADe_SAS_AF'] = father['CSQ'].str.split('|').str[56].replace('', 0).astype('float')
father["Gene_Name"] = father["INFO"].str.extract('GENEINFO=(?P<GENEINFO>.+?);')

father['HET'] = father['INFO'].str.extract(r'HET=(\d)')
father['HOM'] = father['INFO'].str.extract(r'HOM=(\d)')
# Create a new column 'Zygosity' based on conditions
father['Zygosity_father'] = ''
father.loc[father['HOM'] == '1', 'Zygosity_father'] = 'Homozygous'
father.loc[father['HET'] == '1', 'Zygosity_father'] = 'Heterozygous'

father['Gene_Name'] = father['Gene_Name'].apply(lambda x: ','.join(set([segment.split(':')[0] for segment in x.split('|')])) if pd.notnull(x) else '')
father = father[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'Gene_Name', 'Zygosity_father', 'DP_father', 'RD_father', 'AD_father', 'VAF_father', 'gnomADe_AF', 'gnomADe_SAS_AF']]
father

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene_Name,Zygosity_father,DP_father,RD_father,AD_father,VAF_father,gnomADe_AF,gnomADe_SAS_AF
0,chr1,69270,rs201219564,A,G,OR4F5,Heterozygous,140,88,52,0.371429,0.8380,0.9005
1,chr1,69511,rs2691305,A,G,OR4F5,Homozygous,147,1,146,0.993197,0.9497,0.9854
2,chr1,69897,rs200676709,T,C,OR4F5,Heterozygous,137,99,38,0.277372,0.7209,0.8049
3,chr1,924533,rs112703963,A,G,"SAMD11,LOC107985728",Homozygous,93,0,93,1.000000,0.0000,0.0000
4,chr1,942451,rs6672356,T,C,SAMD11,Homozygous,80,0,80,1.000000,0.9999,0.9997
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31681,chrX,155383098,.,T,C,,Heterozygous,87,59,28,0.321839,0.0000,0.0000
31682,chrX,156010159,rs200413398,A,G,IL9R,Heterozygous,32,11,21,0.656250,0.1634,0.2709
31683,chrX,156010162,rs150178903,A,G,IL9R,Heterozygous,33,13,20,0.606061,0.1568,0.2628
31684,chrY,3019783,rs9786184,A,C,LINC00278,Homozygous,39,0,39,1.000000,0.0000,0.0000


In [26]:
mother = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Covered_mother_new.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
mother.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
mother['DP_mother'] = mother['SAMPLE'].str.split(':').str[3].fillna('0').replace('', 0).astype(int)
mother['RD_mother'] = mother['SAMPLE'].str.split(':').str[4].fillna('0').replace('', 0).astype(int)
mother['AD_mother'] = mother['SAMPLE'].str.split(':').str[5].fillna('0').replace('', 0).astype(int)
mother['VAF_mother'] = mother['AD_mother'] / (mother['RD_mother'] + mother['AD_mother']).replace('', 0).astype('float')

mother['HET'] = mother['INFO'].str.extract(r'HET=(\d)')
mother['HOM'] = mother['INFO'].str.extract(r'HOM=(\d)')
# Create a new column 'Zygosity' based on conditions
mother['Zygosity_mother'] = ''
mother.loc[mother['HOM'] == '1', 'Zygosity_mother'] = 'Homozygous'
mother.loc[mother['HET'] == '1', 'Zygosity_mother'] = 'Heterozygous'

mother['CSQ'] = mother['INFO'].str.extract(r'CSQ=(.*)')
mother['gnomADe_AF'] = mother['CSQ'].str.split('|').str[48].replace('', 0).astype('float')
mother['gnomADe_SAS_AF'] = mother['CSQ'].str.split('|').str[56].replace('', 0).astype('float')
mother = mother[['CHROM', 'POS', 'REF', 'ALT', 'Zygosity_mother', 'DP_mother', 'RD_mother', 'AD_mother', 'VAF_mother']]
mother

Unnamed: 0,CHROM,POS,REF,ALT,Zygosity_mother,DP_mother,RD_mother,AD_mother,VAF_mother
0,chr1,69270,A,G,Heterozygous,44,16,28,0.636364
1,chr1,69511,A,G,Homozygous,133,0,133,1.000000
2,chr1,69761,A,T,Heterozygous,66,48,18,0.272727
3,chr1,69897,T,C,Heterozygous,46,29,17,0.369565
4,chr1,924533,A,G,Homozygous,40,1,39,0.975000
...,...,...,...,...,...,...,...,...,...
29803,chrX,154653251,C,G,Heterozygous,41,26,15,0.365854
29804,chrX,154653499,C,T,Heterozygous,72,49,23,0.319444
29805,chrX,154766321,G,T,Homozygous,44,0,43,1.000000
29806,chrX,154792236,C,T,Heterozygous,47,17,30,0.638298


In [28]:
son = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Covered_son_new.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
son.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
son['DP_son'] = son['SAMPLE'].str.split(':').str[3].fillna('0').replace('', 0).astype(int)
son['RD_son'] = son['SAMPLE'].str.split(':').str[4].fillna('0').replace('', 0).astype(int)
son['AD_son'] = son['SAMPLE'].str.split(':').str[5].fillna('0').replace('', 0).astype(int)
son['VAF_son'] = son['AD_son'] / (son['RD_son'] + son['AD_son']).replace('', 0).astype('float')

son['HET'] = son['INFO'].str.extract(r'HET=(\d)')
son['HOM'] = son['INFO'].str.extract(r'HOM=(\d)')
# Create a new column 'Zygosity' based on conditions
son['Zygosity_son'] = ''
son.loc[son['HOM'] == '1', 'Zygosity_son'] = 'Homozygous'
son.loc[son['HET'] == '1', 'Zygosity_son'] = 'Heterozygous'

son['CSQ'] = son['INFO'].str.extract(r'CSQ=(.*)')
son['gnomADe_AF'] = son['CSQ'].str.split('|').str[48].replace('', 0).astype('float')
son['gnomADe_SAS_AF'] = son['CSQ'].str.split('|').str[56].replace('', 0).astype('float')
son = son[['CHROM', 'POS', 'REF', 'ALT', 'Zygosity_son', 'DP_son', 'RD_son', 'AD_son', 'VAF_son']]
son

Unnamed: 0,CHROM,POS,REF,ALT,Zygosity_son,DP_son,RD_son,AD_son,VAF_son
0,chr1,69270,A,G,Heterozygous,104,30,74,0.711538
1,chr1,69511,A,G,Homozygous,205,0,205,1.000000
2,chr1,69897,T,C,Heterozygous,93,28,65,0.698925
3,chr1,924533,A,G,Homozygous,81,0,80,1.000000
4,chr1,942451,T,C,Homozygous,98,0,98,1.000000
...,...,...,...,...,...,...,...,...,...
32709,chrX,155277884,G,T,Homozygous,28,0,28,1.000000
32710,chrX,156003433,T,C,Heterozygous,9,3,6,0.666667
32711,chrY,3019783,A,C,Homozygous,42,0,42,1.000000
32712,chrY,12914512,C,A,Homozygous,18,0,18,1.000000


In [30]:
intersection = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/latest_trio/intersected_variants.xlsx')
intersection = intersection[['CHROM', 'POS', 'REF', 'ALT']]
intersection

Unnamed: 0,CHROM,POS,REF,ALT
0,chr1,69270,A,G
1,chr1,69511,A,G
2,chr1,69897,T,C
3,chr1,924533,A,G
4,chr1,942451,T,C
...,...,...,...,...
15198,chrX,154485448,T,C
15199,chrX,154563953,C,T
15200,chrX,154652556,C,A
15201,chrX,154653499,C,T


In [31]:
intersection_father = pd.merge(intersection, father, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'left', sort = False)
intersection_father

Unnamed: 0,CHROM,POS,REF,ALT,rsID,Gene_Name,Zygosity_father,DP_father,RD_father,AD_father,VAF_father,gnomADe_AF,gnomADe_SAS_AF
0,chr1,69270,A,G,rs201219564,OR4F5,Heterozygous,140,88,52,0.371429,0.8380,0.9005
1,chr1,69511,A,G,rs2691305,OR4F5,Homozygous,147,1,146,0.993197,0.9497,0.9854
2,chr1,69897,T,C,rs200676709,OR4F5,Heterozygous,137,99,38,0.277372,0.7209,0.8049
3,chr1,924533,A,G,rs112703963,"SAMD11,LOC107985728",Homozygous,93,0,93,1.000000,0.0000,0.0000
4,chr1,942451,T,C,rs6672356,SAMD11,Homozygous,80,0,80,1.000000,0.9999,0.9997
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15198,chrX,154485448,T,C,rs7057286,UBL4A,Homozygous,49,0,49,1.000000,0.0000,0.0000
15199,chrX,154563953,C,T,rs201709278;rs5945206,IKBKG,Heterozygous,59,15,44,0.745763,0.0000,0.0000
15200,chrX,154652556,C,A,rs4326559,"CTAG2,LOC105373387",Homozygous,40,1,39,0.975000,0.0000,0.0000
15201,chrX,154653499,C,T,rs17855367,"CTAG2,LOC105373387",Heterozygous,59,28,31,0.525424,0.0000,0.0000


In [32]:
intersection_mother = pd.merge(intersection_father, mother, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'left', sort = False)
intersection_mother

Unnamed: 0,CHROM,POS,REF,ALT,rsID,Gene_Name,Zygosity_father,DP_father,RD_father,AD_father,VAF_father,gnomADe_AF,gnomADe_SAS_AF,Zygosity_mother,DP_mother,RD_mother,AD_mother,VAF_mother
0,chr1,69270,A,G,rs201219564,OR4F5,Heterozygous,140,88,52,0.371429,0.8380,0.9005,Heterozygous,44,16,28,0.636364
1,chr1,69511,A,G,rs2691305,OR4F5,Homozygous,147,1,146,0.993197,0.9497,0.9854,Homozygous,133,0,133,1.000000
2,chr1,69897,T,C,rs200676709,OR4F5,Heterozygous,137,99,38,0.277372,0.7209,0.8049,Heterozygous,46,29,17,0.369565
3,chr1,924533,A,G,rs112703963,"SAMD11,LOC107985728",Homozygous,93,0,93,1.000000,0.0000,0.0000,Homozygous,40,1,39,0.975000
4,chr1,942451,T,C,rs6672356,SAMD11,Homozygous,80,0,80,1.000000,0.9999,0.9997,Homozygous,38,0,38,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15198,chrX,154485448,T,C,rs7057286,UBL4A,Homozygous,49,0,49,1.000000,0.0000,0.0000,Homozygous,75,0,75,1.000000
15199,chrX,154563953,C,T,rs201709278;rs5945206,IKBKG,Heterozygous,59,15,44,0.745763,0.0000,0.0000,Homozygous,35,0,35,1.000000
15200,chrX,154652556,C,A,rs4326559,"CTAG2,LOC105373387",Homozygous,40,1,39,0.975000,0.0000,0.0000,Heterozygous,39,23,16,0.410256
15201,chrX,154653499,C,T,rs17855367,"CTAG2,LOC105373387",Heterozygous,59,28,31,0.525424,0.0000,0.0000,Heterozygous,72,49,23,0.319444


In [33]:
intersection_all = pd.merge(intersection_mother, son, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'left', sort = False)
intersection_all

Unnamed: 0,CHROM,POS,REF,ALT,rsID,Gene_Name,Zygosity_father,DP_father,RD_father,AD_father,VAF_father,gnomADe_AF,gnomADe_SAS_AF,Zygosity_mother,DP_mother,RD_mother,AD_mother,VAF_mother,Zygosity_son,DP_son,RD_son,AD_son,VAF_son
0,chr1,69270,A,G,rs201219564,OR4F5,Heterozygous,140,88,52,0.371429,0.8380,0.9005,Heterozygous,44,16,28,0.636364,Heterozygous,104,30,74,0.711538
1,chr1,69511,A,G,rs2691305,OR4F5,Homozygous,147,1,146,0.993197,0.9497,0.9854,Homozygous,133,0,133,1.000000,Homozygous,205,0,205,1.000000
2,chr1,69897,T,C,rs200676709,OR4F5,Heterozygous,137,99,38,0.277372,0.7209,0.8049,Heterozygous,46,29,17,0.369565,Heterozygous,93,28,65,0.698925
3,chr1,924533,A,G,rs112703963,"SAMD11,LOC107985728",Homozygous,93,0,93,1.000000,0.0000,0.0000,Homozygous,40,1,39,0.975000,Homozygous,81,0,80,1.000000
4,chr1,942451,T,C,rs6672356,SAMD11,Homozygous,80,0,80,1.000000,0.9999,0.9997,Homozygous,38,0,38,1.000000,Homozygous,98,0,98,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15198,chrX,154485448,T,C,rs7057286,UBL4A,Homozygous,49,0,49,1.000000,0.0000,0.0000,Homozygous,75,0,75,1.000000,Homozygous,67,0,67,1.000000
15199,chrX,154563953,C,T,rs201709278;rs5945206,IKBKG,Heterozygous,59,15,44,0.745763,0.0000,0.0000,Homozygous,35,0,35,1.000000,Homozygous,33,0,33,1.000000
15200,chrX,154652556,C,A,rs4326559,"CTAG2,LOC105373387",Homozygous,40,1,39,0.975000,0.0000,0.0000,Heterozygous,39,23,16,0.410256,Homozygous,43,0,43,1.000000
15201,chrX,154653499,C,T,rs17855367,"CTAG2,LOC105373387",Heterozygous,59,28,31,0.525424,0.0000,0.0000,Heterozygous,72,49,23,0.319444,Heterozygous,86,41,45,0.523256


In [34]:
intersection_all.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Intersection_FMS_DP_VAF_Zygosity.xlsx', index=False)

In [16]:
father = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/latest_trio/Father_data_columns.xlsx')
father['CSQ'] = father['INFO'].str.extract(r'CSQ=(.*)')
father['gnomADe_AF'] = father['CSQ'].str.split('|').str[48].replace('', 0).astype('float')
father['gnomADe_SAS_AF'] = father['CSQ'].str.split('|').str[56].replace('', 0).astype('float')
father.head()

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR,DP,gnomADg_AF,gnomADg_SAS_AF,CSQ,gnomADe_AF,gnomADe_SAS_AF
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=140;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,181,140,88,52,37.14%,6.3743e-19,59,49,77,11,43,9,140,0.6291,0.8855,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,0.838,0.9005
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=147;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,147,1,146,99.32%,1.0000999999999999e-85,37,55,1,0,115,31,147,0.846,0.9772,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,0.9497,0.9854
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=137;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0...",0/1,127,137,99,38,27.74%,1.8385e-13,56,50,63,36,27,11,137,0.4864,0.622,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,0.7209,0.8049
3,chr1,924533,rs112703963,A,G,.,PASS,"ADP=93;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",1/1,255,93,0,93,100%,1.7451e-55,0,54,0,0,70,23,93,0.8215,0.9098,G|upstream_gene_variant|MODIFIER|SAMD11|ENSG00...,0.0,0.0
4,chr1,942451,rs6672356,T,C,.,PASS,"ADP=80;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",1/1,255,80,0,80,100%,1.0864e-47,0,51,0,0,54,26,80,0.9998,0.9996,C|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,0.9999,0.9997


In [17]:
mother = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/latest_trio/Mother_data_columns.xlsx')
mother['CSQ'] = mother['INFO'].str.extract(r'CSQ=(.*)')
mother['gnomADe_AF'] = mother['CSQ'].str.split('|').str[48].replace('', 0).astype('float')
mother['gnomADe_SAS_AF'] = mother['CSQ'].str.split('|').str[56].replace('', 0).astype('float')
mother.head()

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR,DP,gnomADg_AF,gnomADg_SAS_AF,CSQ,gnomADe_AF,gnomADe_SAS_AF
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=44;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEIN...,0/1,112,44,16,28,63.64%,5.6997e-12,66,47,15,1,22,6,44,0.6291,0.8855,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,0.838,0.9005
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=133;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,133,0,133,100%,1.7256e-79,0,55,0,0,108,25,133,0.846,0.9772,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,0.9497,0.9854
2,chr1,69761,rs200505207,A,T,.,PASS,"ADP=66;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9633,0....",0/1,60,66,48,18,27.27%,9.974e-07,57,61,35,13,14,4,66,0.06088,0.04502,T|missense_variant|MODERATE|OR4F5|ENSG00000186...,0.09168,0.07638
3,chr1,69897,rs200676709,T,C,.,PASS,"ADP=46;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,59,46,29,17,36.96%,1.2413e-06,62,59,22,7,15,2,46,0.4864,0.622,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,0.7209,0.8049
4,chr1,924533,rs112703963,A,G,.,PASS,"ADP=40;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",1/1,214,40,1,39,97.5%,3.8137e-22,62,56,1,0,30,9,40,0.8215,0.9098,G|upstream_gene_variant|MODIFIER|SAMD11|ENSG00...,0.0,0.0


In [18]:
son = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/latest_trio/Son_data_columns.xlsx')
son['CSQ'] = son['INFO'].str.extract(r'CSQ=(.*)')
son['gnomADe_AF'] = son['CSQ'].str.split('|').str[48].replace('', 0).astype('float')
son['gnomADe_SAS_AF'] = son['CSQ'].str.split('|').str[56].replace('', 0).astype('float')
son.head()

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR,DP,gnomADg_AF,gnomADg_SAS_AF,CSQ,gnomADe_AF,gnomADe_SAS_AF
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=104;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,255,104,30,74,71.15%,3.2093000000000003e-32,58,49,21,9,59,15,104,0.6291,0.8855,G|synonymous_variant|LOW|OR4F5|ENSG00000186092...,0.838,0.9005
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=205;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,205,0,205,100%,9.6033e-123,0,54,0,0,145,60,205,0.846,0.9772,G|missense_variant|MODERATE|OR4F5|ENSG00000186...,0.9497,0.9854
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=93;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",0/1,255,93,28,65,69.89%,4.0051e-28,53,51,15,13,49,16,93,0.4864,0.622,C|synonymous_variant|LOW|OR4F5|ENSG00000186092...,0.7209,0.8049
3,chr1,924533,rs112703963,A,G,.,PASS,"ADP=81;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",1/1,255,81,0,80,98.77%,1.0864e-47,0,52,0,0,48,32,81,0.8215,0.9098,G|upstream_gene_variant|MODIFIER|SAMD11|ENSG00...,0.0,0.0
4,chr1,942451,rs6672356,T,C,.,PASS,"ADP=98;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",1/1,255,98,0,98,100%,1.7493e-58,0,48,0,0,66,32,98,0.9998,0.9996,C|downstream_gene_variant|MODIFIER|NOC2L|ENSG0...,0.9999,0.9997


In [19]:
intersection = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/latest_trio/intersected_variants.xlsx')
intersection.head()

Unnamed: 0,CHROM,POS,REF,ALT,rsID
0,chr1,69270,A,G,rs201219564
1,chr1,69511,A,G,rs2691305
2,chr1,69897,T,C,rs200676709
3,chr1,924533,A,G,rs112703963
4,chr1,942451,T,C,rs6672356


In [20]:
father_dummy = father.copy()
father_dummy = father_dummy[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'INFO']]
father_dummy

Unnamed: 0,CHROM,POS,rsID,REF,ALT,INFO
0,chr1,69270,rs201219564,A,G,ADP=140;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...
1,chr1,69511,rs2691305,A,G,ADP=147;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...
2,chr1,69897,rs200676709,T,C,"ADP=137;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0..."
3,chr1,924533,rs112703963,A,G,"ADP=93;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0...."
4,chr1,942451,rs6672356,T,C,"ADP=80;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO..."
...,...,...,...,...,...,...
31681,chrX,155383098,.,T,C,ADP=87;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...
31682,chrX,156010159,rs200413398,A,G,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.858,0.1..."
31683,chrX,156010162,rs150178903,A,G,"ADP=33;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8956,0...."
31684,chrY,3019783,rs9786184,A,C,"ADP=39;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1744,0...."


In [21]:
father_dummy["Gene_Name"] = father_dummy["INFO"].str.extract('GENEINFO=(?P<GENEINFO>.+?);')
father_dummy['Gene_Name'] = father_dummy['Gene_Name'].apply(lambda x: ','.join(set([segment.split(':')[0] for segment in x.split('|')])) if pd.notnull(x) else '')
father_dummy = father_dummy[['CHROM', 'POS', 'REF', 'ALT', 'Gene_Name']]
father_dummy

Unnamed: 0,CHROM,POS,REF,ALT,Gene_Name
0,chr1,69270,A,G,OR4F5
1,chr1,69511,A,G,OR4F5
2,chr1,69897,T,C,OR4F5
3,chr1,924533,A,G,"LOC107985728,SAMD11"
4,chr1,942451,T,C,SAMD11
...,...,...,...,...,...
31681,chrX,155383098,T,C,
31682,chrX,156010159,A,G,IL9R
31683,chrX,156010162,A,G,IL9R
31684,chrY,3019783,A,C,LINC00278


In [22]:
intersection_data = pd.merge(intersection, father_dummy, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'left', sort = False)
intersection_data

Unnamed: 0,CHROM,POS,REF,ALT,rsID,Gene_Name
0,chr1,69270,A,G,rs201219564,OR4F5
1,chr1,69511,A,G,rs2691305,OR4F5
2,chr1,69897,T,C,rs200676709,OR4F5
3,chr1,924533,A,G,rs112703963,"LOC107985728,SAMD11"
4,chr1,942451,T,C,rs6672356,SAMD11
...,...,...,...,...,...,...
15198,chrX,154485448,T,C,rs7057286,UBL4A
15199,chrX,154563953,C,T,rs201709278;rs5945206,IKBKG
15200,chrX,154652556,C,A,rs4326559,"LOC105373387,CTAG2"
15201,chrX,154653499,C,T,rs17855367,"LOC105373387,CTAG2"


In [25]:
father_mapped = pd.merge(intersection_data, son, on = ['CHROM', 'POS', 'REF', 'ALT'], how = 'left', sort=False)
father_mapped = father_mapped[['CHROM', 'POS', 'REF', 'ALT', 'rsID_x', 'Gene_Name', 'DP', 'gnomADg_AF', 'gnomADg_SAS_AF', 'gnomADe_AF', 'gnomADe_SAS_AF']]
father_mapped.rename(columns={'rsID_x': 'rsID'}, inplace=True)
father_mapped.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/latest_trio/Son_intersection_mapped_new.xlsx', index=False)
father_mapped

Unnamed: 0,CHROM,POS,REF,ALT,rsID,Gene_Name,DP,gnomADg_AF,gnomADg_SAS_AF,gnomADe_AF,gnomADe_SAS_AF
0,chr1,69270,A,G,rs201219564,OR4F5,104,0.6291,0.8855,0.8380,0.9005
1,chr1,69511,A,G,rs2691305,OR4F5,205,0.8460,0.9772,0.9497,0.9854
2,chr1,69897,T,C,rs200676709,OR4F5,93,0.4864,0.6220,0.7209,0.8049
3,chr1,924533,A,G,rs112703963,"LOC107985728,SAMD11",81,0.8215,0.9098,0.0000,0.0000
4,chr1,942451,T,C,rs6672356,SAMD11,98,0.9998,0.9996,0.9999,0.9997
...,...,...,...,...,...,...,...,...,...,...,...
15198,chrX,154485448,T,C,rs7057286,UBL4A,67,0.6669,0.5396,0.0000,0.0000
15199,chrX,154563953,C,T,rs201709278;rs5945206,IKBKG,33,0.0000,0.0000,0.0000,0.0000
15200,chrX,154652556,C,A,rs4326559,"LOC105373387,CTAG2",43,0.4733,0.4243,0.0000,0.0000
15201,chrX,154653499,C,T,rs17855367,"LOC105373387,CTAG2",86,0.4428,0.3835,0.0000,0.0000


In [33]:
genes = father_mapped.copy()

In [34]:
genes['Gene_Name'] = genes['Gene_Name'].str.split(',')
genes = genes.explode('Gene_Name')

# Drop rows with null values
genes.dropna(subset=['Gene_Name'], inplace=True)

# Extract unique gene names
unique_genes = pd.DataFrame(genes['Gene_Name'].unique())
unique_genes

Unnamed: 0,0
0,OR4F5
1,SAMD11
2,LOC107985728
3,NOC2L
4,KLHL17
...,...
8269,UBL4A
8270,IKBKG
8271,LOC105373387
8272,CTAG2


In [35]:
unique_genes.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/latest_trio/unique_genes_new.xlsx', index=False)

In [4]:
def read_bed_file(bed_file):
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 3:
                chrom = fields[0]
                try:
                    start = int(fields[1])
                    end = int(fields[2])
                except ValueError:
                    continue  # Skip this line if start or end position is not an integer
                for pos in range(start, end + 1):
                    bed_positions.add((chrom, pos))
    return bed_positions

def normalize_chrom_name(chrom):
    return chrom.split('_')[0]

def filter_vcf_file(vcf_file, bed_positions):
    filtered_vcf_records = []
    with open(vcf_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Preserve header lines in the output
                filtered_vcf_records.append(line)
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 2:
                raw_chrom = fields[0]
                chrom = normalize_chrom_name(raw_chrom)
                try:
                    pos = int(fields[1])
                except ValueError:
                    continue  # Skip this line if 'POS' is not an integer
                if (chrom, pos) in bed_positions:
                    filtered_vcf_records.append(line)
    return filtered_vcf_records

def write_filtered_vcf(filtered_vcf_records, output_file):
    with open(output_file, 'w') as f:
        for record in filtered_vcf_records:
            f.write(record)

def main():
    bed_file = r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/KAPA_HyperExome_hg38_primary_targets_extended.bed'
    vcf_file = r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/KHAIGPRX6_final_son.vcf'
    output_file = r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHAIGPRX6_final_son_covered.vcf'

    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)
    write_filtered_vcf(filtered_vcf_records, output_file)

if __name__ == "__main__":
    main()

In [9]:
import pandas as pd
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/KAPA_HyperExome_hg38_primary_targets_extended.bed', sep = '\t', header = None, error_bad_lines=False)
df.columns = ['chromosome', 'Start_pos', 'End_pos']
df

Unnamed: 0,chromosome,Start_pos,End_pos
0,chr1,65544,65593
1,chr1,69016,70028
2,chr1,450719,451698
3,chr1,685695,686674
4,chr1,924411,924968
...,...,...,...
242132,chrY,25038788,25038934
242133,chrY,25041748,25041906
242134,chrY,25043925,25044043
242135,chrY,25622422,25624085


In [20]:
# Value to check
value = 57209963

# Filter rows where the value lies between start and end positions
filtered_df = df[(df['Start_pos'] <= value) & (df['End_pos'] >= value)]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,chromosome,Start_pos,End_pos
163897,chr14,57209932,57210081


In [6]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/KHAIGPRX5_final_father.vcf', comment='#', sep='\t', header=None, low_memory=False, encoding='latin-1')
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,12284,.,A,C,.,PASS,ADP=16;WT=0;HET=1;HOM=0;NC=0;CSQ=C|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:29:16:16:8:8:50%:1.2236E-3:49:58:7:1:5:3
1,chr1,14590,rs707679,G,A,.,PASS,ADP=12;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:21:12:12:6:6:50%:6.865E-3:43:37:6:0:6:0
2,chr1,14599,rs707680,T,A,.,PASS,"ADP=15;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8524,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:34:15:15:6:9:60%:3.4983E-4:43:34:6:0:9:0
3,chr1,14604,rs541940975,A,G,.,PASS,"ADP=17;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8524,.,...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:38:17:17:7:10:58.82%:1.4831E-4:51:35:7:0:10:0
4,chr1,14610,rs878986575,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:42:18:18:7:11:61.11%:5.2969E-5:59:37:7:0:11:0
...,...,...,...,...,...,...,...,...,...,...
82471,chrY,57209963,.,G,A,.,PASS,ADP=27;WT=0;HET=1;HOM=0;NC=0;CSQ=A|non_coding_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:26:27:27:19:8:29.63%:2.1337E-3:59:64:15:4:7:1
82472,chrY,57210759,.,C,G,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0;CSQ=G|non_coding_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:39:52:52:40:12:23.08%:1.1939E-4:67:61:36:4...
82473,chrY,57211544,.,C,T,.,PASS,ADP=58;WT=0;HET=1;HOM=0;NC=0;CSQ=T|splice_poly...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:64:58:58:39:19:32.76%:3.2669E-7:59:59:33:6...
82474,chrY,57211636,.,G,A,.,PASS,ADP=63;WT=0;HET=1;HOM=0;NC=0;CSQ=A|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:110:63:63:33:30:47.62%:9.545E-12:53:55:27:...


In [7]:
vcf_1 = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHAIGPRX5_final_father_covered.vcf', comment='#', sep='\t', header=None, low_memory=False, encoding='latin-1')
vcf_1.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
vcf_1

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=140;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:181:140:140:88:52:37.14%:6.3743E-19:59:49:...
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=147;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:147:147:1:146:99.32%:1.0001E-85:37:55:...
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=137;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:127:137:137:99:38:27.74%:1.8385E-13:56:50:...
3,chr1,924533,rs112703963,A,G,.,PASS,"ADP=93;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:93:93:0:93:100%:1.7451E-55:0:54:0:0:70:23
4,chr1,942451,rs6672356,T,C,.,PASS,"ADP=80;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:80:80:0:80:100%:1.0864E-47:0:51:0:0:54:26
...,...,...,...,...,...,...,...,...,...,...
31721,chrX,155383098,.,T,C,.,PASS,ADP=87;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:95:87:87:59:28:32.18%:2.7793E-10:52:61:46:...
31722,chrX,156010159,rs200413398,A,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.858,0.1...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:85:33:32:11:21:65.62%:3.1387E-9:56:52:8:3:...
31723,chrX,156010162,rs150178903,A,G,.,PASS,"ADP=33;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8956,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:78:34:33:13:20:60.61%:1.4096E-8:53:54:9:4:...
31724,chrY,3019783,rs9786184,A,C,.,PASS,"ADP=39;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1744,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:224:39:39:0:39:100%:3.6742E-23:0:63:0:0:33:6


In [10]:
merged_vcf = pd.merge(vcf, vcf_1, on=['CHROM', 'POS', 'REF', 'ALT'], how='inner', indicator=True)
merged_vcf

Unnamed: 0,CHROM,POS,rsID_x,REF,ALT,QUAL_x,FILTER_x,INFO_x,FORMAT_x,SAMPLE_x,rsID_y,QUAL_y,FILTER_y,INFO_y,FORMAT_y,SAMPLE_y,_merge
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=140;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:181:140:140:88:52:37.14%:6.3743E-19:59:49:...,rs201219564,.,PASS,ADP=140;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:181:140:140:88:52:37.14%:6.3743E-19:59:49:...,both
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=147;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:147:147:1:146:99.32%:1.0001E-85:37:55:...,rs2691305,.,PASS,ADP=147;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:147:147:1:146:99.32%:1.0001E-85:37:55:...,both
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=137;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:127:137:137:99:38:27.74%:1.8385E-13:56:50:...,rs200676709,.,PASS,"ADP=137;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:127:137:137:99:38:27.74%:1.8385E-13:56:50:...,both
3,chr1,924533,rs112703963,A,G,.,PASS,"ADP=93;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:93:93:0:93:100%:1.7451E-55:0:54:0:0:70:23,rs112703963,.,PASS,"ADP=93;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:93:93:0:93:100%:1.7451E-55:0:54:0:0:70:23,both
4,chr1,942451,rs6672356,T,C,.,PASS,"ADP=80;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:80:80:0:80:100%:1.0864E-47:0:51:0:0:54:26,rs6672356,.,PASS,"ADP=80;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:80:80:0:80:100%:1.0864E-47:0:51:0:0:54:26,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31721,chrX,155383098,.,T,C,.,PASS,ADP=87;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:95:87:87:59:28:32.18%:2.7793E-10:52:61:46:...,.,.,PASS,ADP=87;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:95:87:87:59:28:32.18%:2.7793E-10:52:61:46:...,both
31722,chrX,156010159,rs200413398,A,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.858,0.1...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:85:33:32:11:21:65.62%:3.1387E-9:56:52:8:3:...,rs200413398,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.858,0.1...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:85:33:32:11:21:65.62%:3.1387E-9:56:52:8:3:...,both
31723,chrX,156010162,rs150178903,A,G,.,PASS,"ADP=33;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8956,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:78:34:33:13:20:60.61%:1.4096E-8:53:54:9:4:...,rs150178903,.,PASS,"ADP=33;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8956,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:78:34:33:13:20:60.61%:1.4096E-8:53:54:9:4:...,both
31724,chrY,3019783,rs9786184,A,C,.,PASS,"ADP=39;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1744,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:224:39:39:0:39:100%:3.6742E-23:0:63:0:0:33:6,rs9786184,.,PASS,"ADP=39;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1744,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:224:39:39:0:39:100%:3.6742E-23:0:63:0:0:33:6,both


In [11]:
# Merge the two DataFrames based on the specified columns ('CHROM', 'POS', 'REF', 'ALT')
merged_vcf = pd.merge(vcf, vcf_1, on=['CHROM', 'POS', 'REF', 'ALT'], how='outer', indicator=True)

# Get the non-common rows from the merged DataFrame
non_common_rows = merged_vcf[merged_vcf['_merge'] != 'both']

# Drop the '_merge' column, which was used for indicating commonality
non_common_rows.drop(columns='_merge', inplace=True)
non_common_rows

Unnamed: 0,CHROM,POS,rsID_x,REF,ALT,QUAL_x,FILTER_x,INFO_x,FORMAT_x,SAMPLE_x,rsID_y,QUAL_y,FILTER_y,INFO_y,FORMAT_y,SAMPLE_y
0,chr1,12284,.,A,C,.,PASS,ADP=16;WT=0;HET=1;HOM=0;NC=0;CSQ=C|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:29:16:16:8:8:50%:1.2236E-3:49:58:7:1:5:3,,,,,,
1,chr1,14590,rs707679,G,A,.,PASS,ADP=12;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:21:12:12:6:6:50%:6.865E-3:43:37:6:0:6:0,,,,,,
2,chr1,14599,rs707680,T,A,.,PASS,"ADP=15;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8524,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:34:15:15:6:9:60%:3.4983E-4:43:34:6:0:9:0,,,,,,
3,chr1,14604,rs541940975,A,G,.,PASS,"ADP=17;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8524,.,...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:38:17:17:7:10:58.82%:1.4831E-4:51:35:7:0:10:0,,,,,,
4,chr1,14610,rs878986575,T,C,.,PASS,ADP=18;WT=0;HET=1;HOM=0;NC=0;ASP;GENEINFO=DDX1...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:42:18:18:7:11:61.11%:5.2969E-5:59:37:7:0:11:0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82471,chrY,57209963,.,G,A,.,PASS,ADP=27;WT=0;HET=1;HOM=0;NC=0;CSQ=A|non_coding_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:26:27:27:19:8:29.63%:2.1337E-3:59:64:15:4:7:1,,,,,,
82472,chrY,57210759,.,C,G,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0;CSQ=G|non_coding_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:39:52:52:40:12:23.08%:1.1939E-4:67:61:36:4...,,,,,,
82473,chrY,57211544,.,C,T,.,PASS,ADP=58;WT=0;HET=1;HOM=0;NC=0;CSQ=T|splice_poly...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:64:58:58:39:19:32.76%:3.2669E-7:59:59:33:6...,,,,,,
82474,chrY,57211636,.,G,A,.,PASS,ADP=63;WT=0;HET=1;HOM=0;NC=0;CSQ=A|intron_vari...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:110:63:63:33:30:47.62%:9.545E-12:53:55:27:...,,,,,,


In [12]:
50750 + 31726 

82476

In [21]:
vcf_3= pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/matching_positions_father.vcf.vcf', comment='#', sep='\t', header=None, low_memory=False, encoding='latin-1')
vcf_3.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
vcf_3

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=140;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:181:140:140:88:52:37.14%:6.3743E-19:59:49:...
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=147;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:147:147:1:146:99.32%:1.0001E-85:37:55:...
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=137;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:127:137:137:99:38:27.74%:1.8385E-13:56:50:...
3,chr1,924533,rs112703963,A,G,.,PASS,"ADP=93;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:93:93:0:93:100%:1.7451E-55:0:54:0:0:70:23
4,chr1,942451,rs6672356,T,C,.,PASS,"ADP=80;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:80:80:0:80:100%:1.0864E-47:0:51:0:0:54:26
...,...,...,...,...,...,...,...,...,...,...
31681,chrX,155383098,.,T,C,.,PASS,ADP=87;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:95:87:87:59:28:32.18%:2.7793E-10:52:61:46:...
31682,chrX,156010159,rs200413398,A,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.858,0.1...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:85:33:32:11:21:65.62%:3.1387E-9:56:52:8:3:...
31683,chrX,156010162,rs150178903,A,G,.,PASS,"ADP=33;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8956,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:78:34:33:13:20:60.61%:1.4096E-8:53:54:9:4:...
31684,chrY,3019783,rs9786184,A,C,.,PASS,"ADP=39;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1744,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:224:39:39:0:39:100%:3.6742E-23:0:63:0:0:33:6


In [22]:
31726 - 31686 

40

In [23]:
# Merge the two DataFrames based on the specified columns ('CHROM', 'POS', 'REF', 'ALT')
merged_vcf = pd.merge(vcf_3, vcf_1, on=['CHROM', 'POS', 'REF', 'ALT'], how='outer', indicator=True)

# Get the non-common rows from the merged DataFrame
non_common_rows = merged_vcf[merged_vcf['_merge'] != 'both']

# Drop the '_merge' column, which was used for indicating commonality
non_common_rows.drop(columns='_merge', inplace=True)
non_common_rows

Unnamed: 0,CHROM,POS,rsID_x,REF,ALT,QUAL_x,FILTER_x,INFO_x,FORMAT_x,SAMPLE_x,rsID_y,QUAL_y,FILTER_y,INFO_y,FORMAT_y,SAMPLE_y
31686,chr16_KI270728v1_random,661635,,G,C,,,,,,.,.,PASS,ADP=15;WT=0;HET=1;HOM=0;NC=0;CSQ=C|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:29:15:15:7:8:53.33%:1.0995E-3:58:57:7:0:8:0
31687,chr16_KI270728v1_random,661744,,A,C,,,,,,.,.,PASS,ADP=17;WT=0;HET=1;HOM=0;NC=0;CSQ=C|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:24:17:17:10:7:41.18%:3.6151E-3:55:51:7:3:3:4
31688,chr16_KI270728v1_random,661775,,T,C,,,,,,.,.,PASS,ADP=13;WT=0;HET=1;HOM=0;NC=0;CSQ=C|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:21:13:13:7:6:46.15%:7.4534E-3:46:55:5:2:3:3
31689,chr16_KI270728v1_random,1436517,,G,C,,,,,,.,.,PASS,ADP=29;WT=0;HET=1;HOM=0;NC=0;CSQ=C|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:55:29:29:14:15:51.72%:2.6068E-6:46:34:14:0...
31690,chr16_KI270728v1_random,1436793,,G,T,,,,,,.,.,PASS,ADP=27;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:67:27:27:10:17:62.96%:1.7891E-7:43:39:3:7:...
31691,chr16_KI270728v1_random,1447573,,A,C,,,,,,.,.,PASS,ADP=46;WT=0;HET=1;HOM=0;NC=0;CSQ=C|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:80:46:46:24:22:47.83%:8.5409E-9:57:52:23:1...
31692,chr16_KI270728v1_random,1447646,,C,T,,,,,,.,.,PASS,ADP=70;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:103:70:70:40:29:41.43%:4.3774E-11:63:60:37...
31693,chr16_KI270728v1_random,1448375,,G,T,,,,,,.,.,PASS,ADP=27;WT=0;HET=1;HOM=0;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:22:27:27:20:7:25.93%:5.0143E-3:46:54:6:14:4:3
31694,chr4_GL000257v2_alt,505927,,C,T,,,,,,.,.,PASS,ADP=35;WT=0;HET=0;HOM=1;NC=0;CSQ=T|intergenic_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:200:35:35:0:35:100%:8.9137E-21:0:62:0:0:30:5
31695,chr7_KI270803v1_alt,290188,,A,C,,,,,,.,.,PASS,ADP=9;WT=0;HET=0;HOM=1;NC=0;CSQ=C|intergenic_v...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:46:9:9:0:9:100%:2.0568E-5:0:50:0:0:7:2


# Covered Analysis

In [2]:
father = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Covered_father_new.vcf', comment='#', sep='\t', header=None, low_memory=False, encoding='latin-1')
father.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
father

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=140;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:181:140:140:88:52:37.14%:6.3743E-19:59:49:...
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=147;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:147:147:1:146:99.32%:1.0001E-85:37:55:...
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=137;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:127:137:137:99:38:27.74%:1.8385E-13:56:50:...
3,chr1,924533,rs112703963,A,G,.,PASS,"ADP=93;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:93:93:0:93:100%:1.7451E-55:0:54:0:0:70:23
4,chr1,942451,rs6672356,T,C,.,PASS,"ADP=80;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:80:80:0:80:100%:1.0864E-47:0:51:0:0:54:26
...,...,...,...,...,...,...,...,...,...,...
31681,chrX,155383098,.,T,C,.,PASS,ADP=87;WT=0;HET=1;HOM=0;NC=0;CSQ=C|downstream_...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:95:87:87:59:28:32.18%:2.7793E-10:52:61:46:...
31682,chrX,156010159,rs200413398,A,G,.,PASS,"ADP=32;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.858,0.1...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:85:33:32:11:21:65.62%:3.1387E-9:56:52:8:3:...
31683,chrX,156010162,rs150178903,A,G,.,PASS,"ADP=33;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.8956,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:78:34:33:13:20:60.61%:1.4096E-8:53:54:9:4:...
31684,chrY,3019783,rs9786184,A,C,.,PASS,"ADP=39;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1744,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:224:39:39:0:39:100%:3.6742E-23:0:63:0:0:33:6


In [4]:
mother = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Covered_mother_new.vcf', comment='#', sep='\t', header=None, low_memory=False, encoding='latin-1')
mother.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
mother

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=44;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEIN...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:112:44:44:16:28:63.64%:5.6997E-12:66:47:15...
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=133;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:133:133:0:133:100%:1.7256E-79:0:55:0:0...
2,chr1,69761,rs200505207,A,T,.,PASS,"ADP=66;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9633,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:60:66:66:48:18:27.27%:9.974E-7:57:61:35:13...
3,chr1,69897,rs200676709,T,C,.,PASS,"ADP=46;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:59:46:46:29:17:36.96%:1.2413E-6:62:59:22:7...
4,chr1,924533,rs112703963,A,G,.,PASS,"ADP=40;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:214:40:40:1:39:97.5%:3.8137E-22:62:56:1:0:...
...,...,...,...,...,...,...,...,...,...,...
29803,chrX,154653251,rs17328091,C,G,.,PASS,"ADP=41;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.4906,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:51:41:41:26:15:36.59%:6.364E-6:44:52:26:0:...
29804,chrX,154653499,rs17855367,C,T,.,PASS,"ADP=72;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5428,.,...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:78:72:72:49:23:31.94%:1.4681E-8:61:46:37:1...
29805,chrX,154766321,rs2728532,G,T,.,PASS,"ADP=44;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.007417,...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:248:44:44:0:43:97.73%:1.5066E-25:0:55:0:0:...
29806,chrX,154792236,rs782318569,C,T,.,PASS,"ADP=47;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9995,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:120:47:47:17:30:63.83%:8.4848E-13:47:48:13...


In [5]:
son = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Covered_son_new.vcf', comment='#', sep='\t', header=None, low_memory=False, encoding='latin-1')
son.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']
son

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SAMPLE
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=104;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:255:104:104:30:74:71.15%:3.2093E-32:58:49:...
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=205;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:205:205:0:205:100%:9.6033E-123:0:54:0:...
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=93;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:255:93:93:28:65:69.89%:4.0051E-28:53:51:15...
3,chr1,924533,rs112703963,A,G,.,PASS,"ADP=81;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.2502,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:81:81:0:80:98.77%:1.0864E-47:0:52:0:0:...
4,chr1,942451,rs6672356,T,C,.,PASS,"ADP=98;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:255:98:98:0:98:100%:1.7493E-58:0:48:0:0:66:32
...,...,...,...,...,...,...,...,...,...,...
32709,chrX,155277884,rs559165,G,T,.,PASS,"ADP=28;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6458,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:158:28:28:0:28:100%:1.3074E-16:0:54:0:0:25:3
32710,chrX,156003433,rs2037999,T,C,.,PASS,"ADP=9;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.4079,0.5...",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,0/1:23:9:9:3:6:66.67%:4.5249E-3:61:67:3:0:6:0
32711,chrY,3019783,rs9786184,A,C,.,PASS,"ADP=42;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.1744,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:242:42:42:0:42:100%:5.9562E-25:0:54:0:0:31:11
32712,chrY,12914512,rs2032624,C,A,.,PASS,"ADP=18;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.7575,0....",GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:A...,1/1:99:18:18:0:18:100%:1.1019E-10:0:56:0:0:15:3


In [37]:
mother_son = pd.merge(mother, son, on = ['CHROM', 'POS', 'rsID', 'REF', 'ALT'], how = 'inner', sort=False)
mother_son = mother_son[['CHROM', 'POS', 'rsID', 'REF', 'ALT']]
mother_son

Unnamed: 0,CHROM,POS,rsID,REF,ALT
0,chr1,69270,rs201219564,A,G
1,chr1,69511,rs2691305,A,G
2,chr1,69897,rs200676709,T,C
3,chr1,924533,rs112703963,A,G
4,chr1,942451,rs6672356,T,C
...,...,...,...,...,...
22170,chrX,154556836,.,C,T
22171,chrX,154563953,rs201709278;rs5945206,C,T
22172,chrX,154652556,rs4326559,C,A
22173,chrX,154653499,rs17855367,C,T


In [38]:
father_son.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Mother_Son_common_variants.xlsx', index = False)

In [41]:
# Merge the two DataFrames on the common column with an indicator
merged = son.merge(mother_son, on=['CHROM', 'POS', 'rsID', 'REF', 'ALT'], how='left', indicator=True)

# Filter rows where the indicator column is set to 'left_only'
non_common_rows_df1 = merged[merged['_merge'] == 'left_only']

# Drop the indicator column if you don't need it
non_common_rows_df1 = non_common_rows_df1.drop(columns=['_merge'])
non_common_rows_df1 = non_common_rows_df1[['CHROM', 'POS', 'rsID', 'REF', 'ALT']]
non_common_rows_df1

Unnamed: 0,CHROM,POS,rsID,REF,ALT
10,chr1,953778,rs13303056,G,C
11,chr1,953779,rs13302945,A,C
29,chr1,1020239,rs201073369,G,C
30,chr1,1022260,rs6657048,C,T
33,chr1,1047342,rs142416636,A,G
...,...,...,...,...,...
32709,chrX,155277884,rs559165,G,T
32710,chrX,156003433,rs2037999,T,C
32711,chrY,3019783,rs9786184,A,C
32712,chrY,12914512,rs2032624,C,A


In [42]:
non_common_rows_df1.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Son_only_variants_(M&S_analysis).xlsx', index=False)

In [11]:
df = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/MODY_new_05_10_2023/All_5samples.xlsx')
df['CHROM'] = 'chr' + df['Chr:Pos'].str.split(':').str[0]
df['POS'] = df['Chr:Pos'].str.split(':').str[1].astype(int)
df['REF'] = df['Ref/Alt'].str.split('/').str[0]
df['ALT'] = df['Ref/Alt'].str.split('/').str[1]
columns_to_check = ['CHROM', 'POS', 'REF', 'ALT']
# Use drop_duplicates to remove duplicated rows based on the specified columns
df = df.drop_duplicates(subset=columns_to_check)
df

Unnamed: 0,Chr:Pos,Ref/Alt,Gene Name,Gene Inheritance,Entrez Gene ID,Transcript Name,Transcript Strand,HGVS cDot,HGVS pDot,Sequence Ontology,Other Transcript Effects,ACMG Classification Criteria,ACMG Classification Criteria Description,Classification,Auto Classification,Previous Classification Count,Previous Classification,Last Classification Date,Max Sub Population Freq Group Name,Max MAF,Max Allele Number,Max Allele Count,CHROM,POS,REF,ALT
0,1:69511,A/G,OR4F5,Default (Recessive),79501,NM_001005484.2,+,NM_001005484.2:c.484A>G,NP_001005484.2:p.Thr162Ala,missense_variant,Single Transcript Gene,"BA1,BP4",OR4F5:Allele frequency is above 0.01 recessive...,Benign,Benign,,,,Annotated gnomAD East Asian,0.999461,16698,16689,chr1,69511,A,G
1,1:817514,T/C,FAM87B,Default (Recessive),400728,NR_103536.1,+,NR_103536.1:n.144T>C,,non_coding_exon_variant,Single Transcript Gene,"BA1,BS2,BP7",FAM87B:Allele frequency is above 0.01 recessiv...,Benign,Benign,,,,Annotated 1kG All,0.753195,5008,3772,chr1,817514,T,C
2,1:826893,G/A,LINC00115,Default (Recessive),79854,NR_024321.1,-,NR_024321.1:n.630C>T,,non_coding_exon_variant,Single Transcript Gene,"BA1,BS2,BP7",LINC00115:Allele frequency is above 0.01 reces...,Benign,Benign,,,,Annotated gnomAD East Asian,0.896992,10106,9065,chr1,826893,G,A
3,1:827209,G/C,LINC00115,Default (Recessive),79854,NR_024321.1,-,NR_024321.1:n.314C>G,,non_coding_exon_variant,Single Transcript Gene,"BA1,BS2,BP7",LINC00115:Allele frequency is above 0.01 reces...,Benign,Benign,,,,Annotated gnomAD East Asian,0.894115,7952,7110,chr1,827209,G,C
4,1:827212,C/G,LINC00115,Default (Recessive),79854,NR_024321.1,-,NR_024321.1:n.311G>C,,non_coding_exon_variant,Single Transcript Gene,"BA1,BS2,BP7",LINC00115:Allele frequency is above 0.01 reces...,Benign,Benign,,,,Annotated gnomAD East Asian,0.894327,7826,6999,chr1,827212,C,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481215,X:154641461,,,,,,,,,,,,,,,,,,,,,,chrX,154641461,,
481219,X:154773313,A/T,DKC1,Recessive,1736,NM_001363.5,+,NM_001363.5:c.1155+64A>T,,intron_variant,Other Transcripts Have Same Effect,"PM2,BP7",DKC1:Variant is missing from all sub populatio...,VUS/Conflicting,VUS/Conflicting,,,,,,,,chrX,154773313,A,T
481223,X:154785208,-/C,MPP1,Default (Recessive),4354,NM_002436.4,-,NM_002436.4:c.678-51_678-50insG,,intron_variant,Other Transcripts Have Same Effect,"BA1,BS2,BP7",MPP1:Allele frequency is above 0.01 recessive ...,Benign,Benign,,,,Annotated 1kG All,0.0492715,3775,186,chrX,154785208,-,C
481224,X:154785208,A/C,MPP1,Default (Recessive),4354,NM_002436.4,-,NM_002436.4:c.678-51T>G,,intron_variant,Other Transcripts Have Same Effect,"BA1,BS2,BP7",MPP1:Allele frequency is above 0.01 recessive ...,Benign,Benign,,,,Annotated 1kG All,0.132715,3775,501,chrX,154785208,A,C


In [43]:
mother_son['POS'] = mother_son['POS'].astype(int)

merged = pd.merge(mother_son, df, on=['CHROM', 'POS', 'REF', 'ALT'], how='left', sort=False)
merged = merged.drop_duplicates(subset=['CHROM', 'POS', 'REF', 'ALT'])

# To fill NaN values in the 'Classification' column with 'NA', you need to assign the result back to the column.
merged['Classification'] = merged['Classification'].fillna('NA')
merged= merged.fillna('-')
# List of your specified priorities
priorities = ['Pathogenic', 'Likely Pathogenic', 'VUS/Weak Pathogenic', 'VUS/Conflicting', 'VUS', 'VUS/Weak Benign', 'Likely Benign', 'Benign']

# Function to select the highest ranked value from Original_Column
def select_highest_ranked(row):
    values = row['Classification'].split(',')
    for rank in priorities:
        if rank in values:
            return rank
    return None  # Return None if no rank found

# Apply the function to create the final selected rank column
merged['Classification'] = merged.apply(select_highest_ranked, axis=1)
merged

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Chr:Pos,Ref/Alt,Gene Name,Gene Inheritance,Entrez Gene ID,Transcript Name,Transcript Strand,HGVS cDot,HGVS pDot,Sequence Ontology,Other Transcript Effects,ACMG Classification Criteria,ACMG Classification Criteria Description,Classification,Auto Classification,Previous Classification Count,Previous Classification,Last Classification Date,Max Sub Population Freq Group Name,Max MAF,Max Allele Number,Max Allele Count
0,chr1,69270,rs201219564,A,G,-,-,-,-,-,-,-,-,-,-,-,-,-,,-,-,-,-,-,-,-,-
1,chr1,69511,rs2691305,A,G,1:69511,A/G,OR4F5,Default (Recessive),79501,NM_001005484.2,+,NM_001005484.2:c.484A>G,NP_001005484.2:p.Thr162Ala,missense_variant,Single Transcript Gene,"BA1,BP4",OR4F5:Allele frequency is above 0.01 recessive...,Benign,Benign,-,-,-,Annotated gnomAD East Asian,0.999461,16698,16689
2,chr1,69897,rs200676709,T,C,-,-,-,-,-,-,-,-,-,-,-,-,-,,-,-,-,-,-,-,-,-
3,chr1,924533,rs112703963,A,G,-,-,-,-,-,-,-,-,-,-,-,-,-,,-,-,-,-,-,-,-,-
4,chr1,942451,rs6672356,T,C,-,-,-,-,-,-,-,-,-,-,-,-,-,,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22170,chrX,154556836,.,C,T,-,-,-,-,-,-,-,-,-,-,-,-,-,,-,-,-,-,-,-,-,-
22171,chrX,154563953,rs201709278;rs5945206,C,T,-,-,-,-,-,-,-,-,-,-,-,-,-,,-,-,-,-,-,-,-,-
22172,chrX,154652556,rs4326559,C,A,X:154652556,C/A,CTAG2,Default (Recessive),30848,NM_172377.5,-,NM_172377.5:c.345G>T,NP_758965.2:p.Pro115=,synonymous_variant,Other Transcripts Have Same Effect,"BA1,BS2,BP4,BP7,BP6",CTAG2:Allele frequency is above 0.01 recessive...,Benign,Benign,-,-,-,Annotated gnomAD East Asian,0.799206,12844,10265
22173,chrX,154653499,rs17855367,C,T,-,-,-,-,-,-,-,-,-,-,-,-,-,,-,-,-,-,-,-,-,-


In [44]:
merged.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Mother_Son_ACMG.xlsx', index = False)

In [45]:
value_counts_with_null_bcftools = merged['Classification'].value_counts(dropna=False)
#result_df = pd.DataFrame({'Counts': value_counts_with_null_bcftools})
result_df = pd.DataFrame({'Variant_classification': value_counts_with_null_bcftools.index, 'Counts': value_counts_with_null_bcftools.values})
result_df

Unnamed: 0,Variant_classification,Counts
0,Benign,16252
1,,5432
2,Likely Benign,212
3,VUS/Conflicting,184
4,VUS/Weak Pathogenic,86
5,VUS/Weak Benign,6
6,VUS,2
7,Likely Pathogenic,1


In [46]:
result_df.Counts.sum()

22175

In [47]:
result_df.to_excel(r'C:/Users/GenepoweRx_Madhu/Desktop/Trio_analysis/new_trio_files/Mother_Son_Classification_counts.xlsx', index = False)

In [2]:
import pandas as pd
import os

file_directory = r'C:/Users/GenepoweRx_Madhu/Downloads/TRIO/'

# Define a function to read and process an Excel file
def read_excel_file(file_path):
    # Construct the full file path
    full_file_path = os.path.join(file_directory, file_path)

    # Read the Excel file into a Pandas DataFrame, specifying header and usecols
    excel_df = pd.read_excel(full_file_path, header=1, usecols=[0, 1])
    return excel_df

# List of Excel file names (without full paths)
excel_files = ['Germline_KHAFAMGPCSP1.xlsx', 'Germline_KHAFAMGPCSP2.xlsx']  # Replace with your Excel file names

# Initialize the result DataFrame with the data from the first Excel file
result_df = read_excel_file(excel_files[0])

# Define the columns to join on
join_columns = ['Chr:Pos', 'Ref/Alt']

# Iterate through the rest of the Excel files and merge with the result based on the join columns
for file_name in excel_files[1:]:
    excel_df = read_excel_file(file_name)
    result_df = pd.merge(result_df, excel_df, on=join_columns, how='inner')

# Specify the full file path for the output Excel file
output_excel_path = os.path.join(file_directory, 'intersected_data.xlsx')

# Save the result to an Excel file
result_df.to_excel(output_excel_path, index=False)
