In [1]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
pd.set_option('display.max_columns',None)
import psycopg2
#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve
import warnings
warnings.filterwarnings("ignore")
#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [63]:
def read_bed_file(bed_file):
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 3:
                chrom = fields[0]
                try:
                    start = int(fields[1])
                    end = int(fields[2])
                except ValueError:
                    continue  # Skip this line if start or end position is not an integer
                for pos in range(start, end + 1):
                    bed_positions.add((chrom, pos))
    return bed_positions

def normalize_chrom_name(chrom):
    return chrom.split('_')[0]

def filter_vcf_file(vcf_file, bed_positions):
    filtered_vcf_records = []
    with open(vcf_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Preserve header lines in the output
                filtered_vcf_records.append(line)
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 2:
                raw_chrom = fields[0]
                chrom = normalize_chrom_name(raw_chrom)
                try:
                    pos = int(fields[1])
                except ValueError:
                    continue  # Skip this line if 'POS' is not an integer
                if (chrom, pos) in bed_positions:
                    filtered_vcf_records.append(line)
    return filtered_vcf_records

def write_filtered_vcf(filtered_vcf_records, output_file):
    with open(output_file, 'w') as f:
        for record in filtered_vcf_records:
            f.write(record)

def main():
    bed_file = r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/srinivas_sir_covered.bed'
    vcf_file = r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/clinvar_20240206.vcf'
    output_file = r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/clinvar_20240206.vcf'

    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)
    write_filtered_vcf(filtered_vcf_records, output_file)

if __name__ == "__main__":
    main()

In [3]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/clinvar_20240206.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO
0,1,69134,2205837,A,G,.,.,"ALLELEID=2193183;CLNDISDB=MeSH:D030342,MedGen:..."
1,1,69581,2252161,C,G,.,.,"ALLELEID=2238986;CLNDISDB=MeSH:D030342,MedGen:..."
2,1,69682,2396347,G,A,.,.,"ALLELEID=2386655;CLNDISDB=MeSH:D030342,MedGen:..."
3,1,69769,2288999,T,C,.,.,"ALLELEID=2278803;CLNDISDB=MeSH:D030342,MedGen:..."
4,1,69995,2351346,G,C,.,.,"ALLELEID=2333177;CLNDISDB=MeSH:D030342,MedGen:..."
...,...,...,...,...,...,...,...,...
2353858,NT_187693.1,273806,2219599,G,A,.,.,"ALLELEID=2206917;CLNDISDB=MeSH:D030342,MedGen:..."
2353859,NT_187693.1,273866,2237818,A,C,.,.,"ALLELEID=2232003;CLNDISDB=MeSH:D030342,MedGen:..."
2353860,NT_187693.1,274366,2206666,G,C,.,.,"ALLELEID=2200058;CLNDISDB=MeSH:D030342,MedGen:..."
2353861,NT_187693.1,275068,2241971,T,C,.,.,"ALLELEID=2226217;CLNDISDB=MeSH:D030342,MedGen:..."


In [4]:
vcf["Gene"] = vcf["INFO"].str.extract('GENEINFO=(?P<GENEINFO>.+?);')
vcf['Gene'] = vcf['Gene'].apply(lambda x: ','.join(set([segment.split(':')[0] for segment in x.split('|')])) if pd.notnull(x) else '')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Gene
0,1,69134,2205837,A,G,.,.,"ALLELEID=2193183;CLNDISDB=MeSH:D030342,MedGen:...",OR4F5
1,1,69581,2252161,C,G,.,.,"ALLELEID=2238986;CLNDISDB=MeSH:D030342,MedGen:...",OR4F5
2,1,69682,2396347,G,A,.,.,"ALLELEID=2386655;CLNDISDB=MeSH:D030342,MedGen:...",OR4F5
3,1,69769,2288999,T,C,.,.,"ALLELEID=2278803;CLNDISDB=MeSH:D030342,MedGen:...",OR4F5
4,1,69995,2351346,G,C,.,.,"ALLELEID=2333177;CLNDISDB=MeSH:D030342,MedGen:...",OR4F5
...,...,...,...,...,...,...,...,...,...
2353858,NT_187693.1,273806,2219599,G,A,.,.,"ALLELEID=2206917;CLNDISDB=MeSH:D030342,MedGen:...",LILRA3
2353859,NT_187693.1,273866,2237818,A,C,.,.,"ALLELEID=2232003;CLNDISDB=MeSH:D030342,MedGen:...",LILRA3
2353860,NT_187693.1,274366,2206666,G,C,.,.,"ALLELEID=2200058;CLNDISDB=MeSH:D030342,MedGen:...",LILRA3
2353861,NT_187693.1,275068,2241971,T,C,.,.,"ALLELEID=2226217;CLNDISDB=MeSH:D030342,MedGen:...","LOC126057115,LILRA3"


In [5]:
vcf["CLNSIG"] = vcf["INFO"].str.extract('CLNSIG=(?P<CLNSIG>.+?);')
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,Gene,CLNSIG
0,1,69134,2205837,A,G,.,.,"ALLELEID=2193183;CLNDISDB=MeSH:D030342,MedGen:...",OR4F5,Likely_benign
1,1,69581,2252161,C,G,.,.,"ALLELEID=2238986;CLNDISDB=MeSH:D030342,MedGen:...",OR4F5,Uncertain_significance
2,1,69682,2396347,G,A,.,.,"ALLELEID=2386655;CLNDISDB=MeSH:D030342,MedGen:...",OR4F5,Uncertain_significance
3,1,69769,2288999,T,C,.,.,"ALLELEID=2278803;CLNDISDB=MeSH:D030342,MedGen:...",OR4F5,Uncertain_significance
4,1,69995,2351346,G,C,.,.,"ALLELEID=2333177;CLNDISDB=MeSH:D030342,MedGen:...",OR4F5,Uncertain_significance
...,...,...,...,...,...,...,...,...,...,...
2353858,NT_187693.1,273806,2219599,G,A,.,.,"ALLELEID=2206917;CLNDISDB=MeSH:D030342,MedGen:...",LILRA3,Likely_benign
2353859,NT_187693.1,273866,2237818,A,C,.,.,"ALLELEID=2232003;CLNDISDB=MeSH:D030342,MedGen:...",LILRA3,Uncertain_significance
2353860,NT_187693.1,274366,2206666,G,C,.,.,"ALLELEID=2200058;CLNDISDB=MeSH:D030342,MedGen:...",LILRA3,Uncertain_significance
2353861,NT_187693.1,275068,2241971,T,C,.,.,"ALLELEID=2226217;CLNDISDB=MeSH:D030342,MedGen:...","LOC126057115,LILRA3",Uncertain_significance


In [7]:
vcf_new = vcf[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'Gene', 'CLNSIG']]
vcf_new

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG
0,1,69134,2205837,A,G,OR4F5,Likely_benign
1,1,69581,2252161,C,G,OR4F5,Uncertain_significance
2,1,69682,2396347,G,A,OR4F5,Uncertain_significance
3,1,69769,2288999,T,C,OR4F5,Uncertain_significance
4,1,69995,2351346,G,C,OR4F5,Uncertain_significance
...,...,...,...,...,...,...,...
2353858,NT_187693.1,273806,2219599,G,A,LILRA3,Likely_benign
2353859,NT_187693.1,273866,2237818,A,C,LILRA3,Uncertain_significance
2353860,NT_187693.1,274366,2206666,G,C,LILRA3,Uncertain_significance
2353861,NT_187693.1,275068,2241971,T,C,"LOC126057115,LILRA3",Uncertain_significance


In [8]:
df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/srinivas_sir_covered.bed', sep = '\t', header = None, error_bad_lines=False)
df.columns = ['chromosome', 'Start_pos', 'End_pos']
df

Unnamed: 0,chromosome,Start_pos,End_pos
0,chr1,65489,65645
1,chr1,65811,65993
2,chr1,69461,69620
3,chr1,785981,786159
4,chr1,786130,786446
...,...,...,...
230714,chrY,57190028,57190328
230715,chrY,57190299,57190439
230716,chrY,57190874,57191014
230717,chrY,57191846,57192058


In [9]:
# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Start_pos']
    end_pos = row['End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in data
vcf_new['Covered_status'] = vcf_new.apply(check_coverage, axis=1)
vcf_new

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
0,1,69134,2205837,A,G,OR4F5,Likely_benign,Not_Covered
1,1,69581,2252161,C,G,OR4F5,Uncertain_significance,Not_Covered
2,1,69682,2396347,G,A,OR4F5,Uncertain_significance,Not_Covered
3,1,69769,2288999,T,C,OR4F5,Uncertain_significance,Not_Covered
4,1,69995,2351346,G,C,OR4F5,Uncertain_significance,Not_Covered
...,...,...,...,...,...,...,...,...
2353858,NT_187693.1,273806,2219599,G,A,LILRA3,Likely_benign,Not_Covered
2353859,NT_187693.1,273866,2237818,A,C,LILRA3,Uncertain_significance,Not_Covered
2353860,NT_187693.1,274366,2206666,G,C,LILRA3,Uncertain_significance,Not_Covered
2353861,NT_187693.1,275068,2241971,T,C,"LOC126057115,LILRA3",Uncertain_significance,Not_Covered


In [17]:
vcf_new.CHROM.value_counts()

chr2                    222933
chr1                    206078
chr17                   147671
chr11                   139443
chr3                    126372
chr16                   124927
chr19                   123537
chr5                    119884
chr7                    112893
chr12                   105629
chr9                    105004
chr6                    104971
chrX                     87730
chr10                    87560
chr15                    82402
chr4                     80960
chr8                     79867
chr14                    74855
chr22                    51217
chr13                    50780
chr20                    47570
chr18                    41051
chr21                    27517
chrMT                     2910
chrY                        78
chr19_KI270938v1_alt         9
chr17_KI270909v1_alt         7
chr22_KI270879v1_alt         6
chrUn_GL000218v1             1
chr9_KN196479v1_fix          1
Name: CHROM, dtype: int64

In [12]:
vcf_new['CHROM'] = vcf_new['CHROM'].replace({'NT_187693.1': 'chr19_KI270938v1_alt', 'NT_187661.1': 'chr17_KI270909v1_alt', 
                                            'NT_187633.1': 'chr22_KI270879v1_alt', 'NT_113889.1': 'chrUn_GL000218v1', 
                                            'NW_009646201.1': 'chr9_KN196479v1_fix'})
vcf_new 

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
0,1,69134,2205837,A,G,OR4F5,Likely_benign,Not_Covered
1,1,69581,2252161,C,G,OR4F5,Uncertain_significance,Not_Covered
2,1,69682,2396347,G,A,OR4F5,Uncertain_significance,Not_Covered
3,1,69769,2288999,T,C,OR4F5,Uncertain_significance,Not_Covered
4,1,69995,2351346,G,C,OR4F5,Uncertain_significance,Not_Covered
...,...,...,...,...,...,...,...,...
2353858,chr19_KI270938v1_alt,273806,2219599,G,A,LILRA3,Likely_benign,Not_Covered
2353859,chr19_KI270938v1_alt,273866,2237818,A,C,LILRA3,Uncertain_significance,Not_Covered
2353860,chr19_KI270938v1_alt,274366,2206666,G,C,LILRA3,Uncertain_significance,Not_Covered
2353861,chr19_KI270938v1_alt,275068,2241971,T,C,"LOC126057115,LILRA3",Uncertain_significance,Not_Covered


In [66]:
#NT_187693.1 = chr19_KI270938v1_alt
#NT_187661.1 = chr17_KI270909v1_alt
#NT_187633.1 = chr22_KI270879v1_alt
#NT_113889.1 = chrUn_GL000218v1
#NW_009646201.1 = chr9_KN196479v1_fix

In [14]:
vcf_new['CHROM'] = 'chr' + vcf_new['CHROM']
vcf_new

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
0,chr1,69134,2205837,A,G,OR4F5,Likely_benign,Not_Covered
1,chr1,69581,2252161,C,G,OR4F5,Uncertain_significance,Not_Covered
2,chr1,69682,2396347,G,A,OR4F5,Uncertain_significance,Not_Covered
3,chr1,69769,2288999,T,C,OR4F5,Uncertain_significance,Not_Covered
4,chr1,69995,2351346,G,C,OR4F5,Uncertain_significance,Not_Covered
...,...,...,...,...,...,...,...,...
2353858,chrchr19_KI270938v1_alt,273806,2219599,G,A,LILRA3,Likely_benign,Not_Covered
2353859,chrchr19_KI270938v1_alt,273866,2237818,A,C,LILRA3,Uncertain_significance,Not_Covered
2353860,chrchr19_KI270938v1_alt,274366,2206666,G,C,LILRA3,Uncertain_significance,Not_Covered
2353861,chrchr19_KI270938v1_alt,275068,2241971,T,C,"LOC126057115,LILRA3",Uncertain_significance,Not_Covered


In [16]:
vcf_new['CHROM'] = vcf_new['CHROM'].replace({'chrchr19_KI270938v1_alt': 'chr19_KI270938v1_alt', 'chrchr17_KI270909v1_alt': 'chr17_KI270909v1_alt', 
                                            'chrchr22_KI270879v1_alt': 'chr22_KI270879v1_alt', 'chrchrUn_GL000218v1': 'chrUn_GL000218v1', 
                                            'chrchr9_KN196479v1_fix': 'chr9_KN196479v1_fix'})
vcf_new 

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
0,chr1,69134,2205837,A,G,OR4F5,Likely_benign,Not_Covered
1,chr1,69581,2252161,C,G,OR4F5,Uncertain_significance,Not_Covered
2,chr1,69682,2396347,G,A,OR4F5,Uncertain_significance,Not_Covered
3,chr1,69769,2288999,T,C,OR4F5,Uncertain_significance,Not_Covered
4,chr1,69995,2351346,G,C,OR4F5,Uncertain_significance,Not_Covered
...,...,...,...,...,...,...,...,...
2353858,chr19_KI270938v1_alt,273806,2219599,G,A,LILRA3,Likely_benign,Not_Covered
2353859,chr19_KI270938v1_alt,273866,2237818,A,C,LILRA3,Uncertain_significance,Not_Covered
2353860,chr19_KI270938v1_alt,274366,2206666,G,C,LILRA3,Uncertain_significance,Not_Covered
2353861,chr19_KI270938v1_alt,275068,2241971,T,C,"LOC126057115,LILRA3",Uncertain_significance,Not_Covered


In [18]:
# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Start_pos']
    end_pos = row['End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in data
vcf_new['Covered_status'] = vcf_new.apply(check_coverage, axis=1)
vcf_new

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
0,chr1,69134,2205837,A,G,OR4F5,Likely_benign,Not_Covered
1,chr1,69581,2252161,C,G,OR4F5,Uncertain_significance,Covered
2,chr1,69682,2396347,G,A,OR4F5,Uncertain_significance,Not_Covered
3,chr1,69769,2288999,T,C,OR4F5,Uncertain_significance,Not_Covered
4,chr1,69995,2351346,G,C,OR4F5,Uncertain_significance,Not_Covered
...,...,...,...,...,...,...,...,...
2353858,chr19_KI270938v1_alt,273806,2219599,G,A,LILRA3,Likely_benign,Not_Covered
2353859,chr19_KI270938v1_alt,273866,2237818,A,C,LILRA3,Uncertain_significance,Not_Covered
2353860,chr19_KI270938v1_alt,274366,2206666,G,C,LILRA3,Uncertain_significance,Not_Covered
2353861,chr19_KI270938v1_alt,275068,2241971,T,C,"LOC126057115,LILRA3",Uncertain_significance,Not_Covered


In [20]:
vcf_new['rsID'] = 'rs' + vcf_new['rsID'].astype('str')
vcf_new

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
0,chr1,69134,rs2205837,A,G,OR4F5,Likely_benign,Not_Covered
1,chr1,69581,rs2252161,C,G,OR4F5,Uncertain_significance,Covered
2,chr1,69682,rs2396347,G,A,OR4F5,Uncertain_significance,Not_Covered
3,chr1,69769,rs2288999,T,C,OR4F5,Uncertain_significance,Not_Covered
4,chr1,69995,rs2351346,G,C,OR4F5,Uncertain_significance,Not_Covered
...,...,...,...,...,...,...,...,...
2353858,chr19_KI270938v1_alt,273806,rs2219599,G,A,LILRA3,Likely_benign,Not_Covered
2353859,chr19_KI270938v1_alt,273866,rs2237818,A,C,LILRA3,Uncertain_significance,Not_Covered
2353860,chr19_KI270938v1_alt,274366,rs2206666,G,C,LILRA3,Uncertain_significance,Not_Covered
2353861,chr19_KI270938v1_alt,275068,rs2241971,T,C,"LOC126057115,LILRA3",Uncertain_significance,Not_Covered


In [67]:
vcf_new = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/clinvar_20240206.tsv', sep = '\t')
vcf_new

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
0,chr1,69134,rs2205837,A,G,OR4F5,Likely_benign,Not_Covered
1,chr1,69581,rs2252161,C,G,OR4F5,Uncertain_significance,Covered
2,chr1,69682,rs2396347,G,A,OR4F5,Uncertain_significance,Not_Covered
3,chr1,69769,rs2288999,T,C,OR4F5,Uncertain_significance,Not_Covered
4,chr1,69995,rs2351346,G,C,OR4F5,Uncertain_significance,Not_Covered
...,...,...,...,...,...,...,...,...
2353858,chr19_KI270938v1_alt,273806,rs2219599,G,A,LILRA3,Likely_benign,Not_Covered
2353859,chr19_KI270938v1_alt,273866,rs2237818,A,C,LILRA3,Uncertain_significance,Not_Covered
2353860,chr19_KI270938v1_alt,274366,rs2206666,G,C,LILRA3,Uncertain_significance,Not_Covered
2353861,chr19_KI270938v1_alt,275068,rs2241971,T,C,"LOC126057115,LILRA3",Uncertain_significance,Not_Covered


In [72]:
rows_with_Tp53 = vcf_new[vcf_new['CHROM'].str.contains('chr1', case=False, na=False)]
rows_with_Tp53

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
0,chr1,69134,rs2205837,A,G,OR4F5,Likely_benign,Not_Covered
1,chr1,69581,rs2252161,C,G,OR4F5,Uncertain_significance,Covered
2,chr1,69682,rs2396347,G,A,OR4F5,Uncertain_significance,Not_Covered
3,chr1,69769,rs2288999,T,C,OR4F5,Uncertain_significance,Not_Covered
4,chr1,69995,rs2351346,G,C,OR4F5,Uncertain_significance,Not_Covered
...,...,...,...,...,...,...,...,...
2353857,chr19_KI270938v1_alt,273753,rs2650447,G,A,LILRA3,Benign,Not_Covered
2353858,chr19_KI270938v1_alt,273806,rs2219599,G,A,LILRA3,Likely_benign,Not_Covered
2353859,chr19_KI270938v1_alt,273866,rs2237818,A,C,LILRA3,Uncertain_significance,Not_Covered
2353860,chr19_KI270938v1_alt,274366,rs2206666,G,C,LILRA3,Uncertain_significance,Not_Covered


In [76]:
vcf_new.CLNSIG.value_counts(dropna=False)

Uncertain_significance                                              1137676
Likely_benign                                                        624662
Benign                                                               193028
Pathogenic                                                           135155
Conflicting_classifications_of_pathogenicity                         107971
                                                                     ...   
Conflicting_classifications_of_pathogenicity|drug_response|other          1
Benign/Likely_benign|risk_factor                                          1
Benign/Likely_benign|other|risk_factor                                    1
other|risk_factor                                                         1
Likely_benign|risk_factor                                                 1
Name: CLNSIG, Length: 90, dtype: int64

In [77]:
rows_with_Tp53 = vcf_new[vcf_new['CLNSIG'].str.contains('Pathogenic', case=False, na=False)]
rows_with_Tp53

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
35,chr1,930200,rs1043045,G,A,SAMD11,Conflicting_classifications_of_pathogenicity,Covered
147,chr1,935839,rs1085785,C,T,SAMD11,Conflicting_classifications_of_pathogenicity,Covered
203,chr1,939117,rs1427749,G,A,SAMD11,Conflicting_classifications_of_pathogenicity,Covered
312,chr1,941284,rs1006858,G,A,SAMD11,Conflicting_classifications_of_pathogenicity,Covered
819,chr1,943995,rs950448,C,T,SAMD11,Pathogenic,Covered
...,...,...,...,...,...,...,...,...
2353780,chrMT,15923,rs39575,A,G,MT-TT,Conflicting_classifications_of_pathogenicity,Not_Covered
2353812,chrMT,15958,rs870596,A,T,MT-TP,Pathogenic,Not_Covered
2353814,chrMT,15967,rs9572,G,A,MT-TP,Pathogenic,Not_Covered
2353822,chrMT,15990,rs9570,C,T,MT-TP,Likely_pathogenic,Not_Covered


In [79]:
query_result = rows_with_Tp53.loc[(rows_with_Tp53['CLNSIG'] == 'Pathogenic') & (rows_with_Tp53['Covered_status'] == 'Covered')]
query_result 

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
819,chr1,943995,rs950448,C,T,SAMD11,Pathogenic,Covered
1068,chr1,976215,rs1320032,A,G,PERM1,Pathogenic,Covered
1103,chr1,1013983,rs1028857,G,A,ISG15,Pathogenic,Covered
1139,chr1,1014143,rs183381,C,T,ISG15,Pathogenic,Covered
1173,chr1,1014316,rs161455,C,CG,ISG15,Pathogenic,Covered
...,...,...,...,...,...,...,...,...
2350894,chrY,2787551,rs9754,C,T,,Pathogenic,Covered
2350895,chrY,2787592,rs9751,A,T,SRY,Pathogenic,Covered
2350897,chrY,2787600,rs9753,G,A,SRY,Pathogenic,Covered
2350901,chrY,7063898,rs625467,A,T,"LOC126057105,TBL1Y",Pathogenic,Covered


In [65]:
vcf_new.Covered_status.value_counts(dropna=False)

Covered        2041980
Not_Covered     311883
Name: Covered_status, dtype: int64

In [53]:
vcf_new.CLNSIG.value_counts(dropna=False)

Uncertain_significance                                              1137676
Likely_benign                                                        624662
Benign                                                               193028
Pathogenic                                                           135155
Conflicting_classifications_of_pathogenicity                         107971
                                                                     ...   
Conflicting_classifications_of_pathogenicity|drug_response|other          1
Benign/Likely_benign|risk_factor                                          1
Benign/Likely_benign|other|risk_factor                                    1
other|risk_factor                                                         1
Likely_benign|risk_factor                                                 1
Name: CLNSIG, Length: 90, dtype: int64

In [55]:
vcf_new[vcf_new['CLNSIG'] == 'Uncertain_significance']

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
1,chr1,69581,rs2252161,C,G,OR4F5,Uncertain_significance,Covered
2,chr1,69682,rs2396347,G,A,OR4F5,Uncertain_significance,Not_Covered
3,chr1,69769,rs2288999,T,C,OR4F5,Uncertain_significance,Not_Covered
4,chr1,69995,rs2351346,G,C,OR4F5,Uncertain_significance,Not_Covered
5,chr1,925946,rs1924157,C,G,SAMD11,Uncertain_significance,Covered
...,...,...,...,...,...,...,...,...
2353853,chr19_KI270938v1_alt,273340,rs2249453,T,A,LILRA3,Uncertain_significance,Not_Covered
2353855,chr19_KI270938v1_alt,273632,rs2360545,G,A,LILRA3,Uncertain_significance,Not_Covered
2353859,chr19_KI270938v1_alt,273866,rs2237818,A,C,LILRA3,Uncertain_significance,Not_Covered
2353860,chr19_KI270938v1_alt,274366,rs2206666,G,C,LILRA3,Uncertain_significance,Not_Covered


In [56]:
query_result = vcf_new.loc[(vcf_new['CLNSIG'] == 'Uncertain_significance') & (vcf_new['Covered_status'] == 'Covered')]
query_result 

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
1,chr1,69581,rs2252161,C,G,OR4F5,Uncertain_significance,Covered
5,chr1,925946,rs1924157,C,G,SAMD11,Uncertain_significance,Covered
6,chr1,925952,rs1019397,G,A,SAMD11,Uncertain_significance,Covered
8,chr1,925961,rs2069387,A,T,SAMD11,Uncertain_significance,Covered
10,chr1,925976,rs1362713,T,C,SAMD11,Uncertain_significance,Covered
...,...,...,...,...,...,...,...,...
2350913,chrY,12840450,rs2438505,C,T,USP9Y,Uncertain_significance,Covered
2350914,chrY,12842370,rs2627757,G,T,USP9Y,Uncertain_significance,Covered
2350921,chrY,14622379,rs559565,G,A,NLGN4Y,Uncertain_significance,Covered
2350924,chrY,14830121,rs391879,C,A,NLGN4Y,Uncertain_significance,Covered


In [62]:
query_result = vcf_new.loc[(vcf_new['CLNSIG'] == 'Uncertain_significance') & (vcf_new['Gene'] == 'USP9Y') & (vcf_new['Covered_status'] == 'Covered')]
query_result

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
2350907,chrY,12793053,rs2438504,C,G,USP9Y,Uncertain_significance,Covered
2350910,chrY,12818466,rs2499164,T,C,USP9Y,Uncertain_significance,Covered
2350912,chrY,12839999,rs2661889,G,T,USP9Y,Uncertain_significance,Covered
2350913,chrY,12840450,rs2438505,C,T,USP9Y,Uncertain_significance,Covered
2350914,chrY,12842370,rs2627757,G,T,USP9Y,Uncertain_significance,Covered


In [58]:
102050 + 1035626

1137676

In [47]:
vcf_new.Gene.value_counts(dropna=False)

NaN                    21261
BRCA2                  17174
TTN-AS1,TTN            14138
APC                    12955
NF1                    11802
                       ...  
LOC124819405,SLC8B1        1
LOC130008814,OAS3          1
MIR6761,ALDH2              1
CUX2,SH2B3                 1
LOC126057115,LILRA3        1
Name: Gene, Length: 27969, dtype: int64

In [51]:
rows_with_Tp53 = vcf_new[vcf_new['Gene'].str.contains('MIR6761', case=False, na=False)]
rows_with_Tp53

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status
1469884,chr12,111799896,rs728511,C,T,"MIR6761,ALDH2",Likely_benign,Covered


In [46]:
rows_with_Tp53.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/clinvar_20240206_IGHV.xlsx', index = False)

In [37]:
df_gene = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/clinvar_20240206_genes.xlsx')
df_gene

Unnamed: 0,Gene
0,Tp53
1,POLE
2,IGHV


In [38]:
vcf_new['Gene_Match'] = 'No'

# Iterate through each gene in df1
for genes in vcf_new['Gene']:
    if isinstance(genes, str):  # Check if the gene value is a non-null string
        gene_list = genes.split(',')  # Split the genes by comma to create a list
        match = any(gene in df_gene['Gene'].values for gene in gene_list)  # Check if any gene in the list exists in df2
        if match:
            vcf_new.loc[vcf_new['Gene'] == genes, 'Gene_Match'] = 'Yes'
            
vcf_new = vcf_new[vcf_new['Gene_Match'] == 'Yes']
vcf_new

KeyboardInterrupt: 

In [26]:
vcf_new.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/clinvar_20240206_Tp53_PLOE_IGHV.xlsx', index = False)

In [30]:
vcf_new

Unnamed: 0,CHROM,POS,rsID,REF,ALT,Gene,CLNSIG,Covered_status,Gene_Match


In [None]:
vcf_n