In [17]:
import numpy as np
import pandas as pd
import polars as pl
import sys
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px


pd.set_option('display.max_columns',None)
import psycopg2


#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Metrics to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [20]:
def read_bed_file(bed_file):
    bed_positions = set()
    with open(bed_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Skip header lines if present
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 3:
                chrom = fields[0]
                try:
                    start = int(fields[1])
                    end = int(fields[2])
                except ValueError:
                    continue  # Skip this line if start or end position is not an integer
                for pos in range(start, end + 1):
                    bed_positions.add((chrom, pos))
    return bed_positions

def normalize_chrom_name(chrom):
    return chrom.split('_')[0]

def filter_vcf_file(vcf_file, bed_positions):
    filtered_vcf_records = []
    with open(vcf_file, 'r') as f:
        for line in f:
            if line.startswith('#'):  # Preserve header lines in the output
                filtered_vcf_records.append(line)
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 2:
                raw_chrom = fields[0]
                chrom = normalize_chrom_name(raw_chrom)
                try:
                    pos = int(fields[1])
                except ValueError:
                    continue  # Skip this line if 'POS' is not an integer
                if (chrom, pos) in bed_positions:
                    filtered_vcf_records.append(line)
    return filtered_vcf_records

def write_filtered_vcf(filtered_vcf_records, output_file):
    with open(output_file, 'w') as f:
        for record in filtered_vcf_records:
            f.write(record)

def main():
    bed_file = r'C:/Users/GenepoweRx_Madhu/Desktop/kalyani_mam_covered.bed'
    vcf_file = r'C:/Users/GenepoweRx_Madhu/Downloads/vcf_files_all/KHHSPTGPCSP11/KHHSPTGPCSP11_final.vcf'
    output_file = r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHHSPTGPCSP11_final_After_edit.vcf'

    bed_positions = read_bed_file(bed_file)
    filtered_vcf_records = filter_vcf_file(vcf_file, bed_positions)
    write_filtered_vcf(filtered_vcf_records, output_file)

if __name__ == "__main__":
    main()


In [19]:
40472-39571

901

In [28]:
vcf = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHHSPTGPCSP11_final_new.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
vcf.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']

sample_cols = vcf['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']

# Assign the values to the newly created columns
vcf = pd.concat([vcf, sample_cols], axis=1)
vcf = vcf[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL','RDF', 'RDR', 'ADF', 'ADR']]
vcf

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
0,chr1,69511,rs2691305,A,G,.,PASS,ADP=178;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,178,178,0,176,98.88%,2.565E-105,0,0,144,32
1,chr1,930939,rs9988021,G,A,.,PASS,"ADP=41;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.04653,0...",1/1,236,41,41,0,41,100%,2.3541E-24,0,0,36,5
2,chr1,941119,rs4372192,A,G,.,PASS,"ADP=45;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.08686,0...",1/1,255,45,45,0,45,100%,9.6314E-27,0,0,40,5
3,chr1,942335,rs6605066,C,G,.,PASS,"ADP=49;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.112,0.8...",1/1,255,49,49,0,49,100%,3.925E-29,0,0,25,24
4,chr1,942451,rs6672356,T,C,.,PASS,"ADP=28;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",1/1,158,28,28,0,28,100%,1.3074E-16,0,0,13,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40387,chrY,2691942,.,T,C,.,PASS,ADP=21;WT=0;HET=1;HOM=0;NC=0;CSQ=C|synonymous_...,0/1,45,21,21,9,12,57.14%,2.658E-5,8,1,8,4
40388,chrY,2714441,.,C,T,.,PASS,ADP=28;WT=0;HET=0;HOM=1;NC=0;CSQ=T|synonymous_...,1/1,158,28,28,0,28,100%,1.3074E-16,0,0,17,11
40389,chrY,11986608,rs879016544,T,C,.,PASS,ADP=31;WT=0;HET=0;HOM=1;NC=0;ASP;RS=879016544;...,1/1,176,31,31,0,31,100%,2.1486E-18,0,0,5,26
40390,chrY,11986732,rs867343413,C,T,.,PASS,ADP=10;WT=0;HET=0;HOM=1;NC=0;ASP;RS=867343413;...,1/1,42,10,10,1,9,90%,5.9538E-5,1,0,0,9


In [25]:
duplicate = vcf[vcf.duplicated()]
 
print("Duplicate Rows :")
 
# Print the resultant Dataframe
duplicate

Duplicate Rows :


Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR


In [29]:
vcf_new = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHHSPTGPCSP11_final_After_edit.vcf', comment= '#', sep = '\t', header=None, low_memory=False)
vcf_new.columns = ['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE']

sample_cols = vcf_new['SAMPLE'].str.split(':', expand=True)
sample_cols.columns = ['GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']

# Assign the values to the newly created columns
vcf_new = pd.concat([vcf_new, sample_cols], axis=1)
vcf_new = vcf_new[['CHROM', 'POS', 'rsID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'GT', 'GQ', 'SDP', 'DP', 'RD', 'AD', 'FREQ', 'PVAL','RDF', 'RDR', 'ADF', 'ADR']]
vcf_new

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=149;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,152,149,149,104,45,30.2%,5.6076E-16,80,24,38,7
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=178;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,178,178,0,176,98.88%,2.565E-105,0,0,144,32
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=186;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0...",0/1,144,186,186,142,44,23.66%,3.1648E-15,81,61,25,19
3,chr1,451290,.,A,C,.,PASS,ADP=85;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,99,85,85,56,29,34.12%,1.0363E-10,40,16,25,4
4,chr1,451414,.,G,A,.,PASS,ADP=96;WT=0;HET=1;HOM=0;NC=0;CSQ=A|missense_va...,0/1,75,96,96,73,23,23.96%,2.6678E-8,56,17,10,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39486,chrX,154653499,rs17855367,C,T,.,PASS,"ADP=131;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.5428,....",0/1,98,131,131,100,30,22.9%,1.4036E-10,84,16,16,14
39487,chrX,154766321,rs2728532,G,T,.,PASS,ADP=120;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.007417...,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,89,31
39488,chrX,155259147,rs3105275,A,G,.,PASS,"ADP=103;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.596,0....",1/1,255,103,103,0,103,100%,1.7512E-61,0,0,83,20
39489,chrX,155277884,rs559165,G,T,.,PASS,"ADP=76;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.6458,0....",1/1,255,76,76,0,76,100%,2.711E-45,0,0,63,13


In [16]:
vcf_new.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/COVERED_VCF_FILES_BED/KHHSPTGPCSP11_extra_rows.xlsx', index=False)

In [37]:
vcf_new[vcf_new['POS'] == 137467395]

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
38572,chr9,137467395,rs61738892,G,A,.,PASS,"ADP=62;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9692,0....",0/1,110,62,62,32,30,48.39%,8.6344e-12,26,6,20,10


In [33]:
vcf_new.iloc[38571]

CHROM                                                  chr9
POS                                               137463520
rsID                                              rs1891627
REF                                                       A
ALT                                                       G
QUAL                                                      .
FILTER                                                 PASS
INFO      ADP=49;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.002796,...
GT                                                      1/1
GQ                                                      255
SDP                                                      49
DP                                                       49
RD                                                        0
AD                                                       49
FREQ                                                   100%
PVAL                                              3.925E-29
RDF                                     

In [None]:
vcf_new.iloc[38571]

In [38]:
common_rows = pd.merge(vcf_new, vcf, how='inner')
common_rows

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
0,chr1,69511,rs2691305,A,G,.,PASS,ADP=178;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,178,178,0,176,98.88%,2.565E-105,0,0,144,32
1,chr1,941119,rs4372192,A,G,.,PASS,"ADP=45;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.08686,0...",1/1,255,45,45,0,45,100%,9.6314E-27,0,0,40,5
2,chr1,942451,rs6672356,T,C,.,PASS,"ADP=28;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",1/1,158,28,28,0,28,100%,1.3074E-16,0,0,13,15
3,chr1,946247,rs2272757,G,A,.,PASS,"ADP=93;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.5581,0....",1/1,255,93,93,0,93,100%,1.7451E-55,0,0,74,19
4,chr1,948245,rs4970378,A,G,.,PASS,"ADP=26;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0,1;COMMO...",1/1,146,26,26,0,26,100%,2.0165E-15,0,0,21,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30542,chrX,154563953,rs201709278;rs5945206,C,T,.,PASS,ADP=71;WT=0;HET=0;HOM=1;NC=0;ASP;G5;G5A;GENEIN...,1/1,255,71,71,0,71,100%,2.6835E-42,0,0,55,16
30543,chrX,154652556,rs4326559,C,A,.,PASS,"ADP=80;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.4718,0....",0/1,117,80,80,47,33,41.25%,1.7683E-12,29,18,23,10
30544,chrX,154653251,rs17328091,C,G,.,PASS,"ADP=88;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.4906,0....",0/1,107,88,88,57,31,35.23%,1.8612E-11,56,1,30,1
30545,chrX,154766321,rs2728532,G,T,.,PASS,ADP=120;WT=0;HET=0;HOM=1;NC=0;ASP;CAF=0.007417...,1/1,255,120,120,0,120,100%,1.1001E-71,0,0,89,31


In [39]:
non_common_rows = pd.merge(vcf_new, vcf, how='outer')
non_common_rows

Unnamed: 0,CHROM,POS,rsID,REF,ALT,QUAL,FILTER,INFO,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RDF,RDR,ADF,ADR
0,chr1,69270,rs201219564,A,G,.,PASS,ADP=149;WT=0;HET=1;HOM=0;NC=0;ASP;G5;G5A;GENEI...,0/1,152,149,149,104,45,30.2%,5.6076E-16,80,24,38,7
1,chr1,69511,rs2691305,A,G,.,PASS,ADP=178;WT=0;HET=0;HOM=1;NC=0;ASP;G5;GENEINFO=...,1/1,255,178,178,0,176,98.88%,2.565E-105,0,0,144,32
2,chr1,69897,rs200676709,T,C,.,PASS,"ADP=186;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.3119,0...",0/1,144,186,186,142,44,23.66%,3.1648E-15,81,61,25,19
3,chr1,451290,.,A,C,.,PASS,ADP=85;WT=0;HET=1;HOM=0;NC=0;CSQ=C|missense_va...,0/1,99,85,85,56,29,34.12%,1.0363E-10,40,16,25,4
4,chr1,451414,.,G,A,.,PASS,ADP=96;WT=0;HET=1;HOM=0;NC=0;CSQ=A|missense_va...,0/1,75,96,96,73,23,23.96%,2.6678E-8,56,17,10,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49331,chrY,2691942,.,T,C,.,PASS,ADP=21;WT=0;HET=1;HOM=0;NC=0;CSQ=C|synonymous_...,0/1,45,21,21,9,12,57.14%,2.658E-5,8,1,8,4
49332,chrY,2714441,.,C,T,.,PASS,ADP=28;WT=0;HET=0;HOM=1;NC=0;CSQ=T|synonymous_...,1/1,158,28,28,0,28,100%,1.3074E-16,0,0,17,11
49333,chrY,11986608,rs879016544,T,C,.,PASS,ADP=31;WT=0;HET=0;HOM=1;NC=0;ASP;RS=879016544;...,1/1,176,31,31,0,31,100%,2.1486E-18,0,0,5,26
49334,chrY,11986732,rs867343413,C,T,.,PASS,ADP=10;WT=0;HET=0;HOM=1;NC=0;ASP;RS=867343413;...,1/1,42,10,10,1,9,90%,5.9538E-5,1,0,0,9


In [15]:
additional_rows.INFO.iloc[1]

'ADP=62;WT=0;HET=1;HOM=0;NC=0;ASP;CAF=0.9692,0.03075,.;COMMON=1;G5;GENEINFO=PNPLA7:375775;GNO;HD;KGPhase1;KGPhase3;NSM;REF;RS=61738892;RSPOS=137467395;SAO=0;SLO;SSR=0;SYN;TOPMED=0.95765640927624872,0.04233562691131498,0.00000796381243628;VC=SNV;VLD;VP=0x050100000b05150536000100;WGT=1;dbSNPBuildID=129;CSQ=A|synonymous_variant|LOW|PNPLA7|ENSG00000130653|Transcript|ENST00000277531|protein_coding|25/34||ENST00000277531.8:c.2886C>T|ENSP00000277531.4:p.Ile962%3D|3073|2886|962|I|atC/atT|rs61738892&COSV53004048||-1||SNV|HGNC|HGNC:24768||||2|A2|CCDS7045.1|ENSP00000277531|Q6ZV29.152||UPI000443805E|Q6ZV29-1|||||PROSITE_profiles:PS51635&CDD:cd07225&PANTHER:PTHR14226:SF23&PANTHER:PTHR14226&Gene3D:3.40.1090.10&Pfam:PF01734&Superfamily:SSF52151&AFDB-ENSP_mappings:AF-Q6ZV29-F1.A|||0.0307|0.0053|0.0432|0.002|0.0586|0.0573|0.05017|0.009708|0.03176|0.07636|0.0004353|0.03753|0.06458|0.06043|0.06736|0.04345|0.01105|0.04736|0.03979|0.07056|0.0005769|0.0402|0.07278|0.06421|0.05651|0.0661|0.07636|gnomADe_ASJ|

# cross check bed files

In [41]:
kal_df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/kalyani_mam_covered.bed', header=None, sep='\t', error_bad_lines=False)
kal_df.columns = ['chromosome', 'Start_pos', 'End_pos']
kal_df

Unnamed: 0,chromosome,Start_pos,End_pos
0,chr1,65489,65649
1,chr1,69007,70037
2,chr1,450710,451706
3,chr1,685686,686682
4,chr1,924401,924977
...,...,...,...
208906,chrY,25038781,25038941
208907,chrY,25041746,25041906
208908,chrY,25043888,25044048
208909,chrY,25622413,25624093


In [45]:
kal_df.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/kalyani_mam_covered.xlsx', index=False)

In [42]:
sri_df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/srinivas_sir_covered.bed', header=None, sep='\t', error_bad_lines=False)
sri_df.columns = ['chromosome', 'Start_pos', 'End_pos']
sri_df

Unnamed: 0,chromosome,Start_pos,End_pos
0,chr1,65489,65645
1,chr1,65811,65993
2,chr1,69461,69620
3,chr1,785981,786159
4,chr1,786130,786446
...,...,...,...
230714,chrY,57190028,57190328
230715,chrY,57190299,57190439
230716,chrY,57190874,57191014
230717,chrY,57191846,57192058


In [46]:
sri_df.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/BED_files/srinivas_sir_covered.xlsx', index=False)

In [47]:
pancre = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/Pancreatic_Cancer_final_Lit_variants.csv', sep = '\t')
pancre

Unnamed: 0,Uploaded_variant_hg38,Chrom-pos-Ref-Alt,rsID_hg38,Unnamed: 3
0,BRCA1:p.Ala622Val,"17-43093666-G-A,","rs56039126,",
1,BRCA1:p.Gln687Pro,"17-43093471-T-G,","rs28897680,",
2,BRCA1:p.Thr539Met,"17-43093915-G-A,","rs80357374,",
3,BRCA1:p.Thr539Met,"17-43070985-TG-CA, 17-43067639-AG-CA, 17-43045...",-,
4,BRCA1:p.Ser646Gly,"17-43049140-GA-CC,",-,
...,...,...,...,...
193,rs80359212,"13-32394814-C-T,","rs80359212,",
194,rs80359763,"13-32394866-TGT-T,","rs80359763,",
195,rs11571833,"13-32398489-A-T,","rs11571833,",
196,rs276174803,"13-32398607-TC-TGAATTATATCT,","rs276174803,",


In [48]:
pancre.to_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/panc_canc_lit_var.xlsx', index=False)

In [49]:
import pandas as pd
x = pd.read_excel(r'C:/Users/GenepoweRx_Madhu/Downloads/panc_canc_lit_var.xlsx')
x['chrom'] = x['Chrom-pos-Ref-Alt'].str.split(',')
x = x.explode('chrom')

x['CHROM'] = x['chrom'].str.split('-').str[0]

# Function to add 'chr' prefix conditionally
def add_chr_prefix(chrom):
    if pd.notnull(chrom) and chrom.strip() != '':
        return 'chr' + str(chrom)
    else:
        return chrom

# Applying the function to the 'chromosome' column
x['CHROM'] = x['CHROM'].apply(add_chr_prefix)
x['CHROM'] = x['CHROM'].str.strip()
x['CHROM'] = x['CHROM'].str.replace(r'\s+', '')
x['POS'] = x['chrom'].str.split('-').str[1]

x.dropna(subset=['CHROM'], inplace=True)
# Drop rows with empty cells after removing leading and trailing whitespaces
x['CHROM'] = x['CHROM'].str.strip()
x['POS'] = x['POS'].str.strip()
# Dropping rows with empty cells and NaN values in both 'chromosome' and 'position' columns
x.dropna(subset=['CHROM', 'POS'], inplace=True)
df_3 = x[['CHROM', 'POS']]
df_3['Literature'] = 'Yes'
df_3.drop_duplicates(subset='POS', inplace=True)
df_3['POS'] = df_3['POS'].astype('int64')
df_3 = df_3.reset_index()
df_3 = df_3[['CHROM', 'POS', 'Literature']]

df = pd.read_csv(r'C:/Users/GenepoweRx_Madhu/Downloads/KAPA HyperExome_hg38_capture_targets (1).bed', sep = '\t', header = None)
df.columns = ['chromosome', 'Start_pos', 'End_pos', 'INFO']

df['Extended_Start_pos'] = df['Start_pos'] - 20
df['Extended_End_pos'] = df['End_pos'] + 20

df['gene_symbol'] = df['INFO'].str.extract(r'gene_symbol=([^;]+)')
df = df[['chromosome', 'Extended_Start_pos', 'Extended_End_pos', 'INFO', 'gene_symbol']]


# Step 1: Create a dictionary from the df DataFrame
chromosome_dict = {}
for _, row in df.iterrows():
    chromosome = row['chromosome']
    start_pos = row['Extended_Start_pos']
    end_pos = row['Extended_End_pos']
    if chromosome not in chromosome_dict:
        chromosome_dict[chromosome] = []
    chromosome_dict[chromosome].append((start_pos, end_pos))

# Step 2: Define a function to check coverage
def check_coverage(row):
    pos = row['POS']
    chromosome = row['CHROM']
    if chromosome in chromosome_dict:
        ranges = chromosome_dict[chromosome]
        for start, end in ranges:
            if start <= pos <= end:
                return 'Covered'
    return 'Not_Covered'

# Step 3: Apply the function to create the new column in dataset2
df_3['Covered/Not_Covered'] = df_3.apply(check_coverage, axis=1)

df_3 = df_3[df_3['Covered/Not_Covered'] == 'Covered']
df_3

Unnamed: 0,CHROM,POS,Literature,Covered/Not_Covered
0,chr17,43093666,Yes,Covered
1,chr17,43093471,Yes,Covered
2,chr17,43093915,Yes,Covered
3,chr17,43070985,Yes,Covered
4,chr17,43067639,Yes,Covered
...,...,...,...,...
337,chr13,32379897,Yes,Covered
338,chr13,32380116,Yes,Covered
339,chr13,32394866,Yes,Covered
340,chr13,32398489,Yes,Covered


In [50]:
df_3[df_3['POS'] == 43093666]

Unnamed: 0,CHROM,POS,Literature,Covered/Not_Covered
0,chr17,43093666,Yes,Covered


In [51]:
pip install requests




In [53]:
import requests

In [3]:
import plotly.express as px
import pandas as pd

# Sample data
df = pd.DataFrame(dict(
    stage = ['Sent', 'Received', 'Clicks', 'Sales'],
    percentage = [100, 70, 30, 10]))

fig = px.funnel(df, x = 'percentage', y = 'stage',
                color_discrete_sequence = ['lightcoral'], opacity = 1)
fig.update_traces(marker = {'line': {'width': [1, 1, 3, 1], 'color': ['gray', 'gray', 'blue', 'gray']}})

fig.show() 

In [12]:
import plotly.express as px
import pandas as pd

# Sample data
df = pd.DataFrame(dict(
    stage = ['Sent', 'Received', 'Clicks', 'Sales',
             'Sent', 'Received', 'Clicks', 'Sales'],
    percentage = [70, 32, 20, 4,
                  30, 38, 10, 6],
    campaign = ['C1', 'C1', 'C1', 'C1',
                'C2', 'C2', 'C2', 'C2']))

fig = px.funnel(df, x = 'percentage', y = 'stage', color = 'campaign')

fig.show() 

In [13]:
import plotly.figure_factory as ff
import numpy as np
np.random.seed(3)

# 20 samples, with 5 dimensions each
X = np.random.rand(20, 5) 

fig = ff.create_dendrogram(X)
fig.update_layout(autosize = True)

fig.show() 

In [14]:
import plotly.figure_factory as ff
import numpy as np
np.random.seed(3)

# 20 samples, with 5 dimensions each
X = np.random.rand(20, 5)

fig = ff.create_dendrogram(X, color_threshold = 1.2)
fig.update_layout(autosize = True)

fig.show() 

In [15]:
import plotly.express as px

data = dict(
    categories = ['Total', 'A', 'B', 'C',
                  'A1', 'A2', 'A3',
                  'B1', 'B2', 
                  'C1', 'C2', 'C3'],
    parent = ['', 'Total', 'Total', 'Total', 'A', 'A', 'A', 'B', 'B', 'C', 'C', 'C'],
    value = [100, 35, 25, 40, 10, 15, 10, 12, 13, 10, 20, 10])

fig = px.sunburst(data,
                  names = 'categories',
                  parents = 'parent',
                  values = 'value',
                  color_discrete_sequence = ['orange'],
                  branchvalues = 'total')
    
fig.show() 