In [2]:
import pysam
import polars as pl
from pathlib import Path
from pyfaidx import Fasta

In [14]:
vcf_file = '/home/ec2-user/clinvar/clinvar.vcf.gz'
bed_file = '/home/ec2-user/enformer/Homo_sapiens.GRCh38.genes.bed'
bed_vcf_file = '/home/ec2-user/enformer/Homo_sapiens.GRCh38.genes.clinvar.vcf.bed'
fasta_file = '/home/ec2-user/enformer/Homo_sapiens.GRCh38.dna.toplevel.fa'

bed_path = Path(bed_file)

In [4]:
fasta = Fasta(fasta_file)

In [5]:
vcf = pysam.VariantFile(vcf_file, 'r')

In [6]:
df = pl.read_csv(str(bed_path), sep = '\t', has_header = False)

In [7]:
df.head()

column_1,column_2,column_3,column_4,column_5
str,i64,i64,str,str
"""X""",100528805,100725413,"""-""","""ENSG0000000000..."
"""X""",100486498,100683106,"""+""","""ENSG0000000000..."
"""20""",50836563,51033171,"""-""","""ENSG0000000041..."
"""1""",169751327,169947935,"""-""","""ENSG0000000045..."
"""1""",169563703,169760311,"""+""","""ENSG0000000046..."


In [8]:
seq = fasta.get_seq(df[0, 0], df[0, 1], df[0, 2])

In [9]:
variants = list(vcf.fetch(df[0, 0], df[0, 1], df[0, 2]))
variants = [v for v in variants if v.info['CLNVC'] == 'single_nucleotide_variant']

In [37]:
keep = set(['Benign', 'Likely_benign', 'Likely_pathogenic', 'Pathogenic'])
df_v = []
for row in df.rows():
    if row[0] in ('CHR_HSCHR6_MHC_MCF_CTG1',):
        break
    variants = list(vcf.fetch(row[0], max(row[1], 0), row[2]))
    seq = fasta.get_seq(row[0], max(row[1], 1), row[2])
    for v in variants:
        if (v.info['CLNVC'] == 'single_nucleotide_variant' and len(v.alleles) == 2 and len(v.alleles[1]) == 1
            and 'CLNSIG' in v.info.keys() and len(v.info['CLNSIG']) == 1 and len(set(v.info['CLNSIG']) & keep) > 0):
            pos = v.pos - seq.start
            if not seq[pos] == v.alleles[0]:
                raise ValueError(f'Reference allele not recognized')
            s_dict = {
                'chrom': row[0],
                'start': row[1],
                'stop': row[2],
                'strand': row[3],
                'gene': row[4],
                'variant_pos': v.pos,
                'variant_ref': v.alleles[0],
                'variant_alt': v.alleles[1],
            }
            v_dict = dict(v.info)
            for k, v in v_dict.items():
                if type(v) == tuple:
                    if len(v) == 1:
                        v_dict[k] = v[0]
                    else:
                        v_dict[k] = str(v)
            s_dict.update(v_dict)
            df_v.append(s_dict)
            # seq_mut = list(seq.seq)
            # seq_mut[pos] = v.alleles[1]
df_v = pl.DataFrame(df_v)

In [38]:
len(df_v)

143556

In [39]:
df_v.write_csv(bed_vcf_file, separator = '\t', has_header = False)

In [42]:
df_v['CLNSIG'].value_counts()

CLNSIG,counts
str,u32
"""Likely_pathoge...",8166
"""Likely_benign""",98004
"""Benign""",24478
"""Pathogenic""",12908


In [20]:
v = variants[0]

In [22]:
v.info.keys()

['ALLELEID',
 'CLNDISDB',
 'CLNDN',
 'CLNHGVS',
 'CLNREVSTAT',
 'CLNSIG',
 'CLNVC',
 'CLNVCSO',
 'GENEINFO',
 'MC',
 'ORIGIN']

In [23]:
v.info['CLNSIG']

('Uncertain_significance',)

In [9]:
len(variants)

168

In [19]:
variants[0]

<pysam.libcbcf.VariantRecord at 0x7f9868563070>

In [11]:
variants[0].start

100585041

In [12]:
variants[30]

<pysam.libcbcf.VariantRecord at 0x7f32a741d520>

In [13]:
variants[0].start

100585041

In [14]:
variants[0].stop

100585042

In [15]:
print(list(variants[0].info.items()))

[('AF_ESP', 0.015150000341236591), ('AF_EXAC', 0.005289999768137932), ('AF_TGP', 0.015359999611973763), ('ALLELEID', 717561), ('CLNDISDB', ('MedGen:CN517202',)), ('CLNDN', ('not_provided',)), ('CLNHGVS', ('NC_000023.11:g.100585042T>C',)), ('CLNREVSTAT', ('criteria_provided', '_single_submitter')), ('CLNSIG', ('Benign',)), ('CLNVC', 'single_nucleotide_variant'), ('CLNVCSO', 'SO:0001483'), ('GENEINFO', 'TNMD:64102'), ('MC', ('SO:0001819|synonymous_variant',)), ('ORIGIN', ('1',)), ('RS', ('36082211',))]


In [16]:
variants[0].info['CLNDN']

('not_provided',)

In [17]:
[v for v in variants if v.info['CLNSIG'] == 'Uncertain significance']

[]

In [18]:
variants[0].info['CLNVC'] == 'single_nucleotide_variant'

True

In [19]:
[v for v in variants if 'Pathogenic' in v.info['CLNSIG']]

[<pysam.libcbcf.VariantRecord at 0x7f32a7427f10>]

In [97]:
sig = [v.info['CLNSIG'] for v in variants]

In [None]:
import numpy as np
keep = ['Benign', 'Likely_benign', 'Likely_pathogenic', 'Pathogenic']

In [99]:
import numpy as np

np.unique(sig)

array(['Benign', 'Benign/Likely_benign',
       'Conflicting_interpretations_of_pathogenicity', 'Likely_benign',
       'Likely_pathogenic', 'Pathogenic', 'Uncertain_significance'],
      dtype='<U44')

In [106]:
variants[133].info['CLNSIG']

('Pathogenic',)

In [116]:
start = df[0, 1]

In [22]:
v = variants[133]

In [20]:
seq[0]

>X:100528805-100528805
T

In [35]:
for v in variants:
    pos = v.pos - seq.start
    assert seq[pos] == v.alleles[0]
    assert len(v.alleles[1]) == 1
    seq_mut = list(seq.seq)
    seq_mut[pos] = v.alleles[1]

In [34]:
seq_mut[0]

['T',
 'A',
 'G',
 'C',
 'A',
 'C',
 'C',
 'A',
 'T',
 'A',
 'A',
 'T',
 'G',
 'T',
 'T',
 'T',
 'C',
 'A',
 'T',
 'C',
 'A',
 'G',
 'C',
 'A',
 'T',
 'A',
 'A',
 'A',
 'T',
 'T',
 'G',
 'T',
 'G',
 'A',
 'A',
 'T',
 'A',
 'T',
 'T',
 'C',
 'A',
 'C',
 'A',
 'T',
 'T',
 'A',
 'A',
 'G',
 'T',
 'G',
 'G',
 'C',
 'A',
 'T',
 'A',
 'A',
 'A',
 'C',
 'A',
 'A',
 'A',
 'G',
 'A',
 'C',
 'T',
 'C',
 'T',
 'A',
 'A',
 'A',
 'T',
 'A',
 'G',
 'C',
 'A',
 'T',
 'T',
 'G',
 'T',
 'G',
 'T',
 'T',
 'G',
 'G',
 'T',
 'T',
 'C',
 'A',
 'G',
 'G',
 'T',
 'C',
 'A',
 'G',
 'G',
 'A',
 'T',
 'T',
 'T',
 'A',
 'C',
 'A',
 'A',
 'T',
 'A',
 'A',
 'A',
 'A',
 'T',
 'T',
 'A',
 'G',
 'T',
 'T',
 'A',
 'T',
 'A',
 'A',
 'A',
 'G',
 'G',
 'A',
 'G',
 'T',
 'T',
 'T',
 'T',
 'C',
 'C',
 'T',
 'T',
 'T',
 'A',
 'T',
 'A',
 'G',
 'A',
 'G',
 'C',
 'T',
 'C',
 'T',
 'A',
 'T',
 'A',
 'T',
 'A',
 'T',
 'T',
 'T',
 'T',
 'G',
 'G',
 'A',
 'A',
 'T',
 'T',
 'G',
 'T',
 'G',
 'G',
 'A',
 'T',
 'A',
 'A',
 'G',
 'A'

In [23]:
v.pos

100669335