In [None]:
!wget -nc https://www.encodeproject.org/files/male.hg19/@@download/male.hg19.fasta.gz
!gunzip -f male.hg19.fasta.gz

File ‘male.hg19.fasta.gz’ already there; not retrieving.

gzip: male.hg19.fasta already exists; do you wish to overwrite (y or n)? 

In [1]:
import sys
import tempfile

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

import boda

import tqdm.notebook as tqdm

import matplotlib.pyplot as plt
from scipy.stats import pearsonr

In [8]:
def vec2nt(vec):
    hold = torch.cat([ vec, torch.ones((1,vec.shape[1])) ], axis=0)
    return "".join([ ['A','C','G','T','N'][x] for x in torch.argmax(hold, axis=0) ])

# Convert reference MPRA to VCF

In [2]:
mpra_19 = pd.read_table('MPRA_ALL_v3.txt', sep=' ', header=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
with open('troubleshoot.vcf', 'w') as f:
    print('#CHROM\tPOS\tID\tREF\tALT', file=f)
    for i, x in mpra_19.iterrows():
        if len(x['IDs'].split(':')) > 5:
            chrom, pos, ref, alt, *stuff = x['IDs'].split(':')
            if len(stuff) > 0:
                if (stuff[-1] == 'wC') and (stuff[-2] == 'R'):
                    print(f'chr{chrom}\t{int(pos)+1-1}\t{x["IDs"]}\t{ref}\t{alt}', file=f)


# Setup VCF dataset

In [5]:
fasta_data = boda.data.Fasta('male.hg19.fasta')


pre-reading fasta into memory
100%|██████████████████████████| 61913917/61913917 [00:29<00:00, 2124279.92it/s]
finding keys
parsing
100%|███████████████████████████████████████████| 25/25 [17:22<00:00, 41.71s/it]
done


In [6]:
vcf = boda.data.VCF(
    'troubleshoot.vcf', chr_prefix='', 
    max_allele_size=20, max_indel_size=20,
)

WINDOW_SIZE = 200
RELATIVE_START = 99
RELATIVE_END = 100

vcf_data = boda.data.VcfDataset(
    vcf.vcf, fasta_data.fasta, WINDOW_SIZE, 
    RELATIVE_START, RELATIVE_END, step_size=1, 
    left_flank='', right_flank=''
)


loading DataFrame
Checking and filtering tokens
Allele length checks
Done
343562/343562 records have matching contig in FASTA
returned 343562/343562 records


## check alternate windowing methods

In [7]:
vcf_wide = boda.data.VcfDataset(
    vcf.vcf, fasta_data.fasta, WINDOW_SIZE, 
    9, 180, step_size=10, 
    left_flank='', right_flank=''
)

vcf_narrow = boda.data.VcfDataset(
    vcf.vcf, fasta_data.fasta, WINDOW_SIZE, 
    96, 103, step_size=1, 
    left_flank='', right_flank=''
)


343562/343562 records have matching contig in FASTA
returned 343562/343562 records
343562/343562 records have matching contig in FASTA
returned 343562/343562 records


In [8]:
for i in range(vcf_wide[10]['ref'].shape[0]):
    print(vec2nt(vcf_wide[10]['ref'][i]))

AACCCGAGAGGTGGAGGTTGCAGTGAGCCAAGATTGTGCCATTGCACTCCAGCCTGGGCGACAGAGCAACACTCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGA
GTGGAGGTTGCAGTGAGCCAAGATTGTGCCATTGCACTCCAGCCTGGGCGACAGAGCAACACTCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAA
CAGTGAGCCAAGATTGTGCCATTGCACTCCAGCCTGGGCGACAGAGCAACACTCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACA
AGATTGTGCCATTGCACTCCAGCCTGGGCGACAGAGCAACACTCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGC
ATTGCACTCCAGCCTGGGCGACAGAGCAACACTCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGA

In [9]:
for i in range(vcf_narrow[10]['ref'].shape[0]):
    print(vec2nt(vcf_narrow[10]['ref'][i]))

CTCAAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTC
TCAAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTCG
CAAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTCGG
AAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTCGGA
AAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTC

In [10]:
for i in range(vcf_narrow[10]['alt'].shape[0]):
    print(vec2nt(vcf_narrow[10]['alt'][i]))

CTCAAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCAACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTC
TCAAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCAACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTCG
CAAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCAACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTCGG
AAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCAACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTCGGA
AAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCAACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTC

In [11]:
print(vec2nt(vcf_wide[10]['ref'][8]))
print(vec2nt(vcf_narrow[10]['ref'][3]))


AAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTCGGA
AAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTCGGA


In [12]:
matches = []

for i in tqdm.tqdm(range(len(vcf_wide))):
    matches.append( vec2nt(vcf_wide[i]['ref'][8]) == vec2nt(vcf_narrow[i]['ref'][3]) )
    

  0%|          | 0/343562 [00:00<?, ?it/s]

In [13]:
np.array(matches).mean()

1.0

In [14]:
vcf_narrow[10]['ref'].shape

torch.Size([14, 4, 200])

In [15]:
vcf.vcf.loc[10]

chrom                  chr12
pos                  4406836
id       12:4406836:C:A:R:wC
ref                        C
alt                        A
Name: 10, dtype: object

In [16]:
vec2nt(vcf_wide[10]['ref'][8])

'AAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTCGGA'

In [17]:
vec2nt(vcf_narrow[10]['ref'][2])

'CAAAAAAAAAAAAAAAAAAAAAAAAAAACAGAAAAGAAAAGAAACATTATACCTTAACTTTTCGCACCTAGTGAGTGATAACCTATGAGGATAATGCTGCCACATGGTCCTTGAGGGAAGAACTTGGGGAAGAGCCGAACAGGGATGAGGCAGGTGAGCCCCATGTGATTTATCGTCATTCATTCAGCTCAACAGGTCGG'

# Check if sequences are identical matches

In [18]:
vec2nt(vcf_data[0]['ref'][0])

'AAAAAAAAAAAAAAAAAAAAAAAGTCTTTCCTCTCATCAACCCCCTACCCCCTGCCCCCGAACGTCCTCAACCCAAAGTAGCCAAGTTGGCTAATCTAACGCAAAGGGTGGGAGGCCACACCAAATGCACTGGTTACATAGCAAACACAAAGTCTGGCTGGGTTTTAAAGCAGCCTTCGTCTAAATCACGCAAGCACACA'

In [19]:
mpra_19.set_index('IDs').loc[ vcf.vcf.loc[0,'id'], 'nt_sequence' ]

'AAAAAAAAAAAAAAAAAAAAAAAGTCTTTCCTCTCATCAACCCCCTACCCCCTGCCCCCGAACGTCCTCAACCCAAAGTAGCCAAGTTGGCTAATCTAACGCAAAGGGTGGGAGGCCACACCAAATGCACTGGTTACATAGCAAACACAAAGTCTGGCTGGGTTTTAAAGCAGCCTTCGTCTAAATCACGCAAGCACACA'

In [20]:
vec2nt(vcf_data[0]['ref'][0]) == mpra_19.set_index('IDs').loc[ vcf.vcf.loc[0,'id'], 'nt_sequence' ]

True

In [21]:
loader_dict = {}

for i in tqdm.tqdm(range(vcf.vcf.shape[0])):
    row_id =  vcf.vcf.loc[i,'id']
    loader_seq = vec2nt(vcf_data[i]['ref'][0])
    loader_dict[row_id] = loader_seq


  0%|          | 0/343562 [00:00<?, ?it/s]

In [22]:
extract_seqs = pd.DataFrame(zip(*loader_dict.items())).T
extract_seqs.columns=['IDs','nt_sequence']
extract_seqs = extract_seqs.set_index('IDs', drop=True)

In [23]:
mismatch_filter = mpra_19.set_index('IDs').loc[extract_seqs.index, 'nt_sequence'] == extract_seqs.loc[extract_seqs.index,'nt_sequence']

In [24]:
mismatch_filter.mean()

0.9234199358485513

## result
about 92% of extracted sequences based on VCF coordinates are exact matches to the known sequence in the MPRA

# Check mismatches
checking if mismatches are indels

In [25]:
extract_seqs.loc[~mismatch_filter]

Unnamed: 0_level_0,nt_sequence
IDs,Unnamed: 1_level_1
1:26878979:A:AT:R:wC,AAAAAAAAAAAAAAAAAAAAAGGATTTGAGCTAGAAAATGGGACCA...
13:72497708:AT:A:R:wC,AAAAAAAAAAAAAAAAAAACTTCCCTCTAAATACACACATTAATAA...
3:70506530:CA:C:R:wC,CAAAAAAAAAAAAAAAAAAAAGAAAGAAAAAAAGAAAGAAAGAAAG...
3:53801691:AT:A:R:wC,CAAAAAAAAAAAAAAAACAACAAAATAAAATTCCCAACATGCAGAT...
14:102751522:CA:C:R:wC,AAAAAAAAAAAAAAAGAAAATGCAGGCTTGAGAACTAGATCTAGAA...
...,...
7:4810197:CTCT:C:R:wC,GTTGCAGTGAGCCGAGATCATGCCACTGCACTCCAGCCTGGGCAAC...
18:12438917:TGGA:T:R:wC,CCTGCAGCGGTGCTCGCTATCCTTGACCAAGCTGGAGAACAAGAGT...
22:27697920:GT:G:R:wC,CTGCAGCCTTGACCTCCCCAGCTCAAGCAATCCTCCCACCTCAGCC...
4:106276362:A:AT:R:wC,CTGCAGCCTTGATTTCCTGGGTTCAGGTGATTCTCCCACCTCAGCC...


In [26]:
mpra_19.set_index('IDs').loc[extract_seqs.index, 'nt_sequence'].loc[~mismatch_filter].iloc[:20]

IDs
1:26878979:A:AT:R:wC          AAAAAAAAAAAAAAAAAAAAGGATTTGAGCTAGAAAATGGGACCAT...
13:72497708:AT:A:R:wC         AAAAAAAAAAAAAAAAAACTTCCCTCTAAATACACACATTAATAAT...
3:70506530:CA:C:R:wC          AAAAAAAAAAAAAAAAAAAAGAAAGAAAAAAAGAAAGAAAGAAAGA...
3:53801691:AT:A:R:wC          AAAAAAAAAAAAAAAACAACAAAATAAAATTCCCAACATGCAGATA...
14:102751522:CA:C:R:wC        AAAAAAAAAAAAAAGAAAATGCAGGCTTGAGAACTAGATCTAGAAG...
21:37562412:TC:T:R:wC         AAAAAAAAAAAATGATTTTCAGCTGGGCACAGTGGCTTACACCTGT...
9:73060384:A:ATTTTC:R:wC      AAAAAAAAAAAAAAGTATTGTAGAGGAAAGAAGTTCTAGCTAATGC...
1:165583188:T:TAA:R:wC        AAAAAAAGATATTTGCAATATATATGATAGCAAAAGCCTAATTTCT...
15:41533058:C:CTT:R:wC        AAAAAAAAAAAAGATCTATGAAGAAATGAAAATGGCAGTGTTGGGA...
1:11075825:C:CAG:R:wC         AAAAAAAAAAAATTATCTCCCCTTCTCTTCTAAATAACCTAAGGTT...
19:4135623:CT:C:R:wC          AAAAAAAAAAAGGAATAAGGAAATATTTTCCAGTATTCAGAAATGA...
1:203688453:A:ATTGCCT:R:wC    AAAAAAAAAAAAAAAAAGCAAGAGAGATAAAATACATGGTTCTAAA...
5:74639544:CTTGTA:C:R:wC      AAAAAA

In [27]:
extract_seqs.loc['7:4810197:CTCT:C:R:wC', 'nt_sequence']

'GTTGCAGTGAGCCGAGATCATGCCACTGCACTCCAGCCTGGGCAACAGAGGGAGACTCTGTCTCAAAAATAATAATCAAATCAAGTATTTTAAGTTTGGCTCTTCTTTTTTTCAAGAAAGGCTTTTTGGATACCTAGAATACCTTCATGTATGTGGCTTGATTTCTGTTTCAGGAAAAGGGGGGCGGGAGGAGAGACAAC'

In [28]:
extract_seqs.loc['7:4810197:CTCT:C:R:wC', 'nt_sequence']

'GTTGCAGTGAGCCGAGATCATGCCACTGCACTCCAGCCTGGGCAACAGAGGGAGACTCTGTCTCAAAAATAATAATCAAATCAAGTATTTTAAGTTTGGCTCTTCTTTTTTTCAAGAAAGGCTTTTTGGATACCTAGAATACCTTCATGTATGTGGCTTGATTTCTGTTTCAGGAAAAGGGGGGCGGGAGGAGAGACAAC'

In [29]:
mpra_19.set_index('IDs').loc['7:4810197:CTCT:C:R:wC', 'nt_sequence']

'TGCAGTGAGCCGAGATCATGCCACTGCACTCCAGCCTGGGCAACAGAGGGAGACTCTGTCTCAAAAATAATAATCAAATCAAGTATTTTAAGTTTGGCTCTTCTTTTTTTCAAGAAAGGCTTTTTGGATACCTAGAATACCTTCATGTATGTGGCTTGATTTCTGTTTCAGGAAAAGGGGGGCGGGAGGAGAGACAACAG'

In [30]:
var_len_match = np.array([ len(seq_id.split(':')[2]) == len(seq_id.split(':')[3]) for seq_id in extract_seqs.loc[~mismatch_filter].index ])
var_len_match.mean()

0.00011402508551881414

In [31]:
extract_seqs.loc[~mismatch_filter].loc[var_len_match]

Unnamed: 0_level_0,nt_sequence
IDs,Unnamed: 1_level_1
9:139217089:C:T:R:wC,NNNNNNNNGATCTTCTTGGGTCTCAGGTTCCAACATATGAAGGCTG...
11:69139893:T:G:R:wC,NNNNNNNNTGATGGTTTTGGTGATGGCGGTGGTGGTGGTGATGGTG...
19:20523319:G:T:R:wC,TTGAGACCCTCATACCTGATTCTGGCCTCCCCTTAGAGTCACGTGA...


In [32]:
extract_seqs.loc['19:20523319:G:T:R:wC', 'nt_sequence']

'TTGAGACCCTCATACCTGATTCTGGCCTCCCCTTAGAGTCACGTGAAGCCCTTCATTAAAACAACATGGATACTTCCACCCAGAACAATAAACAGAACCGGTGGGGAAGGCACAAGAGATTTCTGCAAATTGGCCATGTGATCCTAATGAGAAGCCTGGGCTGATAACCACTAAGCTAAGCATTGCCTCTCAAGCTNNNN'

## result
All but 3 sequences are indels, the rest have N's in the reference

# Test liftover

In [33]:
!wget -nc https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/@@download/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz
!gunzip -f GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz

--2023-08-21 18:23:01--  https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/@@download/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2015/12/03/a7fea375-057d-4cdc-8ccd-0b0f930823df/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz?response-content-disposition=attachment%3B%20filename%3DGRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz&AWSAccessKeyId=ASIATGZNGCNX2KDTQBSK&Signature=b0vtBtmh6z9UGjwAOR%2BMCKcXJQs%3D&x-amz-security-token=IQoJb3JpZ2luX2VjECIaCXVzLXdlc3QtMiJHMEUCIFK7gx2uJa2RIihqhuUmo5x1Kophew%2Bj83mjQd3oEH2yAiEAoOD74wllg1fC1jYRuB9mHlcajpW4PptgawI1oH1GM8IquwUI2%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgwyMjA3NDg3MTQ4NjMiDFxgghm%2BQsTXR0tUXyqPBQ%2FnZi%2BHNX3x4zxn2Ul

In [34]:
fasta_38 = boda.data.Fasta('GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta')

pre-reading fasta into memory
100%|██████████████████████████| 44284892/44284892 [00:21<00:00, 2039241.52it/s]
finding keys
parsing
100%|█████████████████████████████████████████| 195/195 [17:10<00:00,  5.28s/it]
done


In [35]:
vcf = boda.data.VCF(
    'gtex_hg38_coord_hg19_ID.vcf', chr_prefix='', 
    max_allele_size=20, max_indel_size=20,
)

WINDOW_SIZE = 200
RELATIVE_START = 99
RELATIVE_END = 100

vcf_data = boda.data.VcfDataset(
    vcf.vcf, fasta_38.fasta, WINDOW_SIZE, 
    RELATIVE_START, RELATIVE_END, step_size=1, 
    left_flank='', right_flank=''
)


loading DataFrame
Checking and filtering tokens
Allele length checks
Done
210121/210121 records have matching contig in FASTA
returned 210121/210121 records


In [36]:
vcf_data[1]

{'ref': tensor([[[1., 1., 1.,  ..., 0., 0., 1.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 0., 0.],
          [0., 0., 0.,  ..., 0., 1., 0.]],
 
         [[0., 1., 0.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [1., 0., 0.,  ..., 1., 1., 1.]]]),
 'alt': tensor([[[1., 1., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 1., 0.],
          [0., 0., 0.,  ..., 1., 0., 1.]],
 
         [[1., 0., 1.,  ..., 0., 0., 0.],
          [0., 1., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 1., 1.]]])}

In [37]:
loader_dict = {}
chrom_key = {}

for i in tqdm.tqdm(range(vcf.vcf.shape[0])):
    row_id =  vcf.vcf.loc[i,'id']
    chrom = vcf.vcf.loc[i,'chrom']
    loader_seq = vec2nt(vcf_data[i]['ref'][0])
    loader_dict[row_id] = loader_seq
    chrom_key[row_id] = chrom


  0%|          | 0/210121 [00:00<?, ?it/s]

In [38]:
liftover_seqs = pd.DataFrame(zip(*loader_dict.items())).T
liftover_seqs.columns=['IDs','nt_sequence']
liftover_seqs = liftover_seqs.set_index('IDs', drop=True)

liftover_chrom= pd.DataFrame(zip(*chrom_key.items())).T
liftover_chrom.columns=['IDs','chrom']
liftover_chrom= liftover_chrom.set_index('IDs', drop=True)

In [39]:
liftover_seqs

Unnamed: 0_level_0,nt_sequence
IDs,Unnamed: 1_level_1
10:95074:G:A:R:wC,CCGGCCGCCTCGCCAGCCACCCGGTTCCACCGTCCCCGGCAGGGAG...
10:98705:T:TA:R:wC,AAAAAATGCTGCACTCCTAGGACATTTATACATTTTTCAGGCTTGG...
10:101683:T:C:R:wC,ATTTTAATTTGATTTTTGTATGTGGCAAGAAATAGGGTTCCAGTTT...
10:108125:T:G:R:wC,TACATTCAATGTTATTATTGATAAGCACTTACTCCTGCCCTTTTGT...
10:108543:ACTT:A:R:wC,CTCTTTCTATTTGAGATTTTTGCACACCATAATTACCATCTTATAA...
...,...
X:150136363:C:T:R:wC,TCCCTCTACTGTACCTTTCTTGGTTTGGTTTTGGCAACACAGCTGT...
X:9756259:C:T:R:wC,CCAGGCTGGTCTCGAACTCCTCTGCCTCCCAAAGTCCTGGCATTAC...
X:10072886:T:C:R:wC,TTTTTTTGAAGACAGGGTCTCGCTCTGTCACCCAGGCTGGAGTGCT...
X:49397724:C:T:R:wC,TTTCCCCCCCAGTGTTAGGTGAGAAGGATGAATCGAAGGTCAGCTT...


In [40]:
intersect_ids = set(extract_seqs.index).intersection(set(liftover_seqs.index))
len(intersect_ids)

199935

In [41]:
concord = extract_seqs.loc[intersect_ids, 'nt_sequence'] == liftover_seqs.loc[intersect_ids, 'nt_sequence']

In [42]:
mismatch_contig_source_19 = [ x.split(':')[0] for x in liftover_seqs.loc[intersect_ids].loc[~concord].index ]
mismatch_contig_source_38 = [ liftover_chrom.loc[x, 'chrom'] for x in liftover_seqs.loc[intersect_ids].loc[~concord].index ]

with open('liftover_fails.txt', 'w') as f:
    [ print(x, file=f) for x in liftover_seqs.loc[intersect_ids].loc[~concord].index ]

In [43]:
from collections import Counter

print("in hg19")
_ = [ print(f"chr{k}: {v}") for k,v in Counter(mismatch_contig_source_19).items() ]

print("")

print("in hg38")
_ = [ print(f"{k}: {v}") for k,v in Counter(mismatch_contig_source_38).items() ]

in hg19
chr2: 84
chr14: 218
chr1: 826
chr8: 102
chr11: 26
chr10: 419
chr20: 52
chr7: 310
chr22: 211
chr9: 463
chr12: 43
chr19: 290
chr15: 244
chrX: 300
chr3: 65
chr4: 56
chr18: 23
chr17: 222
chr6: 17
chr5: 22
chr21: 16
chr16: 14
chr13: 3

in hg38
chr2: 86
chr14: 205
chr1: 828
chr8: 101
chr11: 28
chr10: 420
chr20: 69
chr7: 309
chr22: 225
chr9: 458
chr12: 42
chr19: 290
chr15: 241
chrX: 299
chr3: 62
chr4: 51
chr18: 23
chr17: 222
chr6: 17
chr5: 19
chr21: 17
chr16: 11
chr13: 3


In [44]:
fasta_38.fasta['chr22'].shape

(4, 50818468)

In [47]:
print(len(mismatch_contig_source_19))

4026


# Check liftover on new VCF

In [2]:
fasta_19 = boda.data.Fasta('male.hg19.fasta')
fasta_38 = boda.data.Fasta('GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta')

pre-reading fasta into memory
100%|██████████████████████████| 61913917/61913917 [00:29<00:00, 2119348.49it/s]
finding keys
parsing
100%|███████████████████████████████████████████| 25/25 [17:22<00:00, 41.69s/it]
done
pre-reading fasta into memory
100%|██████████████████████████| 44284892/44284892 [00:21<00:00, 2070880.30it/s]
finding keys
parsing
100%|█████████████████████████████████████████| 195/195 [17:13<00:00,  5.30s/it]
done


In [3]:
print(f'initial file contains {sum(1 for _ in open("hg38_traits_for_seq_check.vcf"))} lines')

new_vcf_38 = boda.data.VCF(
    'hg38_traits_for_seq_check.vcf', chr_prefix='', 
    max_allele_size=50, max_indel_size=50,
)

with tempfile.TemporaryDirectory() as tmp:
    with open(f'{tmp}/tmp.vcf','w') as f:
        k_lines = 0
        print('#CHROM\tPOS\tID\tREF\tALT', file=f)
        for i, row in new_vcf_38.vcf.iterrows():
            chrom, pos, ref, alt, _, _ = row['id'].split(':')
            chrom = f'chr{chrom}'
            print(
                '\t'.join([ chrom, pos, row['id'], ref, alt ]),
                file = f
            )
            k_lines += 1

    print(f'printed {k_lines} lines')
    print(f'temp file contains {sum(1 for _ in open("{}/tmp.vcf".format(tmp)))} lines')

    new_vcf_19 = boda.data.VCF(
        f'{tmp}/tmp.vcf', chr_prefix='', 
        max_allele_size=50, max_indel_size=50
    )


loading DataFrame


initial file contains 155499 lines


loaded shape: (155498, 5)
Checking and filtering tokens
passed shape: (155498, 5)
Allele length checks
final shape: (155482, 5)
Done


printed 155482 lines
temp file contains 155483 lines


loading DataFrame
loaded shape: (155482, 5)
Checking and filtering tokens
passed shape: (155482, 5)
Allele length checks
final shape: (155482, 5)
Done


In [4]:
new_vcf_38.vcf

Unnamed: 0,chrom,pos,id,ref,alt
0,chr1,946247,1:881627:G:A:R:wC,G,A
1,chr1,949171,1:884551:GAGAA:G:R:wC,GAGAA,G
2,chr1,955679,1:891059:C:T:R:wC,C,T
3,chr1,956565,1:891945:A:G:R:wC,A,G
4,chr1,959193,1:894573:G:A:R:wC,G,A
...,...,...,...,...,...
155477,chr1,199495483,1:199464611:C:T:R:wC,C,T
155478,chr1,199495178,1:199464306:A:G:R:wC,A,G
155479,chr1,203718444,1:203687572:G:A:R:wC,G,A
155480,chr1,203719240,1:203688368:AAAAAAGC:A:R:wC,AAAAAAGC,A


In [5]:
new_vcf_19.vcf

Unnamed: 0,chrom,pos,id,ref,alt
0,chr1,881627,1:881627:G:A:R:wC,G,A
1,chr1,884551,1:884551:GAGAA:G:R:wC,GAGAA,G
2,chr1,891059,1:891059:C:T:R:wC,C,T
3,chr1,891945,1:891945:A:G:R:wC,A,G
4,chr1,894573,1:894573:G:A:R:wC,G,A
...,...,...,...,...,...
155477,chr1,199464611,1:199464611:C:T:R:wC,C,T
155478,chr1,199464306,1:199464306:A:G:R:wC,A,G
155479,chr1,203687572,1:203687572:G:A:R:wC,G,A
155480,chr1,203688368,1:203688368:AAAAAAGC:A:R:wC,AAAAAAGC,A


In [6]:
WINDOW_SIZE = 200
RELATIVE_START = 99
RELATIVE_END = 100

vcf_data_38 = boda.data.VcfDataset(
    new_vcf_38.vcf, fasta_38.fasta, WINDOW_SIZE, 
    RELATIVE_START, RELATIVE_END, step_size=1, 
    left_flank='', right_flank=''
)

vcf_data_19 = boda.data.VcfDataset(
    new_vcf_19.vcf, fasta_19.fasta, WINDOW_SIZE, 
    RELATIVE_START, RELATIVE_END, step_size=1, 
    left_flank='', right_flank=''
)


155479/155482 records have matching contig in FASTA
returned 155479/155482 records
155482/155482 records have matching contig in FASTA
returned 155482/155482 records


In [11]:
loader_dict = {}
for i in tqdm.tqdm(range(len(vcf_data_38))):
    row_id =  new_vcf_38.vcf.loc[i,'id']
    chrom = new_vcf_38.vcf.loc[i,'chrom']
    loader_seq = vec2nt(vcf_data_38[i]['ref'][0])
    loader_dict[row_id] = loader_seq

liftover_seqs_38 = pd.DataFrame(zip(*loader_dict.items())).T
liftover_seqs_38.columns=['IDs','nt_sequence']
liftover_seqs_38 = liftover_seqs_38.set_index('IDs', drop=True)

  0%|          | 0/155479 [00:00<?, ?it/s]

In [13]:
loader_dict = {}
for i in tqdm.tqdm(range(len(vcf_data_19))):
    row_id =  new_vcf_19.vcf.loc[i,'id']
    chrom = new_vcf_19.vcf.loc[i,'chrom']
    loader_seq = vec2nt(vcf_data_19[i]['ref'][0])
    loader_dict[row_id] = loader_seq

liftover_seqs_19 = pd.DataFrame(zip(*loader_dict.items())).T
liftover_seqs_19.columns=['IDs','nt_sequence']
liftover_seqs_19 = liftover_seqs_19.set_index('IDs', drop=True)

  0%|          | 0/155482 [00:00<?, ?it/s]

In [12]:
liftover_seqs_38

Unnamed: 0_level_0,nt_sequence
IDs,Unnamed: 1_level_1
1:881627:G:A:R:wC,CCCAGGTCCCCTCGCCGAGCCGCACCCGCTCTTTGCCACTGATCTC...
1:884551:GAGAA:G:R:wC,CGGGACAGATGGAGGTCACGGGAGGCCTGGGGGGCCCCTCCCACAC...
1:891059:C:T:R:wC,AGGAGTATGGACAGGACTTACAAGTTCTTACAAAGGGAAATAGAGC...
1:891945:A:G:R:wC,CCTCCAGGTGGTATCTGGAGCTCTCCGTATCCTTGTCCCTGGAAAA...
1:894573:G:A:R:wC,CAGGAGTCACAGCTGCCCGCACGCCCAGCTCGCCCCAGCCCCGCTG...
...,...
1:203688785:G:A:R:wC,GAAAACCTATTTCAGGAAATAATCAGGAAAAACTTCCCAAGTCTAG...
1:203689216:C:T:R:wC,TAGTGAAACCCCATCTCTACTAAAAATACAAAGAATTAGCTGGGCA...
1:203689116:T:C:R:wC,CTGGAGTGCAGTGGCATGATCTTGGCTCACTGCAACCTCTGCCTCC...
1:199464611:C:T:R:wC,CTTGAGCCCAAGAATTCAAGGCTGCAGTGAGCCAAGATCACACCTC...


In [14]:
liftover_seqs_19

Unnamed: 0_level_0,nt_sequence
IDs,Unnamed: 1_level_1
1:881627:G:A:R:wC,CCCAGGTCCCCTCGCCGAGCCGCACCCGCTCTTTGCCACTGATCTC...
1:884551:GAGAA:G:R:wC,CGGGACAGATGGAGGTCACGGGAGGCCTGGGGGGCCCCTCCCACAC...
1:891059:C:T:R:wC,AGGAGTATGGACAGGACTTACAAGTTCTTACAAAGGGAAATAGAGC...
1:891945:A:G:R:wC,CCTCCAGGTGGTATCTGGAGCTCTCCGTATCCTTGTCCCTGGAAAA...
1:894573:G:A:R:wC,CAGGAGTCACAGCTGCCCGCACGCCCAGCTCGCCCCAGCCCCGCTG...
...,...
1:199464611:C:T:R:wC,GAAAACCTATTTCAGGAAATAATCAGGAAAAACTTCCCAAGTCTAG...
1:199464306:A:G:R:wC,TAGTGAAACCCCATCTCTACTAAAAATACAAAGAATTAGCTGGGCA...
1:203687572:G:A:R:wC,CTGGAGTGCAGTGGCATGATCTTGGCTCACTGCAACCTCTGCCTCC...
1:203688368:AAAAAAGC:A:R:wC,CTTGAGCCCAAGAATTCAAGGCTGCAGTGAGCCAAGATCACACCTC...


In [17]:
mismatched_ids = []

for seq_id in intersect_ids:
    if liftover_seqs_38.loc[seq_id,'nt_sequence'] != liftover_seqs_19.loc[seq_id,'nt_sequence']:
        mismatched_ids.append(seq_id)
        
print(len(mismatched_ids))

23549


In [18]:
with open('hg38_traits_for_seq_check__mismatches.txt', 'w') as f:
    for seq_id in mismatched_ids:
        print(seq_id, file=f)